Create your Gitee Account
Explore and code with more than 6 million developers. Free private repositories! :)
Sign up
This repository doesn't specify a license. Without the author's permission, this code is only for learning and cannot be used for other purposes.
Clone or download
get_question.py 3.75 KB
Copy Edit Web IDE Raw Blame History
lwitcher authored 2019-08-07 07:11 . upd
#coding:utf-8
import os
import shutil
import sys
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
# Command line: get_question.py <question_id> [answer_count]
if len(sys.argv) < 2:
    sys.exit('usage: {} <question_id> [answer_count]'.format(sys.argv[0]))
url = 'https://www.zhihu.com/question/{}'.format(sys.argv[1])

# Number of answers to try to load before scraping. The optional second
# argument overrides the default; it is only read when actually supplied
# (argv[2] exists only when len(argv) > 2).
num = 150
if len(sys.argv) > 2:
    num = int(sys.argv[2])

# Headless Chrome with image loading turned off via the content-settings
# preference (2 is Chrome's "block" value); only the HTML is needed here.
chrome_options = Options()
chrome_options.add_argument('--headless')
#chrome_options.add_argument('--single-process')
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)
browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get(url)
def download_all_pic(content):
    """Record the image URLs in *content* and point the tags at local files.

    For every ``<img>`` tag that carries a ``src`` attribute, the URL is
    appended as one line to ``need_download.txt`` in the current working
    directory, and the tag's ``src`` is replaced by the last ``/``-separated
    segment of that URL (the file name the image will have once downloaded
    next to the HTML).

    Args:
        content: HTML markup as a string.

    Returns:
        The modified markup, re-serialised with ``BeautifulSoup.prettify()``.
    """
    soup = BeautifulSoup(content, 'html.parser')
    with open('need_download.txt', 'a+', encoding='utf-8') as f:
        for pic in soup.find_all('img'):
            src = pic.attrs.get('src')
            if not src:
                # An <img> without a usable src has nothing to download;
                # indexing attrs['src'] directly would raise KeyError.
                continue
            f.write(src + '\n')
            pic.attrs['src'] = src.split('/')[-1]
    return soup.prettify()
# Question title: the second element matching .QuestionHeader-title is used.
title = browser.find_elements_by_css_selector('.QuestionHeader-title')[1].text

# All output goes into a directory named after the question. The directory
# is created through the os module rather than a shell `mkdir`, so titles
# containing spaces or shell metacharacters cannot break or alter the
# command; an already existing directory is reused.
os.makedirs(title, exist_ok=True)
os.chdir(title)
title_link = '<a href={}>{}</a>'.format(url, title)

# Question description. When a "show more" button is present, the collapsed
# description is clicked to expand it before its HTML is read.
question_content = ''
btns = browser.find_elements_by_css_selector('.Button.QuestionRichText-more.Button--plain')
if btns:
    browser.find_element_by_css_selector('.QuestionRichText.QuestionRichText--expandable.QuestionRichText--collapsed').click()
    expanded = browser.find_element_by_css_selector('.QuestionRichText.QuestionRichText--expandable')
    question_content = expanded.find_element_by_css_selector('.RichText.ztext').get_attribute('innerHTML')
else:
    try:
        collapsed = browser.find_element_by_css_selector('.QuestionRichText.QuestionRichText--collapsed')
        question_content = collapsed.find_element_by_css_selector('.RichText.ztext').get_attribute('innerHTML')
    except Exception:
        # Sometimes a question has no description at all, only a title.
        question_content = '<br/>'
# Scroll the page until at least `num` answers are present (or the scroll
# budget, also `num` passes, is used up), then extract every loaded answer.
# The browser is shut down in `finally` so that a failure while scraping
# does not leave a headless Chrome process running.
answers = []
try:
    retry = 1
    while True:
        get_num = len(browser.find_elements_by_css_selector('.ContentItem.AnswerItem'))
        print(get_num)
        if get_num >= num or retry >= num:
            break
        # Jump to the bottom and nudge a little further to trigger the
        # loading of the next batch of answers.
        browser.execute_script("window.scrollBy(0,document.body.scrollHeight)")
        browser.execute_script("window.scrollBy(0,100)")
        time.sleep(.3)
        retry += 1

    for item in browser.find_elements_by_css_selector('.ContentItem.AnswerItem'):
        d = {}
        content = item.find_element_by_css_selector('.RichText.ztext.CopyrightRichText-richText')
        d['content'] = content.get_attribute('innerHTML')
        try:
            user_name = item.find_element_by_css_selector('.UserLink.AuthorInfo-name').find_element_by_css_selector('.UserLink-link')
            d['user'] = user_name.get_attribute('outerHTML')
        except Exception:
            # No author link found: fall back to the "anonymous user" label.
            user_name = '匿名用户'
            d['user'] = user_name
        print(d['user'])
        # The up-vote button's aria-label is kept verbatim as the vote text.
        agree_num = item.find_element_by_css_selector('.Button.VoteButton.VoteButton--up').get_attribute('aria-label')
        d['agree_num'] = agree_num
        answers.append(d)
finally:
    browser.quit()
# Assemble book.html: the linked title and question body first, then one
# "chapter" per answer. download_all_pic() logs each chapter's image URLs
# to need_download.txt and rewrites the <img> tags to local file names.
# UTF-8 is requested explicitly so non-ASCII content does not depend on
# the platform's default encoding.
with open('book.html', 'w+', encoding='utf-8') as f:
    f.write(title_link + question_content)
    for answer in answers:
        chapter = (
            '<h3 class=chapter>' + answer['user'] + '</h3>\t'
            # get_attribute() yields None when the attribute is missing.
            + (answer['agree_num'] or '') + '<br/>'
            + answer['content'] + '<HR><HR><HR>'
        )
        f.write(download_all_pic(chapter))

# Download every distinct image next to book.html (the file is closed by now).
os.system('sort need_download.txt|uniq > need_download2.txt && aria2c -s 30 -i need_download2.txt')

# Convert to MOBI and, only if the conversion succeeded, file the result
# under ../mobi/<title>.mobi. The move is done in Python instead of a shell
# `mv` so the scraped title is never interpreted by a shell.
if os.system('ebook-convert book.html book.mobi') == 0:
    mobi_dir = os.path.join('..', 'mobi')
    os.makedirs(mobi_dir, exist_ok=True)
    shutil.move('book.mobi', os.path.join(mobi_dir, title + '.mobi'))

Comment ( 0 )

Sign in to post a comment