Create your Gitee Account
Explore and code with more than 6 million developers. Free private repositories! :)
Sign up
This repository doesn't specify a license. Without the author's permission, this code is for learning only and cannot be used for other purposes.
Clone or download
get_zhuanlan.py 1.62 KB
Copy Edit Web IDE Raw Blame History
lwitcher authored 2019-08-07 07:11 . upd
#coding:utf-8
import os
import shutil
import sys
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Article to fetch: a single Zhihu column post.
url = 'https://zhuanlan.zhihu.com/p/73822472'

# Run Chrome without a visible window.
chrome_options = Options()
chrome_options.add_argument('--headless')

# Content-setting value 2 tells Chrome to block images; the image URLs are
# taken from the page HTML later and downloaded separately, so the browser
# does not need to load them.
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)

# `options=` is the supported keyword; the older `chrome_options=` spelling
# is deprecated and rejected by newer Selenium releases.
browser = webdriver.Chrome(options=chrome_options)
browser.get(url)
def download_all_pic(content):
    """Queue the images in *content* for download and point them at local files.

    Every ``<img>`` tag with a downloadable ``src`` has that URL appended as
    one line to ``need_download.txt`` (in the current working directory), and
    its ``src`` is replaced by the last ``/``-separated component of the URL
    so the saved HTML refers to the locally downloaded file.

    Tags without a ``src`` attribute, with an empty one, or with an inline
    ``data:`` URI are left untouched: there is nothing to download for them,
    and cutting a ``data:`` URI at ``/`` would destroy the embedded image.

    Args:
        content: HTML markup as a string.

    Returns:
        The modified markup, re-serialised with ``BeautifulSoup.prettify()``.
    """
    soup = BeautifulSoup(content, 'html.parser')
    # Append mode: the list file is created if missing and earlier entries
    # are kept; duplicates are removed later by the caller.
    with open('need_download.txt', 'a') as f:
        for pic in soup.find_all('img'):
            src = pic.attrs.get('src')
            if not src or src.startswith('data:'):
                continue
            f.write(src + '\n')
            pic.attrs['src'] = src.split('/')[-1]
    return soup.prettify()
# Read everything needed from the rendered page first, and close the browser
# even when a lookup fails so no headless Chrome process is left running.
try:
    title = browser.find_element(By.CSS_SELECTOR, '.Post-Title').text
    question_content = browser.find_element(
        By.CSS_SELECTOR, '.Post-RichTextContainer'
    ).get_attribute('outerHTML')
finally:
    browser.quit()

# The title doubles as a directory name and as the output file name, so a
# path separator inside it must not be kept. Creating the directory through
# the os module (instead of a shell `mkdir`) also keeps titles with spaces
# or shell metacharacters intact.
book_name = title.replace(os.sep, '_')
os.makedirs(book_name, exist_ok=True)
os.chdir(book_name)

# Link back to the source article, placed in front of the article body.
title_link = '<a href={} class=chapter>{}</a>'.format(url, title)
with open('book.html', 'w', encoding='utf-8') as f:
    f.write(title_link + download_all_pic(question_content))

# De-duplicate the image list written by download_all_pic() and fetch the
# images into the current directory. (An earlier variant of this step used
# `wget -i` and removed the list files afterwards.)
os.system('sort need_download.txt|uniq > need_download2.txt && aria2c -s 30 -i need_download2.txt')

# Convert the page to MOBI and, only if the conversion succeeded, move the
# result into the sibling "mobi" directory under the article's name. The move
# is done in Python so the title never passes through a shell.
mobi_dir = os.path.join(os.pardir, 'mobi')
os.makedirs(mobi_dir, exist_ok=True)
if os.system('ebook-convert book.html book.mobi') == 0:
    shutil.move('book.mobi', os.path.join(mobi_dir, book_name + '.mobi'))

Comment ( 0 )

Sign in to post a comment