Create your Gitee Account
Explore and code with more than 6 million developers. Free private repositories! :)
Sign up
Clone or download
scratch_interlib.py 4.00 KB
Copy Edit Web IDE Raw Blame History
井梧故园秋 authored 2017-09-23 10:42 . bee
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
import os
from urllib import request
import pymysql
from hashlib import md5
# Browser User-Agent so the scraped sites do not reject us as a bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
categories = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'X', 'Z')  # 22 classification codes
# BUG FIX: the original `base_urls = ('http://...')` was a parenthesized
# string, NOT a tuple — iterating it yielded one character at a time. A
# one-element tuple requires a trailing comma.
base_urls = ('http://opac.whlib.org.cn/opac',)  # 'http://61.184.105.28:8090/opac' (Huangshi) has been down permanently
log_file = 'log.log'  # log file location
img_path = '/backup/cover/'  # directory where cover images are stored
file_handle = open(log_file, 'w+')
# NOTE(review): credentials are hard-coded here — consider moving them to
# environment variables or a config file.
connect = pymysql.connect(host="120.27.202.235", port=3306, user="root", passwd="G9mn4[K3", db="mybookgoal", charset="utf8", cursorclass=pymysql.cursors.DictCursor)
cursor = connect.cursor()
# Parameterized statements (placeholders, not string concatenation) — safe
# against SQL injection from scraped ISBN values.
sql_select = "select * from my_cover_copy where isbn=%s limit 1"
sql_insert = "insert into my_cover_copy (isbn, name, md5) values (%s, %s, %s)"
def md5_file(name):
    """Return the hex MD5 digest of the file at *name* (like PHP's md5_file).

    Improvements over the original:
    - uses a ``with`` block so the handle is closed even if reading raises
      (the original also shadowed the module-level ``file_handle`` name);
    - reads in fixed-size chunks so large images need not fit in memory.
    """
    digest = md5()
    with open(name, 'rb') as fh:
        for chunk in iter(lambda: fh.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()
def log(data):
    """Record *data*: append it to the shared log file and echo to stdout."""
    file_handle.write(data + '\n')
    print(data)
def down_image(url, file_name):
    """Download *url* and save the body to *file_name*.

    Returns True on success, False when the server answered with an error
    status or an empty body.

    BUG FIX: the original tested ``if r.iter_content():`` — that expression
    is a generator object and therefore ALWAYS truthy, so the failure branch
    was unreachable and HTTP error pages were saved as image files. We now
    check the response status and body before writing, and use a ``with``
    block so the file handle is closed even if a write raises.
    """
    r = requests.get(url, headers=headers)
    if not r.ok or not r.content:
        return False
    with open(file_name, 'wb') as temp_file:
        for chunk in r.iter_content(chunk_size=8192):
            temp_file.write(chunk)
    return True
def down(url, save_path):
    """Scrape the first product cover image from a JD search result page.

    Fetches *url*, finds the first ``div.p-img`` container and its first
    ``img`` tag, and downloads that image to *save_path*.

    BUG FIX: the original never returned a value, so the caller's
    ``if result:`` test was always falsy and the success path (recording the
    cover in the database) never executed. This version propagates
    ``down_image``'s boolean result and returns False when no image is found.
    """
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    for container in soup.find_all('div', attrs={'class': 'p-img'}, limit=1):
        for img in container.find_all('img', limit=1):
            src = img.get('src')
            if src:
                # JD serves protocol-relative URLs ("//img..."); prefix them.
                return down_image('http:' + src, save_path)
    return False
# --- Main scraping loop (module-level; the whole file runs as a script) ---
for base_url in base_urls:  # iterate over OPAC sites (usually one per city)
    log('=====> 抓取网址 ' + base_url)
    for category in categories:  # iterate over the classification codes
        log('====> 抓取分类 ' + str(category))
        # Search-URL template; the page number is appended per request.
        search_url = '/search?q=' + category + '&searchType=standard&isFacet=false&view=simple&searchWay=class&booktype=1&rows=1000&page='
        page = 1  # page currently being scraped
        total_page = 1  # total page count (read from page 1 below)
        while True:
            log('==> 抓取第 ' + str(page) + ' 页')
            r = requests.get(base_url+search_url+str(page))  # fetch one result page
            soup = BeautifulSoup(r.text, "html.parser")  # parse the HTML
            if page == 1:  # first page - extract the total page count
                # NOTE(review): assumes the pager always contains a span with
                # class "disabled" holding a number — raises if missing.
                total_page_span = soup.find('span', attrs={'class': 'disabled'})
                total_page = int(re.search("\d+", total_page_span.string).group())
                log('===> 共计 ' + str(total_page) + ' 页')
            item = 1
            for img in soup.table.find_all('img', attrs={'class': 'bookcover_img'}):
                if img['isbn']:  # cover <img> carries the ISBN as an attribute
                    isbn = img['isbn'].replace('-', '')  # normalize: strip dashes
                    log('=> 第 ' + str(page) + ' 页 第 ' + str(item) + ' 行 isbn ' + isbn + ' bookrecno ' + img['bookrecno'])
                    # cursor.execute(sql_select, isbn)
                    # cover_info = cursor.fetchone()
                    # if cover_info:
                    save_path = img_path + isbn + '.jpg'
                    if os.path.exists(save_path):  # skip covers already on disk
                        log('ISBN ' + isbn + ' 图片已存在')
                    else:
                        log('抓取ISBN ' + isbn + ' 封面')
                        try:
                            # Download the cover via a JD product search.
                            result = down('http://search.jd.com/Search?keyword='+isbn, save_path)
                            if result:  # download succeeded - record it in the DB
                                try:
                                    cursor.execute(sql_insert, (isbn, isbn+'.jpg', md5_file(save_path)))
                                    connect.commit()
                                except Exception as e:
                                    print('=====> insert into database error: ', sql_insert, isbn, isbn+'.jpg', md5_file(save_path), e)
                        except Exception as e:
                            print('=====> down_image error: ', e)
                item = item + 1  # row counter (for logging only)
            if page >= total_page:  # stop after the last page
                break
            page = page + 1  # advance to the next page

Comment ( 0 )

Sign in to post a comment