1 Star 0 Fork 1

jasper / DvisionSpiderCN

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
Spider.py 1.93 KB
一键复制 编辑 原始数据 按行查看 历史
Ma.YL 提交于 2017-04-14 12:33 . create project
#!/usr/bin/python
# -*- coding: utf-8 -*-
__author__ = 'Ma.YL'
import urllib2
from bs4 import BeautifulSoup
class Spider:
    """Fetch one web page and query its markup with CSS selectors.

    The page address is stored in ``self.URL``. When no address is supplied
    the www.stats.gov.cn ``xzqhdm`` index page is used.
    """

    _DEFAULT_URL = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/"
    _USER_AGENT = (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
    )

    def __init__(self, url=None):
        """Remember the page to fetch; ``None`` selects the default index URL."""
        self.URL = self._DEFAULT_URL if url is None else url

    def _build_headers(self, referer=None):
        """Return the request headers, adding ``Referer`` only when one is given."""
        headers = {"User-Agent": self._USER_AGENT}
        # A None value must not be placed in the header dict: it would be
        # sent as the literal text "None" (or rejected outright).
        if referer is not None:
            headers["Referer"] = referer
        return headers

    def GetCurrentUrl(self, referer=None, selector=None):
        """Download ``self.URL`` and return the elements matching ``selector``.

        :param referer: value for the ``Referer`` header; omitted when None.
        :param selector: CSS selector handed to ``BeautifulSoup.select``.
        :returns: the result of ``soup.select(selector)`` on success, or the
            tuple ``(e.errno, e.reason)`` when ``urllib2.URLError`` is raised.
            Callers must check for the tuple form before indexing the result.
        """
        request = urllib2.Request(self.URL, None, self._build_headers(referer))
        request.get_method = lambda: 'GET'
        try:
            response = urllib2.urlopen(request)
            try:
                soup = BeautifulSoup(response.read(), "lxml")
            finally:
                # Release the connection even if reading or parsing fails.
                response.close()
            return soup.select(selector)
        except urllib2.URLError as e:
            return e.errno, e.reason

    def FindChilren(self, html, selector):
        """Return the stripped text of the last element under ``html`` matching ``selector``.

        :param html: an object exposing ``select(selector)`` (e.g. a bs4 tag).
        :param selector: CSS selector to evaluate against ``html``.
        :returns: the text of the last match with surrounding whitespace
            removed, or ``""`` when nothing matches.
        """
        matches = html.select(selector)
        if not matches:
            return ""
        return matches[-1].get_text().strip()
class DvisionData:
    """Scrape the division table published under the ``xzqhdm`` index page."""

    def __init__(self):
        """Set the index page URL that all lookups start from."""
        self.URL = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/"

    def GetAllDvisionData(self):
        """Fetch the first listed detail page and return its rows as a dict.

        The first link matching ``ul[class='center_list_contlist'] > li > a``
        on the index page is followed. Every ``p[class='MsoNormal']``
        paragraph on that page then contributes one entry: the key is the
        text of its last ``span`` carrying a ``lang`` attribute and the value
        is the text of its last ``span`` carrying a ``style`` attribute
        (presumably division code and division name -- verify against the
        live page). Paragraphs lacking one of the spans yield ``""`` for it.

        :returns: dict mapping key text to value text.
        :raises IOError: when either page cannot be downloaded.
        :raises IndexError: when the index page lists no matching link.
        """
        # Imported locally so the file-level import block stays untouched.
        from urlparse import urljoin

        referer = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201608/t20160809_1386477.html"
        links = Spider(self.URL).GetCurrentUrl(
            referer, "ul[class='center_list_contlist'] > li > a")
        # Spider.GetCurrentUrl reports a failed download as an
        # (errno, reason) tuple instead of raising; surface it clearly.
        if isinstance(links, tuple):
            raise IOError("cannot fetch %s: %s" % (self.URL, links[1]))
        if not links:
            raise IndexError("no division list link found on %s" % self.URL)

        # urljoin resolves "./x.html", "x.html" and absolute hrefs alike,
        # rather than assuming a two-character "./" prefix.
        page_url = urljoin(self.URL, links[0]["href"])

        spider = Spider(page_url)
        rows = spider.GetCurrentUrl(None, "p[class='MsoNormal']")
        if isinstance(rows, tuple):
            raise IOError("cannot fetch %s: %s" % (page_url, rows[1]))

        dictDvision = {}
        for row in rows:
            # Attribute-presence selectors are written unquoted: the quoted
            # form span['lang'] is not valid CSS and is expected to be
            # rejected by current BeautifulSoup selector parsing.
            key = spider.FindChilren(row, "span[lang]")
            value = spider.FindChilren(row, "span[style]")
            dictDvision[key] = value
        return dictDvision
Python
1
https://gitee.com/lscherry/dvisionspidercn.git
git@gitee.com:lscherry/dvisionspidercn.git
lscherry
dvisionspidercn
DvisionSpiderCN
master

搜索帮助