代码拉取完成,页面将自动刷新
const puppeteer = require('puppeteer');
const awaitTo = require('async-await-error-handling');
const ora = require('ora');
const chalk = require('chalk');
const path = require('path');
const fs = require('fs');
const { timeout, writeFileSync } = require('./utils');
// One yellow progress spinner per crawl stage:
// spinner1 -> city pass, spinner2 -> county pass, spinner3 -> township pass.
const [spinner1, spinner2, spinner3] = Array.from(
  { length: 3 },
  () => ora({ color: 'yellow' })
);
// Province list (rarely changes); the entries live under the "86" key of the data file.
const provinces = require('./data/provinces')["86"];
// Two-character code prefixes of the provinces to crawl.
// 710000 (Taiwan) and 910000 (the two special administrative regions) are
// filtered out, i.e. Hong Kong, Macao and Taiwan are not crawled.
const pcodes = provinces
  .map(({ code }) => code)
  .filter(code => code !== '710000' && code !== '910000')
  .map(code => code.substring(0, 2));
// City-level results. getCitiesByPCode adds one `{ parentCode, data }` entry
// per province page, where `data` is the list of scraped rows.
let cities = [];
// Disabled: reuse a previously written ./data/cities.js instead of re-crawling.
// if (fs.existsSync(path.resolve(__dirname, './data/cities.js'))) {
// cities = require('./data/cities.js');
// }
// County/district-level results, one `{ parentCode, data }` entry per city page
// (filled in by getAreasByCCode).
let areas = [];
// Disabled: reuse a previously written ./data/areas.js instead of re-crawling.
// if (fs.existsSync(path.resolve(__dirname, './data/areas.js'))) {
// areas = require('./data/areas.js');
// }
// Township-level results, one `{ parentCode, data }` entry per county page
// (filled in by getTownshipByCCode).
let towns = [];
// Disabled: reuse a previously written ./data/township.js instead of re-crawling.
// if (fs.existsSync(path.resolve(__dirname, './data/township.js'))) {
// towns = require('./data/township.js');
// }
// URL template of the pages to scrape; the `#{route}` placeholder is replaced
// with the route of the requested page.
// (Original TODO: switch from the 2016 data set to 2018 - the template below already points at 2018.)
const target = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/#{route}.html';
// URL of the page currently being fetched; also printed by the failure messages.
let url = '';
// Current crawl stage. Starts at 0 (city-level pass); the main routine sets it
// to 1 before the county-level pass and to 2 before the township-level pass.
// NOTE(review): the original comment labelled 1 as "province-level", which does
// not match how the main routine uses it - confirm the intended meaning.
let type = 0; // 0: city-level data; 1: province-level data (per original comment); 2: township data
// Target currently being scraped.
// NOTE(review): neither variable is referenced elsewhere in the visible code.
let curCity = '';
let curPCode = '';
// Temporary variables (disabled)
// let tempCityList = [];
// let tempAreaList = [];
// let tempTownList = [];
/**
 * Scrape the city-level rows of one province.
 *
 * Navigates `page` to the province page built from the `target` template,
 * reads every `.citytable .citytr` row and appends one
 * `{ parentCode, data }` entry to the module-level `cities` list, where
 * `parentCode` is `${pcode}0000` and `data` is a list of
 * `{ code, text, hasChildren }` rows.
 *
 * Side effects: updates the module-level `url` (shown in failure messages)
 * and the text of `spinner1`.
 *
 * @param page  Puppeteer page used for navigation and in-page evaluation
 * @param pcode two-character province code prefix (an entry of `pcodes`)
 * @returns {Promise<void>}
 */
async function getCitiesByPCode (page, pcode) {
  url = target.replace('#{route}', pcode);
  const parentCode = `${pcode}0000`;
  await page.goto(url);

  // The province name is only used for the progress message; it stays empty
  // when the code is not in the province list.
  const matched = provinces.find(item => item.code === parentCode);
  const province = matched ? matched.text : '';
  spinner1.text = chalk.blue(`正在抓取${province}的市级数据:${url}`);

  // Only this page's rows cross the browser boundary. The accumulated
  // `cities` list stays on the Node side instead of being serialised into
  // the page and back on every call.
  const rows = await page.evaluate(() => {
    return [...document.querySelectorAll('.citytable .citytr')].map(el => {
      const firstCell = el.firstChild;
      // A row can be drilled into only when its first cell wraps the code in a link.
      const hasChildren = Boolean(firstCell && firstCell.firstChild) &&
        firstCell.firstChild.nodeName === 'A';
      // The row text is "<code>\t<name>".
      const [code, text] = el.innerText.split('\t');
      return { code, text, hasChildren };
    });
  });
  cities.push({ parentCode, data: rows });
}
/**
 * Scrape the county/district-level rows of one city.
 *
 * Navigates `page` to the city page (`<2-digit province>/<4-digit city>`),
 * reads every `.countytable .countytr` row and appends one
 * `{ parentCode: city.code, data }` entry to the module-level `areas` list.
 *
 * Some cities (the original comment names Danzhou in Hainan, and Dongguan and
 * Zhongshan in Guangdong) have no county table and list townships directly;
 * in that case the `.towntable .towntr` rows are used instead.
 *
 * If the town table was used, or any row name ends with '镇' or '乡', every
 * row of the page is marked `hasChildren: false` so that the township pass
 * does not descend further. (Original TODO: third-level districts may still
 * have sub-townships; they are uniformly treated as having none.)
 *
 * Side effects: updates the module-level `url` and the text of `spinner2`.
 *
 * @param page       Puppeteer page used for navigation and in-page evaluation
 * @param city       city row `{ code, text, ... }` produced by the city pass
 * @param parentCode six-digit province code, used only for the progress message
 * @returns {Promise<void>}
 */
async function getAreasByCCode (page, city, parentCode) {
  url = target.replace('#{route}', `${city.code.slice(0, 2)}/${city.code.slice(0, 4)}`);
  await page.goto(url);

  const matched = provinces.find(item => item.code === parentCode);
  const province = matched ? matched.text : '';
  spinner2.text = chalk.blue(`正在抓取 ${province}/${city.text} 的县区数据:${url}`);

  // Only this page's rows cross the browser boundary; the accumulated `areas`
  // list is no longer serialised into the page and back on every call.
  const rows = await page.evaluate(() => {
    let list = [...document.querySelectorAll('.countytable .countytr')];
    let townLevelOnly = false;
    if (!list.length) {
      // No county table: the city lists its townships directly.
      list = [...document.querySelectorAll('.towntable .towntr')];
      townLevelOnly = true;
    }

    let hasTownRow = false;
    const result = list.map(el => {
      const firstCell = el.firstChild;
      // A row can be drilled into only when its first cell wraps the code in a link.
      const hasChildren = Boolean(firstCell && firstCell.firstChild) &&
        firstCell.firstChild.nodeName === 'A';
      const [code, text] = el.innerText.split('\t');
      // Guard against rows without a name column, which used to throw here.
      const name = text || '';
      if (townLevelOnly || name.endsWith('镇') || name.endsWith('乡')) {
        hasTownRow = true;
      }
      return { code, text, hasChildren };
    });

    if (hasTownRow) {
      result.forEach(item => {
        item.hasChildren = false;
      });
    }
    return result;
  });
  areas.push({ parentCode: city.code, data: rows });
}
/**
 * Scrape the township-level rows of one county/district.
 *
 * Navigates `page` to the county page
 * (`<province 2 digits>/<city digits 3-4>/<county 6 digits>`), reads every
 * `.towntable .towntr` row and appends one `{ parentCode: area.code, data }`
 * entry to the module-level `towns` list. The first occurrence of '办事处' is
 * removed from each name, and every row is stored with `hasChildren: false`
 * because crawling stops at township level.
 *
 * Side effects: updates the module-level `url`, logs the URL being visited
 * and updates the text of `spinner3`.
 *
 * @param page Puppeteer page used for navigation and in-page evaluation
 * @param area county row `{ code, text, ... }` produced by the county pass
 * @returns {Promise<void>}
 */
async function getTownshipByCCode (page, area) {
  const provinceCode2 = area.code.slice(0, 2);
  const cityCode4 = area.code.slice(0, 4);
  url = target.replace('#{route}', `${provinceCode2}/${area.code.slice(2, 4)}/${area.code.slice(0, 6)}`);
  console.log('\n', 'navigateToUrl: ', url);
  await page.goto(url);

  // Province and city names are only used for the progress message; each
  // stays empty when no match is found.
  const provinceEntry = provinces.find(item => item.code.slice(0, 2) === provinceCode2);
  const province = provinceEntry ? provinceEntry.text : '';

  // Only the first city group belonging to the province is searched.
  const cityGroup = cities.find(group => group.parentCode.slice(0, 2) === provinceCode2);
  const cityEntry = cityGroup
    ? cityGroup.data.find(item => item.code.slice(0, 4) === cityCode4)
    : undefined;
  const city = cityEntry ? cityEntry.text : '';

  spinner3.text = chalk.blue(`正在抓取 ${province}/${city}/${area.text} 的乡镇数据:${url}`);

  // https://github.com/puppeteer/puppeteer/blob/v2.0.0/docs/api.md#pageevaluatepagefunction-args
  // Only this page's rows cross the browser boundary; the accumulated `towns`
  // list is no longer serialised into the page and back on every call.
  const rows = await page.evaluate(() => {
    return [...document.querySelectorAll('.towntable .towntr')].map(el => {
      const [code, name] = el.innerText.split('\t');
      return {
        code,
        // Guard against rows without a name column, which used to throw here.
        text: (name || '').replace('办事处', ''),
        hasChildren: false, // township level reached: no further descent
      };
    });
  });
  towns.push({ parentCode: area.code, data: rows });
}
// Last-resort failure handler: any promise rejection nobody caught is reported
// together with the URL that was being fetched, then the process exits with status 1.
process.on('unhandledRejection', function reportAndExit (reason) {
  const banner = chalk.red(`抓取数据失败,失败链接: ${url}\n`);
  console.log('\n', banner, reason.message);
  process.exit(1);
});
/**
 * Crawl driver. Runs three sequential passes over one Puppeteer page:
 *   1. city level     -> ./data/cities.js
 *   2. county level   -> ./data/areas.js
 *   3. township level -> ./data/township.js
 *
 * Every page is retried until it succeeds (the retry exists mainly for
 * "Navigation Timeout Exceeded" errors). The browser is always closed, even
 * when the crawl aborts with an error.
 */
(async () => {
  // Pause before re-attempting a failed page, so that errors which fail
  // instantly do not turn the retry loop into a tight loop against the server.
  // `timeout` comes from ./utils and is awaited as a delay (presumably milliseconds).
  const RETRY_DELAY_MS = 3000;

  spinner1.start(chalk.blue('开始抓取市区数据....'));
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    if (!cities.length) {
      for (const pcode of pcodes) {
        console.log('\n 市区代码:', pcode, "\n");
        await timeout(1500);
        // Retry sentinel: negative while the page is still pending, flipped to
        // positive on success. A code that converts to 0 or NaN never enters the loop.
        let tempExeCode = -Number(pcode);
        console.log(typeof tempExeCode, tempExeCode);
        while (tempExeCode < 0) {
          const [err, data] = await awaitTo(getCitiesByPCode(page, pcode));
          if (err) {
            console.log('\n', chalk.red(`抓取城市数据失败,失败链接: ${url},错误信息: ${err.message},正在重试....\n`));
            await timeout(RETRY_DELAY_MS);
          } else {
            tempExeCode = -tempExeCode;
            console.log("\n", '抓取成功!', tempExeCode, data);
          }
        }
      }
      writeFileSync('./data/cities.js', cities);
      spinner1.succeed(chalk.green('市区数据抓取完毕,开始抓取县区数据....'));
    } else {
      spinner1.succeed(chalk.green('市区数据已经抓取过,开始抓取县区数据....'));
    }

    // ----------------------------------------------------------
    type = 1;
    console.log('\n');
    spinner2.start(chalk.blue('正在抓取县区数据....'));
    if (!areas.length) {
      for (const provinceCity of cities) {
        // provinceCity holds the city rows of one province.
        const parentCode = provinceCity.parentCode;
        // Array.isArray also rejects null, which `typeof ... === 'object'` let through.
        if (!Array.isArray(provinceCity.data) || provinceCity.data.length === 0) {
          console.error('县区数据结构错误!');
          continue;
        }
        for (const city of provinceCity.data) {
          console.log(city);
          await timeout(3000);
          let tempExeCode = -Number(city.code);
          while (tempExeCode < 0) {
            const [err, data] = await awaitTo(getAreasByCCode(page, city, parentCode));
            console.log(err, data);
            if (err) {
              console.log('\n', chalk.red(`抓取县区数据失败,失败链接: ${url},错误信息: ${err.message},正在重试....\n`));
              await timeout(RETRY_DELAY_MS);
            } else {
              tempExeCode = -tempExeCode;
              console.log("\n抓取成功!", tempExeCode, data);
            }
          }
        }
      }
      writeFileSync('./data/areas.js', areas);
      spinner2.succeed(chalk.green('县区数据抓取完毕'));
    } else {
      spinner2.succeed(chalk.green('县区数据已经抓取过,开始抓取乡镇数据....'));
    }

    // ----------------------------------------------------------
    type = 2;
    console.log('\n');
    spinner3.start(chalk.blue('正在抓取乡镇数据....'));
    for (const tempAreaList of areas) {
      if (!Array.isArray(tempAreaList.data) || tempAreaList.data.length === 0) {
        console.error("区县数据结构错误");
        continue;
      }
      for (const tempArea of tempAreaList.data) {
        console.log(tempArea);
        if (!tempArea.hasChildren) {
          // Rows flagged as having no sub-level are not visited.
          console.log(`---${tempArea.text}_${tempArea.code}无子数据,已跳过!!!---`);
          continue;
        }
        await timeout(3000);
        let tempExeCode = -Number(tempArea.code);
        while (tempExeCode < 0) {
          console.log('\n>>>>>>', tempArea);
          const [err, data] = await awaitTo(getTownshipByCCode(page, tempArea));
          if (err) {
            console.log('\n', chalk.red(`抓取乡镇数据失败,失败链接: ${url},错误信息: ${err.message},正在重试....\n`));
            await timeout(RETRY_DELAY_MS);
          } else {
            tempExeCode = -tempExeCode;
            console.log("\n抓取成功!", tempExeCode, data);
          }
        }
      }
    }
    writeFileSync('./data/township.js', towns);
    spinner3.succeed(chalk.green('乡镇数据抓取完毕'));
  } finally {
    // Release the browser process on both the success and the failure path.
    await browser.close();
  }
})();
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。