验证中...
Languages: JavaScript
Categories: JavaScript 工具
Latest update 2019-06-05 16:27
node获取指定网页数据
Raw Copy
const fetch = require('node-fetch'); // 轻量级的http请求模块
var cheerio = require('cheerio'); // 快速,灵活和精简的核心jQuery实现,专为服务器而设计。
var fs = require('fs'); // 引入fs模块
var Excel = require('exceljs'); // 引入模块
let data = []; // Article records collected for the listing page currently being processed
let orderNum = 1; // Running counter shown in the progress log messages
let workbook = new Excel.Workbook(); // Workbook instance that is saved to disk
let sheetName = workbook.addWorksheet("中国税网"); // Worksheet named "中国税网" that article rows are appended to
/**
 * Append every record currently held in `data` to the worksheet, clear
 * `data`, and save the workbook to "中国税网.xlsx".
 *
 * @returns {Promise<void>} Settles once the save attempt has finished. A
 *   failed save is logged instead of rejecting, so callers may safely ignore
 *   the returned promise.
 */
let writTxt = function () {
  // One spreadsheet row per record: title, date, source, href, content.
  data.forEach(item => {
    sheetName.addRow([item.title, item.date, item.source, item.href, item.content]);
  });
  // The rows now live in the worksheet; start the next batch from an empty list.
  data = [];
  return workbook.xlsx.writeFile("中国税网.xlsx")
    .then(function () {
      console.log('中国税网.xlsx');
    })
    .catch(function (err) {
      // Without this handler a failed save surfaces as an unhandled rejection.
      console.log("出现错误!", err);
    });
};
/**
 * Fetch the article page for `data[index]`, copy its title, publish time,
 * source and paragraph text onto that record, then move on to the next step.
 *
 * The next step runs exactly once per call, whether the fetch/parse succeeded
 * or failed: either the next link on the same page, or (after the last link)
 * `writTxt()` followed by `getPage()` for the next listing page.
 *
 * @param {number} index      Position in `data` of the record to fill in.
 * @param {number} aryLength  Number of links queued for the current page.
 * @param {number} currPage   Listing page currently being processed.
 * @param {number} totalPage  Last listing page to process.
 * @returns {Promise<void>} Settles after this record has been handled and the
 *   next step has been started; it never rejects.
 */
let getUrlContent = function (index, aryLength, currPage, totalPage) {
  const obj = data[index];

  // Single continuation shared by the success and failure paths, so a record
  // can never be counted or advanced twice.
  const advance = () => {
    console.log('正在请求第【' + orderNum + '】条');
    orderNum++;
    if (index === aryLength - 1) {
      writTxt();
      if (currPage < totalPage) {
        getPage(currPage + 1);
      }
    } else {
      runGetUrl(index + 1, aryLength, currPage, totalPage);
    }
  };

  return fetch(obj.href, { timeout: 30000 })
    .then(res => res.text())
    .then(body => {
      const $ = cheerio.load(body);
      const article = $(".article-wrap");
      const contentPs = article.find(".article-main").children('p');
      let contentStr = '';
      obj.title = article.children("h1").text();
      obj.date = article.find(".publish-time").text();
      obj.source = article.find(".source").text().replace(/来源:/g, '');
      // Join the non-empty paragraphs, one per line.
      contentPs.each(i => {
        const text = contentPs.eq(i).text();
        if (text) {
          contentStr += text + "\r\n";
        }
      });
      obj.content = contentStr;
    })
    .catch(err => {
      // A failed fetch/parse only costs this record; the crawl continues.
      console.log("出现错误!", err);
    })
    .then(advance)
    .catch(err => {
      // Errors raised while starting the next step are logged, not retried.
      console.log("出现错误!", err);
    });
};
/**
 * Start processing one queued article link by forwarding all four arguments,
 * unchanged, to getUrlContent. Nothing is returned.
 *
 * @param {number} index      Position in `data` of the link to process.
 * @param {number} aryLength  Number of links queued for the current page.
 * @param {number} currPage   Listing page currently being processed.
 * @param {number} totalPage  Last listing page to process.
 */
let runGetUrl = (index, aryLength, currPage, totalPage) => {
  getUrlContent(index, aryLength, currPage, totalPage);
};
// x-www-form-urlencoded
const { URLSearchParams } = require('url');
/**
 * Fetch one listing page, queue the article links found under
 * ".newsList-main .title a" into `data`, and start processing them from the
 * first queued link.
 *
 * @param {number} [currPage=1] Listing page to fetch. Page 1 is
 *   "node_27.html"; later pages are "node_27_<page>.html".
 * @returns {Promise<void>} Settles once the page has been handled. A failed
 *   request is logged instead of rejecting.
 */
let getPage = function (currPage) {
  const page = Number(currPage) || 1;
  // Last listing page to crawl; handed down so the article step knows when to stop.
  const totalPage = 10;
  const pageSuffix = page === 1 ? '' : '_' + page;
  return fetch(`http://www.ctaxnews.com.cn/node_27${pageSuffix}.html`, { method: 'get', timeout: 30000 })
    .then(res => res.text())
    .then(body => {
      const $ = cheerio.load(body);
      const trs = $(".newsList-main").find(".title");
      let queued = 0;
      for (let index = 0; index < trs.length; index++) {
        const href = trs.eq(index).find('a').attr("href");
        // A title without a link has nothing to fetch; skip it.
        if (!href) {
          continue;
        }
        data.push({ href });
        queued++;
      }
      // Start only after every link is queued, and only if there is one.
      if (queued > 0) {
        // Arguments: start index in data, queue length, current page, last page.
        runGetUrl(0, data.length, page, totalPage);
      }
    })
    .catch(err => {
      // Without this handler a failed page request is an unhandled rejection.
      console.log("出现错误!", err);
    });
};
// Start crawling from the first listing page.
getPage(1);

Comment list( 0 )

You need to sign in to post a comment

Help Search

Gitee_sixth 5th_float_left_close