diff --git a/wenlin/app.js b/wenlin/app.js
new file mode 100644
index 0000000000000000000000000000000000000000..3f57fc90a83adb008925e6c6c53983540274f50f
--- /dev/null
+++ b/wenlin/app.js
@@ -0,0 +1,23 @@
+'use strict';
+
+// Entry point: recreate the article table, then queue every listing page.
+const domdate = require('./crawler/domdata');
+const article = require('./model/article');
+const fs = require('fs');
+
+const url = 'https://www.cnblogs.com/sitehome/p/';
+const PAGE_COUNT = 200;
+
+// sync() returns a promise: queue the pages only once it has resolved, so the
+// crawler cannot insert rows before the table has been recreated.
+article
+    .sync({ force: true })
+    .then(() => {
+        for (let i = 1; i <= PAGE_COUNT; i++) {
+            domdate(url + i);
+        }
+    })
+    .catch((err) => {
+        console.error(`同步数据表失败:${err}`);
+        process.exitCode = 1;
+    });
\ No newline at end of file
diff --git a/wenlin/config.js b/wenlin/config.js
new file mode 100644
index 0000000000000000000000000000000000000000..264cb3dd869ce8ed0cdfc59878fb1f22418705b1
--- /dev/null
+++ b/wenlin/config.js
@@ -0,0 +1,15 @@
+// Database connection settings.
+// Every value can be overridden through an environment variable so that real
+// credentials need not be committed; the literals are the hard-coded defaults.
+
+const env = process.env;
+
+const config = {
+    database: env.DB_NAME || 'CommonPermission', // database name
+    userName: env.DB_USER || 'sa', // login user
+    password: env.DB_PASSWORD || '123456', // login password
+    host: env.DB_HOST || 'localhost', // database server host
+    dialect: env.DB_DIALECT || 'mssql' // SQL dialect
+};
+
+module.exports = config;
\ No newline at end of file
diff --git a/wenlin/crawler/domdata.js b/wenlin/crawler/domdata.js
new file mode 100644
index 0000000000000000000000000000000000000000..8b577ffe5ab1a12b9eca9de834912fe39c91abfb
--- /dev/null
+++ b/wenlin/crawler/domdata.js
@@ -0,0 +1,103 @@
+'use strict';
+// Crawl cnblogs listing pages: save each post's title, author and release
+// time through the article model and download the images found in the list.
+
+const Crawler = require('crawler');
+const article = require('../model/article');
+const http = require('http');
+const path = require('path');
+const fs = require('fs');
+const { URL } = require('url');
+const request = require('request');
+const url = 'https://www.cnblogs.com'
+
+// The footer is presumed to read "<author> 发布于 <time>" - confirm against the
+// live page. Captures the text before the marker and the rest of the
+// marker's line.
+const FOOT_PATTERN = /^([\s\S]*?)发布于\s*([^\r\n]*)/;
+
+/**
+ * Split a post footer into author and release time.
+ * @param {string} footText - trimmed text of the `.post_item_foot` element
+ * @returns {{author: string, releasetime: (string|null)}} releasetime is null
+ *   when the footer has no "发布于" marker or nothing follows it
+ */
+function parseFoot(footText) {
+    const parts = FOOT_PATTERN.exec(footText);
+    if (!parts) {
+        return { author: footText.replace(/[\n\r]/g, '').trim(), releasetime: null };
+    }
+    return {
+        author: parts[1].replace(/[\n\r]/g, '').trim(),
+        releasetime: parts[2].trim() || null
+    };
+}
+
+/**
+ * Download one image from the list into ./img as "<index><ext>".
+ * NOTE(review): the index restarts at 0 on every page, so pages overwrite
+ * each other's files; the naming is left unchanged here - confirm intent.
+ * @param {string|undefined} src - value of the img element's src attribute
+ * @param {number} index - position of the image within the page
+ */
+function saveImage(src, index) {
+    if (!src) {
+        return; // <img> without a src attribute: nothing to download
+    }
+    let imageUrl;
+    try {
+        // Resolves protocol-relative ("//host/a.png") and relative sources too.
+        imageUrl = new URL(src, url);
+    } catch (err) {
+        console.log(`错误信息:${err}`);
+        return;
+    }
+    // extname looks at the path only, so a query string cannot leak into the name.
+    const target = path.join(__dirname, 'img', index + path.extname(imageUrl.pathname));
+    const logFailure = (err) => console.log(`下载图片失败:${target} ${err}`);
+    // Without 'error' listeners a failed download or write would crash the process.
+    request(imageUrl.href)
+        .on('error', logFailure)
+        .pipe(fs.createWriteStream(target))
+        .on('error', logFailure);
+    console.log(target);
+}
+
+const crawler = new Crawler({
+    maxConnections: 100,
+    callback: function (error, res, done) {
+        if (error) {
+            console.log(`错误信息:${error}`)
+        } else {
+            const $ = res.$;
+            const list = $('#post_list .post_item').toArray();
+            console.log(list.length);
+            list.forEach(element => {
+                const txt = $(element);
+                const title = txt.find('.post_item_body h3 a').text();
+                const foot = parseFoot(txt.find('.post_item_body .post_item_foot ').text().trim());
+                console.log(title);
+                article.create({
+                    title: `《${title}》`,
+                    author: foot.author,
+                    releasetime: foot.releasetime
+                }).then((row) => {
+                    console.log(`添加到数据库的Id为:${row.id}`);
+                }).catch((err) => {
+                    // A rejected insert is logged so the remaining posts are still saved.
+                    console.log(`添加到数据库失败:${err}`);
+                });
+            });
+
+            // Download the author avatars
+            $('#post_list img').each(function (i, elem) {
+                saveImage($(this).attr('src'), i);
+            })
+        }
+        done();
+    }
+})
+
+module.exports = (url) => {
+    crawler.queue(url)
+}
diff --git a/wenlin/crawler/img/0.png b/wenlin/crawler/img/0.png
new file mode 100644
index 0000000000000000000000000000000000000000..c86bad64bd65308fd725226bfbaf9e4bcc164016
Binary files /dev/null and b/wenlin/crawler/img/0.png differ
diff --git a/wenlin/crawler/img/1.png b/wenlin/crawler/img/1.png
new file mode 100644
index 0000000000000000000000000000000000000000..511e15aefc0358d754fe2b4273f9f517414d4adf
Binary files /dev/null and b/wenlin/crawler/img/1.png differ
diff --git a/wenlin/crawler/img/10.png b/wenlin/crawler/img/10.png
new file mode 100644
index 0000000000000000000000000000000000000000..ffe73cc90e10cfb3c562575b704ebdb1683822ee
Binary files /dev/null and b/wenlin/crawler/img/10.png differ
diff --git a/wenlin/crawler/img/11.png b/wenlin/crawler/img/11.png
new file mode 100644
index 0000000000000000000000000000000000000000..92ac8d7707c7b6e1749f88c12957b0d079e01ea9
Binary files /dev/null and b/wenlin/crawler/img/11.png differ
diff --git a/wenlin/crawler/img/12.png b/wenlin/crawler/img/12.png
new file mode 100644
index 0000000000000000000000000000000000000000..94995b53e2014aada3cefc28475984944fc2aee6
Binary files /dev/null and b/wenlin/crawler/img/12.png
differ diff --git a/wenlin/crawler/img/13.png b/wenlin/crawler/img/13.png new file mode 100644 index 0000000000000000000000000000000000000000..adf193a61ce8c74161d8d91a5d35afe30979cfd4 Binary files /dev/null and b/wenlin/crawler/img/13.png differ diff --git a/wenlin/crawler/img/14.png b/wenlin/crawler/img/14.png new file mode 100644 index 0000000000000000000000000000000000000000..dd1bcab7ff972a45ed6b8eeb6aaa62bc443a1acf Binary files /dev/null and b/wenlin/crawler/img/14.png differ diff --git a/wenlin/crawler/img/15.png b/wenlin/crawler/img/15.png new file mode 100644 index 0000000000000000000000000000000000000000..c678eb6540019aaf89b1b473bcc7103231a89e16 Binary files /dev/null and b/wenlin/crawler/img/15.png differ diff --git a/wenlin/crawler/img/16.png b/wenlin/crawler/img/16.png new file mode 100644 index 0000000000000000000000000000000000000000..a95858901aadbbf8356583e7e3d23ae8fff753cd Binary files /dev/null and b/wenlin/crawler/img/16.png differ diff --git a/wenlin/crawler/img/17.png b/wenlin/crawler/img/17.png new file mode 100644 index 0000000000000000000000000000000000000000..e053c4a00498145c77252818fe7b22781a567cc2 Binary files /dev/null and b/wenlin/crawler/img/17.png differ diff --git a/wenlin/crawler/img/2.png b/wenlin/crawler/img/2.png new file mode 100644 index 0000000000000000000000000000000000000000..6fd8d86d57c12b536f2e5f03d24aaa9af81b3b56 Binary files /dev/null and b/wenlin/crawler/img/2.png differ diff --git a/wenlin/crawler/img/3.png b/wenlin/crawler/img/3.png new file mode 100644 index 0000000000000000000000000000000000000000..a4670cb68f5f861c4e782904195726693a468851 Binary files /dev/null and b/wenlin/crawler/img/3.png differ diff --git a/wenlin/crawler/img/4.png b/wenlin/crawler/img/4.png new file mode 100644 index 0000000000000000000000000000000000000000..b29dbc411491d54b2c4a3bb8fad4a1ebe44ea3ea Binary files /dev/null and b/wenlin/crawler/img/4.png differ diff --git a/wenlin/crawler/img/5.png b/wenlin/crawler/img/5.png new file mode 100644 
index 0000000000000000000000000000000000000000..c40f64fb8b1933558511f785b5953d5afdd04b8a
Binary files /dev/null and b/wenlin/crawler/img/5.png differ
diff --git a/wenlin/crawler/img/6.png b/wenlin/crawler/img/6.png
new file mode 100644
index 0000000000000000000000000000000000000000..18a9cc38584845db592d3914a924a51f817ea17e
Binary files /dev/null and b/wenlin/crawler/img/6.png differ
diff --git a/wenlin/crawler/img/7.png b/wenlin/crawler/img/7.png
new file mode 100644
index 0000000000000000000000000000000000000000..1cf8d383eb90794a796000aa53ef650c711fcfcf
Binary files /dev/null and b/wenlin/crawler/img/7.png differ
diff --git a/wenlin/crawler/img/8.png b/wenlin/crawler/img/8.png
new file mode 100644
index 0000000000000000000000000000000000000000..d50332b1cc3ab2afd6c51badff711b937e2b8624
Binary files /dev/null and b/wenlin/crawler/img/8.png differ
diff --git a/wenlin/crawler/img/9.png b/wenlin/crawler/img/9.png
new file mode 100644
index 0000000000000000000000000000000000000000..1ab816d84a9f4a8a5b1e7367f8a04d4711c040d5
Binary files /dev/null and b/wenlin/crawler/img/9.png differ
diff --git a/wenlin/db.js b/wenlin/db.js
new file mode 100644
index 0000000000000000000000000000000000000000..ef854955447475e89f77a1b446b5ab3fbfa92890
--- /dev/null
+++ b/wenlin/db.js
@@ -0,0 +1,19 @@
+'use strict';
+// Open the database connection and log whether the connection test succeeds.
+
+const { database, userName, password, host, dialect } = require('./config');
+const Sequelize = require('sequelize');
+
+const connectionOptions = { host, dialect };
+const sequelize = new Sequelize(database, userName, password, connectionOptions);
+
+// Connection test: the outcome is only logged, the instance is exported either way.
+const reportSuccess = () => {
+    console.log(`链接数据库${database}成功!`);
+};
+const reportFailure = (reason) => {
+    console.log(`链接失败:${reason}`);
+};
+sequelize.authenticate().then(reportSuccess).catch(reportFailure);
+
+module.exports = sequelize;
\ No newline at end of file
diff --git a/wenlin/model/article.js b/wenlin/model/article.js
new file mode 100644
index 0000000000000000000000000000000000000000..577192d03f0abe015b06c851f9e7698bfd650db6
--- /dev/null
+++ b/wenlin/model/article.js
@@ -0,0 +1,16 @@
+'use strict';
+// Define the model backing the article table.
+
+const Sequelize = require('sequelize');
+const sequelize = require('../db');
+
+// Three columns: title, author and release time.
+const columns = {
+    title: { type: Sequelize.STRING(100), allowNull: false },
+    author: { type: Sequelize.STRING(80) },
+    // original note on this column: datetimeoffset(7)
+    releasetime: { type: Sequelize.DATE }
+};
+
+// Export the defined model.
+module.exports = sequelize.define('article', columns);
\ No newline at end of file