1 Star 1 Fork 1

似水流年/基于nodejs的wallhaven.cc小爬虫

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
my.js 5.43 KB
一键复制 编辑 原始数据 按行查看 历史
似水流年 提交于 2022-01-19 02:58 . update my.js.
// Search endpoint of wallhaven.cc; the keyword and the remaining query
// parameters are appended to it by the functions below. The trailing comment
// on the next line shows a sample keyword followed by those parameters.
let baseUrl = 'https://wallhaven.cc/search?q='//earth&categories=111&purity=110&sorting=relevance&order=desc'
const path = require('path')
const cheerio = require('cheerio')
const fs = require('fs')
const request = require('request')
const readline = require('readline');
// Directory next to this script that downloaded images are written into.
const imgDir = path.join(__dirname, 'img')
// Empty keyword placeholder; only used to build the initial value of `url`.
var search = ''
// Number of result pages; recomputed by getAllPage from the result total.
var allPage = 1
// Image records ({title, name}) gathered across pages by getAllImgList.
var allImg = []
// Search URL template with an empty keyword and no page number appended yet.
var url = baseUrl + search + '&categories=111&purity=110&sorting=relevance&order=desc&page='
// Create the download directory at startup.
// It has to be imgDir (next to this script), because that is where the images
// are written; './img' would be resolved against the current working
// directory and could end up somewhere else entirely.
fs.mkdir(imgDir, (error) => {
  // An already existing directory is the normal case on every run but the first.
  if (error && error.code !== 'EEXIST') {
    console.error('创建图片目录失败: ' + imgDir, error)
  }
});
/**
 * Look up how many result pages a keyword has and decide how many to crawl.
 *
 * The result total is read from the first space-separated token of the page's
 * <h1> text (thousands separators removed) and divided into pages of 24.
 * When there is more than one page the user is asked how many pages to fetch.
 * The module-level `allPage` is updated with the total page count.
 *
 * @param {string} word search keyword as typed by the user
 * @returns {Promise<number>} number of pages to fetch: 0 when nothing was
 *   found, 1 for a single page, otherwise the page count chosen by the user.
 *   Rejects when the request fails or the total cannot be read from the page.
 */
function getAllPage(word) {
  // encodeURIComponent escapes non-ASCII input as well as '&', '#' and '+',
  // which would otherwise be interpreted as part of the query string.
  const realUrl = baseUrl + encodeURIComponent(word) + '&categories=111&purity=110&sorting=relevance&order=desc'
  return new Promise((resolve, reject) => {
    request(realUrl, function (err, res, body) {
      if (err || !res) {
        reject(err || new Error('搜索请求没有返回响应'))
        return
      }
      const $ = cheerio.load(body)
      const countToken = $('h1').text().trim().split(' ')[0]
      const total = parseInt(countToken.split(',').join(''), 10)
      if (Number.isNaN(total)) {
        // Happens for error pages (rate limit, block page, layout change).
        reject(new Error('无法解析图片总数 (HTTP ' + res.statusCode + ')'))
        return
      }
      if (total === 0) {
        console.log('未搜索到图片')
        resolve(0)
        return
      }
      allPage = Math.ceil(total / 24)
      if (allPage === 1) {
        console.log('获取到' + allPage + '页图片')
        resolve(allPage)
        return
      }
      console.log('获取到' + allPage + '页图片,每页最多24张图片')
      askPageCount(allPage, resolve)
    })
  })
}

// Keep prompting until the user enters a whole number in 1..maxPage, then pass it to done.
function askPageCount(maxPage, done) {
  speakOne.question('大兄弟你要撸多少页:', (answer) => {
    const pages = Number(answer.trim())
    if (!Number.isInteger(pages) || pages < 1) {
      console.log('请输入数字')
      askPageCount(maxPage, done)
    } else if (pages > maxPage) {
      console.log('输入的页数大于总页数')
      askPageCount(maxPage, done)
    } else {
      done(pages)
    }
  })
}
/**
 * Fetch one search-result page and collect the images listed on it.
 *
 * The page lazy-loads its thumbnails, so the address is taken from each
 * <img>'s `data-src` attribute rather than `src`. The 5th and 6th
 * '/'-separated segments of that address become `title` and `name`
 * (the latter with its file extension removed).
 *
 * @param {number} page 1-based result page number
 * @param {string} word search keyword
 * @returns {Promise<Array<{title: string, name: string}>>} one record per
 *   usable thumbnail; rejects when the request fails.
 */
function getPageImgList(page, word) {
  // Escape the keyword so non-ASCII input and characters such as '&' or '#'
  // cannot corrupt the query string.
  const pageUrl = baseUrl + encodeURIComponent(word) + '&categories=111&purity=110&sorting=relevance&order=desc&page=' + page
  return new Promise((resolve, reject) => {
    request(pageUrl, function (err, res, body) {
      if (err || !res) {
        reject(err || new Error('第' + page + '页请求没有返回响应'))
        return
      }
      const $ = cheerio.load(body)
      const imgList = []
      $('img').each(function () {
        const thumbSrc = $(this).attr('data-src')
        if (!thumbSrc) {
          return
        }
        const parts = thumbSrc.split('/')
        // Skip addresses that do not have the expected number of segments
        // instead of throwing inside the request callback.
        if (parts.length < 6) {
          return
        }
        imgList.push({
          title: parts[4],
          name: parts[5].split('.')[0]
        })
      })
      resolve(imgList)
    })
  })
}
/**
 * Collect the image records of every requested result page and start the
 * downloads.
 *
 * Pages are fetched one after another; each page's records are appended to
 * the module-level `allImg`, which is then handed to downloadImg. A page that
 * fails is reported and skipped so the remaining pages are still processed.
 *
 * @param {string} word search keyword
 * @returns {Promise<void>} settles once the downloads have been scheduled
 */
async function getAllImgList(word) {
  let pageCount
  try {
    pageCount = await getAllPage(word)
  } catch (err) {
    console.error('获取图片页数失败:', err)
    return
  }
  // Nothing to fetch (no results, or an unusable page count).
  if (!(Number(pageCount) >= 1)) {
    return
  }
  console.log('获取图片地址中...')
  for (let page = 1; page <= pageCount; page++) {
    console.log('获取第' + page + '页图片地址')
    try {
      const pageImgs = await getPageImgList(page, word)
      // Append in place instead of re-copying the whole list on every page.
      allImg.push(...pageImgs)
    } catch (err) {
      console.error('获取第' + page + '页图片地址失败:', err)
    }
  }
  console.log('准备下载中...')
  downloadImg(allImg)
}
/**
 * Download the full-size image of every record into imgDir.
 *
 * Downloads are spaced 3 seconds apart so the crawler is less likely to
 * trigger the site's anti-crawler protection. Each image is requested once as
 * a .jpg; a 404 means the original has another extension (for example .png),
 * which cannot be determined from the thumbnail, so that image is skipped.
 *
 * @param {Array<{title: string, name: string}>} list records produced by getPageImgList
 * @returns {void}
 */
function downloadImg(list) {
  list.forEach((img, i) => {
    setTimeout(() => {
      const imgUrl = 'https://w.wallhaven.cc/full/' + img.title + '/wallhaven-' + img.name + '.jpg'
      // encoding: null makes request deliver the body as a Buffer, so the
      // bytes of this single response can be saved without a second request.
      request(imgUrl, { encoding: null }, (err, res, body) => {
        if (err || !res) {
          console.error('下载失败: ' + imgUrl, err)
          return
        }
        console.log(res.statusCode)
        if (res.statusCode === 404) {
          console.log('未知图片类型')
          return
        }
        if (res.statusCode !== 200) {
          // Do not save error pages (403, 429, 5xx ...) as image files.
          console.error('下载失败 (HTTP ' + res.statusCode + '): ' + imgUrl)
          return
        }
        console.log(imgUrl)
        const target = path.join(imgDir, i + img.name + '.' + 'jpg')
        fs.writeFile(target, body, (writeErr) => {
          if (writeErr) {
            console.error('写入图片失败: ' + target, writeErr)
          }
        })
      })
    }, 3000 * i)
  })
}
// Command-line interface shared by the keyword prompt below and the
// page-count prompt in getAllPage.
const speakOne = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
});
// Entry point: ask for the search keyword and start crawling.
speakOne.question('请输入搜索关键字(只支持英文关键字,如girl、boy,英文不好去百度):', (line) => {
  // The interface is intentionally not closed here: getAllPage still needs it
  // to ask how many pages to fetch.
  // getAllImgList is async, so a failure must be handled on the returned
  // promise; otherwise it would surface as an unhandled rejection.
  getAllImgList(line.trim()).catch((err) => {
    console.error('抓取失败:', err)
  })
})
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/GentleLiu/wallhaven.git
git@gitee.com:GentleLiu/wallhaven.git
GentleLiu
wallhaven
基于nodejs的wallhaven.cc小爬虫
master

搜索帮助