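// [Hosting-page notice captured with the file; not part of the program] Code pull complete; the page will refresh automatically.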
"use strict";
const express = require('express');
const app = express();
const phantom = require('phantom');
const fs = require('fs');
const http = require('http');
const https = require('https');
const mkdirp = require('mkdirp');
const bodyParser = require('body-parser');
const partials = require('express-partials');
const OSS = require('ali-oss');
// Aliyun OSS client used to publish downloaded images to the 'weigao' bucket.
//
// SECURITY: an account-level AccessKey has access to every API. Aliyun's own
// guidance is to use a RAM sub-account or STS on the server and STS on the
// client. The literals below are committed to source control and should be
// treated as leaked: rotate them and supply replacements through the
// OSS_ACCESS_KEY_ID / OSS_ACCESS_KEY_SECRET environment variables. The
// literals remain only as a fallback so existing deployments keep working.
let aliyunClient = new OSS.Wrapper({
    region: 'oss-cn-shanghai',
    accessKeyId: process.env.OSS_ACCESS_KEY_ID || 'LTAIAqkn3Y8stIRP',
    accessKeySecret: process.env.OSS_ACCESS_KEY_SECRET || 'u6jMObscFqsWOBrlFKDkiwjQ3MdHUF',
    bucket: 'weigao'
});
// Serve everything under ./target as static files; the /fetch handler
// writes its output directories there.
app.use(express.static('target'));

// Parse JSON bodies first, then URL-encoded form bodies.
app.use(bodyParser.json(), bodyParser.urlencoded({extended: true}));

// Register EJS as the renderer for views/*.ejs and make it the default engine.
const ejsRenderer = require('ejs').__express;
app.engine('.ejs', ejsRenderer);
app.set('views', `${__dirname}/views`);
app.set('view engine', 'ejs');

// express-partials middleware is mounted after the engine is configured.
app.use(partials());
// GET / — plain-text greeting.
app.get('/', (req, res) => {
    res.send('Welcome to WEIGAO');
});
// GET /redirect?url=... — renders the 'index' view, handing it the `url`
// query parameter unchanged.
app.get('/redirect', (req, res) => {
    // Pull the target out of the query string.
    const {url} = req.query;
    res.render('index', {url});
});
// Piwik tracking snippet appended to every saved snapshot after the page's
// own <script> tags have been stripped.
const SNAPSHOT_PIWIK_SNIPPET = "<!-- Piwik -->\n" +
    "<script type=\"text/javascript\">\n" +
    " var _paq = _paq || [];\n" +
    " /* tracker methods like \"setCustomDimension\" should be called before \"trackPageView\" */\n" +
    " _paq.push(['trackPageView']);\n" +
    " _paq.push(['enableLinkTracking']);\n" +
    " (function() {\n" +
    " var u=\"//shop1.foreveralone.wang/\";\n" +
    " _paq.push(['setTrackerUrl', u+'piwik.php']);\n" +
    " _paq.push(['setSiteId', '1']);\n" +
    " var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];\n" +
    " g.type='text/javascript'; g.async=true; g.defer=true; g.src=u+'piwik.js'; s.parentNode.insertBefore(g,s);\n" +
    " })();\n" +
    "</script>\n" +
    "<!-- End Piwik Code -->";

// Upper bound on viewport-sized scroll steps, so a page whose height keeps
// growing (or a zero-height viewport) cannot keep a snapshot running forever.
const SNAPSHOT_MAX_SCROLL_STEPS = 100;

/**
 * Creates `dir` (and missing parents) via the callback-style mkdirp API.
 * @param {string} dir - Directory to create.
 * @returns {Promise<void>} Resolves once the directory exists.
 */
function ensureSnapshotDir(dir) {
    return new Promise((resolve, reject) => {
        mkdirp(dir, (err) => {
            if (err) {
                reject(err);
                return;
            }
            console.log('dir created');
            resolve();
        });
    });
}

/**
 * Promise wrapper around fs.writeFile, so that write failures travel down
 * the promise chain instead of being thrown from inside an fs callback.
 * @param {string} file - Destination path.
 * @param {string|Buffer} data - Content to write.
 * @returns {Promise<void>}
 */
function writeSnapshotFile(file, data) {
    return new Promise((resolve, reject) => {
        fs.writeFile(file, data, (err) => {
            if (err) {
                reject(err);
            } else {
                resolve();
            }
        });
    });
}

/**
 * @param {number} ms - Milliseconds to wait.
 * @returns {Promise<void>} Resolves after `ms` without blocking the event loop.
 */
function snapshotDelay(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
}

/* ------------------------------------------------------------------------
 * The functions below are handed to page.evaluate(): they are serialised and
 * executed inside the PhantomJS page, not in Node. They must therefore be
 * self-contained (no closures over Node variables) and stick to ES5 syntax.
 * ---------------------------------------------------------------------- */

/** In-page: returns {title, favicon} for the loaded document. */
function readTitleAndFavicon() {
    var title = document.title;
    var favicon = "";
    var link = document.querySelector("link[rel*='shortcut icon']");
    if (link) {
        favicon = link.href;
    } else {
        // No explicit icon: fall back to /favicon.ico on the page's origin.
        var url = location.protocol + "//" + location.host;
        if (location.port)
            url += ":" + location.port;
        favicon = url + "/favicon.ico";
    }
    return {"title": title, "favicon": favicon};
}

/**
 * In-page: scrolls one viewport further down from `offset`.
 * Returns the new offset and whether the bottom of the document was reached.
 */
function scrollOneViewport(offset) {
    var body = document.body;
    var html = document.documentElement;
    var height = Math.max(body.scrollHeight, body.offsetHeight,
        html.clientHeight, html.scrollHeight, html.offsetHeight);
    console.log("scroll from " + offset);
    var next = offset + window.innerHeight;
    console.log("scroll to " + next);
    window.scroll(0, next);
    return {offset: next, done: !(next < height)};
}

/**
 * In-page: rewrites every <img> to point at its future OSS location and
 * returns one descriptor per image ({newName, url, imgName, id, position}).
 * Images whose src is missing or an inline base64 GIF placeholder take their
 * source from the data-src attribute instead.
 */
function collectAndRewriteImages() {
    var start = Date.now();
    var images = document.getElementsByTagName("img");
    console.log("images size is :" + images.length);
    var arrayOfUrls = [];
    for (var i = 0; i < images.length; i++) {
        var img = images[i];
        var currentSrc = img.getAttribute("src");
        if (currentSrc == undefined || currentSrc.indexOf("image/gif;base64") > 0) {
            console.log("Img origin is " + i + ": " + currentSrc);
            img.setAttribute("src", img.getAttribute("data-src"));
        }
        console.log("Img now is " + i + ": " + img.getAttribute("src"));
        var imgPath = img.getAttribute("src");
        var imgName = "img-" + i + ".png";
        var newName = new Date().getTime() + "_" + Math.random() + "_" + imgName;
        img.setAttribute("source-src", imgPath);
        img.setAttribute("id", "image-load-" + i);
        // Point the element at the object the server is about to upload.
        img.setAttribute("src", "http://weigao.oss-cn-shanghai.aliyuncs.com/" + newName);
        img.setAttribute("class", "");
        arrayOfUrls.push({
            "newName": newName,
            "url": imgPath,
            "imgName": imgName,
            "id": "image-load-" + i,
            "position": {
                top: img.offsetTop,
                left: img.offsetLeft,
                width: img.offsetWidth,
                height: img.offsetHeight
            }
        });
        console.log('Loading time ' + (Date.now() - start) / 1000 + ' sec');
    }
    return arrayOfUrls;
}

/** In-page: removes every <script> element from the document. */
function stripScriptTags() {
    while (document.getElementsByTagName("script").length > 0)
        document.getElementsByTagName("script")[0].outerHTML = "";
}

/** In-page: appends raw HTML to the end of <body>. */
function appendHtmlToBody(html) {
    document.body.innerHTML = document.body.innerHTML + html;
}

/** In-page: returns the serialised document. */
function readDocumentHtml() {
    console.log("Start download body");
    var content = document.documentElement.outerHTML;
    console.log("End download body");
    return content;
}

/**
 * Scrolls the page down one viewport at a time, pausing 2s in Node between
 * steps. The pause used to be a busy-wait inside the page, which blocked the
 * page's own JavaScript for the whole time it was meant to be reacting to
 * the scroll.
 * @param {object} page - phantom page.
 * @param {number} offset - Current scroll offset in pixels.
 * @param {number} stepsLeft - Remaining scroll steps allowed.
 * @returns {Promise<void>}
 */
function scrollThroughPage(page, offset, stepsLeft) {
    if (stepsLeft <= 0) {
        return Promise.resolve();
    }
    return page.evaluate(scrollOneViewport, offset).then((step) => {
        if (!step || step.done) {
            return undefined;
        }
        return snapshotDelay(2000).then(() => scrollThroughPage(page, step.offset, stepsLeft - 1));
    });
}

/**
 * Downloads one image into `dir` and uploads it to OSS under
 * `imageObj.newName`. Runs in the background: every failure is logged here
 * and never propagates, so one bad image cannot affect the others.
 * @param {object} imageObj - Descriptor produced by collectAndRewriteImages.
 * @param {string} pageUrl - URL of the snapshotted page; base for relative srcs.
 * @param {string} dir - Snapshot directory.
 */
function downloadAndUploadImage(imageObj, pageUrl, dir) {
    // A missing data-src surfaces as the 4-character string "null"; skip those.
    if (typeof imageObj.url !== 'string' || imageObj.url.length <= 4) {
        return;
    }
    console.log("downloading " + imageObj.url + "to " + imageObj.imgName);
    try {
        // Resolve relative and protocol-relative sources against the page URL.
        const absoluteUrl = require('url').resolve(pageUrl, imageObj.url);
        let client;
        if (/^https:/i.test(absoluteUrl)) {
            client = https;
        } else if (/^http:/i.test(absoluteUrl)) {
            client = http;
        } else {
            // e.g. data: URIs — nothing to fetch over the network.
            console.log('错误信息:' + 'unsupported image url ' + absoluteUrl);
            return;
        }
        client.get(absoluteUrl, (imageRes) => {
            if (imageRes.statusCode !== 200) {
                // Do not store an error/redirect page under an image name.
                console.log('错误信息:' + 'HTTP ' + imageRes.statusCode + ' for ' + absoluteUrl);
                imageRes.resume();
                return;
            }
            const chunks = [];
            imageRes.on('data', (chunk) => chunks.push(chunk));
            imageRes.on('error', (err) => console.log('错误信息:' + err));
            imageRes.on('end', () => {
                // Save the fetched bytes locally, then publish them to OSS.
                const localPath = dir + "/" + imageObj.imgName;
                fs.writeFile(localPath, Buffer.concat(chunks), (err) => {
                    if (err) {
                        console.log('出现错误!');
                        console.log(err);
                        return; // nothing on disk to upload
                    }
                    imageObj.path = localPath;
                    console.log('已输出至' + imageObj.path + '中');
                    aliyunClient.put(imageObj.newName, imageObj.path).then((r1) => {
                        console.log('put success: %j', r1);
                        console.log('put return url: %j', r1.res.requestUrls[0]);
                        imageObj.newUrl = r1.res.requestUrls[0];
                    }).catch((uploadErr) => {
                        console.error('error: %j', uploadErr);
                    });
                });
            });
        }).on('error', (err) => {
            console.log('错误信息:' + err);
        });
    } catch (e) {
        // http.get throws synchronously on malformed URLs.
        console.log(e);
    }
}

/**
 * POST /fetch — snapshots a remote page.
 *
 * Request body: `url` (required, http/https) and optional `body`, an HTML
 * fragment appended to the snapshot's <body>.
 *
 * Responds immediately with {url: "/<id>"}; the snapshot is produced in the
 * background under ./target/<id>/ (title.html, img-N.png, index.html), which
 * is served by the static middleware. Responds with "参数错误" when `url` is
 * missing or not an http(s) URL.
 *
 * Background steps: open the page in PhantomJS with a mobile viewport and
 * user agent, save title/favicon, wait 10s, scroll through the page, rewrite
 * image sources to OSS (downloading and uploading them in parallel), strip
 * scripts, append `body` and the Piwik snippet, save the HTML, and finally
 * shut PhantomJS down whether or not anything failed.
 *
 * SECURITY: `url` is untrusted. Restricting it to http(s) stops PhantomJS
 * from opening file:// and similar schemes, but this endpoint can still be
 * pointed at internal HTTP services — restrict who can call it.
 */
app.post('/fetch', function (req, res) {
    const url = req.body.url;
    console.log(url);
    if (typeof url !== 'string' || !/^https?:\/\//i.test(url)) {
        res.send("参数错误");
        return;
    }

    const timestamp = new Date().getTime() + "_" + Math.random();
    const body = req.body.body;
    const path = './target/' + timestamp;
    let phInstance = null;
    let page = null;

    ensureSnapshotDir(path)
        .then(() => phantom.create())
        .then((instance) => {
            phInstance = instance;
            return instance.createPage();
        })
        .then((createdPage) => {
            page = createdPage;
            return Promise.all([
                // Window size.
                page.property('viewportSize', {width: 414, height: 736, margin: 0}),
                // Browser request information.
                page.setting("userAgent", "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1"),
                // Kept as a plain function expression: it is passed to phantom.
                page.property('onConsoleMessage', function (msg) {
                    console.log(msg);
                })
            ]);
        })
        .then(() => {
            console.log("open url : " + url + " on : " + new Date().toISOString());
            return page.open(url);
        })
        .then((status) => {
            if (status === 'fail') {
                throw new Error('failed to open ' + url);
            }
            // Title and favicon.
            return page.evaluate(readTitleAndFavicon);
        })
        .then((meta) => {
            console.log(JSON.stringify(meta));
            // Losing title.html is not fatal for the snapshot: log and go on.
            return writeSnapshotFile(path + '/title.html', JSON.stringify(meta))
                .then(() => console.log('The file title.html has been saved!'))
                .catch((e) => console.log(e));
        })
        .then(() => {
            // Give the page 10s to finish loading.
            console.log("before wait : " + new Date().toISOString());
            return snapshotDelay(10000);
        })
        .then(() => {
            console.log("after wait : " + new Date().toISOString());
            return scrollThroughPage(page, 0, SNAPSHOT_MAX_SCROLL_STEPS);
        })
        .then(() => page.evaluate(collectAndRewriteImages))
        .then((arrayOfUrls) => {
            console.log(JSON.stringify(arrayOfUrls));
            // Deliberately not awaited: downloads and uploads proceed in the
            // background and report their own errors.
            (arrayOfUrls || []).forEach((imageObj) => downloadAndUploadImage(imageObj, url, path));
            console.log("eraser script : " + new Date().toISOString());
            // Remove all script tags.
            return page.evaluate(stripScriptTags);
        })
        .then(() => {
            if (!body) {
                return undefined;
            }
            // Caller-supplied HTML (advertisement).
            console.log("adding body = " + body);
            return page.evaluate(appendHtmlToBody, body);
        })
        .then(() => {
            // User-behaviour analytics.
            console.log("adding piwik = " + SNAPSHOT_PIWIK_SNIPPET);
            return page.evaluate(appendHtmlToBody, SNAPSHOT_PIWIK_SNIPPET);
        })
        // Final document.
        .then(() => page.evaluate(readDocumentHtml))
        .then((content) => writeSnapshotFile(path + '/index.html', content))
        .then(() => console.log('The file index.html has been saved!'))
        .catch((error) => {
            console.log(error);
        })
        .then(() => {
            // Always release the PhantomJS process; it may not exist if
            // start-up itself failed.
            if (phInstance) {
                return phInstance.exit();
            }
            return undefined;
        })
        .catch((error) => {
            console.log(error);
        });

    res.send({"url": "/" + timestamp});
});
/**
 * Picks the listening port from the command-line arguments.
 *
 * @param {string[]} cliArgs - Arguments after `node <script>`; the first one,
 *     if present, is the requested port.
 * @param {number} fallback - Port to use when no usable argument is given.
 * @returns {number} An integer in 0..65535, or `fallback`.
 */
function resolvePort(cliArgs, fallback) {
    if (cliArgs.length === 0) {
        return fallback;
    }
    const raw = String(cliArgs[0]).trim();
    const parsed = Number(raw);
    // Digits only: anything else would be treated by listen() as a pipe path.
    if (!/^\d+$/.test(raw) || parsed > 65535) {
        console.log('Ignoring invalid port argument "' + raw + '", using ' + fallback + '.');
        return fallback;
    }
    return parsed;
}

let port = 3000;
// slice, not splice: leave process.argv intact for anything else that reads it.
const args = process.argv.slice(2);
// A single argument is enough to select the port (the check used to require two).
if (args.length > 0) {
    port = resolvePort(args, port);
    console.log('Port is set to ' + port + '!');
}
// Start accepting connections on `port` and announce it once bound.
const server = app.listen(port, () => {
    console.log(`Example app listening on port ${port}!`);
});
/**
 * Polls `testFx` every 250ms until it returns a truthy value, then runs
 * `onReady`. Useful for pages that only become ready some time after load.
 *
 * On timeout the poller logs "'waitFor()' timeout" and stops; `onReady` is
 * not called. (It used to call `phantom.exit(1)`, which exists only inside a
 * PhantomJS script — not on the `phantom` npm module used by this file — and
 * it never cleared the interval.)
 *
 * @param {Function|string} testFx - Condition to poll. A string is passed to
 *     eval(); SECURITY: never pass externally supplied strings.
 * @param {Function|string} onReady - Action to run once the condition holds.
 *     A string is passed to eval(); same caveat.
 * @param {number} [timeOutMillis=30000] - Give up after this many milliseconds.
 */
function waitFor(testFx, onReady, timeOutMillis) {
    const maxTimeOutMillis = timeOutMillis ? timeOutMillis : 30000; // default max timeout is 30s
    const start = new Date().getTime();
    const interval = setInterval(function () {
        if (new Date().getTime() - start >= maxTimeOutMillis) {
            // Timed out with the condition still unfulfilled.
            clearInterval(interval);
            console.log("'waitFor()' timeout");
            return;
        }
        const condition = (typeof testFx === "string" ? eval(testFx) : testFx());
        if (!condition) {
            return; // try again on the next tick
        }
        // Stop polling before running the callback so that a throwing
        // callback cannot be re-invoked on every subsequent tick.
        clearInterval(interval);
        console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms.");
        if (typeof onReady === "string") {
            eval(onReady);
        } else {
            onReady();
        }
    }, 250); // repeat check every 250ms
}
/**
 * Synchronously blocks the calling thread for `milliSeconds` by spinning on
 * the clock, logging a timestamp before and after. Intended for waiting
 * (e.g. 10s) until a web page has finished loading.
 *
 * @param {number} milliSeconds - How long to block.
 */
function sleep(milliSeconds) {
    console.log(`before wait : ${new Date().toISOString()}`);
    const startTime = Date.now();
    const deadline = startTime + milliSeconds;
    for (;;) {
        if (!(Date.now() < deadline)) {
            break;
        }
    }
    console.log(`after wait : ${new Date().toISOString()}`);
}
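// [Hosting-page notice captured with the file; not part of the program] This section may contain content unsuitable for display, so the page does not show it. You can review and revise it using the editing tools.
// [Hosting-page notice, continued] If you confirm the content involves no inappropriate language, pure advertising, violence, vulgar or pornographic material, infringement, piracy, falsehood, worthless content, or violations of applicable laws and regulations, you may submit an appeal and it will be handled as soon as possible.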