1 Star 0 Fork 0

iamcer / PageDataList

Create your Gitee Account
Explore and code with more than 6 million developers. Free private repositories!
Sign up
This repository doesn't specify a license. Without the author's permission, this code may only be used for learning and not for any other purpose.
Clone or download
index.php 4.94 KB
Copy Edit Web IDE Raw Blame History
不是靖哥哥 authored 2018-12-04 10:19 . ok
<?php
namespace hongjingsheng\PHPSpider;
//引入自动加载文件
require 'vendor/autoload.php';
use QL\QueryList;
use QL\Ext\AbsoluteUrl;
use QL\Ext\DisguisePlugin;
//采集某页面所有的图片
//$data = QueryList::get('');
//
//$res=$data->rules([
// 'title'=>array('.c-container .t>a','text'),
// 'link'=>array('.c-container .t>a','href'),
//
// ])->queryData();
////print_r($res);
/**
 * Walks the pagination bar of a search-result listing, collects one URL per
 * page number, and then scrapes every collected page with caller-supplied
 * QueryList rules.
 *
 * The constructor performs the whole pagination crawl (network I/O); the
 * resulting page map is then available through getAllPageUrl() and is used
 * by getData().
 */
class BaiduShouLu
{
    /** @var array Raw pagination rows, one list per visited page (the "URL repository"). */
    private $urlR = [];

    /** @var array Entry page as ['pn' => page number, 'link' => URL]. */
    private $initUrl = [];

    /** @var array De-duplicated map of page number => page URL. */
    private $res = [];

    /** @var string Base URL handed to the AbsoluteUrl plugin to resolve relative links. */
    private $absoluteUrl = '/';

    /**
     * Fetches a single page and extracts the pagination links it shows.
     *
     * @param string $url Page to fetch.
     * @return mixed Whatever queryData() returns for the pagination rules;
     *               the rest of this class treats it as a list of
     *               ['pn' => ..., 'link' => ...] rows.
     */
    public function get($url)
    {
        $this->prepareClient();

        // The pass-through callback and the empty third rule element are kept
        // exactly as the rules were originally written.
        $passThrough = function ($content) {
            return $content;
        };

        return QueryList::get($url)
            ->absoluteUrl($this->absoluteUrl)
            ->rules([
                'pn' => ['#page>a:not(".n") .pc', 'text', '', $passThrough],
                'link' => ['#page>a:not(".n")', 'href', '', $passThrough],
            ])
            ->queryData();
    }

    /**
     * Registers the plugins and request-header disguise on the shared
     * QueryList instance. Called before every request.
     *
     * NOTE(review): the configuration is applied to QueryList::getInstance()
     * while the requests themselves go through the static QueryList::get();
     * whether both share state depends on the installed QueryList version.
     * The trailing `disguise_headers` property read is kept as-is because its
     * effect depends on the plugin - confirm against the plugin source.
     *
     * @param array $extraHeaders Additional headers appended after the defaults.
     */
    private function prepareClient(array $extraHeaders = [])
    {
        $ql = QueryList::getInstance();
        $ql->use(AbsoluteUrl::class);
        $ql->use(DisguisePlugin::class);
        $ql->disguiseIp([
            'headers' => [
                'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding' => 'gzip, deflate, br',
                'Accept-Language' => 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
                'Connection' => 'keep-alive',
            ] + $extraHeaders,
        ])->disguiseUa()->disguise_headers;
    }

    /**
     * Follows the pagination starting at $urlArr: each round opens the last
     * link shown in the pagination bar and continues while that link's page
     * number is greater than the page it was found on.
     *
     * Implemented as a loop (not recursion) so long listings do not deepen
     * the call stack, and guarded against pages without pagination links.
     *
     * @param array $urlArr Starting page as ['pn' => ..., 'link' => ...].
     * @return array All raw pagination rows gathered so far.
     */
    private function run($urlArr)
    {
        $current = $urlArr;

        while (true) {
            $links = $this->get($current['link']);
            $this->urlR[] = $links;

            $total = count($links);
            if ($total === 0) {
                // No pagination bar matched on this page: nothing to follow.
                break;
            }

            $lastPage = $links[$total - 1];
            if (!isset($lastPage['pn'], $lastPage['link'])
                || (int)$lastPage['pn'] <= (int)$current['pn']
            ) {
                // The last visible page is not ahead of us: we reached the end.
                break;
            }

            echo '正在抓取第' . $lastPage['pn'] . '个页面的分页链接' . PHP_EOL;
            $current = $lastPage;
        }

        echo PHP_EOL . '—————————————页面链接已经全部抓取——————————————————' . PHP_EOL;

        return $this->urlR;
    }

    /**
     * De-duplicates the collected pagination rows into a page-number => URL
     * map. When a page number appears more than once the later row wins.
     * Rows lacking a page number or a link are skipped.
     *
     * @return array Map of page number => URL.
     */
    private function quchong()
    {
        foreach ($this->urlR as $pageLinks) {
            foreach ($pageLinks as $entry) {
                if (!isset($entry['pn'], $entry['link'])) {
                    continue;
                }
                $this->res[$entry['pn']] = $entry['link'];
            }
        }

        return $this->res;
    }

    /**
     * Puts the entry page at the front of the raw rows so it is part of the
     * final map even if no pagination bar links back to it.
     */
    private function setFirst()
    {
        array_unshift($this->urlR, [$this->initUrl]);
    }

    /**
     * Stores the configuration and immediately crawls the pagination.
     *
     * @param string $absoluteUrl Base URL used to resolve relative pagination links.
     * @param array  $initUrl     Entry page as ['pn' => page number, 'link' => URL].
     * @throws \InvalidArgumentException When $initUrl lacks the 'pn' or 'link' key.
     */
    public function __construct($absoluteUrl, $initUrl)
    {
        if (!is_array($initUrl) || !isset($initUrl['pn'], $initUrl['link'])) {
            throw new \InvalidArgumentException(
                "initUrl must be an array with 'pn' and 'link' keys"
            );
        }

        // Initialise state.
        $this->absoluteUrl = $absoluteUrl;
        $this->initUrl = $initUrl;

        // Crawl, add the entry page, then de-duplicate.
        $this->run($initUrl);
        $this->setFirst();
        $this->quchong();
    }

    /**
     * Returns every pagination link that was collected.
     *
     * @return array Map of page number => URL.
     */
    public function getAllPageUrl()
    {
        return $this->res;
    }

    /**
     * Scrapes every collected page with the given rules and returns all rows
     * in one flat list. Each URL is echoed before it is fetched.
     *
     * @param array $rules QueryList rules applied to every page.
     * @param int   $sleep Pause before each request, in microseconds
     *                     (negative values are treated as 0).
     * @return array Flat list of scraped rows from all pages.
     */
    public function getData($rules = [], $sleep = 0)
    {
        $pause = max(0, (int)$sleep);
        $dataList = [];

        foreach ($this->getAllPageUrl() as $url) {
            echo $url . PHP_EOL;
            usleep($pause);

            // Re-applied per request, as before, with the session cookie added.
            $this->prepareClient([
                'Cookie' => 'BAIDUID=6C1E59B36E98330F1C62DB9462F9337B:BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDSVRTM=103;BD_CK_SAM=1;BIDUPSID=9A7296404CA49076EFD7CAFCF59C4F8C;H_PS_645EC=083d6Tq%2Fx0ktqqgXumXRlQagkka3IBSpVHCMxKrQVYoxoPLBhYJVLkoi3jA;H_PS_PSSID=1463_25810_21098_26350_27244;PSINO=6;PSTM=1543835809;delPer=0;',
            ]);

            $rows = QueryList::get($url)->rules($rules)->queryData();
            foreach ($rows as $row) {
                $dataList[] = $row;
            }
        }

        return $dataList;
    }
}
// Entry point: crawl the pagination of a Baidu "site:" query, then scrape
// the title and link of every result on every collected page.
$entryPage = [
    'pn' => '1',
    'link' => 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=site%3Awww.haijufc.com&oq=site%253Awww.haijufc.com',
];
$baidushoulu = new BaiduShouLu('https://www.baidu.com/', $entryPage);

// Rules applied to each result page; the second argument below is the pause
// before each request, in microseconds.
$resultRules = [
    'title' => ['.c-container .t', 'text'],
    'link' => ['.c-container .t>a', 'href'],
];
print_r($baidushoulu->getData($resultRules, 2000000));
//print_r($shouLuList);

Comment ( 0 )

Sign in to post a comment

PHP
1
https://gitee.com/hjsiamcer/PageDataList.git
git@gitee.com:hjsiamcer/PageDataList.git
hjsiamcer
PageDataList
PageDataList
master

Search