# NOTE(review): stray hosting-page text ("代码拉取完成,页面将自动刷新") — not Python;
# commented out so the module parses.
'''
Author: your name
Date: 2021-01-20 11:39:48
LastEditTime: 2023-04-13 17:01:35
LastEditors: best_tea best-tea@163.com
Description: In User Settings Edit
FilePath: \Pythonwork\hello.py
'''
import requests
import os
import threading
import urllib.parse
import time
import re
import hashlib
import json
import pprint
class picture:
    """Crawl thumbnails from Baidu image search.

    Queries the acjson search API 20 results at a time, collects each
    result's ``thumbURL`` and downloads the image into ``save_path``.
    The crawl starts immediately from the constructor.
    """

    def __init__(self, picture_name="钢结构节点", picture_number=200, path='picture'):
        """Set up the crawl and start it right away.

        Args:
            picture_name: search keyword sent to Baidu.
            picture_number: how many images to fetch (coerced to int).
            path: directory the downloaded images are written to.
        """
        self.save_path = path
        self.picture_number = int(picture_number)
        self.start_time = time.time()
        self.picture_name = picture_name
        # FIX: the original dict was missing the comma after the
        # 'User-Agent' value, so Python concatenated the two adjacent
        # string literals into one bogus key ("...Safari/537.36accept-language")
        # and the request carried no User-Agent header at all.
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
            'accept-language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        }
        # exist_ok replaces the fragile `path not in os.listdir('.')` test,
        # which mis-handled nested paths and raced with concurrent runs.
        os.makedirs(self.save_path, exist_ok=True)
        self.start()

    def start(self):
        """Page through the search results, 20 entries per request."""
        for offset in range(0, self.picture_number, 20):
            self.get_picture_content(offset)

    def get_picture_content(self, count):
        """Fetch one result page and download every thumbnail it lists.

        Args:
            count: zero-based result offset (the API's 'pn' parameter).
        """
        url = 'http://image.baidu.com/search/acjson'
        param = {
            'tn': 'resultjson_com',
            'logid': ' 7517080705015306512',
            'ipn': 'rj',
            'ct': '201326592',
            'is': '',
            'fp': 'result',
            'queryWord': self.picture_name,
            'cl': '2',
            'lm': '-1',
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': '',
            'z': '',
            'ic': '',
            'hd': '',
            'latest': '',
            'copyright': '',
            'word': self.picture_name,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': '',
            'istype': '',
            'qc': '',
            'nc': '1',
            'fr': '',
            'expermode': '',
            'force': '',
            'cg': 'star',
            'pn': str(count),
            'rn': '20',
            'gsm': '1e',
        }
        print(url, "-------------", count)
        r = requests.get(url, headers=self.header, params=param)
        if r.status_code != 200:
            exit("访问百度图库错误")
        rdata = r.json()['data']
        # The API pads the result list with entries lacking 'thumbURL'
        # (typically a trailing empty dict) — skip those.
        urlpage = []
        for item in rdata:
            thumb = item.get('thumbURL')
            if thumb is not None:
                urlpage.append(thumb)
                print(thumb)
        print('提取地址完毕')
        # Defensive re-create in case the directory was removed mid-crawl.
        os.makedirs(self.save_path, exist_ok=True)
        for i, url_input in enumerate(urlpage):
            res = requests.get(url_input, headers=self.header)
            if res.status_code != 200:
                exit('访问图片链接错误')
            # FIX: name files by the global offset (count + i). The original
            # restarted its counter at 0 on every page, so every page's
            # images hashed to the same filenames and overwrote the
            # previous page's downloads.
            self.save_picture(res.content, str(count + i))
            print(url_input)

    def save_picture(self, content, picture_name):
        """Write raw image bytes to <save_path>/<md5(picture_name)>.jpg.

        Args:
            content: raw image bytes.
            picture_name: label hashed (md5) into the target filename.
        """
        target = "{0}/{1}.jpg".format(
            self.save_path, hashlib.md5(picture_name.encode()).hexdigest())
        with open(target, 'wb') as f:
            f.write(content)
        print("Save..." + picture_name)

    def __del__(self):
        # Guard: __init__ may have failed before start_time was assigned,
        # and __del__ must not raise.
        start = getattr(self, 'start_time', None)
        if start is not None:
            print("花费了{}s时间".format(str(time.time() - start)))
if __name__ == "__main__":
    # Interactive entry point: ask for a search keyword and a target count.
    # The constructor kicks off the crawl immediately, so constructing the
    # object is all that is needed.
    keyword = input("输入你要爬取的图片类型:")
    amount = input("输入你想爬取的数量:")
    crawler = picture(keyword, amount)
# NOTE(review): the two lines below are hosting-page moderation boilerplate,
# not Python; commented out so the module parses.
# 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。