# NOTE(review): stray hosting-page text ("代码拉取完成,页面将自动刷新") — not Python;
# commented out so the module parses.
'''
Author: your name
Date: 2021-01-20 11:39:48
LastEditTime: 2023-04-13 17:01:35
LastEditors: best_tea best-tea@163.com
Description: In User Settings Edit
FilePath: \Pythonwork\hello.py
'''
import requests
import os
import threading
import urllib.parse
import time
import re
import hashlib
import json
import pprint
class picture:
    """Crawl thumbnails from Baidu image search.

    Queries the acjson search API 20 results at a time, collects each
    result's ``thumbURL`` and downloads the image into ``save_path``.
    The crawl starts immediately from the constructor.
    """

    def __init__(self, picture_name="钢结构节点", picture_number=200, path='picture'):
        """Set up the crawl and start it right away.

        Args:
            picture_name: search keyword sent to Baidu.
            picture_number: how many images to fetch (coerced to int).
            path: directory the downloaded images are written to.
        """
        self.save_path = path
        self.picture_number = int(picture_number)
        self.start_time = time.time()
        self.picture_name = picture_name
        # FIX: the original dict was missing the comma after the
        # 'User-Agent' value, so Python concatenated the two adjacent
        # string literals into one bogus key ("...Safari/537.36accept-language")
        # and the request carried no User-Agent header at all.
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
            'accept-language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        }
        # exist_ok replaces the fragile `path not in os.listdir('.')` test,
        # which mis-handled nested paths and raced with concurrent runs.
        os.makedirs(self.save_path, exist_ok=True)
        self.start()

    def start(self):
        """Page through the search results, 20 entries per request."""
        for offset in range(0, self.picture_number, 20):
            self.get_picture_content(offset)

    def get_picture_content(self, count):
        """Fetch one result page and download every thumbnail it lists.

        Args:
            count: zero-based result offset (the API's 'pn' parameter).
        """
        url = 'http://image.baidu.com/search/acjson'
        param = {
            'tn': 'resultjson_com',
            'logid': ' 7517080705015306512',
            'ipn': 'rj',
            'ct': '201326592',
            'is': '',
            'fp': 'result',
            'queryWord': self.picture_name,
            'cl': '2',
            'lm': '-1',
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': '',
            'z': '',
            'ic': '',
            'hd': '',
            'latest': '',
            'copyright': '',
            'word': self.picture_name,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': '',
            'istype': '',
            'qc': '',
            'nc': '1',
            'fr': '',
            'expermode': '',
            'force': '',
            'cg': 'star',
            'pn': str(count),
            'rn': '20',
            'gsm': '1e',
        }
        print(url, "-------------", count)
        r = requests.get(url, headers=self.header, params=param)
        if r.status_code != 200:
            exit("访问百度图库错误")
        rdata = r.json()['data']
        # The API pads the result list with entries lacking 'thumbURL'
        # (typically a trailing empty dict) — skip those.
        urlpage = []
        for item in rdata:
            thumb = item.get('thumbURL')
            if thumb is not None:
                urlpage.append(thumb)
                print(thumb)
        print('提取地址完毕')
        # Defensive re-create in case the directory was removed mid-crawl.
        os.makedirs(self.save_path, exist_ok=True)
        for i, url_input in enumerate(urlpage):
            res = requests.get(url_input, headers=self.header)
            if res.status_code != 200:
                exit('访问图片链接错误')
            # FIX: name files by the global offset (count + i). The original
            # restarted its counter at 0 on every page, so every page's
            # images hashed to the same filenames and overwrote the
            # previous page's downloads.
            self.save_picture(res.content, str(count + i))
            print(url_input)

    def save_picture(self, content, picture_name):
        """Write raw image bytes to <save_path>/<md5(picture_name)>.jpg.

        Args:
            content: raw image bytes.
            picture_name: label hashed (md5) into the target filename.
        """
        target = "{0}/{1}.jpg".format(
            self.save_path, hashlib.md5(picture_name.encode()).hexdigest())
        with open(target, 'wb') as f:
            f.write(content)
        print("Save..." + picture_name)

    def __del__(self):
        # Guard: __init__ may have failed before start_time was assigned,
        # and __del__ must not raise.
        start = getattr(self, 'start_time', None)
        if start is not None:
            print("花费了{}s时间".format(str(time.time() - start)))
if __name__ == "__main__":
    # Interactive entry point: ask for a search keyword and a target count.
    # The constructor kicks off the crawl immediately, so constructing the
    # object is all that is needed.
    keyword = input("输入你要爬取的图片类型:")
    amount = input("输入你想爬取的数量:")
    crawler = picture(keyword, amount)
# NOTE(review): the two lines below are hosting-page moderation boilerplate,
# not Python; commented out so the module parses.
# 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。