验证中...
Languages: Python
Categories: Web开发技术
Latest update 2019-09-19 13:27
获取妹子图测试 referer修改版
Raw Copy
from bs4 import BeautifulSoup
import re
import requests
import os
# Desktop Chrome user-agent string sent with every request so the server
# serves us the normal browser version of each page.
useragent="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
def updateRef(ref, str):
    """Append *str* as a path segment to referer *ref*.

    Returns ``ref + "/" + str`` when *str* is non-empty, otherwise returns
    *ref* unchanged.  (The original returned an unassigned local and raised
    UnboundLocalError whenever *str* was empty.)

    NOTE: the parameter name ``str`` shadows the builtin; it is kept only
    for interface compatibility with existing callers.
    """
    if str:
        return ref + "/" + str
    return ref
def initRef(ref, num):
    """Return a fresh mzitu referer URL for gallery *num*.

    The *ref* argument is accepted for interface compatibility but is not
    used — the referer is always rebuilt from the site root.
    """
    base = 'http://www.mzitu.com/'
    return base + num
def getUrlHtml(url, headers):
    """Fetch *url* with *headers* and return the requests.Response.

    Returns "" on any request failure, preserving the original best-effort
    contract (callers must check for a falsy result).  The bare ``except:``
    is narrowed to ``requests.RequestException`` so programmer errors and
    KeyboardInterrupt are no longer swallowed, and a timeout is added so a
    stalled connection cannot hang the crawl forever.
    """
    try:
        return requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        return ""
def parserHtml(list, html):
    """Parse the gallery index page and download every gallery it links to.

    Parameters
    ----------
    list : list
        Kept for interface compatibility; this function never appends to it.
    html : requests.Response or ""
        Response for http://www.mzitu.com/all (or "" if the fetch failed).

    Side effects: creates one directory per gallery under D:\\mzitu, changes
    the working directory into it, and writes the downloaded images there.
    """
    if not html:
        # Index fetch failed upstream; nothing to parse.
        return
    # BeautifulSoup needs the decoded text, not the Response object itself
    # (passing the Response raised a TypeError in the original).
    soup = BeautifulSoup(html.text, "lxml")
    postlist = soup.find('p', attrs={'class': 'url'}).find_all('a')
    for i in postlist:
        # One directory per gallery, named after the link text.
        title = i.get_text()
        path = str(title).strip()
        target = os.path.join(r"D:\mzitu", path)
        # exist_ok so a re-run does not crash on already-created galleries.
        os.makedirs(target, exist_ok=True)
        os.chdir(target)
        try:
            img_ref = 'http://www.mzitu.com/'
            # Pull the 6-digit gallery id (e.g. ".com/201909" -> "201909")
            # out of the anchor's href to build the per-gallery referer.
            transfer = re.findall(r"\.com/\d{6}?", str(i))[0]
            pic_num = re.split(r"/", str(transfer))[1]
            # Update the referer to the gallery page.
            img_ref = img_ref + pic_num
            # Original mistakenly used the stale global head_ref here even
            # though the two lines above had just built img_ref for it.
            header = {'User-Agent': useragent, 'Referer': img_ref}
            href = i.attrs['href']
            picture_html = getUrlHtml(href, header)
            picture_url_soup = BeautifulSoup(picture_html.text, "lxml")
            # Second-to-last pagination <span> holds the page count.
            max_span = picture_url_soup.find('div', attrs={'class': 'pagenavi'}).find_all('span')[-2].get_text()
            for page in range(1, int(max_span) + 1):
                page_url = href + '/' + str(page)
                # Per-page referer: gallery url + next page number.
                img_ref = updateRef(img_ref, str(page + 1))
                header = {'User-Agent': useragent, 'Referer': img_ref}
                download_html = getUrlHtml(page_url, header)
                download_soup = BeautifulSoup(download_html.text, 'lxml')
                sources_url = download_soup.find('div', attrs={'class': 'main-image'}).find('img')['src']
                # Use the tail of the image url (minus extension) as filename.
                name = sources_url[-9:-4]
                img = requests.get(sources_url, headers=header)
                # 'ab' appends if the file already exists; context manager
                # guarantees the handle is closed even on write errors.
                with open(name + '.jpg', 'ab') as f:
                    f.write(img.content)
                # Reset the referer to the bare gallery url for the next page
                # (otherwise updateRef keeps appending segments).
                img_ref = initRef(img_ref, pic_num)
        except Exception as exc:
            # Skip this gallery but report what went wrong instead of the
            # original's opaque print('aaa').
            print('failed to download gallery %s: %s' % (path, exc))
def main():
    """Fetch the http://www.mzitu.com/all index and hand it to parserHtml.

    Sets the module-global ``head_ref`` (read elsewhere as the default
    referer) before making the request.
    """
    listinfo = []
    global head_ref
    head_ref = 'http://www.mzitu.com/'
    header = {'User-Agent': useragent, 'Referer': head_ref}
    url = "http://www.mzitu.com/all"
    html = getUrlHtml(url, header)
    parserHtml(listinfo, html)
    print(listinfo)


if __name__ == "__main__":
    # Guard so importing this module no longer triggers a full crawl.
    main()

Comment list( 1 )

CADMUS 2019-09-19 13:27

所有BeautifulSoup(html,"lxml")的部分,需要修改为BeautifulSoup(html.text,"lxml")。不然会报错……我只能帮这么多了。 :sunglasses:

You need to Sign in for post a comment

Help Search