# 代码拉取完成,页面将自动刷新
"""
作者:20182109卢钟添
文件名称:compare_price
时间:2020.5.13
"""
import requests
import re
from pyecharts.charts import Bar
def getHTMLText(url):
    """Fetch ``url`` and return the response body as text.

    The request is sent with a fixed ``cookie`` and ``user-agent`` header and
    a 30 second timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body, or an empty string when the request fails
        (connection problem, timeout, invalid URL or a non-2xx status).
    """
    # NOTE(review): this is a hard-coded login session cookie. It is a
    # credential and will expire; it should be loaded from configuration or
    # the environment rather than being committed with the source.
    headers = {
        'cookie': 'thw=cn; cna=qKDMFJTWFjkCATy4KgHYvD4W; lgc=%5Cu8FDB%5Cu884C%5Cu7684vn; tracknick=%5Cu8FDB%5Cu884C%5Cu7684vn; tg=0; hng=CN%7Czh-CN%7CCNY%7C156; t=941b02a4d8b0cf2f789a8f92016f7d89; enc=bQy%2BaVPtK%2FqS0pCJYN1XMSv%2BYe0uLgoiwO4TykMtyUJY%2FbWNI9XKG4cKzGjwF8ErF%2FbZS3W0cszxEe2wEl4rCA%3D%3D; mt=ci=-1_1; v=0; cookie2=19048cef546d1cebb6a445b0468915c1; _tb_token_=e54533e3be77a; alitrackid=www.taobao.com; _samesite_flag_=true; sgcookie=ES1UaAIHdy%2FwGCI3cWeSj; unb=3077937338; uc3=lg2=U%2BGCWk%2F75gdr5Q%3D%3D&nk2=3q51JmlFyyw%3D&id2=UNDTxWofFWy7pA%3D%3D&vt3=F8dBxd7CSpUweIM9Pms%3D; csg=3dc8aeac; cookie17=UNDTxWofFWy7pA%3D%3D; dnk=%5Cu8FDB%5Cu884C%5Cu7684vn; skt=fd99ea0ab9d0c74f; existShop=MTU4MzkyOTEwOA%3D%3D; uc4=id4=0%40UgcjYnjbJ%2FvYq%2BKSdky6mUdqB0cc&nk4=0%403OfyM2lVewNz4tYikhjEK5jQUA%3D%3D; _cc_=URm48syIZQ%3D%3D; _l_g_=Ug%3D%3D; sg=n82; _nk_=%5Cu8FDB%5Cu884C%5Cu7684vn; cookie1=W8yenZ1rnMnGjMjBa9%2B2HZp%2BHg3no3indoqxfgpRMsc%3D; tfstk=ceHOBtsK-eYg3-sf43AncLJa7KYlZqIYogUAHnpWDoc1JPBAisHoebDJwo7TpHC..; JSESSIONID=92DE3D3B3D1972ECF55E660613C5BC3B; lastalitrackid=i.taobao.com; l=dBN59UGnQrzzx2-zKOfi5Sn8UsbtaIOb4sPrD4pGsICPObfe5RYVWZqO-aYwCnGVh6q6J354uljQBeYBqHxnnxv92j-la_Hmn; isg=BL6-wcHobAGTCbhglJRTKA6mD9QA_4J56zN-12jGf4H8C17l0Y8WieThg9fHM3qR; uc1=cookie14=UoTUOafG9CD%2BYA%3D%3D&lng=zh_CN&cookie16=UtASsssmPlP%2Ff1IHDsDaPRu%2BPw%3D%3D&existShop=false&cookie21=VT5L2FSpczFp&tag=8&cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&pas=0',
        'user-agent': 'Mozilla/5.0',
    }
    try:
        response = requests.get(url, headers=headers, timeout=30)
        # Turn 4xx/5xx answers into an exception so they take the same
        # "return empty string" path as transport failures.
        response.raise_for_status()
        # Decode with the encoding detected from the body instead of the one
        # announced in the HTTP headers.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        # Only request-level failures are treated as "no page"; programming
        # errors and Ctrl-C are no longer silently swallowed.
        return ""
# ilt is the list that accumulates the scraped results
def parsePage(ilt, html, depth):
    """Extract up to ``depth`` (price, title) pairs from ``html`` into ``ilt``.

    The page is searched for ``"view_price":"..."`` and ``"raw_title":"..."``
    fragments; the n-th price is paired with the n-th title.

    Args:
        ilt: List that receives one ``[price, title]`` entry per product.
            Both values are strings.
        html: Page source to search. An empty value yields no entries.
        depth: Maximum number of products to extract. When the page holds
            fewer products, only the available ones are appended.

    Returns:
        None. Results are appended to ``ilt`` in place.
    """
    import json  # local import so the block does not rely on a file-level one

    if not html:
        return

    def _decode(quoted):
        # The captured text is a double-quoted JSON string literal; decoding
        # it with json handles escapes such as \uXXXX without executing
        # scraped text the way eval() would.
        try:
            return json.loads(quoted)
        except ValueError:
            # Not a valid JSON string: fall back to the raw text between
            # the quotes rather than dropping the product.
            return quoted[1:-1]

    # Capture groups isolate the values directly. Splitting the match on ':'
    # would cut off any title that itself contains a colon.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    titles = re.findall(r'"raw_title":(".*?")', html)

    # A negative depth means "nothing", not "all but the last n".
    limit = max(depth, 0)
    for price, title in zip(prices[:limit], titles[:limit]):
        ilt.append([price, _decode(title)])
def GetHTML(html, goods, depth):
    """Render a bar chart of product prices found in ``html``.

    Args:
        html: Page source that contains the product data.
        goods: Search keyword; used as the name of the chart's data series.
        depth: Maximum number of products to plot.

    Returns:
        None. The chart is written to ``D:\\my_charts.html`` and a
        confirmation message is printed.
    """
    # Reuse the shared parser instead of a second copy of the extraction
    # logic. It yields complete [price, title] pairs, so the two axes below
    # always have the same length even when parsing stops early.
    items = []
    parsePage(items, html, depth)

    prices = [item[0] for item in items]
    # Titles are cut to 10 characters because long labels do not display
    # well on the chart axis.
    labels = [item[1][:10] for item in items]

    bar = Bar()
    bar.add_xaxis(labels)
    bar.add_yaxis(goods, prices)
    # Raw string: the path contains a backslash that must not be read as an
    # escape sequence.
    bar.render(r"D:\my_charts.html")
    print("\n可视化视图生成完毕,请到D盘根目录下查看my_charts.html文件")
def printGoodsList(ilt):
    """Print the scraped products as a tab-separated table.

    Args:
        ilt: Sequence of entries whose first element is the price and whose
            second element is the product title.

    Returns:
        None. A header row is printed first, then one numbered row per
        entry (numbering starts at 1).
    """
    row_format = "{:4}\t{:8}\t{:16}"
    header = row_format.format("序号", "价格", "商品名称")
    print(header)
    for position, entry in enumerate(ilt, start=1):
        print(row_format.format(position, entry[0], entry[1]))
def main(goods, depth):
    """Search for ``goods``, print the price table and optionally chart it.

    Args:
        goods: Search keyword typed by the user.
        depth: Maximum number of products to list.

    Returns:
        None. Results are printed; when the user answers ``yes`` the chart
        is generated through ``GetHTML``.
    """
    from urllib.parse import quote  # local import: not imported at file level

    infoList = []
    try:
        # Percent-encode the keyword so characters such as '&', '#' or
        # spaces cannot break or truncate the query string.
        url = 'https://s.taobao.com/search?q=' + quote(goods) + '&s=0'
        html = getHTMLText(url)
        parsePage(infoList, html, depth)
        printGoodsList(infoList)
        print("\n\n请问是否需要生成可视化图表(yes/no)\n注:可视化图表可能会因商品名称过长而显示不完全")
        an = input()
        # Accept the answer regardless of surrounding spaces or letter case.
        if an.strip().lower() == 'yes':
            GetHTML(html, goods, depth)
    except Exception as exc:
        # Report the failure instead of printing a blank line; Ctrl-C is
        # no longer intercepted here.
        print("运行出错:", exc)
# Script entry point: ask for the keyword and the number of products, then
# run the scraper. Guarded so that importing this module does not prompt for
# input or touch the network.
if __name__ == "__main__":
    goods = input("请输入你想要爬取的商品:")
    depth = int(input("请输入你想要爬取的商品数量(小于44,以方便显示):"))
    main(goods, depth)
# 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。