代码拉取完成,页面将自动刷新
import requests
import time
from bs4 import BeautifulSoup
#import traceback
import re
import pandas as pd
import random as rd
# --- Stage 1: walk the paginated report index and collect article links ---
# Every index page is fetched and each table row is read as
# (number, title + link, type, date).  The links are then written, one per
# line, to hrefmacro.txt.
time_start=time.time()

LIST_URL = 'http://vip.stock.finance.sina.com.cn/q/go.php/vReport_List/kind/macro/index.phtml?p='
LAST_PAGE = 5667         # number of index pages to walk (pages 1..LAST_PAGE)
LIST_RETRY_DELAY = 900   # seconds to wait before retrying a failed page
LIST_TIMEOUT = 60        # seconds; stops a stalled connection from hanging the crawl

hrefList=[]
newsList=[]
p_num = 0                # pages completed so far; the next page fetched is p_num + 1
while p_num < LAST_PAGE:
    url = LIST_URL + str(p_num + 1)
    try:
        # Only the URL is passed: a second positional argument to
        # requests.get() is `params` and would be appended to the query string.
        r = requests.get(url, timeout=LIST_TIMEOUT)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        demo = r.text
    except requests.RequestException:
        # Best-effort crawl: report the page counter, back off, and retry the
        # same page (p_num is deliberately not advanced).
        print("fail")
        print(p_num)
        time.sleep(LIST_RETRY_DELAY)
        continue

    soup = BeautifulSoup(demo, 'html.parser')
    for tr in soup.find_all('tr'):
        try:
            td = tr.td
            num = td.string
            td = td.find_next_sibling()
            title = td.a['title']
            href = td.a['href']
            td = td.find_next_sibling()
            Type = td.string
            td = td.find_next_sibling()
            Date = td.string
        except (AttributeError, KeyError, TypeError):
            # Rows without the expected cell / <a title href> layout are
            # skipped: a missing tag yields None (AttributeError / TypeError),
            # a missing attribute yields KeyError.
            continue
        newsList.append([num, title, Type, Date])
        hrefList.append(href)

    # Advance by exactly one page.  The loop condition bounds the walk, so it
    # ends after LAST_PAGE pages instead of relying on an equality test.
    p_num += 1
    if p_num % 50 == 0 and p_num < LAST_PAGE:
        print(p_num)     # progress indicator
    #time.sleep(rd.random()*2)

print(len(newsList))
print(len(hrefList))
sep = '\n'
# `with` guarantees the file is flushed and closed even if the write fails.
with open(r'D:\mzw\likelihood\hrefmacro.txt', 'w') as fl:
    fl.write(sep.join(hrefList))
##################################################################################3
# --- Stage 2: download every collected article and save it to Excel ---
# Reads the link list from hrefmacro.txt, fetches each page, extracts the
# date / <h1> title / first <p> body, and writes the rows to .xls files:
# one checkpoint file per 1000 links plus a final file for the remainder.
time_start=time.time()

ARTICLE_RETRY_DELAY = 900    # seconds to wait before retrying a failed download
ARTICLE_TIMEOUT = 60         # seconds; stops a stalled connection from hanging the run
CHUNK_SIZE = 1000            # links per checkpoint file
OUTPUT_COLUMNS = ['Date', 'Title', 'Content']

# Substrings deleted (in this order) from str(tag) to strip the markup.
# NOTE(review): this is plain substring deletion, so '/', '=' and every
# lowercase 'p' are also removed from the genuine text; tag.get_text() would
# be cleaner, but the existing output format is kept unchanged here.
TITLE_TOKENS = ('<', '>', '/', 'h1')
# 'pt' needs no entry of its own: once every 'p' is deleted it cannot occur.
BODY_TOKENS = ('<', '>', '=', '\xa0', '/', '\n', 'p', 'style', 'br', 'h1')


def _strip_tokens(text, tokens):
    """Return ``text`` with each substring in ``tokens`` deleted, in order."""
    for token in tokens:
        text = text.replace(token, '')
    return text


def _save_rows(rows, path):
    """Write ``rows`` ([date, title, content] lists) to ``path``; return the frame."""
    frame = pd.DataFrame(rows, columns=OUTPUT_COLUMNS)
    # NOTE(review): pandas 2.x appears to have dropped its .xls writer -
    # confirm the installed pandas can still write this extension.
    frame.to_excel(path, sheet_name='Sheet')
    return frame


with open(r'D:\mzw\likelihood\hrefmacro.txt','r') as fi:
    # Drop the line terminators and ignore blank lines, which could never be
    # downloaded and would otherwise be retried forever.
    hrefList = [w.replace('\n','') for w in fi if w.replace('\n','')]
print(len(hrefList))
print("Start!")

rows = []                    # rows gathered since the last checkpoint
df1 = None                   # most recently saved DataFrame
number = 0                   # links fully handled so far
total = len(hrefList)
while number < total:
    href = hrefList[number]
    try:
        r = requests.get(href, timeout=ARTICLE_TIMEOUT)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        demo = r.text
    except requests.RequestException:
        # Best-effort: report, back off, then retry the same link.
        print("fail")
        print(number)
        time.sleep(ARTICLE_RETRY_DELAY)
        continue
    time.sleep(rd.random()*1.5) ##random delay

    soup = BeautifulSoup(demo, 'html.parser')
    dates = re.findall('<span>日期:(.*)</span>', str(soup))
    if dates:
        title = _strip_tokens(str(soup.find('h1')), TITLE_TOKENS)
        p = _strip_tokens(str(soup.find('p')), BODY_TOKENS)
        rows.append([str(dates[0]), title, p])
    # Pages without a date are skipped, but they still fall through to the
    # counters below so the checkpoint and end-of-list handling always run.

    number += 1
    if number == total:
        break                # the remainder is written after the loop
    if number % 25 == 0:
        print(number)        # progress indicator
    if number % CHUNK_SIZE == 0:
        # True division is kept on purpose so checkpoint names stay
        # 'strategy1.0.xls', 'strategy2.0.xls', ... as before.
        df1 = _save_rows(rows, r'D:\mzw\likelihood\STRATEGY\strategy'+str(number / 1000)+'.xls')
        rows = []            # start a fresh chunk
        time_end = time.time()
        print('cost',time_end - time_start)
        time_start = time.time()

# Rows gathered since the last checkpoint go to the final file.
df1 = _save_rows(rows, r'D:\mzw\likelihood\STRATEGY\strategyFinal.xls')
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。