2 Star 2 Fork 0

Likelihood-Lab / NLP2019

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
01GetTextData.py 3.93 KB
一键复制 编辑 原始数据 按行查看 历史
嘉文文文 提交于 2019-08-25 16:34 . codes
import requests
import time
from bs4 import BeautifulSoup
#import traceback
import re
import pandas as pd
import random as rd
# ---------------------------------------------------------------------------
# Phase 1: walk the paginated report list and collect every report link.
# ---------------------------------------------------------------------------

# Base address of the paginated list; the 1-based page number is appended.
LIST_URL = 'http://vip.stock.finance.sina.com.cn/q/go.php/vReport_List/kind/macro/index.phtml?p='
# Number of list pages to walk (pages 1..TOTAL_PAGES).
TOTAL_PAGES = 5667
# Seconds to wait before retrying a failed request.
RETRY_DELAY = 900
# Seconds before a stalled request is abandoned (requests has no default).
REQUEST_TIMEOUT = 30
# Text file receiving one report link per line.
HREF_OUTPUT_PATH = r'D:\mzw\likelihood\hrefmacro.txt'


def list_page_url(p_num):
    """Return the URL of the list page with zero-based index *p_num*."""
    return LIST_URL + str(p_num + 1)


def fetch_list_page(p_num, retry_delay=RETRY_DELAY, timeout=REQUEST_TIMEOUT):
    """Download list page *p_num* and return its decoded HTML.

    A failed request (connection error, timeout or HTTP error status) prints
    "fail" and the page index, sleeps *retry_delay* seconds and tries again;
    the function only returns once a request succeeds.
    """
    url = list_page_url(p_num)
    while True:
        try:
            # The original passed 'utf-8' as the second positional argument,
            # which requests treats as ``params`` (query string), not as an
            # encoding; the encoding is set from the body below instead.
            r = requests.get(url, timeout=timeout)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            print("fail")
            print(p_num)
            time.sleep(retry_delay)


def parse_list_rows(html):
    """Extract report rows from one list page.

    Every ``<tr>`` whose cells are, in order, number / linked title / type /
    date contributes one entry.  Rows that do not have that shape (header
    rows, layout tables) are skipped.

    Returns:
        ``(rows, hrefs)`` where ``rows`` is a list of
        ``[num, title, type, date]`` and ``hrefs`` the matching link targets.
    """
    soup = BeautifulSoup(html, 'html.parser')
    rows = []
    hrefs = []
    for tr in soup.find_all('tr'):
        try:
            cell = tr.td
            num = cell.string
            cell = cell.find_next_sibling()
            link = cell.a
            title = link['title']
            href = link['href']
            cell = cell.find_next_sibling()
            report_type = cell.string
            cell = cell.find_next_sibling()
            date = cell.string
        except (AttributeError, TypeError, KeyError):
            # Missing cell/anchor (None) or missing attribute: not a data row.
            continue
        rows.append([num, title, report_type, date])
        hrefs.append(href)
    return rows, hrefs


def collect_report_links(total_pages=TOTAL_PAGES):
    """Walk list pages 1..*total_pages* and gather all report links.

    Prints the number of pages processed after every 50th page.

    Returns:
        ``(hrefList, newsList)``: the link targets and the matching
        ``[num, title, type, date]`` rows, in page order.
    """
    hrefList = []
    newsList = []
    # Step one page at a time.  The original advanced the counter by 11,
    # which skipped pages and could never hit the 5667 stop value.
    for p_num in range(total_pages):
        rows, hrefs = parse_list_rows(fetch_list_page(p_num))
        newsList.extend(rows)
        hrefList.extend(hrefs)
        done = p_num + 1
        if done % 50 == 0:
            print(done)
    return hrefList, newsList


def save_hrefs(hrefs, path=HREF_OUTPUT_PATH):
    """Write *hrefs* to *path*, one link per line (no trailing newline)."""
    # Encoding is left at the platform default, as before, so existing
    # readers of this file keep working.
    with open(path, 'w') as fl:
        fl.write('\n'.join(hrefs))


if __name__ == "__main__":
    time_start = time.time()
    hrefList, newsList = collect_report_links()
    print(len(newsList))
    print(len(hrefList))
    save_hrefs(hrefList)
##################################################################################3
# ---------------------------------------------------------------------------
# Phase 2: download every collected report page and export date/title/body.
# ---------------------------------------------------------------------------

# Text file holding one report link per line.
HREF_INPUT_PATH = r'D:\mzw\likelihood\hrefmacro.txt'
# Workbook receiving the rows left over after the last full chunk.
FINAL_XLS_PATH = r'D:\mzw\likelihood\STRATEGY\strategyFinal.xls'
# Prefix of the intermediate workbooks written every CHUNK_SIZE links.
CHUNK_XLS_PREFIX = r'D:\mzw\likelihood\STRATEGY\strategy'
# Number of links processed between two intermediate workbooks.
CHUNK_SIZE = 1000
# Seconds to wait before retrying a failed request.
REPORT_RETRY_DELAY = 900
# Seconds before a stalled request is abandoned (requests has no default).
REPORT_TIMEOUT = 30
# Non-greedy so that several <span> elements on one line do not get merged
# into the captured date.
DATE_PATTERN = re.compile(r'<span>日期:(.*?)</span>')


def load_hrefs(path=HREF_INPUT_PATH):
    """Read report links from *path*, one per line, skipping blank lines."""
    with open(path, 'r') as fi:
        lines = [line.replace('\n', '') for line in fi]
    # A blank line would become an empty URL, which can never be fetched and
    # would keep the retry loop spinning forever.
    return [line for line in lines if line]


def fetch_report_page(href, number, retry_delay=REPORT_RETRY_DELAY,
                      timeout=REPORT_TIMEOUT):
    """Download *href* and return its decoded HTML.

    A failed request (connection error, timeout or HTTP error status) prints
    "fail" and *number* (the zero-based position of the link), sleeps
    *retry_delay* seconds and tries again; the function only returns once a
    request succeeds.
    """
    while True:
        try:
            r = requests.get(href, timeout=timeout)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            print("fail")
            print(number)
            time.sleep(retry_delay)


def extract_date(soup_str):
    """Return the text of the first ``<span>日期:...</span>`` in *soup_str*.

    Returns ``None`` when the markup contains no such span.
    """
    match = DATE_PATTERN.search(soup_str)
    return match.group(1) if match else None


def node_text(node, drop=()):
    """Return the text inside a parsed element, without any markup.

    *node* is a BeautifulSoup element or ``None`` (element not found), in
    which case an empty string is returned.  Every substring listed in
    *drop* is removed from the result.
    """
    if node is None:
        return ''
    # get_text() yields only the text nodes.  The previous approach deleted
    # the characters of the tag syntax from str(node), which also removed
    # every 'p', '=', '/', 'br' and 'style' occurring in the actual text and
    # left attribute values behind.
    text = node.get_text()
    for token in drop:
        text = text.replace(token, '')
    return text


def parse_report(html):
    """Extract ``[date, title, content]`` from one report page.

    The title is the text of the first ``<h1>`` and the content the text of
    the first ``<p>`` (non-breaking spaces and newlines removed).  Returns
    ``None`` when the page carries no date span, so the caller can skip it.
    """
    soup = BeautifulSoup(html, 'html.parser')
    date = extract_date(str(soup))
    if date is None:
        return None
    title = node_text(soup.find('h1'))
    content = node_text(soup.find('p'), drop=('\xa0', '\n'))
    return [date, title, content]


def write_rows(rows, path):
    """Write *rows* (``[date, title, content]`` lists) to workbook *path*."""
    # NOTE(review): the .xls extension needs an xls-capable writer engine;
    # confirm the installed pandas version still provides one.
    frame = pd.DataFrame(rows, columns=['Date', 'Title', 'Content'])
    frame.to_excel(path, sheet_name='Sheet')


def scrape_reports(hrefs, final_path=FINAL_XLS_PATH,
                   chunk_prefix=CHUNK_XLS_PREFIX, chunk_size=CHUNK_SIZE):
    """Download every link in *hrefs* and export the parsed reports.

    After each *chunk_size* links the rows gathered since the previous
    export are written to ``chunk_prefix + str(n / chunk_size) + '.xls'``
    and the elapsed time for that chunk is printed.  Whatever remains after
    the last link goes to *final_path*.  Progress is printed every 25 links,
    and a random pause of up to 1.5 s follows each download.

    Pages without a date span are skipped.  An empty *hrefs* writes nothing.

    Returns:
        The number of reports that were parsed and exported.
    """
    total = len(hrefs)
    if total == 0:
        return 0

    rows = []  # rows gathered since the last export
    scraped = 0
    time_start = time.time()
    for number, href in enumerate(hrefs, start=1):
        demo = fetch_report_page(href, number - 1)
        time.sleep(rd.random() * 1.5)  # random delay
        row = parse_report(demo)
        if row is not None:
            rows.append(row)
            scraped += 1
        # The bookkeeping below runs for skipped pages too; previously a
        # skipped page bypassed it, so a skipped last link ran past the end
        # of the list and the remaining rows were never written.
        if number == total:
            break
        if number % 25 == 0:
            print(number)
        if number % chunk_size == 0:
            # True division is kept on purpose: it reproduces the existing
            # file names (e.g. 'strategy1.0.xls').
            write_rows(rows, chunk_prefix + str(number / chunk_size) + '.xls')
            rows = []
            time_end = time.time()
            print('cost', time_end - time_start)
            time_start = time.time()
    write_rows(rows, final_path)
    return scraped


if __name__ == "__main__":
    time_start = time.time()
    hrefList = load_hrefs()
    print(len(hrefList))
    print("Start!")
    scrape_reports(hrefList)
Python
1
https://gitee.com/likelihoodlab/NLP2019.git
git@gitee.com:likelihoodlab/NLP2019.git
likelihoodlab
NLP2019
NLP2019
master

搜索帮助