# (web-page residue, not part of the script) 代码拉取完成,页面将自动刷新
# -*- coding: utf-8 -*-
import pandas as pd
import time
from collections import defaultdict
import jieba.posseg as jp, jieba
import re
# ======================== Load data ========================
# Reads workbooks macro1.0.xls .. macro78.0.xls, keeps the 'Content' of every
# row whose 'Date' is a string and whose 'Title' contains at least one token
# that is not flagged 'eng', 'm' or 'x', then strips noise characters and
# splits each content into punctuation-delimited fragments stored in `texts`.

# Patterns are raw strings so that regex escapes such as \[ or \? are not
# parsed as (invalid) Python string escapes.
# NOTE(review): repeated alternatives/characters ("%|%", ";;", ",,", "::") are
# kept exactly as found; presumably one of each pair was originally a
# full-width variant -- confirm against the original file.
_NOISE_RE = re.compile(r'\.|-| |\t|_|%|%|\[|\]|/|^|\n|的|个|是|兄|也|了|只')
_SPLIT_RE = re.compile(r'[;;。!?…,,::、》《~\?\.“”""()\(\)\t\n]')
# Title tokens carrying only these flags do not make a row worth keeping.
_UNINFORMATIVE_TITLE_FLAGS = ('eng', 'm', 'x')

texts = []
dates_lst = []
text = ""
# NOTE(review): dates_lst is seeded with the first date of workbook 1, so it
# ends up one entry longer than the list of kept contents. It is not read
# again in the visible code; looks like a leftover of an earlier
# group-by-date version -- confirm before relying on it.
date = list(pd.read_excel(r'D:\MACRO\macro' + str(1) + '.0.xls', sheet_name='Sheet').loc[:, 'Date'])[0]
dates_lst.append(date)
jieba.load_userdict(r"D:\dict\user_dct_sogou.txt")
for k in range(1, 79):
    df = pd.read_excel(r'D:\MACRO\macro' + str(k) + '.0.xls', sheet_name='Sheet')
    dates = list(df.loc[:, 'Date'])
    for n, date in enumerate(dates):
        # Rows without a textual date (e.g. empty cells) are skipped.
        if not isinstance(date, str):
            continue
        title = df.loc[n, 'Title']
        # An empty Title cell is not a str; skip it instead of crashing on
        # .replace().
        if not isinstance(title, str):
            continue
        # Keep the row only if the title has at least one informative token.
        if not any(
            word.flag not in _UNINFORMATIVE_TITLE_FLAGS
            for word in jp.cut(title.replace(" ", ""))
        ):
            continue
        content = df.loc[n, 'Content']
        # An empty Content cell would make the regex clean-up below raise
        # TypeError; skip such rows.
        if not isinstance(content, str):
            continue
        texts.append(content)
        dates_lst.append(date)
# Strip noise characters from every content, split it on punctuation and
# flatten all fragments into one list.
texts = [
    fragment
    for content in texts
    for fragment in _SPLIT_RE.split(_NOISE_RE.sub('', content))
]
print("Loading data finished!")
def lengeq2(x):
    """Return True when *x* has a length of two or more, False otherwise."""
    return not len(x) < 2
# Drop every fragment that lengeq2 rejects (length below two).
texts = [fragment for fragment in texts if lengeq2(fragment)]
# NOTE(review): texts_new is not read anywhere in the visible part of the file.
texts_new = []
def cutwords(x):
    """Tell whether a segmented token should be kept.

    *x* is indexed as ``x[0]`` (the word) and ``x[1]`` (its flag).
    The token is rejected when the flag is 'm', when the word is '%', or
    when the flag is 'eng' and the word is a single character; every other
    token is kept.

    NOTE(review): the previous version compared the word with '%' twice;
    presumably one of the two was a full-width percent sign -- confirm.
    """
    word, flag = x[0], x[1]
    if flag == 'm':
        return False
    if word == '%':
        return False
    if flag == 'eng' and len(word) == 1:
        return False
    return True
# Segment every fragment with jp.cut, turn each result into a tuple, drop the
# tokens rejected by cutwords, and collect the fragments that still contain
# more than one token. texts[d] is overwritten with the filtered token list.
text_segment = []
total = len(texts)
for d, fragment in enumerate(texts):
    if d % 500 == 0:
        # Progress report: [current index, number of fragments].
        print([d, total])
    tagged = [tuple(pair) for pair in jp.cut(fragment)]
    texts[d] = [token for token in tagged if cutwords(token)]
    if len(texts[d]) > 1:
        text_segment.append(texts[d])
# ======================== Count candidate phrases ========================
# For every sentence (a list of (word, flag) tokens) count each run of
# 2..max_n consecutive words whose first and last token pass the stop lists
# below and whose joined text is longer than three characters.
#
# The stop lists used to be spelled out three times as long list literals
# (start check, end check, and a combined re-check of phrase[0]/phrase[-1]).
# They are merged here into one set per role; each set is the exact union of
# the lists previously applied to that position, so the accepted phrases are
# unchanged while membership tests become O(1).

# Flags that may not start a phrase.
START_STOP_POS = frozenset([
    'q', 'zg', 'u', 'uz', 'ud', 'uj', 'uv', 'ul', 'x', 'c', 'p', 't', 'r',
    'ad', 'm', 'f',
])
# Flags that may not end a phrase.
END_STOP_POS = frozenset([
    'u', 'uz', 'ud', 'uj', 'uv', 'ul', 'm', 'c', 'p', 'zg', 'r', 'x', 't',
    'f', 'ad',
])
# Words that may not start a phrase.
START_STOP_WORDS = frozenset([
    '经', '间', '带来', '和', '对', '比例', '同比', '明显', '占', '个',
    '应', '应该', '性', '较', '与', '同时', '继续', '做好', '看', '得到',
    '依旧', '依然', '成', '变为', '变成', '称为', '成为', '还是', '还会', '还有',
    '相当', '相比', '也', '已', '已经', '引起', '引发', '又', '大概', '约',
    '大约', '再次', '可', '不可', '均', '这类', '这个', '这', '注', '表',
    '图', '截至', '能', '不能', '可以', '不可以', '至', '到', '自从', '自',
    '将', '从', '这次', '中', '上', '是', '其', '主要', '包括', '为',
    '有所', '不可能', '可能', '不要', '没有', '不会', '不是', '会', '有', '要',
    '。', '、', '“', '”', '?', ':', '!', ',', ';',
    # Words that were only present in the former phrase[0] re-check.
    '出现', '增势', '小幅', '大幅度', '增加', '情况', '显示', '低于', '高于', '慢于',
    '快于', '稍有', '略有', '占比', '增量', '高出', '连续', '降速', '增速', '好',
    '月份', '下降', '降幅', '不断', '大大', '大幅', '将会', '将要', '增长', '增幅',
    '显著',
])
# Words that may not end a phrase ('由' came from the former phrase[-1]
# re-check, '进入' from the first end check; both are kept).
END_STOP_WORDS = frozenset([
    '由', '经', '出现', '逐年', '逐月', '逐步', '渐渐', '逐渐', '还', '占比',
    '趋', '受', '显示', '总体', '迅速', '月份', '进入', '先', '累计', '持续',
    '较为', '作为', '在', '带来', '越来越', '仍然', '原因', '兑', '应', '应该',
    '平均', '均', '非', '总', '较', '与', '同时', '和', '对', '继续',
    '做好', '不', '开始', '看', '一定', '引起', '引发', '已', '已经', '得到',
    '起', '依旧', '依然', '成', '变为', '变成', '称为', '成为', '还是', '还会',
    '还有', '稍有', '略有', '达到', '相当', '相比', '也', '同比', '又', '尚待',
    '用于', '大概', '约', '大约', '产生', '可', '不可', '注', '表', '图',
    '一般', '截至', '能', '不能', '可以', '不可以', '至', '到', '自从', '自',
    '将', '从', '这次', '中', '上', '是', '其', '占', '主要', '包括',
    '为', '有所', '不可能', '可能', '不要', '没有', '不会', '不是', '会', '有',
    '要', '。', '、', '“', '”', '?', ':', '!', ',', ';',
])

phrases = defaultdict(int)
texts = text_segment
num = 1      # 1-based sentence counter, used for progress output only
max_n = 3    # maximum number of tokens in a candidate phrase
start_time = time.time()
for sentence in text_segment:
    sentence_len = len(sentence)
    for start in range(sentence_len):
        word = sentence[start][0]
        pos = sentence[start][1]
        # Skip start tokens with a forbidden flag or word.
        if pos in START_STOP_POS or word in START_STOP_WORDS:
            continue
        for n in range(2, min(sentence_len - start, max_n) + 1):
            last = sentence[start + n - 1]
            # Skip candidates ending in a forbidden flag or word.
            if last[1] in END_STOP_POS or last[0] in END_STOP_WORDS:
                continue
            phrase = [sentence[start + k][0] for k in range(n)]
            # Reject candidates with at least two fewer distinct words than
            # tokens (with max_n == 3: the same word three times in a row).
            if len(set(phrase)) < len(phrase) - 1:
                continue
            phrase = ''.join(phrase)
            if len(phrase) > 3:
                phrases[phrase] += 1
    if num % 1000 == 0:
        # Progress: [sentences processed, distinct phrases, total sentences].
        print([num, len(phrases), len(text_segment)])
    num += 1
end_time = time.time()
print("Generating phrases costs " + str(end_time - start_time) + " seconds!")
# ======================== Select and write phrases ========================
# Keep phrases counted at least MIN_PHRASE_COUNT times that contain none of
# the unwanted substrings, then write them as "<phrase> <count> n" lines.
MIN_PHRASE_COUNT = 40
# A phrase containing any of these substrings is discarded. Some entries are
# already covered by shorter ones ('仍然' by '仍', '亿元'/'亿美元' by '亿'); they
# are kept so the list matches the original condition term for term.
UNWANTED_SUBSTRINGS = (
    '再度', '发生', '近期', '亿', '表明', '都', '面对', '再次', '更为', '强调',
    '相对', '进一步', '更加', '非常', '仍', '仍然', '并未', '尚未', '亿美元', '认为',
    '和', '一季度', '二季度', '三季度', '四季度', '百分点', '亿元',
)
phrases_keep = {
    key: count
    for key, count in phrases.items()
    if count >= MIN_PHRASE_COUNT
    and not any(bad in key for bad in UNWANTED_SUBSTRINGS)
}
print("There are totally " + str(len(phrases_keep)) + " phrases kept!")
phrases_want_write = [key + ' ' + str(value) + ' ' + 'n\n' for key, value in phrases_keep.items()]
# Final phrase dictionary; the context manager closes the file even if the
# write fails.
with open(r"D:\dict\phrases_final.txt", "w", encoding='utf-8') as fo:
    fo.writelines(phrases_want_write)
##################################################################################################
# Build the combined user dictionary: the lines of user_dct_sogou.txt followed
# by the lines of phrases_final.txt, written to user_dct_macro.txt.
# Files are opened with context managers so every handle is closed.
with open(r"D:\dict\user_dct_sogou.txt", encoding='utf-8') as src:
    dct1 = src.readlines()
with open(r"D:\dict\phrases_final.txt", encoding='utf-8') as src:
    dct2 = src.readlines()
# If the first file does not end with a newline, its last entry would be
# glued to the first phrase of the second file; terminate it explicitly.
if dct1 and not dct1[-1].endswith('\n'):
    dct1[-1] += '\n'
dct = dct1 + dct2
with open(r"D:\dict\user_dct_macro.txt", "w", encoding='utf-8') as f:
    f.writelines(dct)
# (web-page residue, not part of the script) 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。