2 Star 2 Fork 0

Likelihood-Lab/NLP2019

加入 Gitee
与超过 1400万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
02Phrases.py 8.01 KB
一键复制 编辑 原始数据 按行查看 历史
嘉文文文 提交于 2019-08-25 16:37 +08:00 . codes
# -*- coding: utf-8 -*-
import pandas as pd
import time
from collections import defaultdict
import jieba.posseg as jp, jieba
import re
# ------------------------------------------------------------------ Load data
# Reads the macro workbooks, keeps the 'Content' of every usable row, strips
# noise characters and splits the articles into short fragments.
# Result: `texts` is a flat list of text fragments, `dates_lst` a list of dates.

# jieba POS flags that disqualify a title when it consists of nothing else
# (presumably English / numeral / non-word tokens - see jieba's tag list).
_NON_TEXT_FLAGS = ('eng', 'm', 'x')

# Characters and filler words deleted from every article before splitting.
# Same pattern as before, now a raw string so that `\[` and `\]` are no longer
# invalid string escapes.
# NOTE(review): the bare `^` alternative is an anchor and deletes nothing; a
# literal caret was probably intended. Left unchanged to keep the output stable.
# NOTE(review): duplicated alternatives such as `%|%` look like a full-width /
# half-width pair whose difference has been lost - confirm against the repo.
_NOISE_RE = re.compile(r'\.|-| |\t|_|%|%|\[|\]|/|^|\n|的|个|是|兄|也|了|只')

# Punctuation on which an article is cut into fragments.
_SPLIT_RE = re.compile(r'[;;。!?…,,::、》《~\?\.“”""()\(\)\t\n]')

texts = []
dates_lst = []
text = ""

# Seed dates_lst with the first 'Date' cell of the first workbook.
# NOTE(review): because of this seed, dates_lst holds one entry more than the
# number of articles kept below; behaviour preserved from the original script.
date = list(pd.read_excel(r'D:\MACRO\macro' + str(1) + '.0.xls',
                          sheet_name='Sheet').loc[:, 'Date'])[0]
dates_lst.append(date)

# The user dictionary must be loaded before any title is segmented.
jieba.load_userdict(r"D:\dict\user_dct_sogou.txt")

for k in range(1, 79):
    df = pd.read_excel(r'D:\MACRO\macro' + str(k) + '.0.xls', sheet_name='Sheet')
    dates = list(df.loc[:, 'Date'])
    for n, date in enumerate(dates):
        # Rows whose date cell is not text are skipped.
        if not isinstance(date, str):
            continue
        title = df.loc[n, 'Title']
        content = df.loc[n, 'Content']
        # Empty cells come back as NaN (a float); they cannot be segmented or
        # cleaned, so skip the row instead of crashing on it.
        if not isinstance(title, str) or not isinstance(content, str):
            continue
        # Keep the row only if the title has at least one token whose flag is
        # outside _NON_TEXT_FLAGS; any() stops at the first such token.
        if not any(word.flag not in _NON_TEXT_FLAGS
                   for word in jp.cut(title.replace(" ", ""))):
            continue
        text = content
        texts.append(text)
        dates_lst.append(date)

# Strip the noise first, then cut every article at punctuation and flatten the
# per-article fragment lists into one list.
texts = [
    fragment
    for article in texts
    for fragment in _SPLIT_RE.split(_NOISE_RE.sub('', article))
]
print("Loading data finished!")
def lengeq2(x) -> bool:
    """Return True when *x* contains at least two items (characters for a str).

    Used as a ``filter`` predicate to drop empty and one-character fragments.
    """
    return len(x) >= 2
# Discard every fragment that lengeq2() rejects (fewer than two characters).
texts = [fragment for fragment in texts if lengeq2(fragment)]
# Starts out empty; no later use is visible in this part of the file.
texts_new = list()
def cutwords(x) -> bool:
    """Return True when the (word, flag) pair *x* should be kept.

    A pair is dropped when its flag is ``'m'``, when the word is ``'%'``, or
    when it is a single-character token flagged ``'eng'``.

    NOTE(review): the original compared the word against ``'%'`` twice; the
    second comparison may once have been a full-width percent sign - confirm.
    """
    word, flag = x[0], x[1]
    if flag == 'm' or word == '%':
        return False
    return not (flag == 'eng' and len(word) == 1)
# Segment every fragment with jieba's POS tagger. Each fragment in `texts` is
# replaced in place by its list of (word, flag) tuples that pass cutwords();
# fragments that still hold more than one token are collected in text_segment.
text_segment = []
for d, fragment in enumerate(texts):
    # Progress report every 500 fragments.
    if d % 500 == 0:
        print([d, len(texts)])
    tokens = [pair for pair in map(tuple, jp.cut(fragment)) if cutwords(pair)]
    texts[d] = tokens
    if len(tokens) > 1:
        text_segment.append(tokens)
# ------------------------------------------------------------ Generate phrases
# Counts every run of 2..max_n consecutive tokens that passes the start / end
# filters below. The filter tables are built once as frozensets instead of
# being rebuilt as list literals on every inner-loop iteration.
# NOTE(review): the loop nesting was reconstructed from a copy of the script
# whose indentation was lost - confirm against the repository version.

# POS flags that may not start a phrase.
_START_POS = frozenset(['q','zg','u','uz','ud','uj','uv','ul','x','c','u','p','t','r','ad','m','f'])

# Words that may not start a phrase: the first-word list of the original gate
# plus the additional entries of the original `phrase[0]` check.
_START_WORDS = frozenset(['经','间','带来','和','对','比例','同比','明显','占','个','应','应该','性','较','与','同时','和','继续','做好','看','得到','依旧','依然','成','变为','变成','称为','成为','还是','还会','还有','相当','相比','也','已','已经','引起','引发','又','大概','约','大约','再次','可','不可','均','这类','这个','这','注','表','图','截至','能','不能','可以','不可以','至','到','自从','自','将','从','这次','到','中','上','是','其','占','主要','包括','中','为','有所','不可能','可能','不要','没有','不会','不是','是','会','有','要','占','。','、','“','”','?',':','!',',',';']) | frozenset(['出现','增势','小幅','大幅度','增加','情况','显示','低于','高于','慢于','快于','稍有','略有','占比','增量','高出','连续','降速','增速','好','月份','下降','降幅','不断','大大','大幅','将会','将要','增长','增幅','显著'])

# POS flags that may not end a phrase.
_END_POS = frozenset(['u','uz','ud','uj','uv','ul','m','c','p','zg','r','x','t','f','ad'])

# Words that may not end a phrase: the last-word list of the original gate
# plus '由', the only extra entry of the original `phrase[-1]` check.
_END_WORDS = frozenset(['经','出现','逐年','逐月','逐步','渐渐','逐渐','还','占比','趋','受','显示','总体','迅速','月份','进入','先','累计','持续','先','较为','作为','在','带来','越来越','仍然','原因','兑','应','应该','平均','均','非','总','较','与','同时','和','对','继续','做好','不','开始','看','一定','引起','引发','已','已经','得到','起','依旧','依然','成','变为','变成','称为','成为','还是','还会','还有','稍有','略有','达到','相当','相比','也','同比','又','已','已经','尚待','用于','大概','约','大约','产生','可','不可','注','表','图','一般','截至','能','不能','可以','不可以','至','到','自从','自','将','从','这次','到','中','上','是','其','占','主要','包括','中','为','有所','不可能','可能','不要','没有','不会','不是','是','会','有','要','占','。','、','“','”','?',':','!',',',';']) | frozenset(['由'])


def _candidate_phrases(sentence, max_n):
    """Yield every phrase string of 2..max_n tokens found in *sentence*.

    *sentence* is a list of (word, flag) tuples. A candidate is rejected when
    its first or last token is excluded by the tables above, when it repeats
    words so that ``len(set(words)) < len(words) - 1``, or when the joined
    string is not longer than three characters.
    """
    for start, token in enumerate(sentence):
        word, pos = token[0], token[1]
        if pos in _START_POS or word in _START_WORDS:
            continue
        # Longest phrase that still fits between `start` and the sentence end.
        longest = min(len(sentence) - start, max_n)
        for n in range(2, longest + 1):
            last = sentence[start + n - 1]
            if last[1] in _END_POS or last[0] in _END_WORDS:
                continue
            words = [tok[0] for tok in sentence[start:start + n]]
            if len(set(words)) < len(words) - 1:
                continue
            phrase = ''.join(words)
            if len(phrase) > 3:
                yield phrase


phrases = defaultdict(int)
texts = text_segment
max_n = 3
start_time = time.time()
for num, sentence in enumerate(text_segment, 1):
    for phrase in _candidate_phrases(sentence, max_n):
        phrases[phrase] += 1
    # Progress report every 1000 sentences.
    if num % 1000 == 0:
        print([num, len(phrases), len(text_segment)])
end_time = time.time()
print("Generating phrases costs " + str(end_time - start_time) + " seconds!")
# ---------------------------------------------------------- Keep and write out
# A phrase is kept when it was counted at least 40 times and contains none of
# the substrings below.
_BANNED_SUBSTRINGS = (
    '再度', '发生', '近期', '亿', '表明', '都', '面对', '再次', '更为', '强调',
    '相对', '进一步', '更加', '非常', '仍', '仍然', '并未', '尚未', '亿美元',
    '认为', '和', '一季度', '二季度', '三季度', '四季度', '百分点', '亿元',
)

phrases_keep = {
    key: count
    for key, count in phrases.items()
    if count >= 40 and not any(banned in key for banned in _BANNED_SUBSTRINGS)
}
print("There are totally " + str(len(phrases_keep)) + " phrases kept!")

# One "<phrase> <count> n" line per kept phrase.
phrases_want_write = [key + ' ' + str(value) + ' ' + 'n\n' for key, value in phrases_keep.items()]

# Final phrase dictionary; the context manager closes the file even if the
# write fails.
with open(r"D:\dict\phrases_final.txt", "w", encoding='utf-8') as fo:
    fo.writelines(phrases_want_write)
##################################################################################################
# Concatenate the Sogou user dictionary and the generated phrase dictionary
# into one combined user dictionary file.
with open(r"D:\dict\user_dct_sogou.txt", encoding='utf-8') as src:
    dct1 = src.readlines()
with open(r"D:\dict\phrases_final.txt", encoding='utf-8') as src:
    dct2 = src.readlines()

# If the first file does not end with a newline, its last entry would be fused
# with the first phrase line; terminate it explicitly.
if dct1 and not dct1[-1].endswith('\n'):
    dct1[-1] += '\n'

dct = dct1 + dct2
with open(r"D:\dict\user_dct_macro.txt", "w", encoding='utf-8') as f:
    f.writelines(dct)
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/likelihoodlab/NLP2019.git
git@gitee.com:likelihoodlab/NLP2019.git
likelihoodlab
NLP2019
NLP2019
master

搜索帮助