master

分支 (1)

管理

管理

master

NLP2019
/
02Phrases.py

# -*- coding: utf-8 -*-
import pandas as pd
import time
from collections import defaultdict
import jieba.posseg as jp, jieba
import re
######## Load Data
# Reads the 78 Excel workbooks, keeps the articles whose title contains at
# least one "real" word, then strips noise characters from each article body
# and splits it into punctuation-delimited fragments stored in `texts`.

# jieba.posseg flags treated as noise in a title: a row is kept only if its
# title has at least one token whose flag is outside this set.
_TITLE_NOISE_FLAGS = ('eng', 'm', 'x')

# Characters/strings removed from every article body before splitting.
# Raw string avoids invalid escape sequences such as '\['; the caret is
# escaped because a bare '^' alternative only matches the empty string at the
# start of the text and therefore never removed any '^' character.
_STRIP_RE = re.compile(r'\.|-| |\t|_|%|％|\[|\]|/|\^|\n|的|个|是|兄|也|了|只')

# Delimiters on which a body is cut into fragments.
_SPLIT_RE = re.compile(r'[；;。！？…，,：:、》《~\?\.“”""（）\(\)\t\n]')

texts = []
dates_lst = []
text = ""
# NOTE(review): the first date of workbook 1 is appended before the loop, so
# dates_lst holds one more element than there are kept articles. This looks
# like a leftover from an earlier per-date aggregation; kept as-is because
# dates_lst is module-level state.
date = list(pd.read_excel(r'D:\MACRO\macro'+str(1)+'.0.xls',sheet_name='Sheet').loc[:,'Date'])[0]
dates_lst.append(date)
jieba.load_userdict(r"D:\dict\user_dct_sogou.txt")
for k in range(1, 79):
    df = pd.read_excel(r'D:\MACRO\macro'+str(k)+'.0.xls',sheet_name='Sheet')
    dates = list(df.loc[:, 'Date'])
    # Rows are addressed by label n, which assumes the default 0..len-1 index
    # produced by read_excel.
    for n, date in enumerate(dates):
        # Rows without a textual date are skipped.
        if not isinstance(date, str):
            continue
        title = df.loc[n, 'Title']
        # An empty (NaN) or numeric title cannot contain a real word.
        if not isinstance(title, str):
            continue
        has_real_word = any(
            word.flag not in _TITLE_NOISE_FLAGS
            for word in jp.cut(title.replace(" ", ""))
        )
        if not has_real_word:
            continue
        text = df.loc[n, 'Content']
        # An empty (NaN) body cannot be cleaned or split below.
        if not isinstance(text, str):
            continue
        texts.append(text)
        dates_lst.append(date)

# Clean every body, split it into fragments and flatten into a single list.
texts = [
    fragment
    for body in texts
    for fragment in _SPLIT_RE.split(_STRIP_RE.sub('', body))
]
print("Loading data finished!")

def lengeq2(x):
    """Return True when *x* has a length of two or more, False otherwise."""
    return not len(x) < 2
# Discard fragments shorter than two characters.
texts = [fragment for fragment in texts if lengeq2(fragment)]

# NOTE(review): texts_new is not read anywhere in the visible part of the script.
texts_new = []
def cutwords(x):
    """Tell whether a (word, flag) token should be kept.

    A token is rejected when its flag is 'm', when the word is a percent sign
    (ASCII '%' or full-width '％'), or when it is a single-character token
    flagged 'eng'. Every other token is kept.
    """
    flag = x[1]
    if flag == 'm':
        return False
    word = x[0]
    if word == '%' or word == '％':
        return False
    if flag == 'eng' and len(word) == 1:
        return False
    return True
# Tag every fragment with jieba's part-of-speech segmenter, drop the tokens
# rejected by cutwords(), and collect the fragments that still hold at least
# two tokens. Each entry of `texts` is replaced in place by its token list.
text_segment = []
fragment_total = len(texts)
for d, fragment in enumerate(texts):
    # Progress report every 500 fragments.
    if d % 500 == 0:
        print([d, fragment_total])
    tokens = [tuple(pair) for pair in jp.cut(fragment)]
    tokens = [token for token in tokens if cutwords(token)]
    texts[d] = tokens
    if len(tokens) > 1:
        text_segment.append(tokens)


# Part-of-speech flags (as returned by jieba.posseg) that may not END a phrase.
_END_FLAGS = frozenset([
    'u', 'uz', 'ud', 'uj', 'uv', 'ul', 'm', 'c', 'p', 'zg', 'r', 'x', 't', 'f', 'ad',
])
# A phrase may not START with any of those flags either, nor with 'q'.
_START_FLAGS = _END_FLAGS | frozenset(['q'])

# Words that may not START a phrase.
_START_WORDS = frozenset([
    '经', '出现', '增势', '小幅', '大幅度', '增加', '情况', '显示', '低于', '高于',
    '慢于', '快于', '稍有', '略有', '占比', '增量', '高出', '连续', '降速', '增速',
    '好', '月份', '下降', '降幅', '不断', '大大', '大幅', '间', '将', '将会',
    '将要', '带来', '和', '对', '比例', '同比', '明显', '占', '增长', '增幅',
    '显著', '个', '应', '应该', '性', '较', '与', '同时', '继续', '做好',
    '看', '得到', '依旧', '依然', '成', '变为', '变成', '称为', '成为', '还是',
    '还会', '还有', '相当', '相比', '也', '已', '已经', '引起', '引发', '又',
    '大概', '约', '大约', '再次', '可', '不可', '均', '这类', '这个', '这',
    '注', '表', '图', '截至', '能', '不能', '可以', '不可以', '至', '到',
    '自从', '自', '从', '这次', '中', '上', '是', '其', '主要', '包括',
    '为', '有所', '不可能', '可能', '不要', '没有', '不会', '不是', '会', '有',
    '要', '。', '、', '“', '”', '？', '：', '！', '，', '；',
])

# Words that may not END a phrase.
_END_WORDS = frozenset([
    '由', '经', '出现', '逐年', '逐月', '逐步', '渐渐', '逐渐', '还', '占比',
    '趋', '受', '显示', '总体', '迅速', '月份', '进入', '先', '累计', '持续',
    '较为', '作为', '在', '带来', '越来越', '仍然', '原因', '兑', '应', '应该',
    '平均', '均', '非', '总', '较', '与', '同时', '和', '对', '继续',
    '做好', '不', '开始', '看', '一定', '引起', '引发', '已', '已经', '得到',
    '起', '依旧', '依然', '成', '变为', '变成', '称为', '成为', '还是', '还会',
    '还有', '稍有', '略有', '达到', '相当', '相比', '也', '同比', '又', '尚待',
    '用于', '大概', '约', '大约', '产生', '可', '不可', '注', '表', '图',
    '一般', '截至', '能', '不能', '可以', '不可以', '至', '到', '自从', '自',
    '将', '从', '这次', '中', '上', '是', '其', '占', '主要', '包括',
    '为', '有所', '不可能', '可能', '不要', '没有', '不会', '不是', '会', '有',
    '要', '。', '、', '“', '”', '？', '：', '！', '，', '；',
])


def _count_phrases(sentences, max_len, progress_every=1000):
    """Count candidate phrases made of 2..max_len consecutive tokens.

    Args:
        sentences: list of token lists; every token is a (word, flag) pair.
        max_len: largest number of consecutive tokens joined into a phrase.
        progress_every: print a progress line after this many sentences.

    Returns:
        defaultdict(int) mapping the joined phrase text to its number of
        occurrences. Only phrases longer than three characters are counted.
    """
    counts = defaultdict(int)
    total = len(sentences)
    for num, sentence in enumerate(sentences, 1):
        size = len(sentence)
        for start, (word, pos) in enumerate(sentence):
            # Reject n-grams that would start with a blocked flag or word.
            if pos in _START_FLAGS or word in _START_WORDS:
                continue
            for n in range(2, min(size - start, max_len) + 1):
                last_word, last_pos = sentence[start + n - 1]
                # Reject n-grams that would end with a blocked flag or word.
                if last_pos in _END_FLAGS or last_word in _END_WORDS:
                    continue
                words = [token[0] for token in sentence[start:start + n]]
                # Repetition guard: for n <= 3 this only triggers when all
                # three tokens are the same word.
                if len(set(words)) < len(words) - 1:
                    continue
                phrase = ''.join(words)
                if len(phrase) > 3:
                    counts[phrase] += 1
        if num % progress_every == 0:
            print([num, len(counts), total])
    return counts


texts = text_segment
max_n = 3
start_time = time.time()
phrases = _count_phrases(text_segment, max_n)
end_time = time.time()
print("Generating phrases costs " +str(end_time - start_time) + " seconds!")

# A candidate phrase is dropped when it contains any of these substrings.
_unwanted_parts = (
    '再度', '发生', '近期', '亿', '表明', '都', '面对', '再次', '更为',
    '强调', '相对', '进一步', '更加', '非常', '仍', '仍然', '并未', '尚未',
    '亿美元', '认为', '和', '一季度', '二季度', '三季度', '四季度', '百分点',
    '亿元',
)

# Keep the phrases seen at least 40 times that contain no unwanted substring.
phrases_keep = {}
for key, count in phrases.items():
    if count < 40:
        continue
    if any(part in key for part in _unwanted_parts):
        continue
    phrases_keep[key] = count
print("There are totally " + str(len(phrases_keep)) + " phrases kept!")


# Final phrase dictionary: one "<phrase> <count> n" line per kept phrase.
# The trailing 'n' is presumably the part-of-speech tag expected by jieba's
# user-dictionary format -- TODO confirm.
phrases_want_write = [key + ' ' + str(value) + ' ' + 'n\n' for key, value in phrases_keep.items()]
# The with-block closes the file even when writing fails.
with open(r"D:\dict\phrases_final.txt", "w", encoding = 'utf-8') as fo:
    fo.writelines(phrases_want_write)
##################################################################################################

# Build the combined dictionary: the base user dictionary followed by the
# phrases written to phrases_final.txt. Files are opened through `with` so
# that every handle is closed, including the two read handles.
with open(r"D:\dict\user_dct_sogou.txt", encoding = 'utf-8') as src:
    dct1 = src.readlines()
with open(r"D:\dict\phrases_final.txt", encoding = 'utf-8') as src:
    dct2 = src.readlines()
# If the base dictionary does not end with a newline, its last entry would be
# glued to the first mined phrase on the same line; terminate it explicitly.
if dct1 and not dct1[-1].endswith('\n'):
    dct1[-1] += '\n'
dct = dct1 + dct2
with open(r"D:\dict\user_dct_macro.txt", "w", encoding = 'utf-8') as f:
    f.writelines(dct)