1 Star 0 Fork 0

xiezhx / KeyWordExtract

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
debug.py 8.07 KB
一键复制 编辑 原始数据 按行查看 历史
xiezhx 提交于 2021-05-08 17:14 . 1
from jieba import lcut
from os import listdir
from collections import Counter
from math import log
from jieba import cut,set_dictionary as jieba_set_dict,initialize as jieba_init
# Initialise jieba with the dictionary shipped next to the stop-word lists.
jieba_set_dict("stopwords-master/dict.txt")
jieba_init()

# Load the Chinese stop words (whitespace-separated in the file) and also
# treat the whitespace tokens themselves as stop words.
with open("stopwords-master/cn_stopwords.txt", encoding="utf-8") as f:
    stopwords_list = f.read().split()
stopwords_list.extend([" ", "\t", "\n"])
class TFIDF:
    """TF-IDF keyword scorer on top of jieba word segmentation.

    The IDF table lives on the class (``TFIDF.idf_dict``), so one fitted
    corpus is shared by every instance. Term frequencies and the ranked
    result are stored per instance.
    """

    # word -> inverse document frequency, shared by all instances.
    idf_dict = dict()
    # Number of documents seen by the most recent fit_idf() call; used to
    # score words that do not occur in the fitted corpus.
    doc_count = 0

    def __init__(self):
        self.keywords = dict()
        self.tf_dict = dict()
        self.keywords_list = []

    def fit_idf(self, texts):
        """Build the shared IDF table from an iterable of document strings.

        Each word gets ``log(N / (df + 1))`` where ``N`` is the number of
        documents and ``df`` the number of documents containing the word.
        The table is rebuilt from scratch on every call so values computed
        against a previous corpus (with a different ``N``) do not linger.
        """
        doc_freq = Counter()
        doc_total = 0
        for text in texts:
            # A set makes every word count at most once per document.
            doc_freq.update(set(lcut(text)))
            doc_total += 1

        # Clear in place so existing references to the table stay valid.
        TFIDF.idf_dict.clear()
        TFIDF.doc_count = doc_total
        for word, freq in doc_freq.items():
            TFIDF.idf_dict[word] = log(doc_total / (freq + 1))

    def fit_tf(self, text):
        """Compute relative term frequencies of ``text`` into ``self.tf_dict``.

        The previous content of ``self.tf_dict`` is replaced, so the instance
        can be reused for another text without stale entries.
        """
        tokens = lcut(text)
        total = len(tokens)
        # An empty token list yields an empty Counter, so no division by zero.
        self.tf_dict = {
            word: count / total for word, count in Counter(tokens).items()
        }

    def calTF_IDF(self):
        """Rank the words of ``self.tf_dict`` by TF * IDF, highest first.

        The result is stored in ``self.keywords_list`` as ``(word, score)``
        tuples. The list is rebuilt on each call, so calling the method
        repeatedly does not duplicate entries.
        """
        # A word absent from the fitted corpus has document frequency 0, so
        # the IDF formula gives log(N / 1); without a fitted corpus use 0.
        if TFIDF.doc_count > 0:
            unseen_idf = log(TFIDF.doc_count)
        else:
            unseen_idf = 0.0

        scored = [
            (word, tf * TFIDF.idf_dict.get(word, unseen_idf))
            for word, tf in self.tf_dict.items()
        ]
        self.keywords_list = sorted(scored, key=lambda pair: pair[1], reverse=True)
class TextRank:
    """TextRank keyword extractor over a word co-occurrence graph.

    Words that appear together inside a sliding window of ``window`` tokens
    are linked; scores are then propagated along the links for ``max_iter``
    rounds with damping factor ``damping``.
    """

    max_iter = 100
    # Size (in tokens) of the sliding co-occurrence window.
    window = 9
    # Damping factor of the score-propagation formula.
    damping = 0.85

    def __init__(self, text, stopwords_list):
        self.text = text
        self.stopwords_list = stopwords_list
        self.keywords = []

    def fit_keywords(self):
        """Rank the words of ``self.text`` and store them in ``self.keywords``.

        ``self.keywords`` becomes a list of ``(word, score)`` tuples sorted by
        descending score. Words contained in ``self.stopwords_list`` are
        removed before the graph is built.
        """
        # Use the stop words given to the constructor (not the module global)
        # and turn them into a set for O(1) membership tests.
        stopwords = set(self.stopwords_list or ())
        tokens = [word for word in lcut(self.text) if word not in stopwords]

        # Build the undirected co-occurrence graph. max(1, ...) guarantees a
        # single window even when the text is shorter than the window size.
        span = self.window
        neighbours = {}
        for start in range(max(1, len(tokens) - span + 1)):
            chunk = tokens[start:start + span]
            members = set(chunk)
            for word in chunk:
                neighbours.setdefault(word, set()).update(members - {word})

        # Propagate scores in place. Links are symmetric, so every word that
        # is reachable as a neighbour has at least one neighbour itself and
        # the division below cannot be by zero.
        scores = dict.fromkeys(neighbours, 1.0)
        damping = self.damping
        for _ in range(self.max_iter):
            for word, linked in neighbours.items():
                incoming = sum(
                    scores[other] / len(neighbours[other]) for other in linked
                )
                scores[word] = (1 - damping) + damping * incoming

        self.keywords = sorted(
            scores.items(), key=lambda item: item[1], reverse=True
        )

    def get_n_keywords(self, n):
        """Placeholder: not implemented yet, always returns ``None``."""
        pass

    def get_n_sentences(self, n):
        """Placeholder: not implemented yet, always returns ``None``."""
        pass
from tkinter import Tk,StringVar,Label,Entry,Button
from tkinter.filedialog import askopenfiles
from tkinter.simpledialog import askinteger
class MainTk:
    """Tkinter window that lets the user pick TF-IDF or TextRank and run it.

    The window first shows two mode buttons. After a mode is chosen the
    buttons are hidden and replaced by a file-selection button plus a red
    label that displays the ten best keywords (or an error message).
    """

    # File types offered by the file-selection dialogs.
    _FILE_TYPES = [('text files', '.txt'), ('pythonfiles', ('.py', '.pyw'))]

    def __init__(self):
        self.main_box = Tk()
        self.initComponents()
        self.initFrame()

    def initComponents(self):
        """Create the widgets of the mode-selection screen."""
        self.main_box.title("KeyWord Extract")
        self.choose_button1 = Button(self.main_box, text="TF-IDF")
        self.choose_button2 = Button(self.main_box, text="TextRank")
        self.select_path_button = Button(self.main_box, text="路径选择")

    def initFrame(self):
        """Size the window, place the two mode buttons and wire their clicks."""
        self.main_box.geometry("1000x200")
        self.choose_button1.place(x=450, y=30, width=100, height=50)
        self.choose_button1.bind("<Button>", self.modeChoose)
        self.choose_button2.place(x=450, y=120, width=100, height=50)
        self.choose_button2.bind("<Button>", self.modeChoose)

    def runMyself(self):
        """Enter the Tk main loop; returns when the window is closed."""
        self.main_box.mainloop()

    def removeModeButtons(self):
        """Hide the two mode-selection buttons."""
        self.choose_button1.place_forget()
        self.choose_button2.place_forget()

    @staticmethod
    def _ask_paths():
        """Show the multi-file dialog and return the selected paths as a list.

        ``askopenfiles`` hands back already-opened file objects; only their
        names are needed, so each one is closed right away instead of being
        leaked. A cancelled dialog yields an empty list.
        """
        handles = askopenfiles(filetypes=MainTk._FILE_TYPES)
        paths = []
        for handle in handles or ():
            paths.append(handle.name)
            handle.close()
        return paths

    @staticmethod
    def _read_text(path):
        """Return the content of ``path``, decoded as UTF-8 or, failing that, GBK.

        UTF-8 is tried first because GBK-encoded Chinese text is almost never
        valid UTF-8, whereas the reverse attempt would often succeed silently
        and produce garbage.
        """
        try:
            with open(path, encoding="utf-8") as f:
                return f.read()
        except UnicodeDecodeError:
            with open(path, encoding="gbk") as f:
                return f.read()

    def TextRankInit(self):
        """Build the TextRank screen: result label and file-selection button."""
        self.textrank_var = StringVar()
        self.textrank_label = Label(
            self.main_box, fg="red", textvariable=self.textrank_var
        )
        self.textrank_label.place(x=450, y=175)
        self.textrank_path_button = Button(self.main_box, text="文件路径选择")
        self.textrank_path_button.bind("<Button>", self.selectPathTextRank)
        self.textrank_path_button.place(x=450, y=75, width=100, height=50)

    def selectPathTextRank(self, event):
        """Let the user pick a file and show its ten best TextRank keywords.

        Only the first selected file is analysed. Problems (nothing selected,
        unreadable file) are reported in the result label.
        """
        paths = self._ask_paths()
        if not paths:
            self.textrank_var.set("None File Selected")
            return None
        try:
            text = self._read_text(paths[0])
        except (OSError, UnicodeDecodeError) as e:
            self.textrank_var.set(str(e))
            return None
        classer = TextRank(text, stopwords_list)
        classer.fit_keywords()
        print(classer.keywords[:10])
        showed_arr = [word for word, _ in classer.keywords[:10]]
        self.textrank_var.set(" ".join(showed_arr))

    def selectPathTFIDF(self, event):
        """Let the user pick a corpus and show TF-IDF keywords of one file.

        All selected files are used to fit the IDF table. The user is then
        asked for the index into the selection (as used by Python list
        indexing) of the file whose keywords should be extracted. Problems
        are reported in the result label.
        """
        paths = self._ask_paths()
        if not paths:
            self.TFIDF_var.set("None File Selected")
            return None
        try:
            texts = [self._read_text(path) for path in paths]
        except (OSError, UnicodeDecodeError) as e:
            self.TFIDF_var.set(str(e))
            return None
        classer = TFIDF()
        classer.fit_idf(texts)
        print(paths)
        integer = askinteger("Get Index", "输入待提取的文件序号,即第几个文件")
        print(integer)
        try:
            # TypeError: dialog cancelled (None); IndexError: out of range.
            chosen = texts[integer]
        except (TypeError, IndexError) as e:
            self.TFIDF_var.set(str(e))
            return
        classer.fit_tf(chosen)
        classer.calTF_IDF()
        print(classer.keywords_list[:10])
        showed_arr = [word for word, _ in classer.keywords_list[:10]]
        self.TFIDF_var.set(" ".join(showed_arr))

    def TFIDFInit(self):
        """Build the TF-IDF screen: result label and file-selection button."""
        self.TFIDF_var = StringVar()
        self.tfidf_label = Label(
            self.main_box, fg="red", textvariable=self.TFIDF_var
        )
        self.tfidf_label.place(x=450, y=150)
        self.TFIDF_path_button = Button(self.main_box, text="文件路径选择")
        self.TFIDF_path_button.bind("<Button>", self.selectPathTFIDF)
        self.TFIDF_path_button.place(x=450, y=20, width=100, height=20)

    def modeChoose(self, event):
        """Switch to the screen matching the caption of the clicked button."""
        caption = event.widget['text']
        if caption == 'TextRank':
            self.removeModeButtons()
            self.TextRankInit()
        elif caption == 'TF-IDF':
            self.removeModeButtons()
            self.TFIDFInit()
# main_box=Tk()
# main_box.geometry("1000x200")
# #变量path
# path = StringVar()
# #输入框,标记,按键
# #输入框绑定变量path
# Button(main_box, text = "路径选择", command = selectPath).grid(row = 0, column = 2)
# main_box.mainloop()
if __name__ == "__main__":
    # Open the window only when this file is executed as a script, so that
    # importing the module for its classes does not start the GUI.
    main = MainTk()
    main.runMyself()
1
https://gitee.com/xiezhx9/key-word-extract.git
git@gitee.com:xiezhx9/key-word-extract.git
xiezhx9
key-word-extract
KeyWordExtract
master

搜索帮助