# key-word-extract/debug.py

from jieba import lcut
from os import listdir
from collections import Counter
from math import log
from jieba import cut,set_dictionary as jieba_set_dict,initialize as jieba_init

# Initialize the jieba module: select the dictionary file, then initialize.
jieba_set_dict("stopwords-master/dict.txt")
jieba_init()


# Stop words: every whitespace-separated entry of the stop-word file, followed
# by the blank, tab and newline characters themselves.
with open("stopwords-master/cn_stopwords.txt", encoding="utf-8") as f:
    stopwords_list = f.read().split() + [" ", "\t", "\n"]


class TFIDF:
    """TF-IDF keyword scorer on top of jieba's ``lcut`` tokenisation.

    The IDF table lives on the class (``TFIDF.idf_dict``) and is therefore
    shared by every instance; term frequencies (``tf_dict``) and the ranked
    result (``keywords_list``) belong to the individual instance.
    """

    # word -> inverse document frequency; shared by all instances.
    idf_dict = dict()
    # Number of documents seen by the most recent fit_idf() call. Used to
    # score words that are missing from idf_dict (document frequency 0).
    doc_count = 0

    def __init__(self):
        self.keywords = dict()
        self.tf_dict = dict()
        self.keywords_list = []

    def fit_idf(self, texts):
        """Compute the IDF of every word that occurs in ``texts``.

        Each word gets ``log(N / (df + 1))`` where ``N`` is the number of
        documents and ``df`` the number of documents containing the word.
        Entries are written into the shared ``TFIDF.idf_dict``; entries for
        words from earlier fits that do not occur in ``texts`` are kept.

        Args:
            texts: iterable of document strings.
        """
        doc_freq = Counter()
        doc_total = 0
        for text in texts:
            doc_total += 1
            # A set counts each word once per document, which is exactly the
            # document frequency; one pass replaces the words x docs scan.
            doc_freq.update(set(lcut(text)))

        TFIDF.doc_count = doc_total
        for word, freq in doc_freq.items():
            TFIDF.idf_dict[word] = log(doc_total / (freq + 1))

    def fit_tf(self, text):
        """Compute the relative frequency of every token of ``text``.

        The previous content of ``tf_dict`` is discarded so that words from
        an earlier text cannot leak into the ranking of this one.

        Args:
            text: the document to extract keywords from.
        """
        tokens = lcut(text)
        total = len(tokens)
        # An empty text yields an empty Counter, so no division by zero.
        self.tf_dict = {
            word: count / total for word, count in Counter(tokens).items()
        }

    def calTF_IDF(self):
        """Rank the words of ``tf_dict`` by TF * IDF into ``keywords_list``.

        ``keywords_list`` is rebuilt on every call as a list of
        ``(word, score)`` tuples sorted by descending score, so calling the
        method repeatedly does not duplicate entries. Words missing from the
        IDF table are scored as if they occurred in no fitted document
        (``log(N)``), or with 0.0 when no corpus has been fitted.
        """
        if TFIDF.doc_count > 0:
            unseen_idf = log(TFIDF.doc_count)
        else:
            unseen_idf = 0.0

        scored = [
            (word, tf * TFIDF.idf_dict.get(word, unseen_idf))
            for word, tf in self.tf_dict.items()
        ]
        # list.sort is stable, so ties keep their tf_dict order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        self.keywords_list = scored


class TextRank:
    """Keyword ranking over a word co-occurrence graph.

    Words that appear together inside a sliding window are linked, and the
    scores are then refined iteratively with a damped update.
    """

    # Number of passes over the graph.
    max_iter = 100
    # Size of the sliding co-occurrence window, in tokens.
    window = 9
    # Damping factor of the score update.
    damping = 0.85

    def __init__(self, text, stopwords_list):
        self.text = text
        self.stopwords_list = stopwords_list
        self.keywords = []

    def fit_keywords(self, tokens=None):
        """Rank the words of the text and store the result in ``keywords``.

        ``keywords`` becomes a list of ``(word, score)`` tuples sorted by
        descending score.

        Args:
            tokens: optional pre-tokenised word list. When omitted, the text
                given to the constructor is tokenised with jieba's ``lcut``.
        """
        if tokens is None:
            tokens = lcut(self.text)

        # Filter with the stop words handed to the constructor (not a module
        # global); a set makes each membership test O(1).
        stopwords = set(self.stopwords_list or ())
        kept = [tok for tok in tokens if tok not in stopwords]

        graph = self._build_graph(kept)
        scores = self._rank(graph)
        self.keywords = sorted(
            scores.items(), key=lambda item: item[1], reverse=True
        )

    def _build_graph(self, tokens):
        """Return ``{word: set of words sharing a window with it}``."""
        span = self.window
        graph = {}
        # Texts shorter than the window still get one window covering all of
        # them instead of producing no keywords at all.
        last_start = max(len(tokens) - span, 0)
        for start in range(last_start + 1):
            chunk = tokens[start:start + span]
            members = set(chunk)
            # Walk the chunk in text order so graph keys keep first-seen order.
            for word in chunk:
                graph.setdefault(word, set()).update(members - {word})
        return graph

    def _rank(self, graph):
        """Run ``max_iter`` in-place score updates and return ``{word: score}``."""
        scores = dict.fromkeys(graph, 1.0)
        damping = self.damping
        for _ in range(self.max_iter):
            for word, neighbours in graph.items():
                inflow = 0
                # Links are symmetric, so every neighbour has at least one
                # link of its own and the divisor is never zero.
                for other in neighbours:
                    inflow += scores[other] / len(graph[other])
                scores[word] = (1 - damping) + damping * inflow
        return scores

    def get_n_keywords(self, n):
        """Placeholder: not implemented yet; currently returns None."""
        pass

    def get_n_sentences(self, n):
        """Placeholder: not implemented yet; currently returns None."""
        pass


from tkinter import Tk,StringVar,Label,Entry,Button
from tkinter.filedialog import askopenfiles
from tkinter.simpledialog import askinteger

class MainTk:
    """Tk window that lets the user run TF-IDF or TextRank keyword extraction.

    The window first shows two mode buttons. Choosing a mode replaces them
    with a file-selection button and a red label that shows either the top
    ten keywords or an error message.
    """

    # File-type filter shared by both file-selection dialogs.
    _FILE_TYPES = [('text files', '.txt'), ('pythonfiles', ('.py', '.pyw'))]
    # Encodings tried in order when reading a user-selected file.
    _ENCODINGS = ("utf-8-sig", "gbk")

    def __init__(self):
        self.main_box = Tk()
        self.initComponents()
        self.initFrame()

    def initComponents(self):
        """Create the widgets of the mode-selection screen."""
        self.main_box.title("KeyWord Extract")
        self.choose_button1 = Button(self.main_box, text="TF-IDF")
        self.choose_button2 = Button(self.main_box, text="TextRank")
        # Created here but never placed in the window by this class.
        self.select_path_button = Button(self.main_box, text="路径选择")

    def initFrame(self):
        """Size the window and lay out the two mode buttons."""
        self.main_box.geometry("1000x200")

        # Bind the left mouse button only: a bare "<Button>" also fires for
        # right/middle clicks and for wheel events on X11.
        self.choose_button1.place(x=450, y=30, width=100, height=50)
        self.choose_button1.bind("<Button-1>", self.modeChoose)

        self.choose_button2.place(x=450, y=120, width=100, height=50)
        self.choose_button2.bind("<Button-1>", self.modeChoose)

    def runMyself(self):
        """Enter the Tk main loop."""
        self.main_box.mainloop()

    def removeModeButtons(self):
        """Hide the two mode buttons."""
        self.choose_button1.place_forget()
        self.choose_button2.place_forget()

    def _ask_paths(self):
        """Show the file dialog and return the selected paths as a list."""
        # Local import: askopenfilenames returns plain paths, whereas
        # askopenfiles opens every file and leaves the handles to the caller.
        from tkinter.filedialog import askopenfilenames

        return list(askopenfilenames(filetypes=self._FILE_TYPES))

    def _read_text(self, path):
        """Return the content of ``path``, trying each known encoding in turn.

        Raises:
            OSError: the file cannot be opened or read.
            UnicodeDecodeError: no encoding in ``_ENCODINGS`` fits the file.
        """
        last_error = None
        for encoding in self._ENCODINGS:
            try:
                with open(path, encoding=encoding) as f:
                    return f.read()
            except UnicodeDecodeError as err:
                last_error = err
        raise last_error

    def TextRankInit(self):
        """Build the TextRank screen: result label and file-selection button."""
        self.textrank_var = StringVar()
        self.textrank_label = Label(
            self.main_box, fg="red", textvariable=self.textrank_var
        )
        self.textrank_label.place(x=450, y=175)

        self.textrank_path_button = Button(self.main_box, text="文件路径选择")
        self.textrank_path_button.bind("<Button-1>", self.selectPathTextRank)
        self.textrank_path_button.place(x=450, y=75, width=100, height=50)

    def selectPathTextRank(self, event):
        """Ask for a file and show its top ten TextRank keywords.

        Only the first selected file is processed. Problems (no selection,
        unreadable file) are reported in the result label.
        """
        paths = self._ask_paths()
        if not paths:
            self.textrank_var.set("None File Selected")
            return None

        try:
            text = self._read_text(paths[0])
        except (OSError, UnicodeDecodeError) as err:
            self.textrank_var.set(str(err))
            return None

        classer = TextRank(text, stopwords_list)
        classer.fit_keywords()
        print(classer.keywords[:10])

        showed_arr = [x[0] for x in classer.keywords[:10]]
        self.textrank_var.set(" ".join(showed_arr))

    def selectPathTFIDF(self, event):
        """Ask for a corpus and a file index, then show the top ten keywords.

        All selected files form the corpus for the IDF table. The user is
        then asked for the (0-based) index of the file to extract keywords
        from. Problems are reported in the result label.
        """
        paths = self._ask_paths()
        if not paths:
            self.TFIDF_var.set("None File Selected")
            return None

        try:
            texts = [self._read_text(path) for path in paths]
        except (OSError, UnicodeDecodeError) as err:
            self.TFIDF_var.set(str(err))
            return None

        classer = TFIDF()
        classer.fit_idf(texts)
        print(paths)

        integer = askinteger("Get Index", "输入待提取的文件序号，即第几个文件")
        print(integer)

        # askinteger returns None when the dialog is cancelled.
        if integer is None:
            self.TFIDF_var.set("None Index Entered")
            return None

        try:
            target = texts[integer]
        except IndexError as err:
            self.TFIDF_var.set(str(err))
            return None

        classer.fit_tf(target)
        classer.calTF_IDF()
        print(classer.keywords_list[:10])

        showed_arr = [x[0] for x in classer.keywords_list[:10]]
        self.TFIDF_var.set(" ".join(showed_arr))

    def TFIDFInit(self):
        """Build the TF-IDF screen: result label and file-selection button."""
        self.TFIDF_var = StringVar()

        self.tfidf_label = Label(
            self.main_box, fg="red", textvariable=self.TFIDF_var
        )
        self.tfidf_label.place(x=450, y=150)

        self.TFIDF_path_button = Button(self.main_box, text="文件路径选择")
        self.TFIDF_path_button.bind("<Button-1>", self.selectPathTFIDF)
        self.TFIDF_path_button.place(x=450, y=20, width=100, height=20)

        # TODO: a separate "learn IDF table" button was sketched here at
        # (x=450, y=50) but never enabled.

    def modeChoose(self, event):
        """Switch to the screen that matches the clicked mode button."""
        button = event.widget

        if button['text'] == 'TextRank':
            self.removeModeButtons()
            self.TextRankInit()
        elif button['text'] == 'TF-IDF':
            self.removeModeButtons()
            self.TFIDFInit()

# Commented-out code (appears to be an earlier single-window prototype).
# main_box=Tk()
# main_box.geometry("1000x200")
# # variable: path
# path = StringVar()
# # entry box, label, button
# # the entry box is bound to the variable path
# Button(main_box, text = "路径选择", command = selectPath).grid(row = 0, column = 2)
# main_box.mainloop()

# Build the main window and enter the Tk event loop.
# NOTE(review): this runs whenever the module is executed or imported, since
# there is no `if __name__ == "__main__":` guard — confirm that is intended.
main = MainTk()
main.runMyself()