1 Star 24 Fork 13

quke/大学知识图谱

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
question_classifier.py 5.91 KB
一键复制 编辑 原始数据 按行查看 历史
quke 提交于 2022-03-13 20:26 . 修改
#!/usr/bin/env python3
import os
import ahocorasick
# from .config import semantic_slot
# from paddlenlp import Taskflow
from util import logger
from config import object_name_list
from template import semantic_slot
class QuestionClassifier:
    """Extract known entities from a question and map it to question types.

    Entity spotting relies on an automaton (``region_tree``) and a
    word -> entity-type mapping (``wdtype_dict``), both taken from the
    ``searcher`` object.  Question types come from the module-level
    ``semantic_slot`` list, whose entries are read through the keys
    ``'keywords'``, ``'slot_list'`` and ``'question_type'``.
    """

    def __init__(self, searcher):
        """Collect the lookup structures exposed by *searcher*.

        Args:
            searcher: object providing ``get_all_object_name(name)``,
                ``region_tree``, ``wdtype_dict`` and ``g`` (the graph handle
                used to run queries).
        """
        entity_dict = {i: searcher.get_all_object_name(i) for i in object_name_list}
        # Flat list of every known entity name (linear-time flatten).
        self.region_words = [word for names in entity_dict.values() for word in names]
        # The automaton and the word -> types dictionary are prebuilt by the searcher.
        self.region_tree = searcher.region_tree
        self.wdtype_dict = searcher.wdtype_dict
        self.g = searcher.g
        print('model init finished ......')

    def classify(self, question):
        """Return the entities and question types detected in *question*.

        Args:
            question: the question text.

        Returns:
            dict with three keys:
            ``'args'`` - entity name -> list of entity types, after aliases
            have been replaced by canonical names;
            ``'question_types'`` - question types whose keywords appear in the
            question AND whose slot list is empty or shares a type with the
            detected entities;
            ``'may_question_types'`` - question types matched on keywords only.
        """
        found = self.check_medical(question)
        logger.info(f"找到实体为{found}")
        entities = self._resolve_aliases(found)

        # Entity types mentioned anywhere in the question.
        types = set()
        for labels in entities.values():
            types.update(labels or ())

        question_types = []       # keyword and entity type both matched
        may_question_types = []   # keyword matched only
        for slot in semantic_slot:
            # Evaluate the keyword test once; both result lists depend on it.
            if not self.check_words(slot['keywords'], question):
                continue
            slot_list = slot['slot_list']
            if not slot_list or types.intersection(slot_list):
                question_types.append(slot['question_type'])
            may_question_types.append(slot['question_type'])

        return {
            'args': entities,
            'question_types': question_types,
            'may_question_types': may_question_types,
        }

    def _resolve_aliases(self, entities):
        """Return a copy of *entities* with aliases swapped for canonical names.

        University abbreviations (type ``大学简称``) are replaced by the name of
        the university they abbreviate, and city names (type ``城市``) by the
        first city name the graph returns for the pattern ``<name>.?``.  When
        the graph returns no row, the entity is kept unchanged instead of
        raising ``IndexError``.
        """
        # NOTE(review): the entity name is interpolated into the Cypher text.
        # Names come from the entity dictionary, but a name containing a quote
        # would break the query - consider query parameters if the client
        # behind ``self.g`` supports them.
        resolved = dict(entities)
        for name, labels in entities.items():
            labels = labels or ()
            if '大学简称' in labels:
                query = f"match (m:`大学简称`)<-[:`简称`]-(n:`大学`) where m.name='{name}' return n.name"
                field, canonical_type = 'n.name', '大学'
            elif '城市' in labels:
                query = f"match (m:`城市`) where m.name=~'{name}.?' return m.name"
                field, canonical_type = 'm.name', '城市'
            else:
                continue
            names = [row[field] for row in self.g.run(query)]
            if not names:
                continue
            # Drop the alias BEFORE inserting the canonical name: when both are
            # the same string, deleting afterwards would lose the entity.
            resolved.pop(name, None)
            resolved[names[0]] = [canonical_type]
        return resolved

    def build_wdtype_dict(self, entity_dict):
        """Invert ``{entity_type: words}`` into ``{word: [entity_type, ...]}``."""
        wd_dict = {}
        for entity_type, words in entity_dict.items():
            for word in words:
                wd_dict.setdefault(word, []).append(entity_type)
        return wd_dict

    def build_actree(self, wordlist):
        """Build an Aho-Corasick automaton over *wordlist* for fast matching.

        Empty words are skipped; each word is stored with the value
        ``(index, word)``.
        """
        actree = ahocorasick.Automaton()
        for index, word in enumerate(wordlist):
            if not word:
                continue
            actree.add_word(word, (index, word))
        actree.make_automaton()
        return actree

    def check_medical(self, question):
        """Return ``{entity: types}`` for the entities found in *question*.

        A match that is a strict substring of another match is discarded, so
        only the longest names survive.  ``types`` is whatever
        ``self.wdtype_dict`` holds for the word (``None`` if it is absent).
        """
        # Each hit is read as (end_index, (index, word)), the shape stored by
        # build_actree; presumably the searcher's tree uses the same layout.
        region_wds = [hit[1][1] for hit in self.region_tree.iter(question)]
        stop_wds = {
            short
            for short in region_wds
            for other in region_wds
            if short in other and short != other
        }
        return {wd: self.wdtype_dict.get(wd) for wd in region_wds if wd not in stop_wds}

    def check_words(self, wds, sent):
        """Return True if any word of *wds* occurs in *sent*."""
        return any(wd in sent for wd in wds)
# def paddle_ner(self, question):
# ner = Taskflow("ner")
# print(ner)
if __name__ == '__main__':
    # Manual smoke test: classify a single sample question and print the result.
    from neo4j_helper import neo4j_handler

    classifier = QuestionClassifier(neo4j_handler)
    # Other sample questions that can be swapped in:
    #   '河北有什么大学'
    #   '兰大属于双一流吗'
    sample_question = '唐山有啥学校'
    print(classifier.classify(sample_question))
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/qukequke/university-knowledge-map.git
git@gitee.com:qukequke/university-knowledge-map.git
qukequke
university-knowledge-map
大学知识图谱
master

搜索帮助