1 Star 0 Fork 0

beatlesoasis / magazine_ref_count

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
text_processor.py 3.25 KB
一键复制 编辑 原始数据 按行查看 历史
beatlesoasis 提交于 2022-01-27 19:12 . save working prograss
#!/usr/bin/python3
import os
import argparse
import yaml
import references_slicer
import references_classifier
class Configure(object):
    """Settings consumed by the reference-classification entry points.

    ``input_dir``, ``input_file`` and ``output_file`` start out empty and are
    not touched by :meth:`parse_config`; whoever creates the instance assigns
    them. The remaining attributes are filled from the ``configure`` section
    of a YAML file.
    """

    def __init__(self):
        # Paths: never read from the YAML file by this class.
        self.input_dir = ""
        self.input_file = ""
        self.output_file = ""
        # Values below are overwritten by parse_config().
        self.file_extension = ""
        self.ref_start_expr = ""
        self.ref_end_expr = ""
        self.ref_line_head_expr = ""
        self.ref_line_tail_expr = ""
        self.similarity_threshold = 0.0

    def parse_config(self, config_path: str):
        """Load the YAML file at *config_path* into this instance.

        The file must contain a top-level ``configure`` mapping with the keys
        ``file_extension``, ``ref_start_expr``, ``ref_end_expr``,
        ``ref_line_head_expr``, ``ref_line_tail_expr`` and
        ``similarity_threshold``.

        Raises:
            OSError: if *config_path* cannot be opened.
            KeyError: if ``configure`` or one of the keys above is missing.
        """
        # The context manager closes the handle even when parsing fails;
        # previously the file object was opened and never closed.
        with open(config_path, "rb") as stream:
            # NOTE(review): FullLoader can build more than plain data types.
            # If the config file may come from an untrusted source, switch to
            # yaml.safe_load().
            document = yaml.load(stream, Loader=yaml.FullLoader)

        section = document["configure"]
        self.file_extension = section["file_extension"]
        self.ref_start_expr = section["ref_start_expr"]
        self.ref_end_expr = section["ref_end_expr"]
        self.ref_line_head_expr = section["ref_line_head_expr"]
        self.ref_line_tail_expr = section["ref_line_tail_expr"]
        self.similarity_threshold = section["similarity_threshold"]
        return
def classify_references_file(config: Configure):
    """Process the single file named by ``config.input_file``.

    A ``RefSlicer`` is built from the four reference expressions in *config*,
    opened on the input file and asked to find all references. The resulting
    reference list is handed to a ``RefClassifier`` created with
    ``config.similarity_threshold``, which then exports to
    ``config.output_file``.
    """
    slicer_args = (
        config.ref_start_expr,
        config.ref_end_expr,
        config.ref_line_head_expr,
        config.ref_line_tail_expr,
    )
    ref_slicer = references_slicer.RefSlicer(*slicer_args)
    ref_slicer.open(config.input_file)
    ref_slicer.find_all_refs()

    ref_classifier = references_classifier.RefClassifier(
        config.similarity_threshold
    )
    ref_classifier.refs_classify(ref_slicer.get_ref_list())
    ref_classifier.export(config.output_file)
def classify_references_dir(config: Configure):
    """Process every matching file found under ``config.input_dir``.

    The directory tree is walked bottom-up. Each file whose extension
    (without the leading dot) equals ``config.file_extension`` is run through
    one shared ``RefSlicer``, which is reset before each file, and its
    reference list is fed to one shared ``RefClassifier``. The classifier
    exports once to ``config.output_file`` after the walk finishes.
    """
    ref_classifier = references_classifier.RefClassifier(
        config.similarity_threshold
    )
    ref_slicer = references_slicer.RefSlicer(
        config.ref_start_expr,
        config.ref_end_expr,
        config.ref_line_head_expr,
        config.ref_line_tail_expr,
    )

    for dirpath, _, filenames in os.walk(config.input_dir, topdown=False):
        for filename in filenames:
            _, dotted_ext = os.path.splitext(filename)
            # Drop the leading "." so "txt" in the config matches "a.txt".
            if dotted_ext[1:] == config.file_extension:
                ref_slicer.reset()
                ref_slicer.open(os.path.join(dirpath, filename))
                ref_slicer.find_all_refs()
                ref_classifier.refs_classify(ref_slicer.get_ref_list())

    ref_classifier.export(config.output_file)
if __name__ == "__main__":
    # Command-line entry point: read the options, load the YAML config, then
    # classify either one file (-f) or a whole directory tree (-d).

    # Imported locally so this block does not depend on a file-level import.
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("-c", dest="config_file", help="configure file. default: config.yml", default="./config.yml")
    parser.add_argument("-f", dest="input_file", help="the file want to classfy.", default="")
    parser.add_argument("-d", dest="input_dir", help="the directory want to classfy.", default="./")
    parser.add_argument("-o", dest="output_file", help="the file want to output.", default="./ref_count.xlsx")
    args = parser.parse_args()

    conf = Configure()
    conf.parse_config(args.config_file)
    conf.input_dir = args.input_dir
    conf.input_file = args.input_file
    conf.output_file = args.output_file

    # A non-empty -f wins over -d; otherwise the directory (default "./")
    # is walked.
    if conf.input_file != "":
        classify_references_file(conf)
    else:
        classify_references_dir(conf)

    # sys.exit() instead of the bare exit(): the latter is injected by the
    # site module for interactive use and is not guaranteed to exist
    # (e.g. under "python -S" or in frozen builds).
    sys.exit(0)
Python
1
https://gitee.com/beatlesoasis/magazine_ref_count.git
git@gitee.com:beatlesoasis/magazine_ref_count.git
beatlesoasis
magazine_ref_count
magazine_ref_count
master

搜索帮助