代码拉取完成,页面将自动刷新
# -*- coding: utf-8 -*-
from pyltp import SentenceSplitter
from bs4 import BeautifulSoup
import os
# Root folder of the raw corpus. Every entry directly inside it is passed to
# os.listdir below, so it is expected to contain only sub-folders.
filepath0 = 'E:\\医疗保险语料库\\医疗保险语料原文\\'
# Prefix for the numbered output files produced later in the script.
outpath = 'E:\\MedicareCorpus\\'
filepath1 = os.listdir(filepath0)
# Full path of every entry found one level below the corpus root, in the
# order os.listdir returns them.
files = [
    filepath0 + subdir + '\\' + name
    for subdir in filepath1
    for name in os.listdir(filepath0 + subdir)
]
# Counter used to build output file names; starts at 1.
k = 1
# Sentence-splitting helper
def sentence_splitter(sentence):
    """Split *sentence* with pyltp's SentenceSplitter and return a list.

    The result of ``SentenceSplitter.split`` is materialized with ``list``
    so callers get an ordinary Python list.
    """
    return list(SentenceSplitter.split(sentence))
# inputpath="E:\\MedicareCorpus"
# outputpath="E:\\医疗保险语料"
# k=1
# inputfiles=os.listdir(inputpath)
# print(inputfiles)
# for file in inputfiles:
# #打开待分句文件
# fname=inputpath+"\\"+file
# f=open(fname,'r',encoding='utf-8')
# #打开输出文件
# opath=outputpath+"\\"+str(k)+".txt"
# outputfile=open(opath,'w',encoding='utf-8')
# for line in f.readlines():
# sents=sentence_splitter(line)
# for sent in sents:
# if len(sent)>1:
# outputfile.write(sent+'\n')
# f.close()
# outputfile.close()
# k=k+1
# Convert each collected source file into a numbered text file under
# `outpath`, one sentence per line.
for file in files:
    try:
        # Open the input in a `with` block so the handle is always closed,
        # including when decoding fails (it was previously left open).
        with open(file, 'r', encoding='utf-8') as fobj:
            bsobj = BeautifulSoup(fobj.read(), "lxml")
        # Keep only the text content of the parsed document.
        s = bsobj.text
        slist = sentence_splitter(s)
        outfile = outpath + str(k) + '.txt'
        with open(outfile, 'w', encoding='utf-8') as f:
            for ss in slist:
                # The length test runs on the unstripped sentence.
                # NOTE(review): a whitespace-only sentence of length >= 2
                # therefore ends up as an empty line - confirm intended.
                if len(ss) > 1:
                    # strip() already removes surrounding newlines, so a
                    # second strip('\n') is unnecessary.
                    f.write(ss.strip() + '\n')
        # Advance the counter only after an output file was written, so
        # inputs that fail to decode do not consume a number.
        k += 1
    except UnicodeDecodeError:
        # Report the path of the file that could not be processed and
        # continue with the next one.
        print(file)
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。