blevek (https://gitee.com/lkaycn/blevek.git): tokenizer.go, 3.06 KB
Committed by kaycn on 2020-06-06 11:07 +08:00: fix bug
package blevejieba

import (
	"os"
	"regexp"
	"strconv"

	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/registry"
	"github.com/yanyiwu/gojieba"
)

var ideographRegexp = regexp.MustCompile(`\p{Han}+`)

// JiebaTokenizer is the bleve tokenizer backed by gojieba.
type JiebaTokenizer struct {
	jieba      *gojieba.Jieba
	searchMode gojieba.TokenizeMode
	useHmm     bool
}

// NewJiebaTokenizer builds a tokenizer from the given dictionary file paths:
// the main dictionary, the HMM model, the user dictionary, the IDF dictionary
// and the stop-word dictionary. searchMode selects gojieba's search-engine cut mode.
func NewJiebaTokenizer(dictFilePath, hmm, userDictPath, idfDict, stopDict string, searchMode bool) (analysis.Tokenizer, error) {
	jieba := gojieba.NewJieba(dictFilePath, hmm, userDictPath, idfDict, stopDict)
	mode := gojieba.DefaultMode
	if searchMode {
		mode = gojieba.SearchMode
	}
	return &JiebaTokenizer{
		jieba:      jieba,
		searchMode: mode,
		useHmm:     true, // preserve the original behaviour of always enabling the HMM
	}, nil
}
// Tokenize cuts the input into a bleve token stream. It first tries the
// package's Dotoken helper; when that yields no words it falls back to gojieba.
func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0)
	pos := 1
	tokenword, isstr := Dotoken(string(input))
	if tokenword == nil {
		tokenword = jt.jieba.Tokenize(string(input), jt.searchMode, jt.useHmm)
	}
	for _, word := range tokenword {
		token := analysis.Token{
			Term:     []byte(word.Str),
			Start:    word.Start,
			End:      word.End,
			Position: pos,
			Type:     detectTokenType(word.Str, isstr),
		}
		rv = append(rv, &token)
		pos++
	}
	return rv
}
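
// A minimal usage sketch (not part of the original file): build a tokenizer
// on gojieba's bundled dictionaries and tokenize a short Chinese sentence.
// The sample sentence is arbitrary; any UTF-8 text works.
func exampleTokenizeSentence() (analysis.TokenStream, error) {
	tok, err := NewJiebaTokenizer(
		gojieba.DICT_PATH,       // main dictionary
		gojieba.HMM_PATH,        // HMM model
		gojieba.USER_DICT_PATH,  // user dictionary
		gojieba.IDF_PATH,        // IDF dictionary
		gojieba.STOP_WORDS_PATH, // stop words
		true,                    // search-engine cut mode
	)
	if err != nil {
		return nil, err
	}
	return tok.Tokenize([]byte("我爱北京天安门")), nil
}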
/*
JiebaTokenizerConstructor creates a JiebaTokenizer.
The config map may contain the following keys:
dict_path: optional, the path of the main dictionary file.
hmm_path: optional, the path of the HMM model file.
userdict_path: optional, the path of the user dictionary file.
idf_path: optional, the path of the IDF dictionary file.
stopdict_path: optional, the path of the stop-word dictionary file.
is_search: optional, whether to use search-engine cut mode (defaults to true).
Any missing key falls back to the corresponding gojieba default.
*/
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
	analysis.Tokenizer, error) {
	dictFilePath, ok := config[dictPathKey].(string)
	if !ok {
		dictFilePath = gojieba.DICT_PATH
	}
	hmm, ok := config[hmmPathKey].(string)
	if !ok {
		hmm = gojieba.HMM_PATH
	}
	userDictPath, ok := config[userDictPathKey].(string)
	if !ok {
		userDictPath = gojieba.USER_DICT_PATH
	}
	stopDict, ok := config[stopDictPathKey].(string)
	if !ok {
		stopDict = gojieba.STOP_WORDS_PATH
	}
	idfDict, ok := config[idfDictPathKey].(string)
	if !ok {
		idfDict = gojieba.IDF_PATH
	}
	searchMode, ok := config[isSearchKey].(bool)
	if !ok {
		searchMode = true
	}
	return NewJiebaTokenizer(dictFilePath, hmm, userDictPath, idfDict, stopDict, searchMode)
}
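
// A small sketch (not in the original file): call the constructor the way
// bleve's registry would, overriding only the search-mode flag and letting
// every path fall back to gojieba's defaults. The key constants (isSearchKey
// and friends) are defined elsewhere in this package; the cache argument is
// unused by the constructor, so nil is fine here.
func exampleConstructFromConfig() (analysis.Tokenizer, error) {
	config := map[string]interface{}{
		isSearchKey: false, // exact cut mode instead of search-engine mode
	}
	return JiebaTokenizerConstructor(config, nil)
}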
// IsExist reports whether the file at path f exists.
func IsExist(f string) bool {
	_, err := os.Stat(f)
	return err == nil || os.IsExist(err)
}

// detectTokenType classifies a term: pre-tokenized strings (isstr == 1) and
// terms containing Han characters are Ideographic, parseable numbers are
// Numeric, and everything else is AlphaNumeric.
func detectTokenType(term string, isstr int) analysis.TokenType {
	if isstr == 1 {
		return analysis.Ideographic
	}
	if ideographRegexp.MatchString(term) {
		return analysis.Ideographic
	}
	_, err := strconv.ParseFloat(term, 64)
	if err == nil {
		return analysis.Numeric
	}
	return analysis.AlphaNumeric
}
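
// A quick sanity sketch (not in the original file) showing how the
// classification above falls out for a few sample terms.
func exampleTokenTypes() []analysis.TokenType {
	return []analysis.TokenType{
		detectTokenType("北京", 0),   // Ideographic: matches \p{Han}+
		detectTokenType("3.14", 0),  // Numeric: parses as a float
		detectTokenType("bleve", 0), // AlphaNumeric: the fallback
	}
}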
func init() {
	registry.RegisterTokenizer(Name, JiebaTokenizerConstructor)
}
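
// The sketch below (not part of the original file) shows how calling code
// might wire this tokenizer into a bleve index mapping. The import path is
// assumed from the repository URL, the analyzer and tokenizer names
// ("jieba_zh", "jieba_tok") are arbitrary, and the "type" value must match
// the package-level constant Name that init registers above.
/*
	import (
		"github.com/blevesearch/bleve"
		"github.com/blevesearch/bleve/analysis/analyzer/custom"

		blevejieba "gitee.com/lkaycn/blevek" // assumed import path
	)

	func buildIndex(indexPath string) (bleve.Index, error) {
		im := bleve.NewIndexMapping()
		// Register a tokenizer instance; every key is optional and falls
		// back to gojieba's bundled dictionaries.
		err := im.AddCustomTokenizer("jieba_tok", map[string]interface{}{
			"type":      blevejieba.Name,
			"is_search": true,
		})
		if err != nil {
			return nil, err
		}
		// Wrap it in a custom analyzer and make it the index default.
		err = im.AddCustomAnalyzer("jieba_zh", map[string]interface{}{
			"type":      custom.Name,
			"tokenizer": "jieba_tok",
		})
		if err != nil {
			return nil, err
		}
		im.DefaultAnalyzer = "jieba_zh"
		return bleve.New(indexPath, im)
	}
*/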