代码拉取完成,页面将自动刷新
package blevejieba
import (
"os"
"regexp"
"strconv"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/yanyiwu/gojieba"
)
// ideographRegexp matches a run of one or more Han-script characters. It is
// compiled once at package load and used by detectTokenType to classify terms.
var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
// JiebaTokenizer is the bleve tokenizer backed by gojieba.
type JiebaTokenizer struct {
// jieba is the gojieba segmenter used when Dotoken does not handle the input.
jieba *gojieba.Jieba
// searchMode is the gojieba tokenize mode passed to every jieba.Tokenize call
// (gojieba.DefaultMode or gojieba.SearchMode, chosen in NewJiebaTokenizer).
searchMode gojieba.TokenizeMode
// useHmm is neither assigned nor read in the visible code; Tokenize passes a
// literal true as the HMM argument instead.
// NOTE(review): confirm whether this field is still needed.
useHmm bool
}
// NewJiebaTokenizer builds a JiebaTokenizer around a gojieba.Jieba instance
// created from the given dictionary, HMM model, user dictionary, IDF and
// stop-word file paths.
//
// When searchMode is true the tokenizer uses gojieba.SearchMode; otherwise it
// uses gojieba.DefaultMode. The returned error is always nil in the current
// implementation.
func NewJiebaTokenizer(dictFilePath, hmm, userDictPath, idfDict, stopDict string, searchMode bool) (analysis.Tokenizer, error) {
	tokenizer := &JiebaTokenizer{
		jieba:      gojieba.NewJieba(dictFilePath, hmm, userDictPath, idfDict, stopDict),
		searchMode: gojieba.DefaultMode,
	}
	if searchMode {
		tokenizer.searchMode = gojieba.SearchMode
	}
	return tokenizer, nil
}
// Tokenize splits input into a bleve token stream.
//
// The text is first handed to Dotoken (defined elsewhere in this package). If
// Dotoken returns a nil word list, the text is segmented by gojieba using the
// tokenizer's configured mode with the HMM argument set to true. Each word
// becomes one token carrying the word's Start/End offsets, a 1-based position
// and a type chosen by detectTokenType.
func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
	text := string(input)

	words, isstr := Dotoken(text)
	if words == nil {
		words = jt.jieba.Tokenize(text, jt.searchMode, true)
	}

	stream := make(analysis.TokenStream, 0, len(words))
	for i, w := range words {
		stream = append(stream, &analysis.Token{
			Term:     []byte(w.Str),
			Start:    w.Start,
			End:      w.End,
			Position: i + 1, // positions are 1-based
			Type:     detectTokenType(w.Str, isstr),
		})
	}
	return stream
}
// JiebaTokenizerConstructor creates a JiebaTokenizer from a bleve config map.
//
// The config may contain the following entries, all optional. An entry that is
// missing or has the wrong type falls back to the listed default:
//
//	dict_path:     path of the dictionary file (default gojieba.DICT_PATH).
//	hmm_path:      path of the HMM model file (default gojieba.HMM_PATH).
//	userdict_path: path of the user dictionary file (default gojieba.USER_DICT_PATH).
//	idf_path:      path of the IDF file (default gojieba.IDF_PATH).
//	stopdict_path: path of the stop-word file (default gojieba.STOP_WORDS_PATH).
//	is_search:     bool, whether to tokenize in search mode (default true).
//
// The cache argument is not used.
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
	analysis.Tokenizer, error) {
	// stringOr returns config[key] when it holds a string, otherwise fallback.
	stringOr := func(key, fallback string) string {
		if value, isString := config[key].(string); isString {
			return value
		}
		return fallback
	}

	useSearchMode := true
	if flag, isBool := config[isSearchKey].(bool); isBool {
		useSearchMode = flag
	}

	return NewJiebaTokenizer(
		stringOr(dictPathKey, gojieba.DICT_PATH),
		stringOr(hmmPathKey, gojieba.HMM_PATH),
		stringOr(userDictPathKey, gojieba.USER_DICT_PATH),
		stringOr(idfDictPathKey, gojieba.IDF_PATH),
		stringOr(stopDictPathKey, gojieba.STOP_WORDS_PATH),
		useSearchMode,
	)
}
// IsExist reports whether os.Stat succeeds for path f. If Stat fails, the
// result is whatever os.IsExist says about the returned error.
func IsExist(f string) bool {
	if _, err := os.Stat(f); err != nil {
		return os.IsExist(err)
	}
	return true
}
// detectTokenType picks the bleve token type for term.
//
// The term is Ideographic when isstr is 1 or when it contains at least one
// Han character. Otherwise it is Numeric if strconv.ParseFloat accepts it as a
// 64-bit float, and AlphaNumeric in every remaining case.
func detectTokenType(term string, isstr int) analysis.TokenType {
	if isstr == 1 || ideographRegexp.MatchString(term) {
		return analysis.Ideographic
	}
	if _, err := strconv.ParseFloat(term, 64); err == nil {
		return analysis.Numeric
	}
	return analysis.AlphaNumeric
}
// init registers JiebaTokenizerConstructor with the bleve registry under Name
// when the package is loaded. Name is not declared in this part of the file.
func init() {
registry.RegisterTokenizer(Name, JiebaTokenizerConstructor)
}
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。