1 Star 0 Fork 0

C'est La Vie / tip-las

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
Formatting.cpp 8.07 KB
一键复制 编辑 原始数据 按行查看 历史
liyc7711 提交于 2015-11-10 14:18 . changs log
#include "Formatting.h"

#include <cstddef>
#include <string>
#include <utility>
#include <vector>
namespace utility_train
{
// Builds a formatter with its default settings: the placeholder strings
// "<#>" (head) and "</#>" (tail), a single-space delimiter, and a freshly
// heap-allocated Tokenize instance stored in `tokenize`.
Formatting::Formatting()
{
    // Placeholder values used by the POS feature builders in this file when a
    // syllable column has nothing real to put in it.
    this->head = "<#>";
    this->tail = "</#>";

    this->delimiter = " ";

    // Raw owning pointer; the destructor deletes it.
    this->tokenize = new Tokenize();
}
// Frees the Tokenize instance that the constructor allocated with `new`.
Formatting::~Formatting()
{
    delete this->tokenize;
}
// Turns one line of segmented training text into word-segmentation rows.
//
// `text` is split with StringSplit::SplitByTokens using " " and "/" as the
// two separator strings; every resulting word is passed to WSTag, and each
// (unit, tag) pair it produces becomes one two-column row in `features`.
//
// text     : the segmented training line.
// features : output, cleared first; one row {unit, tag} per tagged unit.
// language : forwarded to WSTag unchanged.
// encode   : forwarded to WSTag unchanged.
void Formatting::WsTrain(const std::string& text, std::vector<std::vector<std::string> >& features, const std::string& language, const std::string& encode)
{
    typedef std::vector<std::pair<std::string, std::string> > TaggedUnits;

    features.clear();

    std::string separators[] = { " ", "/" };
    std::vector<std::string> words;
    StringSplit::SplitByTokens(words, text, separators, 2, false);

    for (std::vector<std::string>::const_iterator word = words.begin(); word != words.end(); ++word)
    {
        TaggedUnits tagged;
        WSTag(*word, tagged, language, encode);

        for (TaggedUnits::const_iterator unit = tagged.begin(); unit != tagged.end(); ++unit)
        {
            std::vector<std::string> row;
            row.push_back(unit->first);   // the character / syllable
            row.push_back(unit->second);  // its position tag
            features.push_back(row);
        }
    }
}
/*
*************************************************
Purpose : Build, from test-corpus text, the rows in the format the
          word-segmentation model expects.
Params  : text     - raw text to convert.
          features - output, always cleared; one single-column row per token.
          seg      - output, always cleared; seg[i] is true when a trailing
                     tsheg (U+0F0B, UTF-8 "\xe0\xbc\x8b") was stripped from
                     token i.
          language - only "TB" produces tokens (via Tokenize::TibetanAll);
                     any other value leaves both outputs empty.
          encode   - not used by this function.
Returns : nothing; results are delivered through features and seg.
-------------------------------------------------
Remarks : Both outputs are reset before the empty-text check, so an empty
          input yields empty outputs instead of leaving the previous call's
          rows in place.
-------------------------------------------------
Author  : Li Yachao
Date    : 2013-3-13, 2015-10-24
*************************************************
*/
void Formatting::WsTest(const std::string& text, std::vector<std::vector<std::string> >& features, std::vector<bool> & seg,
const std::string& language, const std::string& encode)
{
    // Reset the outputs unconditionally; callers that reuse the same vectors
    // must never see stale rows after passing an empty line.
    features.clear();
    seg.clear();
    if (text.empty())
    {
        return;
    }

    std::vector<std::string> tokens;
    if (language == "TB")
    {
        Tokenize::TibetanAll(text, tokens);
    }

    for (std::size_t i = 0; i < tokens.size(); ++i)
    {
        std::string& token = tokens[i];
        bool stripped = false;
        // Drop a trailing tsheg (3 bytes in UTF-8), but keep a token that
        // consists of nothing else so the row is never empty.
        if (StringOperation::IsPostfix(token, "\xe0\xbc\x8b") && token.size() > 3)
        {
            token.erase(token.size() - 3);
            stripped = true;
        }
        seg.push_back(stripped);

        std::vector<std::string> row;
        row.push_back(token);
        features.push_back(row);
    }
}
// Appends the four syllable-context columns shared by PosTrainS and PosTestS
// to `row`, in this order:
//   1-2. current word : its first two syllables when it has more than two,
//                       its only syllable twice when it has exactly one,
//                       otherwise headMark followed by tailMark;
//   3.   left word    : its last syllable when it has more than one syllable,
//                       otherwise tailMark;
//   4.   right word   : its last syllable when it has more than one syllable,
//                       otherwise headMark.
// NOTE(review): a two-syllable current word falls through to the placeholders
// and a one-syllable neighbour is replaced by a placeholder. That looks
// unintended (">= 2" / ">= 1" and the FIRST syllable of the right word may
// have been meant), but the thresholds are kept exactly as they were because
// changing them changes the feature set existing models were trained on.
static void AppendSyllableColumns(const std::vector<std::string>& current,
                                  const std::vector<std::string>& left,
                                  const std::vector<std::string>& right,
                                  const std::string& headMark,
                                  const std::string& tailMark,
                                  std::vector<std::string>& row)
{
    /* current syllables */
    if (current.size() > 2)
    {
        row.push_back(current[0]);
        row.push_back(current[1]);
    }
    else if (current.size() == 1)
    {
        row.push_back(current[0]);
        row.push_back(current[0]);
    }
    else
    {
        row.push_back(headMark);
        row.push_back(tailMark);
    }

    /* left syllables */
    if (left.size() > 1)
    {
        row.push_back(left.back());
    }
    else
    {
        row.push_back(tailMark);
    }

    /* right syllables */
    if (right.size() > 1)
    {
        row.push_back(right.back());
    }
    else
    {
        row.push_back(headMark);
    }
}

// Builds POS training rows with syllable-context columns.
//
// `text` is parsed by POSExtract::ParsePOS into (word, tag) pairs. Each row
// is: word, the four columns from AppendSyllableColumns, then the tag.
//
// text     : one POS-annotated training line.
// features : output, cleared first; one six-column row per parsed token.
//
// Fixes relative to the previous version:
//  * a trailing tsheg (U+0F0B, UTF-8 "\xe0\xbc\x8b") is stripped only when
//    the word is longer than the tsheg itself, matching PosTestS, so a bare
//    tsheg token no longer becomes an empty word column;
//  * the neighbour buffers are reset when a token has no left/right
//    neighbour, so the last token no longer inherits right-context syllables
//    left over from the previous iteration.
void Formatting::PosTrainS(const std::string& text, std::vector<std::vector<std::string> >& features)
{
    features.clear();

    std::vector<std::pair<std::string, std::string> > tokens;
    std::string tmp = text; // ParsePOS is handed a local copy, not the const input
    POSExtract::ParsePOS(tmp, tokens);
    const std::size_t count = tokens.size();

    // Pass 1: normalise every word before any of them is used as context.
    for (std::size_t i = 0; i < count; ++i)
    {
        std::string& word = tokens[i].first;
        if (StringOperation::IsPostfix(word, "\xe0\xbc\x8b") && word.size() > 3)
        {
            word.erase(word.size() - 3);
        }
    }

    // Pass 2: emit one row per token. The buffers are reused across iterations.
    std::vector<std::string> sc; // current
    std::vector<std::string> sl; // left
    std::vector<std::string> sr; // right
    for (std::size_t i = 0; i < count; ++i)
    {
        std::vector<std::string> f;
        f.push_back(tokens[i].first);

        Tokenize::TibetanAll(tokens[i].first, sc);
        if (i > 0)
        {
            Tokenize::TibetanAll(tokens[i - 1].first, sl);
        }
        else
        {
            sl.clear(); // no left neighbour
        }
        if (i + 1 < count)
        {
            Tokenize::TibetanAll(tokens[i + 1].first, sr);
        }
        else
        {
            sr.clear(); // no right neighbour: drop the stale previous contents
        }

        AppendSyllableColumns(sc, sl, sr, head, tail, f);

        f.push_back(tokens[i].second); // full POS tag as the label column
        //f.push_back(tokens[i].second.substr(0, 1)); // alternative: first character of the tag only
        features.push_back(f);
    }
}

// Builds POS test rows with the same syllable-context columns as PosTrainS
// (minus the tag column).
//
// `text` is split with StringSplit::SplitByTokens using " " and "/" as the
// two separator strings. Each row is: word, then the four columns from
// AppendSyllableColumns.
//
// text     : one segmented line to be tagged.
// features : output, cleared first; one five-column row per word.
// seg      : output, cleared first; seg[i] is true when a trailing tsheg was
//            stripped from word i.
//
// Fixes relative to the previous version:
//  * `seg` is cleared together with `features`, so the two stay index-aligned
//    when the caller reuses the vectors;
//  * all words are stripped before any feature is built, so the right
//    neighbour is tokenised in the same stripped form PosTrainS uses;
//  * the neighbour buffers are reset when a word has no left/right neighbour,
//    so the last word no longer inherits stale right-context syllables.
void Formatting::PosTestS(const std::string& text, std::vector<std::vector<std::string> >& features, std::vector<bool>& seg)
{
    features.clear();
    seg.clear();

    std::string segs[] = { " ", "/" };
    std::vector<std::string> words;
    StringSplit::SplitByTokens(words, text, segs, 2, false);
    const std::size_t count = words.size();

    // Pass 1: strip trailing tshegs and record which words were changed.
    for (std::size_t i = 0; i < count; ++i)
    {
        std::string& word = words[i];
        bool stripped = false;
        if (StringOperation::IsPostfix(word, "\xe0\xbc\x8b") && word.size() > 3)
        {
            word.erase(word.size() - 3);
            stripped = true;
        }
        seg.push_back(stripped);
    }

    // Pass 2: emit one row per word. The buffers are reused across iterations.
    std::vector<std::string> sc; // current
    std::vector<std::string> sl; // left
    std::vector<std::string> sr; // right
    for (std::size_t i = 0; i < count; ++i)
    {
        std::vector<std::string> f;
        f.push_back(words[i]);

        Tokenize::TibetanAll(words[i], sc);
        if (i > 0)
        {
            Tokenize::TibetanAll(words[i - 1], sl);
        }
        else
        {
            sl.clear(); // no left neighbour
        }
        if (i + 1 < count)
        {
            Tokenize::TibetanAll(words[i + 1], sr);
        }
        else
        {
            sr.clear(); // no right neighbour: drop the stale previous contents
        }

        AppendSyllableColumns(sc, sl, sr, head, tail, f);
        features.push_back(f);
    }
}
// Builds plain POS training rows: one single-column row per parsed token,
// holding the word only.
//
// `text` is parsed by POSExtract::ParsePOS into (word, tag) pairs; a trailing
// tsheg (U+0F0B, UTF-8 "\xe0\xbc\x8b") is removed from each word.
//
// text     : one POS-annotated training line.
// features : output, cleared first; one row {word} per parsed token.
//
// Fix relative to the previous version: the tsheg is stripped only when the
// word is longer than the tsheg itself, the same guard PosTest applies, so a
// bare tsheg token is kept instead of turning into an empty string.
//
// NOTE(review): the parsed tag is never written to the row (the only line
// that pushed it is commented out below), so these "training" rows carry no
// label column - confirm whether the caller adds it or whether this is an
// omission.
void Formatting::PosTrain(const std::string& text, std::vector<std::vector<std::string> >& features)
{
    features.clear();

    std::vector<std::pair<std::string, std::string> > tokens;
    std::string tmp = text; // ParsePOS is handed a local copy, not the const input
    POSExtract::ParsePOS(tmp, tokens);

    for (std::size_t i = 0; i < tokens.size(); ++i)
    {
        std::string& word = tokens[i].first;
        if (StringOperation::IsPostfix(word, "\xe0\xbc\x8b") && word.size() > 3)
        {
            word.erase(word.size() - 3);
        }

        std::vector<std::string> f;
        f.push_back(word);
        //f.push_back(tokens[i].second.substr(0, 1));
        features.push_back(f);
    }
}
// Builds plain POS test rows: one single-column row per word.
//
// `text` is split with StringSplit::SplitByTokens using " " and "/" as the
// two separator strings. A trailing tsheg (U+0F0B, UTF-8 "\xe0\xbc\x8b") is
// removed from a word when the word is longer than the tsheg itself.
//
// text     : one segmented line to be tagged.
// features : output, cleared first; one row {word} per word.
// seg      : output, cleared first; seg[i] is true when a tsheg was stripped
//            from word i.
//
// Fix relative to the previous version: `seg` is now cleared together with
// `features` (as WsTest already does). Before, flags from earlier calls
// stayed in the vector, so seg[i] no longer described features[i] once the
// caller reused it.
void Formatting::PosTest(const std::string& text, std::vector<std::vector<std::string> >& features, std::vector<bool>& seg)
{
    features.clear();
    seg.clear();

    std::string segs[] = { " ", "/" };
    std::vector<std::string> words;
    StringSplit::SplitByTokens(words, text, segs, 2, false);

    for (std::size_t i = 0; i < words.size(); ++i)
    {
        std::string& word = words[i];
        bool stripped = false;
        if (StringOperation::IsPostfix(word, "\xe0\xbc\x8b") && word.size() > 3)
        {
            word.erase(word.size() - 3);
            stripped = true;
        }
        seg.push_back(stripped);

        std::vector<std::string> f;
        f.push_back(word);
        features.push_back(f);
    }
}
/*
*************************************************
Purpose : Tag every character / syllable of one word with its position
          inside that word.
Params  : word     - the word to split and tag.
          tags     - output, cleared first; one (unit, tag) pair per unit.
          language - "TB"  : split with Tokenize::TibetanAll and strip a
                             trailing tsheg (UTF-8 "\xe0\xbc\x8b") from every
                             unit longer than the tsheg itself;
                     "CHS" : split with Tokenize::Chinese_ANSI when code is
                             "ANSI", otherwise Tokenize::Chinese_UTF8;
                     anything else leaves tags empty.
          code     - encoding selector, only consulted for "CHS".
Returns : nothing; results are delivered through tags.
-------------------------------------------------
Remarks : Tags are "S" for the only unit of a one-unit word, otherwise "B"
          for the first unit, "E" for the last and "M" for every unit in
          between. A word that yields no units yields no tags.
-------------------------------------------------
Author  : Li Yachao
Date    : 2013-3-13, 2015-10-26
*************************************************
*/
void Formatting::WSTag(const std::string &word, std::vector<std::pair<std::string, std::string> > &tags, const std::string &language,const std::string& code)
{
    typedef std::vector<std::string>::size_type Index;

    tags.clear();

    std::vector<std::string> units;
    if (language == "TB")
    {
        Tokenize::TibetanAll(word, units);
        for (std::vector<std::string>::iterator unit = units.begin(); unit != units.end(); ++unit)
        {
            // The tsheg occupies three bytes; a unit that is nothing but a
            // tsheg is left untouched.
            if (StringOperation::IsPostfix(*unit, "\xe0\xbc\x8b") && unit->size() > 3)
            {
                unit->erase(unit->size() - 3);
            }
        }
    }
    else if (language == "CHS")
    {
        if (code == "ANSI")
        {
            Tokenize::Chinese_ANSI(word, units);
        }
        else
        {
            Tokenize::Chinese_UTF8(word, units);
        }
    }
    else
    {
        return; // unsupported language: tags stays empty
    }

    const Index count = units.size();
    for (Index i = 0; i < count; ++i)
    {
        const char* label = "M";
        if (count == 1)
        {
            label = "S";
        }
        else if (i == 0)
        {
            label = "B";
        }
        else if (i + 1 == count)
        {
            label = "E";
        }
        tags.push_back(std::pair<std::string, std::string>(units[i], label));
    }
}
}
C++
1
https://gitee.com/linkerr/tip-las.git
git@gitee.com:linkerr/tip-las.git
linkerr
tip-las
tip-las
master

搜索帮助