From 83094fceebd4795de5dc60e6322185774203d15b Mon Sep 17 00:00:00 2001 From: yangwentong <425822674@qq.com> Date: Mon, 19 Sep 2022 09:31:06 +0800 Subject: [PATCH] =?UTF-8?q?filter=E4=B8=AD=E5=AD=97=E7=AC=A6=E5=A4=A7?= =?UTF-8?q?=E5=B0=8F=E5=86=99=E9=97=AE=E9=A2=98=E5=B7=B2=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- re2/filtered_re2.cc | 65 ++++++++++++++++++++++++++++++++ re2/testing/filtered_re2_test.cc | 36 ++++++++---------- 2 files changed, 81 insertions(+), 20 deletions(-) diff --git a/re2/filtered_re2.cc b/re2/filtered_re2.cc index 827b1ba..8c42ce4 100644 --- a/re2/filtered_re2.cc +++ b/re2/filtered_re2.cc @@ -234,12 +234,46 @@ std::vector Group_multiple_selection(std::string str, int start_poi * 1. 标准ASCII * 2. 非标准ASCII 如希腊字母 */ + void UpperToLower(std::string &str, int start_post, int end_post) { // 标准ASCII转小写 transform(str.begin(), str.end(), str.begin(), ::tolower); } +void HandleCharacterCase(std::string &str) +{ + std::map m = {{"\u0391", "\u03B1"}, {"\u0392", "\u03B2"}, {"\u0393", "\u03B3"}, + {"\u0394", "\u03B4"}, {"\u0395", "\u03B5"}, {"\u0396", "\u03B6"}, + {"\u0397", "\u03B7"}, {"\u0398", "\u03B8"}, {"\u0399", "\u03B9"}, + {"\u039A", "\u03BA"}, {"\u039B", "\u03BB"}, {"\u039C", "\u03BC"}, + {"\u039D", "\u03BD"}, {"\u039E", "\u03BE"}, {"\u039F", "\u03BF"}, + {"\u03A0", "\u03C0"}, {"\u03A1", "\u03C1"}, {"\u03A2", "\u03C2"}, + {"\u03A3", "\u03C3"}, {"\u03A4", "\u03C4"}, {"\u03A5", "\u03C5"}, + {"\u03A6", "\u03C6"}, {"\u03A7", "\u03C7"}, {"\u03A8", "\u03C8"}, + {"\u03A9", "\u03C9"}}; + for(size_t i = 0; i < str.length(); i += 2) + { + std::string subStr = str.substr(i, 2); + if(m.count(subStr) > 0) + { + str.replace(i, 2, m[subStr]); + continue; + } + else if(subStr == "ϖ") + { + str.replace(i, 2, "π"); + continue; + } + else if(subStr == "ς") + { + str.replace(i, 2, "σ"); + continue; + } + } +} + + bool JudgeIsCharOrNumber(char x) { if ((x >= 'a' && x <= 'z') || (x >= 0 && x <= 9)) @@ -247,6 +281,20 @@ bool JudgeIsCharOrNumber(char x) return false; } +bool JudedIsGreekAlphabet(std::string str) +{ + std::vector vec_alphabet = {"\u03B1", "\u03B2", "\u03B3", "\u03B4", "\u03B5", + "\u03B6", "\u03B7", "\u03B8", "\u03B9", "\u03BA", + "\u03BB", "\u03BC", "\u03BD", "\u03BE", "\u03BF", + "\u03C0", "\u03C1", "\u03C2", "\u03C3", "\u03C4", + "\u03C5", "\u03C6", "\u03C7", "\u03C8", "\u03C9"}; + for(auto x : vec_alphabet) + { + if(x == str) return true; + } + return false; +} + std::vector MyCompile(std::string str) { std::vector my_atoms; // 最终得到的所有atoms @@ -254,10 +302,22 @@ std::vector MyCompile(std::string str) std::vector vec_con; std::vector atoms_tmp; std::string atoms_tmp_string; + std::string subStr; // 将字符串中的大写字符变为小写 UpperToLower(str, 0, str.size()); + HandleCharacterCase(str); for (size_t i = 0; i < str.length(); i++) { + // 处理希腊字母 + subStr.clear(); + subStr = str.substr(i, 2); + if(JudedIsGreekAlphabet(subStr)) + { + ++i; + atoms_tmp_string += subStr; + continue; + } + // 处理括号分组 if (str[i] == '(') { @@ -372,6 +432,11 @@ std::vector MyCompile(std::string str) } } + if(int(str[i]) < 0) + { + atoms_tmp_string += str[i]; + continue; + } } if(vec_atoms_tmp.size() > 0) { diff --git a/re2/testing/filtered_re2_test.cc b/re2/testing/filtered_re2_test.cc index e3c8c94..a2da296 100644 --- a/re2/testing/filtered_re2_test.cc +++ b/re2/testing/filtered_re2_test.cc @@ -61,10 +61,8 @@ TEST(FilteredRE2Test, SmallLatinTest) { v.opts.set_encoding(RE2::Options::EncodingLatin1); v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id); v.f.Compile(&v.atoms); - /* Compile处理十六进制字符串问题 EXPECT_EQ(1, v.atoms.size()); EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef"); - */ v.atom_indices.push_back(0); v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches); EXPECT_EQ(1, v.matches.size()); @@ -131,9 +129,7 @@ AtomTest atom_tests[] = { "xbcdea", "xbcdeb", "ybcdea", "ybcdeb" } - }, - /* Compile 处理non-ASCII编码的字符串的大小写问题 - { + },{ // Test upper/lower of non-ASCII. "UnicodeLower", { "(?i)ΔδΠϖπΣςσ", @@ -145,7 +141,6 @@ AtomTest atom_tests[] = { "ψρστυ", }, }, - */ }; void AddRegexpsAndCompile(const char* regexps[], @@ -281,20 +276,21 @@ TEST(FilteredRE2Test, MatchTests) { v.f.AllMatches(text, atom_ids, &matching_regexps); EXPECT_EQ(2, matching_regexps.size()); } - -// TEST(FilteredRE2Test, EmptyStringInStringSetBug) { -// // Bug due to find() finding "" at the start of everything in a string -// // set and thus SimplifyStringSet() would end up erasing everything. -// // In order to test this, we have to keep PrefilterTree from discarding -// // the OR entirely, so we have to make the minimum atom length zero. - -// FilterTestVars v(0); // override the minimum atom length -// const char* regexps[] = {"-R.+(|ADD=;AA){12}}"}; -// const char* atoms[] = {"", "-r", "add=;aa", "}"}; -// AddRegexpsAndCompile(regexps, arraysize(regexps), &v); -// EXPECT_TRUE(CheckExpectedAtoms(atoms, arraysize(atoms), -// "EmptyStringInStringSetBug", &v)); -// } +/* +TEST(FilteredRE2Test, EmptyStringInStringSetBug) { + // Bug due to find() finding "" at the start of everything in a string + // set and thus SimplifyStringSet() would end up erasing everything. + // In order to test this, we have to keep PrefilterTree from discarding + // the OR entirely, so we have to make the minimum atom length zero. + + FilterTestVars v(0); // override the minimum atom length + const char* regexps[] = {"-R.+(|ADD=;AA){12}}"}; + const char* atoms[] = {"", "-r", "add=;aa", "}"}; + AddRegexpsAndCompile(regexps, arraysize(regexps), &v); + EXPECT_TRUE(CheckExpectedAtoms(atoms, arraysize(atoms), + "EmptyStringInStringSetBug", &v)); +} +*/ TEST(FilteredRE2Test, MoveSemantics) { FilterTestVars v1; -- Gitee