From 83094fceebd4795de5dc60e6322185774203d15b Mon Sep 17 00:00:00 2001
From: yangwentong <425822674@qq.com>
Date: Mon, 19 Sep 2022 09:31:06 +0800
Subject: [PATCH] =?UTF-8?q?filter=E4=B8=AD=E5=AD=97=E7=AC=A6=E5=A4=A7?=
 =?UTF-8?q?=E5=B0=8F=E5=86=99=E9=97=AE=E9=A2=98=E5=B7=B2=E5=A4=84=E7=90=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 re2/filtered_re2.cc              | 65 ++++++++++++++++++++++++++++++++
 re2/testing/filtered_re2_test.cc | 36 ++++++++----------
 2 files changed, 81 insertions(+), 20 deletions(-)
diff --git a/re2/filtered_re2.cc b/re2/filtered_re2.cc
index 827b1ba..8c42ce4 100644
--- a/re2/filtered_re2.cc
+++ b/re2/filtered_re2.cc
@@ -234,12 +234,46 @@ std::vector<std::string> Group_multiple_selection(std::string str, int start_poi
  * 1. 标准ASCII
  * 2. 非标准ASCII  如希腊字母
  */
+
 void UpperToLower(std::string &str, int start_post, int end_post)
 {
   // 标准ASCII转小写
   transform(str.begin(), str.end(), str.begin(), ::tolower);
 }
 
+// Lower-cases Greek letters inside a UTF-8 string, in place:
+//   - capitals U+0391..U+03A9 map to U+03B1..U+03C9 (the unassigned U+03A2 is
+//     left untouched);
+//   - the variant forms U+03D6 (pi symbol) and U+03C2 (final sigma) map to
+//     U+03C0 (pi) and U+03C3 (sigma).
+// The scan is byte-wise and keys on the lead bytes 0xCE/0xCF, so ASCII or other
+// multi-byte characters mixed into the text cannot throw it out of step.
+// Each replacement is two bytes for two bytes; the string length never changes.
+void HandleCharacterCase(std::string &str)
+{
+  for (size_t i = 0; i + 1 < str.length(); ++i)
+  {
+    const unsigned lead = static_cast<unsigned char>(str[i]);
+    const unsigned trail = static_cast<unsigned char>(str[i + 1]);
+    // U+0380..U+03FF is exactly "0xCE or 0xCF, then one continuation byte".
+    if ((lead != 0xCE && lead != 0xCF) || (trail & 0xC0) != 0x80)
+      continue;
+
+    unsigned cp = ((lead & 0x1F) << 6) | (trail & 0x3F);
+    if (cp >= 0x0391 && cp <= 0x03A9 && cp != 0x03A2)
+      cp += 0x20;        // capital -> small, a fixed offset in this block
+    else if (cp == 0x03D6)
+      cp = 0x03C0;       // pi symbol -> pi
+    else if (cp == 0x03C2)
+      cp = 0x03C3;       // final sigma -> sigma
+
+    str[i] = static_cast<char>(0xC0 | (cp >> 6));
+    str[i + 1] = static_cast<char>(0x80 | (cp & 0x3F));
+    ++i;                 // step over the continuation byte just handled
+  }
+}
+
+
 bool JudgeIsCharOrNumber(char x)
 {
   if ((x >= 'a' && x <= 'z') || (x >= 0 && x <= 9))
@@ -247,6 +281,20 @@ bool JudgeIsCharOrNumber(char x)
   return false;
 }
 
+// Returns true when `str` is exactly one small Greek letter, U+03B1 (alpha)
+// through U+03C9 (omega), final sigma U+03C2 included, encoded as UTF-8.
+// That range is the byte pairs CE B1..CE BF and CF 80..CF 89, so two range
+// checks replace building and scanning a 25-entry table on every call.
+bool JudedIsGreekAlphabet(std::string str)
+{
+  if (str.size() != 2)
+    return false;
+  const unsigned lead = static_cast<unsigned char>(str[0]);
+  const unsigned trail = static_cast<unsigned char>(str[1]);
+  return (lead == 0xCE && trail >= 0xB1 && trail <= 0xBF) ||
+         (lead == 0xCF && trail >= 0x80 && trail <= 0x89);
+}
+
 std::vector<std::string> MyCompile(std::string str)
 {
   std::vector<std::string> my_atoms;      // 最终得到的所有atoms
@@ -254,10 +302,22 @@ std::vector<std::string> MyCompile(std::string str)
   std::vector<std::string> vec_con;
   std::vector<char> atoms_tmp;
   std::string atoms_tmp_string;
+  std::string subStr;
   // 将字符串中的大写字符变为小写
   UpperToLower(str, 0, str.size());
+  HandleCharacterCase(str);
   for (size_t i = 0; i < str.length(); i++)
   {
+    // 处理希腊字母
+    subStr.clear();
+    subStr = str.substr(i, 2);
+    if(JudedIsGreekAlphabet(subStr))
+    {
+      ++i;
+      atoms_tmp_string += subStr;
+      continue;
+    }
+
     // 处理括号分组
     if (str[i] == '(')
     {
@@ -372,6 +432,11 @@ std::vector<std::string> MyCompile(std::string str)
 
       }
     }
+    if(int(str[i]) < 0)
+    {
+      atoms_tmp_string += str[i];
+      continue;
+    }
   }
   if(vec_atoms_tmp.size() > 0)
   {
diff --git a/re2/testing/filtered_re2_test.cc b/re2/testing/filtered_re2_test.cc
index e3c8c94..a2da296 100644
--- a/re2/testing/filtered_re2_test.cc
+++ b/re2/testing/filtered_re2_test.cc
@@ -61,10 +61,8 @@ TEST(FilteredRE2Test, SmallLatinTest) {
   v.opts.set_encoding(RE2::Options::EncodingLatin1);
   v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id);
   v.f.Compile(&v.atoms);
-  /* Compile处理十六进制字符串问题
   EXPECT_EQ(1, v.atoms.size());
   EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef");
-  */
   v.atom_indices.push_back(0);
   v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches);
   EXPECT_EQ(1, v.matches.size());
@@ -131,9 +129,7 @@ AtomTest atom_tests[] = {
       "xbcdea", "xbcdeb",
       "ybcdea", "ybcdeb"
     }
-  },
-  /* Compile 处理non-ASCII编码的字符串的大小写问题
-   {
+  },{
     // Test upper/lower of non-ASCII.
     "UnicodeLower", {
       "(?i)ΔδΠϖπΣςσ",
@@ -145,7 +141,6 @@ AtomTest atom_tests[] = {
       "ψρστυ",
     },
   },
-  */
 };
 
 void AddRegexpsAndCompile(const char* regexps[],
@@ -281,20 +276,21 @@ TEST(FilteredRE2Test, MatchTests) {
   v.f.AllMatches(text, atom_ids, &matching_regexps);
   EXPECT_EQ(2, matching_regexps.size());
 }
-
-// TEST(FilteredRE2Test, EmptyStringInStringSetBug) {
-//   // Bug due to find() finding "" at the start of everything in a string
-//   // set and thus SimplifyStringSet() would end up erasing everything.
-//   // In order to test this, we have to keep PrefilterTree from discarding
-//   // the OR entirely, so we have to make the minimum atom length zero.
-
-//   FilterTestVars v(0);  // override the minimum atom length
-//   const char* regexps[] = {"-R.+(|ADD=;AA){12}}"};
-//   const char* atoms[] = {"", "-r", "add=;aa", "}"};
-//   AddRegexpsAndCompile(regexps, arraysize(regexps), &v);
-//   EXPECT_TRUE(CheckExpectedAtoms(atoms, arraysize(atoms),
-//                                  "EmptyStringInStringSetBug", &v));
-// }
+/*
+TEST(FilteredRE2Test, EmptyStringInStringSetBug) {
+  // Bug due to find() finding "" at the start of everything in a string
+  // set and thus SimplifyStringSet() would end up erasing everything.
+  // In order to test this, we have to keep PrefilterTree from discarding
+  // the OR entirely, so we have to make the minimum atom length zero.
+
+  FilterTestVars v(0);  // override the minimum atom length
+  const char* regexps[] = {"-R.+(|ADD=;AA){12}}"};
+  const char* atoms[] = {"", "-r", "add=;aa", "}"};
+  AddRegexpsAndCompile(regexps, arraysize(regexps), &v);
+  EXPECT_TRUE(CheckExpectedAtoms(atoms, arraysize(atoms),
+                                 "EmptyStringInStringSetBug", &v));
+}
+*/
 
 TEST(FilteredRE2Test, MoveSemantics) {
   FilterTestVars v1;
-- 
Gitee