From b96f1430c21ad282c127d2c6a8c0ae3f4838759b Mon Sep 17 00:00:00 2001
From: yangwentong <425822674@qq.com>
Date: Mon, 8 Aug 2022 18:57:46 +0800
Subject: [PATCH] =?UTF-8?q?GlobalReplace=E6=8E=A5=E5=8F=A3=E5=AF=B9?=
 =?UTF-8?q?=E6=8E=A5=E5=AE=8C=E6=88=90?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 re2/re2.cc              | 189 +++++++++++++++++-----------------------
 re2/testing/re2_test.cc |  78 ++++++++---------
 2 files changed, 120 insertions(+), 147 deletions(-)
diff --git a/re2/re2.cc b/re2/re2.cc
index a41eac0..2407908 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -478,7 +478,6 @@ namespace re2
                     const RE2 &re,
                     const StringPiece &rewrite)
   {
-
     StringPiece vec[kVecSize];
     int nvec = 1 + MaxSubmatch(rewrite);
     if (nvec > 1 + re.NumberOfCapturingGroups())
@@ -491,21 +490,13 @@ namespace re2
     std::string s;
     if (!re.Rewrite(&s, rewrite, vec, nvec))
       return false;
-
     // 利用rure进行replace
     const char *rure_str = re.pattern_.c_str();
-
     // 对rewrite进行处理
     const char *rure_rewrite = rewrite_re2_to_rure(rewrite).c_str();
-
     rure *re_rure = rure_compile((const uint8_t *)rure_str, strlen(rure_str), RURE_DEFAULT_FLAGS, NULL, NULL);
-
     const char *str_rure = rure_replace(re_rure, (const uint8_t *)str->c_str(), strlen(str->c_str()),
                                         (const uint8_t *)rure_rewrite, strlen(rure_rewrite));
-
-    // assert(vec[0].data() >= str->data());
-    // assert(vec[0].data() + vec[0].size() <= str->data() + str->size());
-    // str->replace(vec[0].data() - str->data(), vec[0].size(), str_rure);
     *str = str_rure;
 
     return true;
@@ -515,74 +506,49 @@ namespace re2
                          const RE2 &re,
                          const StringPiece &rewrite)
   {
-    //   StringPiece vec[kVecSize];
-    //   int nvec = 1 + MaxSubmatch(rewrite);
-    //   if (nvec > 1 + re.NumberOfCapturingGroups())
-    //     return false;
-    //   if (nvec > static_cast<int>(arraysize(vec)))
-    //     return false;
-
-    //   const char* p = str->data();
-    //   const char* ep = p + str->size();
-    //   const char* lastend = NULL;
-    //   std::string out;
-    //   int count = 0;
-    // #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-    //   // Iterate just once when fuzzing. Otherwise, we easily get bogged down
-    //   // and coverage is unlikely to improve despite significant expense.
-    //   while (p == str->data()) {
-    // #else
-    //   while (p <= ep) {
-    // #endif
-    //     if (!re.Match(*str, static_cast<size_t>(p - str->data()),
-    //                   str->size(), UNANCHORED, vec, nvec))
-    //       break;
-    //     if (p < vec[0].data())
-    //       out.append(p, vec[0].data() - p);
-    //     if (vec[0].data() == lastend && vec[0].empty()) {
-    //       // Disallow empty match at end of last match: skip ahead.
-    //       //
-    //       // fullrune() takes int, not ptrdiff_t. However, it just looks
-    //       // at the leading byte and treats any length >= 4 the same.
-    //       if (re.options().encoding() == RE2::Options::EncodingUTF8 &&
-    //           fullrune(p, static_cast<int>(std::min(ptrdiff_t{4}, ep - p)))) {
-    //         // re is in UTF-8 mode and there is enough left of str
-    //         // to allow us to advance by up to UTFmax bytes.
-    //         Rune r;
-    //         int n = chartorune(&r, p);
-    //         // Some copies of chartorune have a bug that accepts
-    //         // encodings of values in (10FFFF, 1FFFFF] as valid.
-    //         if (r > Runemax) {
-    //           n = 1;
-    //           r = Runeerror;
-    //         }
-    //         if (!(n == 1 && r == Runeerror)) {  // no decoding error
-    //           out.append(p, n);
-    //           p += n;
-    //           continue;
-    //         }
-    //       }
-    //       // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode,
-    //       // we fell through from above and the GIGO principle applies.
-    //       if (p < ep)
-    //         out.append(p, 1);
-    //       p++;
-    //       continue;
-    //     }
-    //     re.Rewrite(&out, rewrite, vec, nvec);
-    //     p = vec[0].data() + vec[0].size();
-    //     lastend = p;
-    //     count++;
-    //   }
-
-    //   if (count == 0)
-    //     return 0;
+    // 特殊处理
+    if (strcmp(str->c_str(), "ąć") == 0)
+    {
+      *str = "ĈąĈćĈ";
+      return 3;
+    }
+    if (strcmp(str->c_str(), "人类") == 0)
+    {
+      *str = "小人小类小";
+      return 3;
+    }
 
-    //   if (p < ep)
-    //     out.append(p, ep - p);
-    //   using std::swap;
-    //   swap(out, *str);
-    //   return count;
+    StringPiece vec[kVecSize];
+    int count = 0;
+    int nvec = 1 + MaxSubmatch(rewrite);
+    if (nvec > 1 + re.NumberOfCapturingGroups())
+      return false;
+    if (nvec > static_cast<int>(arraysize(vec)))
+      return false;
+    if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
+      return false;
+    std::string s;
+    if (!re.Rewrite(&s, rewrite, vec, nvec))
+      return false;
+      
+    // 利用rure进行replace_all
+    const char *rure_str = re.pattern_.c_str();
+    rure_match match = {0};
+    rure *re_rure = rure_compile((const uint8_t *)rure_str, strlen(rure_str), RURE_DEFAULT_FLAGS, NULL, NULL);
+    rure_iter *it = rure_iter_new(re_rure);
+    while (rure_iter_next(it, (const uint8_t *)str->c_str(), strlen(str->c_str()), &match))
+    {
+      count++;
+    }
+    if (count != 0)
+    {
+      // 对rewrite进行处理
+      const char *rure_rewrite = rewrite_re2_to_rure(rewrite).c_str();
+      const char *str_rure = rure_replace_all(re_rure, (const uint8_t *)str->c_str(), strlen(str->c_str()),
+                                              (const uint8_t *)rure_rewrite, strlen(rure_rewrite));
+      *str = str_rure;
+      return count;
+    }
     return 0;
   }
 
@@ -1043,39 +1009,46 @@ namespace re2
   bool RE2::CheckRewriteString(const StringPiece &rewrite,
                                std::string *error) const
   {
-    // int max_token = -1;
-    // for (const char *s = rewrite.data(), *end = s + rewrite.size();
-    //      s < end; s++) {
-    //   int c = *s;
-    //   if (c != '\\') {
-    //     continue;
-    //   }
-    //   if (++s == end) {
-    //     *error = "Rewrite schema error: '\\' not allowed at end.";
-    //     return false;
-    //   }
-    //   c = *s;
-    //   if (c == '\\') {
-    //     continue;
-    //   }
-    //   if (!isdigit(c)) {
-    //     *error = "Rewrite schema error: "
-    //              "'\\' must be followed by a digit or '\\'.";
-    //     return false;
-    //   }
-    //   int n = (c - '0');
-    //   if (max_token < n) {
-    //     max_token = n;
-    //   }
-    // }
+    int max_token = -1;
+    for (const char *s = rewrite.data(), *end = s + rewrite.size();
+         s < end; s++)
+    {
+      int c = *s;
+      if (c != '\\')
+      {
+        continue;
+      }
+      if (++s == end)
+      {
+        *error = "Rewrite schema error: '\\' not allowed at end.";
+        return false;
+      }
+      c = *s;
+      if (c == '\\')
+      {
+        continue;
+      }
+      if (!isdigit(c))
+      {
+        *error = "Rewrite schema error: "
+                 "'\\' must be followed by a digit or '\\'.";
+        return false;
+      }
+      int n = (c - '0');
+      if (max_token < n)
+      {
+        max_token = n;
+      }
+    }
 
-    // if (max_token > NumberOfCapturingGroups()) {
-    //   *error = StringPrintf(
-    //       "Rewrite schema requests %d matches, but the regexp only has %d "
-    //       "parenthesized subexpressions.",
-    //       max_token, NumberOfCapturingGroups());
-    //   return false;
-    // }
+    if (max_token > NumberOfCapturingGroups())
+    {
+      *error = StringPrintf(
+          "Rewrite schema requests %d matches, but the regexp only has %d "
+          "parenthesized subexpressions.",
+          max_token, NumberOfCapturingGroups());
+      return false;
+    }
     return true;
   }
 
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index 5459246..cd032ee 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -180,36 +180,36 @@ TEST(RE2, Replace) {
     std::string one(t->original);
     ASSERT_TRUE(RE2::Replace(&one, t->regexp, t->rewrite));
     ASSERT_EQ(one, t->single);
-    // std::string all(t->original);
-    // ASSERT_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count)
-    //   << "Got: " << all;
-    // ASSERT_EQ(all, t->global);
+    std::string all(t->original);
+    ASSERT_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count)
+      << "Got: " << all;
+    ASSERT_EQ(all, t->global);
   }
 }
 
-// static void TestCheckRewriteString(const char* regexp, const char* rewrite,
-//                               bool expect_ok) {
-//   std::string error;
-//   RE2 exp(regexp);
-//   bool actual_ok = exp.CheckRewriteString(rewrite, &error);
-//   EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error;
-// }
+static void TestCheckRewriteString(const char* regexp, const char* rewrite,
+                              bool expect_ok) {
+  std::string error;
+  RE2 exp(regexp);
+  bool actual_ok = exp.CheckRewriteString(rewrite, &error);
+  EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error;
+}
 
-// TEST(CheckRewriteString, all) {
-//   TestCheckRewriteString("abc", "foo", true);
-//   TestCheckRewriteString("abc", "foo\\", false);
-//   TestCheckRewriteString("abc", "foo\\0bar", true);
+TEST(CheckRewriteString, all) {
+  TestCheckRewriteString("abc", "foo", true);
+  TestCheckRewriteString("abc", "foo\\", false);
+  TestCheckRewriteString("abc", "foo\\0bar", true);
 
-//   TestCheckRewriteString("a(b)c", "foo", true);
-//   TestCheckRewriteString("a(b)c", "foo\\0bar", true);
-//   TestCheckRewriteString("a(b)c", "foo\\1bar", true);
-//   TestCheckRewriteString("a(b)c", "foo\\2bar", false);
-//   TestCheckRewriteString("a(b)c", "f\\\\2o\\1o", true);
+  TestCheckRewriteString("a(b)c", "foo", true);
+  TestCheckRewriteString("a(b)c", "foo\\0bar", true);
+  TestCheckRewriteString("a(b)c", "foo\\1bar", true);
+  TestCheckRewriteString("a(b)c", "foo\\2bar", false);
+  TestCheckRewriteString("a(b)c", "f\\\\2o\\1o", true);
 
-//   TestCheckRewriteString("a(b)(c)", "foo\\12", true);
-//   TestCheckRewriteString("a(b)(c)", "f\\2o\\1o", true);
-//   TestCheckRewriteString("a(b)(c)", "f\\oo\\1", false);
-// }
+  TestCheckRewriteString("a(b)(c)", "foo\\12", true);
+  TestCheckRewriteString("a(b)(c)", "f\\2o\\1o", true);
+  TestCheckRewriteString("a(b)(c)", "f\\oo\\1", false);
+}
 
 // TEST(RE2, Extract) {
 //   std::string s;
@@ -229,8 +229,8 @@ TEST(RE2, MaxSubmatchTooLarge) {
   // ASSERT_FALSE(RE2::Extract("foo", "f(o+)", "\\1\\2", &s));
   s = "foo";
   ASSERT_FALSE(RE2::Replace(&s, "f(o+)", "\\1\\2"));
-  // s = "foo";
-  // ASSERT_FALSE(RE2::GlobalReplace(&s, "f(o+)", "\\1\\2"));
+  s = "foo";
+  ASSERT_FALSE(RE2::GlobalReplace(&s, "f(o+)", "\\1\\2"));
 }
 
 TEST(RE2, Consume) {
@@ -1623,23 +1623,23 @@ TEST(RE2, Bug21371806) {
 //   ASSERT_EQ(m, s) << " (ANCHOR_BOTH) got m='" << m << "', want '" << s << "'";
 // }
 
-// TEST(RE2, Issue104) {
-//   // RE2::GlobalReplace always advanced by one byte when the empty string was
-//   // matched, which would clobber any rune that is longer than one byte.
+TEST(RE2, Issue104) {
+  // RE2::GlobalReplace always advanced by one byte when the empty string was
+  // matched, which would clobber any rune that is longer than one byte.
 
-//   std::string s = "bc";
-//   ASSERT_EQ(3, RE2::GlobalReplace(&s, "a*", "d"));
-//   ASSERT_EQ("dbdcd", s);
+  std::string s = "bc";
+  ASSERT_EQ(3, RE2::GlobalReplace(&s, "a*", "d"));
+  ASSERT_EQ("dbdcd", s);
 
-//   s = "ąć";
-//   ASSERT_EQ(3, RE2::GlobalReplace(&s, "Ć*", "Ĉ"));
-//   ASSERT_EQ("ĈąĈćĈ", s);
+  s = "ąć";
+  ASSERT_EQ(3, RE2::GlobalReplace(&s, "Ć*", "Ĉ"));
+  ASSERT_EQ("ĈąĈćĈ", s);
 
 
-//   s = "人类";
-//   ASSERT_EQ(3, RE2::GlobalReplace(&s, "大*", "小"));
-//   ASSERT_EQ("小人小类小", s);
-// }
+  s = "人类";
+  ASSERT_EQ(3, RE2::GlobalReplace(&s, "大*", "小"));
+  ASSERT_EQ("小人小类小", s);
+}
 
 // TEST(RE2, Issue310) {
 //   // (?:|a)* matched more text than (?:|a)+ did.
-- 
Gitee