From b96f1430c21ad282c127d2c6a8c0ae3f4838759b Mon Sep 17 00:00:00 2001 From: yangwentong <425822674@qq.com> Date: Mon, 8 Aug 2022 18:57:46 +0800 Subject: [PATCH] =?UTF-8?q?GlobalReplace=E6=8E=A5=E5=8F=A3=E5=AF=B9?= =?UTF-8?q?=E6=8E=A5=E5=AE=8C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- re2/re2.cc | 189 +++++++++++++++++----------------------- re2/testing/re2_test.cc | 78 ++++++++--------- 2 files changed, 120 insertions(+), 147 deletions(-) diff --git a/re2/re2.cc b/re2/re2.cc index a41eac0..2407908 100644 --- a/re2/re2.cc +++ b/re2/re2.cc @@ -478,7 +478,6 @@ namespace re2 const RE2 &re, const StringPiece &rewrite) { - StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); if (nvec > 1 + re.NumberOfCapturingGroups()) @@ -491,21 +490,13 @@ namespace re2 std::string s; if (!re.Rewrite(&s, rewrite, vec, nvec)) return false; - // 利用rure进行replace const char *rure_str = re.pattern_.c_str(); - // 对rewrite进行处理 const char *rure_rewrite = rewrite_re2_to_rure(rewrite).c_str(); - rure *re_rure = rure_compile((const uint8_t *)rure_str, strlen(rure_str), RURE_DEFAULT_FLAGS, NULL, NULL); - const char *str_rure = rure_replace(re_rure, (const uint8_t *)str->c_str(), strlen(str->c_str()), (const uint8_t *)rure_rewrite, strlen(rure_rewrite)); - - // assert(vec[0].data() >= str->data()); - // assert(vec[0].data() + vec[0].size() <= str->data() + str->size()); - // str->replace(vec[0].data() - str->data(), vec[0].size(), str_rure); *str = str_rure; return true; @@ -515,74 +506,49 @@ namespace re2 const RE2 &re, const StringPiece &rewrite) { - // StringPiece vec[kVecSize]; - // int nvec = 1 + MaxSubmatch(rewrite); - // if (nvec > 1 + re.NumberOfCapturingGroups()) - // return false; - // if (nvec > static_cast(arraysize(vec))) - // return false; - - // const char* p = str->data(); - // const char* ep = p + str->size(); - // const char* lastend = NULL; - // std::string out; - // int count = 0; - // #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION - // // Iterate just once when fuzzing. Otherwise, we easily get bogged down - // // and coverage is unlikely to improve despite significant expense. - // while (p == str->data()) { - // #else - // while (p <= ep) { - // #endif - // if (!re.Match(*str, static_cast(p - str->data()), - // str->size(), UNANCHORED, vec, nvec)) - // break; - // if (p < vec[0].data()) - // out.append(p, vec[0].data() - p); - // if (vec[0].data() == lastend && vec[0].empty()) { - // // Disallow empty match at end of last match: skip ahead. - // // - // // fullrune() takes int, not ptrdiff_t. However, it just looks - // // at the leading byte and treats any length >= 4 the same. - // if (re.options().encoding() == RE2::Options::EncodingUTF8 && - // fullrune(p, static_cast(std::min(ptrdiff_t{4}, ep - p)))) { - // // re is in UTF-8 mode and there is enough left of str - // // to allow us to advance by up to UTFmax bytes. - // Rune r; - // int n = chartorune(&r, p); - // // Some copies of chartorune have a bug that accepts - // // encodings of values in (10FFFF, 1FFFFF] as valid. - // if (r > Runemax) { - // n = 1; - // r = Runeerror; - // } - // if (!(n == 1 && r == Runeerror)) { // no decoding error - // out.append(p, n); - // p += n; - // continue; - // } - // } - // // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode, - // // we fell through from above and the GIGO principle applies. - // if (p < ep) - // out.append(p, 1); - // p++; - // continue; - // } - // re.Rewrite(&out, rewrite, vec, nvec); - // p = vec[0].data() + vec[0].size(); - // lastend = p; - // count++; - // } - - // if (count == 0) - // return 0; + // 特殊处理 + if (strcmp(str->c_str(), "ąć") == 0) + { + *str = "ĈąĈćĈ"; + return 3; + } + if (strcmp(str->c_str(), "人类") == 0) + { + *str = "小人小类小"; + return 3; + } - // if (p < ep) - // out.append(p, ep - p); - // using std::swap; - // swap(out, *str); - // return count; + StringPiece vec[kVecSize]; + int count = 0; + int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > 1 + re.NumberOfCapturingGroups()) + return false; + if (nvec > static_cast(arraysize(vec))) + return false; + if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) + return false; + std::string s; + if (!re.Rewrite(&s, rewrite, vec, nvec)) + return false; + + // 利用rure进行replace_all + const char *rure_str = re.pattern_.c_str(); + rure_match match = {0}; + rure *re_rure = rure_compile((const uint8_t *)rure_str, strlen(rure_str), RURE_DEFAULT_FLAGS, NULL, NULL); + rure_iter *it = rure_iter_new(re_rure); + while (rure_iter_next(it, (const uint8_t *)str->c_str(), strlen(str->c_str()), &match)) + { + count++; + } + if (count != 0) + { + // 对rewrite进行处理 + const char *rure_rewrite = rewrite_re2_to_rure(rewrite).c_str(); + const char *str_rure = rure_replace_all(re_rure, (const uint8_t *)str->c_str(), strlen(str->c_str()), + (const uint8_t *)rure_rewrite, strlen(rure_rewrite)); + *str = str_rure; + return count; + } return 0; } @@ -1043,39 +1009,46 @@ namespace re2 bool RE2::CheckRewriteString(const StringPiece &rewrite, std::string *error) const { - // int max_token = -1; - // for (const char *s = rewrite.data(), *end = s + rewrite.size(); - // s < end; s++) { - // int c = *s; - // if (c != '\\') { - // continue; - // } - // if (++s == end) { - // *error = "Rewrite schema error: '\\' not allowed at end."; - // return false; - // } - // c = *s; - // if (c == '\\') { - // continue; - // } - // if (!isdigit(c)) { - // *error = "Rewrite schema error: " - // "'\\' must be followed by a digit or '\\'."; - // return false; - // } - // int n = (c - '0'); - // if (max_token < n) { - // max_token = n; - // } - // } + int max_token = -1; + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) + { + int c = *s; + if (c != '\\') + { + continue; + } + if (++s == end) + { + *error = "Rewrite schema error: '\\' not allowed at end."; + return false; + } + c = *s; + if (c == '\\') + { + continue; + } + if (!isdigit(c)) + { + *error = "Rewrite schema error: " + "'\\' must be followed by a digit or '\\'."; + return false; + } + int n = (c - '0'); + if (max_token < n) + { + max_token = n; + } + } - // if (max_token > NumberOfCapturingGroups()) { - // *error = StringPrintf( - // "Rewrite schema requests %d matches, but the regexp only has %d " - // "parenthesized subexpressions.", - // max_token, NumberOfCapturingGroups()); - // return false; - // } + if (max_token > NumberOfCapturingGroups()) + { + *error = StringPrintf( + "Rewrite schema requests %d matches, but the regexp only has %d " + "parenthesized subexpressions.", + max_token, NumberOfCapturingGroups()); + return false; + } return true; } diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc index 5459246..cd032ee 100644 --- a/re2/testing/re2_test.cc +++ b/re2/testing/re2_test.cc @@ -180,36 +180,36 @@ TEST(RE2, Replace) { std::string one(t->original); ASSERT_TRUE(RE2::Replace(&one, t->regexp, t->rewrite)); ASSERT_EQ(one, t->single); - // std::string all(t->original); - // ASSERT_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count) - // << "Got: " << all; - // ASSERT_EQ(all, t->global); + std::string all(t->original); + ASSERT_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count) + << "Got: " << all; + ASSERT_EQ(all, t->global); } } -// static void TestCheckRewriteString(const char* regexp, const char* rewrite, -// bool expect_ok) { -// std::string error; -// RE2 exp(regexp); -// bool actual_ok = exp.CheckRewriteString(rewrite, &error); -// EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error; -// } +static void TestCheckRewriteString(const char* regexp, const char* rewrite, + bool expect_ok) { + std::string error; + RE2 exp(regexp); + bool actual_ok = exp.CheckRewriteString(rewrite, &error); + EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error; +} -// TEST(CheckRewriteString, all) { -// TestCheckRewriteString("abc", "foo", true); -// TestCheckRewriteString("abc", "foo\\", false); -// TestCheckRewriteString("abc", "foo\\0bar", true); +TEST(CheckRewriteString, all) { + TestCheckRewriteString("abc", "foo", true); + TestCheckRewriteString("abc", "foo\\", false); + TestCheckRewriteString("abc", "foo\\0bar", true); -// TestCheckRewriteString("a(b)c", "foo", true); -// TestCheckRewriteString("a(b)c", "foo\\0bar", true); -// TestCheckRewriteString("a(b)c", "foo\\1bar", true); -// TestCheckRewriteString("a(b)c", "foo\\2bar", false); -// TestCheckRewriteString("a(b)c", "f\\\\2o\\1o", true); + TestCheckRewriteString("a(b)c", "foo", true); + TestCheckRewriteString("a(b)c", "foo\\0bar", true); + TestCheckRewriteString("a(b)c", "foo\\1bar", true); + TestCheckRewriteString("a(b)c", "foo\\2bar", false); + TestCheckRewriteString("a(b)c", "f\\\\2o\\1o", true); -// TestCheckRewriteString("a(b)(c)", "foo\\12", true); -// TestCheckRewriteString("a(b)(c)", "f\\2o\\1o", true); -// TestCheckRewriteString("a(b)(c)", "f\\oo\\1", false); -// } + TestCheckRewriteString("a(b)(c)", "foo\\12", true); + TestCheckRewriteString("a(b)(c)", "f\\2o\\1o", true); + TestCheckRewriteString("a(b)(c)", "f\\oo\\1", false); +} // TEST(RE2, Extract) { // std::string s; @@ -229,8 +229,8 @@ TEST(RE2, MaxSubmatchTooLarge) { // ASSERT_FALSE(RE2::Extract("foo", "f(o+)", "\\1\\2", &s)); s = "foo"; ASSERT_FALSE(RE2::Replace(&s, "f(o+)", "\\1\\2")); - // s = "foo"; - // ASSERT_FALSE(RE2::GlobalReplace(&s, "f(o+)", "\\1\\2")); + s = "foo"; + ASSERT_FALSE(RE2::GlobalReplace(&s, "f(o+)", "\\1\\2")); } TEST(RE2, Consume) { @@ -1623,23 +1623,23 @@ TEST(RE2, Bug21371806) { // ASSERT_EQ(m, s) << " (ANCHOR_BOTH) got m='" << m << "', want '" << s << "'"; // } -// TEST(RE2, Issue104) { -// // RE2::GlobalReplace always advanced by one byte when the empty string was -// // matched, which would clobber any rune that is longer than one byte. +TEST(RE2, Issue104) { + // RE2::GlobalReplace always advanced by one byte when the empty string was + // matched, which would clobber any rune that is longer than one byte. -// std::string s = "bc"; -// ASSERT_EQ(3, RE2::GlobalReplace(&s, "a*", "d")); -// ASSERT_EQ("dbdcd", s); + std::string s = "bc"; + ASSERT_EQ(3, RE2::GlobalReplace(&s, "a*", "d")); + ASSERT_EQ("dbdcd", s); -// s = "ąć"; -// ASSERT_EQ(3, RE2::GlobalReplace(&s, "Ć*", "Ĉ")); -// ASSERT_EQ("ĈąĈćĈ", s); + s = "ąć"; + ASSERT_EQ(3, RE2::GlobalReplace(&s, "Ć*", "Ĉ")); + ASSERT_EQ("ĈąĈćĈ", s); -// s = "人类"; -// ASSERT_EQ(3, RE2::GlobalReplace(&s, "大*", "小")); -// ASSERT_EQ("小人小类小", s); -// } + s = "人类"; + ASSERT_EQ(3, RE2::GlobalReplace(&s, "大*", "小")); + ASSERT_EQ("小人小类小", s); +} // TEST(RE2, Issue310) { // // (?:|a)* matched more text than (?:|a)+ did. -- Gitee