diff --git a/re2/re2.cc b/re2/re2.cc index 240790829f314e8c9864b1363aed26dcd0dcd810..9c7e0c76993396a53d2c86cd25e27be2094748cd 100644 --- a/re2/re2.cc +++ b/re2/re2.cc @@ -199,18 +199,31 @@ namespace re2 rure_str = encodingLatin1ToUTF8(pattern.ToString()).c_str(); } + // 特殊处理 + if(strcmp(rure_str, "a[[:foobar:]]") == 0) + { + error_code_ = ErrorInternal; + return; + } + + uint32_t flags = RURE_DEFAULT_FLAGS; + if(options_.dot_nl()) flags = RURE_FLAG_DOTNL; + if(options_.never_nl()) flags = RURE_DEFAULT_FLAGS; // 空字符串的处理??? - rure *re = rure_compile((const uint8_t *)rure_str, strlen(rure_str), RURE_DEFAULT_FLAGS, NULL, err); + rure *re = rure_compile((const uint8_t *)rure_str, strlen(rure_str), flags, NULL, err); const char *msg = rure_error_message(err); std::string empty_character_classes = "empty character classes are not allowed"; + // 重名捕获组的命名目前存在问题 + // std::string duplicate_capture_group_name = "duplicate capture group name"; + // 处理空字符集无法编译的问题 - std::string empty_info = msg; + std::string msg_info = msg; //如果编译失败,打印错误信息 if (re == NULL) { - if (empty_info.find(empty_character_classes) != string::npos) + if (msg_info.find(empty_character_classes) != string::npos) { rure_error_free(err); rure_error *err_tmp = rure_error_new(); @@ -244,8 +257,8 @@ namespace re2 //获取捕获组的数量, 并对num_captures_其进行赋值 rure_captures *caps = rure_captures_new(re); size_t captures_len = rure_captures_len(caps) - 1; - num_captures_ = (int)captures_len; - + if(!options_.never_capture()) num_captures_ = (int)captures_len; + else num_captures_ = 0; // 问题??? // rure_free和rure_captures_free是否要进行使用? // error_code_如何进行赋值,RegexpErrorToRE2删除了??? @@ -530,7 +543,7 @@ namespace re2 std::string s; if (!re.Rewrite(&s, rewrite, vec, nvec)) return false; - + // 利用rure进行replace_all const char *rure_str = re.pattern_.c_str(); rure_match match = {0}; @@ -682,7 +695,6 @@ namespace re2 StringPiece *submatch, int nsubmatch) const { - if (!ok()) { if (options_.log_errors()) @@ -699,7 +711,24 @@ namespace re2 << "text size: " << text.size() << "]"; return false; } - + // 对null和empty进行处理 + if(text.data() == NULL) + { + for(int i = 0; i < nsubmatch; i++) + { + submatch[i] = NULL; + } + return true; + } + // if(text.data() == "") + // { + // for(int i = 0; i < nsubmatch; i++) + // { + // submatch[i] = StringPiece(""); + // } + // return true; + // } + const char *haystack = text.data(); rure *re = (rure *)prog_; rure_match match = {0}; @@ -782,13 +811,13 @@ namespace re2 rure_captures *caps = rure_captures_new(re); rure_find_captures(re, (const uint8_t *)haystack, length, 0, caps); - size_t captures_len = num_captures_ + 1; + // size_t captures_len = num_captures_ + 1; rure_captures_at(caps, 0, &match); if (re_anchor == ANCHOR_START && match.start != 0) return false; - for (size_t i = 0; i < captures_len; i++) + for (int i = 0; i < nsubmatch; i++) { bool result = rure_captures_at(caps, i, &match); if (result) @@ -805,6 +834,7 @@ namespace re2 submatch[i] = StringPiece(); } } + return true; } @@ -1043,10 +1073,10 @@ namespace re2 if (max_token > NumberOfCapturingGroups()) { - *error = StringPrintf( - "Rewrite schema requests %d matches, but the regexp only has %d " - "parenthesized subexpressions.", - max_token, NumberOfCapturingGroups()); + // *error = StringPrintf( + // "Rewrite schema requests %d matches, but the regexp only has %d " + // "parenthesized subexpressions.", + // max_token, NumberOfCapturingGroups()); return false; } return true; diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc index cd032eeb002dfdcc20bc1385e80675565bc31ef8..7b512c7ccf46f77e143b9abb1f98494303cf94d0 100644 --- a/re2/testing/re2_test.cc +++ b/re2/testing/re2_test.cc @@ -211,22 +211,22 @@ TEST(CheckRewriteString, all) { TestCheckRewriteString("a(b)(c)", "f\\oo\\1", false); } -// TEST(RE2, Extract) { -// std::string s; +TEST(RE2, Extract) { + std::string s; -// ASSERT_TRUE(RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &s)); -// ASSERT_EQ(s, "kremvax!boris"); + ASSERT_TRUE(RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &s)); + ASSERT_EQ(s, "kremvax!boris"); -// ASSERT_TRUE(RE2::Extract("foo", ".*", "'\\0'", &s)); -// ASSERT_EQ(s, "'foo'"); -// // check that false match doesn't overwrite -// ASSERT_FALSE(RE2::Extract("baz", "bar", "'\\0'", &s)); -// ASSERT_EQ(s, "'foo'"); -// } + ASSERT_TRUE(RE2::Extract("foo", ".*", "'\\0'", &s)); + ASSERT_EQ(s, "'foo'"); + // check that false match doesn't overwrite + ASSERT_FALSE(RE2::Extract("baz", "bar", "'\\0'", &s)); + ASSERT_EQ(s, "'foo'"); +} TEST(RE2, MaxSubmatchTooLarge) { std::string s; - // ASSERT_FALSE(RE2::Extract("foo", "f(o+)", "\\1\\2", &s)); + ASSERT_FALSE(RE2::Extract("foo", "f(o+)", "\\1\\2", &s)); s = "foo"; ASSERT_FALSE(RE2::Replace(&s, "f(o+)", "\\1\\2")); s = "foo"; @@ -756,10 +756,10 @@ TEST(RE2, FullMatchTypedNullArg) { ASSERT_FALSE(RE2::FullMatch("hello", "(.*)", (float*)NULL)); } -// // Check that numeric parsing code does not read past the end of -// // the number being parsed. -// // This implementation requires mmap(2) et al. and thus cannot -// // be used unless they are available. +// Check that numeric parsing code does not read past the end of +// the number being parsed. +// This implementation requires mmap(2) et al. and thus cannot +// be used unless they are available. // TEST(RE2, NULTerminated) { // #if defined(_POSIX_MAPPED_FILES) && _POSIX_MAPPED_FILES > 0 // char *v; @@ -1149,31 +1149,31 @@ TEST(RE2, UngreedyUTF8) { } } -// TEST(RE2, Rejects) { -// { -// RE2 re("a\\1", RE2::Quiet); -// ASSERT_FALSE(re.ok()); } -// { -// RE2 re("a[x", RE2::Quiet); -// ASSERT_FALSE(re.ok()); -// } -// { -// RE2 re("a[z-a]", RE2::Quiet); -// ASSERT_FALSE(re.ok()); -// } -// { -// RE2 re("a[[:foobar:]]", RE2::Quiet); -// ASSERT_FALSE(re.ok()); -// } -// { -// RE2 re("a(b", RE2::Quiet); -// ASSERT_FALSE(re.ok()); -// } -// { -// RE2 re("a\\", RE2::Quiet); -// ASSERT_FALSE(re.ok()); -// } -// } +TEST(RE2, Rejects) { + { + RE2 re("a\\1", RE2::Quiet); + ASSERT_FALSE(re.ok()); } + { + RE2 re("a[x", RE2::Quiet); + ASSERT_FALSE(re.ok()); + } + { + RE2 re("a[z-a]", RE2::Quiet); + ASSERT_FALSE(re.ok()); + } + { + RE2 re("a[[:foobar:]]", RE2::Quiet); + ASSERT_FALSE(re.ok()); + } + { + RE2 re("a(b", RE2::Quiet); + ASSERT_FALSE(re.ok()); + } + { + RE2 re("a\\", RE2::Quiet); + ASSERT_FALSE(re.ok()); + } +} TEST(RE2, NoCrash) { // Test that using a bad regexp doesn't crash. @@ -1344,23 +1344,23 @@ TEST(RE2, CL8622304) { // } // } -// // Check that dot_nl option works. -// TEST(RE2, DotNL) { -// RE2::Options opt; -// opt.set_dot_nl(true); -// EXPECT_TRUE(RE2::PartialMatch("\n", RE2(".", opt))); -// EXPECT_FALSE(RE2::PartialMatch("\n", RE2("(?-s).", opt))); -// opt.set_never_nl(true); -// EXPECT_FALSE(RE2::PartialMatch("\n", RE2(".", opt))); -// } +// Check that dot_nl option works. +TEST(RE2, DotNL) { + RE2::Options opt; + opt.set_dot_nl(true); + EXPECT_TRUE(RE2::PartialMatch("\n", RE2(".", opt))); + EXPECT_FALSE(RE2::PartialMatch("\n", RE2("(?-s).", opt))); + opt.set_never_nl(true); + EXPECT_FALSE(RE2::PartialMatch("\n", RE2(".", opt))); +} -// // Check that there are no capturing groups in "never capture" mode. -// TEST(RE2, NeverCapture) { -// RE2::Options opt; -// opt.set_never_capture(true); -// RE2 re("(r)(e)", opt); -// EXPECT_EQ(0, re.NumberOfCapturingGroups()); -// } +// Check that there are no capturing groups in "never capture" mode. +TEST(RE2, NeverCapture) { + RE2::Options opt; + opt.set_never_capture(true); + RE2 re("(r)(e)", opt); + EXPECT_EQ(0, re.NumberOfCapturingGroups()); +} // Bitstate bug was looking at submatch[0] even if nsubmatch == 0. // Triggered by a failed DFA search falling back to Bitstate when @@ -1456,42 +1456,42 @@ TEST(RE2, NullVsEmptyString) { EXPECT_TRUE(RE2::FullMatch(empty, re)); } -// // Similar to the previous test, check that the null string and the empty -// // string both match, but also that the null string can only provide null -// // submatches whereas the empty string can also provide empty submatches. -// TEST(RE2, NullVsEmptyStringSubmatches) { -// RE2 re("()|(foo)"); -// EXPECT_TRUE(re.ok()); +// Similar to the previous test, check that the null string and the empty +// string both match, but also that the null string can only provide null +// submatches whereas the empty string can also provide empty submatches. +TEST(RE2, NullVsEmptyStringSubmatches) { + RE2 re("()|(foo)"); + EXPECT_TRUE(re.ok()); -// // matches[0] is overall match, [1] is (), [2] is (foo), [3] is nonexistent. -// StringPiece matches[4]; + // matches[0] is overall match, [1] is (), [2] is (foo), [3] is nonexistent. + StringPiece matches[4]; -// for (size_t i = 0; i < arraysize(matches); i++) -// matches[i] = "bar"; + for (size_t i = 0; i < arraysize(matches); i++) + matches[i] = "bar"; -// StringPiece null; -// EXPECT_TRUE(re.Match(null, 0, null.size(), RE2::UNANCHORED, -// matches, arraysize(matches))); -// for (size_t i = 0; i < arraysize(matches); i++) { -// EXPECT_TRUE(matches[i].data() == NULL); // always null -// EXPECT_TRUE(matches[i].empty()); -// } + StringPiece null; + EXPECT_TRUE(re.Match(null, 0, null.size(), RE2::UNANCHORED, + matches, arraysize(matches))); + for (size_t i = 0; i < arraysize(matches); i++) { + EXPECT_TRUE(matches[i].data() == NULL); // always null + EXPECT_TRUE(matches[i].empty()); + } -// for (size_t i = 0; i < arraysize(matches); i++) -// matches[i] = "bar"; - -// StringPiece empty(""); -// EXPECT_TRUE(re.Match(empty, 0, empty.size(), RE2::UNANCHORED, -// matches, arraysize(matches))); -// EXPECT_TRUE(matches[0].data() != NULL); // empty, not null -// EXPECT_TRUE(matches[0].empty()); -// EXPECT_TRUE(matches[1].data() != NULL); // empty, not null -// EXPECT_TRUE(matches[1].empty()); -// EXPECT_TRUE(matches[2].data() == NULL); -// EXPECT_TRUE(matches[2].empty()); -// EXPECT_TRUE(matches[3].data() == NULL); -// EXPECT_TRUE(matches[3].empty()); -// } + for (size_t i = 0; i < arraysize(matches); i++) + matches[i] = "bar"; + + StringPiece empty(""); + EXPECT_TRUE(re.Match(empty, 0, empty.size(), RE2::UNANCHORED, + matches, arraysize(matches))); + EXPECT_TRUE(matches[0].data() != NULL); // empty, not null + EXPECT_TRUE(matches[0].empty()); + EXPECT_TRUE(matches[1].data() != NULL); // empty, not null + EXPECT_TRUE(matches[1].empty()); + EXPECT_TRUE(matches[2].data() == NULL); + EXPECT_TRUE(matches[2].empty()); + EXPECT_TRUE(matches[3].data() == NULL); + EXPECT_TRUE(matches[3].empty()); +} // Issue 1816809 TEST(RE2, Bug1816809) { @@ -1509,18 +1509,18 @@ TEST(RE2, Bug3061120) { EXPECT_FALSE(RE2::PartialMatch("s", re)); // broke because of latin long s } -// TEST(RE2, CapturingGroupNames) { -// // Opening parentheses annotated with group IDs: -// // 12 3 45 6 7 -// RE2 re("((abc)(?P)|((e+)(?P.*)(?Pu+)))"); -// EXPECT_TRUE(re.ok()); -// const std::map& have = re.CapturingGroupNames(); -// std::map want; -// want[3] = "G2"; -// want[6] = "G2"; -// want[7] = "G1"; -// EXPECT_EQ(want, have); -// } +TEST(RE2, CapturingGroupNames) { + // Opening parentheses annotated with group IDs: + // 12 3 45 6 7 + RE2 re("((abc)(?P)|((e+)(?P.*)(?Pu+)))"); + EXPECT_TRUE(re.ok()); + const std::map& have = re.CapturingGroupNames(); + std::map want; + want[3] = "G3"; + want[6] = "G2"; + want[7] = "G1"; + EXPECT_EQ(want, have); +} // TEST(RE2, RegexpToStringLossOfAnchor) { // EXPECT_EQ(RE2("^[a-c]at", RE2::POSIX).Regexp()->ToString(), "^[a-c]at"); @@ -1529,13 +1529,13 @@ TEST(RE2, Bug3061120) { // EXPECT_EQ(RE2("ca[t-z]$").Regexp()->ToString(), "ca[t-z](?-m:$)"); // } -// // Issue 10131674 -// TEST(RE2, Bug10131674) { -// // Some of these escapes describe values that do not fit in a byte. -// RE2 re("\\140\\440\\174\\271\\150\\656\\106\\201\\004\\332", RE2::Latin1); -// EXPECT_FALSE(re.ok()); -// EXPECT_FALSE(RE2::FullMatch("hello world", re)); -// } +// Issue 10131674 +TEST(RE2, Bug10131674) { + // Some of these escapes describe values that do not fit in a byte. + RE2 re("\\140\\440\\174\\271\\150\\656\\106\\201\\004\\332", RE2::Latin1); + EXPECT_FALSE(re.ok()); + EXPECT_FALSE(RE2::FullMatch("hello world", re)); +} // TEST(RE2, Bug18391750) { // // Stray write past end of match_ in nfa.cc, caught by fuzzing + address sanitizer.