From 3591f8be85c8d7a25ff13f44ed22604d9bf2fa3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=BF=97=E6=B6=9B?= Date: Fri, 2 Sep 2022 09:17:28 +0000 Subject: [PATCH 1/2] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=BA=86=20=E9=80=9A?= =?UTF-8?q?=E8=BF=87rure=5Fis=5Fmatch()=E5=AF=B9=E5=8C=85=E5=90=AB?= =?UTF-8?q?=E8=B5=B7=E6=AD=A2=E7=AC=A6=20^=20=E5=92=8C=E7=BB=93=E6=9D=9F?= =?UTF-8?q?=E7=AC=A6=20$=20=E6=AD=A3=E5=88=99=E8=A1=A8=E8=BE=BE=E5=BC=8F?= =?UTF-8?q?=E7=9A=84=E7=9B=B4=E6=8E=A5=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 刘志涛 --- re2/testing/regexp_benchmark.cc | 70 +++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc index c2e9b50..e8ba4cf 100644 --- a/re2/testing/regexp_benchmark.cc +++ b/re2/testing/regexp_benchmark.cc @@ -207,7 +207,7 @@ void FindAndConsume(benchmark::State& state) { state.SetBytesProcessed(state.iterations() * state.range(0)); } -BENCHMARK_RANGE(FindAndConsume, 8, 16)->ThreadRange(1, NumCPUs()); +// BENCHMARK_RANGE(FindAndConsume, 8, 16)->ThreadRange(1, NumCPUs()); //////////////////////////////////////////////////////////////////////// // @@ -423,7 +423,7 @@ void EmptyPartialMatchRE2(benchmark::State& state) { } } -void EmptyPartialMatchRE2_LiuZhitao(benchmark::State& state) { +void EmptyPartialMatchRE2_text_re2_1KB(benchmark::State& state) { std::ifstream in("re2/testing/text_re2_1KB.txt"); std::stringstream buffer; buffer << in.rdbuf(); @@ -439,7 +439,7 @@ void EmptyPartialMatchRE2_LiuZhitao(benchmark::State& state) { BENCHMARK(EmptyPartialMatchPCRE)->ThreadRange(1, NumCPUs()); #endif BENCHMARK(EmptyPartialMatchRE2)->ThreadRange(1, NumCPUs()); -BENCHMARK_RANGE(EmptyPartialMatchRE2_LiuZhitao, 8, 2<<9); +BENCHMARK_RANGE(EmptyPartialMatchRE2_text_re2_1KB, 2<<6, 2<<9); void SimplePartialMatchPCRE(benchmark::State& state) { PCRE re("abcdefg"); @@ -545,7 +545,7 @@ void ASCIIMatchRE2(benchmark::State& state) { } } -void ASCIIMatchRE2_LiuZhitao(benchmark::State& state) { +void ASCIIMatchRE2_text_re2_1KB(benchmark::State& state) { std::ifstream in("re2/testing/text_re2_1KB.txt"); std::stringstream buffer; buffer << in.rdbuf(); @@ -561,7 +561,7 @@ void ASCIIMatchRE2_LiuZhitao(benchmark::State& state) { BENCHMARK(ASCIIMatchPCRE)->ThreadRange(1, NumCPUs()); #endif BENCHMARK(ASCIIMatchRE2)->ThreadRange(1, NumCPUs()); -BENCHMARK_RANGE(ASCIIMatchRE2_LiuZhitao, 8, 2<<9); +BENCHMARK_RANGE(ASCIIMatchRE2_text_re2_1KB, 2<<6, 2<<9); void FullMatchPCRE(benchmark::State& state, const char *regexp) { std::string s = RandomText(state.range(0)); @@ -578,12 +578,13 @@ void FullMatchRE2(benchmark::State& state, const char *regexp) { s += "ABCDEFGHIJ"; RE2 re(regexp, RE2::Latin1); for (auto _ : state) { + CHECK(RE2::FullMatch(s, re)); } state.SetBytesProcessed(state.iterations() * state.range(0)); } -void FullMatchRE2_LiuZhitao(benchmark::State& state, const char *regexp) { +void FullMatchRE2_text_re2_1KB(benchmark::State& state, const char *regexp) { std::ifstream in("re2/testing/text_re2_1KB.txt"); std::stringstream buffer; @@ -591,6 +592,7 @@ void FullMatchRE2_LiuZhitao(benchmark::State& state, const char *regexp) { std::string s = buffer.str(); RE2 re(regexp, RE2::Latin1); for (auto _ : state) { + CHECK(RE2::FullMatch(s.substr(0, state.range(0)), re)); } state.SetBytesProcessed(state.iterations() * state.range(0)); @@ -602,7 +604,6 @@ void Rure_Find_RE2(benchmark::State& state, const char *regexp) std::stringstream buffer; buffer << in.rdbuf(); std::string s = buffer.str().substr(0, state.range(0)); - RE2 re(regexp); rure_error *err = rure_error_new(); rure *re1 = rure_compile((const uint8_t *)regexp, strlen(regexp), RURE_DEFAULT_FLAGS, NULL, err); rure_match match = {0}; @@ -619,40 +620,49 @@ void Rure_is_Match_RE2(benchmark::State& state, const char *regexp) std::stringstream buffer; buffer << in.rdbuf(); std::string s = buffer.str().substr(0, state.range(0)); - RE2 re(regexp); rure_error *err = rure_error_new(); rure *re1 = rure_compile((const uint8_t *)regexp, strlen(regexp), RURE_DEFAULT_FLAGS, NULL, err); for (auto _ : state) { - bool matched = rure_is_match(re1, (const uint8_t *)s.c_str(), strlen(s.c_str()), 0); + bool matched = rure_is_match((rure *)re1, (const uint8_t *)s.c_str(), strlen(s.c_str()), 0); CHECK(matched); } state.SetBytesProcessed(state.iterations() * state.range(0)); } -void Rure_Find_RE2_Bench1(benchmark::State& state) { Rure_Find_RE2(state, "(?s).*"); } -void Rure_Find_RE2_Bench2(benchmark::State& state) { Rure_Find_RE2(state, "(?s).*$"); } -void Rure_Find_RE2_Bench3(benchmark::State& state) { Rure_Find_RE2(state, "(?s)((.*)()()($))"); } +void Rure_Find_RE2_DotStar_text_re2_1KB(benchmark::State& state) { Rure_Find_RE2(state, "(?s).*"); } +void Rure_Find_RE2_DotStarDollar_text_re2_1KB(benchmark::State& state) { Rure_Find_RE2(state, "(?s).*$"); } +void Rure_Find_RE2_DotStarCapture_text_re2_1KB(benchmark::State& state) { Rure_Find_RE2(state, "(?s)((.*)()()($))"); } -BENCHMARK_RANGE(Rure_Find_RE2_Bench1, 2<<6, 2<<9); -BENCHMARK_RANGE(Rure_Find_RE2_Bench2, 2<<6, 2<<9); -BENCHMARK_RANGE(Rure_Find_RE2_Bench3, 2<<6, 2<<9); +// BENCHMARK_RANGE(Rure_Find_RE2_DotStar_text_re2_1KB, 2<<3, 2<<9); +// BENCHMARK_RANGE(Rure_Find_RE2_DotStarDollar_text_re2_1KB, 2<<3, 2<<9); +// BENCHMARK_RANGE(Rure_Find_RE2_DotStarCapture_text_re2_1KB, 2<<3, 2<<9); -void Rure_is_Match_RE2_Bench1(benchmark::State& state) { Rure_is_Match_RE2(state, "(?s).*"); } -void Rure_is_Match_RE2_Bench2(benchmark::State& state) { Rure_is_Match_RE2(state, "(?s).*$"); } -void Rure_is_Match_RE2_Bench3(benchmark::State& state) { Rure_is_Match_RE2(state, "(?s)((.*)()()($))"); } +// 不加起止符 ^ 结束符 $ 的正则表达式,也就是PartialMatch,通过regex对外接口rure_is_match()函数直接测试 +void Rure_is_Match_RE2_DotStar_text_re2_1KB(benchmark::State& state) { Rure_is_Match_RE2(state, "(?s).*"); } +void Rure_is_Match_RE2_DotStarDollar_text_re2_1KB(benchmark::State& state) { Rure_is_Match_RE2(state, "(?s).*$"); } +void Rure_is_Match_RE2_DotStarCapture_text_re2_1KB(benchmark::State& state) { Rure_is_Match_RE2(state, "(?s)((.*)()()($))"); } -BENCHMARK_RANGE(Rure_is_Match_RE2_Bench1, 2<<6, 2<<9); -BENCHMARK_RANGE(Rure_is_Match_RE2_Bench2, 2<<6, 2<<9); -BENCHMARK_RANGE(Rure_is_Match_RE2_Bench3, 2<<6, 2<<9); +BENCHMARK_RANGE(Rure_is_Match_RE2_DotStar_text_re2_1KB, 2<<6, 2<<9); +BENCHMARK_RANGE(Rure_is_Match_RE2_DotStarDollar_text_re2_1KB, 2<<6, 2<<9); +BENCHMARK_RANGE(Rure_is_Match_RE2_DotStarCapture_text_re2_1KB, 2<<6, 2<<9); +// 加起止符 ^ 结束符 $ 的正则表达式,也就是FullMatch,通过regex对外接口rure_is_match()函数直接测试 +void Rure_is_Match_RE2_Begin_DotStar_End_text_re2_1KB(benchmark::State& state) { Rure_is_Match_RE2(state, "^(?s).*$"); } +void Rure_is_Match_RE2_Begin_DotStarDollar_End_text_re2_1KB(benchmark::State& state) { Rure_is_Match_RE2(state, "^(?s).*$$"); } +void Rure_is_Match_RE2_Begin_DotStarCapture_End_text_re2_1KB(benchmark::State& state) { Rure_is_Match_RE2(state, "^(?s)((.*)()()($))$"); } -void FullMatch_DotStar_CachedRE2_LiuZhitao(benchmark::State& state) { FullMatchRE2_LiuZhitao(state, "(?s).*"); } -void FullMatch_DotStarDollar_CachedRE2_LiuZhitao(benchmark::State& state) { FullMatchRE2_LiuZhitao(state, "(?s).*$"); } -void FullMatch_DotStarCapture_CachedRE2_LiuZhitao(benchmark::State& state) { FullMatchRE2_LiuZhitao(state, "(?s)((.*)()()($))"); } +BENCHMARK_RANGE(Rure_is_Match_RE2_Begin_DotStar_End_text_re2_1KB, 2<<6, 2<<9); +BENCHMARK_RANGE(Rure_is_Match_RE2_Begin_DotStarDollar_End_text_re2_1KB, 2<<6, 2<<9); +BENCHMARK_RANGE(Rure_is_Match_RE2_Begin_DotStarCapture_End_text_re2_1KB, 2<<6, 2<<9); -BENCHMARK_RANGE(FullMatch_DotStar_CachedRE2_LiuZhitao, 8, 2<<9); -BENCHMARK_RANGE(FullMatch_DotStarDollar_CachedRE2_LiuZhitao, 8, 2<<9); -BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedRE2_LiuZhitao, 8, 2<<9); +// 加起止符 ^ 结束符 $ 的正则表达式,也就是FullMatch,通过原本RE2项目对外接口FullMatch()函数测试 +void FullMatch_RE2_DotStar_text_re2_1KB(benchmark::State& state) { FullMatchRE2_text_re2_1KB(state, "(?s).*"); } +void FullMatch_RE2_DotStarDollar_text_re2_1KB(benchmark::State& state) { FullMatchRE2_text_re2_1KB(state, "(?s).*$"); } +void FullMatch_RE2_DotStarCapture_text_re2_1KB(benchmark::State& state) { FullMatchRE2_text_re2_1KB(state, "(?s)((.*)()()($))"); } + +BENCHMARK_RANGE(FullMatch_RE2_DotStar_text_re2_1KB, 2<<6, 2<<9); +BENCHMARK_RANGE(FullMatch_RE2_DotStarDollar_text_re2_1KB, 2<<6, 2<<9); +BENCHMARK_RANGE(FullMatch_RE2_DotStarCapture_text_re2_1KB, 2<<6, 2<<9); void FullMatch_DotStar_CachedPCRE(benchmark::State& state) { FullMatchPCRE(state, "(?s).*"); } void FullMatch_DotStar_CachedRE2(benchmark::State& state) { FullMatchRE2(state, "(?s).*"); } @@ -666,17 +676,17 @@ void FullMatch_DotStarCapture_CachedRE2(benchmark::State& state) { FullMatchRE2 #ifdef USEPCRE BENCHMARK_RANGE(FullMatch_DotStar_CachedPCRE, 8, 2<<20); #endif -BENCHMARK_RANGE(FullMatch_DotStar_CachedRE2, 8, 2<<20); +BENCHMARK_RANGE(FullMatch_DotStar_CachedRE2, 2<<6, 2<<9); #ifdef USEPCRE BENCHMARK_RANGE(FullMatch_DotStarDollar_CachedPCRE, 8, 2<<20); #endif -BENCHMARK_RANGE(FullMatch_DotStarDollar_CachedRE2, 8, 2<<20); +BENCHMARK_RANGE(FullMatch_DotStarDollar_CachedRE2, 2<<6, 2<<9); #ifdef USEPCRE BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedPCRE, 8, 2<<20); #endif -BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedRE2, 8, 2<<20); +BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedRE2, 2<<6, 2<<9); } // namespace re2 -- Gitee From 7db037789e0540a2c598eaf061a84c77dd689737 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=BF=97=E6=B6=9B?= Date: Fri, 2 Sep 2022 09:19:21 +0000 Subject: [PATCH 2/2] =?UTF-8?q?=E5=88=A9=E7=94=A8=E7=A7=81=E6=9C=89?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E6=88=90=E5=91=98rprog=5F=E4=BB=A5=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0PartialMatch=E4=B8=8EFullMatch=E5=8C=BA=E5=88=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 刘志涛 --- re2/re2.cc | 59 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/re2/re2.cc b/re2/re2.cc index 64afcd9..60ee331 100644 --- a/re2/re2.cc +++ b/re2/re2.cc @@ -210,7 +210,11 @@ namespace re2 if(options_.dot_nl()) flags = RURE_FLAG_DOTNL; if(options_.never_nl()) flags = RURE_DEFAULT_FLAGS; // 空字符串的处理??? + std::string FullMatch_rure_str = rure_str; + FullMatch_rure_str.insert(0, "^"); + FullMatch_rure_str.append("$"); rure *re = rure_compile((const uint8_t *)rure_str.c_str(), strlen(rure_str.c_str()), flags, NULL, err); + rure *re1 = rure_compile((const uint8_t *)FullMatch_rure_str.c_str(), strlen(FullMatch_rure_str.c_str()), flags, NULL, err); //如果编译失败,打印错误信息 if (re == NULL) @@ -246,6 +250,7 @@ namespace re2 else { prog_ = (Prog *)re; + rprog_ = (Prog *)re1; error_ = empty_string; error_code_ = RE2::NoError; } @@ -724,12 +729,36 @@ namespace re2 // } // return true; // } - + // rure_error *err = rure_error_new(); + // rure *re = rure_compile((const uint8_t *) pattern_.c_str(), strlen(pattern_.c_str()), RURE_DEFAULT_FLAGS, NULL, err); const char *haystack = text.data(); rure *re = (rure *)prog_; + rure *re1 = (rure *)rprog_; rure_match match = {0}; - bool matched = rure_find(re, (const uint8_t *)haystack, strlen(haystack), 0, &match); + // bool matched = rure_find(re, (const uint8_t *)haystack, strlen(haystack), 0, &match); + if(re_anchor == UNANCHORED) + { + bool matched = rure_is_match(re, (const uint8_t *)haystack, strlen(haystack), 0); + if(!matched){ + return false; + } + else if(!nsubmatch){ + return true; + } + } + else if(re_anchor == ANCHOR_BOTH) + { + bool matched = rure_is_match(re1, (const uint8_t *)haystack, strlen(haystack), 0); + if(!matched){ + return false; + } + else if(!nsubmatch){ + return true; + } + } + + /* switch (re_anchor) { // ANCHOR_BOTH FullMatch @@ -776,7 +805,8 @@ namespace re2 } else { - if (matched && match.end != 0) + // if (matched && match.end != 0) + if (matched) return true; else return false; @@ -799,9 +829,9 @@ namespace re2 } } } - + */ // Demo 获取捕获组内容,存储到submatch数组中 - + /* size_t length = strlen(haystack); rure_captures *caps = rure_captures_new(re); @@ -833,6 +863,7 @@ namespace re2 return true; + */ } // std::string_view in MSVC has iterators that aren't just pointers and @@ -869,6 +900,7 @@ namespace re2 return false; } + /* // 判断是否FullMatch, 判空 std::string haystack; if (text.data() == NULL || text[0] == '\0') @@ -890,6 +922,7 @@ namespace re2 rure *re = (rure *)prog_; rure_match match = {0}; bool matched = rure_find(re, (const uint8_t *)haystack.c_str(), strlen(haystack.c_str()), 0, &match); + */ // Count number of capture groups needed. int nvec; @@ -897,6 +930,8 @@ namespace re2 nvec = 0; // 0个捕获组 else nvec = n + 1; + + /* // 0个捕获组的匹配判断 if (nvec == 0) { @@ -963,7 +998,7 @@ namespace re2 } } } - + */ StringPiece *vec; StringPiece stkvec[kVecSize]; StringPiece *heapvec = NULL; @@ -1003,12 +1038,12 @@ namespace re2 // 结下来就是要对正表达式中存在的捕获组进行处理 // 如果不需要捕获组,直接返回true - // if (n == 0 || args == NULL) - // { - // // We are not interested in results - // delete[] heapvec; - // return true; - // } + if (n == 0 || args == NULL) + { + // We are not interested in results + delete[] heapvec; + return true; + } // If we got here, we must have matched the whole pattern. for (int i = 0; i < n; i++) -- Gitee