From 8979c4a6f685396b95706ce11d62524e81f74f1f Mon Sep 17 00:00:00 2001 From: yangwentong <425822674@qq.com> Date: Tue, 3 Jan 2023 11:40:11 +0800 Subject: [PATCH 1/4] =?UTF-8?q?=E6=9B=B4=E6=96=B0README?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 75 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index c734488..d24bf0e 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,19 @@ # re2-rust -a compatible RE2 API( -2021-11-01) by calling Rust library [regex](https://github.com/rust-lang/regex) +## re2-rust介绍 +re2-rust是用来兼容RE2 API[(version 2021-11-01)](https://github.com/google/re2/tree/2021-11-01)的项目,通过调用[Rust正则表达式库](https://github.com/rust-lang/regex)进行实现。re2-rust的功能与原本RE2基本保持一致。 +re2-rust保留了re2中的对外的接口,分别在re2.h、set.h和filtered_re2.h中。 +re2.h中的接口可以实现正则表达式的匹配、查找和替换的功能;set.h中的接口可以同时处理多组正则表达式;filtered_re2.h中的接口提供了一种预过滤机制,有助于减少需要实际搜索的regexp的数量。这些接口再调用Rust正则库中提供的接口对用户传递过来的数据进行处理,最后再把结果进行返回。 + + +## 编译、安装re2-rust ``` Shell $ git clone https://gitee.com/openeuler/re2-rust.git $ cd re2-rust ``` - -### 编译、安装re2-rust - -使用openEuler 22.03-LTS +**使用openEuler 22.03-LTS** ``` Shell dnf install git @@ -27,7 +29,7 @@ g++ testinstall.cc -o testinstall -lre2 ./testinstall ``` -Ubuntu 20.04 +**使用Ubuntu 20.04** ``` Shell $ make @@ -37,8 +39,52 @@ $ g++ testinstall.cc -o testinstall -lre2 $ ./testinstall ``` -## Test Rusults +## 性能测试 +RE2-Rust项目中只需要对re2目录下filtered_re2.h、re2.h、set.h文件中声明的部分函数进行性能测试,而filtered_re2.h中的主要函数是通过调用re2.h中的`PartialMatch()`函数实现的,所以下面只对re2.h和set.h文件中主要函数进行性能测试。相关的性能测试代码详见regexp_benchmark.cc文件。 +re2.h文件中相关函数的性能测试: +我们对re2.h对外接口中的`FullMatch()`、`PartialMatch()`、`FindAndConsume()`三个函数进行了测试,下面表格中的re2-c++和re2-rust分别通过上述三个函数测试了表格中的八个正则表达式,但由于这三个函数的本质是调用了`RE2::DoMatch()`函数,所以在表格中不对上面三个函数进行区分。下面表格是regexp_benchmark.cc中一些正则表达式在text_re2_1KB.txt文本下的执行时间。 + +| 正则表达式 (含义) | RE2-C++ | RE2-Rust | PCRE | Regex | +| 
--------------------------------------- | ------------- | ------------- | ------------ | ------------- | +| `""` | 339 ns/iter | 213 ns/iter | 133 ns/iter | 54 ns/iter | +| **空** | 3019.80 MB/s | 4785.82 MB/s | 7653.53 MB/s | 18890.09 MB/s | +| `"abcdefg"` | 820 ns/iter | 259 ns/iter | 1686 ns/iter | 97 ns/iter | +| **匹配abcdefg字符串** | 1248.70 MB/s | 3951.78 MB/s | 607.26 MB/s | 10507.00 MB/s | +| `"(?-s)^(?:GET\|POST) +([^ ]+) HTTP"` | 343 ns/iter | 246 ns/iter | 147 ns/iter | 92 ns/iter | +| **匹配HTTP请求报文格式** | 2982.79 MB/s | 4157.21 MB/s | 6932.47 MB/s | 11096.98 MB/s | +| `"(?-s)^(.+)"` | 542 ns/iter | 212 ns/iter | 203 ns/iter | 56 ns/iter | +| **匹配行首连续出现一次以上的字符** | 1886.72 MB/s | 4807.80 MB/s | 5031.92 MB/s | 18062.85 MB/s | +| ` "(?-s)^([ -~]+)"` | 557 ns/iter | 217 ns/iter | 190 ns/iter | 59 ns/iter | +| **匹配行首连续出现一次以上的ASCII字符** | 1835.28 MB/s | 4715.87 MB/s | 5365.38 MB/s | 17188.64 MB/s | +| `"(?s).*"` | 349 ns/iter | 21223 ns/iter | 154 ns/iter | 2588 ns/iter | +| **匹配任意字符** | 2929.63 MB/s | 48.25 MB/s | 6640.02 MB/s | 395.62 MB/s | +| `"(?s).*$"` | 11401 ns/iter | 19678 ns/iter | 159 ns/iter | 2468 ns/iter | +| **匹配任意字符** | 89.81 MB/s | 52.04 MB/s | 6415.22 MB/s | 414.86 MB/s | +| `"(?s)((.*)()()($))"` | 11179 ns/iter | 19873 ns/iter | 260 ns/iter | 2488 ns/iter | +| **匹配任意字符** | 91.59 MB/s | 51.53 MB/s | 3937.18 MB/s | 411.54 MB/s | + +注:(?s)表示单行模式 + +set.h文件中相关函数的性能测试: + +可以看到,set.h中主要是下面的函数接口有匹配功能: + +`bool Match(const StringPiece& text, std::vector<int>* v) const;` +上述函数功能为同一文本可同时匹配多个正则表达式,并将匹配到的结果保存到向量v中,若传入的v为空则表示不需要返回匹配结果。 + +我们使用的待匹配文本还是text_re2_1KB.txt中的数据,同时匹配五个正则表达式,分别是`"(?s).*"`、`"(?s).*$"`、`"(?s)((.*)()()($))"`、`"hwx"`、`"ldi"`。 +
+由于对于锚点为`RE2::UNANCHORED`、`RE2::ANCHOR_BOTH`、`RE2::ANCHOR_START`三种不同情况已经在`RE2::Set::Add()`中进行了处理,所以对锚点三种不同情况的处理并不计算在匹配时间内。为方便RE2-Rust与RE2-C++、Regex进行性能对比分析,我们采用锚点为RE2::UNANCHORED进行性能对比,详细性能评测代码见regexp_benchmark.cc文件中`Set_Match_UNANCHORED_RE2()`和`Set_Match_UNANCHORED_NULL_RE2()`函数。下面是set.h文件中`RE2::Set::Match()`在RE2-C++、RE2-Rust、Regex三种不同正则表达式框架下的性能对比结果(PCRE不支持同时匹配多个正则表达式): + +| | RE2-C++ | RE2-Rust | Regex | +| ----------- | ------------ | ------------ | ------------ | +| **V为空** | 1716 ns/iter | 383 ns/iter | 18 ns/iter | +| | 596.67 MB/s | 2671.52 MB/s | 56944 MB/s | +| **V不为空** | 8231 ns/iter | 535 ns/iter | 6686 ns/iter | +| | 124.40 MB/s | 1910.52 MB/s | 153 MB/s | + +另外我们采用第三方正则表达式测试框架regex-performance,通过一些指定的正则表达式,对主流的正则表达式库进行了评测(测试详情可见https://gitee.com/openeuler/re2-rust/blob/master/test-results.txt),得到了如下结果: ``` Total Results: [ ctre] time: 4010462.7 ms, score: 6 points, @@ -56,11 +102,15 @@ Total Results: [rust_regex] time: 4790.2 ms, score: 56 points, [rust_regrs] time: 47772.1 ms, score: 6 points, ``` -从测试结果看re2-rust评分比re2略高,但是耗时re2-rust比re2增加很多,通过仔细分析发现正则表达式'[a-q][^u-z]{13}x'耗时特别高4280.7 - 130.5 = 4150.2 ms,另外'\b\w+nn\b'耗时322.6 - 23.9 = 298.7,除去这两个异常测试项外的16个测试项耗时re2-rust:334.4 ms vs. re2: 362 ms ,也就是说re2-rust在大多数情况下性能比re2要好。 - +从以上测试结果看re2-rust评分比re2略高,但是耗时re2-rust比re2增加很多,通过仔细分析发现正则表达式`'[a-q][^u-z]{13}x'`耗时特别高4280.7 - 130.5 = 4150.2 ms,另外`'\b\w+nn\b'`耗时322.6 - 23.9 = 298.7 ms,除去这两个异常测试项外的16个测试项耗时re2-rust:334.4 ms vs. re2: 362 ms ,也就是说re2-rust在大多数情况下性能比re2要好。 从测试耗时看re2-rust和rust_regex两者相差3%(多次测评结果看两者差距上下浮动5%以内),总体看re2-rust和rust_regex性能基本一致。 +综合对比可知: + +1. RE2-Rust在大部分测试用例下性能优于RE2-C++,而在涉及到捕获组时性能会差于RE2-C++,原因可见https://github.com/rust-lang/regex/discussions/903 +2. RE2-rust和Regex性能大致相当,但是由于RE2-Rust是调用了Regex的对外的C接口,所以RE2-rust会比Regex多了函数调用开销、特殊处理、错误判断等开销,故RE2-Rust性能会略低于Regex +3. RE2-Rust支持多行模式,但不支持同名的捕获组 +4. 
RE2-Rust比RE2-C++支持更少的转义字符,比如`”\C”` -测试采用第三方正则表达式测试框架regex-performance,测试详情见test-results.txt # Links @@ -68,6 +118,3 @@ Total Results: * https://gitee.com/src-openeuler/re2 * https://github.com/google/re2 * https://gitee.com/mengning997/regex-performance for re2-rust - - - -- Gitee From c216ad3cc7ab0c6354b154fd0405bd24adf8f3b1 Mon Sep 17 00:00:00 2001 From: yangwentong <425822674@qq.com> Date: Wed, 4 Jan 2023 12:10:54 +0800 Subject: [PATCH 2/4] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E5=A4=9A=E7=BA=BF?= =?UTF-8?q?=E7=A8=8B=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- re2/re2.cc | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/re2/re2.cc b/re2/re2.cc index 4091209..4a3d846 100644 --- a/re2/re2.cc +++ b/re2/re2.cc @@ -233,14 +233,13 @@ namespace re2 delete group_names_; } - // Returns named_groups_, computing it if needed. - const std::map &RE2::NamedCapturingGroups() const + std::map *NamedCaptures(re2::Prog *prog) { std::map *temp = new std::map; std::string str; char *name; int i = 0; - rure_iter_capture_names *it = rure_iter_capture_names_new((rure *)prog_); + rure_iter_capture_names *it = rure_iter_capture_names_new((rure *)prog); while (rure_iter_capture_names_next(it, &name)) { str = name; @@ -248,19 +247,16 @@ namespace re2 temp->insert(make_pair(str, i)); ++i; } - named_groups_ = temp; - - return *named_groups_; + return temp; } - - // Returns group_names_, computing it if needed. 
- const std::map &RE2::CapturingGroupNames() const + + std::map *CaptureNames(re2::Prog *prog) { std::map *temp = new std::map; std::string str; char *name; int i = 0; - rure_iter_capture_names *it = rure_iter_capture_names_new((rure *)prog_); + rure_iter_capture_names *it = rure_iter_capture_names_new((rure *)prog); while (rure_iter_capture_names_next(it, &name)) { str = name; @@ -268,7 +264,32 @@ namespace re2 temp->insert(make_pair(i, str)); ++i; } - group_names_ = temp; + return temp; + } + + // Returns named_groups_, computing it if needed. + const std::map &RE2::NamedCapturingGroups() const + { + std::call_once(named_groups_once_, [](const RE2* re) { + if (re->suffix_regexp_ != NULL) + { + re->named_groups_ = NamedCaptures(re->prog_); + } + if (re->named_groups_ == NULL) + re->named_groups_ = empty_named_groups; + }, this); + return *named_groups_; + } + + // Returns group_names_, computing it if needed. + const std::map &RE2::CapturingGroupNames() const + { + std::call_once(group_names_once_, [](const RE2* re) { + if (re->suffix_regexp_ != NULL) + re->group_names_ = CaptureNames(re->prog_); + if (re->group_names_ == NULL) + re->group_names_ = empty_group_names; + }, this); return *group_names_; } -- Gitee From 8ee853dddd0684835d43d2527ae7806c6f1f11e6 Mon Sep 17 00:00:00 2001 From: yangwentong <425822674@qq.com> Date: Wed, 4 Jan 2023 12:14:11 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E5=86=85=E5=AD=98?= =?UTF-8?q?=E6=B3=84=E6=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- re2/re2.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/re2/re2.cc b/re2/re2.cc index 4a3d846..6b63c67 100644 --- a/re2/re2.cc +++ b/re2/re2.cc @@ -187,6 +187,7 @@ namespace re2 error_ = new std::string(msg); error_code_ = ErrorInternal; // 暂时对这个错误进行赋值,如何处理错误类型??? 
} + rure_error_free(err); return; } prog_ = (Prog *)re; -- Gitee From 445cf2f364accb90f4b6d0c3d88ee4c69671e33f Mon Sep 17 00:00:00 2001 From: yangwentong <425822674@qq.com> Date: Wed, 4 Jan 2023 14:08:58 +0800 Subject: [PATCH 4/4] =?UTF-8?q?=E6=9B=B4=E6=94=B9Latin1=E7=BC=96=E7=A0=81?= =?UTF-8?q?=E7=9A=84=E5=AD=97=E7=AC=A6=E4=B8=B2=E8=BD=AC=E6=8D=A2=E4=B8=BA?= =?UTF-8?q?UTF8=E7=9A=84=E5=AD=97=E7=AC=A6=E4=B8=B2=E7=9A=84=E6=96=B9?= =?UTF-8?q?=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- re2/re2.cc | 154 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 127 insertions(+), 27 deletions(-) diff --git a/re2/re2.cc b/re2/re2.cc index 6b63c67..54a572c 100644 --- a/re2/re2.cc +++ b/re2/re2.cc @@ -94,30 +94,125 @@ namespace re2 Init(pattern, options); } - std::string encodingLatin1ToUTF8(std::string str) + typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/ + + enum + { + UTFmax = 4, /* maximum bytes per rune */ + Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ + Runeself = 0x80, /* rune and UTF sequences are the same (<) */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0x10FFFF, /* maximum rune value */ + }; + + enum + { + Bit1 = 7, + Bitx = 6, + Bit2 = 5, + Bit3 = 4, + Bit4 = 3, + Bit5 = 2, + + T1 = ((1 << (Bit1 + 1)) - 1) ^ 0xFF, /* 0000 0000 */ + Tx = ((1 << (Bitx + 1)) - 1) ^ 0xFF, /* 1000 0000 */ + T2 = ((1 << (Bit2 + 1)) - 1) ^ 0xFF, /* 1100 0000 */ + T3 = ((1 << (Bit3 + 1)) - 1) ^ 0xFF, /* 1110 0000 */ + T4 = ((1 << (Bit4 + 1)) - 1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1 << (Bit5 + 1)) - 1) ^ 0xFF, /* 1111 1000 */ + + Rune1 = (1 << (Bit1 + 0 * Bitx)) - 1, /* 0000 0000 0111 1111 */ + Rune2 = (1 << (Bit2 + 1 * Bitx)) - 1, /* 0000 0111 1111 1111 */ + Rune3 = (1 << (Bit3 + 2 * Bitx)) - 1, /* 1111 1111 1111 1111 */ + Rune4 = (1 << (Bit4 + 3 * Bitx)) - 1, + /* 0001 1111 1111 1111 1111 1111 */ + + Maskx = (1 << Bitx) - 1, /* 0011 1111 */ 
+ Testx = Maskx ^ 0xFF, /* 1100 0000 */ + + Bad = Runeerror, + }; + int runetochar(char *str, const Rune *rune) { - string strOut; - for (std::string::iterator it = str.begin(); it != str.end(); ++it) + /* Runes are signed, so convert to unsigned for range check. */ + unsigned long c; + + /* + * one character sequence + * 00000-0007F => 00-7F + */ + c = *rune; + if (c <= Rune1) { - uint8_t ch = *it; - if (ch < 0x80) - { - strOut.push_back(ch); - } - else - { - strOut.push_back(0xc0 | ch >> 6); - strOut.push_back(0x80 | (ch & 0x3f)); - } + str[0] = static_cast(c); + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + if (c <= Rune2) + { + str[0] = T2 | static_cast(c >> 1 * Bitx); + str[1] = Tx | (c & Maskx); + return 2; + } + + /* + * If the Rune is out of range, convert it to the error rune. + * Do this test here because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. + */ + if (c > Runemax) + c = Runeerror; + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + if (c <= Rune3) + { + str[0] = T3 | static_cast(c >> 2 * Bitx); + str[1] = Tx | ((c >> 1 * Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | static_cast(c >> 3 * Bitx); + str[1] = Tx | ((c >> 2 * Bitx) & Maskx); + str[2] = Tx | ((c >> 1 * Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; + } + + // Converts latin1 (assumed to be encoded as Latin1 bytes) + // into UTF8 encoding in string. + // Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is + // deprecated and because it rejects code points 0x80-0x9F. 
+ void ConvertLatin1ToUTF8(const StringPiece &latin1, std::string *utf) + { + char buf[UTFmax]; + + utf->clear(); + for (size_t i = 0; i < latin1.size(); i++) + { + Rune r = latin1[i] & 0xFF; + int n = runetochar(buf, &r); + utf->append(buf, n); } - return strOut; } void RE2::Init(const StringPiece &pattern, const Options &options) { std::string rure_str; // 正则表达式UTF-8编码形式 static std::once_flag empty_once; - std::call_once(empty_once, []() { //为了解决多线程中出现的资源竞争导致的数据不一致问题 + std::call_once(empty_once, []() { // 为了解决多线程中出现的资源竞争导致的数据不一致问题 empty_string = new std::string; empty_named_groups = new std::map; empty_group_names = new std::map; @@ -149,7 +244,7 @@ namespace re2 } else { // Latin-1编码 - rure_str = encodingLatin1ToUTF8(pattern.ToString()); + ConvertLatin1ToUTF8(pattern, &rure_str); } uint32_t flags = RURE_DEFAULT_FLAGS; @@ -163,7 +258,7 @@ namespace re2 // for All rure *re = rure_compile((const uint8_t *)rure_str.c_str(), strlen(rure_str.c_str()), flags, NULL, err); - //如果编译失败,打印错误信息 + // 如果编译失败,打印错误信息 if (re == NULL) { const char *msg = rure_error_message(err); @@ -206,7 +301,7 @@ namespace re2 entire_regexp_ = (re2::Regexp *)re; } - //获取捕获组的数量, 并对num_captures_其进行赋值 + // 获取捕获组的数量, 并对num_captures_其进行赋值 rure_captures *caps = rure_captures_new(re); size_t captures_len = rure_captures_len(caps) - 1; if (!options_.never_capture()) @@ -271,26 +366,30 @@ namespace re2 // Returns named_groups_, computing it if needed. const std::map &RE2::NamedCapturingGroups() const { - std::call_once(named_groups_once_, [](const RE2* re) { + std::call_once( + named_groups_once_, [](const RE2 *re) + { if (re->suffix_regexp_ != NULL) { re->named_groups_ = NamedCaptures(re->prog_); } if (re->named_groups_ == NULL) - re->named_groups_ = empty_named_groups; - }, this); + re->named_groups_ = empty_named_groups; }, + this); return *named_groups_; } - + // Returns group_names_, computing it if needed. 
const std::map &RE2::CapturingGroupNames() const { - std::call_once(group_names_once_, [](const RE2* re) { + std::call_once( + group_names_once_, [](const RE2 *re) + { if (re->suffix_regexp_ != NULL) re->group_names_ = CaptureNames(re->prog_); if (re->group_names_ == NULL) - re->group_names_ = empty_group_names; - }, this); + re->group_names_ = empty_group_names; }, + this); return *group_names_; } @@ -505,7 +604,8 @@ namespace re2 // Latin-1编码转换 if (options_.encoding() == RE2::Options::EncodingLatin1) { - haystack = encodingLatin1ToUTF8(text.as_string()); + ConvertLatin1ToUTF8(text, &haystack); + // haystack = encodingLatin1ToUTF8(text.as_string()); } rure *re = (rure *)prog_; // rure *re1 = (rure *)rprog_; @@ -683,7 +783,7 @@ namespace re2 // vec 用于存放捕获到的数据 // nvec 表示需要捕获的数据的个数 - //此处在改写的时候先不进行任何处理,直接使用之前的Match函数,完成之后在对Match进行改写 + // 此处在改写的时候先不进行任何处理,直接使用之前的Match函数,完成之后在对Match进行改写 if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) { // std::cout << "DoMatch : Match 带参 未匹配"; -- Gitee