diff --git a/re2/filtered_re2.cc b/re2/filtered_re2.cc index 71487cad42e09aae0252ad5c68f933ee0802485d..a025cc57c9b781fba8956f2f5aa80175e08c22ad 100644 --- a/re2/filtered_re2.cc +++ b/re2/filtered_re2.cc @@ -1,7 +1,8 @@ // Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style +// Use of this source code is governed by a BSD-stylePrefilter // license that can be found in the LICENSE file. #include +#include #include #include "re2/filtered_re2.h" #include @@ -11,8 +12,15 @@ #include "util/util.h" #include "util/logging.h" // #include "re2/prefilter.h" +extern "C" +{ +#include "regex-capi/include/rure.h" +} +using namespace std; namespace re2 { -class Prefilter {}; + +std::map> map_atoms; + // #include "re2/prefilter_tree.h" class PrefilterTree { public: @@ -22,9 +30,22 @@ class Prefilter {}; int getMinAtomLen(){ return min_atom_len_; } + bool get_is_latin_result() {return is_latin;}; + void set_latin(bool x); + std::string get_latin_string() {return str_latin;}; + void set_latin_str(std::string x); + private: const int min_atom_len_; + bool is_latin; + std::string str_latin; }; + void PrefilterTree::set_latin(bool x) { + is_latin = x; + } + void PrefilterTree::set_latin_str(std::string x) { + str_latin = x; + } }; namespace re2 { @@ -64,7 +85,12 @@ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, const RE2::Options& options, int* id) { RE2* re = new RE2(pattern, options); RE2::ErrorCode code = re->error_code(); - + if(options.encoding() == RE2::Options::EncodingLatin1) { + prefilter_tree_->set_latin(true); + prefilter_tree_->set_latin_str(pattern.as_string()); + } + else prefilter_tree_->set_latin(false); + if (!re->ok()) { if (options.log_errors()) { LOG(ERROR) << "Couldn't compile regular expression, skipping: " @@ -79,412 +105,8 @@ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, return code; } -/** - * 负责对字符集进行连接操作 - * - */ -std::vector Connection(std::string str, std::vector vec1, std::vector vec2) -{ - std::vector vec_tmp; - if (str.size() > 0) - { - for (size_t i = 0; i < vec2.size(); i++) - { - vec_tmp.push_back(str + vec2[i]); - } - } - else if (vec1.size() == 0) - { - for (auto x : vec2) - { - std::string str; - str.push_back(x); - vec_tmp.push_back(str); - } - } - else - { - for (size_t i = 0; i < vec1.size(); i++) - { - for (size_t j = 0; j < vec2.size(); j++) - { - - vec_tmp.push_back(vec1[i] + vec2[j]); - } - } - } - vec1.clear(); - return vec_tmp; -} - -/** - * 处理 - * a[a-c]a[zv] - * [abc] - * [a-c]+ - */ -std::vector CharClassExpansion(std::string str, int start_post, int end_post) -{ - std::vector atoms; // 字符集中的atoms - std::vector vec_tmp; - // std::vector vec_op; - std::vector vec_char; - std::string str_tmp; - - vec_tmp.clear(); - // vec_op.clear(); - vec_char.clear(); - int flag_connect = 0; - for (int i = start_post; i <= end_post; i++) - { - if (str[i] == '[' || str[i] == ']') - { - continue; - } - else if ((str[i] >= 'a' && str[i] <= 'z') || (str[i] >= 0 && str[i] <= 9)) - { - vec_char.push_back(str[i]); - } - else if (str[i] == '-') - { - flag_connect = 1; - continue; - } - } - - // 将字符集拆分的所有可能字符存储到atoms中 - if (flag_connect == 1) - { - char x2 = vec_char[1]; - char x1 = vec_char[0]; - for (int i = int(x1); i <= int(x2); i++) - { - atoms.push_back(char(i)); - } - } - else if (flag_connect == 0) - { - for (auto x : vec_char) - { - atoms.push_back(x); - } - } - return atoms; -} - -/** - * (abc123|abc|ghi789|abc1234) - 3-abc - 6-abc123 - 6-ghi789 - 7-abc1234 - * abc abc123 ghi789 abc1234 - */ -std::vector Group_multiple_selection(std::string str, int start_point, - int end_point, size_t min_atoms_len) -{ - std::string str_tmp; // 暂存atoms - std::multimap atoms_tmp; - std::vector vec_atoms_one; // 第一次暂存所有的atoms - std::vector vec_atoms_two; // 第二次暂存所有的atoms - std::vector vec_atoms_real; // 符合规则的atoms - // 先获取所有的atoms - for (int i = start_point; i <= end_point; i++) - { - if (str[i] == '(') - continue; - if (str[i] == '|' || str[i] == ')') - { - if (str_tmp.size() >= min_atoms_len) - { - atoms_tmp.insert(make_pair(str_tmp.size(), str_tmp)); - } - str_tmp.clear(); - continue; - } - else - { - str_tmp += str[i]; - } - } - // 去除所有所有比最短atoms长的元素,并且最短atoms是他们的子集 - for (auto it = atoms_tmp.begin(); it != atoms_tmp.end(); it++) - { - vec_atoms_one.push_back(it->second); - vec_atoms_two.push_back(it->second); - } - for (size_t i = 0; i < vec_atoms_one.size(); i++) - { - for (size_t j = 0; j < vec_atoms_two.size(); j++) - { - if(vec_atoms_two[j].find(vec_atoms_one[i]) != std::string::npos && vec_atoms_one[i] != vec_atoms_two[j] - && vec_atoms_one[i].size() != 0 && vec_atoms_two[j].size() != 0) - { // 如果包含,则置为空 - vec_atoms_two[j] = "-1"; - } - } - } - // 重新赋值给 vec_atoms_real - for (size_t i = 0; i < vec_atoms_two.size(); i++) - { - if (vec_atoms_two[i] != "-1") - { - vec_atoms_real.push_back(vec_atoms_two[i]); - } - } - return vec_atoms_real; -} - -/** - * 把所有的大写字母转换为小写字母 - * 1. 标准ASCII - * 2. 非标准ASCII 如希腊字母 - */ - -void UpperToLower(std::string &str, int start_post, int end_post) -{ - // 标准ASCII转小写 - transform(str.begin(), str.end(), str.begin(), ::tolower); -} - -void HandleCharacterCase(std::string &str) -{ - std::map m = {{"\u0391", "\u03B1"}, {"\u0392", "\u03B2"}, {"\u0393", "\u03B3"}, - {"\u0394", "\u03B4"}, {"\u0395", "\u03B5"}, {"\u0396", "\u03B6"}, - {"\u0397", "\u03B7"}, {"\u0398", "\u03B8"}, {"\u0399", "\u03B9"}, - {"\u039A", "\u03BA"}, {"\u039B", "\u03BB"}, {"\u039C", "\u03BC"}, - {"\u039D", "\u03BD"}, {"\u039E", "\u03BE"}, {"\u039F", "\u03BF"}, - {"\u03A0", "\u03C0"}, {"\u03A1", "\u03C1"}, {"\u03A2", "\u03C2"}, - {"\u03A3", "\u03C3"}, {"\u03A4", "\u03C4"}, {"\u03A5", "\u03C5"}, - {"\u03A6", "\u03C6"}, {"\u03A7", "\u03C7"}, {"\u03A8", "\u03C8"}, - {"\u03A9", "\u03C9"}}; - for(size_t i = 0; i < str.length(); i += 2) - { - std::string subStr = str.substr(i, 2); - if(m.count(subStr) > 0) - { - str.replace(i, 2, m[subStr]); - continue; - } - else if(subStr == "ϖ") - { - str.replace(i, 2, "π"); - continue; - } - else if(subStr == "ς") - { - str.replace(i, 2, "σ"); - continue; - } - } -} - - -bool JudgeIsCharOrNumber(char x) -{ - if ((x >= 'a' && x <= 'z') || (x >= 0 && x <= 9)) - return true; - return false; -} - -bool JudgeMinux(char x) -{ - if(x == '-') return true; - return false; -} - -bool JudgeRBrace(char x) -{ - if(x == '}') return true; - return false; -} -bool JudgeLBrace(char x) -{ - if(x == '{') return true; - return false; -} - -bool JudedIsGreekAlphabet(std::string str) -{ - std::vector vec_alphabet = {"\u03B1", "\u03B2", "\u03B3", "\u03B4", "\u03B5", - "\u03B6", "\u03B7", "\u03B8", "\u03B9", "\u03BA", - "\u03BB", "\u03BC", "\u03BD", "\u03BE", "\u03BF", - "\u03C0", "\u03C1", "\u03C2", "\u03C3", "\u03C4", - "\u03C5", "\u03C6", "\u03C7", "\u03C8", "\u03C9"}; - for(auto x : vec_alphabet) - { - if(x == str) return true; - } - return false; -} - -std::vector MyCompile(std::string str, size_t min_atoms_len) -{ - std::vector my_atoms; // 最终得到的所有atoms - std::vector vec_atoms_tmp; // 暂存的atom - std::vector vec_con; - std::vector atoms_tmp; - std::string atoms_tmp_string; - std::string subStr; - // 将字符串中的大写字符变为小写 - UpperToLower(str, 0, str.size()); - HandleCharacterCase(str); - for (size_t i = 0; i < str.length(); i++) - { - // 处理希腊字母 - subStr.clear(); - subStr = str.substr(i, 2); - if(JudedIsGreekAlphabet(subStr)) - { - ++i; - atoms_tmp_string += subStr; - continue; - } - if(JudgeLBrace(str[i])) - { - do ++i; while(!JudgeRBrace(str[i])); - ++i; - } - // 处理括号分组 - if (str[i] == '(') - { - int group_start_post = i; - do - ++i; - while (str[i] != ')'); - int group_end_post = i; - - std::vector vec = Group_multiple_selection(str, group_start_post, group_end_post, min_atoms_len); - int tmp_post_group = i; - ++tmp_post_group; - if (str[tmp_post_group] == '.' && vec.size() != 0) - { - ++i; - for (auto x : vec) my_atoms.push_back(x); - } - else if(str[tmp_post_group] == '{' && vec.size() != 0) - { - for (auto x : vec) my_atoms.push_back(x); - - } - // "(abc123|def456|ghi789).*mnop[x-z]+" - continue; - } - if (JudgeIsCharOrNumber(str[i]) || JudgeMinux(str[i]) || JudgeRBrace(str[i])) - { - atoms_tmp_string += str[i]; - continue; - } - // 处理 - if (str[i] == '\\') - { - if (atoms_tmp_string.size() > 0) - { - my_atoms.push_back(atoms_tmp_string); - atoms_tmp_string.clear(); - } - - int escape_char_post = i; - if (JudgeIsCharOrNumber(++escape_char_post)) - ++i; - int escape_plus_post = i; - if (str[++escape_plus_post] == '+') - ++i; - continue; - } - if (str[i] == '*') - { - continue; - } - if (str[i] == '.') - { - if (atoms_tmp_string.size() >= min_atoms_len && vec_atoms_tmp.size() == 0) - { - my_atoms.push_back(atoms_tmp_string); - atoms_tmp_string.clear(); - continue; - } - else if (atoms_tmp_string.size() > 0 && vec_atoms_tmp.size() != 0) - { - for (auto x : vec_atoms_tmp) - { - my_atoms.push_back(x + atoms_tmp_string); - } - atoms_tmp_string.clear(); - vec_atoms_tmp.clear(); - } - else - { - continue; - } - } - // 处理多个字符集 - // a[a-b]a - // a[a-b][a-b] - if(str[i] == '[') - { - int start_post = i; - do ++i; while(str[i] != ']'); - int plus_tmp = i; - // 看是否有 + 号 - if(str[++plus_tmp] == '+') - { - - if(atoms_tmp_string.size() > 2) - { - my_atoms.push_back(atoms_tmp_string); - atoms_tmp_string.clear(); - } - } - else - { - // 如果[x-y]aaab[ab]形式 - if(atoms_tmp_string.size() > 0 && vec_con.size() >0) - { - vec_atoms_tmp.clear(); - for(auto x : vec_con) - { - vec_atoms_tmp.push_back(x + atoms_tmp_string); - } - atoms_tmp_string.clear(); - } - int end_post = i; - atoms_tmp.clear(); - atoms_tmp = CharClassExpansion(str, start_post, end_post); - vec_con.clear(); - vec_con = Connection(atoms_tmp_string, vec_atoms_tmp, atoms_tmp); - atoms_tmp_string.clear(); - vec_atoms_tmp.clear(); - for(size_t i = 0; i < vec_con.size(); i++) - { - vec_atoms_tmp.push_back(vec_con[i]); - } - - } - } - if(int(str[i]) < 0) - { - atoms_tmp_string += str[i]; - continue; - } - } - if(vec_atoms_tmp.size() > 0) - { - for(auto x : vec_atoms_tmp) - { - my_atoms.push_back(x); - } - } - if(atoms_tmp_string.size() >= min_atoms_len) - { - my_atoms.push_back(atoms_tmp_string); - } - return my_atoms; -} - - - void FilteredRE2::Compile(std::vector* atoms) { + map_atoms.clear(); if (compiled_) { LOG(ERROR) << "Compile called already."; return; @@ -495,13 +117,35 @@ void FilteredRE2::Compile(std::vector* atoms) { return; } atoms->clear(); - - for(size_t i = 0; i < re2_vec_.size(); i++) - { - std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), prefilter_tree_->getMinAtomLen()); - for(auto x : my_atoms) - atoms->push_back(x); + + // 处理latin的情况 + if(prefilter_tree_->get_is_latin_result()) { + std::string str = prefilter_tree_->get_latin_string(); + std::vector vec; + vec.push_back(str); + std::string str_low = str; + transform(str_low.begin(),str_low.end(),str_low.begin(),::tolower); + atoms->push_back(str_low); + map_atoms.insert(map>::value_type(str, vec)); + map_atoms.insert(map>::value_type("total", vec)); + compiled_ = true; + return; } + + for(size_t i = 0; i < re2_vec_.size(); i++) { + // std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), prefilter_tree_->getMinAtomLen()); + const char *regex = re2_vec_[i]->pattern().c_str(); + std::string regex_str = regex; + MyVec vec = rure_filter_compile((const uint8_t *)regex, strlen(regex), prefilter_tree_->getMinAtomLen()); + int32_t len = vec.len; + std::vector v; + for(int32_t i = 0; i < len; i++) { + atoms->push_back(vec.data[i].atom); + v.push_back(vec.data[i].atom); + } + map_atoms.insert(map>::value_type(regex_str, v)); + } + map_atoms.insert(map>::value_type("total", *atoms)); compiled_ = true; } @@ -522,31 +166,18 @@ void AtomsToRegexps(std::vector re2_vec_, std::vector atoms, std::vec * 如果没有原子, 那么直接会把re加进去。 * 如果这个正则表达式有原子,那么要把该正则表达式的所有的原子的索引全加入,这个正则表达式才能加入成功。 */ - // std::map map; - std::vector atoms_total; - std::vector vec_per_num; + std::vector atoms_total = map_atoms["total"]; std::vector atoms_tmp; - std::vector re_v; - - for(size_t i = 0; i < re2_vec_.size(); i++) - { - std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), min_atom_len); - - if(my_atoms.size() != 0) - { - for(auto x : my_atoms) - atoms_total.push_back(x); - } - - } for(size_t i = 0; i < atoms.size(); i++) { atoms_tmp.push_back(atoms_total[atoms[i]]); } for(size_t i = 0; i < re2_vec_.size(); i++) { - std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), min_atom_len); + // std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), min_atom_len); + std::string str = re2_vec_[i]->pattern(); + std::vector my_atoms = map_atoms[str]; if(my_atoms.size() == 0){ regexps->push_back(i); continue; @@ -567,8 +198,6 @@ void AtomsToRegexps(std::vector re2_vec_, std::vector atoms, std::vec if(count == (int)my_atoms.size()) regexps->push_back(int(i)); } } - - } int FilteredRE2::FirstMatch(const StringPiece& text, @@ -609,6 +238,4 @@ void FilteredRE2::AllPotentials( AtomsToRegexps(re2_vec_, atoms, potential_regexps, prefilter_tree_->getMinAtomLen()); } - - } // namespace re2 diff --git a/re2/testing/filtered_re2_test.cc b/re2/testing/filtered_re2_test.cc index 655b508a08d367b115d1062667d81590615abc8a..41c081e67398b674df4fc13f2dbed06f54dae38e 100644 --- a/re2/testing/filtered_re2_test.cc +++ b/re2/testing/filtered_re2_test.cc @@ -63,6 +63,7 @@ TEST(FilteredRE2Test, SmallLatinTest) { v.f.Compile(&v.atoms); EXPECT_EQ(1, v.atoms.size()); EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef"); + v.atom_indices.push_back(0); v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches); EXPECT_EQ(1, v.matches.size()); @@ -136,7 +137,7 @@ AtomTest atom_tests[] = { "ΛΜΝΟΠ", "ψρστυ", }, { - "δδπππσσσ", + "δδπϖπσςσ", "λμνοπ", "ψρστυ", }, diff --git a/regex-capi/include/rure.h b/regex-capi/include/rure.h index a52e5da5862a101302fdc73b372b38158495154a..f51df71d3b6d505d7fa255a0faf0aba321d65394 100644 --- a/regex-capi/include/rure.h +++ b/regex-capi/include/rure.h @@ -116,6 +116,18 @@ typedef struct rure_iter_capture_names rure_iter_capture_names; */ typedef struct rure_error rure_error; +typedef struct +{ + char *atom; +} Atoms; + + +typedef struct +{ + Atoms *data; + int32_t len; +} MyVec; + /* * rure_compile_must compiles the given pattern into a regular expression. If * compilation fails for any reason, an error message is printed to stderr and @@ -629,6 +641,8 @@ const char *rure_rewrite(const uint8_t *rewrite, size_t len, const uint8_t **vec */ size_t rure_replace_count(rure *re, const char *haystack); +MyVec rure_filter_compile(const uint8_t *regex_str, size_t regex_len, size_t min_atoms_len); + #ifdef __cplusplus } #endif \ No newline at end of file diff --git a/regex-capi/src/rure.rs b/regex-capi/src/rure.rs index ce791469545081c7d25b6754429760163e2df5ee..50b54eb6487ed1efc0e7365d035ef3d40dd10186 100644 --- a/regex-capi/src/rure.rs +++ b/regex-capi/src/rure.rs @@ -58,6 +58,17 @@ pub struct IterCaptureNames { name_ptrs: Vec<*mut c_char>, } +#[repr(C)] +pub struct Atoms { + atom: *mut c_char, +} + +#[repr(C)] +pub struct MyVec { + data: *mut Atoms, + len: i32, +} + impl Deref for RegexBytes { type Target = bytes::Regex; fn deref(&self) -> &bytes::Regex { @@ -816,7 +827,6 @@ ffi_fn! { return true; } } - ffi_fn! { fn rure_rewrite_str_convert(rewrite: *const u8, length: size_t) -> *const c_char { let rewrite = unsafe { slice::from_raw_parts(rewrite, length) }; @@ -948,3 +958,247 @@ ffi_fn! { } } +/** + * 负责对字符集进行连接操作 + * + */ +fn connection(str: &str, vec1: Vec, vec2: Vec) -> Vec { + let mut vec_tmp = Vec::new(); + if str.len() > 0 { + for chars in vec2 { + let s = format!("{}{}", str, chars); + vec_tmp.push(s); + } + } else if vec1.len() == 0 { + for elem in vec2 { + vec_tmp.push(elem.to_string()) + } + } else { + for chars_i in vec1 { + for chars_j in vec2.clone() { + let s = format!("{}{}", chars_i, chars_j); + vec_tmp.push(s); + } + } + } + vec_tmp +} + + +/** + * (abc123|abc|ghi789|abc1234) + 3-abc + 6-abc123 + 6-ghi789 + 7-abc1234 + * abc abc123 ghi789 abc1234 + */ +fn group_multiple_selection(str: &str, min_atoms_len: i32) -> Vec { + let mut str_tmp = String::new(); // 暂存atoms + let mut atoms_tmp = Vec::new(); // 最终的atoms + for elem in str.chars() { + if elem == '(' { + continue; + } + if elem == '|' || elem == ')' { + if str_tmp.len() as i32 >= min_atoms_len { + atoms_tmp.push(str_tmp.clone()); + } + str_tmp.clear(); + continue; + }else { + str_tmp.push(elem); + } + } + atoms_tmp.sort_by(|a, b| a.len().cmp(&b.len())); + + for i in 0..atoms_tmp.len() { + let mut j = i + 1; + while j < atoms_tmp.len() { + if atoms_tmp[j].contains(atoms_tmp[i].as_str()) && !atoms_tmp[i].is_empty() { + atoms_tmp.remove(j); + } else { + j += 1; + } + } + } + atoms_tmp +} + +/** + * 处理 + * a[a-c]a[zv] + * [abc] + * [a-c]+ + */ + +fn char_class_expansion(str: &str) -> Vec{ + let mut flag_connect = 0; + let mut atoms_chars = Vec::new(); + let mut vec_char = Vec::new(); + for elem in str.chars() { + if elem == '[' || elem == ']' { + continue; + } else if elem.is_ascii_alphabetic() || elem.is_ascii_digit() { + vec_char.push(elem); + } else if elem == '-' { + flag_connect = 1; + } + } + if flag_connect == 1 { + let x = vec_char[0]; + let y = vec_char[1]; + for elem in x..=y { + atoms_chars.push(elem); + } + + } else { + atoms_chars = vec_char; + } + + atoms_chars +} + +fn my_compile(str: &str, min_atoms_len: i32) -> MyVec { + let mut my_atoms = Vec::new(); // 所有的的原子 + let mut atoms_tmp_string = String::new(); // 暂时存储的字符串 + let mut vec_chars_con: Vec = Vec::new(); + // 将所有的大写字符转换为小写 + let str = str.to_lowercase(); + let chars = str.chars().collect::>(); + let mut i = 0; + while i < chars.len() { + // 处理分组括号 + if chars[i] == '(' { + let group_start_post = i; + while chars[i] != ')' { + i += 1; + } + let group_end_post = i; + let str_group = &str[group_start_post..group_end_post + 1]; + let vec = group_multiple_selection(str_group, min_atoms_len); + + let mut tmp_post_group = i; + tmp_post_group += 1; + if tmp_post_group >= chars.len(){ + if vec.len() != 0 { // 右括号为自后一个字符的情况 + for elem in vec { + my_atoms.push(Atoms{atom: CString::new(elem).unwrap().into_raw()}); + } + } + i += 1; + continue; + } + if chars[tmp_post_group] == '.' && vec.len() != 0 { + i += 1; + for elem in vec { + my_atoms.push(Atoms{atom: CString::new(elem).unwrap().into_raw()}); + } + } else if chars[tmp_post_group] == '{' && vec.len() != 0 { + for elem in vec { + my_atoms.push(Atoms{atom: CString::new(elem).unwrap().into_raw()}); + } + } + i += 1; + continue; + } + if chars[i] == '.' { + if atoms_tmp_string.len() as i32 >= min_atoms_len && vec_chars_con.len() == 0 { + my_atoms.push(Atoms{atom: CString::new(atoms_tmp_string.clone()).unwrap().into_raw()}); + } + if vec_chars_con.len() > 0 && atoms_tmp_string.len() > 0 { + for elems in vec_chars_con.clone() { + my_atoms.push(Atoms{atom: CString::new(format!("{}{}", elems.clone(), atoms_tmp_string)).unwrap().into_raw()}); + } + vec_chars_con.clear(); + } + atoms_tmp_string.clear(); + i += 1; + continue; + } + + if chars[i] == '*' || chars[i] == '+' { + i += 1; + continue; + } + + // 处理多个字符集 + if chars[i] == '[' { + let start_post = i; + while chars[i] != ']' { + i += 1; + } + let mut plus_tmp = i; + plus_tmp += 1; + if plus_tmp < chars.len() && chars[plus_tmp] == '+' { + if atoms_tmp_string.len() as i32 >= min_atoms_len { + my_atoms.push(Atoms{atom: CString::new(atoms_tmp_string.clone()).unwrap().into_raw()}); + atoms_tmp_string.clear(); + i += 2; + continue; + } + } + let str_char_set = &str[start_post..plus_tmp]; + if atoms_tmp_string.len() > 0 && vec_chars_con.len() > 0 { + for elem in vec_chars_con.clone() { + vec_chars_con.push(format!("{}{}", elem, atoms_tmp_string)); + } + atoms_tmp_string.clear(); + } + let atoms_tmp = char_class_expansion(str_char_set); + vec_chars_con = connection(atoms_tmp_string.as_str(), vec_chars_con, atoms_tmp); + atoms_tmp_string.clear(); + + if i == chars.len() - 1 && vec_chars_con.len() > 0 { + for elem in vec_chars_con.clone() { + if elem.len() as i32 >= min_atoms_len { + my_atoms.push(Atoms{atom: CString::new(elem).unwrap().into_raw()}); + } + } + } + i += 1; + continue; + } + + if chars[i] == '{' { + while chars[i] != '}' { + i += 1; + } + i += 1; + continue; + } + + if chars[i] == '\\' { + if atoms_tmp_string.len() as i32 >= min_atoms_len { + my_atoms.push(Atoms{atom: CString::new(atoms_tmp_string.clone()).unwrap().into_raw()}); + atoms_tmp_string.clear(); + } + i += 2; + continue; + } + + if chars[i] != '+' { + atoms_tmp_string.push(chars[i]); + } + + if i == chars.len() - 1 && atoms_tmp_string.len() as i32 >= min_atoms_len { + my_atoms.push(Atoms{atom: CString::new(atoms_tmp_string.clone()).unwrap().into_raw()}); + atoms_tmp_string.clear(); + } + i += 1; + } + let mut a = my_atoms.into_boxed_slice(); + let data = a.as_mut_ptr(); + let len = a.len() as i32; + std::mem::forget(a); + MyVec { data, len } +} + +ffi_fn! { + fn rure_filter_compile(regex_str: *const u8, regex_len: size_t, min_atoms_len: size_t) -> MyVec{ + let r = unsafe { slice::from_raw_parts(regex_str, regex_len) }; + let regex_str = str::from_utf8(r).unwrap(); + let atoms = my_compile(regex_str, min_atoms_len as i32); + atoms + } +}