diff --git a/re2/filtered_re2.cc b/re2/filtered_re2.cc index a108e6863e63ac44ae3033c533ce965574f58368..e9dfc546399f3ae12b49eafd3d635c133671aba9 100644 --- a/re2/filtered_re2.cc +++ b/re2/filtered_re2.cc @@ -28,225 +28,257 @@ extern "C" #include "regex-capi/include/regex_capi.h" } using namespace std; -namespace re2 { +namespace re2 +{ -std::map> map_atoms; + std::map> map_atoms; -// #include "re2/prefilter_tree.h" - class PrefilterTree { - public: - PrefilterTree():min_atom_len_(3){}; - explicit PrefilterTree(int min_atom_len):min_atom_len_(min_atom_len){}; + // #include "re2/prefilter_tree.h" + class PrefilterTree + { + public: + PrefilterTree() : min_atom_len_(3){}; + explicit PrefilterTree(int min_atom_len) : min_atom_len_(min_atom_len){}; ~PrefilterTree(){}; - int getMinAtomLen(){ + int getMinAtomLen() + { return min_atom_len_; } - bool get_is_latin_result() {return is_latin;}; + bool get_is_latin_result() { return is_latin; }; void set_latin(bool x); - std::string get_latin_string() {return str_latin;}; + std::string get_latin_string() { return str_latin; }; void set_latin_str(std::string x); - - private: + + private: const int min_atom_len_; bool is_latin; std::string str_latin; }; - void PrefilterTree::set_latin(bool x) { + void PrefilterTree::set_latin(bool x) + { is_latin = x; } - void PrefilterTree::set_latin_str(std::string x) { + void PrefilterTree::set_latin_str(std::string x) + { str_latin = x; } }; -namespace re2 { - -FilteredRE2::FilteredRE2() - : compiled_(false), - prefilter_tree_(new PrefilterTree()) { -} +namespace re2 +{ -FilteredRE2::FilteredRE2(int min_atom_len) - : compiled_(false), - prefilter_tree_(new PrefilterTree(min_atom_len)) { -} + FilteredRE2::FilteredRE2() + : compiled_(false), + prefilter_tree_(new PrefilterTree()) + { + } -FilteredRE2::~FilteredRE2() { - for (size_t i = 0; i < re2_vec_.size(); i++) - delete re2_vec_[i]; -} + FilteredRE2::FilteredRE2(int min_atom_len) + : compiled_(false), + prefilter_tree_(new PrefilterTree(min_atom_len)) + { + } -FilteredRE2::FilteredRE2(FilteredRE2&& other) - : re2_vec_(std::move(other.re2_vec_)), - compiled_(other.compiled_), - prefilter_tree_(std::move(other.prefilter_tree_)) { - other.re2_vec_.clear(); - other.re2_vec_.shrink_to_fit(); - other.compiled_ = false; - other.prefilter_tree_.reset(new PrefilterTree()); -} + FilteredRE2::~FilteredRE2() + { + for (size_t i = 0; i < re2_vec_.size(); i++) + delete re2_vec_[i]; + } -FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) { - this->~FilteredRE2(); - (void) new (this) FilteredRE2(std::move(other)); - return *this; -} + FilteredRE2::FilteredRE2(FilteredRE2 &&other) + : re2_vec_(std::move(other.re2_vec_)), + compiled_(other.compiled_), + prefilter_tree_(std::move(other.prefilter_tree_)) + { + other.re2_vec_.clear(); + other.re2_vec_.shrink_to_fit(); + other.compiled_ = false; + other.prefilter_tree_.reset(new PrefilterTree()); + } -RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, - const RE2::Options& options, int* id) { - RE2* re = new RE2(pattern, options); - RE2::ErrorCode code = re->error_code(); - if(options.encoding() == RE2::Options::EncodingLatin1) { - prefilter_tree_->set_latin(true); - prefilter_tree_->set_latin_str(pattern.as_string()); + FilteredRE2 &FilteredRE2::operator=(FilteredRE2 &&other) + { + this->~FilteredRE2(); + (void)new (this) FilteredRE2(std::move(other)); + return *this; } - else prefilter_tree_->set_latin(false); - - if (!re->ok()) { - if (options.log_errors()) { - LOG(ERROR) << "Couldn't compile regular expression, skipping: " - << pattern << " due to error " << re->error(); + + RE2::ErrorCode FilteredRE2::Add(const StringPiece &pattern, + const RE2::Options &options, int *id) + { + RE2 *re = new RE2(pattern, options); + RE2::ErrorCode code = re->error_code(); + if (options.encoding() == RE2::Options::EncodingLatin1) + { + prefilter_tree_->set_latin(true); + prefilter_tree_->set_latin_str(pattern.as_string()); } - delete re; - } else { - *id = static_cast(re2_vec_.size()); - re2_vec_.push_back(re); - } + else + prefilter_tree_->set_latin(false); - return code; -} + if (!re->ok()) + { + if (options.log_errors()) + { + LOG(ERROR) << "Couldn't compile regular expression, skipping: " + << pattern << " due to error " << re->error(); + } + delete re; + } + else + { + *id = static_cast(re2_vec_.size()); + re2_vec_.push_back(re); + } -void FilteredRE2::Compile(std::vector* atoms) { - map_atoms.clear(); - if (compiled_) { - LOG(ERROR) << "Compile called already."; - return; + return code; } - if (re2_vec_.empty()) { - LOG(ERROR) << "Compile called before Add."; - return; - } - atoms->clear(); - - // 处理latin的情况 - if(prefilter_tree_->get_is_latin_result()) { - std::string str = prefilter_tree_->get_latin_string(); - std::vector vec; - vec.push_back(str); - std::string str_low = str; - transform(str_low.begin(),str_low.end(),str_low.begin(),::tolower); - atoms->push_back(str_low); - map_atoms.insert(map>::value_type(str, vec)); - map_atoms.insert(map>::value_type("total", vec)); - compiled_ = true; - return; - } - - for(size_t i = 0; i < re2_vec_.size(); i++) { - // std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), prefilter_tree_->getMinAtomLen()); - const char *regex = re2_vec_[i]->pattern().c_str(); - std::string regex_str = regex; - MyVec vec = rure_filter_compile((const uint8_t *)regex, strlen(regex), prefilter_tree_->getMinAtomLen()); - int32_t len = vec.len; - std::vector v; - for(int32_t i = 0; i < len; i++) { - atoms->push_back(vec.data[i].atom); - v.push_back(vec.data[i].atom); + void FilteredRE2::Compile(std::vector *atoms) + { + map_atoms.clear(); + if (compiled_) + { + LOG(ERROR) << "Compile called already."; + return; } - map_atoms.insert(map>::value_type(regex_str, v)); - } - map_atoms.insert(map>::value_type("total", *atoms)); - compiled_ = true; -} -int FilteredRE2::SlowFirstMatch(const StringPiece& text) const { - for (size_t i = 0; i < re2_vec_.size(); i++) - { - if (RE2::PartialMatch(text, re2_vec_[i]->pattern())){ - return static_cast(i); - } + if (re2_vec_.empty()) + { + LOG(ERROR) << "Compile called before Add."; + return; + } + atoms->clear(); + + // 处理latin的情况 + if (prefilter_tree_->get_is_latin_result()) + { + std::string str = prefilter_tree_->get_latin_string(); + std::vector vec; + vec.push_back(str); + std::string str_low = str; + transform(str_low.begin(), str_low.end(), str_low.begin(), ::tolower); + atoms->push_back(str_low); + map_atoms.insert(map>::value_type(str, vec)); + map_atoms.insert(map>::value_type("total", vec)); + compiled_ = true; + return; + } + + for (size_t i = 0; i < re2_vec_.size(); i++) + { + // std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), prefilter_tree_->getMinAtomLen()); + const char *regex = re2_vec_[i]->pattern().c_str(); + std::string regex_str = regex; + MyVec vec = rure_filter_compile((const uint8_t *)regex, strlen(regex), prefilter_tree_->getMinAtomLen()); + int32_t len = vec.len; + std::vector v; + for (int32_t i = 0; i < len; i++) + { + atoms->push_back(vec.data[i].atom); + v.push_back(vec.data[i].atom); + } + map_atoms.insert(map>::value_type(regex_str, v)); + } + map_atoms.insert(map>::value_type("total", *atoms)); + compiled_ = true; } - return -1; -} -void AtomsToRegexps(std::vector re2_vec_, std::vector atoms, std::vector *regexps, int min_atom_len) -{ - // 根据atoms索引获取regexp索引的规则 - /* - * 如果没有原子, 那么直接会把re加进去。 - * 如果这个正则表达式有原子,那么要把该正则表达式的所有的原子的索引全加入,这个正则表达式才能加入成功。 - */ - - std::vector atoms_total = map_atoms["total"]; - std::vector atoms_tmp; - for(size_t i = 0; i < atoms.size(); i++) + int FilteredRE2::SlowFirstMatch(const StringPiece &text) const { - atoms_tmp.push_back(atoms_total[atoms[i]]); + for (size_t i = 0; i < re2_vec_.size(); i++) + { + if (RE2::PartialMatch(text, re2_vec_[i]->pattern())) + { + return static_cast(i); + } + } + return -1; } - for(size_t i = 0; i < re2_vec_.size(); i++) + + void AtomsToRegexps(std::vector re2_vec_, std::vector atoms, std::vector *regexps, int min_atom_len) { - // std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), min_atom_len); - std::string str = re2_vec_[i]->pattern(); - std::vector my_atoms = map_atoms[str]; - if(my_atoms.size() == 0){ - regexps->push_back(i); - continue; + // 根据atoms索引获取regexp索引的规则 + /* + * 如果没有原子, 那么直接会把re加进去。 + * 如果这个正则表达式有原子,那么要把该正则表达式的所有的原子的索引全加入,这个正则表达式才能加入成功。 + */ + + std::vector atoms_total = map_atoms["total"]; + std::vector atoms_tmp; + for (size_t i = 0; i < atoms.size(); i++) + { + atoms_tmp.push_back(atoms_total[atoms[i]]); } - else + for (size_t i = 0; i < re2_vec_.size(); i++) { - int count = 0; - for(size_t ii = 0; ii < my_atoms.size(); ii++) + // std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), min_atom_len); + std::string str = re2_vec_[i]->pattern(); + std::vector my_atoms = map_atoms[str]; + if (my_atoms.size() == 0) { - for(size_t jj = 0; jj < atoms_tmp.size(); jj++) + regexps->push_back(i); + continue; + } + else + { + int count = 0; + for (size_t ii = 0; ii < my_atoms.size(); ii++) { - if(my_atoms[ii] == atoms_tmp[jj]){ - count++; - break; + for (size_t jj = 0; jj < atoms_tmp.size(); jj++) + { + if (my_atoms[ii] == atoms_tmp[jj]) + { + count++; + break; + } } } + if (count == (int)my_atoms.size()) + regexps->push_back(int(i)); } - if(count == (int)my_atoms.size()) regexps->push_back(int(i)); } } -} -int FilteredRE2::FirstMatch(const StringPiece& text, - const std::vector& atoms) const { - if (!compiled_) { - LOG(DFATAL) << "FirstMatch called before Compile."; + int FilteredRE2::FirstMatch(const StringPiece &text, + const std::vector &atoms) const + { + if (!compiled_) + { + LOG(DFATAL) << "FirstMatch called before Compile."; + return -1; + } + std::vector regexps; + + AtomsToRegexps(re2_vec_, atoms, ®exps, prefilter_tree_->getMinAtomLen()); + + for (size_t i = 0; i < regexps.size(); i++) + if (RE2::PartialMatch(text, *re2_vec_[regexps[i]])) + return static_cast(i); return -1; } - std::vector regexps; - - AtomsToRegexps(re2_vec_, atoms, ®exps, prefilter_tree_->getMinAtomLen()); - - for (size_t i = 0; i < regexps.size(); i++) - if (RE2::PartialMatch(text, *re2_vec_[regexps[i]])) - return static_cast(i); - return -1; -} -bool FilteredRE2::AllMatches( - const StringPiece& text, - const std::vector& atoms, - std::vector* matching_regexps) const { - matching_regexps->clear(); + bool FilteredRE2::AllMatches( + const StringPiece &text, + const std::vector &atoms, + std::vector *matching_regexps) const + { + matching_regexps->clear(); - std::vector regexps; - AtomsToRegexps(re2_vec_, atoms, ®exps, prefilter_tree_->getMinAtomLen()); + std::vector regexps; + AtomsToRegexps(re2_vec_, atoms, ®exps, prefilter_tree_->getMinAtomLen()); - for (size_t i = 0; i < re2_vec_.size(); i++) - if (RE2::PartialMatch(text, *re2_vec_[i])) - matching_regexps->push_back(i); - return !matching_regexps->empty(); - -} + for (size_t i = 0; i < re2_vec_.size(); i++) + if (RE2::PartialMatch(text, *re2_vec_[i])) + matching_regexps->push_back(i); + return !matching_regexps->empty(); + } -void FilteredRE2::AllPotentials( - const std::vector& atoms, - std::vector* potential_regexps) const { - AtomsToRegexps(re2_vec_, atoms, potential_regexps, prefilter_tree_->getMinAtomLen()); -} + void FilteredRE2::AllPotentials( + const std::vector &atoms, + std::vector *potential_regexps) const + { + AtomsToRegexps(re2_vec_, atoms, potential_regexps, prefilter_tree_->getMinAtomLen()); + } -} // namespace re2 +} // namespace re2 diff --git a/re2/re2.cc b/re2/re2.cc index fa4f185a0e8d2259c9a972b394ca6c94fa372498..4bf66bcc7d425cd24bdacaee55fef5f0a80b3b4b 100644 --- a/re2/re2.cc +++ b/re2/re2.cc @@ -155,10 +155,12 @@ namespace re2 } uint32_t flags = RURE_DEFAULT_FLAGS; - if(options_.dot_nl()) flags = RURE_FLAG_DOTNL; + if (options_.dot_nl()) + flags = RURE_FLAG_DOTNL; // if(options_.never_nl()) flags = RURE_DEFAULT_FLAGS; - if(options_.encoding() == RE2::Options::EncodingLatin1){ - flags |= RURE_FLAG_UNICODE; + if (options_.encoding() == RE2::Options::EncodingLatin1) + { + flags |= RURE_FLAG_UNICODE; } // for All @@ -185,7 +187,7 @@ namespace re2 LOG(ERROR) << "Error Compile '" << pattern.data() << "':" << msg << "'"; } error_ = new std::string(msg); - error_code_ = ErrorInternal; // 暂时对这个错误进行赋值,如何处理错误类型??? + error_code_ = ErrorInternal; // 暂时对这个错误进行赋值,如何处理错误类型??? } return; } @@ -193,7 +195,7 @@ namespace re2 // for Consume and FindAndConsume suffix_regexp_ = (re2::Regexp *)rure_new((const uint8_t *)pattern.data(), pattern.size()); // for FullMatch - if(rure_str != "") + if (rure_str != "") { std::string FullMatch_rure_str = rure_str; FullMatch_rure_str.insert(0, "^("); @@ -208,11 +210,11 @@ namespace re2 //获取捕获组的数量, 并对num_captures_其进行赋值 rure_captures *caps = rure_captures_new(re); size_t captures_len = rure_captures_len(caps) - 1; - if(!options_.never_capture()) + if (!options_.never_capture()) { num_captures_ = (int)captures_len; } - else + else { num_captures_ = 0; } @@ -220,7 +222,7 @@ namespace re2 rure_captures_free(caps); rure_error_free(err); error_ = empty_string; - error_code_ = RE2::NoError; + error_code_ = RE2::NoError; } RE2::~RE2() @@ -318,7 +320,6 @@ namespace re2 } } - bool RE2::Replace(std::string *str, const RE2 &re, const StringPiece &rewrite) @@ -334,7 +335,7 @@ namespace re2 // 利用rure进行replace const char *rure_str = re.pattern_.c_str(); // 对rewrite进行处理 - const char *rure_rewrite = rure_rewrite_str_convert((const uint8_t*)rewrite.data(), rewrite.size()); + const char *rure_rewrite = rure_rewrite_str_convert((const uint8_t *)rewrite.data(), rewrite.size()); rure *re_rure = rure_compile((const uint8_t *)rure_str, strlen(rure_str), RURE_DEFAULT_FLAGS, NULL, NULL); const char *str_rure = rure_replace(re_rure, (const uint8_t *)str->c_str(), strlen(str->c_str()), @@ -364,7 +365,7 @@ namespace re2 if (count != 0) { // 对rewrite进行处理 - const char *rure_rewrite = rure_rewrite_str_convert((const uint8_t*)rewrite.data(), rewrite.size()); + const char *rure_rewrite = rure_rewrite_str_convert((const uint8_t *)rewrite.data(), rewrite.size()); const char *str_rure = rure_replace_all(rure_re, (const uint8_t *)str->c_str(), strlen(str->c_str()), (const uint8_t *)rure_rewrite, strlen(rure_rewrite)); *str = str_rure; @@ -442,7 +443,7 @@ namespace re2 StringPiece *submatch, int nsubmatch) const { - if(text.size() == 0 && pattern() == "") + if (text.size() == 0 && pattern() == "") { return true; } @@ -463,9 +464,9 @@ namespace re2 return false; } // 对null和empty进行处理 - if(text.data() == NULL) + if (text.data() == NULL) { - for(int i = 0; i < nsubmatch; i++) + for (int i = 0; i < nsubmatch; i++) { submatch[i] = NULL; } @@ -491,19 +492,21 @@ namespace re2 // rure *re1 = (rure *)rprog_; rure_match match = {0}; size_t length = strlen(haystack.c_str()); - if(options_.never_nl()) + if (options_.never_nl()) { std::string strs = haystack + '\n'; size_t pos = strs.find('\n'); bool flag = false; - while(pos != strs.npos) + while (pos != strs.npos) { std::string temp = strs.substr(0, pos); bool matched = rure_is_match(re, (const uint8_t *)temp.c_str(), strlen(temp.c_str()), 0); - if(matched && !nsubmatch){ + if (matched && !nsubmatch) + { return true; } - if(matched && nsubmatch){ + if (matched && nsubmatch) + { haystack = temp; length = strlen(haystack.c_str()); flag = true; @@ -512,41 +515,48 @@ namespace re2 strs = strs.substr(pos + 1, length + 1); pos = strs.find('\n'); } - if(!flag){return false;} + if (!flag) + { + return false; + } } // bool matched = rure_find(re, (const uint8_t *)haystack, strlen(haystack), 0, &match); // 这里没有 if(re_anchor == ANCHOR_START)原因是因为: // 只有Consume()使用了ANCHOR_START,而传入Consume()的参数通常是三个或者三个以上, // 调用Consume()时,nsubmatch不为0,因此会去执行rure_captures_new()、rure_find_captures()、rure_captures_at() - if(re_anchor == UNANCHORED) + if (re_anchor == UNANCHORED) { // bool matched = rure_find(re, (const uint8_t *)haystack.c_str(), length, 0, &match); bool matched = rure_is_match(re, (const uint8_t *)haystack.c_str(), length, 0); - if(!matched){ + if (!matched) + { return false; } - else if(!nsubmatch){ + else if (!nsubmatch) + { return true; } } - else if(re_anchor == ANCHOR_BOTH) + else if (re_anchor == ANCHOR_BOTH) { bool matched = rure_find(re, (const uint8_t *)haystack.c_str(), length, 0, &match); - if(!matched || match.start != 0 || match.end != length){ + if (!matched || match.start != 0 || match.end != length) + { return false; } - else if(!nsubmatch){ + else if (!nsubmatch) + { return true; } } - + // Demo 获取捕获组内容,存储到submatch数组中 rure_captures *caps = rure_captures_new(re); rure_find_captures(re, (const uint8_t *)haystack.c_str(), length, 0, caps); // size_t captures_len = num_captures_ + 1; - + rure_captures_at(caps, 0, &match); if (re_anchor == ANCHOR_START && match.start != 0) return false; @@ -559,13 +569,14 @@ namespace re2 size_t start = match.start; size_t end = match.end; size_t len = end - start; - if(options_.encoding() == RE2::Options::EncodingUTF8){ + if (options_.encoding() == RE2::Options::EncodingUTF8) + { submatch[i] = StringPiece(text.data() + start, static_cast(len)); } - else{ + else + { submatch[i] = StringPiece(text.data() + start, static_cast(len / 2)); } - } else { @@ -608,17 +619,17 @@ namespace re2 // RE has fewer capturing groups than number of Arg pointers passed in. return false; } - + // for Consume and FindAndConsume rure_match match; - if(consumed && n == 0 && + if (consumed && n == 0 && rure_consume((rure *)suffix_regexp_, (const uint8_t *)text.data(), (size_t)text.size(), &match)) { *consumed = match.end; return true; } // for FullMatch(no captures) - if(re_anchor == ANCHOR_BOTH && n == 0 && options_.encoding() == RE2::Options::EncodingUTF8) + if (re_anchor == ANCHOR_BOTH && n == 0 && options_.encoding() == RE2::Options::EncodingUTF8) { bool matched = rure_is_match((rure *)entire_regexp_, (const uint8_t *)text.data(), (size_t)text.size(), 0); return matched; @@ -700,12 +711,12 @@ namespace re2 { int num_caps = NumberOfCapturingGroups(); bool result = rure_check_rewrite_string(rewrite.data(), num_caps); - if(!result){ + if (!result) + { *error = "Rewrite schema error"; return false; } - return true; - + return true; } // Returns the maximum submatch needed for the rewrite to be done by Replace(). @@ -726,13 +737,15 @@ namespace re2 size_t len = rewrite.length(); const char *rewrites[veclen]; size_t rewrites_lengths[veclen]; - for(int i = 0; i < veclen; i++) { + for (int i = 0; i < veclen; i++) + { rewrites[i] = vec[i].data(); rewrites_lengths[i] = vec[i].size(); } - const char *result = rure_rewrite((const uint8_t *)rewrite.data(), len, (const uint8_t **)rewrites, - rewrites_lengths, (size_t)veclen); - if(result != NULL) { + const char *result = rure_rewrite((const uint8_t *)rewrite.data(), len, (const uint8_t **)rewrites, + rewrites_lengths, (size_t)veclen); + if (result != NULL) + { out->assign(result); return true; } diff --git a/re2/set.cc b/re2/set.cc index 2af02a9fb8f8ca8e4ed05666af4d813c970cc147..28d3be827a8ecab615b95ae0fe22a63362169677 100644 --- a/re2/set.cc +++ b/re2/set.cc @@ -48,17 +48,17 @@ namespace re2 elem_.clear(); } - RE2::Set::Set(Set && other) + RE2::Set::Set(Set &&other) : options_(other.options_), anchor_(other.anchor_), compiled_(other.compiled_), prog_(std::move(other.prog_)) { - other.elem_.clear(); - other.elem_.shrink_to_fit(); - other.compiled_ = false; - other.size_ = 0; - other.prog_.reset(); + other.elem_.clear(); + other.elem_.shrink_to_fit(); + other.compiled_ = false; + other.size_ = 0; + other.prog_.reset(); } RE2::Set &RE2::Set::operator=(Set &&other) @@ -68,14 +68,16 @@ namespace re2 return *this; } - int RE2::Set::Add(const StringPiece &pattern, std::string *error) { int place_num = size_; std::string rure_pattern = pattern.as_string(); - if(anchor_ == RE2::ANCHOR_START){ // 处理RE2::ANCHOR_START的情况 + if (anchor_ == RE2::ANCHOR_START) + { // 处理RE2::ANCHOR_START的情况 rure_pattern.insert(0, "^"); - } else if(anchor_ == RE2::ANCHOR_BOTH) { // 处理RE2::ANCHOR_BOTH的情况 + } + else if (anchor_ == RE2::ANCHOR_BOTH) + { // 处理RE2::ANCHOR_BOTH的情况 rure_pattern.insert(0, "^"); rure_pattern.append("$"); } @@ -84,7 +86,7 @@ namespace re2 if (re == NULL) { const char *msg = rure_error_message(err); - if(error != NULL) + if (error != NULL) { error->assign(msg); LOG(ERROR) << "Regexp Error '" << pattern.data() << "':" << msg << "'"; @@ -94,7 +96,7 @@ namespace re2 } else { - elem_.push_back(pair(rure_pattern, (re2::Regexp*)nullptr)); + elem_.push_back(pair(rure_pattern, (re2::Regexp *)nullptr)); size_++; // rure_free(re); return place_num; @@ -103,7 +105,8 @@ namespace re2 bool RE2::Set::Compile() { - if (compiled_) { + if (compiled_) + { LOG(ERROR) << "RE2::Set::Compile() called more than once"; return false; } @@ -111,19 +114,21 @@ namespace re2 const size_t PAT_COUNT = elem_.size(); const char *patterns[PAT_COUNT]; size_t patterns_lengths[PAT_COUNT]; - for (size_t i = 0; i < elem_.size(); i++) { + for (size_t i = 0; i < elem_.size(); i++) + { patterns[i] = elem_[i].first.c_str(); patterns_lengths[i] = elem_[i].first.length(); } - + rure_error *err = rure_error_new(); - rure_set *re = rure_compile_set((const uint8_t **) patterns, - patterns_lengths, PAT_COUNT, 0, NULL, err); - if(re == NULL){ + rure_set *re = rure_compile_set((const uint8_t **)patterns, + patterns_lengths, PAT_COUNT, 0, NULL, err); + if (re == NULL) + { compiled_ = false; rure_set_free(re); return false; - } + } prog_.reset((Prog *)re); compiled_ = true; return true; @@ -137,31 +142,34 @@ namespace re2 bool RE2::Set::Match(const StringPiece &text, std::vector *v, ErrorInfo *error_info) const { - if (!compiled_) { + if (!compiled_) + { LOG(ERROR) << "RE2::Set::Match() called before compiling"; if (error_info != NULL) error_info->kind = kNotCompiled; return false; } - + const char *pat_str = text.data(); size_t length = strlen(pat_str); - if(v == NULL) + if (v == NULL) { - bool result = rure_set_is_match((rure_set *)prog_.get(), - (const uint8_t *)pat_str, length, 0); + bool result = rure_set_is_match((rure_set *)prog_.get(), + (const uint8_t *)pat_str, length, 0); return result; } else - { + { v->clear(); bool matches[elem_.size()]; - bool result = rure_set_matches((rure_set *)prog_.get(), - (const uint8_t *)pat_str, length, 0, matches); - if(!result) return false; - for(size_t i = 0; i < elem_.size(); i++) + bool result = rure_set_matches((rure_set *)prog_.get(), + (const uint8_t *)pat_str, length, 0, matches); + if (!result) + return false; + for (size_t i = 0; i < elem_.size(); i++) { - if(matches[i]) v->push_back(i); + if (matches[i]) + v->push_back(i); } return true; } diff --git a/regex-capi/ctest/test.c b/regex-capi/ctest/test.c index 4aca84abf1238331ca58102d707568fa794349fe..295e9ca3d69af14cf536a1b01d7d6a92b59177eb 100644 --- a/regex-capi/ctest/test.c +++ b/regex-capi/ctest/test.c @@ -7,18 +7,21 @@ #include "regex_capi.h" #ifndef DEBUG - #define DEBUG false +#define DEBUG false #endif -bool test_is_match() { +bool test_is_match() +{ bool passed = true; const char *haystack = "snowman: \xE2\x98\x83"; rure *re = rure_compile_must("\\p{So}$"); bool matched = rure_is_match(re, (const uint8_t *)haystack, strlen(haystack), 0); - if (!matched) { - if (DEBUG) { + if (!matched) + { + if (DEBUG) + { fprintf(stderr, "[test_is_match] expected match, but got no match\n"); } @@ -28,8 +31,8 @@ bool test_is_match() { return passed; } - -bool test_find() { +bool test_find() +{ bool passed = true; const char *haystack = "snowman: \xE2\x98\x83"; @@ -37,16 +40,20 @@ bool test_find() { rure_match match = {0}; bool matched = rure_find(re, (const uint8_t *)haystack, strlen(haystack), 0, &match); - if (!matched) { - if (DEBUG) { + if (!matched) + { + if (DEBUG) + { fprintf(stderr, "[test_find] expected match, but got no match\n"); } passed = false; } size_t expect_start = 9; size_t expect_end = 12; - if (match.start != expect_start || match.end != expect_end) { - if (DEBUG) { + if (match.start != expect_start || match.end != expect_end) + { + if (DEBUG) + { fprintf(stderr, "[test_find] expected match at (%zu, %zu), but " "got match at (%zu, %zu)\n", @@ -58,7 +65,8 @@ bool test_find() { return passed; } -bool test_captures() { +bool test_captures() +{ bool passed = true; const char *haystack = "snowman: \xE2\x98\x83"; @@ -67,8 +75,10 @@ bool test_captures() { rure_captures *caps = rure_captures_new(re); bool matched = rure_find_captures(re, (const uint8_t *)haystack, strlen(haystack), 0, caps); - if (!matched) { - if (DEBUG) { + if (!matched) + { + if (DEBUG) + { fprintf(stderr, "[test_captures] expected match, but got no match\n"); } @@ -76,8 +86,10 @@ bool test_captures() { } size_t expect_captures_len = 3; size_t captures_len = rure_captures_len(caps); - if (captures_len != expect_captures_len) { - if (DEBUG) { + if (captures_len != expect_captures_len) + { + if (DEBUG) + { fprintf(stderr, "[test_captures] " "expected capture group length to be %zd, but " @@ -90,8 +102,10 @@ bool test_captures() { size_t expect_start = 9; size_t expect_end = 12; rure_captures_at(caps, 2, &match); - if (match.start != expect_start || match.end != expect_end) { - if (DEBUG) { + if (match.start != expect_start || match.end != expect_end) + { + if (DEBUG) + { fprintf(stderr, "[test_captures] " "expected capture 2 match at (%zu, %zu), " @@ -106,10 +120,13 @@ done: return passed; } -bool test_iter_capture_name(char *expect, char *given) { +bool test_iter_capture_name(char *expect, char *given) +{ bool passed = true; - if (strcmp(expect, given)) { - if (DEBUG) { + if (strcmp(expect, given)) + { + if (DEBUG) + { fprintf(stderr, "[test_iter_capture_name] expected first capture " "name '%s' got '%s'\n", @@ -120,7 +137,8 @@ bool test_iter_capture_name(char *expect, char *given) { return passed; } -bool test_iter_capture_names() { +bool test_iter_capture_names() +{ bool passed = true; char *name; @@ -129,8 +147,10 @@ bool test_iter_capture_names() { rure_iter_capture_names *it = rure_iter_capture_names_new(re); bool result = rure_iter_capture_names_next(it, &name); - if (!result) { - if (DEBUG) { + if (!result) + { + if (DEBUG) + { fprintf(stderr, "[test_iter_capture_names] expected a second name, " "but got none\n"); @@ -141,19 +161,22 @@ bool test_iter_capture_names() { result = rure_iter_capture_names_next(it, &name); passed = test_iter_capture_name("year", name); - if (!passed) { + if (!passed) + { goto done; } result = rure_iter_capture_names_next(it, &name); passed = test_iter_capture_name("month", name); - if (!passed) { + if (!passed) + { goto done; } result = rure_iter_capture_names_next(it, &name); passed = test_iter_capture_name("day", name); - if (!passed) { + if (!passed) + { goto done; } done: @@ -168,7 +191,8 @@ done: * mode, we can match arbitrary possibly invalid UTF-8 bytes, such as \xFF. * (When Unicode mode is enabled, \xFF won't match .) */ -bool test_flags() { +bool test_flags() +{ bool passed = true; const char *pattern = "."; const char *haystack = "\xFF"; @@ -177,8 +201,10 @@ bool test_flags() { 0, NULL, NULL); bool matched = rure_is_match(re, (const uint8_t *)haystack, strlen(haystack), 0); - if (!matched) { - if (DEBUG) { + if (!matched) + { + if (DEBUG) + { fprintf(stderr, "[test_flags] expected match, but got no match\n"); } passed = false; @@ -187,12 +213,15 @@ bool test_flags() { return passed; } -bool test_compile_error() { +bool test_compile_error() +{ bool passed = true; rure_error *err = rure_error_new(); rure *re = rure_compile((const uint8_t *)"(", 1, 0, NULL, err); - if (re != NULL) { - if (DEBUG) { + if (re != NULL) + { + if (DEBUG) + { fprintf(stderr, "[test_compile_error] " "expected NULL regex pointer, but got non-NULL pointer\n"); @@ -201,12 +230,15 @@ bool test_compile_error() { rure_free(re); } const char *msg = rure_error_message(err); - if (NULL == strstr(msg, "unclosed group")) { - if (DEBUG) { + if (NULL == strstr(msg, "unclosed group")) + { + if (DEBUG) + { fprintf(stderr, "[test_compile_error] " "expected an 'unclosed parenthesis' error message, but " - "got this instead: '%s'\n", msg); + "got this instead: '%s'\n", + msg); } passed = false; } @@ -214,59 +246,63 @@ bool test_compile_error() { return passed; } - -bool test_regex_set_matches() { +bool test_regex_set_matches() +{ #define PAT_COUNT 6 bool passed = true; const char *patterns[] = { - "foo", "barfoo", "\\w+", "\\d+", "foobar", "bar" - }; + "foo", "barfoo", "\\w+", "\\d+", "foobar", "bar"}; const size_t patterns_lengths[] = { - 3, 6, 3, 3, 6, 3 - }; + 3, 6, 3, 3, 6, 3}; rure_error *err = rure_error_new(); - rure_set *re = rure_compile_set((const uint8_t **) patterns, + rure_set *re = rure_compile_set((const uint8_t **)patterns, patterns_lengths, PAT_COUNT, 0, NULL, err); - if (re == NULL) { + if (re == NULL) + { passed = false; goto done2; } - if (rure_set_len(re) != PAT_COUNT) { + if (rure_set_len(re) != PAT_COUNT) + { passed = false; goto done1; } - if (!rure_set_is_match(re, (const uint8_t *) "foobar", 6, 0)) { + if (!rure_set_is_match(re, (const uint8_t *)"foobar", 6, 0)) + { passed = false; goto done1; } - if (rure_set_is_match(re, (const uint8_t *) "", 0, 0)) { + if (rure_set_is_match(re, (const uint8_t *)"", 0, 0)) + { passed = false; goto done1; } bool matches[PAT_COUNT]; - if (!rure_set_matches(re, (const uint8_t *) "foobar", 6, 0, matches)) { + if (!rure_set_matches(re, (const uint8_t *)"foobar", 6, 0, matches)) + { passed = false; goto done1; } const bool match_target[] = { - true, false, true, false, true, true - }; + true, false, true, false, true, true}; int i; - for (i = 0; i < PAT_COUNT; ++i) { - if (matches[i] != match_target[i]) { + for (i = 0; i < PAT_COUNT; ++i) + { + if (matches[i] != match_target[i]) + { passed = false; goto done1; } @@ -281,54 +317,58 @@ done2: #undef PAT_COUNT } -bool test_regex_set_match_start() { +bool test_regex_set_match_start() +{ #define PAT_COUNT 3 bool passed = true; const char *patterns[] = { - "foo", "bar", "fooo" - }; + "foo", "bar", "fooo"}; const size_t patterns_lengths[] = { - 3, 3, 4 - }; + 3, 3, 4}; rure_error *err = rure_error_new(); - rure_set *re = rure_compile_set((const uint8_t **) patterns, + rure_set *re = rure_compile_set((const uint8_t **)patterns, patterns_lengths, PAT_COUNT, 0, NULL, err); - if (re == NULL) { + if (re == NULL) + { passed = false; goto done2; } - if (rure_set_len(re) != PAT_COUNT) { + if (rure_set_len(re) != PAT_COUNT) + { passed = false; goto done1; } - if (rure_set_is_match(re, (const uint8_t *)"foobiasdr", 7, 2)) { + if (rure_set_is_match(re, (const uint8_t *)"foobiasdr", 7, 2)) + { passed = false; goto done1; } { bool matches[PAT_COUNT]; - if (!rure_set_matches(re, (const uint8_t *)"fooobar", 8, 0, matches)) { + if (!rure_set_matches(re, (const uint8_t *)"fooobar", 8, 0, matches)) + { passed = false; goto done1; } const bool match_target[] = { - true, true, true - }; + true, true, true}; int i; - for (i = 0; i < PAT_COUNT; ++i) { - if (matches[i] != match_target[i]) { + for (i = 0; i < PAT_COUNT; ++i) + { + if (matches[i] != match_target[i]) + { passed = false; goto done1; } @@ -337,18 +377,20 @@ bool test_regex_set_match_start() { { bool matches[PAT_COUNT]; - if (!rure_set_matches(re, (const uint8_t *)"fooobar", 7, 1, matches)) { + if (!rure_set_matches(re, (const uint8_t *)"fooobar", 7, 1, matches)) + { passed = false; goto done1; } const bool match_target[] = { - false, true, false - }; + false, true, false}; int i; - for (i = 0; i < PAT_COUNT; ++i) { - if (matches[i] != match_target[i]) { + for (i = 0; i < PAT_COUNT; ++i) + { + if (matches[i] != match_target[i]) + { passed = false; goto done1; } @@ -364,146 +406,159 @@ done2: #undef PAT_COUNT } - -bool test_escape() { +bool test_escape() +{ bool passed = true; const char *pattern = "^[a-z]+.*$"; const char *expected_escaped = "\\^\\[a\\-z\\]\\+\\.\\*\\$"; const char *escaped = rure_escape_must(pattern); - if (!escaped) { - if (DEBUG) { + if (!escaped) + { + if (DEBUG) + { fprintf(stderr, "[test_captures] expected escaped, but got no escaped\n"); } passed = false; - } else if (strcmp(escaped, expected_escaped) != 0) { - if (DEBUG) { + } + else if (strcmp(escaped, expected_escaped) != 0) + { + if (DEBUG) + { fprintf(stderr, "[test_captures] expected \"%s\", but got \"%s\"\n", expected_escaped, escaped); } passed = false; } - rure_cstring_free((char *) escaped); + rure_cstring_free((char *)escaped); return passed; } -bool test_replace_and_replace_all(){ +bool test_replace_and_replace_all() +{ bool passed = true; - typedef struct ReplaceTest { + typedef struct ReplaceTest + { const char *regexp; const char *rewrite; const char *original; const char *single; const char *global; - int greplace_count; - }ReplaceTest; + int greplace_count; + } ReplaceTest; static const ReplaceTest tests[] = { - { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)", - "${2}${1}ay", - "the quick brown fox jumps over the lazy dogs.", - "ethay quick brown fox jumps over the lazy dogs.", - "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.", - 9 }, - { "\\w+", - "${0}-NOSPAM", - "abcd.efghi@google.com", - "abcd-NOSPAM.efghi@google.com", - "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM", - 4 }, - { "^", - "(START)", - "foo", - "(START)foo", - "(START)foo", - 1 }, - { "^", - "(START)", - "", - "(START)", - "(START)", - 1 }, - { "$", - "(END)", - "", - "(END)", - "(END)", - 1 }, - { "b", - "bb", - "ababababab", - "abbabababab", - "abbabbabbabbabb", - 5 }, - { "b", - "bb", - "bbbbbb", - "bbbbbbb", - "bbbbbbbbbbbb", - 6 }, - { "b+", - "bb", - "bbbbbb", - "bb", - "bb", - 1 }, - { "b*", - "bb", - "bbbbbb", - "bb", - "bb", - 1 }, - { "b*", - "bb", - "aaaaa", - "bbaaaaa", - "bbabbabbabbabbabb", - 6 }, - - { "a.*a", - "(${0})", - "aba\naba", - "(aba)\naba", - "(aba)\n(aba)", - 2 }, - { "", NULL, NULL, NULL, NULL, 0 } - }; + {"(qu|[b-df-hj-np-tv-z]*)([a-z]+)", + "${2}${1}ay", + "the quick brown fox jumps over the lazy dogs.", + "ethay quick brown fox jumps over the lazy dogs.", + "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.", + 9}, + {"\\w+", + "${0}-NOSPAM", + "abcd.efghi@google.com", + "abcd-NOSPAM.efghi@google.com", + "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM", + 4}, + {"^", + "(START)", + "foo", + "(START)foo", + "(START)foo", + 1}, + {"^", + "(START)", + "", + "(START)", + "(START)", + 1}, + {"$", + "(END)", + "", + "(END)", + "(END)", + 1}, + {"b", + "bb", + "ababababab", + "abbabababab", + "abbabbabbabbabb", + 5}, + {"b", + "bb", + "bbbbbb", + "bbbbbbb", + "bbbbbbbbbbbb", + 6}, + {"b+", + "bb", + "bbbbbb", + "bb", + "bb", + 1}, + {"b*", + "bb", + "bbbbbb", + "bb", + "bb", + 1}, + {"b*", + "bb", + "aaaaa", + "bbaaaaa", + "bbabbabbabbabbabb", + 6}, + + {"a.*a", + "(${0})", + "aba\naba", + "(aba)\naba", + "(aba)\n(aba)", + 2}, + {"", NULL, NULL, NULL, NULL, 0}}; const char *haystack; const char *rewrite; - const char* regex; + const char *regex; - for (const ReplaceTest* t = tests; t->original != NULL; t++) { + for (const ReplaceTest *t = tests; t->original != NULL; t++) + { haystack = t->original; regex = t->regexp; rewrite = t->rewrite; rure *re = rure_compile_must(regex); const char *replaced_haystack = rure_replace(re, (const uint8_t *)haystack, strlen(haystack), - (const uint8_t *)rewrite, strlen(rewrite)); + (const uint8_t *)rewrite, strlen(rewrite)); const char *replaced_all_haystack = rure_replace_all(re, (const uint8_t *)haystack, strlen(haystack), - (const uint8_t *)rewrite, strlen(rewrite)); + (const uint8_t *)rewrite, strlen(rewrite)); int result1 = strcmp(t->single, replaced_haystack); int result2 = strcmp(t->global, replaced_all_haystack); - if(result1 != 0 && result2 !=0) passed = false; + if (result1 != 0 && result2 != 0) + passed = false; } passed = true; return passed; } -void run_test(bool (test)(), const char *name, bool *passed) { - if (!test()) { +void run_test(bool(test)(), const char *name, bool *passed) +{ + if (!test()) + { *passed = false; fprintf(stderr, "FAILED: %s\n", name); - } else { + } + else + { fprintf(stderr, "PASSED: %s\n", name); } } -int main() { +int main() +{ bool passed = true; run_test(test_is_match, "test_is_match", &passed); @@ -518,7 +573,8 @@ int main() { run_test(test_escape, "test_escape", &passed); run_test(test_replace_and_replace_all, "test_replace_and_replace_all", &passed); - if (!passed) { + if (!passed) + { exit(1); } return 0; diff --git a/regex-capi/src/error.rs b/regex-capi/src/error.rs index bf15e22b2c251f025ccd40a494663b6efddeeac6..413b2a8327219129ab43adf2932837deb14a3826 100644 --- a/regex-capi/src/error.rs +++ b/regex-capi/src/error.rs @@ -53,7 +53,9 @@ extern "C" fn rure_error_new() -> *mut Error { #[no_mangle] extern "C" fn rure_error_free(err: *mut Error) { - unsafe { drop(Box::from_raw(err)); } + unsafe { + drop(Box::from_raw(err)); + } } #[no_mangle] diff --git a/regex-capi/src/lib.rs b/regex-capi/src/lib.rs index d7b77990aba5ec641f31958a5295ed2c3ae964f2..5784a95ece102b073e509f78469faf4340e0aa36 100644 --- a/regex-capi/src/lib.rs +++ b/regex-capi/src/lib.rs @@ -12,142 +12,142 @@ * Create: 2022-11-25 * Description: Rure is a C API to Rust's regex library. ******************************************************************************/ - #[macro_use] - mod error; - pub use crate::error::*; - - use std::ffi::{CStr, CString}; - use std::ops::Deref; - use std::ptr; - use std::slice; - use std::str; - - use libc::{c_char, size_t}; - - use regex::{bytes, Regex}; - - use crate::error::{Error, ErrorKind}; - use std::io; - use std::io::Write; - use std::process::abort; - - include!("lib_internal.rs"); - - const RURE_FLAG_CASEI: u32 = 1 << 0; - const RURE_FLAG_MULTI: u32 = 1 << 1; - const RURE_FLAG_DOTNL: u32 = 1 << 2; - const RURE_FLAG_SWAP_GREED: u32 = 1 << 3; - const RURE_FLAG_SPACE: u32 = 1 << 4; - const RURE_FLAG_UNICODE: u32 = 1 << 5; - const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE; - - pub struct RegexBytes { - re: bytes::Regex, - // capture_names: HashMap, - } - - pub struct RegexUnicode { - re: Regex, - } - - pub struct Options { - size_limit: usize, - dfa_size_limit: usize, - } - - // The `RegexSet` is not exposed with option support or matching at an - // arbitrary position with a crate just yet. To circumvent this, we use - // the `Exec` structure directly. - pub struct RegexSet { - re: bytes::RegexSet, - } - - #[repr(C)] - pub struct rure_match { - pub start: size_t, - pub end: size_t, - } - - pub struct Captures(bytes::Locations); - - pub struct IterCaptureNames { - capture_names: bytes::CaptureNames<'static>, - name_ptrs: Vec<*mut c_char>, - } - - #[repr(C)] - pub struct Atoms { - atom: *mut c_char, - } - - #[repr(C)] - pub struct MyVec { - data: *mut Atoms, - len: i32, - } - - impl Deref for RegexBytes { - type Target = bytes::Regex; - fn deref(&self) -> &bytes::Regex { - &self.re - } - } - - impl Deref for RegexUnicode { - type Target = Regex; - fn deref(&self) -> &Regex { - &self.re - } - } - - impl Deref for RegexSet { - type Target = bytes::RegexSet; - fn deref(&self) -> &bytes::RegexSet { - &self.re - } - } - - impl Default for Options { - fn default() -> Options { - Options { - size_limit: 10 * (1 << 20), - dfa_size_limit: 2 * (1 << 20), - } - } - } - - #[no_mangle] - extern "C" fn rure_compile_must(pattern: *const c_char) -> *const RegexBytes { - let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; - let pat = pattern as *const u8; - let mut err = Error::new(ErrorKind::None); - let re = rure_compile(pat, len, RURE_DEFAULT_FLAGS, ptr::null(), &mut err); - if err.is_err() { - let _ = writeln!(&mut io::stderr(), "{}", err); - let _ = writeln!(&mut io::stderr(), "aborting from rure_compile_must"); - abort() - } - re - } - - #[no_mangle] - extern "C" fn rure_compile( - pattern: *const u8, - length: size_t, - flags: u32, - options: *const Options, - error: *mut Error, - ) -> *const RegexBytes { - let pat = unsafe { slice::from_raw_parts(pattern, length) }; - let pat = match str::from_utf8(pat) { - Ok(pat) => pat, - Err(err) => unsafe { - if !error.is_null() { - *error = Error::new(ErrorKind::Str(err)); - } - return ptr::null(); - }, - }; +#[macro_use] +mod error; +pub use crate::error::*; + +use std::ffi::{CStr, CString}; +use std::ops::Deref; +use std::ptr; +use std::slice; +use std::str; + +use libc::{c_char, size_t}; + +use regex::{bytes, Regex}; + +use crate::error::{Error, ErrorKind}; +use std::io; +use std::io::Write; +use std::process::abort; + +include!("lib_internal.rs"); + +const RURE_FLAG_CASEI: u32 = 1 << 0; +const RURE_FLAG_MULTI: u32 = 1 << 1; +const RURE_FLAG_DOTNL: u32 = 1 << 2; +const RURE_FLAG_SWAP_GREED: u32 = 1 << 3; +const RURE_FLAG_SPACE: u32 = 1 << 4; +const RURE_FLAG_UNICODE: u32 = 1 << 5; +const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE; + +pub struct RegexBytes { + re: bytes::Regex, + // capture_names: HashMap, +} + +pub struct RegexUnicode { + re: Regex, +} + +pub struct Options { + size_limit: usize, + dfa_size_limit: usize, +} + +// The `RegexSet` is not exposed with option support or matching at an +// arbitrary position with a crate just yet. To circumvent this, we use +// the `Exec` structure directly. +pub struct RegexSet { + re: bytes::RegexSet, +} + +#[repr(C)] +pub struct rure_match { + pub start: size_t, + pub end: size_t, +} + +pub struct Captures(bytes::Locations); + +pub struct IterCaptureNames { + capture_names: bytes::CaptureNames<'static>, + name_ptrs: Vec<*mut c_char>, +} + +#[repr(C)] +pub struct Atoms { + atom: *mut c_char, +} + +#[repr(C)] +pub struct MyVec { + data: *mut Atoms, + len: i32, +} + +impl Deref for RegexBytes { + type Target = bytes::Regex; + fn deref(&self) -> &bytes::Regex { + &self.re + } +} + +impl Deref for RegexUnicode { + type Target = Regex; + fn deref(&self) -> &Regex { + &self.re + } +} + +impl Deref for RegexSet { + type Target = bytes::RegexSet; + fn deref(&self) -> &bytes::RegexSet { + &self.re + } +} + +impl Default for Options { + fn default() -> Options { + Options { + size_limit: 10 * (1 << 20), + dfa_size_limit: 2 * (1 << 20), + } + } +} + +#[no_mangle] +extern "C" fn rure_compile_must(pattern: *const c_char) -> *const RegexBytes { + let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; + let pat = pattern as *const u8; + let mut err = Error::new(ErrorKind::None); + let re = rure_compile(pat, len, RURE_DEFAULT_FLAGS, ptr::null(), &mut err); + if err.is_err() { + let _ = writeln!(&mut io::stderr(), "{}", err); + let _ = writeln!(&mut io::stderr(), "aborting from rure_compile_must"); + abort() + } + re +} + +#[no_mangle] +extern "C" fn rure_compile( + pattern: *const u8, + length: size_t, + flags: u32, + options: *const Options, + error: *mut Error, +) -> *const RegexBytes { + let pat = unsafe { slice::from_raw_parts(pattern, length) }; + let pat = match str::from_utf8(pat) { + Ok(pat) => pat, + Err(err) => unsafe { + if !error.is_null() { + *error = Error::new(ErrorKind::Str(err)); + } + return ptr::null(); + }, + }; let mut builder = rure_compile_internal(pat, flags); if !options.is_null() { @@ -160,7 +160,7 @@ Ok(re) => { let re = RegexBytes { re }; Box::into_raw(Box::new(re)) - }, + } Err(err) => unsafe { if !error.is_null() { *error = Error::new(ErrorKind::Regex(err)); @@ -168,135 +168,135 @@ ptr::null() }, } - } - - #[no_mangle] - extern "C" fn rure_free(re: *const RegexBytes) { - unsafe { - drop(Box::from_raw(re as *mut Regex)); - } - } - - #[no_mangle] - extern "C" fn rure_is_match( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, - _start: size_t, - ) -> bool { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - re.is_match(haystack) - } - - #[no_mangle] - extern "C" fn rure_find( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, - start: size_t, - match_info: *mut rure_match, - ) -> bool { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - re.find_at(haystack, start) - .map(|m| unsafe { - if !match_info.is_null() { - (*match_info).start = m.start(); - (*match_info).end = m.end(); - } - }) - .is_some() - } - - #[no_mangle] - extern "C" fn rure_find_captures( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, - start: size_t, - captures: *mut Captures, - ) -> bool { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - let slots = unsafe { &mut (*captures).0 }; - re.read_captures_at(slots, haystack, start).is_some() - } - - #[no_mangle] - extern "C" fn rure_iter_capture_names_new(re: *const RegexBytes) -> *mut IterCaptureNames { - let re = unsafe { &*re }; - Box::into_raw(Box::new(IterCaptureNames { - capture_names: re.re.capture_names(), - name_ptrs: Vec::new(), - })) - } - - #[no_mangle] - extern "C" fn rure_iter_capture_names_free(it: *mut IterCaptureNames) { - unsafe { - let it = &mut *it; - while let Some(ptr) = it.name_ptrs.pop() { - drop(CString::from_raw(ptr)); - } - drop(Box::from_raw(it)); - } - } - - #[no_mangle] - extern "C" fn rure_iter_capture_names_next( - it: *mut IterCaptureNames, - capture_name: *mut *mut c_char, - ) -> bool { - if capture_name.is_null() { - return false; - } - let it = unsafe { &mut *it }; - let cn = match it.capture_names.next() { - // Top-level iterator ran out of capture groups - None => return false, - Some(val) => { - match val { - // inner Option didn't have a name - None => "", - Some(name) => name, - } - } - }; - unsafe { - let cs = match CString::new(cn.as_bytes()) { - Result::Ok(val) => val, - Result::Err(_) => return false, - }; - let ptr = cs.into_raw(); - it.name_ptrs.push(ptr); - *capture_name = ptr; - } - true - } - - #[no_mangle] - extern "C" fn rure_captures_new(re: *const RegexBytes) -> *mut Captures { - let re = unsafe { &*re }; - let captures = Captures(re.locations()); - Box::into_raw(Box::new(captures)) - } - - #[no_mangle] - extern "C" fn rure_captures_free(captures: *const Captures) { - unsafe { - drop(Box::from_raw(captures as *mut Captures)); - } - } - - #[no_mangle] - extern "C" fn rure_captures_at( - captures: *const Captures, - i: size_t, - match_info: *mut rure_match, - ) -> bool { - let locs = unsafe { &(*captures).0 }; - match locs.pos(i) { +} + +#[no_mangle] +extern "C" fn rure_free(re: *const RegexBytes) { + unsafe { + drop(Box::from_raw(re as *mut Regex)); + } +} + +#[no_mangle] +extern "C" fn rure_is_match( + re: *const RegexBytes, + haystack: *const u8, + len: size_t, + _start: size_t, +) -> bool { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + re.is_match(haystack) +} + +#[no_mangle] +extern "C" fn rure_find( + re: *const RegexBytes, + haystack: *const u8, + len: size_t, + start: size_t, + match_info: *mut rure_match, +) -> bool { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + re.find_at(haystack, start) + .map(|m| unsafe { + if !match_info.is_null() { + (*match_info).start = m.start(); + (*match_info).end = m.end(); + } + }) + .is_some() +} + +#[no_mangle] +extern "C" fn rure_find_captures( + re: *const RegexBytes, + haystack: *const u8, + len: size_t, + start: size_t, + captures: *mut Captures, +) -> bool { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + let slots = unsafe { &mut (*captures).0 }; + re.read_captures_at(slots, haystack, start).is_some() +} + +#[no_mangle] +extern "C" fn rure_iter_capture_names_new(re: *const RegexBytes) -> *mut IterCaptureNames { + let re = unsafe { &*re }; + Box::into_raw(Box::new(IterCaptureNames { + capture_names: re.re.capture_names(), + name_ptrs: Vec::new(), + })) +} + +#[no_mangle] +extern "C" fn rure_iter_capture_names_free(it: *mut IterCaptureNames) { + unsafe { + let it = &mut *it; + while let Some(ptr) = it.name_ptrs.pop() { + drop(CString::from_raw(ptr)); + } + drop(Box::from_raw(it)); + } +} + +#[no_mangle] +extern "C" fn rure_iter_capture_names_next( + it: *mut IterCaptureNames, + capture_name: *mut *mut c_char, +) -> bool { + if capture_name.is_null() { + return false; + } + let it = unsafe { &mut *it }; + let cn = match it.capture_names.next() { + // Top-level iterator ran out of capture groups + None => return false, + Some(val) => { + match val { + // inner Option didn't have a name + None => "", + Some(name) => name, + } + } + }; + unsafe { + let cs = match CString::new(cn.as_bytes()) { + Result::Ok(val) => val, + Result::Err(_) => return false, + }; + let ptr = cs.into_raw(); + it.name_ptrs.push(ptr); + *capture_name = ptr; + } + true +} + +#[no_mangle] +extern "C" fn rure_captures_new(re: *const RegexBytes) -> *mut Captures { + let re = unsafe { &*re }; + let captures = Captures(re.locations()); + Box::into_raw(Box::new(captures)) +} + +#[no_mangle] +extern "C" fn rure_captures_free(captures: *const Captures) { + unsafe { + drop(Box::from_raw(captures as *mut Captures)); + } +} + +#[no_mangle] +extern "C" fn rure_captures_at( + captures: *const Captures, + i: size_t, + match_info: *mut rure_match, +) -> bool { + let locs = unsafe { &(*captures).0 }; + match locs.pos(i) { Some((start, end)) => { if !match_info.is_null() { unsafe { @@ -308,41 +308,41 @@ } _ => false, } - } - - #[no_mangle] - extern "C" fn rure_captures_len(captures: *const Captures) -> size_t { - unsafe { (*captures).0.len() } - } - - #[no_mangle] - extern "C" fn rure_compile_set( - patterns: *const *const u8, - patterns_lengths: *const size_t, - patterns_count: size_t, - flags: u32, - options: *const Options, - error: *mut Error, - ) -> *const RegexSet { - let (raw_pats, raw_patsl) = unsafe { - ( - slice::from_raw_parts(patterns, patterns_count), - slice::from_raw_parts(patterns_lengths, patterns_count), - ) - }; - let mut pats = Vec::with_capacity(patterns_count); - for (&raw_pat, &raw_patl) in raw_pats.iter().zip(raw_patsl) { - let pat = unsafe { slice::from_raw_parts(raw_pat, raw_patl) }; - pats.push(match str::from_utf8(pat) { - Ok(pat) => pat, - Err(err) => unsafe { - if !error.is_null() { - *error = Error::new(ErrorKind::Str(err)); - } - return ptr::null(); - }, - }); - } +} + +#[no_mangle] +extern "C" fn rure_captures_len(captures: *const Captures) -> size_t { + unsafe { (*captures).0.len() } +} + +#[no_mangle] +extern "C" fn rure_compile_set( + patterns: *const *const u8, + patterns_lengths: *const size_t, + patterns_count: size_t, + flags: u32, + options: *const Options, + error: *mut Error, +) -> *const RegexSet { + let (raw_pats, raw_patsl) = unsafe { + ( + slice::from_raw_parts(patterns, patterns_count), + slice::from_raw_parts(patterns_lengths, patterns_count), + ) + }; + let mut pats = Vec::with_capacity(patterns_count); + for (&raw_pat, &raw_patl) in raw_pats.iter().zip(raw_patsl) { + let pat = unsafe { slice::from_raw_parts(raw_pat, raw_patl) }; + pats.push(match str::from_utf8(pat) { + Ok(pat) => pat, + Err(err) => unsafe { + if !error.is_null() { + *error = Error::new(ErrorKind::Str(err)); + } + return ptr::null(); + }, + }); + } let mut builder = rure_compile_set_internal(pats, flags); if !options.is_null() { @@ -359,230 +359,227 @@ ptr::null() }, } - } - - #[no_mangle] - extern "C" fn rure_set_free(re: *const RegexSet) { - unsafe { - drop(Box::from_raw(re as *mut RegexSet)); - } - } - - #[no_mangle] - extern "C" fn rure_set_is_match( - re: *const RegexSet, - haystack: *const u8, - len: size_t, - start: size_t, - ) -> bool { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - re.is_match_at(haystack, start) - } - - #[no_mangle] - extern "C" fn rure_set_matches( - re: *const RegexSet, - haystack: *const u8, - len: size_t, - start: size_t, - matches: *mut bool, - ) -> bool { - let re = unsafe { &*re }; - let matches = unsafe { slice::from_raw_parts_mut(matches, re.len()) }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - - rure_set_matches_internal(re, matches, haystack, start) - } - - #[no_mangle] - extern "C" fn rure_set_len(re: *const RegexSet) -> size_t { - unsafe { (*re).len() } - } - - #[no_mangle] - extern "C" fn rure_escape_must(pattern: *const c_char) -> *const c_char { - let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; - let pat = pattern as *const u8; - let mut err = Error::new(ErrorKind::None); - let esc = rure_escape(pat, len, &mut err); - if err.is_err() { - println!("{}", "aborting from rure_escape_must"); - let _ = writeln!(&mut io::stderr(), "{}", err); - let _ = writeln!(&mut io::stderr(), "aborting from rure_escape_must"); - abort() - } - esc - } - - /// A helper function that implements fallible escaping in a way that returns - /// an error if escaping failed. - /// - /// This should ideally be exposed, but it needs API design work. In - /// particular, this should not return a C string, but a `const uint8_t *` - /// instead, since it may contain a NUL byte. - fn rure_escape(pattern: *const u8, length: size_t, error: *mut Error) -> *const c_char { - let pat: &[u8] = unsafe { slice::from_raw_parts(pattern, length) }; - let str_pat = match str::from_utf8(pat) { - Ok(val) => val, - Err(err) => unsafe { - if !error.is_null() { - *error = Error::new(ErrorKind::Str(err)); - } - return ptr::null(); - }, - }; - let esc_pat = regex::escape(str_pat); - let c_esc_pat = match CString::new(esc_pat) { - Ok(val) => val, - Err(err) => unsafe { - if !error.is_null() { - *error = Error::new(ErrorKind::Nul(err)); - } - return ptr::null(); - }, - }; - c_esc_pat.into_raw() as *const c_char - } - - #[no_mangle] - extern "C" fn rure_cstring_free(s: *mut c_char) { - unsafe { - drop(CString::from_raw(s)); - } - } - - #[no_mangle] - extern "C" fn rure_replace( - re: *const RegexUnicode, - haystack: *const u8, - len_h: size_t, - rewrite: *const u8, - len_r: size_t, - ) -> *const u8 { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len_h) }; - let rewrite = unsafe { slice::from_raw_parts(rewrite, len_r) }; - rure_replace_internal(re, haystack, rewrite) - } - - #[no_mangle] - extern "C" fn rure_replace_all( - re: *const RegexUnicode, - haystack: *const u8, - len_h: size_t, - rewrite: *const u8, - len_r: size_t, - ) -> *const u8 { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len_h) }; - let rewrite = unsafe { slice::from_raw_parts(rewrite, len_r) }; - rure_replace_all_internal(re, haystack, rewrite) - } - - /* - * Simple way to use regex - */ - - #[no_mangle] - extern "C" fn rure_new(pattern: *const u8, length: size_t) -> *const RegexBytes { - let pat = unsafe { slice::from_raw_parts(pattern, length) }; - rure_new_internal(pat) - } - - #[no_mangle] - extern "C" fn rure_consume( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, - match_info: *mut rure_match, - ) -> bool { - let exp = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - exp.find(haystack) - .map(|m| unsafe { - if !match_info.is_null() { - (*match_info).start = m.start(); - (*match_info).end = m.end(); - } - }) - .is_some() - } - - #[no_mangle] - extern "C" fn rure_max_submatch(rewrite: *const c_char) -> i32 { - let len = unsafe { CStr::from_ptr(rewrite).to_bytes().len() }; - let pat = rewrite as *const u8; - let text = unsafe { slice::from_raw_parts(pat, len) }; - - rure_max_submatch_internal(text) - } - - #[no_mangle] - extern "C" fn rure_check_rewrite_string(rewrite: *const c_char, cap_num: i32) -> bool { - let len = unsafe { CStr::from_ptr(rewrite).to_bytes().len() }; - let pat = rewrite as *const u8; - let text = unsafe { slice::from_raw_parts(pat, len) }; - - rure_check_rewrite_string_internal(text, cap_num) - } - - #[no_mangle] - extern "C" fn rure_rewrite_str_convert(rewrite: *const u8, length: size_t) -> *const c_char { - let rewrite = unsafe { slice::from_raw_parts(rewrite, length) }; - - rure_rewrite_str_convert_internal(rewrite) - } - - #[no_mangle] - extern "C" fn rure_rewrite( - rewrite: *const u8, - length: size_t, - vecs: *const *const u8, - vecs_lengths: *const size_t, - vecs_count: size_t, - ) -> *const c_char { - // 获取rewrite - let rewrite = unsafe { slice::from_raw_parts(rewrite, length) }; - let rewrite_str = std::str::from_utf8(rewrite).unwrap(); - - //获取vecs中的内容 - let (raw_vecs, raw_vecsl) = unsafe { - ( - slice::from_raw_parts(vecs, vecs_count), - slice::from_raw_parts(vecs_lengths, vecs_count), - ) - }; - - let mut rure_vecs = Vec::with_capacity(vecs_count); - for (&raw_vec, &raw_vecl) in raw_vecs.iter().zip(raw_vecsl) { - let rure_vec = unsafe { slice::from_raw_parts(raw_vec, raw_vecl) }; - rure_vecs.push(str::from_utf8(rure_vec).unwrap()); - } - - rure_rewrite_internal(rewrite_str, vecs_count, rure_vecs) - } - - #[no_mangle] - extern "C" fn rure_replace_count(re: *const RegexUnicode, haystack: *const c_char) -> size_t { - let len = unsafe { CStr::from_ptr(haystack).to_bytes().len() }; - let hay = haystack as *const u8; - - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(hay, len) }; - rure_replace_count_internal(haystack, re) - } - - #[no_mangle] - extern "C" fn rure_filter_compile( - regex_str: *const u8, - regex_len: size_t, - min_atoms_len: size_t, - ) -> MyVec { - let r = unsafe { slice::from_raw_parts(regex_str, regex_len) }; - let regex_str = str::from_utf8(r).unwrap(); - let atoms = my_compile(regex_str, min_atoms_len as i32); - atoms - } - +} + +#[no_mangle] +extern "C" fn rure_set_free(re: *const RegexSet) { + unsafe { + drop(Box::from_raw(re as *mut RegexSet)); + } +} + +#[no_mangle] +extern "C" fn rure_set_is_match( + re: *const RegexSet, + haystack: *const u8, + len: size_t, + start: size_t, +) -> bool { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + re.is_match_at(haystack, start) +} + +#[no_mangle] +extern "C" fn rure_set_matches( + re: *const RegexSet, + haystack: *const u8, + len: size_t, + start: size_t, + matches: *mut bool, +) -> bool { + let re = unsafe { &*re }; + let matches = unsafe { slice::from_raw_parts_mut(matches, re.len()) }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + + rure_set_matches_internal(re, matches, haystack, start) +} + +#[no_mangle] +extern "C" fn rure_set_len(re: *const RegexSet) -> size_t { + unsafe { (*re).len() } +} + +#[no_mangle] +extern "C" fn rure_escape_must(pattern: *const c_char) -> *const c_char { + let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; + let pat = pattern as *const u8; + let mut err = Error::new(ErrorKind::None); + let esc = rure_escape(pat, len, &mut err); + if err.is_err() { + println!("aborting from rure_escape_must"); + let _ = writeln!(&mut io::stderr(), "{}", err); + let _ = writeln!(&mut io::stderr(), "aborting from rure_escape_must"); + abort() + } + esc +} + +/// A helper function that implements fallible escaping in a way that returns +/// an error if escaping failed. +/// +/// This should ideally be exposed, but it needs API design work. In +/// particular, this should not return a C string, but a `const uint8_t *` +/// instead, since it may contain a NUL byte. +fn rure_escape(pattern: *const u8, length: size_t, error: *mut Error) -> *const c_char { + let pat: &[u8] = unsafe { slice::from_raw_parts(pattern, length) }; + let str_pat = match str::from_utf8(pat) { + Ok(val) => val, + Err(err) => unsafe { + if !error.is_null() { + *error = Error::new(ErrorKind::Str(err)); + } + return ptr::null(); + }, + }; + let esc_pat = regex::escape(str_pat); + let c_esc_pat = match CString::new(esc_pat) { + Ok(val) => val, + Err(err) => unsafe { + if !error.is_null() { + *error = Error::new(ErrorKind::Nul(err)); + } + return ptr::null(); + }, + }; + c_esc_pat.into_raw() as *const c_char +} + +#[no_mangle] +extern "C" fn rure_cstring_free(s: *mut c_char) { + unsafe { + drop(CString::from_raw(s)); + } +} + +#[no_mangle] +extern "C" fn rure_replace( + re: *const RegexUnicode, + haystack: *const u8, + len_h: size_t, + rewrite: *const u8, + len_r: size_t, +) -> *const u8 { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len_h) }; + let rewrite = unsafe { slice::from_raw_parts(rewrite, len_r) }; + rure_replace_internal(re, haystack, rewrite) +} + +#[no_mangle] +extern "C" fn rure_replace_all( + re: *const RegexUnicode, + haystack: *const u8, + len_h: size_t, + rewrite: *const u8, + len_r: size_t, +) -> *const u8 { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len_h) }; + let rewrite = unsafe { slice::from_raw_parts(rewrite, len_r) }; + rure_replace_all_internal(re, haystack, rewrite) +} + +/* + * Simple way to use regex + */ + +#[no_mangle] +extern "C" fn rure_new(pattern: *const u8, length: size_t) -> *const RegexBytes { + let pat = unsafe { slice::from_raw_parts(pattern, length) }; + rure_new_internal(pat) +} + +#[no_mangle] +extern "C" fn rure_consume( + re: *const RegexBytes, + haystack: *const u8, + len: size_t, + match_info: *mut rure_match, +) -> bool { + let exp = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + exp.find(haystack) + .map(|m| unsafe { + if !match_info.is_null() { + (*match_info).start = m.start(); + (*match_info).end = m.end(); + } + }) + .is_some() +} + +#[no_mangle] +extern "C" fn rure_max_submatch(rewrite: *const c_char) -> i32 { + let len = unsafe { CStr::from_ptr(rewrite).to_bytes().len() }; + let pat = rewrite as *const u8; + let text = unsafe { slice::from_raw_parts(pat, len) }; + + rure_max_submatch_internal(text) +} + +#[no_mangle] +extern "C" fn rure_check_rewrite_string(rewrite: *const c_char, cap_num: i32) -> bool { + let len = unsafe { CStr::from_ptr(rewrite).to_bytes().len() }; + let pat = rewrite as *const u8; + let text = unsafe { slice::from_raw_parts(pat, len) }; + + rure_check_rewrite_string_internal(text, cap_num) +} + +#[no_mangle] +extern "C" fn rure_rewrite_str_convert(rewrite: *const u8, length: size_t) -> *const c_char { + let rewrite = unsafe { slice::from_raw_parts(rewrite, length) }; + + rure_rewrite_str_convert_internal(rewrite) +} + +#[no_mangle] +extern "C" fn rure_rewrite( + rewrite: *const u8, + length: size_t, + vecs: *const *const u8, + vecs_lengths: *const size_t, + vecs_count: size_t, +) -> *const c_char { + // 获取rewrite + let rewrite = unsafe { slice::from_raw_parts(rewrite, length) }; + let rewrite_str = std::str::from_utf8(rewrite).unwrap(); + + //获取vecs中的内容 + let (raw_vecs, raw_vecsl) = unsafe { + ( + slice::from_raw_parts(vecs, vecs_count), + slice::from_raw_parts(vecs_lengths, vecs_count), + ) + }; + + let mut rure_vecs = Vec::with_capacity(vecs_count); + for (&raw_vec, &raw_vecl) in raw_vecs.iter().zip(raw_vecsl) { + let rure_vec = unsafe { slice::from_raw_parts(raw_vec, raw_vecl) }; + rure_vecs.push(str::from_utf8(rure_vec).unwrap()); + } + + rure_rewrite_internal(rewrite_str, vecs_count, rure_vecs) +} + +#[no_mangle] +extern "C" fn rure_replace_count(re: *const RegexUnicode, haystack: *const c_char) -> size_t { + let len = unsafe { CStr::from_ptr(haystack).to_bytes().len() }; + let hay = haystack as *const u8; + + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(hay, len) }; + rure_replace_count_internal(haystack, re) +} +#[no_mangle] +extern "C" fn rure_filter_compile( + regex_str: *const u8, + regex_len: size_t, + min_atoms_len: size_t, +) -> MyVec { + let r = unsafe { slice::from_raw_parts(regex_str, regex_len) }; + let regex_str = str::from_utf8(r).unwrap(); + my_compile(regex_str, min_atoms_len as i32) +} diff --git a/regex-capi/src/lib_internal.rs b/regex-capi/src/lib_internal.rs index fb331d0e9093bf4ea001ebc4e81a1c8ecfc63ecd..e5ad3780ac57a019da9f2a0bce5ee8ce4c19fe72 100644 --- a/regex-capi/src/lib_internal.rs +++ b/regex-capi/src/lib_internal.rs @@ -14,10 +14,7 @@ ******************************************************************************/ use regex::bytes::RegexBuilder; use regex::bytes::RegexSetBuilder; - fn rure_compile_internal( - pat: &str, - flags: u32, -) -> RegexBuilder { +fn rure_compile_internal(pat: &str, flags: u32) -> RegexBuilder { let mut builder = bytes::RegexBuilder::new(pat); builder.case_insensitive(flags & RURE_FLAG_CASEI > 0); builder.multi_line(flags & RURE_FLAG_MULTI > 0); @@ -28,10 +25,7 @@ use regex::bytes::RegexSetBuilder; builder } -fn rure_compile_set_internal( - pats: Vec<&str>, - flags: u32, -) -> RegexSetBuilder { +fn rure_compile_set_internal(pats: Vec<&str>, flags: u32) -> RegexSetBuilder { let mut builder = bytes::RegexSetBuilder::new(pats); builder.case_insensitive(flags & RURE_FLAG_CASEI > 0); @@ -177,7 +171,7 @@ fn rure_check_rewrite_string_internal(text: &[u8], cap_num: i32) -> bool { if max_token > cap_num { return false; } - return true; + true } fn rure_rewrite_str_convert_internal(rewrite: &[u8]) -> *const c_char { @@ -278,17 +272,17 @@ fn rure_replace_count_internal(haystack: &[u8], re: &RegexUnicode) -> size_t { } /** -* 负责对字符集进行连接操作 -* -*/ + * 负责对字符集进行连接操作 + * + */ fn connection(str: &str, vec1: Vec, vec2: Vec) -> Vec { let mut vec_tmp = Vec::new(); - if str.len() > 0 { + if !str.is_empty() { for chars in vec2 { let s = format!("{}{}", str, chars); vec_tmp.push(s); } - } else if vec1.len() == 0 { + } else if vec1.is_empty() { for elem in vec2 { vec_tmp.push(elem.to_string()) } @@ -328,7 +322,7 @@ fn group_multiple_selection(str: &str, min_atoms_len: i32) -> Vec { str_tmp.push(elem); } } - atoms_tmp.sort_by(|a, b| a.len().cmp(&b.len())); + atoms_tmp.sort_by_key(|a| a.len()); for i in 0..atoms_tmp.len() { let mut j = i + 1; @@ -344,11 +338,11 @@ fn group_multiple_selection(str: &str, min_atoms_len: i32) -> Vec { } /** -* 处理 -* a[a-c]a[zv] -* [abc] -* [a-c]+ -*/ + * 处理 + * a[a-c]a[zv] + * [abc] + * [a-c]+ + */ fn char_class_expansion(str: &str) -> Vec { let mut flag_connect = 0; @@ -398,7 +392,7 @@ fn my_compile(str: &str, min_atoms_len: i32) -> MyVec { let mut tmp_post_group = i; tmp_post_group += 1; if tmp_post_group >= chars.len() { - if vec.len() != 0 { + if !vec.is_empty() { // 右括号为自后一个字符的情况 for elem in vec { my_atoms.push(Atoms { @@ -409,14 +403,14 @@ fn my_compile(str: &str, min_atoms_len: i32) -> MyVec { i += 1; continue; } - if chars[tmp_post_group] == '.' && vec.len() != 0 { + if chars[tmp_post_group] == '.' && !vec.is_empty() { i += 1; for elem in vec { my_atoms.push(Atoms { atom: CString::new(elem).unwrap().into_raw(), }); } - } else if chars[tmp_post_group] == '{' && vec.len() != 0 { + } else if chars[tmp_post_group] == '{' && !vec.is_empty() { for elem in vec { my_atoms.push(Atoms { atom: CString::new(elem).unwrap().into_raw(), @@ -427,12 +421,12 @@ fn my_compile(str: &str, min_atoms_len: i32) -> MyVec { continue; } if chars[i] == '.' { - if atoms_tmp_string.len() as i32 >= min_atoms_len && vec_chars_con.len() == 0 { + if atoms_tmp_string.len() as i32 >= min_atoms_len && vec_chars_con.is_empty() { my_atoms.push(Atoms { atom: CString::new(atoms_tmp_string.clone()).unwrap().into_raw(), }); } - if vec_chars_con.len() > 0 && atoms_tmp_string.len() > 0 { + if !vec_chars_con.is_empty() && !atoms_tmp_string.is_empty() { for elems in vec_chars_con.clone() { my_atoms.push(Atoms { atom: CString::new(format!("{}{}", elems.clone(), atoms_tmp_string)) @@ -460,18 +454,19 @@ fn my_compile(str: &str, min_atoms_len: i32) -> MyVec { } let mut plus_tmp = i; plus_tmp += 1; - if plus_tmp < chars.len() && chars[plus_tmp] == '+' { - if atoms_tmp_string.len() as i32 >= min_atoms_len { - my_atoms.push(Atoms { - atom: CString::new(atoms_tmp_string.clone()).unwrap().into_raw(), - }); - atoms_tmp_string.clear(); - i += 2; - continue; - } + if plus_tmp < chars.len() + && chars[plus_tmp] == '+' + && atoms_tmp_string.len() as i32 >= min_atoms_len + { + my_atoms.push(Atoms { + atom: CString::new(atoms_tmp_string.clone()).unwrap().into_raw(), + }); + atoms_tmp_string.clear(); + i += 2; + continue; } let str_char_set = &str[start_post..plus_tmp]; - if atoms_tmp_string.len() > 0 && vec_chars_con.len() > 0 { + if !atoms_tmp_string.is_empty() && !vec_chars_con.is_empty() { for elem in vec_chars_con.clone() { vec_chars_con.push(format!("{}{}", elem, atoms_tmp_string)); } @@ -481,7 +476,7 @@ fn my_compile(str: &str, min_atoms_len: i32) -> MyVec { vec_chars_con = connection(atoms_tmp_string.as_str(), vec_chars_con, atoms_tmp); atoms_tmp_string.clear(); - if i == chars.len() - 1 && vec_chars_con.len() > 0 { + if i == chars.len() - 1 && !vec_chars_con.is_empty() { for elem in vec_chars_con.clone() { if elem.len() as i32 >= min_atoms_len { my_atoms.push(Atoms {