diff --git a/re2/filtered_re2.cc b/re2/filtered_re2.cc index a108e6863e63ac44ae3033c533ce965574f58368..e9dfc546399f3ae12b49eafd3d635c133671aba9 100644 --- a/re2/filtered_re2.cc +++ b/re2/filtered_re2.cc @@ -28,225 +28,257 @@ extern "C" #include "regex-capi/include/regex_capi.h" } using namespace std; -namespace re2 { +namespace re2 +{ -std::map> map_atoms; + std::map> map_atoms; -// #include "re2/prefilter_tree.h" - class PrefilterTree { - public: - PrefilterTree():min_atom_len_(3){}; - explicit PrefilterTree(int min_atom_len):min_atom_len_(min_atom_len){}; + // #include "re2/prefilter_tree.h" + class PrefilterTree + { + public: + PrefilterTree() : min_atom_len_(3){}; + explicit PrefilterTree(int min_atom_len) : min_atom_len_(min_atom_len){}; ~PrefilterTree(){}; - int getMinAtomLen(){ + int getMinAtomLen() + { return min_atom_len_; } - bool get_is_latin_result() {return is_latin;}; + bool get_is_latin_result() { return is_latin; }; void set_latin(bool x); - std::string get_latin_string() {return str_latin;}; + std::string get_latin_string() { return str_latin; }; void set_latin_str(std::string x); - - private: + + private: const int min_atom_len_; bool is_latin; std::string str_latin; }; - void PrefilterTree::set_latin(bool x) { + void PrefilterTree::set_latin(bool x) + { is_latin = x; } - void PrefilterTree::set_latin_str(std::string x) { + void PrefilterTree::set_latin_str(std::string x) + { str_latin = x; } }; -namespace re2 { - -FilteredRE2::FilteredRE2() - : compiled_(false), - prefilter_tree_(new PrefilterTree()) { -} +namespace re2 +{ -FilteredRE2::FilteredRE2(int min_atom_len) - : compiled_(false), - prefilter_tree_(new PrefilterTree(min_atom_len)) { -} + FilteredRE2::FilteredRE2() + : compiled_(false), + prefilter_tree_(new PrefilterTree()) + { + } -FilteredRE2::~FilteredRE2() { - for (size_t i = 0; i < re2_vec_.size(); i++) - delete re2_vec_[i]; -} + FilteredRE2::FilteredRE2(int min_atom_len) + : compiled_(false), + prefilter_tree_(new PrefilterTree(min_atom_len)) + { + } -FilteredRE2::FilteredRE2(FilteredRE2&& other) - : re2_vec_(std::move(other.re2_vec_)), - compiled_(other.compiled_), - prefilter_tree_(std::move(other.prefilter_tree_)) { - other.re2_vec_.clear(); - other.re2_vec_.shrink_to_fit(); - other.compiled_ = false; - other.prefilter_tree_.reset(new PrefilterTree()); -} + FilteredRE2::~FilteredRE2() + { + for (size_t i = 0; i < re2_vec_.size(); i++) + delete re2_vec_[i]; + } -FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) { - this->~FilteredRE2(); - (void) new (this) FilteredRE2(std::move(other)); - return *this; -} + FilteredRE2::FilteredRE2(FilteredRE2 &&other) + : re2_vec_(std::move(other.re2_vec_)), + compiled_(other.compiled_), + prefilter_tree_(std::move(other.prefilter_tree_)) + { + other.re2_vec_.clear(); + other.re2_vec_.shrink_to_fit(); + other.compiled_ = false; + other.prefilter_tree_.reset(new PrefilterTree()); + } -RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, - const RE2::Options& options, int* id) { - RE2* re = new RE2(pattern, options); - RE2::ErrorCode code = re->error_code(); - if(options.encoding() == RE2::Options::EncodingLatin1) { - prefilter_tree_->set_latin(true); - prefilter_tree_->set_latin_str(pattern.as_string()); + FilteredRE2 &FilteredRE2::operator=(FilteredRE2 &&other) + { + this->~FilteredRE2(); + (void)new (this) FilteredRE2(std::move(other)); + return *this; } - else prefilter_tree_->set_latin(false); - - if (!re->ok()) { - if (options.log_errors()) { - LOG(ERROR) << "Couldn't compile regular expression, skipping: " - << pattern << " due to error " << re->error(); + + RE2::ErrorCode FilteredRE2::Add(const StringPiece &pattern, + const RE2::Options &options, int *id) + { + RE2 *re = new RE2(pattern, options); + RE2::ErrorCode code = re->error_code(); + if (options.encoding() == RE2::Options::EncodingLatin1) + { + prefilter_tree_->set_latin(true); + prefilter_tree_->set_latin_str(pattern.as_string()); } - delete re; - } else { - *id = static_cast(re2_vec_.size()); - re2_vec_.push_back(re); - } + else + prefilter_tree_->set_latin(false); - return code; -} + if (!re->ok()) + { + if (options.log_errors()) + { + LOG(ERROR) << "Couldn't compile regular expression, skipping: " + << pattern << " due to error " << re->error(); + } + delete re; + } + else + { + *id = static_cast(re2_vec_.size()); + re2_vec_.push_back(re); + } -void FilteredRE2::Compile(std::vector* atoms) { - map_atoms.clear(); - if (compiled_) { - LOG(ERROR) << "Compile called already."; - return; + return code; } - if (re2_vec_.empty()) { - LOG(ERROR) << "Compile called before Add."; - return; - } - atoms->clear(); - - // 处理latin的情况 - if(prefilter_tree_->get_is_latin_result()) { - std::string str = prefilter_tree_->get_latin_string(); - std::vector vec; - vec.push_back(str); - std::string str_low = str; - transform(str_low.begin(),str_low.end(),str_low.begin(),::tolower); - atoms->push_back(str_low); - map_atoms.insert(map>::value_type(str, vec)); - map_atoms.insert(map>::value_type("total", vec)); - compiled_ = true; - return; - } - - for(size_t i = 0; i < re2_vec_.size(); i++) { - // std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), prefilter_tree_->getMinAtomLen()); - const char *regex = re2_vec_[i]->pattern().c_str(); - std::string regex_str = regex; - MyVec vec = rure_filter_compile((const uint8_t *)regex, strlen(regex), prefilter_tree_->getMinAtomLen()); - int32_t len = vec.len; - std::vector v; - for(int32_t i = 0; i < len; i++) { - atoms->push_back(vec.data[i].atom); - v.push_back(vec.data[i].atom); + void FilteredRE2::Compile(std::vector *atoms) + { + map_atoms.clear(); + if (compiled_) + { + LOG(ERROR) << "Compile called already."; + return; } - map_atoms.insert(map>::value_type(regex_str, v)); - } - map_atoms.insert(map>::value_type("total", *atoms)); - compiled_ = true; -} -int FilteredRE2::SlowFirstMatch(const StringPiece& text) const { - for (size_t i = 0; i < re2_vec_.size(); i++) - { - if (RE2::PartialMatch(text, re2_vec_[i]->pattern())){ - return static_cast(i); - } + if (re2_vec_.empty()) + { + LOG(ERROR) << "Compile called before Add."; + return; + } + atoms->clear(); + + // 处理latin的情况 + if (prefilter_tree_->get_is_latin_result()) + { + std::string str = prefilter_tree_->get_latin_string(); + std::vector vec; + vec.push_back(str); + std::string str_low = str; + transform(str_low.begin(), str_low.end(), str_low.begin(), ::tolower); + atoms->push_back(str_low); + map_atoms.insert(map>::value_type(str, vec)); + map_atoms.insert(map>::value_type("total", vec)); + compiled_ = true; + return; + } + + for (size_t i = 0; i < re2_vec_.size(); i++) + { + // std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), prefilter_tree_->getMinAtomLen()); + const char *regex = re2_vec_[i]->pattern().c_str(); + std::string regex_str = regex; + MyVec vec = rure_filter_compile((const uint8_t *)regex, strlen(regex), prefilter_tree_->getMinAtomLen()); + int32_t len = vec.len; + std::vector v; + for (int32_t i = 0; i < len; i++) + { + atoms->push_back(vec.data[i].atom); + v.push_back(vec.data[i].atom); + } + map_atoms.insert(map>::value_type(regex_str, v)); + } + map_atoms.insert(map>::value_type("total", *atoms)); + compiled_ = true; } - return -1; -} -void AtomsToRegexps(std::vector re2_vec_, std::vector atoms, std::vector *regexps, int min_atom_len) -{ - // 根据atoms索引获取regexp索引的规则 - /* - * 如果没有原子, 那么直接会把re加进去。 - * 如果这个正则表达式有原子,那么要把该正则表达式的所有的原子的索引全加入,这个正则表达式才能加入成功。 - */ - - std::vector atoms_total = map_atoms["total"]; - std::vector atoms_tmp; - for(size_t i = 0; i < atoms.size(); i++) + int FilteredRE2::SlowFirstMatch(const StringPiece &text) const { - atoms_tmp.push_back(atoms_total[atoms[i]]); + for (size_t i = 0; i < re2_vec_.size(); i++) + { + if (RE2::PartialMatch(text, re2_vec_[i]->pattern())) + { + return static_cast(i); + } + } + return -1; } - for(size_t i = 0; i < re2_vec_.size(); i++) + + void AtomsToRegexps(std::vector re2_vec_, std::vector atoms, std::vector *regexps, int min_atom_len) { - // std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), min_atom_len); - std::string str = re2_vec_[i]->pattern(); - std::vector my_atoms = map_atoms[str]; - if(my_atoms.size() == 0){ - regexps->push_back(i); - continue; + // 根据atoms索引获取regexp索引的规则 + /* + * 如果没有原子, 那么直接会把re加进去。 + * 如果这个正则表达式有原子,那么要把该正则表达式的所有的原子的索引全加入,这个正则表达式才能加入成功。 + */ + + std::vector atoms_total = map_atoms["total"]; + std::vector atoms_tmp; + for (size_t i = 0; i < atoms.size(); i++) + { + atoms_tmp.push_back(atoms_total[atoms[i]]); } - else + for (size_t i = 0; i < re2_vec_.size(); i++) { - int count = 0; - for(size_t ii = 0; ii < my_atoms.size(); ii++) + // std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), min_atom_len); + std::string str = re2_vec_[i]->pattern(); + std::vector my_atoms = map_atoms[str]; + if (my_atoms.size() == 0) { - for(size_t jj = 0; jj < atoms_tmp.size(); jj++) + regexps->push_back(i); + continue; + } + else + { + int count = 0; + for (size_t ii = 0; ii < my_atoms.size(); ii++) { - if(my_atoms[ii] == atoms_tmp[jj]){ - count++; - break; + for (size_t jj = 0; jj < atoms_tmp.size(); jj++) + { + if (my_atoms[ii] == atoms_tmp[jj]) + { + count++; + break; + } } } + if (count == (int)my_atoms.size()) + regexps->push_back(int(i)); } - if(count == (int)my_atoms.size()) regexps->push_back(int(i)); } } -} -int FilteredRE2::FirstMatch(const StringPiece& text, - const std::vector& atoms) const { - if (!compiled_) { - LOG(DFATAL) << "FirstMatch called before Compile."; + int FilteredRE2::FirstMatch(const StringPiece &text, + const std::vector &atoms) const + { + if (!compiled_) + { + LOG(DFATAL) << "FirstMatch called before Compile."; + return -1; + } + std::vector regexps; + + AtomsToRegexps(re2_vec_, atoms, ®exps, prefilter_tree_->getMinAtomLen()); + + for (size_t i = 0; i < regexps.size(); i++) + if (RE2::PartialMatch(text, *re2_vec_[regexps[i]])) + return static_cast(i); return -1; } - std::vector regexps; - - AtomsToRegexps(re2_vec_, atoms, ®exps, prefilter_tree_->getMinAtomLen()); - - for (size_t i = 0; i < regexps.size(); i++) - if (RE2::PartialMatch(text, *re2_vec_[regexps[i]])) - return static_cast(i); - return -1; -} -bool FilteredRE2::AllMatches( - const StringPiece& text, - const std::vector& atoms, - std::vector* matching_regexps) const { - matching_regexps->clear(); + bool FilteredRE2::AllMatches( + const StringPiece &text, + const std::vector &atoms, + std::vector *matching_regexps) const + { + matching_regexps->clear(); - std::vector regexps; - AtomsToRegexps(re2_vec_, atoms, ®exps, prefilter_tree_->getMinAtomLen()); + std::vector regexps; + AtomsToRegexps(re2_vec_, atoms, ®exps, prefilter_tree_->getMinAtomLen()); - for (size_t i = 0; i < re2_vec_.size(); i++) - if (RE2::PartialMatch(text, *re2_vec_[i])) - matching_regexps->push_back(i); - return !matching_regexps->empty(); - -} + for (size_t i = 0; i < re2_vec_.size(); i++) + if (RE2::PartialMatch(text, *re2_vec_[i])) + matching_regexps->push_back(i); + return !matching_regexps->empty(); + } -void FilteredRE2::AllPotentials( - const std::vector& atoms, - std::vector* potential_regexps) const { - AtomsToRegexps(re2_vec_, atoms, potential_regexps, prefilter_tree_->getMinAtomLen()); -} + void FilteredRE2::AllPotentials( + const std::vector &atoms, + std::vector *potential_regexps) const + { + AtomsToRegexps(re2_vec_, atoms, potential_regexps, prefilter_tree_->getMinAtomLen()); + } -} // namespace re2 +} // namespace re2 diff --git a/re2/re2.cc b/re2/re2.cc index fa4f185a0e8d2259c9a972b394ca6c94fa372498..4bf66bcc7d425cd24bdacaee55fef5f0a80b3b4b 100644 --- a/re2/re2.cc +++ b/re2/re2.cc @@ -155,10 +155,12 @@ namespace re2 } uint32_t flags = RURE_DEFAULT_FLAGS; - if(options_.dot_nl()) flags = RURE_FLAG_DOTNL; + if (options_.dot_nl()) + flags = RURE_FLAG_DOTNL; // if(options_.never_nl()) flags = RURE_DEFAULT_FLAGS; - if(options_.encoding() == RE2::Options::EncodingLatin1){ - flags |= RURE_FLAG_UNICODE; + if (options_.encoding() == RE2::Options::EncodingLatin1) + { + flags |= RURE_FLAG_UNICODE; } // for All @@ -185,7 +187,7 @@ namespace re2 LOG(ERROR) << "Error Compile '" << pattern.data() << "':" << msg << "'"; } error_ = new std::string(msg); - error_code_ = ErrorInternal; // 暂时对这个错误进行赋值,如何处理错误类型??? + error_code_ = ErrorInternal; // 暂时对这个错误进行赋值,如何处理错误类型??? } return; } @@ -193,7 +195,7 @@ namespace re2 // for Consume and FindAndConsume suffix_regexp_ = (re2::Regexp *)rure_new((const uint8_t *)pattern.data(), pattern.size()); // for FullMatch - if(rure_str != "") + if (rure_str != "") { std::string FullMatch_rure_str = rure_str; FullMatch_rure_str.insert(0, "^("); @@ -208,11 +210,11 @@ namespace re2 //获取捕获组的数量, 并对num_captures_其进行赋值 rure_captures *caps = rure_captures_new(re); size_t captures_len = rure_captures_len(caps) - 1; - if(!options_.never_capture()) + if (!options_.never_capture()) { num_captures_ = (int)captures_len; } - else + else { num_captures_ = 0; } @@ -220,7 +222,7 @@ namespace re2 rure_captures_free(caps); rure_error_free(err); error_ = empty_string; - error_code_ = RE2::NoError; + error_code_ = RE2::NoError; } RE2::~RE2() @@ -318,7 +320,6 @@ namespace re2 } } - bool RE2::Replace(std::string *str, const RE2 &re, const StringPiece &rewrite) @@ -334,7 +335,7 @@ namespace re2 // 利用rure进行replace const char *rure_str = re.pattern_.c_str(); // 对rewrite进行处理 - const char *rure_rewrite = rure_rewrite_str_convert((const uint8_t*)rewrite.data(), rewrite.size()); + const char *rure_rewrite = rure_rewrite_str_convert((const uint8_t *)rewrite.data(), rewrite.size()); rure *re_rure = rure_compile((const uint8_t *)rure_str, strlen(rure_str), RURE_DEFAULT_FLAGS, NULL, NULL); const char *str_rure = rure_replace(re_rure, (const uint8_t *)str->c_str(), strlen(str->c_str()), @@ -364,7 +365,7 @@ namespace re2 if (count != 0) { // 对rewrite进行处理 - const char *rure_rewrite = rure_rewrite_str_convert((const uint8_t*)rewrite.data(), rewrite.size()); + const char *rure_rewrite = rure_rewrite_str_convert((const uint8_t *)rewrite.data(), rewrite.size()); const char *str_rure = rure_replace_all(rure_re, (const uint8_t *)str->c_str(), strlen(str->c_str()), (const uint8_t *)rure_rewrite, strlen(rure_rewrite)); *str = str_rure; @@ -442,7 +443,7 @@ namespace re2 StringPiece *submatch, int nsubmatch) const { - if(text.size() == 0 && pattern() == "") + if (text.size() == 0 && pattern() == "") { return true; } @@ -463,9 +464,9 @@ namespace re2 return false; } // 对null和empty进行处理 - if(text.data() == NULL) + if (text.data() == NULL) { - for(int i = 0; i < nsubmatch; i++) + for (int i = 0; i < nsubmatch; i++) { submatch[i] = NULL; } @@ -491,19 +492,21 @@ namespace re2 // rure *re1 = (rure *)rprog_; rure_match match = {0}; size_t length = strlen(haystack.c_str()); - if(options_.never_nl()) + if (options_.never_nl()) { std::string strs = haystack + '\n'; size_t pos = strs.find('\n'); bool flag = false; - while(pos != strs.npos) + while (pos != strs.npos) { std::string temp = strs.substr(0, pos); bool matched = rure_is_match(re, (const uint8_t *)temp.c_str(), strlen(temp.c_str()), 0); - if(matched && !nsubmatch){ + if (matched && !nsubmatch) + { return true; } - if(matched && nsubmatch){ + if (matched && nsubmatch) + { haystack = temp; length = strlen(haystack.c_str()); flag = true; @@ -512,41 +515,48 @@ namespace re2 strs = strs.substr(pos + 1, length + 1); pos = strs.find('\n'); } - if(!flag){return false;} + if (!flag) + { + return false; + } } // bool matched = rure_find(re, (const uint8_t *)haystack, strlen(haystack), 0, &match); // 这里没有 if(re_anchor == ANCHOR_START)原因是因为: // 只有Consume()使用了ANCHOR_START,而传入Consume()的参数通常是三个或者三个以上, // 调用Consume()时,nsubmatch不为0,因此会去执行rure_captures_new()、rure_find_captures()、rure_captures_at() - if(re_anchor == UNANCHORED) + if (re_anchor == UNANCHORED) { // bool matched = rure_find(re, (const uint8_t *)haystack.c_str(), length, 0, &match); bool matched = rure_is_match(re, (const uint8_t *)haystack.c_str(), length, 0); - if(!matched){ + if (!matched) + { return false; } - else if(!nsubmatch){ + else if (!nsubmatch) + { return true; } } - else if(re_anchor == ANCHOR_BOTH) + else if (re_anchor == ANCHOR_BOTH) { bool matched = rure_find(re, (const uint8_t *)haystack.c_str(), length, 0, &match); - if(!matched || match.start != 0 || match.end != length){ + if (!matched || match.start != 0 || match.end != length) + { return false; } - else if(!nsubmatch){ + else if (!nsubmatch) + { return true; } } - + // Demo 获取捕获组内容,存储到submatch数组中 rure_captures *caps = rure_captures_new(re); rure_find_captures(re, (const uint8_t *)haystack.c_str(), length, 0, caps); // size_t captures_len = num_captures_ + 1; - + rure_captures_at(caps, 0, &match); if (re_anchor == ANCHOR_START && match.start != 0) return false; @@ -559,13 +569,14 @@ namespace re2 size_t start = match.start; size_t end = match.end; size_t len = end - start; - if(options_.encoding() == RE2::Options::EncodingUTF8){ + if (options_.encoding() == RE2::Options::EncodingUTF8) + { submatch[i] = StringPiece(text.data() + start, static_cast(len)); } - else{ + else + { submatch[i] = StringPiece(text.data() + start, static_cast(len / 2)); } - } else { @@ -608,17 +619,17 @@ namespace re2 // RE has fewer capturing groups than number of Arg pointers passed in. return false; } - + // for Consume and FindAndConsume rure_match match; - if(consumed && n == 0 && + if (consumed && n == 0 && rure_consume((rure *)suffix_regexp_, (const uint8_t *)text.data(), (size_t)text.size(), &match)) { *consumed = match.end; return true; } // for FullMatch(no captures) - if(re_anchor == ANCHOR_BOTH && n == 0 && options_.encoding() == RE2::Options::EncodingUTF8) + if (re_anchor == ANCHOR_BOTH && n == 0 && options_.encoding() == RE2::Options::EncodingUTF8) { bool matched = rure_is_match((rure *)entire_regexp_, (const uint8_t *)text.data(), (size_t)text.size(), 0); return matched; @@ -700,12 +711,12 @@ namespace re2 { int num_caps = NumberOfCapturingGroups(); bool result = rure_check_rewrite_string(rewrite.data(), num_caps); - if(!result){ + if (!result) + { *error = "Rewrite schema error"; return false; } - return true; - + return true; } // Returns the maximum submatch needed for the rewrite to be done by Replace(). @@ -726,13 +737,15 @@ namespace re2 size_t len = rewrite.length(); const char *rewrites[veclen]; size_t rewrites_lengths[veclen]; - for(int i = 0; i < veclen; i++) { + for (int i = 0; i < veclen; i++) + { rewrites[i] = vec[i].data(); rewrites_lengths[i] = vec[i].size(); } - const char *result = rure_rewrite((const uint8_t *)rewrite.data(), len, (const uint8_t **)rewrites, - rewrites_lengths, (size_t)veclen); - if(result != NULL) { + const char *result = rure_rewrite((const uint8_t *)rewrite.data(), len, (const uint8_t **)rewrites, + rewrites_lengths, (size_t)veclen); + if (result != NULL) + { out->assign(result); return true; } diff --git a/re2/regex_internal.h b/re2/regex_internal.h index cab67fe7ced20bf7eba5e72791c1574662057073..482634f0bc3e925e843ab7f4110952e430907f20 100644 --- a/re2/regex_internal.h +++ b/re2/regex_internal.h @@ -15,58 +15,61 @@ // #include "re2/sparse_array.h" #include "regex-capi/include/regex_capi.h" -namespace re2 { -// #include "re2/prog.h" -// Compiled form of regexp program. - class Prog { - //rure 更名为 Prog -}; - -// #include "re2/regexp.h" -class Regexp { - public: +namespace re2 +{ + // #include "re2/prog.h" + // Compiled form of regexp program. + class Prog + { + // rure 更名为 Prog + }; - // Flags for parsing. Can be ORed together. - enum ParseFlags { - NoParseFlags = 0, - FoldCase = 1 << 0, // Fold case during matching (case-insensitive). - Literal = 1 << 1, // Treat s as literal string instead of a regexp. - ClassNL = 1 << 2, // Allow char classes like [^a-z] and \D and \s - // and [[:space:]] to match newline. - DotNL = 1 << 3, // Allow . to match newline. - MatchNL = ClassNL | DotNL, - OneLine = 1 << 4, // Treat ^ and $ as only matching at beginning and - // end of text, not around embedded newlines. - // (Perl's default) - Latin1 = 1 << 5, // Regexp and text are in Latin1, not UTF-8. - NonGreedy = 1 << 6, // Repetition operators are non-greedy by default. - PerlClasses = 1 << 7, // Allow Perl character classes like \d. - PerlB = 1 << 8, // Allow Perl's \b and \B. - PerlX = 1 << 9, // Perl extensions: - // non-capturing parens - (?: ) - // non-greedy operators - *? +? ?? {}? - // flag edits - (?i) (?-i) (?i: ) - // i - FoldCase - // m - !OneLine - // s - DotNL - // U - NonGreedy - // line ends: \A \z - // \Q and \E to disable/enable metacharacters - // (?Pexpr) for named captures - // \C to match any single byte - UnicodeGroups = 1 << 10, // Allow \p{Han} for Unicode Han group - // and \P{Han} for its negation. - NeverNL = 1 << 11, // Never match NL, even if the regexp mentions - // it explicitly. - NeverCapture = 1 << 12, // Parse all parens as non-capturing. + // #include "re2/regexp.h" + class Regexp + { + public: + // Flags for parsing. Can be ORed together. + enum ParseFlags + { + NoParseFlags = 0, + FoldCase = 1 << 0, // Fold case during matching (case-insensitive). + Literal = 1 << 1, // Treat s as literal string instead of a regexp. + ClassNL = 1 << 2, // Allow char classes like [^a-z] and \D and \s + // and [[:space:]] to match newline. + DotNL = 1 << 3, // Allow . to match newline. + MatchNL = ClassNL | DotNL, + OneLine = 1 << 4, // Treat ^ and $ as only matching at beginning and + // end of text, not around embedded newlines. + // (Perl's default) + Latin1 = 1 << 5, // Regexp and text are in Latin1, not UTF-8. + NonGreedy = 1 << 6, // Repetition operators are non-greedy by default. + PerlClasses = 1 << 7, // Allow Perl character classes like \d. + PerlB = 1 << 8, // Allow Perl's \b and \B. + PerlX = 1 << 9, // Perl extensions: + // non-capturing parens - (?: ) + // non-greedy operators - *? +? ?? {}? + // flag edits - (?i) (?-i) (?i: ) + // i - FoldCase + // m - !OneLine + // s - DotNL + // U - NonGreedy + // line ends: \A \z + // \Q and \E to disable/enable metacharacters + // (?Pexpr) for named captures + // \C to match any single byte + UnicodeGroups = 1 << 10, // Allow \p{Han} for Unicode Han group + // and \P{Han} for its negation. + NeverNL = 1 << 11, // Never match NL, even if the regexp mentions + // it explicitly. + NeverCapture = 1 << 12, // Parse all parens as non-capturing. - // As close to Perl as we can get. - LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX | - UnicodeGroups, + // As close to Perl as we can get. + LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX | + UnicodeGroups, - // Internal use only. - WasDollar = 1 << 13, // on kRegexpEndText: was $ in regexp text - AllParseFlags = (1 << 14)-1, + // Internal use only. + WasDollar = 1 << 13, // on kRegexpEndText: was $ in regexp text + AllParseFlags = (1 << 14) - 1, + }; }; -}; }; \ No newline at end of file diff --git a/re2/set.cc b/re2/set.cc index 2af02a9fb8f8ca8e4ed05666af4d813c970cc147..28d3be827a8ecab615b95ae0fe22a63362169677 100644 --- a/re2/set.cc +++ b/re2/set.cc @@ -48,17 +48,17 @@ namespace re2 elem_.clear(); } - RE2::Set::Set(Set && other) + RE2::Set::Set(Set &&other) : options_(other.options_), anchor_(other.anchor_), compiled_(other.compiled_), prog_(std::move(other.prog_)) { - other.elem_.clear(); - other.elem_.shrink_to_fit(); - other.compiled_ = false; - other.size_ = 0; - other.prog_.reset(); + other.elem_.clear(); + other.elem_.shrink_to_fit(); + other.compiled_ = false; + other.size_ = 0; + other.prog_.reset(); } RE2::Set &RE2::Set::operator=(Set &&other) @@ -68,14 +68,16 @@ namespace re2 return *this; } - int RE2::Set::Add(const StringPiece &pattern, std::string *error) { int place_num = size_; std::string rure_pattern = pattern.as_string(); - if(anchor_ == RE2::ANCHOR_START){ // 处理RE2::ANCHOR_START的情况 + if (anchor_ == RE2::ANCHOR_START) + { // 处理RE2::ANCHOR_START的情况 rure_pattern.insert(0, "^"); - } else if(anchor_ == RE2::ANCHOR_BOTH) { // 处理RE2::ANCHOR_BOTH的情况 + } + else if (anchor_ == RE2::ANCHOR_BOTH) + { // 处理RE2::ANCHOR_BOTH的情况 rure_pattern.insert(0, "^"); rure_pattern.append("$"); } @@ -84,7 +86,7 @@ namespace re2 if (re == NULL) { const char *msg = rure_error_message(err); - if(error != NULL) + if (error != NULL) { error->assign(msg); LOG(ERROR) << "Regexp Error '" << pattern.data() << "':" << msg << "'"; @@ -94,7 +96,7 @@ namespace re2 } else { - elem_.push_back(pair(rure_pattern, (re2::Regexp*)nullptr)); + elem_.push_back(pair(rure_pattern, (re2::Regexp *)nullptr)); size_++; // rure_free(re); return place_num; @@ -103,7 +105,8 @@ namespace re2 bool RE2::Set::Compile() { - if (compiled_) { + if (compiled_) + { LOG(ERROR) << "RE2::Set::Compile() called more than once"; return false; } @@ -111,19 +114,21 @@ namespace re2 const size_t PAT_COUNT = elem_.size(); const char *patterns[PAT_COUNT]; size_t patterns_lengths[PAT_COUNT]; - for (size_t i = 0; i < elem_.size(); i++) { + for (size_t i = 0; i < elem_.size(); i++) + { patterns[i] = elem_[i].first.c_str(); patterns_lengths[i] = elem_[i].first.length(); } - + rure_error *err = rure_error_new(); - rure_set *re = rure_compile_set((const uint8_t **) patterns, - patterns_lengths, PAT_COUNT, 0, NULL, err); - if(re == NULL){ + rure_set *re = rure_compile_set((const uint8_t **)patterns, + patterns_lengths, PAT_COUNT, 0, NULL, err); + if (re == NULL) + { compiled_ = false; rure_set_free(re); return false; - } + } prog_.reset((Prog *)re); compiled_ = true; return true; @@ -137,31 +142,34 @@ namespace re2 bool RE2::Set::Match(const StringPiece &text, std::vector *v, ErrorInfo *error_info) const { - if (!compiled_) { + if (!compiled_) + { LOG(ERROR) << "RE2::Set::Match() called before compiling"; if (error_info != NULL) error_info->kind = kNotCompiled; return false; } - + const char *pat_str = text.data(); size_t length = strlen(pat_str); - if(v == NULL) + if (v == NULL) { - bool result = rure_set_is_match((rure_set *)prog_.get(), - (const uint8_t *)pat_str, length, 0); + bool result = rure_set_is_match((rure_set *)prog_.get(), + (const uint8_t *)pat_str, length, 0); return result; } else - { + { v->clear(); bool matches[elem_.size()]; - bool result = rure_set_matches((rure_set *)prog_.get(), - (const uint8_t *)pat_str, length, 0, matches); - if(!result) return false; - for(size_t i = 0; i < elem_.size(); i++) + bool result = rure_set_matches((rure_set *)prog_.get(), + (const uint8_t *)pat_str, length, 0, matches); + if (!result) + return false; + for (size_t i = 0; i < elem_.size(); i++) { - if(matches[i]) v->push_back(i); + if (matches[i]) + v->push_back(i); } return true; } diff --git a/regex-capi/ctest/test.c b/regex-capi/ctest/test.c index 4aca84abf1238331ca58102d707568fa794349fe..295e9ca3d69af14cf536a1b01d7d6a92b59177eb 100644 --- a/regex-capi/ctest/test.c +++ b/regex-capi/ctest/test.c @@ -7,18 +7,21 @@ #include "regex_capi.h" #ifndef DEBUG - #define DEBUG false +#define DEBUG false #endif -bool test_is_match() { +bool test_is_match() +{ bool passed = true; const char *haystack = "snowman: \xE2\x98\x83"; rure *re = rure_compile_must("\\p{So}$"); bool matched = rure_is_match(re, (const uint8_t *)haystack, strlen(haystack), 0); - if (!matched) { - if (DEBUG) { + if (!matched) + { + if (DEBUG) + { fprintf(stderr, "[test_is_match] expected match, but got no match\n"); } @@ -28,8 +31,8 @@ bool test_is_match() { return passed; } - -bool test_find() { +bool test_find() +{ bool passed = true; const char *haystack = "snowman: \xE2\x98\x83"; @@ -37,16 +40,20 @@ bool test_find() { rure_match match = {0}; bool matched = rure_find(re, (const uint8_t *)haystack, strlen(haystack), 0, &match); - if (!matched) { - if (DEBUG) { + if (!matched) + { + if (DEBUG) + { fprintf(stderr, "[test_find] expected match, but got no match\n"); } passed = false; } size_t expect_start = 9; size_t expect_end = 12; - if (match.start != expect_start || match.end != expect_end) { - if (DEBUG) { + if (match.start != expect_start || match.end != expect_end) + { + if (DEBUG) + { fprintf(stderr, "[test_find] expected match at (%zu, %zu), but " "got match at (%zu, %zu)\n", @@ -58,7 +65,8 @@ bool test_find() { return passed; } -bool test_captures() { +bool test_captures() +{ bool passed = true; const char *haystack = "snowman: \xE2\x98\x83"; @@ -67,8 +75,10 @@ bool test_captures() { rure_captures *caps = rure_captures_new(re); bool matched = rure_find_captures(re, (const uint8_t *)haystack, strlen(haystack), 0, caps); - if (!matched) { - if (DEBUG) { + if (!matched) + { + if (DEBUG) + { fprintf(stderr, "[test_captures] expected match, but got no match\n"); } @@ -76,8 +86,10 @@ bool test_captures() { } size_t expect_captures_len = 3; size_t captures_len = rure_captures_len(caps); - if (captures_len != expect_captures_len) { - if (DEBUG) { + if (captures_len != expect_captures_len) + { + if (DEBUG) + { fprintf(stderr, "[test_captures] " "expected capture group length to be %zd, but " @@ -90,8 +102,10 @@ bool test_captures() { size_t expect_start = 9; size_t expect_end = 12; rure_captures_at(caps, 2, &match); - if (match.start != expect_start || match.end != expect_end) { - if (DEBUG) { + if (match.start != expect_start || match.end != expect_end) + { + if (DEBUG) + { fprintf(stderr, "[test_captures] " "expected capture 2 match at (%zu, %zu), " @@ -106,10 +120,13 @@ done: return passed; } -bool test_iter_capture_name(char *expect, char *given) { +bool test_iter_capture_name(char *expect, char *given) +{ bool passed = true; - if (strcmp(expect, given)) { - if (DEBUG) { + if (strcmp(expect, given)) + { + if (DEBUG) + { fprintf(stderr, "[test_iter_capture_name] expected first capture " "name '%s' got '%s'\n", @@ -120,7 +137,8 @@ bool test_iter_capture_name(char *expect, char *given) { return passed; } -bool test_iter_capture_names() { +bool test_iter_capture_names() +{ bool passed = true; char *name; @@ -129,8 +147,10 @@ bool test_iter_capture_names() { rure_iter_capture_names *it = rure_iter_capture_names_new(re); bool result = rure_iter_capture_names_next(it, &name); - if (!result) { - if (DEBUG) { + if (!result) + { + if (DEBUG) + { fprintf(stderr, "[test_iter_capture_names] expected a second name, " "but got none\n"); @@ -141,19 +161,22 @@ bool test_iter_capture_names() { result = rure_iter_capture_names_next(it, &name); passed = test_iter_capture_name("year", name); - if (!passed) { + if (!passed) + { goto done; } result = rure_iter_capture_names_next(it, &name); passed = test_iter_capture_name("month", name); - if (!passed) { + if (!passed) + { goto done; } result = rure_iter_capture_names_next(it, &name); passed = test_iter_capture_name("day", name); - if (!passed) { + if (!passed) + { goto done; } done: @@ -168,7 +191,8 @@ done: * mode, we can match arbitrary possibly invalid UTF-8 bytes, such as \xFF. * (When Unicode mode is enabled, \xFF won't match .) */ -bool test_flags() { +bool test_flags() +{ bool passed = true; const char *pattern = "."; const char *haystack = "\xFF"; @@ -177,8 +201,10 @@ bool test_flags() { 0, NULL, NULL); bool matched = rure_is_match(re, (const uint8_t *)haystack, strlen(haystack), 0); - if (!matched) { - if (DEBUG) { + if (!matched) + { + if (DEBUG) + { fprintf(stderr, "[test_flags] expected match, but got no match\n"); } passed = false; @@ -187,12 +213,15 @@ bool test_flags() { return passed; } -bool test_compile_error() { +bool test_compile_error() +{ bool passed = true; rure_error *err = rure_error_new(); rure *re = rure_compile((const uint8_t *)"(", 1, 0, NULL, err); - if (re != NULL) { - if (DEBUG) { + if (re != NULL) + { + if (DEBUG) + { fprintf(stderr, "[test_compile_error] " "expected NULL regex pointer, but got non-NULL pointer\n"); @@ -201,12 +230,15 @@ bool test_compile_error() { rure_free(re); } const char *msg = rure_error_message(err); - if (NULL == strstr(msg, "unclosed group")) { - if (DEBUG) { + if (NULL == strstr(msg, "unclosed group")) + { + if (DEBUG) + { fprintf(stderr, "[test_compile_error] " "expected an 'unclosed parenthesis' error message, but " - "got this instead: '%s'\n", msg); + "got this instead: '%s'\n", + msg); } passed = false; } @@ -214,59 +246,63 @@ bool test_compile_error() { return passed; } - -bool test_regex_set_matches() { +bool test_regex_set_matches() +{ #define PAT_COUNT 6 bool passed = true; const char *patterns[] = { - "foo", "barfoo", "\\w+", "\\d+", "foobar", "bar" - }; + "foo", "barfoo", "\\w+", "\\d+", "foobar", "bar"}; const size_t patterns_lengths[] = { - 3, 6, 3, 3, 6, 3 - }; + 3, 6, 3, 3, 6, 3}; rure_error *err = rure_error_new(); - rure_set *re = rure_compile_set((const uint8_t **) patterns, + rure_set *re = rure_compile_set((const uint8_t **)patterns, patterns_lengths, PAT_COUNT, 0, NULL, err); - if (re == NULL) { + if (re == NULL) + { passed = false; goto done2; } - if (rure_set_len(re) != PAT_COUNT) { + if (rure_set_len(re) != PAT_COUNT) + { passed = false; goto done1; } - if (!rure_set_is_match(re, (const uint8_t *) "foobar", 6, 0)) { + if (!rure_set_is_match(re, (const uint8_t *)"foobar", 6, 0)) + { passed = false; goto done1; } - if (rure_set_is_match(re, (const uint8_t *) "", 0, 0)) { + if (rure_set_is_match(re, (const uint8_t *)"", 0, 0)) + { passed = false; goto done1; } bool matches[PAT_COUNT]; - if (!rure_set_matches(re, (const uint8_t *) "foobar", 6, 0, matches)) { + if (!rure_set_matches(re, (const uint8_t *)"foobar", 6, 0, matches)) + { passed = false; goto done1; } const bool match_target[] = { - true, false, true, false, true, true - }; + true, false, true, false, true, true}; int i; - for (i = 0; i < PAT_COUNT; ++i) { - if (matches[i] != match_target[i]) { + for (i = 0; i < PAT_COUNT; ++i) + { + if (matches[i] != match_target[i]) + { passed = false; goto done1; } @@ -281,54 +317,58 @@ done2: #undef PAT_COUNT } -bool test_regex_set_match_start() { +bool test_regex_set_match_start() +{ #define PAT_COUNT 3 bool passed = true; const char *patterns[] = { - "foo", "bar", "fooo" - }; + "foo", "bar", "fooo"}; const size_t patterns_lengths[] = { - 3, 3, 4 - }; + 3, 3, 4}; rure_error *err = rure_error_new(); - rure_set *re = rure_compile_set((const uint8_t **) patterns, + rure_set *re = rure_compile_set((const uint8_t **)patterns, patterns_lengths, PAT_COUNT, 0, NULL, err); - if (re == NULL) { + if (re == NULL) + { passed = false; goto done2; } - if (rure_set_len(re) != PAT_COUNT) { + if (rure_set_len(re) != PAT_COUNT) + { passed = false; goto done1; } - if (rure_set_is_match(re, (const uint8_t *)"foobiasdr", 7, 2)) { + if (rure_set_is_match(re, (const uint8_t *)"foobiasdr", 7, 2)) + { passed = false; goto done1; } { bool matches[PAT_COUNT]; - if (!rure_set_matches(re, (const uint8_t *)"fooobar", 8, 0, matches)) { + if (!rure_set_matches(re, (const uint8_t *)"fooobar", 8, 0, matches)) + { passed = false; goto done1; } const bool match_target[] = { - true, true, true - }; + true, true, true}; int i; - for (i = 0; i < PAT_COUNT; ++i) { - if (matches[i] != match_target[i]) { + for (i = 0; i < PAT_COUNT; ++i) + { + if (matches[i] != match_target[i]) + { passed = false; goto done1; } @@ -337,18 +377,20 @@ bool test_regex_set_match_start() { { bool matches[PAT_COUNT]; - if (!rure_set_matches(re, (const uint8_t *)"fooobar", 7, 1, matches)) { + if (!rure_set_matches(re, (const uint8_t *)"fooobar", 7, 1, matches)) + { passed = false; goto done1; } const bool match_target[] = { - false, true, false - }; + false, true, false}; int i; - for (i = 0; i < PAT_COUNT; ++i) { - if (matches[i] != match_target[i]) { + for (i = 0; i < PAT_COUNT; ++i) + { + if (matches[i] != match_target[i]) + { passed = false; goto done1; } @@ -364,146 +406,159 @@ done2: #undef PAT_COUNT } - -bool test_escape() { +bool test_escape() +{ bool passed = true; const char *pattern = "^[a-z]+.*$"; const char *expected_escaped = "\\^\\[a\\-z\\]\\+\\.\\*\\$"; const char *escaped = rure_escape_must(pattern); - if (!escaped) { - if (DEBUG) { + if (!escaped) + { + if (DEBUG) + { fprintf(stderr, "[test_captures] expected escaped, but got no escaped\n"); } passed = false; - } else if (strcmp(escaped, expected_escaped) != 0) { - if (DEBUG) { + } + else if (strcmp(escaped, expected_escaped) != 0) + { + if (DEBUG) + { fprintf(stderr, "[test_captures] expected \"%s\", but got \"%s\"\n", expected_escaped, escaped); } passed = false; } - rure_cstring_free((char *) escaped); + rure_cstring_free((char *)escaped); return passed; } -bool test_replace_and_replace_all(){ +bool test_replace_and_replace_all() +{ bool passed = true; - typedef struct ReplaceTest { + typedef struct ReplaceTest + { const char *regexp; const char *rewrite; const char *original; const char *single; const char *global; - int greplace_count; - }ReplaceTest; + int greplace_count; + } ReplaceTest; static const ReplaceTest tests[] = { - { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)", - "${2}${1}ay", - "the quick brown fox jumps over the lazy dogs.", - "ethay quick brown fox jumps over the lazy dogs.", - "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.", - 9 }, - { "\\w+", - "${0}-NOSPAM", - "abcd.efghi@google.com", - "abcd-NOSPAM.efghi@google.com", - "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM", - 4 }, - { "^", - "(START)", - "foo", - "(START)foo", - "(START)foo", - 1 }, - { "^", - "(START)", - "", - "(START)", - "(START)", - 1 }, - { "$", - "(END)", - "", - "(END)", - "(END)", - 1 }, - { "b", - "bb", - "ababababab", - "abbabababab", - "abbabbabbabbabb", - 5 }, - { "b", - "bb", - "bbbbbb", - "bbbbbbb", - "bbbbbbbbbbbb", - 6 }, - { "b+", - "bb", - "bbbbbb", - "bb", - "bb", - 1 }, - { "b*", - "bb", - "bbbbbb", - "bb", - "bb", - 1 }, - { "b*", - "bb", - "aaaaa", - "bbaaaaa", - "bbabbabbabbabbabb", - 6 }, - - { "a.*a", - "(${0})", - "aba\naba", - "(aba)\naba", - "(aba)\n(aba)", - 2 }, - { "", NULL, NULL, NULL, NULL, 0 } - }; + {"(qu|[b-df-hj-np-tv-z]*)([a-z]+)", + "${2}${1}ay", + "the quick brown fox jumps over the lazy dogs.", + "ethay quick brown fox jumps over the lazy dogs.", + "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.", + 9}, + {"\\w+", + "${0}-NOSPAM", + "abcd.efghi@google.com", + "abcd-NOSPAM.efghi@google.com", + "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM", + 4}, + {"^", + "(START)", + "foo", + "(START)foo", + "(START)foo", + 1}, + {"^", + "(START)", + "", + "(START)", + "(START)", + 1}, + {"$", + "(END)", + "", + "(END)", + "(END)", + 1}, + {"b", + "bb", + "ababababab", + "abbabababab", + "abbabbabbabbabb", + 5}, + {"b", + "bb", + "bbbbbb", + "bbbbbbb", + "bbbbbbbbbbbb", + 6}, + {"b+", + "bb", + "bbbbbb", + "bb", + "bb", + 1}, + {"b*", + "bb", + "bbbbbb", + "bb", + "bb", + 1}, + {"b*", + "bb", + "aaaaa", + "bbaaaaa", + "bbabbabbabbabbabb", + 6}, + + {"a.*a", + "(${0})", + "aba\naba", + "(aba)\naba", + "(aba)\n(aba)", + 2}, + {"", NULL, NULL, NULL, NULL, 0}}; const char *haystack; const char *rewrite; - const char* regex; + const char *regex; - for (const ReplaceTest* t = tests; t->original != NULL; t++) { + for (const ReplaceTest *t = tests; t->original != NULL; t++) + { haystack = t->original; regex = t->regexp; rewrite = t->rewrite; rure *re = rure_compile_must(regex); const char *replaced_haystack = rure_replace(re, (const uint8_t *)haystack, strlen(haystack), - (const uint8_t *)rewrite, strlen(rewrite)); + (const uint8_t *)rewrite, strlen(rewrite)); const char *replaced_all_haystack = rure_replace_all(re, (const uint8_t *)haystack, strlen(haystack), - (const uint8_t *)rewrite, strlen(rewrite)); + (const uint8_t *)rewrite, strlen(rewrite)); int result1 = strcmp(t->single, replaced_haystack); int result2 = strcmp(t->global, replaced_all_haystack); - if(result1 != 0 && result2 !=0) passed = false; + if (result1 != 0 && result2 != 0) + passed = false; } passed = true; return passed; } -void run_test(bool (test)(), const char *name, bool *passed) { - if (!test()) { +void run_test(bool(test)(), const char *name, bool *passed) +{ + if (!test()) + { *passed = false; fprintf(stderr, "FAILED: %s\n", name); - } else { + } + else + { fprintf(stderr, "PASSED: %s\n", name); } } -int main() { +int main() +{ bool passed = true; run_test(test_is_match, "test_is_match", &passed); @@ -518,7 +573,8 @@ int main() { run_test(test_escape, "test_escape", &passed); run_test(test_replace_and_replace_all, "test_replace_and_replace_all", &passed); - if (!passed) { + if (!passed) + { exit(1); } return 0; diff --git a/regex-capi/include/regex_capi.h b/regex-capi/include/regex_capi.h index 07fc630d3f120ef857281952605fcac8fbd767e2..1ac83f8d9ec1d0111b8f717b619c0eb1110fdae7 100644 --- a/regex-capi/include/regex_capi.h +++ b/regex-capi/include/regex_capi.h @@ -20,33 +20,34 @@ #include #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -/* - * rure is the type of a compiled regular expression. - * - * An rure can be safely used from multiple threads simultaneously. - */ -typedef struct rure rure; - -/* - * rure_set is the type of a set of compiled regular expressions. - * - * A rure can be safely used from multiple threads simultaneously. - */ -typedef struct rure_set rure_set; - -/* - * rure_options is the set of non-flag configuration options for compiling - * a regular expression. Currently, only two options are available: setting - * the size limit of the compiled program and setting the size limit of the - * cache of states that the DFA uses while searching. - * - * For most uses, the default settings will work fine, and NULL can be passed - * wherever a *rure_options is expected. -*/ -typedef struct rure_options rure_options; + /* + * rure is the type of a compiled regular expression. + * + * An rure can be safely used from multiple threads simultaneously. + */ + typedef struct rure rure; + + /* + * rure_set is the type of a set of compiled regular expressions. + * + * A rure can be safely used from multiple threads simultaneously. + */ + typedef struct rure_set rure_set; + + /* + * rure_options is the set of non-flag configuration options for compiling + * a regular expression. Currently, only two options are available: setting + * the size limit of the compiled program and setting the size limit of the + * cache of states that the DFA uses while searching. + * + * For most uses, the default settings will work fine, and NULL can be passed + * wherever a *rure_options is expected. + */ + typedef struct rure_options rure_options; /* * The flags listed below can be used in rure_compile to set the default @@ -69,465 +70,457 @@ typedef struct rure_options rure_options; /* The default set of flags enabled when no flags are set. */ #define RURE_DEFAULT_FLAGS RURE_FLAG_UNICODE -/* - * rure_match corresponds to the location of a single match in a haystack. - */ -typedef struct rure_match { - /* The start position. */ - size_t start; - /* The end position. */ - size_t end; -} rure_match; - -/* - * rure_captures represents storage for sub-capture locations of a match. - * - * Computing the capture groups of a match can carry a significant performance - * penalty, so their use in the API is optional. - * - * An rure_captures value can be reused in multiple calls to rure_find_captures, - * so long as it is used with the compiled regular expression that created - * it. - * - * An rure_captures value may outlive its corresponding rure and can be freed - * independently. - * - * It is not safe to use from multiple threads simultaneously. - */ -typedef struct rure_captures rure_captures; - -/* - * rure_iter is an iterator over successive non-overlapping matches in a - * particular haystack. - * - * An rure_iter value may not outlive its corresponding rure and should be freed - * before its corresponding rure is freed. - * - * It is not safe to use from multiple threads simultaneously. - */ -typedef struct rure_iter rure_iter; - -/* - * rure_iter_capture_names is an iterator over the list of capture group names - * in this particular rure. - * - * An rure_iter_capture_names value may not outlive its corresponding rure, - * and should be freed before its corresponding rure is freed. - * - * It is not safe to use from multiple threads simultaneously. - */ -typedef struct rure_iter_capture_names rure_iter_capture_names; - -/* - * rure_error is an error that caused compilation to fail. - * - * Most errors are syntax errors but an error can be returned if the compiled - * regular expression would be too big. - * - * Whenever a function accepts an *rure_error, it is safe to pass NULL. (But - * you will not get access to the error if one occurred.) - * - * It is not safe to use from multiple threads simultaneously. - */ -typedef struct rure_error rure_error; - -typedef struct -{ - char *atom; -} Atoms; - - -typedef struct -{ - Atoms *data; - int32_t len; -} MyVec; - -/* - * rure_compile_must compiles the given pattern into a regular expression. If - * compilation fails for any reason, an error message is printed to stderr and - * the process is aborted. - * - * The pattern given should be in UTF-8. For convenience, this accepts a C - * string, which means the pattern cannot usefully contain NUL. If your pattern - * may contain NUL, consider using a regular expression escape sequence, or - * just use rure_compile. - * - * This uses RURE_DEFAULT_FLAGS. - * - * The compiled expression returned may be used from multiple threads - * simultaneously. - */ -rure *rure_compile_must(const char *pattern); - -/* - * rure_compile compiles the given pattern into a regular expression. The - * pattern must be valid UTF-8 and the length corresponds to the number of - * bytes in the pattern. - * - * flags is a bitfield. Valid values are constants declared with prefix - * RURE_FLAG_. - * - * options contains non-flag configuration settings. If it's NULL, default - * settings are used. options may be freed immediately after a call to - * rure_compile. - * - * error is set if there was a problem compiling the pattern (including if the - * pattern is not valid UTF-8). If error is NULL, then no error information - * is returned. In all cases, if an error occurs, NULL is returned. - * - * The compiled expression returned may be used from multiple threads - * simultaneously. - */ -rure *rure_compile(const uint8_t *pattern, size_t length, - uint32_t flags, rure_options *options, - rure_error *error); - -/* - * rure_free frees the given compiled regular expression. - * - * This must be called at most once for any rure. - */ -void rure_free(rure *re); - -/* - * rure_is_match returns true if and only if re matches anywhere in haystack. - * - * haystack may contain arbitrary bytes, but ASCII compatible text is more - * useful. UTF-8 is even more useful. Other text encodings aren't supported. - * length should be the number of bytes in haystack. - * - * start is the position at which to start searching. Note that setting the - * start position is distinct from incrementing the pointer, since the regex - * engine may look at bytes before the start position to determine match - * information. For example, if the start position is greater than 0, then the - * \A ("begin text") anchor can never match. - * - * rure_is_match should be preferred to rure_find since it may be faster. - * - * N.B. The performance of this search is not impacted by the presence of - * capturing groups in your regular expression. - */ -bool rure_is_match(rure *re, const uint8_t *haystack, size_t length, - size_t start); - -/* - * rure_find returns true if and only if re matches anywhere in haystack. - * If a match is found, then its start and end offsets (in bytes) are set - * on the match pointer given. - * - * haystack may contain arbitrary bytes, but ASCII compatible text is more - * useful. UTF-8 is even more useful. Other text encodings aren't supported. - * length should be the number of bytes in haystack. - * - * start is the position at which to start searching. Note that setting the - * start position is distinct from incrementing the pointer, since the regex - * engine may look at bytes before the start position to determine match - * information. For example, if the start position is greater than 0, then the - * \A ("begin text") anchor can never match. - * - * rure_find should be preferred to rure_find_captures since it may be faster. - * - * N.B. The performance of this search is not impacted by the presence of - * capturing groups in your regular expression. - */ -bool rure_find(rure *re, const uint8_t *haystack, size_t length, - size_t start, rure_match *match); - -/* - * rure_find_captures returns true if and only if re matches anywhere in - * haystack. If a match is found, then all of its capture locations are stored - * in the captures pointer given. - * - * haystack may contain arbitrary bytes, but ASCII compatible text is more - * useful. UTF-8 is even more useful. Other text encodings aren't supported. - * length should be the number of bytes in haystack. - * - * start is the position at which to start searching. Note that setting the - * start position is distinct from incrementing the pointer, since the regex - * engine may look at bytes before the start position to determine match - * information. For example, if the start position is greater than 0, then the - * \A ("begin text") anchor can never match. - * - * Only use this function if you specifically need access to capture locations. - * It is not necessary to use this function just because your regular - * expression contains capturing groups. - * - * Capture locations can be accessed using the rure_captures_* functions. - * - * N.B. The performance of this search can be impacted by the number of - * capturing groups. If you're using this function, it may be beneficial to - * use non-capturing groups (e.g., `(?:re)`) where possible. - */ -bool rure_find_captures(rure *re, const uint8_t *haystack, size_t length, - size_t start, rure_captures *captures); - - - -/* - * rure_iter_capture_names_new creates a new capture_names iterator. - * - * An iterator will report all successive capture group names of re. - */ -rure_iter_capture_names *rure_iter_capture_names_new(rure *re); - -/* - * rure_iter_capture_names_free frees the iterator given. - * - * It must be called at most once. - */ -void rure_iter_capture_names_free(rure_iter_capture_names *it); - -/* - * rure_iter_capture_names_next advances the iterator and returns true - * if and only if another capture group name exists. - * - * The value of the capture group name is written to the provided pointer. - */ -bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name); - - -/* - * rure_iter_free frees the iterator given. - * - * It must be called at most once. - */ -void rure_iter_free(rure_iter *it); - - - -/* - * rure_captures_new allocates storage for all capturing groups in re. - * - * An rure_captures value may be reused on subsequent calls to - * rure_find_captures or rure_iter_next_captures. - * - * An rure_captures value may be freed independently of re, although any - * particular rure_captures should be used only with the re given here. - * - * It is not safe to use an rure_captures value from multiple threads - * simultaneously. - */ -rure_captures *rure_captures_new(rure *re); - -/* - * rure_captures_free frees the given captures. - * - * This must be called at most once. - */ -void rure_captures_free(rure_captures *captures); - -/* - * rure_captures_at returns true if and only if the capturing group at the - * index given was part of a match. If so, the given match pointer is populated - * with the start and end location (in bytes) of the capturing group. - * - * If no capture group with the index i exists, then false is - * returned. (A capturing group exists if and only if i is less than - * rure_captures_len(captures).) - * - * Note that index 0 corresponds to the full match. - */ -bool rure_captures_at(rure_captures *captures, size_t i, rure_match *match); - -/* - * rure_captures_len returns the number of capturing groups in the given - * captures. - */ -size_t rure_captures_len(rure_captures *captures); - - - -/* - * rure_compile_set compiles the given list of patterns into a single regular - * expression which can be matched in a linear-scan. Each pattern in patterns - * must be valid UTF-8 and the length of each pattern in patterns corresponds - * to a byte length in patterns_lengths. - * - * The number of patterns to compile is specified by patterns_count. patterns - * must contain at least this many entries. - * - * flags is a bitfield. Valid values are constants declared with prefix - * RURE_FLAG_. - * - * options contains non-flag configuration settings. If it's NULL, default - * settings are used. options may be freed immediately after a call to - * rure_compile. - * - * error is set if there was a problem compiling the pattern. - * - * The compiled expression set returned may be used from multiple threads. - */ -rure_set *rure_compile_set(const uint8_t **patterns, - const size_t *patterns_lengths, - size_t patterns_count, - uint32_t flags, - rure_options *options, - rure_error *error); - -/* - * rure_set_free frees the given compiled regular expression set. - * - * This must be called at most once for any rure_set. - */ -void rure_set_free(rure_set *re); - -/* - * rure_is_match returns true if and only if any regexes within the set - * match anywhere in the haystack. Once a match has been located, the - * matching engine will quit immediately. - * - * haystack may contain arbitrary bytes, but ASCII compatible text is more - * useful. UTF-8 is even more useful. Other text encodings aren't supported. - * length should be the number of bytes in haystack. - * - * start is the position at which to start searching. Note that setting the - * start position is distinct from incrementing the pointer, since the regex - * engine may look at bytes before the start position to determine match - * information. For example, if the start position is greater than 0, then the - * \A ("begin text") anchor can never match. - */ -bool rure_set_is_match(rure_set *re, const uint8_t *haystack, size_t length, + /* + * rure_match corresponds to the location of a single match in a haystack. + */ + typedef struct rure_match + { + /* The start position. */ + size_t start; + /* The end position. */ + size_t end; + } rure_match; + + /* + * rure_captures represents storage for sub-capture locations of a match. + * + * Computing the capture groups of a match can carry a significant performance + * penalty, so their use in the API is optional. + * + * An rure_captures value can be reused in multiple calls to rure_find_captures, + * so long as it is used with the compiled regular expression that created + * it. + * + * An rure_captures value may outlive its corresponding rure and can be freed + * independently. + * + * It is not safe to use from multiple threads simultaneously. + */ + typedef struct rure_captures rure_captures; + + /* + * rure_iter is an iterator over successive non-overlapping matches in a + * particular haystack. + * + * An rure_iter value may not outlive its corresponding rure and should be freed + * before its corresponding rure is freed. + * + * It is not safe to use from multiple threads simultaneously. + */ + typedef struct rure_iter rure_iter; + + /* + * rure_iter_capture_names is an iterator over the list of capture group names + * in this particular rure. + * + * An rure_iter_capture_names value may not outlive its corresponding rure, + * and should be freed before its corresponding rure is freed. + * + * It is not safe to use from multiple threads simultaneously. + */ + typedef struct rure_iter_capture_names rure_iter_capture_names; + + /* + * rure_error is an error that caused compilation to fail. + * + * Most errors are syntax errors but an error can be returned if the compiled + * regular expression would be too big. + * + * Whenever a function accepts an *rure_error, it is safe to pass NULL. (But + * you will not get access to the error if one occurred.) + * + * It is not safe to use from multiple threads simultaneously. + */ + typedef struct rure_error rure_error; + + typedef struct + { + char *atom; + } Atoms; + + typedef struct + { + Atoms *data; + int32_t len; + } MyVec; + + /* + * rure_compile_must compiles the given pattern into a regular expression. If + * compilation fails for any reason, an error message is printed to stderr and + * the process is aborted. + * + * The pattern given should be in UTF-8. For convenience, this accepts a C + * string, which means the pattern cannot usefully contain NUL. If your pattern + * may contain NUL, consider using a regular expression escape sequence, or + * just use rure_compile. + * + * This uses RURE_DEFAULT_FLAGS. + * + * The compiled expression returned may be used from multiple threads + * simultaneously. + */ + rure *rure_compile_must(const char *pattern); + + /* + * rure_compile compiles the given pattern into a regular expression. The + * pattern must be valid UTF-8 and the length corresponds to the number of + * bytes in the pattern. + * + * flags is a bitfield. Valid values are constants declared with prefix + * RURE_FLAG_. + * + * options contains non-flag configuration settings. If it's NULL, default + * settings are used. options may be freed immediately after a call to + * rure_compile. + * + * error is set if there was a problem compiling the pattern (including if the + * pattern is not valid UTF-8). If error is NULL, then no error information + * is returned. In all cases, if an error occurs, NULL is returned. + * + * The compiled expression returned may be used from multiple threads + * simultaneously. + */ + rure *rure_compile(const uint8_t *pattern, size_t length, + uint32_t flags, rure_options *options, + rure_error *error); + + /* + * rure_free frees the given compiled regular expression. + * + * This must be called at most once for any rure. + */ + void rure_free(rure *re); + + /* + * rure_is_match returns true if and only if re matches anywhere in haystack. + * + * haystack may contain arbitrary bytes, but ASCII compatible text is more + * useful. UTF-8 is even more useful. Other text encodings aren't supported. + * length should be the number of bytes in haystack. + * + * start is the position at which to start searching. Note that setting the + * start position is distinct from incrementing the pointer, since the regex + * engine may look at bytes before the start position to determine match + * information. For example, if the start position is greater than 0, then the + * \A ("begin text") anchor can never match. + * + * rure_is_match should be preferred to rure_find since it may be faster. + * + * N.B. The performance of this search is not impacted by the presence of + * capturing groups in your regular expression. + */ + bool rure_is_match(rure *re, const uint8_t *haystack, size_t length, size_t start); -/* - * rure_set_matches compares each regex in the set against the haystack and - * modifies matches with the match result of each pattern. Match results are - * ordered in the same way as the rure_set was compiled. For example, - * index 0 of matches corresponds to the first pattern passed to - * `rure_compile_set`. - * - * haystack may contain arbitrary bytes, but ASCII compatible text is more - * useful. UTF-8 is even more useful. Other text encodings aren't supported. - * length should be the number of bytes in haystack. - * - * start is the position at which to start searching. Note that setting the - * start position is distinct from incrementing the pointer, since the regex - * engine may look at bytes before the start position to determine match - * information. For example, if the start position is greater than 0, then the - * \A ("begin text") anchor can never match. - * - * matches must be greater than or equal to the number of patterns the - * rure_set was compiled with. - * - * Only use this function if you specifically need to know which regexes - * matched within the set. To determine if any of the regexes matched without - * caring which, use rure_set_is_match. - */ -bool rure_set_matches(rure_set *re, const uint8_t *haystack, size_t length, - size_t start, bool *matches); - -/* - * rure_set_len returns the number of patterns rure_set was compiled with. - */ -size_t rure_set_len(rure_set *re); - -/* - * rure_error_new allocates space for an error. - * - * If error information is desired, then rure_error_new should be called - * to create an rure_error pointer, and that pointer can be passed to - * rure_compile. If an error occurred, then rure_compile will return NULL and - * the error pointer will be set. A message can then be extracted. - * - * It is not safe to use errors from multiple threads simultaneously. An error - * value may be reused on subsequent calls to rure_compile. - */ -rure_error *rure_error_new(); - -/* - * rure_error_free frees the error given. - * - * This must be called at most once. - */ -void rure_error_free(rure_error *err); - -/* - * rure_error_message returns a NUL terminated string that describes the error - * message. - * - * The pointer returned must not be freed. Instead, it will be freed when - * rure_error_free is called. If err is used in subsequent calls to - * rure_compile, then this pointer may change or become invalid. - */ -const char *rure_error_message(rure_error *err); - -/* - * rure_escape_must returns a NUL terminated string where all meta characters - * have been escaped. If escaping fails for any reason, an error message is - * printed to stderr and the process is aborted. - * - * The pattern given should be in UTF-8. For convenience, this accepts a C - * string, which means the pattern cannot contain a NUL byte. These correspond - * to the only two failure conditions of this function. That is, if the caller - * guarantees that the given pattern is valid UTF-8 and does not contain a - * NUL byte, then this is guaranteed to succeed (modulo out-of-memory errors). - * - * The pointer returned must not be freed directly. Instead, it should be freed - * by calling rure_cstring_free. - */ -const char *rure_escape_must(const char *pattern); - -/* - * rure_cstring_free frees the string given. - * - * This must be called at most once per string. - */ -void rure_cstring_free(char *s); - -/* - * rure_replace replaces the leftmost-first match with the rewrite provided. - * - * The rewrite can be a regular string (where `$N` and `$name` are - * expanded to match capture groups) or a function that takes the matches' - * `Captures` and returns the replaced string. - * - * The longest possible name is used. e.g., `$1a` looks up the capture - * group named `1a` and not the capture group at index `1`. To exert more - * precise control over the name, use braces, e.g., `${1}a`. - * - * If no match is found, then a copy of the string is returned unchanged. - * - */ -const char *rure_replace(rure *re, const uint8_t *haystack, size_t len_h, - const uint8_t *rewrite, size_t len_r); - -/* - * This like the previous function rure_replace, but is has different. - * rure_replace_all replaces all non-overlapping matches in `text` with the rewrite provided. - * - * If no match is found, then a copy of the string is returned unchanged. - */ -const char *rure_replace_all(rure *re, const uint8_t *haystack, size_t len_h, - const uint8_t *rewrite, size_t len_r); - -/* - * Simple way to use regex - */ - -rure *rure_new(const uint8_t *pattern, size_t length); -bool rure_consume(rure *re, const uint8_t *haystack, size_t length, rure_match *match); -int rure_max_submatch(const char *rewrite); -bool rure_check_rewrite_string(const char *rewrite, int max_token); - - -/* - * Convert RE2 style rewrite string to a string that Rust can accept -*/ -const char *rure_rewrite_str_convert(const uint8_t *rewrite, size_t len); - -/* - * Similar to Rewrite function in RE2. -*/ -const char *rure_rewrite(const uint8_t *rewrite, size_t len, const uint8_t **vecs, - const size_t *vecs_lengths, size_t vecs_count); - -/* - * Calculate the number of replacements. -*/ -size_t rure_replace_count(rure *re, const char *haystack); - -MyVec rure_filter_compile(const uint8_t *regex_str, size_t regex_len, size_t min_atoms_len); + /* + * rure_find returns true if and only if re matches anywhere in haystack. + * If a match is found, then its start and end offsets (in bytes) are set + * on the match pointer given. + * + * haystack may contain arbitrary bytes, but ASCII compatible text is more + * useful. UTF-8 is even more useful. Other text encodings aren't supported. + * length should be the number of bytes in haystack. + * + * start is the position at which to start searching. Note that setting the + * start position is distinct from incrementing the pointer, since the regex + * engine may look at bytes before the start position to determine match + * information. For example, if the start position is greater than 0, then the + * \A ("begin text") anchor can never match. + * + * rure_find should be preferred to rure_find_captures since it may be faster. + * + * N.B. The performance of this search is not impacted by the presence of + * capturing groups in your regular expression. + */ + bool rure_find(rure *re, const uint8_t *haystack, size_t length, + size_t start, rure_match *match); + + /* + * rure_find_captures returns true if and only if re matches anywhere in + * haystack. If a match is found, then all of its capture locations are stored + * in the captures pointer given. + * + * haystack may contain arbitrary bytes, but ASCII compatible text is more + * useful. UTF-8 is even more useful. Other text encodings aren't supported. + * length should be the number of bytes in haystack. + * + * start is the position at which to start searching. Note that setting the + * start position is distinct from incrementing the pointer, since the regex + * engine may look at bytes before the start position to determine match + * information. For example, if the start position is greater than 0, then the + * \A ("begin text") anchor can never match. + * + * Only use this function if you specifically need access to capture locations. + * It is not necessary to use this function just because your regular + * expression contains capturing groups. + * + * Capture locations can be accessed using the rure_captures_* functions. + * + * N.B. The performance of this search can be impacted by the number of + * capturing groups. If you're using this function, it may be beneficial to + * use non-capturing groups (e.g., `(?:re)`) where possible. + */ + bool rure_find_captures(rure *re, const uint8_t *haystack, size_t length, + size_t start, rure_captures *captures); + + /* + * rure_iter_capture_names_new creates a new capture_names iterator. + * + * An iterator will report all successive capture group names of re. + */ + rure_iter_capture_names *rure_iter_capture_names_new(rure *re); + + /* + * rure_iter_capture_names_free frees the iterator given. + * + * It must be called at most once. + */ + void rure_iter_capture_names_free(rure_iter_capture_names *it); + + /* + * rure_iter_capture_names_next advances the iterator and returns true + * if and only if another capture group name exists. + * + * The value of the capture group name is written to the provided pointer. + */ + bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name); + + /* + * rure_iter_free frees the iterator given. + * + * It must be called at most once. + */ + void rure_iter_free(rure_iter *it); + + /* + * rure_captures_new allocates storage for all capturing groups in re. + * + * An rure_captures value may be reused on subsequent calls to + * rure_find_captures or rure_iter_next_captures. + * + * An rure_captures value may be freed independently of re, although any + * particular rure_captures should be used only with the re given here. + * + * It is not safe to use an rure_captures value from multiple threads + * simultaneously. + */ + rure_captures *rure_captures_new(rure *re); + + /* + * rure_captures_free frees the given captures. + * + * This must be called at most once. + */ + void rure_captures_free(rure_captures *captures); + + /* + * rure_captures_at returns true if and only if the capturing group at the + * index given was part of a match. If so, the given match pointer is populated + * with the start and end location (in bytes) of the capturing group. + * + * If no capture group with the index i exists, then false is + * returned. (A capturing group exists if and only if i is less than + * rure_captures_len(captures).) + * + * Note that index 0 corresponds to the full match. + */ + bool rure_captures_at(rure_captures *captures, size_t i, rure_match *match); + + /* + * rure_captures_len returns the number of capturing groups in the given + * captures. + */ + size_t rure_captures_len(rure_captures *captures); + + /* + * rure_compile_set compiles the given list of patterns into a single regular + * expression which can be matched in a linear-scan. Each pattern in patterns + * must be valid UTF-8 and the length of each pattern in patterns corresponds + * to a byte length in patterns_lengths. + * + * The number of patterns to compile is specified by patterns_count. patterns + * must contain at least this many entries. + * + * flags is a bitfield. Valid values are constants declared with prefix + * RURE_FLAG_. + * + * options contains non-flag configuration settings. If it's NULL, default + * settings are used. options may be freed immediately after a call to + * rure_compile. + * + * error is set if there was a problem compiling the pattern. + * + * The compiled expression set returned may be used from multiple threads. + */ + rure_set *rure_compile_set(const uint8_t **patterns, + const size_t *patterns_lengths, + size_t patterns_count, + uint32_t flags, + rure_options *options, + rure_error *error); + + /* + * rure_set_free frees the given compiled regular expression set. + * + * This must be called at most once for any rure_set. + */ + void rure_set_free(rure_set *re); + + /* + * rure_is_match returns true if and only if any regexes within the set + * match anywhere in the haystack. Once a match has been located, the + * matching engine will quit immediately. + * + * haystack may contain arbitrary bytes, but ASCII compatible text is more + * useful. UTF-8 is even more useful. Other text encodings aren't supported. + * length should be the number of bytes in haystack. + * + * start is the position at which to start searching. Note that setting the + * start position is distinct from incrementing the pointer, since the regex + * engine may look at bytes before the start position to determine match + * information. For example, if the start position is greater than 0, then the + * \A ("begin text") anchor can never match. + */ + bool rure_set_is_match(rure_set *re, const uint8_t *haystack, size_t length, + size_t start); + + /* + * rure_set_matches compares each regex in the set against the haystack and + * modifies matches with the match result of each pattern. Match results are + * ordered in the same way as the rure_set was compiled. For example, + * index 0 of matches corresponds to the first pattern passed to + * `rure_compile_set`. + * + * haystack may contain arbitrary bytes, but ASCII compatible text is more + * useful. UTF-8 is even more useful. Other text encodings aren't supported. + * length should be the number of bytes in haystack. + * + * start is the position at which to start searching. Note that setting the + * start position is distinct from incrementing the pointer, since the regex + * engine may look at bytes before the start position to determine match + * information. For example, if the start position is greater than 0, then the + * \A ("begin text") anchor can never match. + * + * matches must be greater than or equal to the number of patterns the + * rure_set was compiled with. + * + * Only use this function if you specifically need to know which regexes + * matched within the set. To determine if any of the regexes matched without + * caring which, use rure_set_is_match. + */ + bool rure_set_matches(rure_set *re, const uint8_t *haystack, size_t length, + size_t start, bool *matches); + + /* + * rure_set_len returns the number of patterns rure_set was compiled with. + */ + size_t rure_set_len(rure_set *re); + + /* + * rure_error_new allocates space for an error. + * + * If error information is desired, then rure_error_new should be called + * to create an rure_error pointer, and that pointer can be passed to + * rure_compile. If an error occurred, then rure_compile will return NULL and + * the error pointer will be set. A message can then be extracted. + * + * It is not safe to use errors from multiple threads simultaneously. An error + * value may be reused on subsequent calls to rure_compile. + */ + rure_error *rure_error_new(); + + /* + * rure_error_free frees the error given. + * + * This must be called at most once. + */ + void rure_error_free(rure_error *err); + + /* + * rure_error_message returns a NUL terminated string that describes the error + * message. + * + * The pointer returned must not be freed. Instead, it will be freed when + * rure_error_free is called. If err is used in subsequent calls to + * rure_compile, then this pointer may change or become invalid. + */ + const char *rure_error_message(rure_error *err); + + /* + * rure_escape_must returns a NUL terminated string where all meta characters + * have been escaped. If escaping fails for any reason, an error message is + * printed to stderr and the process is aborted. + * + * The pattern given should be in UTF-8. For convenience, this accepts a C + * string, which means the pattern cannot contain a NUL byte. These correspond + * to the only two failure conditions of this function. That is, if the caller + * guarantees that the given pattern is valid UTF-8 and does not contain a + * NUL byte, then this is guaranteed to succeed (modulo out-of-memory errors). + * + * The pointer returned must not be freed directly. Instead, it should be freed + * by calling rure_cstring_free. + */ + const char *rure_escape_must(const char *pattern); + + /* + * rure_cstring_free frees the string given. + * + * This must be called at most once per string. + */ + void rure_cstring_free(char *s); + + /* + * rure_replace replaces the leftmost-first match with the rewrite provided. + * + * The rewrite can be a regular string (where `$N` and `$name` are + * expanded to match capture groups) or a function that takes the matches' + * `Captures` and returns the replaced string. + * + * The longest possible name is used. e.g., `$1a` looks up the capture + * group named `1a` and not the capture group at index `1`. To exert more + * precise control over the name, use braces, e.g., `${1}a`. + * + * If no match is found, then a copy of the string is returned unchanged. + * + */ + const char *rure_replace(rure *re, const uint8_t *haystack, size_t len_h, + const uint8_t *rewrite, size_t len_r); + + /* + * This like the previous function rure_replace, but is has different. + * rure_replace_all replaces all non-overlapping matches in `text` with the rewrite provided. + * + * If no match is found, then a copy of the string is returned unchanged. + */ + const char *rure_replace_all(rure *re, const uint8_t *haystack, size_t len_h, + const uint8_t *rewrite, size_t len_r); + + /* + * Simple way to use regex + */ + + rure *rure_new(const uint8_t *pattern, size_t length); + bool rure_consume(rure *re, const uint8_t *haystack, size_t length, rure_match *match); + int rure_max_submatch(const char *rewrite); + bool rure_check_rewrite_string(const char *rewrite, int max_token); + + /* + * Convert RE2 style rewrite string to a string that Rust can accept + */ + const char *rure_rewrite_str_convert(const uint8_t *rewrite, size_t len); + + /* + * Similar to Rewrite function in RE2. + */ + const char *rure_rewrite(const uint8_t *rewrite, size_t len, const uint8_t **vecs, + const size_t *vecs_lengths, size_t vecs_count); + + /* + * Calculate the number of replacements. + */ + size_t rure_replace_count(rure *re, const char *haystack); + + MyVec rure_filter_compile(const uint8_t *regex_str, size_t regex_len, size_t min_atoms_len); #ifdef __cplusplus } diff --git a/regex-capi/src/error.rs b/regex-capi/src/error.rs index bf15e22b2c251f025ccd40a494663b6efddeeac6..413b2a8327219129ab43adf2932837deb14a3826 100644 --- a/regex-capi/src/error.rs +++ b/regex-capi/src/error.rs @@ -53,7 +53,9 @@ extern "C" fn rure_error_new() -> *mut Error { #[no_mangle] extern "C" fn rure_error_free(err: *mut Error) { - unsafe { drop(Box::from_raw(err)); } + unsafe { + drop(Box::from_raw(err)); + } } #[no_mangle] diff --git a/regex-capi/src/lib.rs b/regex-capi/src/lib.rs index d7b77990aba5ec641f31958a5295ed2c3ae964f2..7a269d1822fc5d6617b19521d08e49ca82587799 100644 --- a/regex-capi/src/lib.rs +++ b/regex-capi/src/lib.rs @@ -12,142 +12,142 @@ * Create: 2022-11-25 * Description: Rure is a C API to Rust's regex library. ******************************************************************************/ - #[macro_use] - mod error; - pub use crate::error::*; - - use std::ffi::{CStr, CString}; - use std::ops::Deref; - use std::ptr; - use std::slice; - use std::str; - - use libc::{c_char, size_t}; - - use regex::{bytes, Regex}; - - use crate::error::{Error, ErrorKind}; - use std::io; - use std::io::Write; - use std::process::abort; - - include!("lib_internal.rs"); - - const RURE_FLAG_CASEI: u32 = 1 << 0; - const RURE_FLAG_MULTI: u32 = 1 << 1; - const RURE_FLAG_DOTNL: u32 = 1 << 2; - const RURE_FLAG_SWAP_GREED: u32 = 1 << 3; - const RURE_FLAG_SPACE: u32 = 1 << 4; - const RURE_FLAG_UNICODE: u32 = 1 << 5; - const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE; - - pub struct RegexBytes { - re: bytes::Regex, - // capture_names: HashMap, - } - - pub struct RegexUnicode { - re: Regex, - } - - pub struct Options { - size_limit: usize, - dfa_size_limit: usize, - } - - // The `RegexSet` is not exposed with option support or matching at an - // arbitrary position with a crate just yet. To circumvent this, we use - // the `Exec` structure directly. - pub struct RegexSet { - re: bytes::RegexSet, - } - - #[repr(C)] - pub struct rure_match { - pub start: size_t, - pub end: size_t, - } - - pub struct Captures(bytes::Locations); - - pub struct IterCaptureNames { - capture_names: bytes::CaptureNames<'static>, - name_ptrs: Vec<*mut c_char>, - } - - #[repr(C)] - pub struct Atoms { - atom: *mut c_char, - } - - #[repr(C)] - pub struct MyVec { - data: *mut Atoms, - len: i32, - } - - impl Deref for RegexBytes { - type Target = bytes::Regex; - fn deref(&self) -> &bytes::Regex { - &self.re - } - } - - impl Deref for RegexUnicode { - type Target = Regex; - fn deref(&self) -> &Regex { - &self.re - } - } - - impl Deref for RegexSet { - type Target = bytes::RegexSet; - fn deref(&self) -> &bytes::RegexSet { - &self.re - } - } - - impl Default for Options { - fn default() -> Options { - Options { - size_limit: 10 * (1 << 20), - dfa_size_limit: 2 * (1 << 20), - } - } - } - - #[no_mangle] - extern "C" fn rure_compile_must(pattern: *const c_char) -> *const RegexBytes { - let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; - let pat = pattern as *const u8; - let mut err = Error::new(ErrorKind::None); - let re = rure_compile(pat, len, RURE_DEFAULT_FLAGS, ptr::null(), &mut err); - if err.is_err() { - let _ = writeln!(&mut io::stderr(), "{}", err); - let _ = writeln!(&mut io::stderr(), "aborting from rure_compile_must"); - abort() - } - re - } - - #[no_mangle] - extern "C" fn rure_compile( - pattern: *const u8, - length: size_t, - flags: u32, - options: *const Options, - error: *mut Error, - ) -> *const RegexBytes { - let pat = unsafe { slice::from_raw_parts(pattern, length) }; - let pat = match str::from_utf8(pat) { - Ok(pat) => pat, - Err(err) => unsafe { - if !error.is_null() { - *error = Error::new(ErrorKind::Str(err)); - } - return ptr::null(); - }, - }; +#[macro_use] +mod error; +pub use crate::error::*; + +use std::ffi::{CStr, CString}; +use std::ops::Deref; +use std::ptr; +use std::slice; +use std::str; + +use libc::{c_char, size_t}; + +use regex::{bytes, Regex}; + +use crate::error::{Error, ErrorKind}; +use std::io; +use std::io::Write; +use std::process::abort; + +include!("lib_internal.rs"); + +const RURE_FLAG_CASEI: u32 = 1 << 0; +const RURE_FLAG_MULTI: u32 = 1 << 1; +const RURE_FLAG_DOTNL: u32 = 1 << 2; +const RURE_FLAG_SWAP_GREED: u32 = 1 << 3; +const RURE_FLAG_SPACE: u32 = 1 << 4; +const RURE_FLAG_UNICODE: u32 = 1 << 5; +const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE; + +pub struct RegexBytes { + re: bytes::Regex, + // capture_names: HashMap, +} + +pub struct RegexUnicode { + re: Regex, +} + +pub struct Options { + size_limit: usize, + dfa_size_limit: usize, +} + +// The `RegexSet` is not exposed with option support or matching at an +// arbitrary position with a crate just yet. To circumvent this, we use +// the `Exec` structure directly. +pub struct RegexSet { + re: bytes::RegexSet, +} + +#[repr(C)] +pub struct rure_match { + pub start: size_t, + pub end: size_t, +} + +pub struct Captures(bytes::Locations); + +pub struct IterCaptureNames { + capture_names: bytes::CaptureNames<'static>, + name_ptrs: Vec<*mut c_char>, +} + +#[repr(C)] +pub struct Atoms { + atom: *mut c_char, +} + +#[repr(C)] +pub struct MyVec { + data: *mut Atoms, + len: i32, +} + +impl Deref for RegexBytes { + type Target = bytes::Regex; + fn deref(&self) -> &bytes::Regex { + &self.re + } +} + +impl Deref for RegexUnicode { + type Target = Regex; + fn deref(&self) -> &Regex { + &self.re + } +} + +impl Deref for RegexSet { + type Target = bytes::RegexSet; + fn deref(&self) -> &bytes::RegexSet { + &self.re + } +} + +impl Default for Options { + fn default() -> Options { + Options { + size_limit: 10 * (1 << 20), + dfa_size_limit: 2 * (1 << 20), + } + } +} + +#[no_mangle] +extern "C" fn rure_compile_must(pattern: *const c_char) -> *const RegexBytes { + let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; + let pat = pattern as *const u8; + let mut err = Error::new(ErrorKind::None); + let re = rure_compile(pat, len, RURE_DEFAULT_FLAGS, ptr::null(), &mut err); + if err.is_err() { + let _ = writeln!(&mut io::stderr(), "{}", err); + let _ = writeln!(&mut io::stderr(), "aborting from rure_compile_must"); + abort() + } + re +} + +#[no_mangle] +extern "C" fn rure_compile( + pattern: *const u8, + length: size_t, + flags: u32, + options: *const Options, + error: *mut Error, +) -> *const RegexBytes { + let pat = unsafe { slice::from_raw_parts(pattern, length) }; + let pat = match str::from_utf8(pat) { + Ok(pat) => pat, + Err(err) => unsafe { + if !error.is_null() { + *error = Error::new(ErrorKind::Str(err)); + } + return ptr::null(); + }, + }; let mut builder = rure_compile_internal(pat, flags); if !options.is_null() { @@ -160,7 +160,7 @@ Ok(re) => { let re = RegexBytes { re }; Box::into_raw(Box::new(re)) - }, + } Err(err) => unsafe { if !error.is_null() { *error = Error::new(ErrorKind::Regex(err)); @@ -168,135 +168,135 @@ ptr::null() }, } - } - - #[no_mangle] - extern "C" fn rure_free(re: *const RegexBytes) { - unsafe { - drop(Box::from_raw(re as *mut Regex)); - } - } - - #[no_mangle] - extern "C" fn rure_is_match( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, - _start: size_t, - ) -> bool { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - re.is_match(haystack) - } - - #[no_mangle] - extern "C" fn rure_find( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, - start: size_t, - match_info: *mut rure_match, - ) -> bool { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - re.find_at(haystack, start) - .map(|m| unsafe { - if !match_info.is_null() { - (*match_info).start = m.start(); - (*match_info).end = m.end(); - } - }) - .is_some() - } - - #[no_mangle] - extern "C" fn rure_find_captures( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, - start: size_t, - captures: *mut Captures, - ) -> bool { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - let slots = unsafe { &mut (*captures).0 }; - re.read_captures_at(slots, haystack, start).is_some() - } - - #[no_mangle] - extern "C" fn rure_iter_capture_names_new(re: *const RegexBytes) -> *mut IterCaptureNames { - let re = unsafe { &*re }; - Box::into_raw(Box::new(IterCaptureNames { - capture_names: re.re.capture_names(), - name_ptrs: Vec::new(), - })) - } - - #[no_mangle] - extern "C" fn rure_iter_capture_names_free(it: *mut IterCaptureNames) { - unsafe { - let it = &mut *it; - while let Some(ptr) = it.name_ptrs.pop() { - drop(CString::from_raw(ptr)); - } - drop(Box::from_raw(it)); - } - } - - #[no_mangle] - extern "C" fn rure_iter_capture_names_next( - it: *mut IterCaptureNames, - capture_name: *mut *mut c_char, - ) -> bool { - if capture_name.is_null() { - return false; - } - let it = unsafe { &mut *it }; - let cn = match it.capture_names.next() { - // Top-level iterator ran out of capture groups - None => return false, - Some(val) => { - match val { - // inner Option didn't have a name - None => "", - Some(name) => name, - } - } - }; - unsafe { - let cs = match CString::new(cn.as_bytes()) { - Result::Ok(val) => val, - Result::Err(_) => return false, - }; - let ptr = cs.into_raw(); - it.name_ptrs.push(ptr); - *capture_name = ptr; - } - true - } - - #[no_mangle] - extern "C" fn rure_captures_new(re: *const RegexBytes) -> *mut Captures { - let re = unsafe { &*re }; - let captures = Captures(re.locations()); - Box::into_raw(Box::new(captures)) - } - - #[no_mangle] - extern "C" fn rure_captures_free(captures: *const Captures) { - unsafe { - drop(Box::from_raw(captures as *mut Captures)); - } - } - - #[no_mangle] - extern "C" fn rure_captures_at( - captures: *const Captures, - i: size_t, - match_info: *mut rure_match, - ) -> bool { - let locs = unsafe { &(*captures).0 }; - match locs.pos(i) { +} + +#[no_mangle] +extern "C" fn rure_free(re: *const RegexBytes) { + unsafe { + drop(Box::from_raw(re as *mut Regex)); + } +} + +#[no_mangle] +extern "C" fn rure_is_match( + re: *const RegexBytes, + haystack: *const u8, + len: size_t, + _start: size_t, +) -> bool { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + re.is_match(haystack) +} + +#[no_mangle] +extern "C" fn rure_find( + re: *const RegexBytes, + haystack: *const u8, + len: size_t, + start: size_t, + match_info: *mut rure_match, +) -> bool { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + re.find_at(haystack, start) + .map(|m| unsafe { + if !match_info.is_null() { + (*match_info).start = m.start(); + (*match_info).end = m.end(); + } + }) + .is_some() +} + +#[no_mangle] +extern "C" fn rure_find_captures( + re: *const RegexBytes, + haystack: *const u8, + len: size_t, + start: size_t, + captures: *mut Captures, +) -> bool { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + let slots = unsafe { &mut (*captures).0 }; + re.read_captures_at(slots, haystack, start).is_some() +} + +#[no_mangle] +extern "C" fn rure_iter_capture_names_new(re: *const RegexBytes) -> *mut IterCaptureNames { + let re = unsafe { &*re }; + Box::into_raw(Box::new(IterCaptureNames { + capture_names: re.re.capture_names(), + name_ptrs: Vec::new(), + })) +} + +#[no_mangle] +extern "C" fn rure_iter_capture_names_free(it: *mut IterCaptureNames) { + unsafe { + let it = &mut *it; + while let Some(ptr) = it.name_ptrs.pop() { + drop(CString::from_raw(ptr)); + } + drop(Box::from_raw(it)); + } +} + +#[no_mangle] +extern "C" fn rure_iter_capture_names_next( + it: *mut IterCaptureNames, + capture_name: *mut *mut c_char, +) -> bool { + if capture_name.is_null() { + return false; + } + let it = unsafe { &mut *it }; + let cn = match it.capture_names.next() { + // Top-level iterator ran out of capture groups + None => return false, + Some(val) => { + match val { + // inner Option didn't have a name + None => "", + Some(name) => name, + } + } + }; + unsafe { + let cs = match CString::new(cn.as_bytes()) { + Result::Ok(val) => val, + Result::Err(_) => return false, + }; + let ptr = cs.into_raw(); + it.name_ptrs.push(ptr); + *capture_name = ptr; + } + true +} + +#[no_mangle] +extern "C" fn rure_captures_new(re: *const RegexBytes) -> *mut Captures { + let re = unsafe { &*re }; + let captures = Captures(re.locations()); + Box::into_raw(Box::new(captures)) +} + +#[no_mangle] +extern "C" fn rure_captures_free(captures: *const Captures) { + unsafe { + drop(Box::from_raw(captures as *mut Captures)); + } +} + +#[no_mangle] +extern "C" fn rure_captures_at( + captures: *const Captures, + i: size_t, + match_info: *mut rure_match, +) -> bool { + let locs = unsafe { &(*captures).0 }; + match locs.pos(i) { Some((start, end)) => { if !match_info.is_null() { unsafe { @@ -308,41 +308,41 @@ } _ => false, } - } - - #[no_mangle] - extern "C" fn rure_captures_len(captures: *const Captures) -> size_t { - unsafe { (*captures).0.len() } - } - - #[no_mangle] - extern "C" fn rure_compile_set( - patterns: *const *const u8, - patterns_lengths: *const size_t, - patterns_count: size_t, - flags: u32, - options: *const Options, - error: *mut Error, - ) -> *const RegexSet { - let (raw_pats, raw_patsl) = unsafe { - ( - slice::from_raw_parts(patterns, patterns_count), - slice::from_raw_parts(patterns_lengths, patterns_count), - ) - }; - let mut pats = Vec::with_capacity(patterns_count); - for (&raw_pat, &raw_patl) in raw_pats.iter().zip(raw_patsl) { - let pat = unsafe { slice::from_raw_parts(raw_pat, raw_patl) }; - pats.push(match str::from_utf8(pat) { - Ok(pat) => pat, - Err(err) => unsafe { - if !error.is_null() { - *error = Error::new(ErrorKind::Str(err)); - } - return ptr::null(); - }, - }); - } +} + +#[no_mangle] +extern "C" fn rure_captures_len(captures: *const Captures) -> size_t { + unsafe { (*captures).0.len() } +} + +#[no_mangle] +extern "C" fn rure_compile_set( + patterns: *const *const u8, + patterns_lengths: *const size_t, + patterns_count: size_t, + flags: u32, + options: *const Options, + error: *mut Error, +) -> *const RegexSet { + let (raw_pats, raw_patsl) = unsafe { + ( + slice::from_raw_parts(patterns, patterns_count), + slice::from_raw_parts(patterns_lengths, patterns_count), + ) + }; + let mut pats = Vec::with_capacity(patterns_count); + for (&raw_pat, &raw_patl) in raw_pats.iter().zip(raw_patsl) { + let pat = unsafe { slice::from_raw_parts(raw_pat, raw_patl) }; + pats.push(match str::from_utf8(pat) { + Ok(pat) => pat, + Err(err) => unsafe { + if !error.is_null() { + *error = Error::new(ErrorKind::Str(err)); + } + return ptr::null(); + }, + }); + } let mut builder = rure_compile_set_internal(pats, flags); if !options.is_null() { @@ -359,230 +359,228 @@ ptr::null() }, } - } - - #[no_mangle] - extern "C" fn rure_set_free(re: *const RegexSet) { - unsafe { - drop(Box::from_raw(re as *mut RegexSet)); - } - } - - #[no_mangle] - extern "C" fn rure_set_is_match( - re: *const RegexSet, - haystack: *const u8, - len: size_t, - start: size_t, - ) -> bool { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - re.is_match_at(haystack, start) - } - - #[no_mangle] - extern "C" fn rure_set_matches( - re: *const RegexSet, - haystack: *const u8, - len: size_t, - start: size_t, - matches: *mut bool, - ) -> bool { - let re = unsafe { &*re }; - let matches = unsafe { slice::from_raw_parts_mut(matches, re.len()) }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - - rure_set_matches_internal(re, matches, haystack, start) - } - - #[no_mangle] - extern "C" fn rure_set_len(re: *const RegexSet) -> size_t { - unsafe { (*re).len() } - } - - #[no_mangle] - extern "C" fn rure_escape_must(pattern: *const c_char) -> *const c_char { - let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; - let pat = pattern as *const u8; - let mut err = Error::new(ErrorKind::None); - let esc = rure_escape(pat, len, &mut err); - if err.is_err() { - println!("{}", "aborting from rure_escape_must"); - let _ = writeln!(&mut io::stderr(), "{}", err); - let _ = writeln!(&mut io::stderr(), "aborting from rure_escape_must"); - abort() - } - esc - } - - /// A helper function that implements fallible escaping in a way that returns - /// an error if escaping failed. - /// - /// This should ideally be exposed, but it needs API design work. In - /// particular, this should not return a C string, but a `const uint8_t *` - /// instead, since it may contain a NUL byte. - fn rure_escape(pattern: *const u8, length: size_t, error: *mut Error) -> *const c_char { - let pat: &[u8] = unsafe { slice::from_raw_parts(pattern, length) }; - let str_pat = match str::from_utf8(pat) { - Ok(val) => val, - Err(err) => unsafe { - if !error.is_null() { - *error = Error::new(ErrorKind::Str(err)); - } - return ptr::null(); - }, - }; - let esc_pat = regex::escape(str_pat); - let c_esc_pat = match CString::new(esc_pat) { - Ok(val) => val, - Err(err) => unsafe { - if !error.is_null() { - *error = Error::new(ErrorKind::Nul(err)); - } - return ptr::null(); - }, - }; - c_esc_pat.into_raw() as *const c_char - } - - #[no_mangle] - extern "C" fn rure_cstring_free(s: *mut c_char) { - unsafe { - drop(CString::from_raw(s)); - } - } - - #[no_mangle] - extern "C" fn rure_replace( - re: *const RegexUnicode, - haystack: *const u8, - len_h: size_t, - rewrite: *const u8, - len_r: size_t, - ) -> *const u8 { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len_h) }; - let rewrite = unsafe { slice::from_raw_parts(rewrite, len_r) }; - rure_replace_internal(re, haystack, rewrite) - } - - #[no_mangle] - extern "C" fn rure_replace_all( - re: *const RegexUnicode, - haystack: *const u8, - len_h: size_t, - rewrite: *const u8, - len_r: size_t, - ) -> *const u8 { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len_h) }; - let rewrite = unsafe { slice::from_raw_parts(rewrite, len_r) }; - rure_replace_all_internal(re, haystack, rewrite) - } - - /* - * Simple way to use regex - */ - - #[no_mangle] - extern "C" fn rure_new(pattern: *const u8, length: size_t) -> *const RegexBytes { - let pat = unsafe { slice::from_raw_parts(pattern, length) }; - rure_new_internal(pat) - } - - #[no_mangle] - extern "C" fn rure_consume( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, - match_info: *mut rure_match, - ) -> bool { - let exp = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - exp.find(haystack) - .map(|m| unsafe { - if !match_info.is_null() { - (*match_info).start = m.start(); - (*match_info).end = m.end(); - } - }) - .is_some() - } - - #[no_mangle] - extern "C" fn rure_max_submatch(rewrite: *const c_char) -> i32 { - let len = unsafe { CStr::from_ptr(rewrite).to_bytes().len() }; - let pat = rewrite as *const u8; - let text = unsafe { slice::from_raw_parts(pat, len) }; - - rure_max_submatch_internal(text) - } - - #[no_mangle] - extern "C" fn rure_check_rewrite_string(rewrite: *const c_char, cap_num: i32) -> bool { - let len = unsafe { CStr::from_ptr(rewrite).to_bytes().len() }; - let pat = rewrite as *const u8; - let text = unsafe { slice::from_raw_parts(pat, len) }; - - rure_check_rewrite_string_internal(text, cap_num) - } - - #[no_mangle] - extern "C" fn rure_rewrite_str_convert(rewrite: *const u8, length: size_t) -> *const c_char { - let rewrite = unsafe { slice::from_raw_parts(rewrite, length) }; - - rure_rewrite_str_convert_internal(rewrite) - } - - #[no_mangle] - extern "C" fn rure_rewrite( - rewrite: *const u8, - length: size_t, - vecs: *const *const u8, - vecs_lengths: *const size_t, - vecs_count: size_t, - ) -> *const c_char { - // 获取rewrite - let rewrite = unsafe { slice::from_raw_parts(rewrite, length) }; - let rewrite_str = std::str::from_utf8(rewrite).unwrap(); - - //获取vecs中的内容 - let (raw_vecs, raw_vecsl) = unsafe { - ( - slice::from_raw_parts(vecs, vecs_count), - slice::from_raw_parts(vecs_lengths, vecs_count), - ) - }; - - let mut rure_vecs = Vec::with_capacity(vecs_count); - for (&raw_vec, &raw_vecl) in raw_vecs.iter().zip(raw_vecsl) { - let rure_vec = unsafe { slice::from_raw_parts(raw_vec, raw_vecl) }; - rure_vecs.push(str::from_utf8(rure_vec).unwrap()); - } - - rure_rewrite_internal(rewrite_str, vecs_count, rure_vecs) - } - - #[no_mangle] - extern "C" fn rure_replace_count(re: *const RegexUnicode, haystack: *const c_char) -> size_t { - let len = unsafe { CStr::from_ptr(haystack).to_bytes().len() }; - let hay = haystack as *const u8; - - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(hay, len) }; - rure_replace_count_internal(haystack, re) - } - - #[no_mangle] - extern "C" fn rure_filter_compile( - regex_str: *const u8, - regex_len: size_t, - min_atoms_len: size_t, - ) -> MyVec { - let r = unsafe { slice::from_raw_parts(regex_str, regex_len) }; - let regex_str = str::from_utf8(r).unwrap(); - let atoms = my_compile(regex_str, min_atoms_len as i32); - atoms - } - +} + +#[no_mangle] +extern "C" fn rure_set_free(re: *const RegexSet) { + unsafe { + drop(Box::from_raw(re as *mut RegexSet)); + } +} + +#[no_mangle] +extern "C" fn rure_set_is_match( + re: *const RegexSet, + haystack: *const u8, + len: size_t, + start: size_t, +) -> bool { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + re.is_match_at(haystack, start) +} + +#[no_mangle] +extern "C" fn rure_set_matches( + re: *const RegexSet, + haystack: *const u8, + len: size_t, + start: size_t, + matches: *mut bool, +) -> bool { + let re = unsafe { &*re }; + let matches = unsafe { slice::from_raw_parts_mut(matches, re.len()) }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + + rure_set_matches_internal(re, matches, haystack, start) +} + +#[no_mangle] +extern "C" fn rure_set_len(re: *const RegexSet) -> size_t { + unsafe { (*re).len() } +} + +#[no_mangle] +extern "C" fn rure_escape_must(pattern: *const c_char) -> *const c_char { + let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; + let pat = pattern as *const u8; + let mut err = Error::new(ErrorKind::None); + let esc = rure_escape(pat, len, &mut err); + if err.is_err() { + println!("{}", "aborting from rure_escape_must"); + let _ = writeln!(&mut io::stderr(), "{}", err); + let _ = writeln!(&mut io::stderr(), "aborting from rure_escape_must"); + abort() + } + esc +} + +/// A helper function that implements fallible escaping in a way that returns +/// an error if escaping failed. +/// +/// This should ideally be exposed, but it needs API design work. In +/// particular, this should not return a C string, but a `const uint8_t *` +/// instead, since it may contain a NUL byte. +fn rure_escape(pattern: *const u8, length: size_t, error: *mut Error) -> *const c_char { + let pat: &[u8] = unsafe { slice::from_raw_parts(pattern, length) }; + let str_pat = match str::from_utf8(pat) { + Ok(val) => val, + Err(err) => unsafe { + if !error.is_null() { + *error = Error::new(ErrorKind::Str(err)); + } + return ptr::null(); + }, + }; + let esc_pat = regex::escape(str_pat); + let c_esc_pat = match CString::new(esc_pat) { + Ok(val) => val, + Err(err) => unsafe { + if !error.is_null() { + *error = Error::new(ErrorKind::Nul(err)); + } + return ptr::null(); + }, + }; + c_esc_pat.into_raw() as *const c_char +} + +#[no_mangle] +extern "C" fn rure_cstring_free(s: *mut c_char) { + unsafe { + drop(CString::from_raw(s)); + } +} + +#[no_mangle] +extern "C" fn rure_replace( + re: *const RegexUnicode, + haystack: *const u8, + len_h: size_t, + rewrite: *const u8, + len_r: size_t, +) -> *const u8 { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len_h) }; + let rewrite = unsafe { slice::from_raw_parts(rewrite, len_r) }; + rure_replace_internal(re, haystack, rewrite) +} + +#[no_mangle] +extern "C" fn rure_replace_all( + re: *const RegexUnicode, + haystack: *const u8, + len_h: size_t, + rewrite: *const u8, + len_r: size_t, +) -> *const u8 { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len_h) }; + let rewrite = unsafe { slice::from_raw_parts(rewrite, len_r) }; + rure_replace_all_internal(re, haystack, rewrite) +} + +/* + * Simple way to use regex + */ + +#[no_mangle] +extern "C" fn rure_new(pattern: *const u8, length: size_t) -> *const RegexBytes { + let pat = unsafe { slice::from_raw_parts(pattern, length) }; + rure_new_internal(pat) +} + +#[no_mangle] +extern "C" fn rure_consume( + re: *const RegexBytes, + haystack: *const u8, + len: size_t, + match_info: *mut rure_match, +) -> bool { + let exp = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + exp.find(haystack) + .map(|m| unsafe { + if !match_info.is_null() { + (*match_info).start = m.start(); + (*match_info).end = m.end(); + } + }) + .is_some() +} + +#[no_mangle] +extern "C" fn rure_max_submatch(rewrite: *const c_char) -> i32 { + let len = unsafe { CStr::from_ptr(rewrite).to_bytes().len() }; + let pat = rewrite as *const u8; + let text = unsafe { slice::from_raw_parts(pat, len) }; + + rure_max_submatch_internal(text) +} + +#[no_mangle] +extern "C" fn rure_check_rewrite_string(rewrite: *const c_char, cap_num: i32) -> bool { + let len = unsafe { CStr::from_ptr(rewrite).to_bytes().len() }; + let pat = rewrite as *const u8; + let text = unsafe { slice::from_raw_parts(pat, len) }; + + rure_check_rewrite_string_internal(text, cap_num) +} + +#[no_mangle] +extern "C" fn rure_rewrite_str_convert(rewrite: *const u8, length: size_t) -> *const c_char { + let rewrite = unsafe { slice::from_raw_parts(rewrite, length) }; + + rure_rewrite_str_convert_internal(rewrite) +} + +#[no_mangle] +extern "C" fn rure_rewrite( + rewrite: *const u8, + length: size_t, + vecs: *const *const u8, + vecs_lengths: *const size_t, + vecs_count: size_t, +) -> *const c_char { + // 获取rewrite + let rewrite = unsafe { slice::from_raw_parts(rewrite, length) }; + let rewrite_str = std::str::from_utf8(rewrite).unwrap(); + + //获取vecs中的内容 + let (raw_vecs, raw_vecsl) = unsafe { + ( + slice::from_raw_parts(vecs, vecs_count), + slice::from_raw_parts(vecs_lengths, vecs_count), + ) + }; + + let mut rure_vecs = Vec::with_capacity(vecs_count); + for (&raw_vec, &raw_vecl) in raw_vecs.iter().zip(raw_vecsl) { + let rure_vec = unsafe { slice::from_raw_parts(raw_vec, raw_vecl) }; + rure_vecs.push(str::from_utf8(rure_vec).unwrap()); + } + + rure_rewrite_internal(rewrite_str, vecs_count, rure_vecs) +} + +#[no_mangle] +extern "C" fn rure_replace_count(re: *const RegexUnicode, haystack: *const c_char) -> size_t { + let len = unsafe { CStr::from_ptr(haystack).to_bytes().len() }; + let hay = haystack as *const u8; + + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(hay, len) }; + rure_replace_count_internal(haystack, re) +} +#[no_mangle] +extern "C" fn rure_filter_compile( + regex_str: *const u8, + regex_len: size_t, + min_atoms_len: size_t, +) -> MyVec { + let r = unsafe { slice::from_raw_parts(regex_str, regex_len) }; + let regex_str = str::from_utf8(r).unwrap(); + let atoms = my_compile(regex_str, min_atoms_len as i32); + atoms +} diff --git a/regex-capi/src/lib_internal.rs b/regex-capi/src/lib_internal.rs index fb331d0e9093bf4ea001ebc4e81a1c8ecfc63ecd..2ad0e03f318892dfb86bf7cb8335d6429088ae7c 100644 --- a/regex-capi/src/lib_internal.rs +++ b/regex-capi/src/lib_internal.rs @@ -14,10 +14,7 @@ ******************************************************************************/ use regex::bytes::RegexBuilder; use regex::bytes::RegexSetBuilder; - fn rure_compile_internal( - pat: &str, - flags: u32, -) -> RegexBuilder { +fn rure_compile_internal(pat: &str, flags: u32) -> RegexBuilder { let mut builder = bytes::RegexBuilder::new(pat); builder.case_insensitive(flags & RURE_FLAG_CASEI > 0); builder.multi_line(flags & RURE_FLAG_MULTI > 0); @@ -28,10 +25,7 @@ use regex::bytes::RegexSetBuilder; builder } -fn rure_compile_set_internal( - pats: Vec<&str>, - flags: u32, -) -> RegexSetBuilder { +fn rure_compile_set_internal(pats: Vec<&str>, flags: u32) -> RegexSetBuilder { let mut builder = bytes::RegexSetBuilder::new(pats); builder.case_insensitive(flags & RURE_FLAG_CASEI > 0);