diff --git a/re2/filtered_re2.cc b/re2/filtered_re2.cc index a108e6863e63ac44ae3033c533ce965574f58368..e9dfc546399f3ae12b49eafd3d635c133671aba9 100644 --- a/re2/filtered_re2.cc +++ b/re2/filtered_re2.cc @@ -28,225 +28,257 @@ extern "C" #include "regex-capi/include/regex_capi.h" } using namespace std; -namespace re2 { +namespace re2 +{ -std::map> map_atoms; + std::map> map_atoms; -// #include "re2/prefilter_tree.h" - class PrefilterTree { - public: - PrefilterTree():min_atom_len_(3){}; - explicit PrefilterTree(int min_atom_len):min_atom_len_(min_atom_len){}; + // #include "re2/prefilter_tree.h" + class PrefilterTree + { + public: + PrefilterTree() : min_atom_len_(3){}; + explicit PrefilterTree(int min_atom_len) : min_atom_len_(min_atom_len){}; ~PrefilterTree(){}; - int getMinAtomLen(){ + int getMinAtomLen() + { return min_atom_len_; } - bool get_is_latin_result() {return is_latin;}; + bool get_is_latin_result() { return is_latin; }; void set_latin(bool x); - std::string get_latin_string() {return str_latin;}; + std::string get_latin_string() { return str_latin; }; void set_latin_str(std::string x); - - private: + + private: const int min_atom_len_; bool is_latin; std::string str_latin; }; - void PrefilterTree::set_latin(bool x) { + void PrefilterTree::set_latin(bool x) + { is_latin = x; } - void PrefilterTree::set_latin_str(std::string x) { + void PrefilterTree::set_latin_str(std::string x) + { str_latin = x; } }; -namespace re2 { - -FilteredRE2::FilteredRE2() - : compiled_(false), - prefilter_tree_(new PrefilterTree()) { -} +namespace re2 +{ -FilteredRE2::FilteredRE2(int min_atom_len) - : compiled_(false), - prefilter_tree_(new PrefilterTree(min_atom_len)) { -} + FilteredRE2::FilteredRE2() + : compiled_(false), + prefilter_tree_(new PrefilterTree()) + { + } -FilteredRE2::~FilteredRE2() { - for (size_t i = 0; i < re2_vec_.size(); i++) - delete re2_vec_[i]; -} + FilteredRE2::FilteredRE2(int min_atom_len) + : compiled_(false), + prefilter_tree_(new 
PrefilterTree(min_atom_len)) + { + } -FilteredRE2::FilteredRE2(FilteredRE2&& other) - : re2_vec_(std::move(other.re2_vec_)), - compiled_(other.compiled_), - prefilter_tree_(std::move(other.prefilter_tree_)) { - other.re2_vec_.clear(); - other.re2_vec_.shrink_to_fit(); - other.compiled_ = false; - other.prefilter_tree_.reset(new PrefilterTree()); -} + FilteredRE2::~FilteredRE2() + { + for (size_t i = 0; i < re2_vec_.size(); i++) + delete re2_vec_[i]; + } -FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) { - this->~FilteredRE2(); - (void) new (this) FilteredRE2(std::move(other)); - return *this; -} + FilteredRE2::FilteredRE2(FilteredRE2 &&other) + : re2_vec_(std::move(other.re2_vec_)), + compiled_(other.compiled_), + prefilter_tree_(std::move(other.prefilter_tree_)) + { + other.re2_vec_.clear(); + other.re2_vec_.shrink_to_fit(); + other.compiled_ = false; + other.prefilter_tree_.reset(new PrefilterTree()); + } -RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, - const RE2::Options& options, int* id) { - RE2* re = new RE2(pattern, options); - RE2::ErrorCode code = re->error_code(); - if(options.encoding() == RE2::Options::EncodingLatin1) { - prefilter_tree_->set_latin(true); - prefilter_tree_->set_latin_str(pattern.as_string()); + FilteredRE2 &FilteredRE2::operator=(FilteredRE2 &&other) + { + this->~FilteredRE2(); + (void)new (this) FilteredRE2(std::move(other)); + return *this; } - else prefilter_tree_->set_latin(false); - - if (!re->ok()) { - if (options.log_errors()) { - LOG(ERROR) << "Couldn't compile regular expression, skipping: " - << pattern << " due to error " << re->error(); + + RE2::ErrorCode FilteredRE2::Add(const StringPiece &pattern, + const RE2::Options &options, int *id) + { + RE2 *re = new RE2(pattern, options); + RE2::ErrorCode code = re->error_code(); + if (options.encoding() == RE2::Options::EncodingLatin1) + { + prefilter_tree_->set_latin(true); + prefilter_tree_->set_latin_str(pattern.as_string()); } - delete re; - } else { - 
*id = static_cast(re2_vec_.size()); - re2_vec_.push_back(re); - } + else + prefilter_tree_->set_latin(false); - return code; -} + if (!re->ok()) + { + if (options.log_errors()) + { + LOG(ERROR) << "Couldn't compile regular expression, skipping: " + << pattern << " due to error " << re->error(); + } + delete re; + } + else + { + *id = static_cast(re2_vec_.size()); + re2_vec_.push_back(re); + } -void FilteredRE2::Compile(std::vector* atoms) { - map_atoms.clear(); - if (compiled_) { - LOG(ERROR) << "Compile called already."; - return; + return code; } - if (re2_vec_.empty()) { - LOG(ERROR) << "Compile called before Add."; - return; - } - atoms->clear(); - - // 处理latin的情况 - if(prefilter_tree_->get_is_latin_result()) { - std::string str = prefilter_tree_->get_latin_string(); - std::vector vec; - vec.push_back(str); - std::string str_low = str; - transform(str_low.begin(),str_low.end(),str_low.begin(),::tolower); - atoms->push_back(str_low); - map_atoms.insert(map>::value_type(str, vec)); - map_atoms.insert(map>::value_type("total", vec)); - compiled_ = true; - return; - } - - for(size_t i = 0; i < re2_vec_.size(); i++) { - // std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), prefilter_tree_->getMinAtomLen()); - const char *regex = re2_vec_[i]->pattern().c_str(); - std::string regex_str = regex; - MyVec vec = rure_filter_compile((const uint8_t *)regex, strlen(regex), prefilter_tree_->getMinAtomLen()); - int32_t len = vec.len; - std::vector v; - for(int32_t i = 0; i < len; i++) { - atoms->push_back(vec.data[i].atom); - v.push_back(vec.data[i].atom); + void FilteredRE2::Compile(std::vector *atoms) + { + map_atoms.clear(); + if (compiled_) + { + LOG(ERROR) << "Compile called already."; + return; } - map_atoms.insert(map>::value_type(regex_str, v)); - } - map_atoms.insert(map>::value_type("total", *atoms)); - compiled_ = true; -} -int FilteredRE2::SlowFirstMatch(const StringPiece& text) const { - for (size_t i = 0; i < re2_vec_.size(); i++) - { - if 
(RE2::PartialMatch(text, re2_vec_[i]->pattern())){ - return static_cast(i); - } + if (re2_vec_.empty()) + { + LOG(ERROR) << "Compile called before Add."; + return; + } + atoms->clear(); + + // 处理latin的情况 + if (prefilter_tree_->get_is_latin_result()) + { + std::string str = prefilter_tree_->get_latin_string(); + std::vector vec; + vec.push_back(str); + std::string str_low = str; + transform(str_low.begin(), str_low.end(), str_low.begin(), ::tolower); + atoms->push_back(str_low); + map_atoms.insert(map>::value_type(str, vec)); + map_atoms.insert(map>::value_type("total", vec)); + compiled_ = true; + return; + } + + for (size_t i = 0; i < re2_vec_.size(); i++) + { + // std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), prefilter_tree_->getMinAtomLen()); + const char *regex = re2_vec_[i]->pattern().c_str(); + std::string regex_str = regex; + MyVec vec = rure_filter_compile((const uint8_t *)regex, strlen(regex), prefilter_tree_->getMinAtomLen()); + int32_t len = vec.len; + std::vector v; + for (int32_t i = 0; i < len; i++) + { + atoms->push_back(vec.data[i].atom); + v.push_back(vec.data[i].atom); + } + map_atoms.insert(map>::value_type(regex_str, v)); + } + map_atoms.insert(map>::value_type("total", *atoms)); + compiled_ = true; } - return -1; -} -void AtomsToRegexps(std::vector re2_vec_, std::vector atoms, std::vector *regexps, int min_atom_len) -{ - // 根据atoms索引获取regexp索引的规则 - /* - * 如果没有原子, 那么直接会把re加进去。 - * 如果这个正则表达式有原子,那么要把该正则表达式的所有的原子的索引全加入,这个正则表达式才能加入成功。 - */ - - std::vector atoms_total = map_atoms["total"]; - std::vector atoms_tmp; - for(size_t i = 0; i < atoms.size(); i++) + int FilteredRE2::SlowFirstMatch(const StringPiece &text) const { - atoms_tmp.push_back(atoms_total[atoms[i]]); + for (size_t i = 0; i < re2_vec_.size(); i++) + { + if (RE2::PartialMatch(text, re2_vec_[i]->pattern())) + { + return static_cast(i); + } + } + return -1; } - for(size_t i = 0; i < re2_vec_.size(); i++) + + void AtomsToRegexps(std::vector re2_vec_, std::vector atoms, 
std::vector *regexps, int min_atom_len) { - // std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), min_atom_len); - std::string str = re2_vec_[i]->pattern(); - std::vector my_atoms = map_atoms[str]; - if(my_atoms.size() == 0){ - regexps->push_back(i); - continue; + // 根据atoms索引获取regexp索引的规则 + /* + * 如果没有原子, 那么直接会把re加进去。 + * 如果这个正则表达式有原子,那么要把该正则表达式的所有的原子的索引全加入,这个正则表达式才能加入成功。 + */ + + std::vector atoms_total = map_atoms["total"]; + std::vector atoms_tmp; + for (size_t i = 0; i < atoms.size(); i++) + { + atoms_tmp.push_back(atoms_total[atoms[i]]); } - else + for (size_t i = 0; i < re2_vec_.size(); i++) { - int count = 0; - for(size_t ii = 0; ii < my_atoms.size(); ii++) + // std::vector my_atoms = MyCompile(re2_vec_[i]->pattern(), min_atom_len); + std::string str = re2_vec_[i]->pattern(); + std::vector my_atoms = map_atoms[str]; + if (my_atoms.size() == 0) { - for(size_t jj = 0; jj < atoms_tmp.size(); jj++) + regexps->push_back(i); + continue; + } + else + { + int count = 0; + for (size_t ii = 0; ii < my_atoms.size(); ii++) { - if(my_atoms[ii] == atoms_tmp[jj]){ - count++; - break; + for (size_t jj = 0; jj < atoms_tmp.size(); jj++) + { + if (my_atoms[ii] == atoms_tmp[jj]) + { + count++; + break; + } } } + if (count == (int)my_atoms.size()) + regexps->push_back(int(i)); } - if(count == (int)my_atoms.size()) regexps->push_back(int(i)); } } -} -int FilteredRE2::FirstMatch(const StringPiece& text, - const std::vector& atoms) const { - if (!compiled_) { - LOG(DFATAL) << "FirstMatch called before Compile."; + int FilteredRE2::FirstMatch(const StringPiece &text, + const std::vector &atoms) const + { + if (!compiled_) + { + LOG(DFATAL) << "FirstMatch called before Compile."; + return -1; + } + std::vector regexps; + + AtomsToRegexps(re2_vec_, atoms, ®exps, prefilter_tree_->getMinAtomLen()); + + for (size_t i = 0; i < regexps.size(); i++) + if (RE2::PartialMatch(text, *re2_vec_[regexps[i]])) + return static_cast(i); return -1; } - std::vector regexps; - - 
AtomsToRegexps(re2_vec_, atoms, ®exps, prefilter_tree_->getMinAtomLen()); - - for (size_t i = 0; i < regexps.size(); i++) - if (RE2::PartialMatch(text, *re2_vec_[regexps[i]])) - return static_cast(i); - return -1; -} -bool FilteredRE2::AllMatches( - const StringPiece& text, - const std::vector& atoms, - std::vector* matching_regexps) const { - matching_regexps->clear(); + bool FilteredRE2::AllMatches( + const StringPiece &text, + const std::vector &atoms, + std::vector *matching_regexps) const + { + matching_regexps->clear(); - std::vector regexps; - AtomsToRegexps(re2_vec_, atoms, ®exps, prefilter_tree_->getMinAtomLen()); + std::vector regexps; + AtomsToRegexps(re2_vec_, atoms, ®exps, prefilter_tree_->getMinAtomLen()); - for (size_t i = 0; i < re2_vec_.size(); i++) - if (RE2::PartialMatch(text, *re2_vec_[i])) - matching_regexps->push_back(i); - return !matching_regexps->empty(); - -} + for (size_t i = 0; i < re2_vec_.size(); i++) + if (RE2::PartialMatch(text, *re2_vec_[i])) + matching_regexps->push_back(i); + return !matching_regexps->empty(); + } -void FilteredRE2::AllPotentials( - const std::vector& atoms, - std::vector* potential_regexps) const { - AtomsToRegexps(re2_vec_, atoms, potential_regexps, prefilter_tree_->getMinAtomLen()); -} + void FilteredRE2::AllPotentials( + const std::vector &atoms, + std::vector *potential_regexps) const + { + AtomsToRegexps(re2_vec_, atoms, potential_regexps, prefilter_tree_->getMinAtomLen()); + } -} // namespace re2 +} // namespace re2 diff --git a/re2/filtered_re2.h b/re2/filtered_re2.h index ea5c89fd408e2a3845a92f1f055f451ea2ab3d25..19e6177423bcd696747cdbe180d33999b4ae1db4 100644 --- a/re2/filtered_re2.h +++ b/re2/filtered_re2.h @@ -26,86 +26,88 @@ #include "re2/re2.h" -namespace re2 { - -class PrefilterTree; - -class FilteredRE2 { - public: - FilteredRE2(); - explicit FilteredRE2(int min_atom_len); - ~FilteredRE2(); - - // Not copyable. 
- FilteredRE2(const FilteredRE2&) = delete; - FilteredRE2& operator=(const FilteredRE2&) = delete; - // Movable. - FilteredRE2(FilteredRE2&& other); - FilteredRE2& operator=(FilteredRE2&& other); - - // Uses RE2 constructor to create a RE2 object (re). Returns - // re->error_code(). If error_code is other than NoError, then re is - // deleted and not added to re2_vec_. - RE2::ErrorCode Add(const StringPiece& pattern, - const RE2::Options& options, - int* id); - - // Prepares the regexps added by Add for filtering. Returns a set - // of strings that the caller should check for in candidate texts. - // The returned strings are lowercased and distinct. When doing - // string matching, it should be performed in a case-insensitive - // way or the search text should be lowercased first. Call after - // all Add calls are done. - void Compile(std::vector* strings_to_match); - - // Returns the index of the first matching regexp. - // Returns -1 on no match. Can be called prior to Compile. - // Does not do any filtering: simply tries to Match the - // regexps in a loop. - int SlowFirstMatch(const StringPiece& text) const; - - // Returns the index of the first matching regexp. - // Returns -1 on no match. Compile has to be called before - // calling this. - int FirstMatch(const StringPiece& text, - const std::vector& atoms) const; - - // Returns the indices of all matching regexps, after first clearing - // matched_regexps. - bool AllMatches(const StringPiece& text, - const std::vector& atoms, - std::vector* matching_regexps) const; - - // Returns the indices of all potentially matching regexps after first - // clearing potential_regexps. - // A regexp is potentially matching if it passes the filter. - // If a regexp passes the filter it may still not match. - // A regexp that does not pass the filter is guaranteed to not match. - void AllPotentials(const std::vector& atoms, - std::vector* potential_regexps) const; - - // The number of regexps added. 
- int NumRegexps() const { return static_cast(re2_vec_.size()); } - - // Get the individual RE2 objects. - const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; } - - private: - // Print prefilter. - void PrintPrefilter(int regexpid); - - // Useful for testing and debugging. - void RegexpsGivenStrings(const std::vector& matched_atoms, - std::vector* passed_regexps); - - // All the regexps in the FilteredRE2. - std::vector re2_vec_; - - // Has the FilteredRE2 been compiled using Compile() - bool compiled_; - - // An AND-OR tree of string atoms used for filtering regexps. - std::unique_ptr prefilter_tree_; -}; - -} // namespace re2 +namespace re2 +{ + + class PrefilterTree; + + class FilteredRE2 + { + public: + FilteredRE2(); + explicit FilteredRE2(int min_atom_len); + ~FilteredRE2(); + + // Not copyable. + FilteredRE2(const FilteredRE2 &) = delete; + FilteredRE2 &operator=(const FilteredRE2 &) = delete; + // Movable. + FilteredRE2(FilteredRE2 &&other); + FilteredRE2 &operator=(FilteredRE2 &&other); + + // Uses RE2 constructor to create a RE2 object (re). Returns + // re->error_code(). If error_code is other than NoError, then re is + // deleted and not added to re2_vec_. + RE2::ErrorCode Add(const StringPiece &pattern, + const RE2::Options &options, + int *id); + + // Prepares the regexps added by Add for filtering. Returns a set + // of strings that the caller should check for in candidate texts. + // The returned strings are lowercased and distinct. When doing + // string matching, it should be performed in a case-insensitive + // way or the search text should be lowercased first. Call after + // all Add calls are done. + void Compile(std::vector *strings_to_match); + + // Returns the index of the first matching regexp. + // Returns -1 on no match. Can be called prior to Compile. + // Does not do any filtering: simply tries to Match the + // regexps in a loop. 
+ int SlowFirstMatch(const StringPiece &text) const; + + // Returns the index of the first matching regexp. + // Returns -1 on no match. Compile has to be called before + // calling this. + int FirstMatch(const StringPiece &text, + const std::vector &atoms) const; + + // Returns the indices of all matching regexps, after first clearing + // matched_regexps. + bool AllMatches(const StringPiece &text, + const std::vector &atoms, + std::vector *matching_regexps) const; + + // Returns the indices of all potentially matching regexps after first + // clearing potential_regexps. + // A regexp is potentially matching if it passes the filter. + // If a regexp passes the filter it may still not match. + // A regexp that does not pass the filter is guaranteed to not match. + void AllPotentials(const std::vector &atoms, + std::vector *potential_regexps) const; + + // The number of regexps added. + int NumRegexps() const { return static_cast(re2_vec_.size()); } + + // Get the individual RE2 objects. + const RE2 &GetRE2(int regexpid) const { return *re2_vec_[regexpid]; } + + private: + // Print prefilter. + void PrintPrefilter(int regexpid); + + // Useful for testing and debugging. + void RegexpsGivenStrings(const std::vector &matched_atoms, + std::vector *passed_regexps); + + // All the regexps in the FilteredRE2. + std::vector re2_vec_; + + // Has the FilteredRE2 been compiled using Compile() + bool compiled_; + + // An AND-OR tree of string atoms used for filtering regexps. 
+ std::unique_ptr prefilter_tree_; + }; + +} // namespace re2 diff --git a/re2/re2.cc b/re2/re2.cc index fa4f185a0e8d2259c9a972b394ca6c94fa372498..4bf66bcc7d425cd24bdacaee55fef5f0a80b3b4b 100644 --- a/re2/re2.cc +++ b/re2/re2.cc @@ -155,10 +155,12 @@ namespace re2 } uint32_t flags = RURE_DEFAULT_FLAGS; - if(options_.dot_nl()) flags = RURE_FLAG_DOTNL; + if (options_.dot_nl()) + flags = RURE_FLAG_DOTNL; // if(options_.never_nl()) flags = RURE_DEFAULT_FLAGS; - if(options_.encoding() == RE2::Options::EncodingLatin1){ - flags |= RURE_FLAG_UNICODE; + if (options_.encoding() == RE2::Options::EncodingLatin1) + { + flags |= RURE_FLAG_UNICODE; } // for All @@ -185,7 +187,7 @@ namespace re2 LOG(ERROR) << "Error Compile '" << pattern.data() << "':" << msg << "'"; } error_ = new std::string(msg); - error_code_ = ErrorInternal; // 暂时对这个错误进行赋值,如何处理错误类型??? + error_code_ = ErrorInternal; // 暂时对这个错误进行赋值,如何处理错误类型??? } return; } @@ -193,7 +195,7 @@ namespace re2 // for Consume and FindAndConsume suffix_regexp_ = (re2::Regexp *)rure_new((const uint8_t *)pattern.data(), pattern.size()); // for FullMatch - if(rure_str != "") + if (rure_str != "") { std::string FullMatch_rure_str = rure_str; FullMatch_rure_str.insert(0, "^("); @@ -208,11 +210,11 @@ namespace re2 //获取捕获组的数量, 并对num_captures_其进行赋值 rure_captures *caps = rure_captures_new(re); size_t captures_len = rure_captures_len(caps) - 1; - if(!options_.never_capture()) + if (!options_.never_capture()) { num_captures_ = (int)captures_len; } - else + else { num_captures_ = 0; } @@ -220,7 +222,7 @@ namespace re2 rure_captures_free(caps); rure_error_free(err); error_ = empty_string; - error_code_ = RE2::NoError; + error_code_ = RE2::NoError; } RE2::~RE2() @@ -318,7 +320,6 @@ namespace re2 } } - bool RE2::Replace(std::string *str, const RE2 &re, const StringPiece &rewrite) @@ -334,7 +335,7 @@ namespace re2 // 利用rure进行replace const char *rure_str = re.pattern_.c_str(); // 对rewrite进行处理 - const char *rure_rewrite = 
rure_rewrite_str_convert((const uint8_t*)rewrite.data(), rewrite.size()); + const char *rure_rewrite = rure_rewrite_str_convert((const uint8_t *)rewrite.data(), rewrite.size()); rure *re_rure = rure_compile((const uint8_t *)rure_str, strlen(rure_str), RURE_DEFAULT_FLAGS, NULL, NULL); const char *str_rure = rure_replace(re_rure, (const uint8_t *)str->c_str(), strlen(str->c_str()), @@ -364,7 +365,7 @@ namespace re2 if (count != 0) { // 对rewrite进行处理 - const char *rure_rewrite = rure_rewrite_str_convert((const uint8_t*)rewrite.data(), rewrite.size()); + const char *rure_rewrite = rure_rewrite_str_convert((const uint8_t *)rewrite.data(), rewrite.size()); const char *str_rure = rure_replace_all(rure_re, (const uint8_t *)str->c_str(), strlen(str->c_str()), (const uint8_t *)rure_rewrite, strlen(rure_rewrite)); *str = str_rure; @@ -442,7 +443,7 @@ namespace re2 StringPiece *submatch, int nsubmatch) const { - if(text.size() == 0 && pattern() == "") + if (text.size() == 0 && pattern() == "") { return true; } @@ -463,9 +464,9 @@ namespace re2 return false; } // 对null和empty进行处理 - if(text.data() == NULL) + if (text.data() == NULL) { - for(int i = 0; i < nsubmatch; i++) + for (int i = 0; i < nsubmatch; i++) { submatch[i] = NULL; } @@ -491,19 +492,21 @@ namespace re2 // rure *re1 = (rure *)rprog_; rure_match match = {0}; size_t length = strlen(haystack.c_str()); - if(options_.never_nl()) + if (options_.never_nl()) { std::string strs = haystack + '\n'; size_t pos = strs.find('\n'); bool flag = false; - while(pos != strs.npos) + while (pos != strs.npos) { std::string temp = strs.substr(0, pos); bool matched = rure_is_match(re, (const uint8_t *)temp.c_str(), strlen(temp.c_str()), 0); - if(matched && !nsubmatch){ + if (matched && !nsubmatch) + { return true; } - if(matched && nsubmatch){ + if (matched && nsubmatch) + { haystack = temp; length = strlen(haystack.c_str()); flag = true; @@ -512,41 +515,48 @@ namespace re2 strs = strs.substr(pos + 1, length + 1); pos = strs.find('\n'); } - 
if(!flag){return false;} + if (!flag) + { + return false; + } } // bool matched = rure_find(re, (const uint8_t *)haystack, strlen(haystack), 0, &match); // 这里没有 if(re_anchor == ANCHOR_START)原因是因为: // 只有Consume()使用了ANCHOR_START,而传入Consume()的参数通常是三个或者三个以上, // 调用Consume()时,nsubmatch不为0,因此会去执行rure_captures_new()、rure_find_captures()、rure_captures_at() - if(re_anchor == UNANCHORED) + if (re_anchor == UNANCHORED) { // bool matched = rure_find(re, (const uint8_t *)haystack.c_str(), length, 0, &match); bool matched = rure_is_match(re, (const uint8_t *)haystack.c_str(), length, 0); - if(!matched){ + if (!matched) + { return false; } - else if(!nsubmatch){ + else if (!nsubmatch) + { return true; } } - else if(re_anchor == ANCHOR_BOTH) + else if (re_anchor == ANCHOR_BOTH) { bool matched = rure_find(re, (const uint8_t *)haystack.c_str(), length, 0, &match); - if(!matched || match.start != 0 || match.end != length){ + if (!matched || match.start != 0 || match.end != length) + { return false; } - else if(!nsubmatch){ + else if (!nsubmatch) + { return true; } } - + // Demo 获取捕获组内容,存储到submatch数组中 rure_captures *caps = rure_captures_new(re); rure_find_captures(re, (const uint8_t *)haystack.c_str(), length, 0, caps); // size_t captures_len = num_captures_ + 1; - + rure_captures_at(caps, 0, &match); if (re_anchor == ANCHOR_START && match.start != 0) return false; @@ -559,13 +569,14 @@ namespace re2 size_t start = match.start; size_t end = match.end; size_t len = end - start; - if(options_.encoding() == RE2::Options::EncodingUTF8){ + if (options_.encoding() == RE2::Options::EncodingUTF8) + { submatch[i] = StringPiece(text.data() + start, static_cast(len)); } - else{ + else + { submatch[i] = StringPiece(text.data() + start, static_cast(len / 2)); } - } else { @@ -608,17 +619,17 @@ namespace re2 // RE has fewer capturing groups than number of Arg pointers passed in. 
return false; } - + // for Consume and FindAndConsume rure_match match; - if(consumed && n == 0 && + if (consumed && n == 0 && rure_consume((rure *)suffix_regexp_, (const uint8_t *)text.data(), (size_t)text.size(), &match)) { *consumed = match.end; return true; } // for FullMatch(no captures) - if(re_anchor == ANCHOR_BOTH && n == 0 && options_.encoding() == RE2::Options::EncodingUTF8) + if (re_anchor == ANCHOR_BOTH && n == 0 && options_.encoding() == RE2::Options::EncodingUTF8) { bool matched = rure_is_match((rure *)entire_regexp_, (const uint8_t *)text.data(), (size_t)text.size(), 0); return matched; @@ -700,12 +711,12 @@ namespace re2 { int num_caps = NumberOfCapturingGroups(); bool result = rure_check_rewrite_string(rewrite.data(), num_caps); - if(!result){ + if (!result) + { *error = "Rewrite schema error"; return false; } - return true; - + return true; } // Returns the maximum submatch needed for the rewrite to be done by Replace(). @@ -726,13 +737,15 @@ namespace re2 size_t len = rewrite.length(); const char *rewrites[veclen]; size_t rewrites_lengths[veclen]; - for(int i = 0; i < veclen; i++) { + for (int i = 0; i < veclen; i++) + { rewrites[i] = vec[i].data(); rewrites_lengths[i] = vec[i].size(); } - const char *result = rure_rewrite((const uint8_t *)rewrite.data(), len, (const uint8_t **)rewrites, - rewrites_lengths, (size_t)veclen); - if(result != NULL) { + const char *result = rure_rewrite((const uint8_t *)rewrite.data(), len, (const uint8_t **)rewrites, + rewrites_lengths, (size_t)veclen); + if (result != NULL) + { out->assign(result); return true; } diff --git a/re2/re2.h b/re2/re2.h index 1550dc9321cd17d1f5f026a2341a3bae88560166..ccd83b72ac966c2d963704735b83dc16de779bb5 100644 --- a/re2/re2.h +++ b/re2/re2.h @@ -216,751 +216,836 @@ #include "re2/stringpiece.h" -namespace re2 { -class Prog; -class Regexp; -} // namespace re2 - -namespace re2 { - -// Interface for regular expression matching. Also corresponds to a -// pre-compiled regular expression. 
An "RE2" object is safe for -// concurrent use by multiple threads. -class RE2 { - public: - // We convert user-passed pointers into special Arg objects - class Arg; - class Options; - - // Defined in set.h. - class Set; - - enum ErrorCode { - NoError = 0, - - // Unexpected error - ErrorInternal, - - // Parse errors - ErrorBadEscape, // bad escape sequence - ErrorBadCharClass, // bad character class - ErrorBadCharRange, // bad character class range - ErrorMissingBracket, // missing closing ] - ErrorMissingParen, // missing closing ) - ErrorUnexpectedParen, // unexpected closing ) - ErrorTrailingBackslash, // trailing \ at end of regexp - ErrorRepeatArgument, // repeat argument missing, e.g. "*" - ErrorRepeatSize, // bad repetition argument - ErrorRepeatOp, // bad repetition operator - ErrorBadPerlOp, // bad perl operator - ErrorBadUTF8, // invalid UTF-8 in regexp - ErrorBadNamedCapture, // bad named capture group - ErrorPatternTooLarge // pattern too large (compile failed) - }; +namespace re2 +{ + class Prog; + class Regexp; +} // namespace re2 + +namespace re2 +{ + + // Interface for regular expression matching. Also corresponds to a + // pre-compiled regular expression. An "RE2" object is safe for + // concurrent use by multiple threads. + class RE2 + { + public: + // We convert user-passed pointers into special Arg objects + class Arg; + class Options; + + // Defined in set.h. + class Set; + + enum ErrorCode + { + NoError = 0, + + // Unexpected error + ErrorInternal, + + // Parse errors + ErrorBadEscape, // bad escape sequence + ErrorBadCharClass, // bad character class + ErrorBadCharRange, // bad character class range + ErrorMissingBracket, // missing closing ] + ErrorMissingParen, // missing closing ) + ErrorUnexpectedParen, // unexpected closing ) + ErrorTrailingBackslash, // trailing \ at end of regexp + ErrorRepeatArgument, // repeat argument missing, e.g. 
"*" + ErrorRepeatSize, // bad repetition argument + ErrorRepeatOp, // bad repetition operator + ErrorBadPerlOp, // bad perl operator + ErrorBadUTF8, // invalid UTF-8 in regexp + ErrorBadNamedCapture, // bad named capture group + ErrorPatternTooLarge // pattern too large (compile failed) + }; - // Predefined common options. - // If you need more complicated things, instantiate - // an Option class, possibly passing one of these to - // the Option constructor, change the settings, and pass that - // Option class to the RE2 constructor. - enum CannedOptions { - DefaultOptions = 0, - Latin1, // treat input as Latin-1 (default UTF-8) - POSIX, // POSIX syntax, leftmost-longest match - Quiet // do not log about regexp parse errors - }; + // Predefined common options. + // If you need more complicated things, instantiate + // an Option class, possibly passing one of these to + // the Option constructor, change the settings, and pass that + // Option class to the RE2 constructor. + enum CannedOptions + { + DefaultOptions = 0, + Latin1, // treat input as Latin-1 (default UTF-8) + POSIX, // POSIX syntax, leftmost-longest match + Quiet // do not log about regexp parse errors + }; - // Need to have the const char* and const std::string& forms for implicit - // conversions when passing string literals to FullMatch and PartialMatch. - // Otherwise the StringPiece form would be sufficient. + // Need to have the const char* and const std::string& forms for implicit + // conversions when passing string literals to FullMatch and PartialMatch. + // Otherwise the StringPiece form would be sufficient. #ifndef SWIG - RE2(const char* pattern); - RE2(const std::string& pattern); + RE2(const char *pattern); + RE2(const std::string &pattern); #endif - RE2(const StringPiece& pattern); - RE2(const StringPiece& pattern, const Options& options); - ~RE2(); - - // Returns whether RE2 was created properly. 
- bool ok() const { return error_code() == NoError; } - - // The string specification for this RE2. E.g. - // RE2 re("ab*c?d+"); - // re.pattern(); // "ab*c?d+" - const std::string& pattern() const { return pattern_; } - - // If RE2 could not be created properly, returns an error string. - // Else returns the empty string. - const std::string& error() const { return *error_; } - - // If RE2 could not be created properly, returns an error code. - // Else returns RE2::NoError (== 0). - ErrorCode error_code() const { return error_code_; } - - // If RE2 could not be created properly, returns the offending - // portion of the regexp. - const std::string& error_arg() const { return error_arg_; } - - // Returns the program size, a very approximate measure of a regexp's "cost". - // Larger numbers are more expensive than smaller numbers. - int ProgramSize() const; - int ReverseProgramSize() const; - - // If histogram is not null, outputs the program fanout - // as a histogram bucketed by powers of 2. - // Returns the number of the largest non-empty bucket. - int ProgramFanout(std::vector* histogram) const; - int ReverseProgramFanout(std::vector* histogram) const; - - // Returns the underlying Regexp; not for general use. - // Returns entire_regexp_ so that callers don't need - // to know about prefix_ and prefix_foldcase_. - re2::Regexp* Regexp() const { return entire_regexp_; } - - /***** The array-based matching interface ******/ - - // The functions here have names ending in 'N' and are used to implement - // the functions whose names are the prefix before the 'N'. It is sometimes - // useful to invoke them directly, but the syntax is awkward, so the 'N'-less - // versions should be preferred. 
- static bool FullMatchN(const StringPiece& text, const RE2& re, - const Arg* const args[], int n); - static bool PartialMatchN(const StringPiece& text, const RE2& re, - const Arg* const args[], int n); - static bool ConsumeN(StringPiece* input, const RE2& re, - const Arg* const args[], int n); - static bool FindAndConsumeN(StringPiece* input, const RE2& re, - const Arg* const args[], int n); + RE2(const StringPiece &pattern); + RE2(const StringPiece &pattern, const Options &options); + ~RE2(); + + // Returns whether RE2 was created properly. + bool ok() const { return error_code() == NoError; } + + // The string specification for this RE2. E.g. + // RE2 re("ab*c?d+"); + // re.pattern(); // "ab*c?d+" + const std::string &pattern() const { return pattern_; } + + // If RE2 could not be created properly, returns an error string. + // Else returns the empty string. + const std::string &error() const { return *error_; } + + // If RE2 could not be created properly, returns an error code. + // Else returns RE2::NoError (== 0). + ErrorCode error_code() const { return error_code_; } + + // If RE2 could not be created properly, returns the offending + // portion of the regexp. + const std::string &error_arg() const { return error_arg_; } + + // Returns the program size, a very approximate measure of a regexp's "cost". + // Larger numbers are more expensive than smaller numbers. + int ProgramSize() const; + int ReverseProgramSize() const; + + // If histogram is not null, outputs the program fanout + // as a histogram bucketed by powers of 2. + // Returns the number of the largest non-empty bucket. + int ProgramFanout(std::vector *histogram) const; + int ReverseProgramFanout(std::vector *histogram) const; + + // Returns the underlying Regexp; not for general use. + // Returns entire_regexp_ so that callers don't need + // to know about prefix_ and prefix_foldcase_. 
+ re2::Regexp *Regexp() const { return entire_regexp_; } + + /***** The array-based matching interface ******/ + + // The functions here have names ending in 'N' and are used to implement + // the functions whose names are the prefix before the 'N'. It is sometimes + // useful to invoke them directly, but the syntax is awkward, so the 'N'-less + // versions should be preferred. + static bool FullMatchN(const StringPiece &text, const RE2 &re, + const Arg *const args[], int n); + static bool PartialMatchN(const StringPiece &text, const RE2 &re, + const Arg *const args[], int n); + static bool ConsumeN(StringPiece *input, const RE2 &re, + const Arg *const args[], int n); + static bool FindAndConsumeN(StringPiece *input, const RE2 &re, + const Arg *const args[], int n); #ifndef SWIG - private: - template - static inline bool Apply(F f, SP sp, const RE2& re) { - return f(sp, re, NULL, 0); - } + private: + template + static inline bool Apply(F f, SP sp, const RE2 &re) + { + return f(sp, re, NULL, 0); + } - template - static inline bool Apply(F f, SP sp, const RE2& re, const A&... a) { - const Arg* const args[] = {&a...}; - const int n = sizeof...(a); - return f(sp, re, args, n); - } + template + static inline bool Apply(F f, SP sp, const RE2 &re, const A &...a) + { + const Arg *const args[] = {&a...}; + const int n = sizeof...(a); + return f(sp, re, args, n); + } - public: - // In order to allow FullMatch() et al. to be called with a varying number - // of arguments of varying types, we use two layers of variadic templates. - // The first layer constructs the temporary Arg objects. The second layer - // (above) constructs the array of pointers to the temporary Arg objects. + public: + // In order to allow FullMatch() et al. to be called with a varying number + // of arguments of varying types, we use two layers of variadic templates. + // The first layer constructs the temporary Arg objects. 
The second layer + // (above) constructs the array of pointers to the temporary Arg objects. - /***** The useful part: the matching interface *****/ + /***** The useful part: the matching interface *****/ - // Matches "text" against "re". If pointer arguments are - // supplied, copies matched sub-patterns into them. - // - // You can pass in a "const char*" or a "std::string" for "text". - // You can pass in a "const char*" or a "std::string" or a "RE2" for "re". - // - // The provided pointer arguments can be pointers to any scalar numeric - // type, or one of: - // std::string (matched piece is copied to string) - // StringPiece (StringPiece is mutated to point to matched piece) - // T (where "bool T::ParseFrom(const char*, size_t)" exists) - // (void*)NULL (the corresponding matched sub-pattern is not copied) - // - // Returns true iff all of the following conditions are satisfied: - // a. "text" matches "re" fully - from the beginning to the end of "text". - // b. The number of matched sub-patterns is >= number of supplied pointers. - // c. The "i"th argument has a suitable type for holding the - // string captured as the "i"th sub-pattern. If you pass in - // NULL for the "i"th argument, or pass fewer arguments than - // number of sub-patterns, the "i"th captured sub-pattern is - // ignored. - // - // CAVEAT: An optional sub-pattern that does not exist in the - // matched string is assigned the empty string. Therefore, the - // following will return false (because the empty string is not a - // valid number): - // int number; - // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number); - template - static bool FullMatch(const StringPiece& text, const RE2& re, A&&... a) { - return Apply(FullMatchN, text, re, Arg(std::forward(a))...); - } + // Matches "text" against "re". If pointer arguments are + // supplied, copies matched sub-patterns into them. + // + // You can pass in a "const char*" or a "std::string" for "text". 
+ // You can pass in a "const char*" or a "std::string" or a "RE2" for "re". + // + // The provided pointer arguments can be pointers to any scalar numeric + // type, or one of: + // std::string (matched piece is copied to string) + // StringPiece (StringPiece is mutated to point to matched piece) + // T (where "bool T::ParseFrom(const char*, size_t)" exists) + // (void*)NULL (the corresponding matched sub-pattern is not copied) + // + // Returns true iff all of the following conditions are satisfied: + // a. "text" matches "re" fully - from the beginning to the end of "text". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. + // + // CAVEAT: An optional sub-pattern that does not exist in the + // matched string is assigned the empty string. Therefore, the + // following will return false (because the empty string is not a + // valid number): + // int number; + // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number); + template + static bool FullMatch(const StringPiece &text, const RE2 &re, A &&...a) + { + return Apply(FullMatchN, text, re, Arg(std::forward(a))...); + } - // Like FullMatch(), except that "re" is allowed to match a substring - // of "text". - // - // Returns true iff all of the following conditions are satisfied: - // a. "text" matches "re" partially - for some substring of "text". - // b. The number of matched sub-patterns is >= number of supplied pointers. - // c. The "i"th argument has a suitable type for holding the - // string captured as the "i"th sub-pattern. If you pass in - // NULL for the "i"th argument, or pass fewer arguments than - // number of sub-patterns, the "i"th captured sub-pattern is - // ignored. 
- template - static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) { - return Apply(PartialMatchN, text, re, Arg(std::forward(a))...); - } + // Like FullMatch(), except that "re" is allowed to match a substring + // of "text". + // + // Returns true iff all of the following conditions are satisfied: + // a. "text" matches "re" partially - for some substring of "text". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. + template + static bool PartialMatch(const StringPiece &text, const RE2 &re, A &&...a) + { + return Apply(PartialMatchN, text, re, Arg(std::forward(a))...); + } - // Like FullMatch() and PartialMatch(), except that "re" has to match - // a prefix of the text, and "input" is advanced past the matched - // text. Note: "input" is modified iff this routine returns true - // and "re" matched a non-empty substring of "input". - // - // Returns true iff all of the following conditions are satisfied: - // a. "input" matches "re" partially - for some prefix of "input". - // b. The number of matched sub-patterns is >= number of supplied pointers. - // c. The "i"th argument has a suitable type for holding the - // string captured as the "i"th sub-pattern. If you pass in - // NULL for the "i"th argument, or pass fewer arguments than - // number of sub-patterns, the "i"th captured sub-pattern is - // ignored. - template - static bool Consume(StringPiece* input, const RE2& re, A&&... a) { - return Apply(ConsumeN, input, re, Arg(std::forward(a))...); - } + // Like FullMatch() and PartialMatch(), except that "re" has to match + // a prefix of the text, and "input" is advanced past the matched + // text. 
Note: "input" is modified iff this routine returns true + // and "re" matched a non-empty substring of "input". + // + // Returns true iff all of the following conditions are satisfied: + // a. "input" matches "re" partially - for some prefix of "input". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. + template + static bool Consume(StringPiece *input, const RE2 &re, A &&...a) + { + return Apply(ConsumeN, input, re, Arg(std::forward(a))...); + } - // Like Consume(), but does not anchor the match at the beginning of - // the text. That is, "re" need not start its match at the beginning - // of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds - // the next word in "s" and stores it in "word". - // - // Returns true iff all of the following conditions are satisfied: - // a. "input" matches "re" partially - for some substring of "input". - // b. The number of matched sub-patterns is >= number of supplied pointers. - // c. The "i"th argument has a suitable type for holding the - // string captured as the "i"th sub-pattern. If you pass in - // NULL for the "i"th argument, or pass fewer arguments than - // number of sub-patterns, the "i"th captured sub-pattern is - // ignored. - template - static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) { - return Apply(FindAndConsumeN, input, re, Arg(std::forward(a))...); - } + // Like Consume(), but does not anchor the match at the beginning of + // the text. That is, "re" need not start its match at the beginning + // of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds + // the next word in "s" and stores it in "word". 
+ // + // Returns true iff all of the following conditions are satisfied: + // a. "input" matches "re" partially - for some substring of "input". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. + template + static bool FindAndConsume(StringPiece *input, const RE2 &re, A &&...a) + { + return Apply(FindAndConsumeN, input, re, Arg(std::forward(a))...); + } #endif - // Replace the first match of "re" in "str" with "rewrite". - // Within "rewrite", backslash-escaped digits (\1 to \9) can be - // used to insert text matching corresponding parenthesized group - // from the pattern. \0 in "rewrite" refers to the entire matching - // text. E.g., - // - // std::string s = "yabba dabba doo"; - // CHECK(RE2::Replace(&s, "b+", "d")); - // - // will leave "s" containing "yada dabba doo" - // - // Returns true if the pattern matches and a replacement occurs, - // false otherwise. - static bool Replace(std::string* str, - const RE2& re, - const StringPiece& rewrite); - - // Like Replace(), except replaces successive non-overlapping occurrences - // of the pattern in the string with the rewrite. E.g. - // - // std::string s = "yabba dabba doo"; - // CHECK(RE2::GlobalReplace(&s, "b+", "d")); - // - // will leave "s" containing "yada dada doo" - // Replacements are not subject to re-matching. - // - // Because GlobalReplace only replaces non-overlapping matches, - // replacing "ana" within "banana" makes only one replacement, not two. - // - // Returns the number of replacements made. - static int GlobalReplace(std::string* str, - const RE2& re, - const StringPiece& rewrite); - - // Like Replace, except that if the pattern matches, "rewrite" - // is copied into "out" with substitutions. 
The non-matching - // portions of "text" are ignored. - // - // Returns true iff a match occurred and the extraction happened - // successfully; if no match occurs, the string is left unaffected. - // - // REQUIRES: "text" must not alias any part of "*out". - static bool Extract(const StringPiece& text, - const RE2& re, - const StringPiece& rewrite, - std::string* out); - - // Escapes all potentially meaningful regexp characters in - // 'unquoted'. The returned string, used as a regular expression, - // will match exactly the original string. For example, - // 1.5-2.0? - // may become: - // 1\.5\-2\.0\? - static std::string QuoteMeta(const StringPiece& unquoted); - - // Computes range for any strings matching regexp. The min and max can in - // some cases be arbitrarily precise, so the caller gets to specify the - // maximum desired length of string returned. - // - // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any - // string s that is an anchored match for this regexp satisfies - // min <= s && s <= max. - // - // Note that PossibleMatchRange() will only consider the first copy of an - // infinitely repeated element (i.e., any regexp element followed by a '*' or - // '+' operator). Regexps with "{N}" constructions are not affected, as those - // do not compile down to infinite repetitions. - // - // Returns true on success, false on error. - bool PossibleMatchRange(std::string* min, std::string* max, - int maxlen) const; - - // Generic matching interface - - // Type of match. - enum Anchor { - UNANCHORED, // No anchoring - ANCHOR_START, // Anchor at start only - ANCHOR_BOTH // Anchor at start and end - }; - - // Return the number of capturing subpatterns, or -1 if the - // regexp wasn't valid on construction. The overall match ($0) - // does not count: if the regexp is "(a)(b)", returns 2. - int NumberOfCapturingGroups() const { return num_captures_; } - - // Return a map from names to capturing indices. 
- // The map records the index of the leftmost group - // with the given name. - // Only valid until the re is deleted. - const std::map& NamedCapturingGroups() const; - - // Return a map from capturing indices to names. - // The map has no entries for unnamed groups. - // Only valid until the re is deleted. - const std::map& CapturingGroupNames() const; - - // General matching routine. - // Match against text starting at offset startpos - // and stopping the search at offset endpos. - // Returns true if match found, false if not. - // On a successful match, fills in submatch[] (up to nsubmatch entries) - // with information about submatches. - // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, with - // submatch[0] = "barbaz", submatch[1].data() = NULL, submatch[2] = "bar", - // submatch[3].data() = NULL, ..., up to submatch[nsubmatch-1].data() = NULL. - // Caveat: submatch[] may be clobbered even on match failure. - // - // Don't ask for more match information than you will use: - // runs much faster with nsubmatch == 1 than nsubmatch > 1, and - // runs even faster if nsubmatch == 0. - // Doesn't make sense to use nsubmatch > 1 + NumberOfCapturingGroups(), - // but will be handled correctly. - // - // Passing text == StringPiece(NULL, 0) will be handled like any other - // empty string, but note that on return, it will not be possible to tell - // whether submatch i matched the empty string or did not match: - // either way, submatch[i].data() == NULL. - bool Match(const StringPiece& text, - size_t startpos, - size_t endpos, - Anchor re_anchor, - StringPiece* submatch, - int nsubmatch) const; - - // Check that the given rewrite string is suitable for use with this - // regular expression. It checks that: - // * The regular expression has enough parenthesized subexpressions - // to satisfy all of the \N tokens in rewrite - // * The rewrite string doesn't have any syntax errors. E.g., - // '\' followed by anything other than a digit or '\'. 
- // A true return value guarantees that Replace() and Extract() won't - // fail because of a bad rewrite string. - bool CheckRewriteString(const StringPiece& rewrite, - std::string* error) const; - - // Returns the maximum submatch needed for the rewrite to be done by - // Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2. - static int MaxSubmatch(const StringPiece& rewrite); - - // Append the "rewrite" string, with backslash subsitutions from "vec", - // to string "out". - // Returns true on success. This method can fail because of a malformed - // rewrite string. CheckRewriteString guarantees that the rewrite will - // be sucessful. - bool Rewrite(std::string* out, - const StringPiece& rewrite, - const StringPiece* vec, - int veclen) const; - - // Constructor options - class Options { - public: - // The options are (defaults in parentheses): + // Replace the first match of "re" in "str" with "rewrite". + // Within "rewrite", backslash-escaped digits (\1 to \9) can be + // used to insert text matching corresponding parenthesized group + // from the pattern. \0 in "rewrite" refers to the entire matching + // text. E.g., // - // utf8 (true) text and pattern are UTF-8; otherwise Latin-1 - // posix_syntax (false) restrict regexps to POSIX egrep syntax - // longest_match (false) search for longest match, not first match - // log_errors (true) log syntax and execution errors to ERROR - // max_mem (see below) approx. max memory footprint of RE2 - // literal (false) interpret string as literal, not regexp - // never_nl (false) never match \n, even if it is in regexp - // dot_nl (false) dot matches everything including new line - // never_capture (false) parse all parens as non-capturing - // case_sensitive (true) match is case-sensitive (regexp can override - // with (?i) unless in posix_syntax mode) + // std::string s = "yabba dabba doo"; + // CHECK(RE2::Replace(&s, "b+", "d")); // - // The following options are only consulted when posix_syntax == true. 
- // When posix_syntax == false, these features are always enabled and - // cannot be turned off; to perform multi-line matching in that case, - // begin the regexp with (?m). - // perl_classes (false) allow Perl's \d \s \w \D \S \W - // word_boundary (false) allow Perl's \b \B (word boundary and not) - // one_line (false) ^ and $ only match beginning and end of text + // will leave "s" containing "yada dabba doo" // - // The max_mem option controls how much memory can be used - // to hold the compiled form of the regexp (the Prog) and - // its cached DFA graphs. Code Search placed limits on the number - // of Prog instructions and DFA states: 10,000 for both. - // In RE2, those limits would translate to about 240 KB per Prog - // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a - // better job of keeping them small than Code Search did). - // Each RE2 has two Progs (one forward, one reverse), and each Prog - // can have two DFAs (one first match, one longest match). - // That makes 4 DFAs: + // Returns true if the pattern matches and a replacement occurs, + // false otherwise. + static bool Replace(std::string *str, + const RE2 &re, + const StringPiece &rewrite); + + // Like Replace(), except replaces successive non-overlapping occurrences + // of the pattern in the string with the rewrite. E.g. // - // forward, first-match - used for UNANCHORED or ANCHOR_START searches - // if opt.longest_match() == false - // forward, longest-match - used for all ANCHOR_BOTH searches, - // and the other two kinds if - // opt.longest_match() == true - // reverse, first-match - never used - // reverse, longest-match - used as second phase for unanchored searches + // std::string s = "yabba dabba doo"; + // CHECK(RE2::GlobalReplace(&s, "b+", "d")); // - // The RE2 memory budget is statically divided between the two - // Progs and then the DFAs: two thirds to the forward Prog - // and one third to the reverse Prog. 
The forward Prog gives half - // of what it has left over to each of its DFAs. The reverse Prog - // gives it all to its longest-match DFA. + // will leave "s" containing "yada dada doo" + // Replacements are not subject to re-matching. // - // Once a DFA fills its budget, it flushes its cache and starts over. - // If this happens too often, RE2 falls back on the NFA implementation. - - // For now, make the default budget something close to Code Search. - static const int kDefaultMaxMem = 8<<20; - - enum Encoding { - EncodingUTF8 = 1, - EncodingLatin1 + // Because GlobalReplace only replaces non-overlapping matches, + // replacing "ana" within "banana" makes only one replacement, not two. + // + // Returns the number of replacements made. + static int GlobalReplace(std::string *str, + const RE2 &re, + const StringPiece &rewrite); + + // Like Replace, except that if the pattern matches, "rewrite" + // is copied into "out" with substitutions. The non-matching + // portions of "text" are ignored. + // + // Returns true iff a match occurred and the extraction happened + // successfully; if no match occurs, the string is left unaffected. + // + // REQUIRES: "text" must not alias any part of "*out". + static bool Extract(const StringPiece &text, + const RE2 &re, + const StringPiece &rewrite, + std::string *out); + + // Escapes all potentially meaningful regexp characters in + // 'unquoted'. The returned string, used as a regular expression, + // will match exactly the original string. For example, + // 1.5-2.0? + // may become: + // 1\.5\-2\.0\? + static std::string QuoteMeta(const StringPiece &unquoted); + + // Computes range for any strings matching regexp. The min and max can in + // some cases be arbitrarily precise, so the caller gets to specify the + // maximum desired length of string returned. + // + // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any + // string s that is an anchored match for this regexp satisfies + // min <= s && s <= max. 
+ // + // Note that PossibleMatchRange() will only consider the first copy of an + // infinitely repeated element (i.e., any regexp element followed by a '*' or + // '+' operator). Regexps with "{N}" constructions are not affected, as those + // do not compile down to infinite repetitions. + // + // Returns true on success, false on error. + bool PossibleMatchRange(std::string *min, std::string *max, + int maxlen) const; + + // Generic matching interface + + // Type of match. + enum Anchor + { + UNANCHORED, // No anchoring + ANCHOR_START, // Anchor at start only + ANCHOR_BOTH // Anchor at start and end }; - Options() : - encoding_(EncodingUTF8), - posix_syntax_(false), - longest_match_(false), - log_errors_(true), - max_mem_(kDefaultMaxMem), - literal_(false), - never_nl_(false), - dot_nl_(false), - never_capture_(false), - case_sensitive_(true), - perl_classes_(false), - word_boundary_(false), - one_line_(false) { - } + // Return the number of capturing subpatterns, or -1 if the + // regexp wasn't valid on construction. The overall match ($0) + // does not count: if the regexp is "(a)(b)", returns 2. + int NumberOfCapturingGroups() const { return num_captures_; } + + // Return a map from names to capturing indices. + // The map records the index of the leftmost group + // with the given name. + // Only valid until the re is deleted. + const std::map &NamedCapturingGroups() const; + + // Return a map from capturing indices to names. + // The map has no entries for unnamed groups. + // Only valid until the re is deleted. + const std::map &CapturingGroupNames() const; + + // General matching routine. + // Match against text starting at offset startpos + // and stopping the search at offset endpos. + // Returns true if match found, false if not. + // On a successful match, fills in submatch[] (up to nsubmatch entries) + // with information about submatches. + // I.e. 
matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, with + // submatch[0] = "barbaz", submatch[1].data() = NULL, submatch[2] = "bar", + // submatch[3].data() = NULL, ..., up to submatch[nsubmatch-1].data() = NULL. + // Caveat: submatch[] may be clobbered even on match failure. + // + // Don't ask for more match information than you will use: + // runs much faster with nsubmatch == 1 than nsubmatch > 1, and + // runs even faster if nsubmatch == 0. + // Doesn't make sense to use nsubmatch > 1 + NumberOfCapturingGroups(), + // but will be handled correctly. + // + // Passing text == StringPiece(NULL, 0) will be handled like any other + // empty string, but note that on return, it will not be possible to tell + // whether submatch i matched the empty string or did not match: + // either way, submatch[i].data() == NULL. + bool Match(const StringPiece &text, + size_t startpos, + size_t endpos, + Anchor re_anchor, + StringPiece *submatch, + int nsubmatch) const; + + // Check that the given rewrite string is suitable for use with this + // regular expression. It checks that: + // * The regular expression has enough parenthesized subexpressions + // to satisfy all of the \N tokens in rewrite + // * The rewrite string doesn't have any syntax errors. E.g., + // '\' followed by anything other than a digit or '\'. + // A true return value guarantees that Replace() and Extract() won't + // fail because of a bad rewrite string. + bool CheckRewriteString(const StringPiece &rewrite, + std::string *error) const; + + // Returns the maximum submatch needed for the rewrite to be done by + // Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2. + static int MaxSubmatch(const StringPiece &rewrite); + + // Append the "rewrite" string, with backslash substitutions from "vec", + // to string "out". + // Returns true on success. This method can fail because of a malformed + // rewrite string. CheckRewriteString guarantees that the rewrite will + // be successful.
+ bool Rewrite(std::string *out, + const StringPiece &rewrite, + const StringPiece *vec, + int veclen) const; + + // Constructor options + class Options + { + public: + // The options are (defaults in parentheses): + // + // utf8 (true) text and pattern are UTF-8; otherwise Latin-1 + // posix_syntax (false) restrict regexps to POSIX egrep syntax + // longest_match (false) search for longest match, not first match + // log_errors (true) log syntax and execution errors to ERROR + // max_mem (see below) approx. max memory footprint of RE2 + // literal (false) interpret string as literal, not regexp + // never_nl (false) never match \n, even if it is in regexp + // dot_nl (false) dot matches everything including new line + // never_capture (false) parse all parens as non-capturing + // case_sensitive (true) match is case-sensitive (regexp can override + // with (?i) unless in posix_syntax mode) + // + // The following options are only consulted when posix_syntax == true. + // When posix_syntax == false, these features are always enabled and + // cannot be turned off; to perform multi-line matching in that case, + // begin the regexp with (?m). + // perl_classes (false) allow Perl's \d \s \w \D \S \W + // word_boundary (false) allow Perl's \b \B (word boundary and not) + // one_line (false) ^ and $ only match beginning and end of text + // + // The max_mem option controls how much memory can be used + // to hold the compiled form of the regexp (the Prog) and + // its cached DFA graphs. Code Search placed limits on the number + // of Prog instructions and DFA states: 10,000 for both. + // In RE2, those limits would translate to about 240 KB per Prog + // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a + // better job of keeping them small than Code Search did). + // Each RE2 has two Progs (one forward, one reverse), and each Prog + // can have two DFAs (one first match, one longest match). 
+ // That makes 4 DFAs: + // + // forward, first-match - used for UNANCHORED or ANCHOR_START searches + // if opt.longest_match() == false + // forward, longest-match - used for all ANCHOR_BOTH searches, + // and the other two kinds if + // opt.longest_match() == true + // reverse, first-match - never used + // reverse, longest-match - used as second phase for unanchored searches + // + // The RE2 memory budget is statically divided between the two + // Progs and then the DFAs: two thirds to the forward Prog + // and one third to the reverse Prog. The forward Prog gives half + // of what it has left over to each of its DFAs. The reverse Prog + // gives it all to its longest-match DFA. + // + // Once a DFA fills its budget, it flushes its cache and starts over. + // If this happens too often, RE2 falls back on the NFA implementation. + + // For now, make the default budget something close to Code Search. + static const int kDefaultMaxMem = 8 << 20; + + enum Encoding + { + EncodingUTF8 = 1, + EncodingLatin1 + }; + + Options() : encoding_(EncodingUTF8), + posix_syntax_(false), + longest_match_(false), + log_errors_(true), + max_mem_(kDefaultMaxMem), + literal_(false), + never_nl_(false), + dot_nl_(false), + never_capture_(false), + case_sensitive_(true), + perl_classes_(false), + word_boundary_(false), + one_line_(false) + { + } + + /*implicit*/ Options(CannedOptions); + + Encoding encoding() const { return encoding_; } + void set_encoding(Encoding encoding) { encoding_ = encoding; } + + bool posix_syntax() const { return posix_syntax_; } + void set_posix_syntax(bool b) { posix_syntax_ = b; } + + bool longest_match() const { return longest_match_; } + void set_longest_match(bool b) { longest_match_ = b; } + + bool log_errors() const { return log_errors_; } + void set_log_errors(bool b) { log_errors_ = b; } + + int64_t max_mem() const { return max_mem_; } + void set_max_mem(int64_t m) { max_mem_ = m; } + + bool literal() const { return literal_; } + void 
set_literal(bool b) { literal_ = b; } + + bool never_nl() const { return never_nl_; } + void set_never_nl(bool b) { never_nl_ = b; } + + bool dot_nl() const { return dot_nl_; } + void set_dot_nl(bool b) { dot_nl_ = b; } + + bool never_capture() const { return never_capture_; } + void set_never_capture(bool b) { never_capture_ = b; } + + bool case_sensitive() const { return case_sensitive_; } + void set_case_sensitive(bool b) { case_sensitive_ = b; } + + bool perl_classes() const { return perl_classes_; } + void set_perl_classes(bool b) { perl_classes_ = b; } + + bool word_boundary() const { return word_boundary_; } + void set_word_boundary(bool b) { word_boundary_ = b; } + + bool one_line() const { return one_line_; } + void set_one_line(bool b) { one_line_ = b; } + + void Copy(const Options &src) + { + *this = src; + } + + int ParseFlags() const; + + private: + Encoding encoding_; + bool posix_syntax_; + bool longest_match_; + bool log_errors_; + int64_t max_mem_; + bool literal_; + bool never_nl_; + bool dot_nl_; + bool never_capture_; + bool case_sensitive_; + bool perl_classes_; + bool word_boundary_; + bool one_line_; + }; - /*implicit*/ Options(CannedOptions); + // Returns the options set in the constructor. + const Options &options() const { return options_; } + + // Argument converters; see below. 
+ template + static Arg CRadix(T *ptr); + template + static Arg Hex(T *ptr); + template + static Arg Octal(T *ptr); + + private: + void Init(const StringPiece &pattern, const Options &options); + + bool DoMatch(const StringPiece &text, + Anchor re_anchor, + size_t *consumed, + const Arg *const args[], + int n) const; + + re2::Prog *ReverseProg() const; + + std::string pattern_; // string regular expression + Options options_; // option flags + re2::Regexp *entire_regexp_; // parsed regular expression + const std::string *error_; // error indicator (or points to empty string) + ErrorCode error_code_; // error code + std::string error_arg_; // fragment of regexp showing error + std::string prefix_; // required prefix (before suffix_regexp_) + bool prefix_foldcase_; // prefix_ is ASCII case-insensitive + re2::Regexp *suffix_regexp_; // parsed regular expression, prefix_ removed + re2::Prog *prog_; // compiled program for regexp + int num_captures_; // number of capturing groups + bool is_one_pass_; // can use prog_->SearchOnePass? + + // Reverse Prog for DFA execution only + mutable re2::Prog *rprog_; + // Map from capture names to indices + mutable const std::map *named_groups_; + // Map from capture indices to names + mutable const std::map *group_names_; + + mutable std::once_flag rprog_once_; + mutable std::once_flag named_groups_once_; + mutable std::once_flag group_names_once_; + + RE2(const RE2 &) = delete; + RE2 &operator=(const RE2 &) = delete; + }; - Encoding encoding() const { return encoding_; } - void set_encoding(Encoding encoding) { encoding_ = encoding; } + /***** Implementation details *****/ - bool posix_syntax() const { return posix_syntax_; } - void set_posix_syntax(bool b) { posix_syntax_ = b; } + namespace re2_internal + { - bool longest_match() const { return longest_match_; } - void set_longest_match(bool b) { longest_match_ = b; } + // Types for which the 3-ary Parse() function template has specializations. 
+ template + struct Parse3ary : public std::false_type + { + }; + template <> + struct Parse3ary : public std::true_type + { + }; + template <> + struct Parse3ary : public std::true_type + { + }; + template <> + struct Parse3ary : public std::true_type + { + }; + template <> + struct Parse3ary : public std::true_type + { + }; + template <> + struct Parse3ary : public std::true_type + { + }; + template <> + struct Parse3ary : public std::true_type + { + }; + template <> + struct Parse3ary : public std::true_type + { + }; + template <> + struct Parse3ary : public std::true_type + { + }; - bool log_errors() const { return log_errors_; } - void set_log_errors(bool b) { log_errors_ = b; } + template + bool Parse(const char *str, size_t n, T *dest); - int64_t max_mem() const { return max_mem_; } - void set_max_mem(int64_t m) { max_mem_ = m; } + // Types for which the 4-ary Parse() function template has specializations. + template + struct Parse4ary : public std::false_type + { + }; + template <> + struct Parse4ary : public std::true_type + { + }; + template <> + struct Parse4ary : public std::true_type + { + }; + template <> + struct Parse4ary : public std::true_type + { + }; + template <> + struct Parse4ary : public std::true_type + { + }; + template <> + struct Parse4ary : public std::true_type + { + }; + template <> + struct Parse4ary : public std::true_type + { + }; + template <> + struct Parse4ary : public std::true_type + { + }; + template <> + struct Parse4ary : public std::true_type + { + }; - bool literal() const { return literal_; } - void set_literal(bool b) { literal_ = b; } + template + bool Parse(const char *str, size_t n, T *dest, int radix); - bool never_nl() const { return never_nl_; } - void set_never_nl(bool b) { never_nl_ = b; } + } // namespace re2_internal - bool dot_nl() const { return dot_nl_; } - void set_dot_nl(bool b) { dot_nl_ = b; } + class RE2::Arg + { + private: + template + using CanParse3ary = typename std::enable_if< + 
re2_internal::Parse3ary::value, + int>::type; - bool never_capture() const { return never_capture_; } - void set_never_capture(bool b) { never_capture_ = b; } + template + using CanParse4ary = typename std::enable_if< + re2_internal::Parse4ary::value, + int>::type; - bool case_sensitive() const { return case_sensitive_; } - void set_case_sensitive(bool b) { case_sensitive_ = b; } +#if !defined(_MSC_VER) + template + using CanParseFrom = typename std::enable_if< + std::is_member_function_pointer< + decltype(static_cast( + &T::ParseFrom))>::value, + int>::type; +#endif - bool perl_classes() const { return perl_classes_; } - void set_perl_classes(bool b) { perl_classes_ = b; } + public: + Arg() : Arg(nullptr) {} + Arg(std::nullptr_t ptr) : arg_(ptr), parser_(DoNothing) {} - bool word_boundary() const { return word_boundary_; } - void set_word_boundary(bool b) { word_boundary_ = b; } + template = 0> + Arg(T *ptr) : arg_(ptr), parser_(DoParse3ary) {} - bool one_line() const { return one_line_; } - void set_one_line(bool b) { one_line_ = b; } + template = 0> + Arg(T *ptr) : arg_(ptr), parser_(DoParse4ary) {} - void Copy(const Options& src) { - *this = src; +#if !defined(_MSC_VER) + template = 0> + Arg(T *ptr) : arg_(ptr), parser_(DoParseFrom) + { } +#endif - int ParseFlags() const; - - private: - Encoding encoding_; - bool posix_syntax_; - bool longest_match_; - bool log_errors_; - int64_t max_mem_; - bool literal_; - bool never_nl_; - bool dot_nl_; - bool never_capture_; - bool case_sensitive_; - bool perl_classes_; - bool word_boundary_; - bool one_line_; - }; - - // Returns the options set in the constructor. - const Options& options() const { return options_; } - - // Argument converters; see below. 
- template - static Arg CRadix(T* ptr); - template - static Arg Hex(T* ptr); - template - static Arg Octal(T* ptr); - - private: - void Init(const StringPiece& pattern, const Options& options); - - bool DoMatch(const StringPiece& text, - Anchor re_anchor, - size_t* consumed, - const Arg* const args[], - int n) const; - - re2::Prog* ReverseProg() const; - - std::string pattern_; // string regular expression - Options options_; // option flags - re2::Regexp* entire_regexp_; // parsed regular expression - const std::string* error_; // error indicator (or points to empty string) - ErrorCode error_code_; // error code - std::string error_arg_; // fragment of regexp showing error - std::string prefix_; // required prefix (before suffix_regexp_) - bool prefix_foldcase_; // prefix_ is ASCII case-insensitive - re2::Regexp* suffix_regexp_; // parsed regular expression, prefix_ removed - re2::Prog* prog_; // compiled program for regexp - int num_captures_; // number of capturing groups - bool is_one_pass_; // can use prog_->SearchOnePass? - - // Reverse Prog for DFA execution only - mutable re2::Prog* rprog_; - // Map from capture names to indices - mutable const std::map* named_groups_; - // Map from capture indices to names - mutable const std::map* group_names_; - - mutable std::once_flag rprog_once_; - mutable std::once_flag named_groups_once_; - mutable std::once_flag group_names_once_; - - RE2(const RE2&) = delete; - RE2& operator=(const RE2&) = delete; -}; - -/***** Implementation details *****/ - -namespace re2_internal { - -// Types for which the 3-ary Parse() function template has specializations. 
-template struct Parse3ary : public std::false_type {}; -template <> struct Parse3ary : public std::true_type {}; -template <> struct Parse3ary : public std::true_type {}; -template <> struct Parse3ary : public std::true_type {}; -template <> struct Parse3ary : public std::true_type {}; -template <> struct Parse3ary : public std::true_type {}; -template <> struct Parse3ary : public std::true_type {}; -template <> struct Parse3ary : public std::true_type {}; -template <> struct Parse3ary : public std::true_type {}; - -template -bool Parse(const char* str, size_t n, T* dest); - -// Types for which the 4-ary Parse() function template has specializations. -template struct Parse4ary : public std::false_type {}; -template <> struct Parse4ary : public std::true_type {}; -template <> struct Parse4ary : public std::true_type {}; -template <> struct Parse4ary : public std::true_type {}; -template <> struct Parse4ary : public std::true_type {}; -template <> struct Parse4ary : public std::true_type {}; -template <> struct Parse4ary : public std::true_type {}; -template <> struct Parse4ary : public std::true_type {}; -template <> struct Parse4ary : public std::true_type {}; - -template -bool Parse(const char* str, size_t n, T* dest, int radix); - -} // namespace re2_internal - -class RE2::Arg { - private: - template - using CanParse3ary = typename std::enable_if< - re2_internal::Parse3ary::value, - int>::type; + typedef bool (*Parser)(const char *str, size_t n, void *dest); - template - using CanParse4ary = typename std::enable_if< - re2_internal::Parse4ary::value, - int>::type; + template + Arg(T *ptr, Parser parser) : arg_(ptr), parser_(parser) {} -#if !defined(_MSC_VER) - template - using CanParseFrom = typename std::enable_if< - std::is_member_function_pointer< - decltype(static_cast( - &T::ParseFrom))>::value, - int>::type; -#endif + bool Parse(const char *str, size_t n) const + { + return (*parser_)(str, n, arg_); + } - public: - Arg() : Arg(nullptr) {} - 
Arg(std::nullptr_t ptr) : arg_(ptr), parser_(DoNothing) {} + private: + static bool DoNothing(const char * /*str*/, size_t /*n*/, void * /*dest*/) + { + return true; + } - template = 0> - Arg(T* ptr) : arg_(ptr), parser_(DoParse3ary) {} + template + static bool DoParse3ary(const char *str, size_t n, void *dest) + { + return re2_internal::Parse(str, n, reinterpret_cast(dest)); + } - template = 0> - Arg(T* ptr) : arg_(ptr), parser_(DoParse4ary) {} + template + static bool DoParse4ary(const char *str, size_t n, void *dest) + { + return re2_internal::Parse(str, n, reinterpret_cast(dest), 10); + } #if !defined(_MSC_VER) - template = 0> - Arg(T* ptr) : arg_(ptr), parser_(DoParseFrom) {} + template + static bool DoParseFrom(const char *str, size_t n, void *dest) + { + if (dest == NULL) + return true; + return reinterpret_cast(dest)->ParseFrom(str, n); + } #endif - typedef bool (*Parser)(const char* str, size_t n, void* dest); + void *arg_; + Parser parser_; + }; template - Arg(T* ptr, Parser parser) : arg_(ptr), parser_(parser) {} - - bool Parse(const char* str, size_t n) const { - return (*parser_)(str, n, arg_); - } - - private: - static bool DoNothing(const char* /*str*/, size_t /*n*/, void* /*dest*/) { - return true; + inline RE2::Arg RE2::CRadix(T *ptr) + { + return RE2::Arg(ptr, [](const char *str, size_t n, void *dest) -> bool + { return re2_internal::Parse(str, n, reinterpret_cast(dest), 0); }); } template - static bool DoParse3ary(const char* str, size_t n, void* dest) { - return re2_internal::Parse(str, n, reinterpret_cast(dest)); + inline RE2::Arg RE2::Hex(T *ptr) + { + return RE2::Arg(ptr, [](const char *str, size_t n, void *dest) -> bool + { return re2_internal::Parse(str, n, reinterpret_cast(dest), 16); }); } template - static bool DoParse4ary(const char* str, size_t n, void* dest) { - return re2_internal::Parse(str, n, reinterpret_cast(dest), 10); + inline RE2::Arg RE2::Octal(T *ptr) + { + return RE2::Arg(ptr, [](const char *str, size_t n, void *dest) -> 
bool + { return re2_internal::Parse(str, n, reinterpret_cast(dest), 8); }); } -#if !defined(_MSC_VER) - template - static bool DoParseFrom(const char* str, size_t n, void* dest) { - if (dest == NULL) return true; - return reinterpret_cast(dest)->ParseFrom(str, n); - } -#endif - - void* arg_; - Parser parser_; -}; - -template -inline RE2::Arg RE2::CRadix(T* ptr) { - return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { - return re2_internal::Parse(str, n, reinterpret_cast(dest), 0); - }); -} - -template -inline RE2::Arg RE2::Hex(T* ptr) { - return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { - return re2_internal::Parse(str, n, reinterpret_cast(dest), 16); - }); -} - -template -inline RE2::Arg RE2::Octal(T* ptr) { - return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { - return re2_internal::Parse(str, n, reinterpret_cast(dest), 8); - }); -} - #ifndef SWIG // Silence warnings about missing initializers for members of LazyRE2. #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #endif -// Helper for writing global or static RE2s safely. -// Write -// static LazyRE2 re = {".*"}; -// and then use *re instead of writing -// static RE2 re(".*"); -// The former is more careful about multithreaded -// situations than the latter. -// -// N.B. This class never deletes the RE2 object that -// it constructs: that's a feature, so that it can be used -// for global and function static variables. -class LazyRE2 { - private: - struct NoArg {}; - - public: - typedef RE2 element_type; // support std::pointer_traits - - // Constructor omitted to preserve braced initialization in C++98. 
- - // Pretend to be a pointer to Type (never NULL due to on-demand creation): - RE2& operator*() const { return *get(); } - RE2* operator->() const { return get(); } - - // Named accessor/initializer: - RE2* get() const { - std::call_once(once_, &LazyRE2::Init, this); - return ptr_; - } + // Helper for writing global or static RE2s safely. + // Write + // static LazyRE2 re = {".*"}; + // and then use *re instead of writing + // static RE2 re(".*"); + // The former is more careful about multithreaded + // situations than the latter. + // + // N.B. This class never deletes the RE2 object that + // it constructs: that's a feature, so that it can be used + // for global and function static variables. + class LazyRE2 + { + private: + struct NoArg + { + }; - // All data fields must be public to support {"foo"} initialization. - const char* pattern_; - RE2::CannedOptions options_; - NoArg barrier_against_excess_initializers_; + public: + typedef RE2 element_type; // support std::pointer_traits - mutable RE2* ptr_; - mutable std::once_flag once_; + // Constructor omitted to preserve braced initialization in C++98. - private: - static void Init(const LazyRE2* lazy_re2) { - lazy_re2->ptr_ = new RE2(lazy_re2->pattern_, lazy_re2->options_); - } + // Pretend to be a pointer to Type (never NULL due to on-demand creation): + RE2 &operator*() const { return *get(); } + RE2 *operator->() const { return get(); } + + // Named accessor/initializer: + RE2 *get() const + { + std::call_once(once_, &LazyRE2::Init, this); + return ptr_; + } - void operator=(const LazyRE2&); // disallowed -}; + // All data fields must be public to support {"foo"} initialization. 
+ const char *pattern_; + RE2::CannedOptions options_; + NoArg barrier_against_excess_initializers_; + + mutable RE2 *ptr_; + mutable std::once_flag once_; + + private: + static void Init(const LazyRE2 *lazy_re2) + { + lazy_re2->ptr_ = new RE2(lazy_re2->pattern_, lazy_re2->options_); + } + + void operator=(const LazyRE2 &); // disallowed + }; #endif -namespace hooks { + namespace hooks + { // Most platforms support thread_local. Older versions of iOS don't support // thread_local, but for the sake of brevity, we lump together all versions @@ -984,31 +1069,33 @@ namespace hooks { // could result in infinite mutual recursion. To discourage that possibility, // RE2 will not maintain the context pointer correctly when used in that way. #ifdef RE2_HAVE_THREAD_LOCAL -extern thread_local const RE2* context; + extern thread_local const RE2 *context; #endif -struct DFAStateCacheReset { - int64_t state_budget; - size_t state_cache_size; -}; + struct DFAStateCacheReset + { + int64_t state_budget; + size_t state_cache_size; + }; -struct DFASearchFailure { - // Nothing yet... -}; + struct DFASearchFailure + { + // Nothing yet... 
+ }; -#define DECLARE_HOOK(type) \ - using type##Callback = void(const type&); \ - void Set##type##Hook(type##Callback* cb); \ - type##Callback* Get##type##Hook(); +#define DECLARE_HOOK(type) \ + using type##Callback = void(const type &); \ + void Set##type##Hook(type##Callback *cb); \ + type##Callback *Get##type##Hook(); -DECLARE_HOOK(DFAStateCacheReset) -DECLARE_HOOK(DFASearchFailure) + DECLARE_HOOK(DFAStateCacheReset) + DECLARE_HOOK(DFASearchFailure) #undef DECLARE_HOOK -} // namespace hooks + } // namespace hooks -} // namespace re2 +} // namespace re2 -using re2::RE2; -using re2::LazyRE2; \ No newline at end of file +using re2::LazyRE2; +using re2::RE2; \ No newline at end of file diff --git a/re2/regex_internal.h b/re2/regex_internal.h index cab67fe7ced20bf7eba5e72791c1574662057073..482634f0bc3e925e843ab7f4110952e430907f20 100644 --- a/re2/regex_internal.h +++ b/re2/regex_internal.h @@ -15,58 +15,61 @@ // #include "re2/sparse_array.h" #include "regex-capi/include/regex_capi.h" -namespace re2 { -// #include "re2/prog.h" -// Compiled form of regexp program. - class Prog { - //rure 更名为 Prog -}; - -// #include "re2/regexp.h" -class Regexp { - public: +namespace re2 +{ + // #include "re2/prog.h" + // Compiled form of regexp program. + class Prog + { + // rure 更名为 Prog + }; - // Flags for parsing. Can be ORed together. - enum ParseFlags { - NoParseFlags = 0, - FoldCase = 1 << 0, // Fold case during matching (case-insensitive). - Literal = 1 << 1, // Treat s as literal string instead of a regexp. - ClassNL = 1 << 2, // Allow char classes like [^a-z] and \D and \s - // and [[:space:]] to match newline. - DotNL = 1 << 3, // Allow . to match newline. - MatchNL = ClassNL | DotNL, - OneLine = 1 << 4, // Treat ^ and $ as only matching at beginning and - // end of text, not around embedded newlines. - // (Perl's default) - Latin1 = 1 << 5, // Regexp and text are in Latin1, not UTF-8. - NonGreedy = 1 << 6, // Repetition operators are non-greedy by default. 
- PerlClasses = 1 << 7, // Allow Perl character classes like \d. - PerlB = 1 << 8, // Allow Perl's \b and \B. - PerlX = 1 << 9, // Perl extensions: - // non-capturing parens - (?: ) - // non-greedy operators - *? +? ?? {}? - // flag edits - (?i) (?-i) (?i: ) - // i - FoldCase - // m - !OneLine - // s - DotNL - // U - NonGreedy - // line ends: \A \z - // \Q and \E to disable/enable metacharacters - // (?Pexpr) for named captures - // \C to match any single byte - UnicodeGroups = 1 << 10, // Allow \p{Han} for Unicode Han group - // and \P{Han} for its negation. - NeverNL = 1 << 11, // Never match NL, even if the regexp mentions - // it explicitly. - NeverCapture = 1 << 12, // Parse all parens as non-capturing. + // #include "re2/regexp.h" + class Regexp + { + public: + // Flags for parsing. Can be ORed together. + enum ParseFlags + { + NoParseFlags = 0, + FoldCase = 1 << 0, // Fold case during matching (case-insensitive). + Literal = 1 << 1, // Treat s as literal string instead of a regexp. + ClassNL = 1 << 2, // Allow char classes like [^a-z] and \D and \s + // and [[:space:]] to match newline. + DotNL = 1 << 3, // Allow . to match newline. + MatchNL = ClassNL | DotNL, + OneLine = 1 << 4, // Treat ^ and $ as only matching at beginning and + // end of text, not around embedded newlines. + // (Perl's default) + Latin1 = 1 << 5, // Regexp and text are in Latin1, not UTF-8. + NonGreedy = 1 << 6, // Repetition operators are non-greedy by default. + PerlClasses = 1 << 7, // Allow Perl character classes like \d. + PerlB = 1 << 8, // Allow Perl's \b and \B. + PerlX = 1 << 9, // Perl extensions: + // non-capturing parens - (?: ) + // non-greedy operators - *? +? ?? {}? 
+ // flag edits - (?i) (?-i) (?i: ) + // i - FoldCase + // m - !OneLine + // s - DotNL + // U - NonGreedy + // line ends: \A \z + // \Q and \E to disable/enable metacharacters + // (?Pexpr) for named captures + // \C to match any single byte + UnicodeGroups = 1 << 10, // Allow \p{Han} for Unicode Han group + // and \P{Han} for its negation. + NeverNL = 1 << 11, // Never match NL, even if the regexp mentions + // it explicitly. + NeverCapture = 1 << 12, // Parse all parens as non-capturing. - // As close to Perl as we can get. - LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX | - UnicodeGroups, + // As close to Perl as we can get. + LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX | + UnicodeGroups, - // Internal use only. - WasDollar = 1 << 13, // on kRegexpEndText: was $ in regexp text - AllParseFlags = (1 << 14)-1, + // Internal use only. + WasDollar = 1 << 13, // on kRegexpEndText: was $ in regexp text + AllParseFlags = (1 << 14) - 1, + }; }; -}; }; \ No newline at end of file diff --git a/re2/set.cc b/re2/set.cc index 2af02a9fb8f8ca8e4ed05666af4d813c970cc147..28d3be827a8ecab615b95ae0fe22a63362169677 100644 --- a/re2/set.cc +++ b/re2/set.cc @@ -48,17 +48,17 @@ namespace re2 elem_.clear(); } - RE2::Set::Set(Set && other) + RE2::Set::Set(Set &&other) : options_(other.options_), anchor_(other.anchor_), compiled_(other.compiled_), prog_(std::move(other.prog_)) { - other.elem_.clear(); - other.elem_.shrink_to_fit(); - other.compiled_ = false; - other.size_ = 0; - other.prog_.reset(); + other.elem_.clear(); + other.elem_.shrink_to_fit(); + other.compiled_ = false; + other.size_ = 0; + other.prog_.reset(); } RE2::Set &RE2::Set::operator=(Set &&other) @@ -68,14 +68,16 @@ namespace re2 return *this; } - int RE2::Set::Add(const StringPiece &pattern, std::string *error) { int place_num = size_; std::string rure_pattern = pattern.as_string(); - if(anchor_ == RE2::ANCHOR_START){ // 处理RE2::ANCHOR_START的情况 + if (anchor_ == RE2::ANCHOR_START) + { // 
处理RE2::ANCHOR_START的情况 rure_pattern.insert(0, "^"); - } else if(anchor_ == RE2::ANCHOR_BOTH) { // 处理RE2::ANCHOR_BOTH的情况 + } + else if (anchor_ == RE2::ANCHOR_BOTH) + { // 处理RE2::ANCHOR_BOTH的情况 rure_pattern.insert(0, "^"); rure_pattern.append("$"); } @@ -84,7 +86,7 @@ namespace re2 if (re == NULL) { const char *msg = rure_error_message(err); - if(error != NULL) + if (error != NULL) { error->assign(msg); LOG(ERROR) << "Regexp Error '" << pattern.data() << "':" << msg << "'"; @@ -94,7 +96,7 @@ namespace re2 } else { - elem_.push_back(pair(rure_pattern, (re2::Regexp*)nullptr)); + elem_.push_back(pair(rure_pattern, (re2::Regexp *)nullptr)); size_++; // rure_free(re); return place_num; @@ -103,7 +105,8 @@ namespace re2 bool RE2::Set::Compile() { - if (compiled_) { + if (compiled_) + { LOG(ERROR) << "RE2::Set::Compile() called more than once"; return false; } @@ -111,19 +114,21 @@ namespace re2 const size_t PAT_COUNT = elem_.size(); const char *patterns[PAT_COUNT]; size_t patterns_lengths[PAT_COUNT]; - for (size_t i = 0; i < elem_.size(); i++) { + for (size_t i = 0; i < elem_.size(); i++) + { patterns[i] = elem_[i].first.c_str(); patterns_lengths[i] = elem_[i].first.length(); } - + rure_error *err = rure_error_new(); - rure_set *re = rure_compile_set((const uint8_t **) patterns, - patterns_lengths, PAT_COUNT, 0, NULL, err); - if(re == NULL){ + rure_set *re = rure_compile_set((const uint8_t **)patterns, + patterns_lengths, PAT_COUNT, 0, NULL, err); + if (re == NULL) + { compiled_ = false; rure_set_free(re); return false; - } + } prog_.reset((Prog *)re); compiled_ = true; return true; @@ -137,31 +142,34 @@ namespace re2 bool RE2::Set::Match(const StringPiece &text, std::vector *v, ErrorInfo *error_info) const { - if (!compiled_) { + if (!compiled_) + { LOG(ERROR) << "RE2::Set::Match() called before compiling"; if (error_info != NULL) error_info->kind = kNotCompiled; return false; } - + const char *pat_str = text.data(); size_t length = strlen(pat_str); - if(v == NULL) + if 
(v == NULL) { - bool result = rure_set_is_match((rure_set *)prog_.get(), - (const uint8_t *)pat_str, length, 0); + bool result = rure_set_is_match((rure_set *)prog_.get(), + (const uint8_t *)pat_str, length, 0); return result; } else - { + { v->clear(); bool matches[elem_.size()]; - bool result = rure_set_matches((rure_set *)prog_.get(), - (const uint8_t *)pat_str, length, 0, matches); - if(!result) return false; - for(size_t i = 0; i < elem_.size(); i++) + bool result = rure_set_matches((rure_set *)prog_.get(), + (const uint8_t *)pat_str, length, 0, matches); + if (!result) + return false; + for (size_t i = 0; i < elem_.size(); i++) { - if(matches[i]) v->push_back(i); + if (matches[i]) + v->push_back(i); } return true; } diff --git a/re2/set.h b/re2/set.h index 5d6ec912c2ade77c9151fda8f56da0943fa08b3c..c3ba571a4a8717d06ff7d5ae02743a00fd358cf2 100644 --- a/re2/set.h +++ b/re2/set.h @@ -11,72 +11,77 @@ #include "re2/re2.h" -namespace re2 { -class Prog; -class Regexp; -} // namespace re2 - -namespace re2 { - -// An RE2::Set represents a collection of regexps that can -// be searched for simultaneously. -class RE2::Set { - public: - enum ErrorKind { - kNoError = 0, - kNotCompiled, // The set is not compiled. - kOutOfMemory, // The DFA ran out of memory. - kInconsistent, // The result is inconsistent. This should never happen. +namespace re2 +{ + class Prog; + class Regexp; +} // namespace re2 + +namespace re2 +{ + + // An RE2::Set represents a collection of regexps that can + // be searched for simultaneously. + class RE2::Set + { + public: + enum ErrorKind + { + kNoError = 0, + kNotCompiled, // The set is not compiled. + kOutOfMemory, // The DFA ran out of memory. + kInconsistent, // The result is inconsistent. This should never happen. + }; + + struct ErrorInfo + { + ErrorKind kind; + }; + + Set(const RE2::Options &options, RE2::Anchor anchor); + ~Set(); + + // Not copyable. + Set(const Set &) = delete; + Set &operator=(const Set &) = delete; + // Movable. 
+ Set(Set &&other); + Set &operator=(Set &&other); + + // Adds pattern to the set using the options passed to the constructor. + // Returns the index that will identify the regexp in the output of Match(), + // or -1 if the regexp cannot be parsed. + // Indices are assigned in sequential order starting from 0. + // Errors do not increment the index; if error is not NULL, *error will hold + // the error message from the parser. + int Add(const StringPiece &pattern, std::string *error); + + // Compiles the set in preparation for matching. + // Returns false if the compiler runs out of memory. + // Add() must not be called again after Compile(). + // Compile() must be called before Match(). + bool Compile(); + + // Returns true if text matches at least one of the regexps in the set. + // Fills v (if not NULL) with the indices of the matching regexps. + // Callers must not expect v to be sorted. + bool Match(const StringPiece &text, std::vector *v) const; + + // As above, but populates error_info (if not NULL) when none of the regexps + // in the set matched. This can inform callers when DFA execution fails, for + // example, because they might wish to handle that case differently. + bool Match(const StringPiece &text, std::vector *v, + ErrorInfo *error_info) const; + + private: + typedef std::pair Elem; + + RE2::Options options_; + RE2::Anchor anchor_; + std::vector elem_; + bool compiled_; + int size_; + std::unique_ptr prog_; }; - struct ErrorInfo { - ErrorKind kind; - }; - - Set(const RE2::Options& options, RE2::Anchor anchor); - ~Set(); - - // Not copyable. - Set(const Set&) = delete; - Set& operator=(const Set&) = delete; - // Movable. - Set(Set&& other); - Set& operator=(Set&& other); - - // Adds pattern to the set using the options passed to the constructor. - // Returns the index that will identify the regexp in the output of Match(), - // or -1 if the regexp cannot be parsed. - // Indices are assigned in sequential order starting from 0. 
- // Errors do not increment the index; if error is not NULL, *error will hold - // the error message from the parser. - int Add(const StringPiece& pattern, std::string* error); - - // Compiles the set in preparation for matching. - // Returns false if the compiler runs out of memory. - // Add() must not be called again after Compile(). - // Compile() must be called before Match(). - bool Compile(); - - // Returns true if text matches at least one of the regexps in the set. - // Fills v (if not NULL) with the indices of the matching regexps. - // Callers must not expect v to be sorted. - bool Match(const StringPiece& text, std::vector* v) const; - - // As above, but populates error_info (if not NULL) when none of the regexps - // in the set matched. This can inform callers when DFA execution fails, for - // example, because they might wish to handle that case differently. - bool Match(const StringPiece& text, std::vector* v, - ErrorInfo* error_info) const; - - private: - typedef std::pair Elem; - - RE2::Options options_; - RE2::Anchor anchor_; - std::vector elem_; - bool compiled_; - int size_; - std::unique_ptr prog_; -}; - -} // namespace re2 \ No newline at end of file +} // namespace re2 \ No newline at end of file diff --git a/re2/stringpiece.cc b/re2/stringpiece.cc index ea822d27de1a13425f70bc12e09833382b9a5147..d640018e5b88376f82cb3df59d855672a5430d3f 100644 --- a/re2/stringpiece.cc +++ b/re2/stringpiece.cc @@ -8,58 +8,75 @@ #include "re2/testing/util/util.h" -namespace re2 { +namespace re2 +{ -const StringPiece::size_type StringPiece::npos; // initialized in stringpiece.h + const StringPiece::size_type StringPiece::npos; // initialized in stringpiece.h -StringPiece::size_type StringPiece::copy(char* buf, size_type n, - size_type pos) const { - size_type ret = std::min(size_ - pos, n); - memcpy(buf, data_ + pos, ret); - return ret; -} + StringPiece::size_type StringPiece::copy(char *buf, size_type n, + size_type pos) const + { + size_type ret = 
std::min(size_ - pos, n); + memcpy(buf, data_ + pos, ret); + return ret; + } -StringPiece StringPiece::substr(size_type pos, size_type n) const { - if (pos > size_) pos = size_; - if (n > size_ - pos) n = size_ - pos; - return StringPiece(data_ + pos, n); -} + StringPiece StringPiece::substr(size_type pos, size_type n) const + { + if (pos > size_) + pos = size_; + if (n > size_ - pos) + n = size_ - pos; + return StringPiece(data_ + pos, n); + } -StringPiece::size_type StringPiece::find(const StringPiece& s, - size_type pos) const { - if (pos > size_) return npos; - const_pointer result = std::search(data_ + pos, data_ + size_, - s.data_, s.data_ + s.size_); - size_type xpos = result - data_; - return xpos + s.size_ <= size_ ? xpos : npos; -} + StringPiece::size_type StringPiece::find(const StringPiece &s, + size_type pos) const + { + if (pos > size_) + return npos; + const_pointer result = std::search(data_ + pos, data_ + size_, + s.data_, s.data_ + s.size_); + size_type xpos = result - data_; + return xpos + s.size_ <= size_ ? xpos : npos; + } -StringPiece::size_type StringPiece::find(char c, size_type pos) const { - if (size_ <= 0 || pos >= size_) return npos; - const_pointer result = std::find(data_ + pos, data_ + size_, c); - return result != data_ + size_ ? result - data_ : npos; -} + StringPiece::size_type StringPiece::find(char c, size_type pos) const + { + if (size_ <= 0 || pos >= size_) + return npos; + const_pointer result = std::find(data_ + pos, data_ + size_, c); + return result != data_ + size_ ? result - data_ : npos; + } -StringPiece::size_type StringPiece::rfind(const StringPiece& s, - size_type pos) const { - if (size_ < s.size_) return npos; - if (s.size_ == 0) return std::min(size_, pos); - const_pointer last = data_ + std::min(size_ - s.size_, pos) + s.size_; - const_pointer result = std::find_end(data_, last, s.data_, s.data_ + s.size_); - return result != last ? 
result - data_ : npos; -} + StringPiece::size_type StringPiece::rfind(const StringPiece &s, + size_type pos) const + { + if (size_ < s.size_) + return npos; + if (s.size_ == 0) + return std::min(size_, pos); + const_pointer last = data_ + std::min(size_ - s.size_, pos) + s.size_; + const_pointer result = std::find_end(data_, last, s.data_, s.data_ + s.size_); + return result != last ? result - data_ : npos; + } -StringPiece::size_type StringPiece::rfind(char c, size_type pos) const { - if (size_ <= 0) return npos; - for (size_t i = std::min(pos + 1, size_); i != 0;) { - if (data_[--i] == c) return i; + StringPiece::size_type StringPiece::rfind(char c, size_type pos) const + { + if (size_ <= 0) + return npos; + for (size_t i = std::min(pos + 1, size_); i != 0;) + { + if (data_[--i] == c) + return i; + } + return npos; } - return npos; -} -std::ostream& operator<<(std::ostream& o, const StringPiece& p) { - o.write(p.data(), p.size()); - return o; -} + std::ostream &operator<<(std::ostream &o, const StringPiece &p) + { + o.write(p.data(), p.size()); + return o; + } -} // namespace re2 +} // namespace re2 diff --git a/re2/stringpiece.h b/re2/stringpiece.h index f568876bab2b4c91851cd43e783073c082d25424..642651ddb8375047338a26ce0894b932e268120b 100644 --- a/re2/stringpiece.h +++ b/re2/stringpiece.h @@ -33,175 +33,209 @@ #include #endif -namespace re2 { - -class StringPiece { - public: - typedef std::char_traits traits_type; - typedef char value_type; - typedef char* pointer; - typedef const char* const_pointer; - typedef char& reference; - typedef const char& const_reference; - typedef const char* const_iterator; - typedef const_iterator iterator; - typedef std::reverse_iterator const_reverse_iterator; - typedef const_reverse_iterator reverse_iterator; - typedef size_t size_type; - typedef ptrdiff_t difference_type; - static const size_type npos = static_cast(-1); - - // We provide non-explicit singleton constructors so users can pass - // in a "const char*" or a 
"string" wherever a "StringPiece" is - // expected. - StringPiece() - : data_(NULL), size_(0) {} +namespace re2 +{ + + class StringPiece + { + public: + typedef std::char_traits traits_type; + typedef char value_type; + typedef char *pointer; + typedef const char *const_pointer; + typedef char &reference; + typedef const char &const_reference; + typedef const char *const_iterator; + typedef const_iterator iterator; + typedef std::reverse_iterator const_reverse_iterator; + typedef const_reverse_iterator reverse_iterator; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + static const size_type npos = static_cast(-1); + + // We provide non-explicit singleton constructors so users can pass + // in a "const char*" or a "string" wherever a "StringPiece" is + // expected. + StringPiece() + : data_(NULL), size_(0) {} #if __has_include() && __cplusplus >= 201703L - StringPiece(const std::string_view& str) - : data_(str.data()), size_(str.size()) {} + StringPiece(const std::string_view &str) + : data_(str.data()), size_(str.size()) + { + } #endif - StringPiece(const std::string& str) - : data_(str.data()), size_(str.size()) {} - StringPiece(const char* str) - : data_(str), size_(str == NULL ? 0 : strlen(str)) {} - StringPiece(const char* str, size_type len) - : data_(str), size_(len) {} - - const_iterator begin() const { return data_; } - const_iterator end() const { return data_ + size_; } - const_reverse_iterator rbegin() const { - return const_reverse_iterator(data_ + size_); - } - const_reverse_iterator rend() const { - return const_reverse_iterator(data_); - } + StringPiece(const std::string &str) + : data_(str.data()), size_(str.size()) + { + } + StringPiece(const char *str) + : data_(str), size_(str == NULL ? 
0 : strlen(str)) {} + StringPiece(const char *str, size_type len) + : data_(str), size_(len) {} + + const_iterator begin() const { return data_; } + const_iterator end() const { return data_ + size_; } + const_reverse_iterator rbegin() const + { + return const_reverse_iterator(data_ + size_); + } + const_reverse_iterator rend() const + { + return const_reverse_iterator(data_); + } - size_type size() const { return size_; } - size_type length() const { return size_; } - bool empty() const { return size_ == 0; } + size_type size() const { return size_; } + size_type length() const { return size_; } + bool empty() const { return size_ == 0; } - const_reference operator[](size_type i) const { return data_[i]; } - const_pointer data() const { return data_; } + const_reference operator[](size_type i) const { return data_[i]; } + const_pointer data() const { return data_; } - void remove_prefix(size_type n) { - data_ += n; - size_ -= n; - } + void remove_prefix(size_type n) + { + data_ += n; + size_ -= n; + } - void remove_suffix(size_type n) { - size_ -= n; - } + void remove_suffix(size_type n) + { + size_ -= n; + } - void set(const char* str) { - data_ = str; - size_ = str == NULL ? 0 : strlen(str); - } + void set(const char *str) + { + data_ = str; + size_ = str == NULL ? 0 : strlen(str); + } - void set(const char* str, size_type len) { - data_ = str; - size_ = len; - } + void set(const char *str, size_type len) + { + data_ = str; + size_ = len; + } - // Converts to `std::basic_string`. - template - explicit operator std::basic_string() const { - if (!data_) return {}; - return std::basic_string(data_, size_); - } + // Converts to `std::basic_string`. 
+ template + explicit operator std::basic_string() const + { + if (!data_) + return {}; + return std::basic_string(data_, size_); + } - std::string as_string() const { - return std::string(data_, size_); - } + std::string as_string() const + { + return std::string(data_, size_); + } - // We also define ToString() here, since many other string-like - // interfaces name the routine that converts to a C++ string - // "ToString", and it's confusing to have the method that does that - // for a StringPiece be called "as_string()". We also leave the - // "as_string()" method defined here for existing code. - std::string ToString() const { - return std::string(data_, size_); - } + // We also define ToString() here, since many other string-like + // interfaces name the routine that converts to a C++ string + // "ToString", and it's confusing to have the method that does that + // for a StringPiece be called "as_string()". We also leave the + // "as_string()" method defined here for existing code. 
+ std::string ToString() const + { + return std::string(data_, size_); + } - void CopyToString(std::string* target) const { - target->assign(data_, size_); - } + void CopyToString(std::string *target) const + { + target->assign(data_, size_); + } - void AppendToString(std::string* target) const { - target->append(data_, size_); - } + void AppendToString(std::string *target) const + { + target->append(data_, size_); + } + + size_type copy(char *buf, size_type n, size_type pos = 0) const; + StringPiece substr(size_type pos = 0, size_type n = npos) const; + + int compare(const StringPiece &x) const + { + size_type min_size = std::min(size(), x.size()); + if (min_size > 0) + { + int r = memcmp(data(), x.data(), min_size); + if (r < 0) + return -1; + if (r > 0) + return 1; + } + if (size() < x.size()) + return -1; + if (size() > x.size()) + return 1; + return 0; + } - size_type copy(char* buf, size_type n, size_type pos = 0) const; - StringPiece substr(size_type pos = 0, size_type n = npos) const; + // Does "this" start with "x"? + bool starts_with(const StringPiece &x) const + { + return x.empty() || + (size() >= x.size() && memcmp(data(), x.data(), x.size()) == 0); + } - int compare(const StringPiece& x) const { - size_type min_size = std::min(size(), x.size()); - if (min_size > 0) { - int r = memcmp(data(), x.data(), min_size); - if (r < 0) return -1; - if (r > 0) return 1; + // Does "this" end with "x"? 
+ bool ends_with(const StringPiece &x) const + { + return x.empty() || + (size() >= x.size() && + memcmp(data() + (size() - x.size()), x.data(), x.size()) == 0); } - if (size() < x.size()) return -1; - if (size() > x.size()) return 1; - return 0; + + bool contains(const StringPiece &s) const + { + return find(s) != npos; + } + + size_type find(const StringPiece &s, size_type pos = 0) const; + size_type find(char c, size_type pos = 0) const; + size_type rfind(const StringPiece &s, size_type pos = npos) const; + size_type rfind(char c, size_type pos = npos) const; + + private: + const_pointer data_; + size_type size_; + }; + + inline bool operator==(const StringPiece &x, const StringPiece &y) + { + StringPiece::size_type len = x.size(); + if (len != y.size()) + return false; + return x.data() == y.data() || len == 0 || + memcmp(x.data(), y.data(), len) == 0; + } + + inline bool operator!=(const StringPiece &x, const StringPiece &y) + { + return !(x == y); + } + + inline bool operator<(const StringPiece &x, const StringPiece &y) + { + StringPiece::size_type min_size = std::min(x.size(), y.size()); + int r = min_size == 0 ? 0 : memcmp(x.data(), y.data(), min_size); + return (r < 0) || (r == 0 && x.size() < y.size()); } - // Does "this" start with "x"? - bool starts_with(const StringPiece& x) const { - return x.empty() || - (size() >= x.size() && memcmp(data(), x.data(), x.size()) == 0); + inline bool operator>(const StringPiece &x, const StringPiece &y) + { + return y < x; } - // Does "this" end with "x"? 
- bool ends_with(const StringPiece& x) const { - return x.empty() || - (size() >= x.size() && - memcmp(data() + (size() - x.size()), x.data(), x.size()) == 0); + inline bool operator<=(const StringPiece &x, const StringPiece &y) + { + return !(x > y); } - bool contains(const StringPiece& s) const { - return find(s) != npos; + inline bool operator>=(const StringPiece &x, const StringPiece &y) + { + return !(x < y); } - size_type find(const StringPiece& s, size_type pos = 0) const; - size_type find(char c, size_type pos = 0) const; - size_type rfind(const StringPiece& s, size_type pos = npos) const; - size_type rfind(char c, size_type pos = npos) const; - - private: - const_pointer data_; - size_type size_; -}; - -inline bool operator==(const StringPiece& x, const StringPiece& y) { - StringPiece::size_type len = x.size(); - if (len != y.size()) return false; - return x.data() == y.data() || len == 0 || - memcmp(x.data(), y.data(), len) == 0; -} - -inline bool operator!=(const StringPiece& x, const StringPiece& y) { - return !(x == y); -} - -inline bool operator<(const StringPiece& x, const StringPiece& y) { - StringPiece::size_type min_size = std::min(x.size(), y.size()); - int r = min_size == 0 ? 0 : memcmp(x.data(), y.data(), min_size); - return (r < 0) || (r == 0 && x.size() < y.size()); -} - -inline bool operator>(const StringPiece& x, const StringPiece& y) { - return y < x; -} - -inline bool operator<=(const StringPiece& x, const StringPiece& y) { - return !(x > y); -} - -inline bool operator>=(const StringPiece& x, const StringPiece& y) { - return !(x < y); -} - -// Allow StringPiece to be logged. -std::ostream& operator<<(std::ostream& o, const StringPiece& p); - -} // namespace re2 \ No newline at end of file + // Allow StringPiece to be logged. 
+ std::ostream &operator<<(std::ostream &o, const StringPiece &p); + +} // namespace re2 \ No newline at end of file diff --git a/regex-capi/ctest/test.c b/regex-capi/ctest/test.c index 4aca84abf1238331ca58102d707568fa794349fe..295e9ca3d69af14cf536a1b01d7d6a92b59177eb 100644 --- a/regex-capi/ctest/test.c +++ b/regex-capi/ctest/test.c @@ -7,18 +7,21 @@ #include "regex_capi.h" #ifndef DEBUG - #define DEBUG false +#define DEBUG false #endif -bool test_is_match() { +bool test_is_match() +{ bool passed = true; const char *haystack = "snowman: \xE2\x98\x83"; rure *re = rure_compile_must("\\p{So}$"); bool matched = rure_is_match(re, (const uint8_t *)haystack, strlen(haystack), 0); - if (!matched) { - if (DEBUG) { + if (!matched) + { + if (DEBUG) + { fprintf(stderr, "[test_is_match] expected match, but got no match\n"); } @@ -28,8 +31,8 @@ bool test_is_match() { return passed; } - -bool test_find() { +bool test_find() +{ bool passed = true; const char *haystack = "snowman: \xE2\x98\x83"; @@ -37,16 +40,20 @@ bool test_find() { rure_match match = {0}; bool matched = rure_find(re, (const uint8_t *)haystack, strlen(haystack), 0, &match); - if (!matched) { - if (DEBUG) { + if (!matched) + { + if (DEBUG) + { fprintf(stderr, "[test_find] expected match, but got no match\n"); } passed = false; } size_t expect_start = 9; size_t expect_end = 12; - if (match.start != expect_start || match.end != expect_end) { - if (DEBUG) { + if (match.start != expect_start || match.end != expect_end) + { + if (DEBUG) + { fprintf(stderr, "[test_find] expected match at (%zu, %zu), but " "got match at (%zu, %zu)\n", @@ -58,7 +65,8 @@ bool test_find() { return passed; } -bool test_captures() { +bool test_captures() +{ bool passed = true; const char *haystack = "snowman: \xE2\x98\x83"; @@ -67,8 +75,10 @@ bool test_captures() { rure_captures *caps = rure_captures_new(re); bool matched = rure_find_captures(re, (const uint8_t *)haystack, strlen(haystack), 0, caps); - if (!matched) { - if (DEBUG) { + if 
(!matched) + { + if (DEBUG) + { fprintf(stderr, "[test_captures] expected match, but got no match\n"); } @@ -76,8 +86,10 @@ bool test_captures() { } size_t expect_captures_len = 3; size_t captures_len = rure_captures_len(caps); - if (captures_len != expect_captures_len) { - if (DEBUG) { + if (captures_len != expect_captures_len) + { + if (DEBUG) + { fprintf(stderr, "[test_captures] " "expected capture group length to be %zd, but " @@ -90,8 +102,10 @@ bool test_captures() { size_t expect_start = 9; size_t expect_end = 12; rure_captures_at(caps, 2, &match); - if (match.start != expect_start || match.end != expect_end) { - if (DEBUG) { + if (match.start != expect_start || match.end != expect_end) + { + if (DEBUG) + { fprintf(stderr, "[test_captures] " "expected capture 2 match at (%zu, %zu), " @@ -106,10 +120,13 @@ done: return passed; } -bool test_iter_capture_name(char *expect, char *given) { +bool test_iter_capture_name(char *expect, char *given) +{ bool passed = true; - if (strcmp(expect, given)) { - if (DEBUG) { + if (strcmp(expect, given)) + { + if (DEBUG) + { fprintf(stderr, "[test_iter_capture_name] expected first capture " "name '%s' got '%s'\n", @@ -120,7 +137,8 @@ bool test_iter_capture_name(char *expect, char *given) { return passed; } -bool test_iter_capture_names() { +bool test_iter_capture_names() +{ bool passed = true; char *name; @@ -129,8 +147,10 @@ bool test_iter_capture_names() { rure_iter_capture_names *it = rure_iter_capture_names_new(re); bool result = rure_iter_capture_names_next(it, &name); - if (!result) { - if (DEBUG) { + if (!result) + { + if (DEBUG) + { fprintf(stderr, "[test_iter_capture_names] expected a second name, " "but got none\n"); @@ -141,19 +161,22 @@ bool test_iter_capture_names() { result = rure_iter_capture_names_next(it, &name); passed = test_iter_capture_name("year", name); - if (!passed) { + if (!passed) + { goto done; } result = rure_iter_capture_names_next(it, &name); passed = test_iter_capture_name("month", name); - if 
(!passed) { + if (!passed) + { goto done; } result = rure_iter_capture_names_next(it, &name); passed = test_iter_capture_name("day", name); - if (!passed) { + if (!passed) + { goto done; } done: @@ -168,7 +191,8 @@ done: * mode, we can match arbitrary possibly invalid UTF-8 bytes, such as \xFF. * (When Unicode mode is enabled, \xFF won't match .) */ -bool test_flags() { +bool test_flags() +{ bool passed = true; const char *pattern = "."; const char *haystack = "\xFF"; @@ -177,8 +201,10 @@ bool test_flags() { 0, NULL, NULL); bool matched = rure_is_match(re, (const uint8_t *)haystack, strlen(haystack), 0); - if (!matched) { - if (DEBUG) { + if (!matched) + { + if (DEBUG) + { fprintf(stderr, "[test_flags] expected match, but got no match\n"); } passed = false; @@ -187,12 +213,15 @@ bool test_flags() { return passed; } -bool test_compile_error() { +bool test_compile_error() +{ bool passed = true; rure_error *err = rure_error_new(); rure *re = rure_compile((const uint8_t *)"(", 1, 0, NULL, err); - if (re != NULL) { - if (DEBUG) { + if (re != NULL) + { + if (DEBUG) + { fprintf(stderr, "[test_compile_error] " "expected NULL regex pointer, but got non-NULL pointer\n"); @@ -201,12 +230,15 @@ bool test_compile_error() { rure_free(re); } const char *msg = rure_error_message(err); - if (NULL == strstr(msg, "unclosed group")) { - if (DEBUG) { + if (NULL == strstr(msg, "unclosed group")) + { + if (DEBUG) + { fprintf(stderr, "[test_compile_error] " "expected an 'unclosed parenthesis' error message, but " - "got this instead: '%s'\n", msg); + "got this instead: '%s'\n", + msg); } passed = false; } @@ -214,59 +246,63 @@ bool test_compile_error() { return passed; } - -bool test_regex_set_matches() { +bool test_regex_set_matches() +{ #define PAT_COUNT 6 bool passed = true; const char *patterns[] = { - "foo", "barfoo", "\\w+", "\\d+", "foobar", "bar" - }; + "foo", "barfoo", "\\w+", "\\d+", "foobar", "bar"}; const size_t patterns_lengths[] = { - 3, 6, 3, 3, 6, 3 - }; + 3, 6, 3, 3, 6, 
3}; rure_error *err = rure_error_new(); - rure_set *re = rure_compile_set((const uint8_t **) patterns, + rure_set *re = rure_compile_set((const uint8_t **)patterns, patterns_lengths, PAT_COUNT, 0, NULL, err); - if (re == NULL) { + if (re == NULL) + { passed = false; goto done2; } - if (rure_set_len(re) != PAT_COUNT) { + if (rure_set_len(re) != PAT_COUNT) + { passed = false; goto done1; } - if (!rure_set_is_match(re, (const uint8_t *) "foobar", 6, 0)) { + if (!rure_set_is_match(re, (const uint8_t *)"foobar", 6, 0)) + { passed = false; goto done1; } - if (rure_set_is_match(re, (const uint8_t *) "", 0, 0)) { + if (rure_set_is_match(re, (const uint8_t *)"", 0, 0)) + { passed = false; goto done1; } bool matches[PAT_COUNT]; - if (!rure_set_matches(re, (const uint8_t *) "foobar", 6, 0, matches)) { + if (!rure_set_matches(re, (const uint8_t *)"foobar", 6, 0, matches)) + { passed = false; goto done1; } const bool match_target[] = { - true, false, true, false, true, true - }; + true, false, true, false, true, true}; int i; - for (i = 0; i < PAT_COUNT; ++i) { - if (matches[i] != match_target[i]) { + for (i = 0; i < PAT_COUNT; ++i) + { + if (matches[i] != match_target[i]) + { passed = false; goto done1; } @@ -281,54 +317,58 @@ done2: #undef PAT_COUNT } -bool test_regex_set_match_start() { +bool test_regex_set_match_start() +{ #define PAT_COUNT 3 bool passed = true; const char *patterns[] = { - "foo", "bar", "fooo" - }; + "foo", "bar", "fooo"}; const size_t patterns_lengths[] = { - 3, 3, 4 - }; + 3, 3, 4}; rure_error *err = rure_error_new(); - rure_set *re = rure_compile_set((const uint8_t **) patterns, + rure_set *re = rure_compile_set((const uint8_t **)patterns, patterns_lengths, PAT_COUNT, 0, NULL, err); - if (re == NULL) { + if (re == NULL) + { passed = false; goto done2; } - if (rure_set_len(re) != PAT_COUNT) { + if (rure_set_len(re) != PAT_COUNT) + { passed = false; goto done1; } - if (rure_set_is_match(re, (const uint8_t *)"foobiasdr", 7, 2)) { + if 
(rure_set_is_match(re, (const uint8_t *)"foobiasdr", 7, 2)) + { passed = false; goto done1; } { bool matches[PAT_COUNT]; - if (!rure_set_matches(re, (const uint8_t *)"fooobar", 8, 0, matches)) { + if (!rure_set_matches(re, (const uint8_t *)"fooobar", 8, 0, matches)) + { passed = false; goto done1; } const bool match_target[] = { - true, true, true - }; + true, true, true}; int i; - for (i = 0; i < PAT_COUNT; ++i) { - if (matches[i] != match_target[i]) { + for (i = 0; i < PAT_COUNT; ++i) + { + if (matches[i] != match_target[i]) + { passed = false; goto done1; } @@ -337,18 +377,20 @@ bool test_regex_set_match_start() { { bool matches[PAT_COUNT]; - if (!rure_set_matches(re, (const uint8_t *)"fooobar", 7, 1, matches)) { + if (!rure_set_matches(re, (const uint8_t *)"fooobar", 7, 1, matches)) + { passed = false; goto done1; } const bool match_target[] = { - false, true, false - }; + false, true, false}; int i; - for (i = 0; i < PAT_COUNT; ++i) { - if (matches[i] != match_target[i]) { + for (i = 0; i < PAT_COUNT; ++i) + { + if (matches[i] != match_target[i]) + { passed = false; goto done1; } @@ -364,146 +406,159 @@ done2: #undef PAT_COUNT } - -bool test_escape() { +bool test_escape() +{ bool passed = true; const char *pattern = "^[a-z]+.*$"; const char *expected_escaped = "\\^\\[a\\-z\\]\\+\\.\\*\\$"; const char *escaped = rure_escape_must(pattern); - if (!escaped) { - if (DEBUG) { + if (!escaped) + { + if (DEBUG) + { fprintf(stderr, "[test_captures] expected escaped, but got no escaped\n"); } passed = false; - } else if (strcmp(escaped, expected_escaped) != 0) { - if (DEBUG) { + } + else if (strcmp(escaped, expected_escaped) != 0) + { + if (DEBUG) + { fprintf(stderr, "[test_captures] expected \"%s\", but got \"%s\"\n", expected_escaped, escaped); } passed = false; } - rure_cstring_free((char *) escaped); + rure_cstring_free((char *)escaped); return passed; } -bool test_replace_and_replace_all(){ +bool test_replace_and_replace_all() +{ bool passed = true; - typedef struct 
ReplaceTest { + typedef struct ReplaceTest + { const char *regexp; const char *rewrite; const char *original; const char *single; const char *global; - int greplace_count; - }ReplaceTest; + int greplace_count; + } ReplaceTest; static const ReplaceTest tests[] = { - { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)", - "${2}${1}ay", - "the quick brown fox jumps over the lazy dogs.", - "ethay quick brown fox jumps over the lazy dogs.", - "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.", - 9 }, - { "\\w+", - "${0}-NOSPAM", - "abcd.efghi@google.com", - "abcd-NOSPAM.efghi@google.com", - "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM", - 4 }, - { "^", - "(START)", - "foo", - "(START)foo", - "(START)foo", - 1 }, - { "^", - "(START)", - "", - "(START)", - "(START)", - 1 }, - { "$", - "(END)", - "", - "(END)", - "(END)", - 1 }, - { "b", - "bb", - "ababababab", - "abbabababab", - "abbabbabbabbabb", - 5 }, - { "b", - "bb", - "bbbbbb", - "bbbbbbb", - "bbbbbbbbbbbb", - 6 }, - { "b+", - "bb", - "bbbbbb", - "bb", - "bb", - 1 }, - { "b*", - "bb", - "bbbbbb", - "bb", - "bb", - 1 }, - { "b*", - "bb", - "aaaaa", - "bbaaaaa", - "bbabbabbabbabbabb", - 6 }, - - { "a.*a", - "(${0})", - "aba\naba", - "(aba)\naba", - "(aba)\n(aba)", - 2 }, - { "", NULL, NULL, NULL, NULL, 0 } - }; + {"(qu|[b-df-hj-np-tv-z]*)([a-z]+)", + "${2}${1}ay", + "the quick brown fox jumps over the lazy dogs.", + "ethay quick brown fox jumps over the lazy dogs.", + "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.", + 9}, + {"\\w+", + "${0}-NOSPAM", + "abcd.efghi@google.com", + "abcd-NOSPAM.efghi@google.com", + "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM", + 4}, + {"^", + "(START)", + "foo", + "(START)foo", + "(START)foo", + 1}, + {"^", + "(START)", + "", + "(START)", + "(START)", + 1}, + {"$", + "(END)", + "", + "(END)", + "(END)", + 1}, + {"b", + "bb", + "ababababab", + "abbabababab", + "abbabbabbabbabb", + 5}, + {"b", + "bb", + "bbbbbb", + "bbbbbbb", + "bbbbbbbbbbbb", + 6}, + {"b+", + 
"bb", + "bbbbbb", + "bb", + "bb", + 1}, + {"b*", + "bb", + "bbbbbb", + "bb", + "bb", + 1}, + {"b*", + "bb", + "aaaaa", + "bbaaaaa", + "bbabbabbabbabbabb", + 6}, + + {"a.*a", + "(${0})", + "aba\naba", + "(aba)\naba", + "(aba)\n(aba)", + 2}, + {"", NULL, NULL, NULL, NULL, 0}}; const char *haystack; const char *rewrite; - const char* regex; + const char *regex; - for (const ReplaceTest* t = tests; t->original != NULL; t++) { + for (const ReplaceTest *t = tests; t->original != NULL; t++) + { haystack = t->original; regex = t->regexp; rewrite = t->rewrite; rure *re = rure_compile_must(regex); const char *replaced_haystack = rure_replace(re, (const uint8_t *)haystack, strlen(haystack), - (const uint8_t *)rewrite, strlen(rewrite)); + (const uint8_t *)rewrite, strlen(rewrite)); const char *replaced_all_haystack = rure_replace_all(re, (const uint8_t *)haystack, strlen(haystack), - (const uint8_t *)rewrite, strlen(rewrite)); + (const uint8_t *)rewrite, strlen(rewrite)); int result1 = strcmp(t->single, replaced_haystack); int result2 = strcmp(t->global, replaced_all_haystack); - if(result1 != 0 && result2 !=0) passed = false; + if (result1 != 0 && result2 != 0) + passed = false; } passed = true; return passed; } -void run_test(bool (test)(), const char *name, bool *passed) { - if (!test()) { +void run_test(bool(test)(), const char *name, bool *passed) +{ + if (!test()) + { *passed = false; fprintf(stderr, "FAILED: %s\n", name); - } else { + } + else + { fprintf(stderr, "PASSED: %s\n", name); } } -int main() { +int main() +{ bool passed = true; run_test(test_is_match, "test_is_match", &passed); @@ -518,7 +573,8 @@ int main() { run_test(test_escape, "test_escape", &passed); run_test(test_replace_and_replace_all, "test_replace_and_replace_all", &passed); - if (!passed) { + if (!passed) + { exit(1); } return 0; diff --git a/regex-capi/include/regex_capi.h b/regex-capi/include/regex_capi.h index 07fc630d3f120ef857281952605fcac8fbd767e2..1ac83f8d9ec1d0111b8f717b619c0eb1110fdae7 
100644 --- a/regex-capi/include/regex_capi.h +++ b/regex-capi/include/regex_capi.h @@ -20,33 +20,34 @@ #include #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -/* - * rure is the type of a compiled regular expression. - * - * An rure can be safely used from multiple threads simultaneously. - */ -typedef struct rure rure; - -/* - * rure_set is the type of a set of compiled regular expressions. - * - * A rure can be safely used from multiple threads simultaneously. - */ -typedef struct rure_set rure_set; - -/* - * rure_options is the set of non-flag configuration options for compiling - * a regular expression. Currently, only two options are available: setting - * the size limit of the compiled program and setting the size limit of the - * cache of states that the DFA uses while searching. - * - * For most uses, the default settings will work fine, and NULL can be passed - * wherever a *rure_options is expected. -*/ -typedef struct rure_options rure_options; + /* + * rure is the type of a compiled regular expression. + * + * An rure can be safely used from multiple threads simultaneously. + */ + typedef struct rure rure; + + /* + * rure_set is the type of a set of compiled regular expressions. + * + * A rure can be safely used from multiple threads simultaneously. + */ + typedef struct rure_set rure_set; + + /* + * rure_options is the set of non-flag configuration options for compiling + * a regular expression. Currently, only two options are available: setting + * the size limit of the compiled program and setting the size limit of the + * cache of states that the DFA uses while searching. + * + * For most uses, the default settings will work fine, and NULL can be passed + * wherever a *rure_options is expected. + */ + typedef struct rure_options rure_options; /* * The flags listed below can be used in rure_compile to set the default @@ -69,465 +70,457 @@ typedef struct rure_options rure_options; /* The default set of flags enabled when no flags are set. 
*/ #define RURE_DEFAULT_FLAGS RURE_FLAG_UNICODE -/* - * rure_match corresponds to the location of a single match in a haystack. - */ -typedef struct rure_match { - /* The start position. */ - size_t start; - /* The end position. */ - size_t end; -} rure_match; - -/* - * rure_captures represents storage for sub-capture locations of a match. - * - * Computing the capture groups of a match can carry a significant performance - * penalty, so their use in the API is optional. - * - * An rure_captures value can be reused in multiple calls to rure_find_captures, - * so long as it is used with the compiled regular expression that created - * it. - * - * An rure_captures value may outlive its corresponding rure and can be freed - * independently. - * - * It is not safe to use from multiple threads simultaneously. - */ -typedef struct rure_captures rure_captures; - -/* - * rure_iter is an iterator over successive non-overlapping matches in a - * particular haystack. - * - * An rure_iter value may not outlive its corresponding rure and should be freed - * before its corresponding rure is freed. - * - * It is not safe to use from multiple threads simultaneously. - */ -typedef struct rure_iter rure_iter; - -/* - * rure_iter_capture_names is an iterator over the list of capture group names - * in this particular rure. - * - * An rure_iter_capture_names value may not outlive its corresponding rure, - * and should be freed before its corresponding rure is freed. - * - * It is not safe to use from multiple threads simultaneously. - */ -typedef struct rure_iter_capture_names rure_iter_capture_names; - -/* - * rure_error is an error that caused compilation to fail. - * - * Most errors are syntax errors but an error can be returned if the compiled - * regular expression would be too big. - * - * Whenever a function accepts an *rure_error, it is safe to pass NULL. (But - * you will not get access to the error if one occurred.) 
- * - * It is not safe to use from multiple threads simultaneously. - */ -typedef struct rure_error rure_error; - -typedef struct -{ - char *atom; -} Atoms; - - -typedef struct -{ - Atoms *data; - int32_t len; -} MyVec; - -/* - * rure_compile_must compiles the given pattern into a regular expression. If - * compilation fails for any reason, an error message is printed to stderr and - * the process is aborted. - * - * The pattern given should be in UTF-8. For convenience, this accepts a C - * string, which means the pattern cannot usefully contain NUL. If your pattern - * may contain NUL, consider using a regular expression escape sequence, or - * just use rure_compile. - * - * This uses RURE_DEFAULT_FLAGS. - * - * The compiled expression returned may be used from multiple threads - * simultaneously. - */ -rure *rure_compile_must(const char *pattern); - -/* - * rure_compile compiles the given pattern into a regular expression. The - * pattern must be valid UTF-8 and the length corresponds to the number of - * bytes in the pattern. - * - * flags is a bitfield. Valid values are constants declared with prefix - * RURE_FLAG_. - * - * options contains non-flag configuration settings. If it's NULL, default - * settings are used. options may be freed immediately after a call to - * rure_compile. - * - * error is set if there was a problem compiling the pattern (including if the - * pattern is not valid UTF-8). If error is NULL, then no error information - * is returned. In all cases, if an error occurs, NULL is returned. - * - * The compiled expression returned may be used from multiple threads - * simultaneously. - */ -rure *rure_compile(const uint8_t *pattern, size_t length, - uint32_t flags, rure_options *options, - rure_error *error); - -/* - * rure_free frees the given compiled regular expression. - * - * This must be called at most once for any rure. - */ -void rure_free(rure *re); - -/* - * rure_is_match returns true if and only if re matches anywhere in haystack. 
- * - * haystack may contain arbitrary bytes, but ASCII compatible text is more - * useful. UTF-8 is even more useful. Other text encodings aren't supported. - * length should be the number of bytes in haystack. - * - * start is the position at which to start searching. Note that setting the - * start position is distinct from incrementing the pointer, since the regex - * engine may look at bytes before the start position to determine match - * information. For example, if the start position is greater than 0, then the - * \A ("begin text") anchor can never match. - * - * rure_is_match should be preferred to rure_find since it may be faster. - * - * N.B. The performance of this search is not impacted by the presence of - * capturing groups in your regular expression. - */ -bool rure_is_match(rure *re, const uint8_t *haystack, size_t length, - size_t start); - -/* - * rure_find returns true if and only if re matches anywhere in haystack. - * If a match is found, then its start and end offsets (in bytes) are set - * on the match pointer given. - * - * haystack may contain arbitrary bytes, but ASCII compatible text is more - * useful. UTF-8 is even more useful. Other text encodings aren't supported. - * length should be the number of bytes in haystack. - * - * start is the position at which to start searching. Note that setting the - * start position is distinct from incrementing the pointer, since the regex - * engine may look at bytes before the start position to determine match - * information. For example, if the start position is greater than 0, then the - * \A ("begin text") anchor can never match. - * - * rure_find should be preferred to rure_find_captures since it may be faster. - * - * N.B. The performance of this search is not impacted by the presence of - * capturing groups in your regular expression. 
- */ -bool rure_find(rure *re, const uint8_t *haystack, size_t length, - size_t start, rure_match *match); - -/* - * rure_find_captures returns true if and only if re matches anywhere in - * haystack. If a match is found, then all of its capture locations are stored - * in the captures pointer given. - * - * haystack may contain arbitrary bytes, but ASCII compatible text is more - * useful. UTF-8 is even more useful. Other text encodings aren't supported. - * length should be the number of bytes in haystack. - * - * start is the position at which to start searching. Note that setting the - * start position is distinct from incrementing the pointer, since the regex - * engine may look at bytes before the start position to determine match - * information. For example, if the start position is greater than 0, then the - * \A ("begin text") anchor can never match. - * - * Only use this function if you specifically need access to capture locations. - * It is not necessary to use this function just because your regular - * expression contains capturing groups. - * - * Capture locations can be accessed using the rure_captures_* functions. - * - * N.B. The performance of this search can be impacted by the number of - * capturing groups. If you're using this function, it may be beneficial to - * use non-capturing groups (e.g., `(?:re)`) where possible. - */ -bool rure_find_captures(rure *re, const uint8_t *haystack, size_t length, - size_t start, rure_captures *captures); - - - -/* - * rure_iter_capture_names_new creates a new capture_names iterator. - * - * An iterator will report all successive capture group names of re. - */ -rure_iter_capture_names *rure_iter_capture_names_new(rure *re); - -/* - * rure_iter_capture_names_free frees the iterator given. - * - * It must be called at most once. 
- */ -void rure_iter_capture_names_free(rure_iter_capture_names *it); - -/* - * rure_iter_capture_names_next advances the iterator and returns true - * if and only if another capture group name exists. - * - * The value of the capture group name is written to the provided pointer. - */ -bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name); - - -/* - * rure_iter_free frees the iterator given. - * - * It must be called at most once. - */ -void rure_iter_free(rure_iter *it); - - - -/* - * rure_captures_new allocates storage for all capturing groups in re. - * - * An rure_captures value may be reused on subsequent calls to - * rure_find_captures or rure_iter_next_captures. - * - * An rure_captures value may be freed independently of re, although any - * particular rure_captures should be used only with the re given here. - * - * It is not safe to use an rure_captures value from multiple threads - * simultaneously. - */ -rure_captures *rure_captures_new(rure *re); - -/* - * rure_captures_free frees the given captures. - * - * This must be called at most once. - */ -void rure_captures_free(rure_captures *captures); - -/* - * rure_captures_at returns true if and only if the capturing group at the - * index given was part of a match. If so, the given match pointer is populated - * with the start and end location (in bytes) of the capturing group. - * - * If no capture group with the index i exists, then false is - * returned. (A capturing group exists if and only if i is less than - * rure_captures_len(captures).) - * - * Note that index 0 corresponds to the full match. - */ -bool rure_captures_at(rure_captures *captures, size_t i, rure_match *match); - -/* - * rure_captures_len returns the number of capturing groups in the given - * captures. - */ -size_t rure_captures_len(rure_captures *captures); - - - -/* - * rure_compile_set compiles the given list of patterns into a single regular - * expression which can be matched in a linear-scan. 
Each pattern in patterns - * must be valid UTF-8 and the length of each pattern in patterns corresponds - * to a byte length in patterns_lengths. - * - * The number of patterns to compile is specified by patterns_count. patterns - * must contain at least this many entries. - * - * flags is a bitfield. Valid values are constants declared with prefix - * RURE_FLAG_. - * - * options contains non-flag configuration settings. If it's NULL, default - * settings are used. options may be freed immediately after a call to - * rure_compile. - * - * error is set if there was a problem compiling the pattern. - * - * The compiled expression set returned may be used from multiple threads. - */ -rure_set *rure_compile_set(const uint8_t **patterns, - const size_t *patterns_lengths, - size_t patterns_count, - uint32_t flags, - rure_options *options, - rure_error *error); - -/* - * rure_set_free frees the given compiled regular expression set. - * - * This must be called at most once for any rure_set. - */ -void rure_set_free(rure_set *re); - -/* - * rure_is_match returns true if and only if any regexes within the set - * match anywhere in the haystack. Once a match has been located, the - * matching engine will quit immediately. - * - * haystack may contain arbitrary bytes, but ASCII compatible text is more - * useful. UTF-8 is even more useful. Other text encodings aren't supported. - * length should be the number of bytes in haystack. - * - * start is the position at which to start searching. Note that setting the - * start position is distinct from incrementing the pointer, since the regex - * engine may look at bytes before the start position to determine match - * information. For example, if the start position is greater than 0, then the - * \A ("begin text") anchor can never match. - */ -bool rure_set_is_match(rure_set *re, const uint8_t *haystack, size_t length, + /* + * rure_match corresponds to the location of a single match in a haystack. 
+ */ + typedef struct rure_match + { + /* The start position. */ + size_t start; + /* The end position. */ + size_t end; + } rure_match; + + /* + * rure_captures represents storage for sub-capture locations of a match. + * + * Computing the capture groups of a match can carry a significant performance + * penalty, so their use in the API is optional. + * + * An rure_captures value can be reused in multiple calls to rure_find_captures, + * so long as it is used with the compiled regular expression that created + * it. + * + * An rure_captures value may outlive its corresponding rure and can be freed + * independently. + * + * It is not safe to use from multiple threads simultaneously. + */ + typedef struct rure_captures rure_captures; + + /* + * rure_iter is an iterator over successive non-overlapping matches in a + * particular haystack. + * + * An rure_iter value may not outlive its corresponding rure and should be freed + * before its corresponding rure is freed. + * + * It is not safe to use from multiple threads simultaneously. + */ + typedef struct rure_iter rure_iter; + + /* + * rure_iter_capture_names is an iterator over the list of capture group names + * in this particular rure. + * + * An rure_iter_capture_names value may not outlive its corresponding rure, + * and should be freed before its corresponding rure is freed. + * + * It is not safe to use from multiple threads simultaneously. + */ + typedef struct rure_iter_capture_names rure_iter_capture_names; + + /* + * rure_error is an error that caused compilation to fail. + * + * Most errors are syntax errors but an error can be returned if the compiled + * regular expression would be too big. + * + * Whenever a function accepts an *rure_error, it is safe to pass NULL. (But + * you will not get access to the error if one occurred.) + * + * It is not safe to use from multiple threads simultaneously. 
+ */ + typedef struct rure_error rure_error; + + typedef struct + { + char *atom; + } Atoms; + + typedef struct + { + Atoms *data; + int32_t len; + } MyVec; + + /* + * rure_compile_must compiles the given pattern into a regular expression. If + * compilation fails for any reason, an error message is printed to stderr and + * the process is aborted. + * + * The pattern given should be in UTF-8. For convenience, this accepts a C + * string, which means the pattern cannot usefully contain NUL. If your pattern + * may contain NUL, consider using a regular expression escape sequence, or + * just use rure_compile. + * + * This uses RURE_DEFAULT_FLAGS. + * + * The compiled expression returned may be used from multiple threads + * simultaneously. + */ + rure *rure_compile_must(const char *pattern); + + /* + * rure_compile compiles the given pattern into a regular expression. The + * pattern must be valid UTF-8 and the length corresponds to the number of + * bytes in the pattern. + * + * flags is a bitfield. Valid values are constants declared with prefix + * RURE_FLAG_. + * + * options contains non-flag configuration settings. If it's NULL, default + * settings are used. options may be freed immediately after a call to + * rure_compile. + * + * error is set if there was a problem compiling the pattern (including if the + * pattern is not valid UTF-8). If error is NULL, then no error information + * is returned. In all cases, if an error occurs, NULL is returned. + * + * The compiled expression returned may be used from multiple threads + * simultaneously. + */ + rure *rure_compile(const uint8_t *pattern, size_t length, + uint32_t flags, rure_options *options, + rure_error *error); + + /* + * rure_free frees the given compiled regular expression. + * + * This must be called at most once for any rure. + */ + void rure_free(rure *re); + + /* + * rure_is_match returns true if and only if re matches anywhere in haystack. 
+ * + * haystack may contain arbitrary bytes, but ASCII compatible text is more + * useful. UTF-8 is even more useful. Other text encodings aren't supported. + * length should be the number of bytes in haystack. + * + * start is the position at which to start searching. Note that setting the + * start position is distinct from incrementing the pointer, since the regex + * engine may look at bytes before the start position to determine match + * information. For example, if the start position is greater than 0, then the + * \A ("begin text") anchor can never match. + * + * rure_is_match should be preferred to rure_find since it may be faster. + * + * N.B. The performance of this search is not impacted by the presence of + * capturing groups in your regular expression. + */ + bool rure_is_match(rure *re, const uint8_t *haystack, size_t length, size_t start); -/* - * rure_set_matches compares each regex in the set against the haystack and - * modifies matches with the match result of each pattern. Match results are - * ordered in the same way as the rure_set was compiled. For example, - * index 0 of matches corresponds to the first pattern passed to - * `rure_compile_set`. - * - * haystack may contain arbitrary bytes, but ASCII compatible text is more - * useful. UTF-8 is even more useful. Other text encodings aren't supported. - * length should be the number of bytes in haystack. - * - * start is the position at which to start searching. Note that setting the - * start position is distinct from incrementing the pointer, since the regex - * engine may look at bytes before the start position to determine match - * information. For example, if the start position is greater than 0, then the - * \A ("begin text") anchor can never match. - * - * matches must be greater than or equal to the number of patterns the - * rure_set was compiled with. - * - * Only use this function if you specifically need to know which regexes - * matched within the set. 
To determine if any of the regexes matched without - * caring which, use rure_set_is_match. - */ -bool rure_set_matches(rure_set *re, const uint8_t *haystack, size_t length, - size_t start, bool *matches); - -/* - * rure_set_len returns the number of patterns rure_set was compiled with. - */ -size_t rure_set_len(rure_set *re); - -/* - * rure_error_new allocates space for an error. - * - * If error information is desired, then rure_error_new should be called - * to create an rure_error pointer, and that pointer can be passed to - * rure_compile. If an error occurred, then rure_compile will return NULL and - * the error pointer will be set. A message can then be extracted. - * - * It is not safe to use errors from multiple threads simultaneously. An error - * value may be reused on subsequent calls to rure_compile. - */ -rure_error *rure_error_new(); - -/* - * rure_error_free frees the error given. - * - * This must be called at most once. - */ -void rure_error_free(rure_error *err); - -/* - * rure_error_message returns a NUL terminated string that describes the error - * message. - * - * The pointer returned must not be freed. Instead, it will be freed when - * rure_error_free is called. If err is used in subsequent calls to - * rure_compile, then this pointer may change or become invalid. - */ -const char *rure_error_message(rure_error *err); - -/* - * rure_escape_must returns a NUL terminated string where all meta characters - * have been escaped. If escaping fails for any reason, an error message is - * printed to stderr and the process is aborted. - * - * The pattern given should be in UTF-8. For convenience, this accepts a C - * string, which means the pattern cannot contain a NUL byte. These correspond - * to the only two failure conditions of this function. That is, if the caller - * guarantees that the given pattern is valid UTF-8 and does not contain a - * NUL byte, then this is guaranteed to succeed (modulo out-of-memory errors). 
- * - * The pointer returned must not be freed directly. Instead, it should be freed - * by calling rure_cstring_free. - */ -const char *rure_escape_must(const char *pattern); - -/* - * rure_cstring_free frees the string given. - * - * This must be called at most once per string. - */ -void rure_cstring_free(char *s); - -/* - * rure_replace replaces the leftmost-first match with the rewrite provided. - * - * The rewrite can be a regular string (where `$N` and `$name` are - * expanded to match capture groups) or a function that takes the matches' - * `Captures` and returns the replaced string. - * - * The longest possible name is used. e.g., `$1a` looks up the capture - * group named `1a` and not the capture group at index `1`. To exert more - * precise control over the name, use braces, e.g., `${1}a`. - * - * If no match is found, then a copy of the string is returned unchanged. - * - */ -const char *rure_replace(rure *re, const uint8_t *haystack, size_t len_h, - const uint8_t *rewrite, size_t len_r); - -/* - * This like the previous function rure_replace, but is has different. - * rure_replace_all replaces all non-overlapping matches in `text` with the rewrite provided. - * - * If no match is found, then a copy of the string is returned unchanged. - */ -const char *rure_replace_all(rure *re, const uint8_t *haystack, size_t len_h, - const uint8_t *rewrite, size_t len_r); - -/* - * Simple way to use regex - */ - -rure *rure_new(const uint8_t *pattern, size_t length); -bool rure_consume(rure *re, const uint8_t *haystack, size_t length, rure_match *match); -int rure_max_submatch(const char *rewrite); -bool rure_check_rewrite_string(const char *rewrite, int max_token); - - -/* - * Convert RE2 style rewrite string to a string that Rust can accept -*/ -const char *rure_rewrite_str_convert(const uint8_t *rewrite, size_t len); - -/* - * Similar to Rewrite function in RE2. 
-*/ -const char *rure_rewrite(const uint8_t *rewrite, size_t len, const uint8_t **vecs, - const size_t *vecs_lengths, size_t vecs_count); - -/* - * Calculate the number of replacements. -*/ -size_t rure_replace_count(rure *re, const char *haystack); - -MyVec rure_filter_compile(const uint8_t *regex_str, size_t regex_len, size_t min_atoms_len); + /* + * rure_find returns true if and only if re matches anywhere in haystack. + * If a match is found, then its start and end offsets (in bytes) are set + * on the match pointer given. + * + * haystack may contain arbitrary bytes, but ASCII compatible text is more + * useful. UTF-8 is even more useful. Other text encodings aren't supported. + * length should be the number of bytes in haystack. + * + * start is the position at which to start searching. Note that setting the + * start position is distinct from incrementing the pointer, since the regex + * engine may look at bytes before the start position to determine match + * information. For example, if the start position is greater than 0, then the + * \A ("begin text") anchor can never match. + * + * rure_find should be preferred to rure_find_captures since it may be faster. + * + * N.B. The performance of this search is not impacted by the presence of + * capturing groups in your regular expression. + */ + bool rure_find(rure *re, const uint8_t *haystack, size_t length, + size_t start, rure_match *match); + + /* + * rure_find_captures returns true if and only if re matches anywhere in + * haystack. If a match is found, then all of its capture locations are stored + * in the captures pointer given. + * + * haystack may contain arbitrary bytes, but ASCII compatible text is more + * useful. UTF-8 is even more useful. Other text encodings aren't supported. + * length should be the number of bytes in haystack. + * + * start is the position at which to start searching. 
Note that setting the + * start position is distinct from incrementing the pointer, since the regex + * engine may look at bytes before the start position to determine match + * information. For example, if the start position is greater than 0, then the + * \A ("begin text") anchor can never match. + * + * Only use this function if you specifically need access to capture locations. + * It is not necessary to use this function just because your regular + * expression contains capturing groups. + * + * Capture locations can be accessed using the rure_captures_* functions. + * + * N.B. The performance of this search can be impacted by the number of + * capturing groups. If you're using this function, it may be beneficial to + * use non-capturing groups (e.g., `(?:re)`) where possible. + */ + bool rure_find_captures(rure *re, const uint8_t *haystack, size_t length, + size_t start, rure_captures *captures); + + /* + * rure_iter_capture_names_new creates a new capture_names iterator. + * + * An iterator will report all successive capture group names of re. + */ + rure_iter_capture_names *rure_iter_capture_names_new(rure *re); + + /* + * rure_iter_capture_names_free frees the iterator given. + * + * It must be called at most once. + */ + void rure_iter_capture_names_free(rure_iter_capture_names *it); + + /* + * rure_iter_capture_names_next advances the iterator and returns true + * if and only if another capture group name exists. + * + * The value of the capture group name is written to the provided pointer. + */ + bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name); + + /* + * rure_iter_free frees the iterator given. + * + * It must be called at most once. + */ + void rure_iter_free(rure_iter *it); + + /* + * rure_captures_new allocates storage for all capturing groups in re. + * + * An rure_captures value may be reused on subsequent calls to + * rure_find_captures or rure_iter_next_captures. 
+ * + * An rure_captures value may be freed independently of re, although any + * particular rure_captures should be used only with the re given here. + * + * It is not safe to use an rure_captures value from multiple threads + * simultaneously. + */ + rure_captures *rure_captures_new(rure *re); + + /* + * rure_captures_free frees the given captures. + * + * This must be called at most once. + */ + void rure_captures_free(rure_captures *captures); + + /* + * rure_captures_at returns true if and only if the capturing group at the + * index given was part of a match. If so, the given match pointer is populated + * with the start and end location (in bytes) of the capturing group. + * + * If no capture group with the index i exists, then false is + * returned. (A capturing group exists if and only if i is less than + * rure_captures_len(captures).) + * + * Note that index 0 corresponds to the full match. + */ + bool rure_captures_at(rure_captures *captures, size_t i, rure_match *match); + + /* + * rure_captures_len returns the number of capturing groups in the given + * captures. + */ + size_t rure_captures_len(rure_captures *captures); + + /* + * rure_compile_set compiles the given list of patterns into a single regular + * expression which can be matched in a linear-scan. Each pattern in patterns + * must be valid UTF-8 and the length of each pattern in patterns corresponds + * to a byte length in patterns_lengths. + * + * The number of patterns to compile is specified by patterns_count. patterns + * must contain at least this many entries. + * + * flags is a bitfield. Valid values are constants declared with prefix + * RURE_FLAG_. + * + * options contains non-flag configuration settings. If it's NULL, default + * settings are used. options may be freed immediately after a call to + * rure_compile. + * + * error is set if there was a problem compiling the pattern. + * + * The compiled expression set returned may be used from multiple threads. 
+ */ + rure_set *rure_compile_set(const uint8_t **patterns, + const size_t *patterns_lengths, + size_t patterns_count, + uint32_t flags, + rure_options *options, + rure_error *error); + + /* + * rure_set_free frees the given compiled regular expression set. + * + * This must be called at most once for any rure_set. + */ + void rure_set_free(rure_set *re); + + /* + * rure_is_match returns true if and only if any regexes within the set + * match anywhere in the haystack. Once a match has been located, the + * matching engine will quit immediately. + * + * haystack may contain arbitrary bytes, but ASCII compatible text is more + * useful. UTF-8 is even more useful. Other text encodings aren't supported. + * length should be the number of bytes in haystack. + * + * start is the position at which to start searching. Note that setting the + * start position is distinct from incrementing the pointer, since the regex + * engine may look at bytes before the start position to determine match + * information. For example, if the start position is greater than 0, then the + * \A ("begin text") anchor can never match. + */ + bool rure_set_is_match(rure_set *re, const uint8_t *haystack, size_t length, + size_t start); + + /* + * rure_set_matches compares each regex in the set against the haystack and + * modifies matches with the match result of each pattern. Match results are + * ordered in the same way as the rure_set was compiled. For example, + * index 0 of matches corresponds to the first pattern passed to + * `rure_compile_set`. + * + * haystack may contain arbitrary bytes, but ASCII compatible text is more + * useful. UTF-8 is even more useful. Other text encodings aren't supported. + * length should be the number of bytes in haystack. + * + * start is the position at which to start searching. 
Note that setting the + * start position is distinct from incrementing the pointer, since the regex + * engine may look at bytes before the start position to determine match + * information. For example, if the start position is greater than 0, then the + * \A ("begin text") anchor can never match. + * + * matches must be greater than or equal to the number of patterns the + * rure_set was compiled with. + * + * Only use this function if you specifically need to know which regexes + * matched within the set. To determine if any of the regexes matched without + * caring which, use rure_set_is_match. + */ + bool rure_set_matches(rure_set *re, const uint8_t *haystack, size_t length, + size_t start, bool *matches); + + /* + * rure_set_len returns the number of patterns rure_set was compiled with. + */ + size_t rure_set_len(rure_set *re); + + /* + * rure_error_new allocates space for an error. + * + * If error information is desired, then rure_error_new should be called + * to create an rure_error pointer, and that pointer can be passed to + * rure_compile. If an error occurred, then rure_compile will return NULL and + * the error pointer will be set. A message can then be extracted. + * + * It is not safe to use errors from multiple threads simultaneously. An error + * value may be reused on subsequent calls to rure_compile. + */ + rure_error *rure_error_new(); + + /* + * rure_error_free frees the error given. + * + * This must be called at most once. + */ + void rure_error_free(rure_error *err); + + /* + * rure_error_message returns a NUL terminated string that describes the error + * message. + * + * The pointer returned must not be freed. Instead, it will be freed when + * rure_error_free is called. If err is used in subsequent calls to + * rure_compile, then this pointer may change or become invalid. + */ + const char *rure_error_message(rure_error *err); + + /* + * rure_escape_must returns a NUL terminated string where all meta characters + * have been escaped. 
If escaping fails for any reason, an error message is + * printed to stderr and the process is aborted. + * + * The pattern given should be in UTF-8. For convenience, this accepts a C + * string, which means the pattern cannot contain a NUL byte. These correspond + * to the only two failure conditions of this function. That is, if the caller + * guarantees that the given pattern is valid UTF-8 and does not contain a + * NUL byte, then this is guaranteed to succeed (modulo out-of-memory errors). + * + * The pointer returned must not be freed directly. Instead, it should be freed + * by calling rure_cstring_free. + */ + const char *rure_escape_must(const char *pattern); + + /* + * rure_cstring_free frees the string given. + * + * This must be called at most once per string. + */ + void rure_cstring_free(char *s); + + /* + * rure_replace replaces the leftmost-first match with the rewrite provided. + * + * The rewrite can be a regular string (where `$N` and `$name` are + * expanded to match capture groups) or a function that takes the matches' + * `Captures` and returns the replaced string. + * + * The longest possible name is used. e.g., `$1a` looks up the capture + * group named `1a` and not the capture group at index `1`. To exert more + * precise control over the name, use braces, e.g., `${1}a`. + * + * If no match is found, then a copy of the string is returned unchanged. + * + */ + const char *rure_replace(rure *re, const uint8_t *haystack, size_t len_h, + const uint8_t *rewrite, size_t len_r); + + /* + * This like the previous function rure_replace, but is has different. + * rure_replace_all replaces all non-overlapping matches in `text` with the rewrite provided. + * + * If no match is found, then a copy of the string is returned unchanged. 
+ */ + const char *rure_replace_all(rure *re, const uint8_t *haystack, size_t len_h, + const uint8_t *rewrite, size_t len_r); + + /* + * Simple way to use regex + */ + + rure *rure_new(const uint8_t *pattern, size_t length); + bool rure_consume(rure *re, const uint8_t *haystack, size_t length, rure_match *match); + int rure_max_submatch(const char *rewrite); + bool rure_check_rewrite_string(const char *rewrite, int max_token); + + /* + * Convert RE2 style rewrite string to a string that Rust can accept + */ + const char *rure_rewrite_str_convert(const uint8_t *rewrite, size_t len); + + /* + * Similar to Rewrite function in RE2. + */ + const char *rure_rewrite(const uint8_t *rewrite, size_t len, const uint8_t **vecs, + const size_t *vecs_lengths, size_t vecs_count); + + /* + * Calculate the number of replacements. + */ + size_t rure_replace_count(rure *re, const char *haystack); + + MyVec rure_filter_compile(const uint8_t *regex_str, size_t regex_len, size_t min_atoms_len); #ifdef __cplusplus } diff --git a/regex-capi/src/error.rs b/regex-capi/src/error.rs index bf15e22b2c251f025ccd40a494663b6efddeeac6..413b2a8327219129ab43adf2932837deb14a3826 100644 --- a/regex-capi/src/error.rs +++ b/regex-capi/src/error.rs @@ -53,7 +53,9 @@ extern "C" fn rure_error_new() -> *mut Error { #[no_mangle] extern "C" fn rure_error_free(err: *mut Error) { - unsafe { drop(Box::from_raw(err)); } + unsafe { + drop(Box::from_raw(err)); + } } #[no_mangle] diff --git a/regex-capi/src/lib.rs b/regex-capi/src/lib.rs index d7b77990aba5ec641f31958a5295ed2c3ae964f2..7a269d1822fc5d6617b19521d08e49ca82587799 100644 --- a/regex-capi/src/lib.rs +++ b/regex-capi/src/lib.rs @@ -12,142 +12,142 @@ * Create: 2022-11-25 * Description: Rure is a C API to Rust's regex library. 
******************************************************************************/ - #[macro_use] - mod error; - pub use crate::error::*; - - use std::ffi::{CStr, CString}; - use std::ops::Deref; - use std::ptr; - use std::slice; - use std::str; - - use libc::{c_char, size_t}; - - use regex::{bytes, Regex}; - - use crate::error::{Error, ErrorKind}; - use std::io; - use std::io::Write; - use std::process::abort; - - include!("lib_internal.rs"); - - const RURE_FLAG_CASEI: u32 = 1 << 0; - const RURE_FLAG_MULTI: u32 = 1 << 1; - const RURE_FLAG_DOTNL: u32 = 1 << 2; - const RURE_FLAG_SWAP_GREED: u32 = 1 << 3; - const RURE_FLAG_SPACE: u32 = 1 << 4; - const RURE_FLAG_UNICODE: u32 = 1 << 5; - const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE; - - pub struct RegexBytes { - re: bytes::Regex, - // capture_names: HashMap, - } - - pub struct RegexUnicode { - re: Regex, - } - - pub struct Options { - size_limit: usize, - dfa_size_limit: usize, - } - - // The `RegexSet` is not exposed with option support or matching at an - // arbitrary position with a crate just yet. To circumvent this, we use - // the `Exec` structure directly. 
- pub struct RegexSet { - re: bytes::RegexSet, - } - - #[repr(C)] - pub struct rure_match { - pub start: size_t, - pub end: size_t, - } - - pub struct Captures(bytes::Locations); - - pub struct IterCaptureNames { - capture_names: bytes::CaptureNames<'static>, - name_ptrs: Vec<*mut c_char>, - } - - #[repr(C)] - pub struct Atoms { - atom: *mut c_char, - } - - #[repr(C)] - pub struct MyVec { - data: *mut Atoms, - len: i32, - } - - impl Deref for RegexBytes { - type Target = bytes::Regex; - fn deref(&self) -> &bytes::Regex { - &self.re - } - } - - impl Deref for RegexUnicode { - type Target = Regex; - fn deref(&self) -> &Regex { - &self.re - } - } - - impl Deref for RegexSet { - type Target = bytes::RegexSet; - fn deref(&self) -> &bytes::RegexSet { - &self.re - } - } - - impl Default for Options { - fn default() -> Options { - Options { - size_limit: 10 * (1 << 20), - dfa_size_limit: 2 * (1 << 20), - } - } - } - - #[no_mangle] - extern "C" fn rure_compile_must(pattern: *const c_char) -> *const RegexBytes { - let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; - let pat = pattern as *const u8; - let mut err = Error::new(ErrorKind::None); - let re = rure_compile(pat, len, RURE_DEFAULT_FLAGS, ptr::null(), &mut err); - if err.is_err() { - let _ = writeln!(&mut io::stderr(), "{}", err); - let _ = writeln!(&mut io::stderr(), "aborting from rure_compile_must"); - abort() - } - re - } - - #[no_mangle] - extern "C" fn rure_compile( - pattern: *const u8, - length: size_t, - flags: u32, - options: *const Options, - error: *mut Error, - ) -> *const RegexBytes { - let pat = unsafe { slice::from_raw_parts(pattern, length) }; - let pat = match str::from_utf8(pat) { - Ok(pat) => pat, - Err(err) => unsafe { - if !error.is_null() { - *error = Error::new(ErrorKind::Str(err)); - } - return ptr::null(); - }, - }; +#[macro_use] +mod error; +pub use crate::error::*; + +use std::ffi::{CStr, CString}; +use std::ops::Deref; +use std::ptr; +use std::slice; +use std::str; + +use 
libc::{c_char, size_t}; + +use regex::{bytes, Regex}; + +use crate::error::{Error, ErrorKind}; +use std::io; +use std::io::Write; +use std::process::abort; + +include!("lib_internal.rs"); + +const RURE_FLAG_CASEI: u32 = 1 << 0; +const RURE_FLAG_MULTI: u32 = 1 << 1; +const RURE_FLAG_DOTNL: u32 = 1 << 2; +const RURE_FLAG_SWAP_GREED: u32 = 1 << 3; +const RURE_FLAG_SPACE: u32 = 1 << 4; +const RURE_FLAG_UNICODE: u32 = 1 << 5; +const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE; + +pub struct RegexBytes { + re: bytes::Regex, + // capture_names: HashMap, +} + +pub struct RegexUnicode { + re: Regex, +} + +pub struct Options { + size_limit: usize, + dfa_size_limit: usize, +} + +// The `RegexSet` is not exposed with option support or matching at an +// arbitrary position with a crate just yet. To circumvent this, we use +// the `Exec` structure directly. +pub struct RegexSet { + re: bytes::RegexSet, +} + +#[repr(C)] +pub struct rure_match { + pub start: size_t, + pub end: size_t, +} + +pub struct Captures(bytes::Locations); + +pub struct IterCaptureNames { + capture_names: bytes::CaptureNames<'static>, + name_ptrs: Vec<*mut c_char>, +} + +#[repr(C)] +pub struct Atoms { + atom: *mut c_char, +} + +#[repr(C)] +pub struct MyVec { + data: *mut Atoms, + len: i32, +} + +impl Deref for RegexBytes { + type Target = bytes::Regex; + fn deref(&self) -> &bytes::Regex { + &self.re + } +} + +impl Deref for RegexUnicode { + type Target = Regex; + fn deref(&self) -> &Regex { + &self.re + } +} + +impl Deref for RegexSet { + type Target = bytes::RegexSet; + fn deref(&self) -> &bytes::RegexSet { + &self.re + } +} + +impl Default for Options { + fn default() -> Options { + Options { + size_limit: 10 * (1 << 20), + dfa_size_limit: 2 * (1 << 20), + } + } +} + +#[no_mangle] +extern "C" fn rure_compile_must(pattern: *const c_char) -> *const RegexBytes { + let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; + let pat = pattern as *const u8; + let mut err = Error::new(ErrorKind::None); + let 
re = rure_compile(pat, len, RURE_DEFAULT_FLAGS, ptr::null(), &mut err); + if err.is_err() { + let _ = writeln!(&mut io::stderr(), "{}", err); + let _ = writeln!(&mut io::stderr(), "aborting from rure_compile_must"); + abort() + } + re +} + +#[no_mangle] +extern "C" fn rure_compile( + pattern: *const u8, + length: size_t, + flags: u32, + options: *const Options, + error: *mut Error, +) -> *const RegexBytes { + let pat = unsafe { slice::from_raw_parts(pattern, length) }; + let pat = match str::from_utf8(pat) { + Ok(pat) => pat, + Err(err) => unsafe { + if !error.is_null() { + *error = Error::new(ErrorKind::Str(err)); + } + return ptr::null(); + }, + }; let mut builder = rure_compile_internal(pat, flags); if !options.is_null() { @@ -160,7 +160,7 @@ Ok(re) => { let re = RegexBytes { re }; Box::into_raw(Box::new(re)) - }, + } Err(err) => unsafe { if !error.is_null() { *error = Error::new(ErrorKind::Regex(err)); @@ -168,135 +168,135 @@ ptr::null() }, } - } - - #[no_mangle] - extern "C" fn rure_free(re: *const RegexBytes) { - unsafe { - drop(Box::from_raw(re as *mut Regex)); - } - } - - #[no_mangle] - extern "C" fn rure_is_match( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, - _start: size_t, - ) -> bool { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - re.is_match(haystack) - } - - #[no_mangle] - extern "C" fn rure_find( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, - start: size_t, - match_info: *mut rure_match, - ) -> bool { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - re.find_at(haystack, start) - .map(|m| unsafe { - if !match_info.is_null() { - (*match_info).start = m.start(); - (*match_info).end = m.end(); - } - }) - .is_some() - } - - #[no_mangle] - extern "C" fn rure_find_captures( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, - start: size_t, - captures: *mut Captures, - ) -> bool { - let re = unsafe { &*re }; - let 
haystack = unsafe { slice::from_raw_parts(haystack, len) }; - let slots = unsafe { &mut (*captures).0 }; - re.read_captures_at(slots, haystack, start).is_some() - } - - #[no_mangle] - extern "C" fn rure_iter_capture_names_new(re: *const RegexBytes) -> *mut IterCaptureNames { - let re = unsafe { &*re }; - Box::into_raw(Box::new(IterCaptureNames { - capture_names: re.re.capture_names(), - name_ptrs: Vec::new(), - })) - } - - #[no_mangle] - extern "C" fn rure_iter_capture_names_free(it: *mut IterCaptureNames) { - unsafe { - let it = &mut *it; - while let Some(ptr) = it.name_ptrs.pop() { - drop(CString::from_raw(ptr)); - } - drop(Box::from_raw(it)); - } - } - - #[no_mangle] - extern "C" fn rure_iter_capture_names_next( - it: *mut IterCaptureNames, - capture_name: *mut *mut c_char, - ) -> bool { - if capture_name.is_null() { - return false; - } - let it = unsafe { &mut *it }; - let cn = match it.capture_names.next() { - // Top-level iterator ran out of capture groups - None => return false, - Some(val) => { - match val { - // inner Option didn't have a name - None => "", - Some(name) => name, - } - } - }; - unsafe { - let cs = match CString::new(cn.as_bytes()) { - Result::Ok(val) => val, - Result::Err(_) => return false, - }; - let ptr = cs.into_raw(); - it.name_ptrs.push(ptr); - *capture_name = ptr; - } - true - } - - #[no_mangle] - extern "C" fn rure_captures_new(re: *const RegexBytes) -> *mut Captures { - let re = unsafe { &*re }; - let captures = Captures(re.locations()); - Box::into_raw(Box::new(captures)) - } - - #[no_mangle] - extern "C" fn rure_captures_free(captures: *const Captures) { - unsafe { - drop(Box::from_raw(captures as *mut Captures)); - } - } - - #[no_mangle] - extern "C" fn rure_captures_at( - captures: *const Captures, - i: size_t, - match_info: *mut rure_match, - ) -> bool { - let locs = unsafe { &(*captures).0 }; - match locs.pos(i) { +} + +#[no_mangle] +extern "C" fn rure_free(re: *const RegexBytes) { + unsafe { + drop(Box::from_raw(re as *mut 
Regex)); + } +} + +#[no_mangle] +extern "C" fn rure_is_match( + re: *const RegexBytes, + haystack: *const u8, + len: size_t, + _start: size_t, +) -> bool { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + re.is_match(haystack) +} + +#[no_mangle] +extern "C" fn rure_find( + re: *const RegexBytes, + haystack: *const u8, + len: size_t, + start: size_t, + match_info: *mut rure_match, +) -> bool { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + re.find_at(haystack, start) + .map(|m| unsafe { + if !match_info.is_null() { + (*match_info).start = m.start(); + (*match_info).end = m.end(); + } + }) + .is_some() +} + +#[no_mangle] +extern "C" fn rure_find_captures( + re: *const RegexBytes, + haystack: *const u8, + len: size_t, + start: size_t, + captures: *mut Captures, +) -> bool { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + let slots = unsafe { &mut (*captures).0 }; + re.read_captures_at(slots, haystack, start).is_some() +} + +#[no_mangle] +extern "C" fn rure_iter_capture_names_new(re: *const RegexBytes) -> *mut IterCaptureNames { + let re = unsafe { &*re }; + Box::into_raw(Box::new(IterCaptureNames { + capture_names: re.re.capture_names(), + name_ptrs: Vec::new(), + })) +} + +#[no_mangle] +extern "C" fn rure_iter_capture_names_free(it: *mut IterCaptureNames) { + unsafe { + let it = &mut *it; + while let Some(ptr) = it.name_ptrs.pop() { + drop(CString::from_raw(ptr)); + } + drop(Box::from_raw(it)); + } +} + +#[no_mangle] +extern "C" fn rure_iter_capture_names_next( + it: *mut IterCaptureNames, + capture_name: *mut *mut c_char, +) -> bool { + if capture_name.is_null() { + return false; + } + let it = unsafe { &mut *it }; + let cn = match it.capture_names.next() { + // Top-level iterator ran out of capture groups + None => return false, + Some(val) => { + match val { + // inner Option didn't have a name + None => "", + Some(name) 
=> name, + } + } + }; + unsafe { + let cs = match CString::new(cn.as_bytes()) { + Result::Ok(val) => val, + Result::Err(_) => return false, + }; + let ptr = cs.into_raw(); + it.name_ptrs.push(ptr); + *capture_name = ptr; + } + true +} + +#[no_mangle] +extern "C" fn rure_captures_new(re: *const RegexBytes) -> *mut Captures { + let re = unsafe { &*re }; + let captures = Captures(re.locations()); + Box::into_raw(Box::new(captures)) +} + +#[no_mangle] +extern "C" fn rure_captures_free(captures: *const Captures) { + unsafe { + drop(Box::from_raw(captures as *mut Captures)); + } +} + +#[no_mangle] +extern "C" fn rure_captures_at( + captures: *const Captures, + i: size_t, + match_info: *mut rure_match, +) -> bool { + let locs = unsafe { &(*captures).0 }; + match locs.pos(i) { Some((start, end)) => { if !match_info.is_null() { unsafe { @@ -308,41 +308,41 @@ } _ => false, } - } - - #[no_mangle] - extern "C" fn rure_captures_len(captures: *const Captures) -> size_t { - unsafe { (*captures).0.len() } - } - - #[no_mangle] - extern "C" fn rure_compile_set( - patterns: *const *const u8, - patterns_lengths: *const size_t, - patterns_count: size_t, - flags: u32, - options: *const Options, - error: *mut Error, - ) -> *const RegexSet { - let (raw_pats, raw_patsl) = unsafe { - ( - slice::from_raw_parts(patterns, patterns_count), - slice::from_raw_parts(patterns_lengths, patterns_count), - ) - }; - let mut pats = Vec::with_capacity(patterns_count); - for (&raw_pat, &raw_patl) in raw_pats.iter().zip(raw_patsl) { - let pat = unsafe { slice::from_raw_parts(raw_pat, raw_patl) }; - pats.push(match str::from_utf8(pat) { - Ok(pat) => pat, - Err(err) => unsafe { - if !error.is_null() { - *error = Error::new(ErrorKind::Str(err)); - } - return ptr::null(); - }, - }); - } +} + +#[no_mangle] +extern "C" fn rure_captures_len(captures: *const Captures) -> size_t { + unsafe { (*captures).0.len() } +} + +#[no_mangle] +extern "C" fn rure_compile_set( + patterns: *const *const u8, + patterns_lengths: 
*const size_t, + patterns_count: size_t, + flags: u32, + options: *const Options, + error: *mut Error, +) -> *const RegexSet { + let (raw_pats, raw_patsl) = unsafe { + ( + slice::from_raw_parts(patterns, patterns_count), + slice::from_raw_parts(patterns_lengths, patterns_count), + ) + }; + let mut pats = Vec::with_capacity(patterns_count); + for (&raw_pat, &raw_patl) in raw_pats.iter().zip(raw_patsl) { + let pat = unsafe { slice::from_raw_parts(raw_pat, raw_patl) }; + pats.push(match str::from_utf8(pat) { + Ok(pat) => pat, + Err(err) => unsafe { + if !error.is_null() { + *error = Error::new(ErrorKind::Str(err)); + } + return ptr::null(); + }, + }); + } let mut builder = rure_compile_set_internal(pats, flags); if !options.is_null() { @@ -359,230 +359,228 @@ ptr::null() }, } - } - - #[no_mangle] - extern "C" fn rure_set_free(re: *const RegexSet) { - unsafe { - drop(Box::from_raw(re as *mut RegexSet)); - } - } - - #[no_mangle] - extern "C" fn rure_set_is_match( - re: *const RegexSet, - haystack: *const u8, - len: size_t, - start: size_t, - ) -> bool { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - re.is_match_at(haystack, start) - } - - #[no_mangle] - extern "C" fn rure_set_matches( - re: *const RegexSet, - haystack: *const u8, - len: size_t, - start: size_t, - matches: *mut bool, - ) -> bool { - let re = unsafe { &*re }; - let matches = unsafe { slice::from_raw_parts_mut(matches, re.len()) }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - - rure_set_matches_internal(re, matches, haystack, start) - } - - #[no_mangle] - extern "C" fn rure_set_len(re: *const RegexSet) -> size_t { - unsafe { (*re).len() } - } - - #[no_mangle] - extern "C" fn rure_escape_must(pattern: *const c_char) -> *const c_char { - let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; - let pat = pattern as *const u8; - let mut err = Error::new(ErrorKind::None); - let esc = rure_escape(pat, len, &mut err); - if 
err.is_err() { - println!("{}", "aborting from rure_escape_must"); - let _ = writeln!(&mut io::stderr(), "{}", err); - let _ = writeln!(&mut io::stderr(), "aborting from rure_escape_must"); - abort() - } - esc - } - - /// A helper function that implements fallible escaping in a way that returns - /// an error if escaping failed. - /// - /// This should ideally be exposed, but it needs API design work. In - /// particular, this should not return a C string, but a `const uint8_t *` - /// instead, since it may contain a NUL byte. - fn rure_escape(pattern: *const u8, length: size_t, error: *mut Error) -> *const c_char { - let pat: &[u8] = unsafe { slice::from_raw_parts(pattern, length) }; - let str_pat = match str::from_utf8(pat) { - Ok(val) => val, - Err(err) => unsafe { - if !error.is_null() { - *error = Error::new(ErrorKind::Str(err)); - } - return ptr::null(); - }, - }; - let esc_pat = regex::escape(str_pat); - let c_esc_pat = match CString::new(esc_pat) { - Ok(val) => val, - Err(err) => unsafe { - if !error.is_null() { - *error = Error::new(ErrorKind::Nul(err)); - } - return ptr::null(); - }, - }; - c_esc_pat.into_raw() as *const c_char - } - - #[no_mangle] - extern "C" fn rure_cstring_free(s: *mut c_char) { - unsafe { - drop(CString::from_raw(s)); - } - } - - #[no_mangle] - extern "C" fn rure_replace( - re: *const RegexUnicode, - haystack: *const u8, - len_h: size_t, - rewrite: *const u8, - len_r: size_t, - ) -> *const u8 { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len_h) }; - let rewrite = unsafe { slice::from_raw_parts(rewrite, len_r) }; - rure_replace_internal(re, haystack, rewrite) - } - - #[no_mangle] - extern "C" fn rure_replace_all( - re: *const RegexUnicode, - haystack: *const u8, - len_h: size_t, - rewrite: *const u8, - len_r: size_t, - ) -> *const u8 { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len_h) }; - let rewrite = unsafe { slice::from_raw_parts(rewrite, len_r) 
}; - rure_replace_all_internal(re, haystack, rewrite) - } - - /* - * Simple way to use regex - */ - - #[no_mangle] - extern "C" fn rure_new(pattern: *const u8, length: size_t) -> *const RegexBytes { - let pat = unsafe { slice::from_raw_parts(pattern, length) }; - rure_new_internal(pat) - } - - #[no_mangle] - extern "C" fn rure_consume( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, - match_info: *mut rure_match, - ) -> bool { - let exp = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - exp.find(haystack) - .map(|m| unsafe { - if !match_info.is_null() { - (*match_info).start = m.start(); - (*match_info).end = m.end(); - } - }) - .is_some() - } - - #[no_mangle] - extern "C" fn rure_max_submatch(rewrite: *const c_char) -> i32 { - let len = unsafe { CStr::from_ptr(rewrite).to_bytes().len() }; - let pat = rewrite as *const u8; - let text = unsafe { slice::from_raw_parts(pat, len) }; - - rure_max_submatch_internal(text) - } - - #[no_mangle] - extern "C" fn rure_check_rewrite_string(rewrite: *const c_char, cap_num: i32) -> bool { - let len = unsafe { CStr::from_ptr(rewrite).to_bytes().len() }; - let pat = rewrite as *const u8; - let text = unsafe { slice::from_raw_parts(pat, len) }; - - rure_check_rewrite_string_internal(text, cap_num) - } - - #[no_mangle] - extern "C" fn rure_rewrite_str_convert(rewrite: *const u8, length: size_t) -> *const c_char { - let rewrite = unsafe { slice::from_raw_parts(rewrite, length) }; - - rure_rewrite_str_convert_internal(rewrite) - } - - #[no_mangle] - extern "C" fn rure_rewrite( - rewrite: *const u8, - length: size_t, - vecs: *const *const u8, - vecs_lengths: *const size_t, - vecs_count: size_t, - ) -> *const c_char { - // 获取rewrite - let rewrite = unsafe { slice::from_raw_parts(rewrite, length) }; - let rewrite_str = std::str::from_utf8(rewrite).unwrap(); - - //获取vecs中的内容 - let (raw_vecs, raw_vecsl) = unsafe { - ( - slice::from_raw_parts(vecs, vecs_count), - 
slice::from_raw_parts(vecs_lengths, vecs_count), - ) - }; - - let mut rure_vecs = Vec::with_capacity(vecs_count); - for (&raw_vec, &raw_vecl) in raw_vecs.iter().zip(raw_vecsl) { - let rure_vec = unsafe { slice::from_raw_parts(raw_vec, raw_vecl) }; - rure_vecs.push(str::from_utf8(rure_vec).unwrap()); - } - - rure_rewrite_internal(rewrite_str, vecs_count, rure_vecs) - } - - #[no_mangle] - extern "C" fn rure_replace_count(re: *const RegexUnicode, haystack: *const c_char) -> size_t { - let len = unsafe { CStr::from_ptr(haystack).to_bytes().len() }; - let hay = haystack as *const u8; - - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(hay, len) }; - rure_replace_count_internal(haystack, re) - } - - #[no_mangle] - extern "C" fn rure_filter_compile( - regex_str: *const u8, - regex_len: size_t, - min_atoms_len: size_t, - ) -> MyVec { - let r = unsafe { slice::from_raw_parts(regex_str, regex_len) }; - let regex_str = str::from_utf8(r).unwrap(); - let atoms = my_compile(regex_str, min_atoms_len as i32); - atoms - } - +} + +#[no_mangle] +extern "C" fn rure_set_free(re: *const RegexSet) { + unsafe { + drop(Box::from_raw(re as *mut RegexSet)); + } +} + +#[no_mangle] +extern "C" fn rure_set_is_match( + re: *const RegexSet, + haystack: *const u8, + len: size_t, + start: size_t, +) -> bool { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + re.is_match_at(haystack, start) +} + +#[no_mangle] +extern "C" fn rure_set_matches( + re: *const RegexSet, + haystack: *const u8, + len: size_t, + start: size_t, + matches: *mut bool, +) -> bool { + let re = unsafe { &*re }; + let matches = unsafe { slice::from_raw_parts_mut(matches, re.len()) }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + + rure_set_matches_internal(re, matches, haystack, start) +} + +#[no_mangle] +extern "C" fn rure_set_len(re: *const RegexSet) -> size_t { + unsafe { (*re).len() } +} + +#[no_mangle] +extern "C" fn 
rure_escape_must(pattern: *const c_char) -> *const c_char { + let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; + let pat = pattern as *const u8; + let mut err = Error::new(ErrorKind::None); + let esc = rure_escape(pat, len, &mut err); + if err.is_err() { + println!("{}", "aborting from rure_escape_must"); + let _ = writeln!(&mut io::stderr(), "{}", err); + let _ = writeln!(&mut io::stderr(), "aborting from rure_escape_must"); + abort() + } + esc +} + +/// A helper function that implements fallible escaping in a way that returns +/// an error if escaping failed. +/// +/// This should ideally be exposed, but it needs API design work. In +/// particular, this should not return a C string, but a `const uint8_t *` +/// instead, since it may contain a NUL byte. +fn rure_escape(pattern: *const u8, length: size_t, error: *mut Error) -> *const c_char { + let pat: &[u8] = unsafe { slice::from_raw_parts(pattern, length) }; + let str_pat = match str::from_utf8(pat) { + Ok(val) => val, + Err(err) => unsafe { + if !error.is_null() { + *error = Error::new(ErrorKind::Str(err)); + } + return ptr::null(); + }, + }; + let esc_pat = regex::escape(str_pat); + let c_esc_pat = match CString::new(esc_pat) { + Ok(val) => val, + Err(err) => unsafe { + if !error.is_null() { + *error = Error::new(ErrorKind::Nul(err)); + } + return ptr::null(); + }, + }; + c_esc_pat.into_raw() as *const c_char +} + +#[no_mangle] +extern "C" fn rure_cstring_free(s: *mut c_char) { + unsafe { + drop(CString::from_raw(s)); + } +} + +#[no_mangle] +extern "C" fn rure_replace( + re: *const RegexUnicode, + haystack: *const u8, + len_h: size_t, + rewrite: *const u8, + len_r: size_t, +) -> *const u8 { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len_h) }; + let rewrite = unsafe { slice::from_raw_parts(rewrite, len_r) }; + rure_replace_internal(re, haystack, rewrite) +} + +#[no_mangle] +extern "C" fn rure_replace_all( + re: *const RegexUnicode, + haystack: *const u8, 
+ len_h: size_t, + rewrite: *const u8, + len_r: size_t, +) -> *const u8 { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len_h) }; + let rewrite = unsafe { slice::from_raw_parts(rewrite, len_r) }; + rure_replace_all_internal(re, haystack, rewrite) +} + +/* + * Simple way to use regex + */ + +#[no_mangle] +extern "C" fn rure_new(pattern: *const u8, length: size_t) -> *const RegexBytes { + let pat = unsafe { slice::from_raw_parts(pattern, length) }; + rure_new_internal(pat) +} + +#[no_mangle] +extern "C" fn rure_consume( + re: *const RegexBytes, + haystack: *const u8, + len: size_t, + match_info: *mut rure_match, +) -> bool { + let exp = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + exp.find(haystack) + .map(|m| unsafe { + if !match_info.is_null() { + (*match_info).start = m.start(); + (*match_info).end = m.end(); + } + }) + .is_some() +} + +#[no_mangle] +extern "C" fn rure_max_submatch(rewrite: *const c_char) -> i32 { + let len = unsafe { CStr::from_ptr(rewrite).to_bytes().len() }; + let pat = rewrite as *const u8; + let text = unsafe { slice::from_raw_parts(pat, len) }; + + rure_max_submatch_internal(text) +} + +#[no_mangle] +extern "C" fn rure_check_rewrite_string(rewrite: *const c_char, cap_num: i32) -> bool { + let len = unsafe { CStr::from_ptr(rewrite).to_bytes().len() }; + let pat = rewrite as *const u8; + let text = unsafe { slice::from_raw_parts(pat, len) }; + + rure_check_rewrite_string_internal(text, cap_num) +} + +#[no_mangle] +extern "C" fn rure_rewrite_str_convert(rewrite: *const u8, length: size_t) -> *const c_char { + let rewrite = unsafe { slice::from_raw_parts(rewrite, length) }; + + rure_rewrite_str_convert_internal(rewrite) +} + +#[no_mangle] +extern "C" fn rure_rewrite( + rewrite: *const u8, + length: size_t, + vecs: *const *const u8, + vecs_lengths: *const size_t, + vecs_count: size_t, +) -> *const c_char { + // 获取rewrite + let rewrite = unsafe { 
slice::from_raw_parts(rewrite, length) }; + let rewrite_str = std::str::from_utf8(rewrite).unwrap(); + + //获取vecs中的内容 + let (raw_vecs, raw_vecsl) = unsafe { + ( + slice::from_raw_parts(vecs, vecs_count), + slice::from_raw_parts(vecs_lengths, vecs_count), + ) + }; + + let mut rure_vecs = Vec::with_capacity(vecs_count); + for (&raw_vec, &raw_vecl) in raw_vecs.iter().zip(raw_vecsl) { + let rure_vec = unsafe { slice::from_raw_parts(raw_vec, raw_vecl) }; + rure_vecs.push(str::from_utf8(rure_vec).unwrap()); + } + + rure_rewrite_internal(rewrite_str, vecs_count, rure_vecs) +} + +#[no_mangle] +extern "C" fn rure_replace_count(re: *const RegexUnicode, haystack: *const c_char) -> size_t { + let len = unsafe { CStr::from_ptr(haystack).to_bytes().len() }; + let hay = haystack as *const u8; + + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(hay, len) }; + rure_replace_count_internal(haystack, re) +} +#[no_mangle] +extern "C" fn rure_filter_compile( + regex_str: *const u8, + regex_len: size_t, + min_atoms_len: size_t, +) -> MyVec { + let r = unsafe { slice::from_raw_parts(regex_str, regex_len) }; + let regex_str = str::from_utf8(r).unwrap(); + let atoms = my_compile(regex_str, min_atoms_len as i32); + atoms +} diff --git a/regex-capi/src/lib_internal.rs b/regex-capi/src/lib_internal.rs index fb331d0e9093bf4ea001ebc4e81a1c8ecfc63ecd..2ad0e03f318892dfb86bf7cb8335d6429088ae7c 100644 --- a/regex-capi/src/lib_internal.rs +++ b/regex-capi/src/lib_internal.rs @@ -14,10 +14,7 @@ ******************************************************************************/ use regex::bytes::RegexBuilder; use regex::bytes::RegexSetBuilder; - fn rure_compile_internal( - pat: &str, - flags: u32, -) -> RegexBuilder { +fn rure_compile_internal(pat: &str, flags: u32) -> RegexBuilder { let mut builder = bytes::RegexBuilder::new(pat); builder.case_insensitive(flags & RURE_FLAG_CASEI > 0); builder.multi_line(flags & RURE_FLAG_MULTI > 0); @@ -28,10 +25,7 @@ use 
regex::bytes::RegexSetBuilder; builder } -fn rure_compile_set_internal( - pats: Vec<&str>, - flags: u32, -) -> RegexSetBuilder { +fn rure_compile_set_internal(pats: Vec<&str>, flags: u32) -> RegexSetBuilder { let mut builder = bytes::RegexSetBuilder::new(pats); builder.case_insensitive(flags & RURE_FLAG_CASEI > 0);