diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..e43b0f988953ae3a84b00331d0ccf5f7d51cb3cf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.DS_Store
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..6a6f414015af8bf5f0131c814a3251aad57061dc
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,377 @@
+# Copyright 2009 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# To build against ICU for full Unicode properties support,
+# uncomment the next two lines:
+# CCICU=$(shell pkg-config icu-uc --cflags) -DRE2_USE_ICU
+# LDICU=$(shell pkg-config icu-uc --libs)
+
+# To build against PCRE for testing or benchmarking,
+# uncomment the next two lines:
+# CCPCRE=-I/usr/local/include -DUSEPCRE
+# LDPCRE=-L/usr/local/lib -lpcre
+
+CXX?=g++
+# can override
+CXXFLAGS?=-O3 -g
+LDFLAGS?=-lrure
+# required (-lrure is repeated here because LDFLAGS may be replaced by the user or the environment)
+RE2_CXXFLAGS?=-std=c++11 -pthread -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCICU) $(CCPCRE)
+RE2_LDFLAGS?=-pthread -lrure $(LDICU) $(LDPCRE)
+AR?=ar
+ARFLAGS?=rsc
+NM?=nm
+NMFLAGS?=-p
+
+# Variables mandated by GNU, the arbiter of all good taste on the internet.
+# http://www.gnu.org/prep/standards/standards.html
+prefix=/usr/local
+exec_prefix=$(prefix)
+includedir=$(prefix)/include
+libdir=$(exec_prefix)/lib
+INSTALL=install
+INSTALL_DATA=$(INSTALL) -m 644
+
+# Work around the weirdness of sed(1) on Darwin. :/
+ifeq ($(shell uname),Darwin)
+SED_INPLACE=sed -i ''
+else ifeq ($(shell uname),SunOS)
+SED_INPLACE=sed -i
+else
+SED_INPLACE=sed -i
+endif
+
+# ABI version
+# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
+SONAME=9
+
+# To rebuild the Tables generated by Perl and Python scripts (requires Internet
+# access for Unicode data), uncomment the following line:
+# REBUILD_TABLES=1
+
+# The SunOS linker does not support wildcards. 
:( +ifeq ($(shell uname),Darwin) +SOEXT=dylib +SOEXTVER=$(SONAME).$(SOEXT) +SOEXTVER00=$(SONAME).0.0.$(SOEXT) +MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-compatibility_version,$(SONAME),-current_version,$(SONAME).0.0,-install_name,$(libdir)/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin $(RE2_LDFLAGS) $(LDFLAGS) +else ifeq ($(shell uname),SunOS) +SOEXT=so +SOEXTVER=$(SOEXT).$(SONAME) +SOEXTVER00=$(SOEXT).$(SONAME).0.0 +MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER) $(RE2_LDFLAGS) $(LDFLAGS) +else +SOEXT=so +SOEXTVER=$(SOEXT).$(SONAME) +SOEXTVER00=$(SOEXT).$(SONAME).0.0 +MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols $(RE2_LDFLAGS) $(LDFLAGS) +endif + +.PHONY: all +all: obj/libre2.a obj/so/libre2.$(SOEXT) + +INSTALL_HFILES=\ + re2/filtered_re2.h\ + re2/re2.h\ + re2/set.h\ + re2/stringpiece.h\ + +HFILES=\ + util/benchmark.h\ + util/flags.h\ + util/logging.h\ + util/malloc_counter.h\ + util/mix.h\ + util/mutex.h\ + util/pcre.h\ + util/strutil.h\ + util/test.h\ + util/utf.h\ + util/util.h\ + re2/filtered_re2.h\ + re2/re2.h\ + re2/set.h\ + re2/stringpiece.h\ + # re2/testing/exhaustive_tester.h\ + # re2/testing/regexp_generator.h\ + # re2/testing/string_generator.h\ + # re2/testing/tester.h\ + +# 仅保留接口stub +OFILES=obj/re2/re2.o\ + obj/re2/stringpiece.o\ + obj/re2/set.o\ + obj/re2/filtered_re2.o\ + + # obj/util/rune.o\ + # obj/util/strutil.o\ + # obj/re2/bitstate.o\ + # obj/re2/compile.o\ + # obj/re2/dfa.o\ + # obj/re2/filtered_re2.o\ + # obj/re2/mimics_pcre.o\ + # obj/re2/nfa.o\ + # obj/re2/onepass.o\ + # obj/re2/parse.o\ + # obj/re2/perl_groups.o\ + # obj/re2/prefilter.o\ + # obj/re2/prefilter_tree.o\ + # obj/re2/prog.o\ + # obj/re2/re2.o\ + # obj/re2/regexp.o\ + # obj/re2/set.o\ + # obj/re2/simplify.o\ + # obj/re2/stringpiece.o\ + # obj/re2/tostring.o\ + # obj/re2/unicode_casefold.o\ + # obj/re2/unicode_groups.o\ + +TESTOFILES=\ + obj/util/pcre.o\ + obj/util/strutil.o\ + + 
#obj/re2/testing/string_generator.o\ + # obj/re2/testing/backtrack.o\ + # obj/re2/testing/dump.o\ + # obj/re2/testing/exhaustive_tester.o\ + # obj/re2/testing/null_walker.o\ + # obj/re2/testing/regexp_generator.o\ + # obj/re2/testing/tester.o\ + +TESTS=\ + obj/test/re2_test\ + obj/test/re2_arg_test\ + + #obj/test/set_test\ + #obj/test/filtered_re2_test\ + + # obj/test/charclass_test\ + # obj/test/compile_test\ + # obj/test/mimics_pcre_test\ + # obj/test/parse_test\ + # obj/test/possible_match_test\ + # obj/test/regexp_test\ + # obj/test/required_prefix_test\ + # obj/test/search_test\ + # obj/test/simplify_test\ + # obj/test/string_generator_test\ + +BIGTESTS=\ + obj/test/dfa_test\ + obj/test/exhaustive1_test\ + obj/test/exhaustive2_test\ + obj/test/exhaustive3_test\ + obj/test/exhaustive_test\ + obj/test/random_test\ + +SOFILES=$(patsubst obj/%,obj/so/%,$(OFILES)) +# We use TESTOFILES for testing the shared lib, only it is built differently. +STESTS=$(patsubst obj/%,obj/so/%,$(TESTS)) +SBIGTESTS=$(patsubst obj/%,obj/so/%,$(BIGTESTS)) + +DOFILES=$(patsubst obj/%,obj/dbg/%,$(OFILES)) +DTESTOFILES=$(patsubst obj/%,obj/dbg/%,$(TESTOFILES)) +DTESTS=$(patsubst obj/%,obj/dbg/%,$(TESTS)) +DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS)) + +.PRECIOUS: obj/%.o +obj/%.o: %.cc $(HFILES) + @mkdir -p $$(dirname $@) + $(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc + +.PRECIOUS: obj/dbg/%.o +obj/dbg/%.o: %.cc $(HFILES) + @mkdir -p $$(dirname $@) + $(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) $*.cc + +.PRECIOUS: obj/so/%.o +obj/so/%.o: %.cc $(HFILES) + @mkdir -p $$(dirname $@) + $(CXX) -c -o $@ -fPIC $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc + +.PRECIOUS: obj/libre2.a +obj/libre2.a: $(OFILES) + @mkdir -p obj + $(AR) $(ARFLAGS) obj/libre2.a $(OFILES) + +.PRECIOUS: obj/dbg/libre2.a +obj/dbg/libre2.a: $(DOFILES) + @mkdir -p obj/dbg + $(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES) + +.PRECIOUS: obj/so/libre2.$(SOEXT) 
+obj/so/libre2.$(SOEXT): $(SOFILES) libre2.symbols libre2.symbols.darwin + @mkdir -p obj/so + $(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES) + ln -sf libre2.$(SOEXTVER) $@ + +.PRECIOUS: obj/dbg/test/% +obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o + @mkdir -p obj/dbg/test + $(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) + +.PRECIOUS: obj/test/% +obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o + @mkdir -p obj/test + $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) + +# Test the shared lib, falling back to the static lib for private symbols +.PRECIOUS: obj/so/test/% +obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o + @mkdir -p obj/so/test + $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o -Lobj/so -lre2 obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) + +# Filter out dump.o because testing::TempDir() isn't available for it. +obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o + @mkdir -p obj/test + $(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(filter-out obj/re2/testing/dump.o, $(TESTOFILES)) obj/util/benchmark.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) + +# re2_fuzzer is a target for fuzzers like libFuzzer and AFL. This fake fuzzing +# is simply a way to check that the target builds and then to run it against a +# fixed set of inputs. To perform real fuzzing, refer to the documentation for +# libFuzzer (llvm.org/docs/LibFuzzer.html) and AFL (lcamtuf.coredump.cx/afl/). 
+obj/test/re2_fuzzer: CXXFLAGS:=-I./re2/fuzzing/compiler-rt/include $(CXXFLAGS) +obj/test/re2_fuzzer: obj/libre2.a obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o + @mkdir -p obj/test + $(CXX) -o $@ obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) + +ifdef REBUILD_TABLES +.PRECIOUS: re2/perl_groups.cc +re2/perl_groups.cc: re2/make_perl_groups.pl + perl $< > $@ + +.PRECIOUS: re2/unicode_%.cc +re2/unicode_%.cc: re2/make_unicode_%.py re2/unicode.py + python3 $< > $@ +endif + +.PHONY: distclean +distclean: clean + rm -f re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc + +.PHONY: clean +clean: + rm -rf obj + rm -f re2/*.pyc + +.PHONY: testofiles +testofiles: $(TESTOFILES) + +.PHONY: test +test: $(DTESTS) $(TESTS) $(STESTS) debug-test static-test shared-test + +.PHONY: debug-test +debug-test: $(DTESTS) + @./runtests $(DTESTS) + +.PHONY: static-test +static-test: $(TESTS) + @./runtests $(TESTS) + +.PHONY: shared-test +shared-test: $(STESTS) + @./runtests -shared-library-path obj/so $(STESTS) + +.PHONY: debug-bigtest +debug-bigtest: $(DTESTS) $(DBIGTESTS) + @./runtests $(DTESTS) $(DBIGTESTS) + +.PHONY: static-bigtest +static-bigtest: $(TESTS) $(BIGTESTS) + @./runtests $(TESTS) $(BIGTESTS) + +.PHONY: shared-bigtest +shared-bigtest: $(STESTS) $(SBIGTESTS) + @./runtests -shared-library-path obj/so $(STESTS) $(SBIGTESTS) + +.PHONY: benchmark +benchmark: obj/test/regexp_benchmark + +.PHONY: fuzz +fuzz: obj/test/re2_fuzzer + +.PHONY: install +install: static-install shared-install + +.PHONY: static +static: obj/libre2.a + +.PHONY: static-install +static-install: obj/libre2.a common-install + $(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a + +.PHONY: shared +shared: obj/so/libre2.$(SOEXT) + +.PHONY: shared-install +shared-install: obj/so/libre2.$(SOEXT) common-install + $(INSTALL) obj/so/libre2.$(SOEXT) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER00) + ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER) + ln -sf 
libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXT) + +.PHONY: common-install +common-install: + mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig + $(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2 + $(INSTALL_DATA) re2.pc $(DESTDIR)$(libdir)/pkgconfig/re2.pc + $(SED_INPLACE) -e "s#@includedir@#$(includedir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc + $(SED_INPLACE) -e "s#@libdir@#$(libdir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc + +.PHONY: testinstall +testinstall: static-testinstall shared-testinstall + @echo + @echo Install tests passed. + @echo + +.PHONY: static-testinstall +static-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS) +static-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -l:libre2.a $(LDICU) $(LDFLAGS) +static-testinstall: + @mkdir -p obj + @cp testinstall.cc obj +ifeq ($(shell uname),Darwin) + @echo Skipping test for libre2.a on Darwin. +else ifeq ($(shell uname),SunOS) + @echo Skipping test for libre2.a on SunOS. 
+else + (cd obj && $(CXX) testinstall.cc -o testinstall $(CXXFLAGS) $(LDFLAGS)) + obj/testinstall +endif + +.PHONY: shared-testinstall +shared-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS) +shared-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -lre2 $(LDICU) $(LDFLAGS) +shared-testinstall: + @mkdir -p obj + @cp testinstall.cc obj + (cd obj && $(CXX) testinstall.cc -o testinstall $(CXXFLAGS) $(LDFLAGS)) +ifeq ($(shell uname),Darwin) + DYLD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(DYLD_LIBRARY_PATH)" obj/testinstall +else + LD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(LD_LIBRARY_PATH)" obj/testinstall +endif + +.PHONY: benchlog +benchlog: obj/test/regexp_benchmark + (echo '==BENCHMARK==' `hostname` `date`; \ + (uname -a; $(CXX) --version; git rev-parse --short HEAD; file obj/test/regexp_benchmark) | sed 's/^/# /'; \ + echo; \ + ./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//') + +.PHONY: log +log: + $(MAKE) clean + $(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" \ + $(filter obj/test/exhaustive%_test,$(BIGTESTS)) + echo '#' RE2 exhaustive tests built by make log >re2-exhaustive.txt + echo '#' $$(date) >>re2-exhaustive.txt + obj/test/exhaustive_test |grep -v '^PASS$$' >>re2-exhaustive.txt + obj/test/exhaustive1_test |grep -v '^PASS$$' >>re2-exhaustive.txt + obj/test/exhaustive2_test |grep -v '^PASS$$' >>re2-exhaustive.txt + obj/test/exhaustive3_test |grep -v '^PASS$$' >>re2-exhaustive.txt + + $(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/search_test + echo '#' RE2 basic search tests built by make $@ >re2-search.txt + echo '#' $$(date) >>re2-search.txt + obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt diff --git a/README.en.md b/README.en.md deleted file mode 100644 index 73d7961c0f9208a2c939b3dd6684c5c392bac0e3..0000000000000000000000000000000000000000 --- a/README.en.md +++ /dev/null @@ -1,36 +0,0 @@ -# re2-rust - -#### Description -a compatible RE2 API by calling Rust library regex(rure) - 
-#### Software Architecture -Software architecture description - -#### Installation - -1. xxxx -2. xxxx -3. xxxx - -#### Instructions - -1. xxxx -2. xxxx -3. xxxx - -#### Contribution - -1. Fork the repository -2. Create Feat_xxx branch -3. Commit your code -4. Create Pull Request - - -#### Gitee Feature - -1. You can use Readme\_XXX.md to support different languages, such as Readme\_en.md, Readme\_zh.md -2. Gitee blog [blog.gitee.com](https://blog.gitee.com) -3. Explore open source project [https://gitee.com/explore](https://gitee.com/explore) -4. The most valuable open source project [GVP](https://gitee.com/gvp) -5. The manual of Gitee [https://gitee.com/help](https://gitee.com/help) -6. The most popular members [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/) diff --git a/README.md b/README.md index a459819322a568889d1a57acd7c04d73d2269189..b14f434d9bf05142a3276506d7402a6e962631c9 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,55 @@ # re2-rust -#### 介绍 -a compatible RE2 API by calling Rust library regex(rure) +a compatible RE2 API( +2021-11-01) by calling Rust library [regex(rure)](https://github.com/rust-lang/regex) -#### 软件架构 -软件架构说明 +``` Shell -#### 安装教程 +$ git clone https://gitee.com/openeuler/re2-rust.git +$ cd re2-rust +``` -1. xxxx -2. xxxx -3. xxxx -#### 使用说明 +### 安装rure库 +安装过程如下: +``` Shell +$ git clone https://github.com/rust-lang/regex +$ cd regex/regex-capi +$ cargo build --verbose +``` +对于编译完成的`librure.a`和`librure.so`文件需要进行手工安装 +``` Shell +# put the librure.a and librure.so into the /usr/lib -1. xxxx -2. xxxx -3. xxxx +$ sudo cp regex/target/debug/librure.a /usr/lib +$ sudo cp regex/target/debug/librure.so /usr/lib +``` +手工安装rure.h文件 +``` Shell +# copy the rure.h +$ sudo cp regex/regex-capi/include/rure.h /usr/include +``` -#### 参与贡献 +使用rure库: +使用regex/regex-capi/ctest/目录下的 test.c文件进行测试 +``` Shell +$ gcc test.c -o test -lrure +$ ./test +``` -1. Fork 本仓库 -2. 新建 Feat_xxx 分支 -3. 提交代码 -4. 
新建 Pull Request +### 编译、安装re2-rust + +``` Shell +$ make +$ sudo make install +``` + +# Links + +* https://github.com/rust-lang/regex +* https://gitee.com/src-openeuler/re2 +* https://github.com/google/re2 -#### 特技 -1. 使用 Readme\_XXX.md 来支持不同的语言,例如 Readme\_en.md, Readme\_zh.md -2. Gitee 官方博客 [blog.gitee.com](https://blog.gitee.com) -3. 你可以 [https://gitee.com/explore](https://gitee.com/explore) 这个地址来了解 Gitee 上的优秀开源项目 -4. [GVP](https://gitee.com/gvp) 全称是 Gitee 最有价值开源项目,是综合评定出的优秀开源项目 -5. Gitee 官方提供的使用手册 [https://gitee.com/help](https://gitee.com/help) -6. Gitee 封面人物是一档用来展示 Gitee 会员风采的栏目 [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/) diff --git a/libre2.symbols b/libre2.symbols new file mode 100644 index 0000000000000000000000000000000000000000..93b71b486233e22a440feeae7175b784cf06f151 --- /dev/null +++ b/libre2.symbols @@ -0,0 +1,19 @@ +{ + global: + # re2::RE2* + _ZN3re23RE2*; + _ZNK3re23RE2*; + # re2::StringPiece* + _ZN3re211StringPiece*; + _ZNK3re211StringPiece*; + # re2::operator<<* + _ZN3re2ls*; + # re2::FilteredRE2* + _ZN3re211FilteredRE2*; + _ZNK3re211FilteredRE2*; + # re2::re2_internal* + _ZN3re212re2_internal*; + _ZNK3re212re2_internal*; + local: + *; +}; diff --git a/libre2.symbols.darwin b/libre2.symbols.darwin new file mode 100644 index 0000000000000000000000000000000000000000..41ac96f93b10cafc08f091dbc0eee6191566775a --- /dev/null +++ b/libre2.symbols.darwin @@ -0,0 +1,15 @@ +# Linker doesn't like these unmangled: +# re2::RE2* +__ZN3re23RE2* +__ZNK3re23RE2* +# re2::StringPiece* +__ZN3re211StringPiece* +__ZNK3re211StringPiece* +# re2::operator<<* +__ZN3re2ls* +# re2::FilteredRE2* +__ZN3re211FilteredRE2* +__ZNK3re211FilteredRE2* +# re2::re2_internal* +__ZN3re212re2_internal* +__ZNK3re212re2_internal* diff --git a/re2.pc b/re2.pc new file mode 100644 index 0000000000000000000000000000000000000000..50fd637d4eebe977a607ba446000f6aaaf427365 --- /dev/null +++ b/re2.pc @@ -0,0 +1,8 @@ +includedir=@includedir@ +libdir=@libdir@ + +Name: re2 
+Description: RE2 is a fast, safe, thread-friendly regular expression engine.
+Version: 0.0.0
+Cflags: -std=c++11 -pthread -I${includedir}
+Libs: -pthread -L${libdir} -lre2
diff --git a/re2/filtered_re2.cc b/re2/filtered_re2.cc
new file mode 100644
index 0000000000000000000000000000000000000000..01b9e49ad450735fc06c1e2ebd2d05320863d275
--- /dev/null
+++ b/re2/filtered_re2.cc
@@ -0,0 +1,148 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re2/filtered_re2.h"
+
+#include <stddef.h>
+#include <string>
+#include <utility>
+
+#include "util/util.h"
+#include "util/logging.h"
+
+namespace re2 {
+
+// Empty stand-ins for the classes in re2/prefilter.h and
+// re2/prefilter_tree.h, which are not built in this stub-only tree.
+class Prefilter {};
+class PrefilterTree {
+ public:
+  PrefilterTree() {}
+  explicit PrefilterTree(int /*min_atom_len*/) {}
+};
+
+FilteredRE2::FilteredRE2()
+    : compiled_(false),
+      prefilter_tree_(new PrefilterTree()) {
+}
+
+FilteredRE2::FilteredRE2(int min_atom_len)
+    : compiled_(false),
+      prefilter_tree_(new PrefilterTree(min_atom_len)) {
+}
+
+FilteredRE2::~FilteredRE2() {
+  for (size_t i = 0; i < re2_vec_.size(); i++)
+    delete re2_vec_[i];
+}
+
+FilteredRE2::FilteredRE2(FilteredRE2&& other)
+    : re2_vec_(std::move(other.re2_vec_)),
+      compiled_(other.compiled_),
+      prefilter_tree_(std::move(other.prefilter_tree_)) {
+  other.re2_vec_.clear();
+  other.re2_vec_.shrink_to_fit();
+  other.compiled_ = false;
+  other.prefilter_tree_.reset(new PrefilterTree());
+}
+
+FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) {
+  // Self-move: the destroy-then-rebuild below would move from a dead object.
+  if (this == &other) return *this;
+  this->~FilteredRE2();
+  (void) new (this) FilteredRE2(std::move(other));
+  return *this;
+}
+
+RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
+                                const RE2::Options& options, int* id) {
+  RE2* re = new RE2(pattern, options);
+  RE2::ErrorCode code = re->error_code();
+
+  if (!re->ok()) {
+    if (options.log_errors()) {
+      LOG(ERROR) << "Couldn't compile 
regular expression, skipping: "
+                 << pattern << " due to error " << re->error();
+    }
+    delete re;
+  } else {
+    *id = static_cast<int>(re2_vec_.size());
+    re2_vec_.push_back(re);
+  }
+
+  return code;
+}
+
+void FilteredRE2::Compile(std::vector<std::string>* atoms) {
+  if (compiled_) {
+    LOG(ERROR) << "Compile called already.";
+    return;
+  }
+
+  if (re2_vec_.empty()) {
+    LOG(ERROR) << "Compile called before Add.";
+    return;
+  }
+
+  // The prefilter tree is stubbed out in this build, so there are no atoms
+  // for the caller to search for: return an empty set, not stale contents.
+  atoms->clear();
+  compiled_ = true;
+}
+
+int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
+  for (size_t i = 0; i < re2_vec_.size(); i++)
+    if (RE2::PartialMatch(text, *re2_vec_[i]))
+      return static_cast<int>(i);
+  return -1;
+}
+
+int FilteredRE2::FirstMatch(const StringPiece& text,
+                            const std::vector<int>& atoms) const {
+  if (!compiled_) {
+    LOG(DFATAL) << "FirstMatch called before Compile.";
+    return -1;
+  }
+  // The prefilter tree is stubbed out in this build, so `atoms` cannot narrow
+  // the candidate set. Treat every regexp as having passed the filter and
+  // fall back to the unfiltered scan, which returns the first matching index
+  // or -1 when nothing matches (the stub used to return 0 unconditionally,
+  // i.e. it reported regexp 0 as a match even for an empty set).
+  (void)atoms;
+  return SlowFirstMatch(text);
+}
+
+bool FilteredRE2::AllMatches(
+    const StringPiece& text,
+    const std::vector<int>& atoms,
+    std::vector<int>* matching_regexps) const {
+  // No prefilter is available in this build, so `atoms` cannot narrow the
+  // candidate set; every regexp is tested directly against the text. The
+  // output is cleared first so the return value reflects this call only.
+  matching_regexps->clear();
+  for (size_t i = 0; i < re2_vec_.size(); i++)
+    if (RE2::PartialMatch(text, *re2_vec_[i]))
+      matching_regexps->push_back(static_cast<int>(i));
+  return !matching_regexps->empty();
+}
+
+void FilteredRE2::AllPotentials(
+    const std::vector<int>& atoms,
+    std::vector<int>* potential_regexps) const {
+  // Without a prefilter nothing can be ruled out, so every regexp is reported.
+  potential_regexps->clear();
+  for (size_t i = 0; i < re2_vec_.size(); i++)
+    potential_regexps->push_back(static_cast<int>(i));
+}
+
+void FilteredRE2::RegexpsGivenStrings(const 
std::vector& matched_atoms, + std::vector* passed_regexps) { + // prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps); +} + +void FilteredRE2::PrintPrefilter(int regexpid) { + // prefilter_tree_->PrintPrefilter(regexpid); +} + +} // namespace re2 diff --git a/re2/filtered_re2.h b/re2/filtered_re2.h new file mode 100644 index 0000000000000000000000000000000000000000..dd618c70e8bfee9cfc8e5118868f5f0a3cd298ee --- /dev/null +++ b/re2/filtered_re2.h @@ -0,0 +1,114 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef RE2_FILTERED_RE2_H_ +#define RE2_FILTERED_RE2_H_ + +// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps. +// It provides a prefilter mechanism that helps in cutting down the +// number of regexps that need to be actually searched. +// +// By design, it does not include a string matching engine. This is to +// allow the user of the class to use their favorite string matching +// engine. The overall flow is: Add all the regexps using Add, then +// Compile the FilteredRE2. Compile returns strings that need to be +// matched. Note that the returned strings are lowercased and distinct. +// For applying regexps to a search text, the caller does the string +// matching using the returned strings. When doing the string match, +// note that the caller has to do that in a case-insensitive way or +// on a lowercased version of the search text. Then call FirstMatch +// or AllMatches with a vector of indices of strings that were found +// in the text to get the actual regexp matches. + +#include +#include +#include + +#include "re2/re2.h" + +namespace re2 { + +class PrefilterTree; + +class FilteredRE2 { + public: + FilteredRE2(); + explicit FilteredRE2(int min_atom_len); + ~FilteredRE2(); + + // Not copyable. 
+ FilteredRE2(const FilteredRE2&) = delete; + FilteredRE2& operator=(const FilteredRE2&) = delete; + // Movable. + FilteredRE2(FilteredRE2&& other); + FilteredRE2& operator=(FilteredRE2&& other); + + // Uses RE2 constructor to create a RE2 object (re). Returns + // re->error_code(). If error_code is other than NoError, then re is + // deleted and not added to re2_vec_. + RE2::ErrorCode Add(const StringPiece& pattern, + const RE2::Options& options, + int* id); + + // Prepares the regexps added by Add for filtering. Returns a set + // of strings that the caller should check for in candidate texts. + // The returned strings are lowercased and distinct. When doing + // string matching, it should be performed in a case-insensitive + // way or the search text should be lowercased first. Call after + // all Add calls are done. + void Compile(std::vector* strings_to_match); + + // Returns the index of the first matching regexp. + // Returns -1 on no match. Can be called prior to Compile. + // Does not do any filtering: simply tries to Match the + // regexps in a loop. + int SlowFirstMatch(const StringPiece& text) const; + + // Returns the index of the first matching regexp. + // Returns -1 on no match. Compile has to be called before + // calling this. + int FirstMatch(const StringPiece& text, + const std::vector& atoms) const; + + // Returns the indices of all matching regexps, after first clearing + // matched_regexps. + bool AllMatches(const StringPiece& text, + const std::vector& atoms, + std::vector* matching_regexps) const; + + // Returns the indices of all potentially matching regexps after first + // clearing potential_regexps. + // A regexp is potentially matching if it passes the filter. + // If a regexp passes the filter it may still not match. + // A regexp that does not pass the filter is guaranteed to not match. + void AllPotentials(const std::vector& atoms, + std::vector* potential_regexps) const; + + // The number of regexps added. 
+ int NumRegexps() const { return static_cast(re2_vec_.size()); } + + // Get the individual RE2 objects. + const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; } + + private: + // Print prefilter. + void PrintPrefilter(int regexpid); + + // Useful for testing and debugging. + void RegexpsGivenStrings(const std::vector& matched_atoms, + std::vector* passed_regexps); + + // All the regexps in the FilteredRE2. + std::vector re2_vec_; + + // Has the FilteredRE2 been compiled using Compile() + bool compiled_; + + // An AND-OR tree of string atoms used for filtering regexps. + std::unique_ptr prefilter_tree_; +}; + +} // namespace re2 + +#endif // RE2_FILTERED_RE2_H_ diff --git a/re2/fuzzing/compiler-rt/LICENSE b/re2/fuzzing/compiler-rt/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f9dc50615d7ec2b9913dc434fb243fc30889d2a9 --- /dev/null +++ b/re2/fuzzing/compiler-rt/LICENSE @@ -0,0 +1,219 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +--- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. 
+ diff --git a/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h b/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h new file mode 100644 index 0000000000000000000000000000000000000000..3e069eba69b46229aa765d36db84197698a5b42a --- /dev/null +++ b/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h @@ -0,0 +1,305 @@ +//===- FuzzedDataProvider.h - Utility header for fuzz targets ---*- C++ -* ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// A single header library providing an utility class to break up an array of +// bytes. Whenever run on the same input, provides the same output, as long as +// its methods are called in the same order, with the same arguments. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ +#define LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// In addition to the comments below, the API is also briefly documented at +// https://github.com/google/fuzzing/blob/master/docs/split-inputs.md#fuzzed-data-provider +class FuzzedDataProvider { + public: + // |data| is an array of length |size| that the FuzzedDataProvider wraps to + // provide more granular access. |data| must outlive the FuzzedDataProvider. + FuzzedDataProvider(const uint8_t *data, size_t size) + : data_ptr_(data), remaining_bytes_(size) {} + ~FuzzedDataProvider() = default; + + // Returns a std::vector containing |num_bytes| of input data. If fewer than + // |num_bytes| of data remain, returns a shorter std::vector containing all + // of the data that's left. 
Can be used with any byte sized type, such as + // char, unsigned char, uint8_t, etc. + template std::vector ConsumeBytes(size_t num_bytes) { + num_bytes = std::min(num_bytes, remaining_bytes_); + return ConsumeBytes(num_bytes, num_bytes); + } + + // Similar to |ConsumeBytes|, but also appends the terminator value at the end + // of the resulting vector. Useful, when a mutable null-terminated C-string is + // needed, for example. But that is a rare case. Better avoid it, if possible, + // and prefer using |ConsumeBytes| or |ConsumeBytesAsString| methods. + template + std::vector ConsumeBytesWithTerminator(size_t num_bytes, + T terminator = 0) { + num_bytes = std::min(num_bytes, remaining_bytes_); + std::vector result = ConsumeBytes(num_bytes + 1, num_bytes); + result.back() = terminator; + return result; + } + + // Returns a std::string containing |num_bytes| of input data. Using this and + // |.c_str()| on the resulting string is the best way to get an immutable + // null-terminated C string. If fewer than |num_bytes| of data remain, returns + // a shorter std::string containing all of the data that's left. + std::string ConsumeBytesAsString(size_t num_bytes) { + static_assert(sizeof(std::string::value_type) == sizeof(uint8_t), + "ConsumeBytesAsString cannot convert the data to a string."); + + num_bytes = std::min(num_bytes, remaining_bytes_); + std::string result( + reinterpret_cast(data_ptr_), + num_bytes); + Advance(num_bytes); + return result; + } + + // Returns a number in the range [min, max] by consuming bytes from the + // input data. The value might not be uniformly distributed in the given + // range. If there's no input data left, always returns |min|. |min| must + // be less than or equal to |max|. 
+ template T ConsumeIntegralInRange(T min, T max) { + static_assert(std::is_integral::value, "An integral type is required."); + static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type."); + + if (min > max) + abort(); + + // Use the biggest type possible to hold the range and the result. + uint64_t range = static_cast(max) - min; + uint64_t result = 0; + size_t offset = 0; + + while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 && + remaining_bytes_ != 0) { + // Pull bytes off the end of the seed data. Experimentally, this seems to + // allow the fuzzer to more easily explore the input space. This makes + // sense, since it works by modifying inputs that caused new code to run, + // and this data is often used to encode length of data read by + // |ConsumeBytes|. Separating out read lengths makes it easier modify the + // contents of the data that is actually read. + --remaining_bytes_; + result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_]; + offset += CHAR_BIT; + } + + // Avoid division by 0, in case |range + 1| results in overflow. + if (range != std::numeric_limits::max()) + result = result % (range + 1); + + return static_cast(min + result); + } + + // Returns a std::string of length from 0 to |max_length|. When it runs out of + // input data, returns what remains of the input. Designed to be more stable + // with respect to a fuzzer inserting characters than just picking a random + // length and then consuming that many bytes with |ConsumeBytes|. + std::string ConsumeRandomLengthString(size_t max_length) { + // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\" + // followed by anything else to the end of the string. As a result of this + // logic, a fuzzer can insert characters into the string, and the string + // will be lengthened to include those new characters, resulting in a more + // stable fuzzer than picking the length of a string independently from + // picking its contents. 
+ std::string result; + + // Reserve the anticipated capaticity to prevent several reallocations. + result.reserve(std::min(max_length, remaining_bytes_)); + for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) { + char next = ConvertUnsignedToSigned(data_ptr_[0]); + Advance(1); + if (next == '\\' && remaining_bytes_ != 0) { + next = ConvertUnsignedToSigned(data_ptr_[0]); + Advance(1); + if (next != '\\') + break; + } + result += next; + } + + result.shrink_to_fit(); + return result; + } + + // Returns a std::vector containing all remaining bytes of the input data. + template std::vector ConsumeRemainingBytes() { + return ConsumeBytes(remaining_bytes_); + } + + // Returns a std::string containing all remaining bytes of the input data. + // Prefer using |ConsumeRemainingBytes| unless you actually need a std::string + // object. + std::string ConsumeRemainingBytesAsString() { + return ConsumeBytesAsString(remaining_bytes_); + } + + // Returns a number in the range [Type's min, Type's max]. The value might + // not be uniformly distributed in the given range. If there's no input data + // left, always returns |min|. + template T ConsumeIntegral() { + return ConsumeIntegralInRange(std::numeric_limits::min(), + std::numeric_limits::max()); + } + + // Reads one byte and returns a bool, or false when no data remains. + bool ConsumeBool() { return 1 & ConsumeIntegral(); } + + // Returns a copy of the value selected from the given fixed-size |array|. + template + T PickValueInArray(const T (&array)[size]) { + static_assert(size > 0, "The array must be non empty."); + return array[ConsumeIntegralInRange(0, size - 1)]; + } + + template + T PickValueInArray(std::initializer_list list) { + // TODO(Dor1s): switch to static_assert once C++14 is allowed. + if (!list.size()) + abort(); + + return *(list.begin() + ConsumeIntegralInRange(0, list.size() - 1)); + } + + // Returns an enum value. The enum must start at 0 and be contiguous. 
It must + // also contain |kMaxValue| aliased to its largest (inclusive) value. Such as: + // enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue }; + template T ConsumeEnum() { + static_assert(std::is_enum::value, "|T| must be an enum type."); + return static_cast(ConsumeIntegralInRange( + 0, static_cast(T::kMaxValue))); + } + + // Returns a floating point number in the range [0.0, 1.0]. If there's no + // input data left, always returns 0. + template T ConsumeProbability() { + static_assert(std::is_floating_point::value, + "A floating point type is required."); + + // Use different integral types for different floating point types in order + // to provide better density of the resulting values. + using IntegralType = + typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), uint32_t, + uint64_t>::type; + + T result = static_cast(ConsumeIntegral()); + result /= static_cast(std::numeric_limits::max()); + return result; + } + + // Returns a floating point value in the range [Type's lowest, Type's max] by + // consuming bytes from the input data. If there's no input data left, always + // returns approximately 0. + template T ConsumeFloatingPoint() { + return ConsumeFloatingPointInRange(std::numeric_limits::lowest(), + std::numeric_limits::max()); + } + + // Returns a floating point value in the given range by consuming bytes from + // the input data. If there's no input data left, returns |min|. Note that + // |min| must be less than or equal to |max|. + template T ConsumeFloatingPointInRange(T min, T max) { + if (min > max) + abort(); + + T range = .0; + T result = min; + constexpr T zero(.0); + if (max > zero && min < zero && max > min + std::numeric_limits::max()) { + // The diff |max - min| would overflow the given floating point type. Use + // the half of the diff as the range and consume a bool to decide whether + // the result is in the first of the second part of the diff. 
+ range = (max / 2.0) - (min / 2.0); + if (ConsumeBool()) { + result += range; + } + } else { + range = max - min; + } + + return result + range * ConsumeProbability(); + } + + // Reports the remaining bytes available for fuzzed input. + size_t remaining_bytes() { return remaining_bytes_; } + + private: + FuzzedDataProvider(const FuzzedDataProvider &) = delete; + FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete; + + void Advance(size_t num_bytes) { + if (num_bytes > remaining_bytes_) + abort(); + + data_ptr_ += num_bytes; + remaining_bytes_ -= num_bytes; + } + + template + std::vector ConsumeBytes(size_t size, size_t num_bytes_to_consume) { + static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type."); + + // The point of using the size-based constructor below is to increase the + // odds of having a vector object with capacity being equal to the length. + // That part is always implementation specific, but at least both libc++ and + // libstdc++ allocate the requested number of bytes in that constructor, + // which seems to be a natural choice for other implementations as well. + // To increase the odds even more, we also call |shrink_to_fit| below. + std::vector result(size); + if (size == 0) { + if (num_bytes_to_consume != 0) + abort(); + return result; + } + + std::memcpy(result.data(), data_ptr_, num_bytes_to_consume); + Advance(num_bytes_to_consume); + + // Even though |shrink_to_fit| is also implementation specific, we expect it + // to provide an additional assurance in case vector's constructor allocated + // a buffer which is larger than the actual amount of data we put inside it. + result.shrink_to_fit(); + return result; + } + + template TS ConvertUnsignedToSigned(TU value) { + static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types."); + static_assert(!std::numeric_limits::is_signed, + "Source type must be unsigned."); + + // TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream. 
+ if (std::numeric_limits::is_modulo) + return static_cast(value); + + // Avoid using implementation-defined unsigned to signer conversions. + // To learn more, see https://stackoverflow.com/questions/13150449. + if (value <= std::numeric_limits::max()) { + return static_cast(value); + } else { + constexpr auto TS_min = std::numeric_limits::min(); + return TS_min + static_cast(value - TS_min); + } + } + + const uint8_t *data_ptr_; + size_t remaining_bytes_; +}; + +#endif // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc new file mode 100644 index 0000000000000000000000000000000000000000..3082a769252153f4f48622f24cdda838f0ab17e7 --- /dev/null +++ b/re2/fuzzing/re2_fuzzer.cc @@ -0,0 +1,247 @@ +// Copyright 2016 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include +#include +#include +#include + +#include "re2/re2.h" +#include "re2/regexp.h" +#include "re2/walker-inl.h" + +using re2::StringPiece; + +// NOT static, NOT signed. +uint8_t dummy = 0; + +// Walks kRegexpConcat and kRegexpAlternate subexpressions +// to determine their maximum length. +class SubexpressionWalker : public re2::Regexp::Walker { + public: + SubexpressionWalker() = default; + ~SubexpressionWalker() override = default; + + int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg, + int* child_args, int nchild_args) override { + switch (re->op()) { + case re2::kRegexpConcat: + case re2::kRegexpAlternate: { + int max = nchild_args; + for (int i = 0; i < nchild_args; i++) + max = std::max(max, child_args[i]); + return max; + } + + default: + break; + } + return -1; + } + + // Should never be called: we use Walk(), not WalkExponential(). 
+ int ShortVisit(re2::Regexp* re, int parent_arg) override { + return parent_arg; + } + + private: + SubexpressionWalker(const SubexpressionWalker&) = delete; + SubexpressionWalker& operator=(const SubexpressionWalker&) = delete; +}; + +// Walks substrings (i.e. kRegexpLiteralString subexpressions) +// to determine their maximum length... in runes, but avoiding +// overheads due to UTF-8 encoding is worthwhile when fuzzing. +class SubstringWalker : public re2::Regexp::Walker { + public: + SubstringWalker() = default; + ~SubstringWalker() override = default; + + int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg, + int* child_args, int nchild_args) override { + switch (re->op()) { + case re2::kRegexpConcat: + case re2::kRegexpAlternate: + case re2::kRegexpStar: + case re2::kRegexpPlus: + case re2::kRegexpQuest: + case re2::kRegexpRepeat: + case re2::kRegexpCapture: { + int max = -1; + for (int i = 0; i < nchild_args; i++) + max = std::max(max, child_args[i]); + return max; + } + + case re2::kRegexpLiteralString: + return re->nrunes(); + + default: + break; + } + return -1; + } + + // Should never be called: we use Walk(), not WalkExponential(). + int ShortVisit(re2::Regexp* re, int parent_arg) override { + return parent_arg; + } + + private: + SubstringWalker(const SubstringWalker&) = delete; + SubstringWalker& operator=(const SubstringWalker&) = delete; +}; + +void TestOneInput(StringPiece pattern, const RE2::Options& options, + StringPiece text) { + // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W. + // Otherwise, we will waste time on inputs that have long runs of various + // character classes. The fuzzer has shown itself to be easily capable of + // generating such patterns that fall within the other limits, but result + // in timeouts nonetheless. The marginal cost is high - even more so when + // counted repetition is involved - whereas the marginal benefit is zero. 
+ // Crudely limit the use of 'k', 'K', 's' and 'S' too because they become + // three-element character classes when case-insensitive and using UTF-8. + // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain. + int char_class = 0; + int backslash_p = 0; // very expensive, so handle specially + for (size_t i = 0; i < pattern.size(); i++) { + if (pattern[i] == '.' || + pattern[i] == 'k' || pattern[i] == 'K' || + pattern[i] == 's' || pattern[i] == 'S') + char_class++; + if (pattern[i] != '\\') + continue; + i++; + if (i >= pattern.size()) + break; + if (pattern[i] == 'p' || pattern[i] == 'P' || + pattern[i] == 'd' || pattern[i] == 'D' || + pattern[i] == 's' || pattern[i] == 'S' || + pattern[i] == 'w' || pattern[i] == 'W') + char_class++; + if (pattern[i] == 'p' || pattern[i] == 'P') + backslash_p++; + } + if (char_class > 9) + return; + if (backslash_p > 1) + return; + + // The default is 1000. Even 100 turned out to be too generous + // for fuzzing, empirically speaking, so let's try 10 instead. + re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10); + + RE2 re(pattern, options); + if (!re.ok()) + return; + + // Don't waste time fuzzing programs with large subexpressions. + // They can cause bug reports due to fuzzer timeouts. And they + // aren't interesting for fuzzing purposes. + if (SubexpressionWalker().Walk(re.Regexp(), -1) > 9) + return; + + // Don't waste time fuzzing programs with large substrings. + // They can cause bug reports due to fuzzer timeouts when they + // are repetitions (e.g. hundreds of NUL bytes) and matching is + // unanchored. And they aren't interesting for fuzzing purposes. + if (SubstringWalker().Walk(re.Regexp(), -1) > 9) + return; + + // Don't waste time fuzzing high-size programs. + // They can cause bug reports due to fuzzer timeouts. + int size = re.ProgramSize(); + if (size > 9999) + return; + int rsize = re.ReverseProgramSize(); + if (rsize > 9999) + return; + + // Don't waste time fuzzing high-fanout programs. 
+ // They can cause bug reports due to fuzzer timeouts. + std::vector histogram; + int fanout = re.ProgramFanout(&histogram); + if (fanout > 9) + return; + int rfanout = re.ReverseProgramFanout(&histogram); + if (rfanout > 9) + return; + + if (re.NumberOfCapturingGroups() == 0) { + // Avoid early return due to too many arguments. + StringPiece sp = text; + RE2::FullMatch(sp, re); + RE2::PartialMatch(sp, re); + RE2::Consume(&sp, re); + sp = text; // Reset. + RE2::FindAndConsume(&sp, re); + } else { + // Okay, we have at least one capturing group... + // Try conversion for variously typed arguments. + StringPiece sp = text; + short s; + RE2::FullMatch(sp, re, &s); + long l; + RE2::PartialMatch(sp, re, &l); + float f; + RE2::Consume(&sp, re, &f); + sp = text; // Reset. + double d; + RE2::FindAndConsume(&sp, re, &d); + } + + std::string s = std::string(text); + RE2::Replace(&s, re, ""); + s = std::string(text); // Reset. + RE2::GlobalReplace(&s, re, ""); + + std::string min, max; + re.PossibleMatchRange(&min, &max, /*maxlen=*/9); + + // Exercise some other API functionality. + dummy += re.NamedCapturingGroups().size(); + dummy += re.CapturingGroupNames().size(); + dummy += RE2::QuoteMeta(pattern).size(); +} + +// Entry point for libFuzzer. +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + // An input larger than 4 KiB probably isn't interesting. (This limit + // allows for fdp.ConsumeRandomLengthString()'s backslash behaviour.) + if (size == 0 || size > 4096) + return 0; + + FuzzedDataProvider fdp(data, size); + + // The convention here is that fdp.ConsumeBool() returning false sets + // the default value whereas returning true sets the alternate value: + // most options default to false and so can be set directly; encoding + // defaults to UTF-8; case_sensitive defaults to true. We do NOT want + // to log errors. max_mem is 64 MiB because we can afford to use more + // RAM in exchange for (hopefully) faster fuzzing. 
+ RE2::Options options; + options.set_encoding(fdp.ConsumeBool() ? RE2::Options::EncodingLatin1 + : RE2::Options::EncodingUTF8); + options.set_posix_syntax(fdp.ConsumeBool()); + options.set_longest_match(fdp.ConsumeBool()); + options.set_log_errors(false); + options.set_max_mem(64 << 20); + options.set_literal(fdp.ConsumeBool()); + options.set_never_nl(fdp.ConsumeBool()); + options.set_dot_nl(fdp.ConsumeBool()); + options.set_never_capture(fdp.ConsumeBool()); + options.set_case_sensitive(!fdp.ConsumeBool()); + options.set_perl_classes(fdp.ConsumeBool()); + options.set_word_boundary(fdp.ConsumeBool()); + options.set_one_line(fdp.ConsumeBool()); + + std::string pattern = fdp.ConsumeRandomLengthString(999); + std::string text = fdp.ConsumeRandomLengthString(999); + + TestOneInput(pattern, options, text); + return 0; +} diff --git a/re2/re2.cc b/re2/re2.cc new file mode 100644 index 0000000000000000000000000000000000000000..73231287aa2b8e34a4022bed274fee4111c7d66f --- /dev/null +++ b/re2/re2.cc @@ -0,0 +1,1335 @@ +// Copyright 2003-2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression interface RE2. +// +// Originally the PCRE C++ wrapper, but adapted to use +// the new automata-based regular expression engines. 
+ +#include "re2/re2.h" +#include +#include +#include +#include +#ifdef _MSC_VER +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/util.h" +#include "util/logging.h" +#include "util/strutil.h" +#include "util/utf.h" +// #include "re2/sparse_array.h" +// #include "re2/prog.h" +// #include "re2/regexp.h" +#include "regex_internal.h" + +using namespace std; + +extern "C" +{ +#include +} + + +namespace re2 { +// Maximum number of args we can set +static const int kMaxArgs = 16; +static const int kVecSize = 1+kMaxArgs; + +const int RE2::Options::kDefaultMaxMem; // initialized in re2.h + +RE2::Options::Options(RE2::CannedOptions opt) + : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), + posix_syntax_(opt == RE2::POSIX), + longest_match_(opt == RE2::POSIX), + log_errors_(opt != RE2::Quiet), + max_mem_(kDefaultMaxMem), + literal_(false), + never_nl_(false), + dot_nl_(false), + never_capture_(false), + case_sensitive_(true), + perl_classes_(false), + word_boundary_(false), + one_line_(false) { +} + +// static empty objects for use as const references. +// To avoid global constructors, allocated in RE2::Init(). 
+static const std::string* empty_string;
+static const std::map<std::string, int>* empty_named_groups;
+static const std::map<int, std::string>* empty_group_names;
+
+// Every constructor forwards to Init(); the single-argument forms use
+// DefaultOptions.
+RE2::RE2(const char* pattern) {
+  Init(pattern, DefaultOptions);
+}
+
+RE2::RE2(const std::string& pattern) {
+  Init(pattern, DefaultOptions);
+}
+
+RE2::RE2(const StringPiece& pattern) {
+  Init(pattern, DefaultOptions);
+}
+
+RE2::RE2(const StringPiece& pattern, const Options& options) {
+  Init(pattern, options);
+}
+
+// Translates the option setters into the Regexp parse-flag bit set.
+// An unrecognised encoding is logged (when log_errors() is on) and otherwise
+// contributes no flag.
+int RE2::Options::ParseFlags() const {
+  int flags = Regexp::ClassNL;
+  switch (encoding()) {
+    default:
+      if (log_errors())
+        LOG(ERROR) << "Unknown encoding " << encoding();
+      break;
+    case RE2::Options::EncodingUTF8:
+      break;
+    case RE2::Options::EncodingLatin1:
+      flags |= Regexp::Latin1;
+      break;
+  }
+
+  if (!posix_syntax())
+    flags |= Regexp::LikePerl;
+
+  if (literal())
+    flags |= Regexp::Literal;
+
+  if (never_nl())
+    flags |= Regexp::NeverNL;
+
+  if (dot_nl())
+    flags |= Regexp::DotNL;
+
+  if (never_capture())
+    flags |= Regexp::NeverCapture;
+
+  if (!case_sensitive())
+    flags |= Regexp::FoldCase;
+
+  if (perl_classes())
+    flags |= Regexp::PerlClasses;
+
+  if (word_boundary())
+    flags |= Regexp::PerlB;
+
+  if (one_line())
+    flags |= Regexp::OneLine;
+
+  return flags;
+}
+
+
+// Re-encodes a Latin-1 byte string as UTF-8. Bytes below 0x80 are copied
+// unchanged; every other byte is expanded to the two-byte UTF-8 sequence
+// (0xC0 | high two bits, 0x80 | low six bits).
+std::string encodingLatin1ToUTF8(std::string str)
+{
+  std::string strOut;
+  strOut.reserve(str.size());  // lower bound: one output byte per input byte
+  for (std::string::iterator it = str.begin(); it != str.end(); ++it)
+  {
+    uint8_t ch = *it;
+    if (ch < 0x80) {
+      strOut.push_back(ch);
+    }
+    else {
+      strOut.push_back(0xc0 | (ch >> 6));
+      strOut.push_back(0x80 | (ch & 0x3f));
+    }
+  }
+  return strOut;
+}
+
+// Resets every member, compiles |pattern| with rure and stores the resulting
+// handle in prog_ (cast to Prog*). On failure error_ / error_code_ are set and
+// prog_ stays NULL; on success num_captures_ holds the number of capturing
+// groups excluding the implicit group 0.
+void RE2::Init(const StringPiece& pattern, const Options& options) {
+  // The shared empty objects are created exactly once, even when several
+  // threads construct RE2 objects concurrently.
+  static std::once_flag empty_once;
+  std::call_once(empty_once, []() {
+    empty_string = new std::string;
+    empty_named_groups = new std::map<std::string, int>;
+    empty_group_names = new std::map<int, std::string>;
+  });
+
+  pattern_.assign(pattern.data(), pattern.size());
+  options_.Copy(options);
+  entire_regexp_ = NULL;
+  error_ = empty_string;
+  error_code_ = NoError;
+  error_arg_.clear();
+  prefix_.clear();
+  prefix_foldcase_ = false;
+  suffix_regexp_ = NULL;
+  prog_ = NULL;
+  num_captures_ = -1;
+  is_one_pass_ = false;
+
+  rprog_ = NULL;
+  named_groups_ = NULL;
+  group_names_ = NULL;
+
+  // rure only accepts UTF-8, so a Latin-1 pattern is converted first. The
+  // converted text lives in a named local so that the pointer handed to
+  // rure_compile() stays valid for the whole call (the previous code kept the
+  // c_str() of a temporary that was already destroyed).
+  std::string converted;
+  const std::string* rure_pattern = &pattern_;
+  if (options.encoding() != RE2::Options::EncodingUTF8) {
+    converted = encodingLatin1ToUTF8(pattern_);
+    rure_pattern = &converted;
+  }
+
+  // TODO: the rure flags should be derived from |options|; for now the
+  // defaults are always used.
+  // The explicit size (rather than strlen) is required because a StringPiece
+  // is not NUL-terminated and may contain embedded NUL bytes.
+  rure_error* err = rure_error_new();
+  rure* re = rure_compile(
+      reinterpret_cast<const uint8_t*>(rure_pattern->data()),
+      rure_pattern->size(), RURE_DEFAULT_FLAGS, NULL, err);
+
+  if (re == NULL) {
+    // Copy the message before releasing |err|, which owns its storage.
+    const char* raw_msg = rure_error_message(err);
+    const std::string msg = (raw_msg != NULL) ? raw_msg : "";
+    rure_error_free(err);
+
+    if (msg.find("empty character classes are not allowed") !=
+        std::string::npos) {
+      // rure rejects empty character classes; fall back to compiling the
+      // empty pattern so that the object is still usable.
+      rure_error* fallback_err = rure_error_new();
+      re = rure_compile(reinterpret_cast<const uint8_t*>(""), 0,
+                        RURE_DEFAULT_FLAGS, NULL, fallback_err);
+      if (re == NULL) {
+        const char* fallback_msg = rure_error_message(fallback_err);
+        error_ = new std::string(fallback_msg != NULL ? fallback_msg : "");
+        error_code_ = ErrorInternal;
+      }
+      rure_error_free(fallback_err);
+      if (re == NULL)
+        return;
+    } else {
+      if (options_.log_errors()) {
+        LOG(ERROR) << "Error Compile '" << pattern_ << "':"<< msg << "'";
+      }
+      error_ = new std::string(msg);
+      // TODO: rure does not expose an error category yet, so every compile
+      // failure is reported as ErrorInternal.
+      error_code_ = ErrorInternal;
+      return;
+    }
+  } else {
+    rure_error_free(err);
+  }
+
+  prog_ = (Prog*)re;
+
+  // rure counts the implicit whole-match group; RE2 does not.
+  rure_captures* caps = rure_captures_new(re);
+  num_captures_ = static_cast<int>(rure_captures_len(caps) - 1);
+  rure_captures_free(caps);
+}
+
+// Returns rprog_, computing it if needed.
+// In this port rprog_ is never computed: it keeps the NULL assigned in Init().
+re2::Prog* RE2::ReverseProg() const {
+  // std::call_once(rprog_once_, [](const RE2* re) {
+  //   re->rprog_ =
+  //       re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3);
+  //   if (re->rprog_ == NULL) {
+  //     if (re->options_.log_errors())
+  //       LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'";
+  //     // We no longer touch error_ and error_code_ because failing to compile
+  //     // the reverse Prog is not a showstopper: falling back to NFA execution
+  //     // is fine. More importantly, an RE2 object is supposed to be logically
+  //     // immutable: whatever ok() would have returned after Init() completed,
+  //     // it should continue to return that no matter what ReverseProg() does.
+  //   }
+  // }, this);
+  return rprog_;
+}
+
+// Releases everything Init() and the lazy group-name accessors allocated.
+// The previous body had two `if`s whose only statement was commented out, so
+// they swallowed the following statements and error_ was never deleted.
+RE2::~RE2() {
+  // suffix_regexp_, entire_regexp_ and rprog_ are always NULL in this port.
+  // prog_ is the rure handle created in Init().
+  if (prog_ != NULL)
+    rure_free((rure*)prog_);
+  if (error_ != empty_string)
+    delete error_;
+  if (named_groups_ != NULL && named_groups_ != empty_named_groups)
+    delete named_groups_;
+  if (group_names_ != NULL && group_names_ != empty_group_names)
+    delete group_names_;
+}
+
+// Stub: program size is not available from rure, always 0.
+int RE2::ProgramSize() const {
+  // if (prog_ == NULL)
+  //   return -1;
+  // return prog_->size();
+  return 0;
+}
+
+// Stub: reverse program size is not available from rure, always 0.
+int RE2::ReverseProgramSize() const {
+  // if (prog_ == NULL)
+  //   return -1;
+  // Prog* prog = ReverseProg();
+  // if (prog == NULL)
+  //   return -1;
+  // return prog->size();
+  return 0;
+}
+
+// // Finds the most significant non-zero bit in n.
+// static int FindMSBSet(uint32_t n) {
+//   DCHECK_NE(n, 0);
+// #if defined(__GNUC__)
+//   return 31 ^ __builtin_clz(n);
+// #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+//   unsigned long c;
+//   _BitScanReverse(&c, n);
+//   return static_cast<int>(c);
+// #else
+//   int c = 0;
+//   for (int shift = 1 << 4; shift != 0; shift >>= 1) {
+//     uint32_t word = n >> shift;
+//     if (word != 0) {
+//       n = word;
+//       c += shift;
+//     }
+//   }
+//   return c;
+// #endif
+// }
+
+// static int Fanout(Prog* prog, std::vector<int>* histogram) {
+//   SparseArray<int> fanout(prog->size());
+//   prog->Fanout(&fanout);
+//   int data[32] = {};
+//   int size = 0;
+//   for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
+//     if (i->value() == 0)
+//       continue;
+//     uint32_t value = i->value();
+//     int bucket = FindMSBSet(value);
+//     bucket += value & (value-1) ? 1 : 0;
+//     ++data[bucket];
+//     size = std::max(size, bucket+1);
+//   }
+//   if (histogram != NULL)
+//     histogram->assign(data, data+size);
+//   return size-1;
+// }
+
+// Stub: fanout is not available from rure. Always returns 0 and leaves
+// |histogram| untouched.
+int RE2::ProgramFanout(std::vector<int>* histogram) const {
+  // if (prog_ == NULL)
+  //   return -1;
+  // return Fanout(prog_, histogram);
+  return 0;
+}
+
+// Stub: reverse fanout is not available from rure. Always returns 0 and
+// leaves |histogram| untouched.
+int RE2::ReverseProgramFanout(std::vector<int>* histogram) const {
+  // if (prog_ == NULL)
+  //   return -1;
+  // Prog* prog = ReverseProg();
+  // if (prog == NULL)
+  //   return -1;
+  // return Fanout(prog, histogram);
+  return 0;
+}
+
+// Returns named_groups_, computing it if needed.
+// Maps each named capturing group to its index. The map is built on the
+// first call and cached, so repeated calls no longer leak a fresh map each
+// time. If the pattern failed to compile the shared empty map is returned.
+// NOTE(review): the lazy initialisation is not synchronised, exactly like the
+// code it replaces; confirm callers do not race on the first call.
+const std::map<std::string, int>& RE2::NamedCapturingGroups() const {
+  if (named_groups_ != NULL)
+    return *named_groups_;
+  if (prog_ == NULL)
+    return *empty_named_groups;
+
+  std::map<std::string, int>* groups = new std::map<std::string, int>;
+  rure_iter_capture_names* it = rure_iter_capture_names_new((rure*)prog_);
+  char* name;
+  // The iterator yields one entry per group, index 0 included; unnamed
+  // groups come back as the empty string and are skipped.
+  for (int i = 0; rure_iter_capture_names_next(it, &name); ++i) {
+    std::string str = name;
+    if (str.length() != 0)
+      groups->insert(std::make_pair(str, i));
+  }
+  rure_iter_capture_names_free(it);
+  named_groups_ = groups;
+
+  return *named_groups_;
+}
+
+// Returns group_names_, computing it if needed.
+const std::map<int, std::string>& RE2::CapturingGroupNames() const {
+  // The map goes from group index to capture-group name; unnamed groups
+  // (reported by rure as empty names) are skipped but still counted.
+  //
+  // Build the map only once; rebuilding it on every call leaked the previous
+  // map.
+  // NOTE(review): assumes Init() leaves group_names_ NULL until first use,
+  // as the NULL check in ~RE2() suggests -- confirm.
+  if (group_names_ != NULL)
+    return *group_names_;
+
+  std::map<int, std::string>* names = new std::map<int, std::string>;
+  if (prog_ != NULL) {
+    rure_iter_capture_names* it = rure_iter_capture_names_new((rure*)prog_);
+    char* name = NULL;
+    int index = 0;
+    while (rure_iter_capture_names_next(it, &name)) {
+      // Copy the name before the iterator is released below.
+      std::string group(name);
+      if (!group.empty())
+        names->insert(std::make_pair(index, group));
+      ++index;
+    }
+    rure_iter_capture_names_free(it);
+  }
+  group_names_ = names;
+
+  return *group_names_;
+}
+
+/***** Convenience interfaces *****/
+
+// Matches the whole of text against re, storing up to n submatches via args.
+bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
+                     const Arg* const args[], int n) {
+  return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
+}
+
+// Matches re anywhere in text, storing up to n submatches via args.
+bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
+                        const Arg* const args[], int n) {
+  return re.DoMatch(text, UNANCHORED, NULL, args, n);
+}
+
+// Matches re at the front of *input and, on success, advances *input past
+// the matched text.
+bool RE2::ConsumeN(StringPiece* input, const RE2& re,
+                   const Arg* const args[], int n) {
+  size_t consumed;
+  if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
+    input->remove_prefix(consumed);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Like ConsumeN, but the match may start anywhere in *input; everything up
+// to the end of the match is removed.
+bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
+                          const Arg* const args[], int n) {
+  size_t consumed;
+  if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
+    input->remove_prefix(consumed);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Replaces the first match of re in *str with rewrite (after \N
+// substitution). Returns false, leaving *str untouched, when rewrite refers
+// to more groups than re has, when there is no match, or when the rewrite
+// fails.
+bool RE2::Replace(std::string* str,
+                  const RE2& re,
+                  const StringPiece& rewrite) {
+  StringPiece vec[kVecSize];
+  int nvec = 1 + MaxSubmatch(rewrite);
+  if (nvec > 1 + re.NumberOfCapturingGroups())
+    return false;
+  if (nvec > static_cast<int>(arraysize(vec)))
+    return false;
+  if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
+    return false;
+
+  std::string s;
+  if (!re.Rewrite(&s, rewrite, vec, nvec))
+    return false;
+
+  assert(vec[0].data() >= str->data());
+  assert(vec[0].data() + vec[0].size() <= str->data() + str->size());
+  str->replace(vec[0].data() - str->data(), vec[0].size(), s);
+  return true;
+}
+
+// Stub: global replacement is not implemented in this build. Always returns
+// 0 (no replacements made) and leaves *str untouched.
+//
+// TODO(port): restore the upstream loop. It repeatedly calls Match() with an
+// advancing start position, so Match() must honour startpos, and it needs
+// the UTF-8 helpers fullrune()/chartorune() to step over empty matches.
+int RE2::GlobalReplace(std::string* str,
+                       const RE2& re,
+                       const StringPiece& rewrite) {
+  return 0;
+}
+
+// Finds the first match of re in text and writes the rewritten form of that
+// match to *out. Returns false when rewrite refers to more groups than re
+// has, when there is no match, or when the rewrite fails.
+bool RE2::Extract(const StringPiece& text,
+                  const RE2& re,
+                  const StringPiece& rewrite,
+                  std::string* out) {
+  StringPiece vec[kVecSize];
+  int nvec = 1 + MaxSubmatch(rewrite);
+  if (nvec > 1 + re.NumberOfCapturingGroups())
+    return false;
+  if (nvec > static_cast<int>(arraysize(vec)))
+    return false;
+  if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
+    return false;
+
+  out->clear();
+  return re.Rewrite(out, rewrite, vec, nvec);
+}
+
+// Returns a copy of unquoted in which every ASCII byte outside
+// [A-Za-z0-9_!' =] is preceded by a backslash, so the result matches the
+// input literally when used as a pattern.
+std::string RE2::QuoteMeta(const StringPiece& unquoted) {
+  std::string result;
+  result.reserve(unquoted.size() << 1);
+
+  // Escape any ascii character not in [A-Za-z_0-9] (and not one of the few
+  // extra characters this port also leaves alone: '!', ' ', '\'', '=').
+  //
+  // Note that it's legal to escape a character even if it has no
+  // special meaning in a regular expression -- so this function does
+  // that.  (This also makes it close to the perl function of the
+  // same name except for the null-character special case;
+  // see `perldoc -f quotemeta`.)
+  for (size_t ii = 0; ii < unquoted.size(); ++ii) {
+    // Note that using 'isalnum' here raises the benchmark time from
+    // 32ns to 58ns:
+    if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
+        (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
+        (unquoted[ii] < '0' || unquoted[ii] > '9') &&
+        unquoted[ii] != '_' && unquoted[ii] != '!' &&
+        unquoted[ii] != ' ' && unquoted[ii] != '\'' &&
+        unquoted[ii] != '=' &&
+        // If this is the part of a UTF8 or Latin1 character, we need
+        // to copy this byte without escaping.  Experimentally this is
+        // what works correctly with the regexp library.
+        !(unquoted[ii] & 128)) {
+      if (unquoted[ii] == '\0') {  // Special handling for null chars.
+        // Note that this special handling is not strictly required for RE2,
+        // but this quoting is required for other regexp libraries such as
+        // PCRE.
+        // Can't use "\\0" since the next character might be a digit.
+        result += "\\x00";
+        continue;
+      }
+      result += '\\';
+    }
+    result += unquoted[ii];
+  }
+
+  return result;
+}
+
+// Stub: range computation is not implemented in this build, so no useful
+// range can be reported. Clears *min and *max and returns false, which is
+// what the upstream implementation does when it has nothing useful to say.
+// Returning true without filling in the outputs would hand callers a bogus
+// ["", ""] range.
+//
+// TODO(port): restore the upstream logic (literal prefix plus
+// Prog::PossibleMatchRange) once an equivalent is available.
+bool RE2::PossibleMatchRange(std::string* min, std::string* max,
+                             int maxlen) const {
+  (void)maxlen;
+  if (min != NULL)
+    min->clear();
+  if (max != NULL)
+    max->clear();
+  return false;
+}
+
+// // Avoid possible locale nonsense in standard strcasecmp.
+// // The string a is known to be all lowercase.
+// static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
+//   const char* ae = a + len;
+
+//   for (; a < ae; a++, b++) {
+//     uint8_t x = *a;
+//     uint8_t y = *b;
+//     if ('A' <= y && y <= 'Z')
+//       y += 'a' - 'A';
+//     if (x != y)
+//       return x - y;
+//   }
+//   return 0;
+// }
+
+
+/***** Actual matching and rewriting code *****/
+
+// Searches text[startpos, endpos) for the pattern.
+//
+// re_anchor restricts where the match may sit: UNANCHORED accepts any
+// position, ANCHOR_START requires it to begin at startpos, and ANCHOR_BOTH
+// additionally requires it to end at endpos. On success the first nsubmatch
+// entries of submatch receive the overall match (index 0) and the capture
+// groups; groups that did not participate, and entries beyond the number of
+// groups in the pattern, are set to empty StringPieces. Returns false (and
+// leaves submatch untouched) when there is no acceptable match.
+//
+// NOTE(review): the anchor checks are applied to the single match rure
+// reports, so a pattern whose first-reported match is shorter than the full
+// text can fail ANCHOR_BOTH even though a full-length match exists.
+bool RE2::Match(const StringPiece& text,
+                size_t startpos,
+                size_t endpos,
+                Anchor re_anchor,
+                StringPiece* submatch,
+                int nsubmatch) const {
+  if (!ok()) {
+    if (options_.log_errors())
+      LOG(ERROR) << "Invalid RE2: " << *error_;
+    return false;
+  }
+
+  if (startpos > endpos || endpos > text.size()) {
+    if (options_.log_errors())
+      LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
+                 << "startpos: " << startpos << ", "
+                 << "endpos: " << endpos << ", "
+                 << "text size: " << text.size() << "]";
+    return false;
+  }
+
+  rure* re = (rure*)prog_;
+
+  // A StringPiece is not NUL-terminated and may contain NUL bytes, so the
+  // length always comes from the piece itself, never from strlen(). rure is
+  // given a non-null pointer even when the piece is empty.
+  const char* base = (text.data() != NULL) ? text.data() : "";
+  const uint8_t* haystack = reinterpret_cast<const uint8_t*>(base);
+
+  if (submatch == NULL || nsubmatch < 0)
+    nsubmatch = 0;
+
+  rure_match whole = {0, 0};
+  rure_captures* caps = NULL;
+  bool matched;
+  if (nsubmatch == 0) {
+    // Only the overall match position is needed.
+    matched = rure_find(re, haystack, endpos, startpos, &whole);
+  } else {
+    caps = rure_captures_new(re);
+    matched = rure_find_captures(re, haystack, endpos, startpos, caps) &&
+              rure_captures_at(caps, 0, &whole);
+  }
+
+  if (matched && re_anchor != UNANCHORED && whole.start != startpos)
+    matched = false;
+  if (matched && re_anchor == ANCHOR_BOTH && whole.end != endpos)
+    matched = false;
+
+  if (matched && caps != NULL) {
+    // Never write more than the nsubmatch entries the caller provided.
+    size_t ngroups = rure_captures_len(caps);
+    for (int i = 0; i < nsubmatch; i++) {
+      rure_match group = {0, 0};
+      if (static_cast<size_t>(i) < ngroups &&
+          rure_captures_at(caps, static_cast<size_t>(i), &group)) {
+        submatch[i] = StringPiece(text.data() + group.start,
+                                  group.end - group.start);
+      } else {
+        submatch[i] = StringPiece();
+      }
+    }
+  }
+
+  if (caps != NULL)
+    rure_captures_free(caps);
+  return matched;
+}
+
+// std::string_view in MSVC has iterators that aren't just pointers and
+// that don't allow comparisons between different objects - not even if
+// those objects are views into the same string! Thus, we provide these
+// conversion functions for convenience.
+static inline const char* BeginPtr(const StringPiece& s) {
+  return s.data();
+}
+static inline const char* EndPtr(const StringPiece& s) {
+  return s.data() + s.size();
+}
+
+// Internal matcher - like Match() but takes Args not StringPieces.
+//
+// Matches text under re_anchor. When consumed is non-NULL it receives the
+// offset just past the overall match. The first n capture groups are parsed
+// into args[0..n-1]; a failed conversion makes the whole call return false.
+bool RE2::DoMatch(const StringPiece& text,
+                  Anchor re_anchor,
+                  size_t* consumed,
+                  const Arg* const* args,
+                  int n) const {
+  // The pattern must have compiled successfully.
+  if (!ok()) {
+    if (options_.log_errors())
+      LOG(ERROR) << "Invalid RE2: " << *error_;
+    return false;
+  }
+
+  if (NumberOfCapturingGroups() < n) {
+    // RE has fewer capturing groups than number of Arg pointers passed in.
+    return false;
+  }
+
+  // Count number of capture groups needed.
+  int nvec;
+  if (n == 0 && consumed == NULL)
+    nvec = 0;
+  else
+    nvec = n + 1;
+
+  if (nvec == 0) {
+    // Nothing to extract: a plain search plus a position check is enough.
+    // `converted` must outlive the search; taking c_str() of a temporary
+    // would leave the haystack pointer dangling.
+    std::string converted;
+    const char* haystack = "";
+    size_t length = 0;
+    if (options_.encoding() == RE2::Options::EncodingLatin1) {
+      converted = encodingLatin1ToUTF8(text.as_string());
+      haystack = converted.c_str();
+      length = converted.size();
+    } else if (text.data() != NULL) {
+      haystack = text.data();
+      length = text.size();
+    }
+
+    rure_match match = {0, 0};
+    if (!rure_find((rure*)prog_, reinterpret_cast<const uint8_t*>(haystack),
+                   length, 0, &match))
+      return false;
+    if (re_anchor == ANCHOR_BOTH)
+      return match.start == 0 && match.end == length;
+    if (re_anchor == ANCHOR_START)
+      return match.start == 0;
+    return true;  // UNANCHORED
+  }
+
+  StringPiece* vec;
+  StringPiece stkvec[kVecSize];
+  StringPiece* heapvec = NULL;
+
+  // Use the stack buffer when it is large enough, otherwise the heap.
+  if (nvec <= static_cast<int>(arraysize(stkvec))) {
+    vec = stkvec;
+  } else {
+    vec = new StringPiece[nvec];
+    heapvec = vec;
+  }
+
+  // NOTE(review): unlike the no-capture path above, this path hands the
+  // original bytes to Match() without any Latin-1 conversion.
+  if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) {
+    delete[] heapvec;
+    return false;
+  }
+
+  if (consumed != NULL)
+    *consumed = static_cast<size_t>(EndPtr(vec[0]) - BeginPtr(text));
+
+  if (n == 0 || args == NULL) {
+    // We are not interested in results
+    delete[] heapvec;
+    return true;
+  }
+
+  // If we got here, we must have matched the whole pattern.
+  for (int i = 0; i < n; i++) {
+    const StringPiece& s = vec[i + 1];
+    if (!args[i]->Parse(s.data(), s.size())) {
+      // TODO: Should we indicate what the error was?
+      delete[] heapvec;
+      return false;
+    }
+  }
+
+  delete[] heapvec;
+  return true;
+}
+
+// Checks that the rewrite string is well-formed with respect to this
+// regular expression: every backslash must be followed by a digit or another
+// backslash, and no \N may refer to a group the pattern does not have.
+// On failure, stores a description in *error and returns false.
+bool RE2::CheckRewriteString(const StringPiece& rewrite,
+                             std::string* error) const {
+  int max_token = -1;
+  for (const char *s = rewrite.data(), *end = s + rewrite.size();
+       s < end; s++) {
+    int c = *s;
+    if (c != '\\') {
+      continue;
+    }
+    if (++s == end) {
+      *error = "Rewrite schema error: '\\' not allowed at end.";
+      return false;
+    }
+    c = *s;
+    if (c == '\\') {
+      continue;
+    }
+    if (c < '0' || c > '9') {
+      *error = "Rewrite schema error: "
+               "'\\' must be followed by a digit or '\\'.";
+      return false;
+    }
+    int n = (c - '0');
+    if (max_token < n) {
+      max_token = n;
+    }
+  }
+
+  if (max_token > NumberOfCapturingGroups()) {
+    *error = "Rewrite schema requests " + std::to_string(max_token) +
+             " matches, but the regexp only has " +
+             std::to_string(NumberOfCapturingGroups()) +
+             " parenthesized subexpressions.";
+    return false;
+  }
+  return true;
+}
+
+// Returns the maximum submatch needed for the rewrite to be done by Replace().
+// E.g. if rewrite == "foo \\2,\\1", returns 2.
+int RE2::MaxSubmatch(const StringPiece& rewrite) {
+  int max = 0;
+  for (const char *s = rewrite.data(), *end = s + rewrite.size();
+       s < end; s++) {
+    if (*s == '\\') {
+      s++;
+      int c = (s < end) ? *s : -1;
+      // Plain range check: isdigit() must not be handed a negative char.
+      if ('0' <= c && c <= '9') {
+        int n = (c - '0');
+        if (n > max)
+          max = n;
+      }
+    }
+  }
+  return max;
+}
+
+// Append the "rewrite" string, with backslash substitutions from "vec",
+// to string "out". Returns false if rewrite refers to a group >= veclen or
+// contains a backslash not followed by a digit or another backslash.
+bool RE2::Rewrite(std::string* out,
+                  const StringPiece& rewrite,
+                  const StringPiece* vec,
+                  int veclen) const {
+  for (const char *s = rewrite.data(), *end = s + rewrite.size();
+       s < end; s++) {
+    if (*s != '\\') {
+      out->push_back(*s);
+      continue;
+    }
+    s++;
+    int c = (s < end) ? *s : -1;
+    // Plain range check: isdigit() must not be handed a negative char.
+    if ('0' <= c && c <= '9') {
+      int n = (c - '0');
+      if (n >= veclen) {
+        if (options_.log_errors()) {
+          LOG(ERROR) << "invalid substitution \\" << n
+                     << " from " << veclen << " groups";
+        }
+        return false;
+      }
+      StringPiece snip = vec[n];
+      if (!snip.empty())
+        out->append(snip.data(), snip.size());
+    } else if (c == '\\') {
+      out->push_back('\\');
+    } else {
+      if (options_.log_errors())
+        LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
+      return false;
+    }
+  }
+  return true;
+}
+
+/***** Parsers for various types *****/
+
+namespace re2_internal {
+
+// Each Parse specialization converts the n bytes at str into *dest and
+// returns whether the conversion succeeded. A NULL dest means "validate
+// only".
+
+template <>
+bool Parse(const char* str, size_t n, void* dest) {
+  // We fail if somebody asked us to store into a non-NULL void* pointer
+  return (dest == NULL);
+}
+
+template <>
+bool Parse(const char* str, size_t n, std::string* dest) {
+  if (dest == NULL) return true;
+  dest->assign(str, n);
+  return true;
+}
+
+template <>
+bool Parse(const char* str, size_t n, StringPiece* dest) {
+  if (dest == NULL) return true;
+  *dest = StringPiece(str, n);
+  return true;
+}
+
+template <>
+bool Parse(const char* str, size_t n, char* dest) {
+  if (n != 1) return false;
+  if (dest == NULL) return true;
+  *dest = str[0];
+  return true;
+}
+
+template <>
+bool Parse(const char* str, size_t n, signed char* dest) {
+  if (n != 1) return false;
+  if (dest == NULL) return true;
+  *dest = str[0];
+  return true;
+}
+
+template <>
+bool Parse(const char* str, size_t n, unsigned char* dest) {
+  if (n != 1) return false;
+  if (dest == NULL) return true;
+  *dest = str[0];
+  return true;
+}
+
+// Largest number spec that we are willing to parse
+static const int kMaxNumberLength = 32;
+
+// REQUIRES "buf" must have length at least nbuf.
+// Copies "str" into "buf" and null-terminates.
+// Overwrites *np with the new length.
+// Returns "" (without touching *np) when the input is empty, starts with
+// disallowed whitespace, or does not fit in buf.
+static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
+                                   size_t* np, bool accept_spaces) {
+  size_t n = *np;
+  if (n == 0) return "";
+  // The casts keep isspace() away from negative char values.
+  if (n > 0 && isspace(static_cast<unsigned char>(*str))) {
+    // We are less forgiving than the strtoxxx() routines and do not
+    // allow leading spaces. We do allow leading spaces for floats.
+    if (!accept_spaces) {
+      return "";
+    }
+    while (n > 0 && isspace(static_cast<unsigned char>(*str))) {
+      n--;
+      str++;
+    }
+  }
+
+  // Although buf has a fixed maximum size, we can still handle
+  // arbitrarily large integers correctly by omitting leading zeros.
+  // (Numbers that are still too long will be out of range.)
+  // Before deciding whether str is too long,
+  // remove leading zeros with s/000+/00/.
+  // Leaving the leading two zeros in place means that
+  // we don't change 0000x123 (invalid) into 0x123 (valid).
+  // Skip over leading - before replacing.
+  bool neg = false;
+  if (n >= 1 && str[0] == '-') {
+    neg = true;
+    n--;
+    str++;
+  }
+
+  if (n >= 3 && str[0] == '0' && str[1] == '0') {
+    while (n >= 3 && str[2] == '0') {
+      n--;
+      str++;
+    }
+  }
+
+  if (neg) {  // make room in buf for -
+    n++;
+    str--;
+  }
+
+  if (n > nbuf-1) return "";
+
+  memmove(buf, str, n);
+  if (neg) {
+    buf[0] = '-';
+  }
+  buf[n] = '\0';
+  *np = n;
+  return buf;
+}
+
+template <>
+bool Parse(const char* str, size_t n, float* dest) {
+  if (n == 0) return false;
+  static const int kMaxLength = 200;
+  char buf[kMaxLength+1];
+  str = TerminateNumber(buf, sizeof buf, str, &n, true);
+  char* end;
+  errno = 0;
+  float r = strtof(str, &end);
+  if (end != str + n) return false;   // Leftover junk
+  if (errno) return false;
+  if (dest == NULL) return true;
+  *dest = r;
+  return true;
+}
+
+template <>
+bool Parse(const char* str, size_t n, double* dest) {
+  if (n == 0) return false;
+  static const int kMaxLength = 200;
+  char buf[kMaxLength+1];
+  str = TerminateNumber(buf, sizeof buf, str, &n, true);
+  char* end;
+  errno = 0;
+  double r = strtod(str, &end);
+  if (end != str + n) return false;   // Leftover junk
+  if (errno) return false;
+  if (dest == NULL) return true;
+  *dest = r;
+  return true;
+}
+
+template <>
+bool Parse(const char* str, size_t n, long* dest, int radix) {
+  if (n == 0) return false;
+  char buf[kMaxNumberLength+1];
+  str = TerminateNumber(buf, sizeof buf, str, &n, false);
+  char* end;
+  errno = 0;
+  long r = strtol(str, &end, radix);
+  if (end != str + n) return false;   // Leftover junk
+  if (errno) return false;
+  if (dest == NULL) return true;
+  *dest = r;
+  return true;
+}
+
+template <>
+bool Parse(const char* str, size_t n, unsigned long* dest, int radix) {
+  if (n == 0) return false;
+  char buf[kMaxNumberLength+1];
+  str = TerminateNumber(buf, sizeof buf, str, &n, false);
+  if (str[0] == '-') {
+    // strtoul() will silently accept negative numbers and parse
+    // them.  This module is more strict and treats them as errors.
+    return false;
+  }
+
+  char* end;
+  errno = 0;
+  unsigned long r = strtoul(str, &end, radix);
+  if (end != str + n) return false;   // Leftover junk
+  if (errno) return false;
+  if (dest == NULL) return true;
+  *dest = r;
+  return true;
+}
+
+template <>
+bool Parse(const char* str, size_t n, short* dest, int radix) {
+  long r;
+  if (!Parse(str, n, &r, radix)) return false;  // Could not parse
+  if ((short)r != r) return false;              // Out of range
+  if (dest == NULL) return true;
+  *dest = (short)r;
+  return true;
+}
+
+template <>
+bool Parse(const char* str, size_t n, unsigned short* dest, int radix) {
+  unsigned long r;
+  if (!Parse(str, n, &r, radix)) return false;  // Could not parse
+  if ((unsigned short)r != r) return false;     // Out of range
+  if (dest == NULL) return true;
+  *dest = (unsigned short)r;
+  return true;
+}
+
+template <>
+bool Parse(const char* str, size_t n, int* dest, int radix) {
+  long r;
+  if (!Parse(str, n, &r, radix)) return false;  // Could not parse
+  if ((int)r != r) return false;                // Out of range
+  if (dest == NULL) return true;
+  *dest = (int)r;
+  return true;
+}
+
+template <>
+bool Parse(const char* str, size_t n, unsigned int* dest, int radix) {
+  unsigned long r;
+  if (!Parse(str, n, &r, radix)) return false;  // Could not parse
+  if ((unsigned int)r != r) return false;       // Out of range
+  if (dest == NULL) return true;
+  *dest = (unsigned int)r;
+  return true;
+}
+
+template <>
+bool Parse(const char* str, size_t n, long long* dest, int radix) {
+  if (n == 0) return false;
+  char buf[kMaxNumberLength+1];
+  str = TerminateNumber(buf, sizeof buf, str, &n, false);
+  char* end;
+  errno = 0;
+  long long r = strtoll(str, &end, radix);
+  if (end != str + n) return false;   // Leftover junk
+  if (errno) return false;
+  if (dest == NULL) return true;
+  *dest = r;
+  return true;
+}
+
+template <>
+bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) {
+  if (n == 0) return false;
+  char buf[kMaxNumberLength+1];
+  str = TerminateNumber(buf, sizeof buf, str, &n, false);
+  if (str[0] == '-') {
+    // strtoull() will silently accept negative numbers and parse
+    // them.  This module is more strict and treats them as errors.
+    return false;
+  }
+  char* end;
+  errno = 0;
+  unsigned long long r = strtoull(str, &end, radix);
+  if (end != str + n) return false;   // Leftover junk
+  if (errno) return false;
+  if (dest == NULL) return true;
+  *dest = r;
+  return true;
+}
+
+}  // namespace re2_internal
+
+namespace hooks {
+
+#ifdef RE2_HAVE_THREAD_LOCAL
+thread_local const RE2* context = NULL;
+#endif
+
+// Holds one atomically updated callback pointer.
+template <typename T>
+union Hook {
+  void Store(T* cb) { cb_.store(cb, std::memory_order_release); }
+  T* Load() const { return cb_.load(std::memory_order_acquire); }
+
+#if !defined(__clang__) && defined(_MSC_VER)
+  // Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent,
+  // this is a gross hack to make std::atomic<T*> constant-initialized on MSVC.
+  static_assert(ATOMIC_POINTER_LOCK_FREE == 2,
+                "std::atomic<T*> must be always lock-free");
+  T* cb_for_constinit_;
+#endif
+
+  std::atomic<T*> cb_;
+};
+
+// Default callback installed in every hook.
+template <typename T>
+static void DoNothing(const T&) {}
+
+#define DEFINE_HOOK(type, name)                                       \
+  static Hook<type##Callback> name##_hook = {{&DoNothing<type>}};     \
+  void Set##type##Hook(type##Callback* cb) { name##_hook.Store(cb); } \
+  type##Callback* Get##type##Hook() { return name##_hook.Load(); }
+
+DEFINE_HOOK(DFAStateCacheReset, dfa_state_cache_reset)
+DEFINE_HOOK(DFASearchFailure, dfa_search_failure)
+
+#undef DEFINE_HOOK
+
+}  // namespace hooks
+
+}  // namespace re2
diff --git a/re2/re2.h b/re2/re2.h
new file mode 100644
index 0000000000000000000000000000000000000000..7fd2245cb35c070b81fb50429bf2af2fa48f4ac0
--- /dev/null
+++ b/re2/re2.h
@@ -0,0 +1,1017 @@
+// Copyright 2003-2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_RE2_H_
+#define RE2_RE2_H_
+
+// C++ interface to the re2 regular-expression library.
+// RE2 supports Perl-style regular expressions (with extensions like +// \d, \w, \s, ...). +// +// ----------------------------------------------------------------------- +// REGEXP SYNTAX: +// +// This module uses the re2 library and hence supports +// its syntax for regular expressions, which is similar to Perl's with +// some of the more complicated things thrown away. In particular, +// backreferences and generalized assertions are not available, nor is \Z. +// +// See https://github.com/google/re2/wiki/Syntax for the syntax +// supported by RE2, and a comparison with PCRE and PERL regexps. +// +// For those not familiar with Perl's regular expressions, +// here are some examples of the most commonly used extensions: +// +// "hello (\\w+) world" -- \w matches a "word" character +// "version (\\d+)" -- \d matches a digit +// "hello\\s+world" -- \s matches any whitespace character +// "\\b(\\w+)\\b" -- \b matches non-empty string at word boundary +// "(?i)hello" -- (?i) turns on case-insensitive matching +// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible +// +// The double backslashes are needed when writing C++ string literals. +// However, they should NOT be used when writing C++11 raw string literals: +// +// R"(hello (\w+) world)" -- \w matches a "word" character +// R"(version (\d+))" -- \d matches a digit +// R"(hello\s+world)" -- \s matches any whitespace character +// R"(\b(\w+)\b)" -- \b matches non-empty string at word boundary +// R"((?i)hello)" -- (?i) turns on case-insensitive matching +// R"(/\*(.*?)\*/)" -- .*? matches . minimum no. of times possible +// +// When using UTF-8 encoding, case-insensitive matching will perform +// simple case folding, not full case folding. +// +// ----------------------------------------------------------------------- +// MATCHING INTERFACE: +// +// The "FullMatch" operation checks that supplied text matches a +// supplied pattern exactly. 
+// +// Example: successful match +// CHECK(RE2::FullMatch("hello", "h.*o")); +// +// Example: unsuccessful match (requires full match): +// CHECK(!RE2::FullMatch("hello", "e")); +// +// ----------------------------------------------------------------------- +// UTF-8 AND THE MATCHING INTERFACE: +// +// By default, the pattern and input text are interpreted as UTF-8. +// The RE2::Latin1 option causes them to be interpreted as Latin-1. +// +// Example: +// CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern))); +// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1))); +// +// ----------------------------------------------------------------------- +// MATCHING WITH SUBSTRING EXTRACTION: +// +// You can supply extra pointer arguments to extract matched substrings. +// On match failure, none of the pointees will have been modified. +// On match success, the substrings will be converted (as necessary) and +// their values will be assigned to their pointees until all conversions +// have succeeded or one conversion has failed. +// On conversion failure, the pointees will be in an indeterminate state +// because the caller has no way of knowing which conversion failed. +// However, conversion cannot fail for types like string and StringPiece +// that do not inspect the substring contents. Hence, in the common case +// where all of the pointees are of such types, failure is always due to +// match failure and thus none of the pointees will have been modified. 
+// +// Example: extracts "ruby" into "s" and 1234 into "i" +// int i; +// std::string s; +// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); +// +// Example: fails because string cannot be stored in integer +// CHECK(!RE2::FullMatch("ruby", "(.*)", &i)); +// +// Example: fails because there aren't enough sub-patterns +// CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s)); +// +// Example: does not try to extract any extra sub-patterns +// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s)); +// +// Example: does not try to extract into NULL +// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i)); +// +// Example: integer overflow causes failure +// CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i)); +// +// NOTE(rsc): Asking for substrings slows successful matches quite a bit. +// This may get a little faster in the future, but right now is slower +// than PCRE. On the other hand, failed matches run *very* fast (faster +// than PCRE), as do matches without substring extraction. +// +// ----------------------------------------------------------------------- +// PARTIAL MATCHES +// +// You can use the "PartialMatch" operation when you want the pattern +// to match any substring of the text. +// +// Example: simple search for a string: +// CHECK(RE2::PartialMatch("hello", "ell")); +// +// Example: find first number in a string +// int number; +// CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number)); +// CHECK_EQ(number, 100); +// +// ----------------------------------------------------------------------- +// PRE-COMPILED REGULAR EXPRESSIONS +// +// RE2 makes it easy to use any string as a regular expression, without +// requiring a separate compilation step. +// +// If speed is of the essence, you can create a pre-compiled "RE2" +// object from the pattern and use it multiple times. If you do so, +// you can typically parse text faster than with sscanf. 
+// +// Example: precompile pattern for faster matching: +// RE2 pattern("h.*o"); +// while (ReadLine(&str)) { +// if (RE2::FullMatch(str, pattern)) ...; +// } +// +// ----------------------------------------------------------------------- +// SCANNING TEXT INCREMENTALLY +// +// The "Consume" operation may be useful if you want to repeatedly +// match regular expressions at the front of a string and skip over +// them as they match. This requires use of the "StringPiece" type, +// which represents a sub-range of a real string. +// +// Example: read lines of the form "var = value" from a string. +// std::string contents = ...; // Fill string somehow +// StringPiece input(contents); // Wrap a StringPiece around it +// +// std::string var; +// int value; +// while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) { +// ...; +// } +// +// Each successful call to "Consume" will set "var/value", and also +// advance "input" so it points past the matched text. Note that if the +// regular expression matches an empty string, input will advance +// by 0 bytes. If the regular expression being used might match +// an empty string, the loop body must check for this case and either +// advance the string or break out of the loop. +// +// The "FindAndConsume" operation is similar to "Consume" but does not +// anchor your match at the beginning of the string. For example, you +// could extract all words from a string by repeatedly calling +// RE2::FindAndConsume(&input, "(\\w+)", &word) +// +// ----------------------------------------------------------------------- +// USING VARIABLE NUMBER OF ARGUMENTS +// +// The above operations require you to know the number of arguments +// when you write the code. This is not always possible or easy (for +// example, the regular expression may be calculated at run time). +// You can use the "N" version of the operations when the number of +// match arguments are determined at run time. 
+// +// Example: +// const RE2::Arg* args[10]; +// int n; +// // ... populate args with pointers to RE2::Arg values ... +// // ... set n to the number of RE2::Arg objects ... +// bool match = RE2::FullMatchN(input, pattern, args, n); +// +// The last statement is equivalent to +// +// bool match = RE2::FullMatch(input, pattern, +// *args[0], *args[1], ..., *args[n - 1]); +// +// ----------------------------------------------------------------------- +// PARSING HEX/OCTAL/C-RADIX NUMBERS +// +// By default, if you pass a pointer to a numeric value, the +// corresponding text is interpreted as a base-10 number. You can +// instead wrap the pointer with a call to one of the operators Hex(), +// Octal(), or CRadix() to interpret the text in another base. The +// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16) +// prefixes, but defaults to base-10. +// +// Example: +// int a, b, c, d; +// CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)", +// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)); +// will leave 64 in a, b, c, and d. + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__APPLE__) +#include +#endif + +#include "re2/stringpiece.h" + +namespace re2 { +class Prog; +class Regexp; +} // namespace re2 + +namespace re2 { + +// Interface for regular expression matching. Also corresponds to a +// pre-compiled regular expression. An "RE2" object is safe for +// concurrent use by multiple threads. +class RE2 { + public: + // We convert user-passed pointers into special Arg objects + class Arg; + class Options; + + // Defined in set.h. 
+ class Set; + + enum ErrorCode { + NoError = 0, + + // Unexpected error + ErrorInternal, + + // Parse errors + ErrorBadEscape, // bad escape sequence + ErrorBadCharClass, // bad character class + ErrorBadCharRange, // bad character class range + ErrorMissingBracket, // missing closing ] + ErrorMissingParen, // missing closing ) + ErrorUnexpectedParen, // unexpected closing ) + ErrorTrailingBackslash, // trailing \ at end of regexp + ErrorRepeatArgument, // repeat argument missing, e.g. "*" + ErrorRepeatSize, // bad repetition argument + ErrorRepeatOp, // bad repetition operator + ErrorBadPerlOp, // bad perl operator + ErrorBadUTF8, // invalid UTF-8 in regexp + ErrorBadNamedCapture, // bad named capture group + ErrorPatternTooLarge // pattern too large (compile failed) + }; + + // Predefined common options. + // If you need more complicated things, instantiate + // an Option class, possibly passing one of these to + // the Option constructor, change the settings, and pass that + // Option class to the RE2 constructor. + enum CannedOptions { + DefaultOptions = 0, + Latin1, // treat input as Latin-1 (default UTF-8) + POSIX, // POSIX syntax, leftmost-longest match + Quiet // do not log about regexp parse errors + }; + + // Need to have the const char* and const std::string& forms for implicit + // conversions when passing string literals to FullMatch and PartialMatch. + // Otherwise the StringPiece form would be sufficient. +#ifndef SWIG + RE2(const char* pattern); + RE2(const std::string& pattern); +#endif + RE2(const StringPiece& pattern); + RE2(const StringPiece& pattern, const Options& options); + ~RE2(); + + // Returns whether RE2 was created properly. + bool ok() const { return error_code() == NoError; } + + // The string specification for this RE2. E.g. + // RE2 re("ab*c?d+"); + // re.pattern(); // "ab*c?d+" + const std::string& pattern() const { return pattern_; } + + // If RE2 could not be created properly, returns an error string. 
+ // Else returns the empty string. + const std::string& error() const { return *error_; } + + // If RE2 could not be created properly, returns an error code. + // Else returns RE2::NoError (== 0). + ErrorCode error_code() const { return error_code_; } + + // If RE2 could not be created properly, returns the offending + // portion of the regexp. + const std::string& error_arg() const { return error_arg_; } + + // Returns the program size, a very approximate measure of a regexp's "cost". + // Larger numbers are more expensive than smaller numbers. + int ProgramSize() const; + int ReverseProgramSize() const; + + // If histogram is not null, outputs the program fanout + // as a histogram bucketed by powers of 2. + // Returns the number of the largest non-empty bucket. + int ProgramFanout(std::vector* histogram) const; + int ReverseProgramFanout(std::vector* histogram) const; + + // Returns the underlying Regexp; not for general use. + // Returns entire_regexp_ so that callers don't need + // to know about prefix_ and prefix_foldcase_. + re2::Regexp* Regexp() const { return entire_regexp_; } + + /***** The array-based matching interface ******/ + + // The functions here have names ending in 'N' and are used to implement + // the functions whose names are the prefix before the 'N'. It is sometimes + // useful to invoke them directly, but the syntax is awkward, so the 'N'-less + // versions should be preferred. 
+ static bool FullMatchN(const StringPiece& text, const RE2& re, + const Arg* const args[], int n); + static bool PartialMatchN(const StringPiece& text, const RE2& re, + const Arg* const args[], int n); + static bool ConsumeN(StringPiece* input, const RE2& re, + const Arg* const args[], int n); + static bool FindAndConsumeN(StringPiece* input, const RE2& re, + const Arg* const args[], int n); + +#ifndef SWIG + private: + template + static inline bool Apply(F f, SP sp, const RE2& re) { + return f(sp, re, NULL, 0); + } + + template + static inline bool Apply(F f, SP sp, const RE2& re, const A&... a) { + const Arg* const args[] = {&a...}; + const int n = sizeof...(a); + return f(sp, re, args, n); + } + + public: + // In order to allow FullMatch() et al. to be called with a varying number + // of arguments of varying types, we use two layers of variadic templates. + // The first layer constructs the temporary Arg objects. The second layer + // (above) constructs the array of pointers to the temporary Arg objects. + + /***** The useful part: the matching interface *****/ + + // Matches "text" against "re". If pointer arguments are + // supplied, copies matched sub-patterns into them. + // + // You can pass in a "const char*" or a "std::string" for "text". + // You can pass in a "const char*" or a "std::string" or a "RE2" for "re". + // + // The provided pointer arguments can be pointers to any scalar numeric + // type, or one of: + // std::string (matched piece is copied to string) + // StringPiece (StringPiece is mutated to point to matched piece) + // T (where "bool T::ParseFrom(const char*, size_t)" exists) + // (void*)NULL (the corresponding matched sub-pattern is not copied) + // + // Returns true iff all of the following conditions are satisfied: + // a. "text" matches "re" fully - from the beginning to the end of "text". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. 
The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. + // + // CAVEAT: An optional sub-pattern that does not exist in the + // matched string is assigned the empty string. Therefore, the + // following will return false (because the empty string is not a + // valid number): + // int number; + // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number); + template + static bool FullMatch(const StringPiece& text, const RE2& re, A&&... a) { + return Apply(FullMatchN, text, re, Arg(std::forward(a))...); + } + + // Like FullMatch(), except that "re" is allowed to match a substring + // of "text". + // + // Returns true iff all of the following conditions are satisfied: + // a. "text" matches "re" partially - for some substring of "text". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. + template + static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) { + return Apply(PartialMatchN, text, re, Arg(std::forward(a))...); + } + + // Like FullMatch() and PartialMatch(), except that "re" has to match + // a prefix of the text, and "input" is advanced past the matched + // text. Note: "input" is modified iff this routine returns true + // and "re" matched a non-empty substring of "input". + // + // Returns true iff all of the following conditions are satisfied: + // a. "input" matches "re" partially - for some prefix of "input". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. 
The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. + template + static bool Consume(StringPiece* input, const RE2& re, A&&... a) { + return Apply(ConsumeN, input, re, Arg(std::forward(a))...); + } + + // Like Consume(), but does not anchor the match at the beginning of + // the text. That is, "re" need not start its match at the beginning + // of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds + // the next word in "s" and stores it in "word". + // + // Returns true iff all of the following conditions are satisfied: + // a. "input" matches "re" partially - for some substring of "input". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. + template + static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) { + return Apply(FindAndConsumeN, input, re, Arg(std::forward(a))...); + } +#endif + + // Replace the first match of "re" in "str" with "rewrite". + // Within "rewrite", backslash-escaped digits (\1 to \9) can be + // used to insert text matching corresponding parenthesized group + // from the pattern. \0 in "rewrite" refers to the entire matching + // text. E.g., + // + // std::string s = "yabba dabba doo"; + // CHECK(RE2::Replace(&s, "b+", "d")); + // + // will leave "s" containing "yada dabba doo" + // + // Returns true if the pattern matches and a replacement occurs, + // false otherwise. 
+ static bool Replace(std::string* str, + const RE2& re, + const StringPiece& rewrite); + + // Like Replace(), except replaces successive non-overlapping occurrences + // of the pattern in the string with the rewrite. E.g. + // + // std::string s = "yabba dabba doo"; + // CHECK(RE2::GlobalReplace(&s, "b+", "d")); + // + // will leave "s" containing "yada dada doo" + // Replacements are not subject to re-matching. + // + // Because GlobalReplace only replaces non-overlapping matches, + // replacing "ana" within "banana" makes only one replacement, not two. + // + // Returns the number of replacements made. + static int GlobalReplace(std::string* str, + const RE2& re, + const StringPiece& rewrite); + + // Like Replace, except that if the pattern matches, "rewrite" + // is copied into "out" with substitutions. The non-matching + // portions of "text" are ignored. + // + // Returns true iff a match occurred and the extraction happened + // successfully; if no match occurs, the string is left unaffected. + // + // REQUIRES: "text" must not alias any part of "*out". + static bool Extract(const StringPiece& text, + const RE2& re, + const StringPiece& rewrite, + std::string* out); + + // Escapes all potentially meaningful regexp characters in + // 'unquoted'. The returned string, used as a regular expression, + // will match exactly the original string. For example, + // 1.5-2.0? + // may become: + // 1\.5\-2\.0\? + static std::string QuoteMeta(const StringPiece& unquoted); + + // Computes range for any strings matching regexp. The min and max can in + // some cases be arbitrarily precise, so the caller gets to specify the + // maximum desired length of string returned. + // + // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any + // string s that is an anchored match for this regexp satisfies + // min <= s && s <= max. 
+ // + // Note that PossibleMatchRange() will only consider the first copy of an + // infinitely repeated element (i.e., any regexp element followed by a '*' or + // '+' operator). Regexps with "{N}" constructions are not affected, as those + // do not compile down to infinite repetitions. + // + // Returns true on success, false on error. + bool PossibleMatchRange(std::string* min, std::string* max, + int maxlen) const; + + // Generic matching interface + + // Type of match. + enum Anchor { + UNANCHORED, // No anchoring + ANCHOR_START, // Anchor at start only + ANCHOR_BOTH // Anchor at start and end + }; + + // Return the number of capturing subpatterns, or -1 if the + // regexp wasn't valid on construction. The overall match ($0) + // does not count: if the regexp is "(a)(b)", returns 2. + int NumberOfCapturingGroups() const { return num_captures_; } + + // Return a map from names to capturing indices. + // The map records the index of the leftmost group + // with the given name. + // Only valid until the re is deleted. + const std::map& NamedCapturingGroups() const; + + // Return a map from capturing indices to names. + // The map has no entries for unnamed groups. + // Only valid until the re is deleted. + const std::map& CapturingGroupNames() const; + + // General matching routine. + // Match against text starting at offset startpos + // and stopping the search at offset endpos. + // Returns true if match found, false if not. + // On a successful match, fills in submatch[] (up to nsubmatch entries) + // with information about submatches. + // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, with + // submatch[0] = "barbaz", submatch[1].data() = NULL, submatch[2] = "bar", + // submatch[3].data() = NULL, ..., up to submatch[nsubmatch-1].data() = NULL. + // Caveat: submatch[] may be clobbered even on match failure. 
+ // + // Don't ask for more match information than you will use: + // runs much faster with nsubmatch == 1 than nsubmatch > 1, and + // runs even faster if nsubmatch == 0. + // Doesn't make sense to use nsubmatch > 1 + NumberOfCapturingGroups(), + // but will be handled correctly. + // + // Passing text == StringPiece(NULL, 0) will be handled like any other + // empty string, but note that on return, it will not be possible to tell + // whether submatch i matched the empty string or did not match: + // either way, submatch[i].data() == NULL. + bool Match(const StringPiece& text, + size_t startpos, + size_t endpos, + Anchor re_anchor, + StringPiece* submatch, + int nsubmatch) const; + + // Check that the given rewrite string is suitable for use with this + // regular expression. It checks that: + // * The regular expression has enough parenthesized subexpressions + // to satisfy all of the \N tokens in rewrite + // * The rewrite string doesn't have any syntax errors. E.g., + // '\' followed by anything other than a digit or '\'. + // A true return value guarantees that Replace() and Extract() won't + // fail because of a bad rewrite string. + bool CheckRewriteString(const StringPiece& rewrite, + std::string* error) const; + + // Returns the maximum submatch needed for the rewrite to be done by + // Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2. + static int MaxSubmatch(const StringPiece& rewrite); + + // Append the "rewrite" string, with backslash substitutions from "vec", + // to string "out". + // Returns true on success. This method can fail because of a malformed + // rewrite string. CheckRewriteString guarantees that the rewrite will + // be successful. 
+ bool Rewrite(std::string* out, + const StringPiece& rewrite, + const StringPiece* vec, + int veclen) const; + + // Constructor options + class Options { + public: + // The options are (defaults in parentheses): + // + // utf8 (true) text and pattern are UTF-8; otherwise Latin-1 + // posix_syntax (false) restrict regexps to POSIX egrep syntax + // longest_match (false) search for longest match, not first match + // log_errors (true) log syntax and execution errors to ERROR + // max_mem (see below) approx. max memory footprint of RE2 + // literal (false) interpret string as literal, not regexp + // never_nl (false) never match \n, even if it is in regexp + // dot_nl (false) dot matches everything including new line + // never_capture (false) parse all parens as non-capturing + // case_sensitive (true) match is case-sensitive (regexp can override + // with (?i) unless in posix_syntax mode) + // + // The following options are only consulted when posix_syntax == true. + // When posix_syntax == false, these features are always enabled and + // cannot be turned off; to perform multi-line matching in that case, + // begin the regexp with (?m). + // perl_classes (false) allow Perl's \d \s \w \D \S \W + // word_boundary (false) allow Perl's \b \B (word boundary and not) + // one_line (false) ^ and $ only match beginning and end of text + // + // The max_mem option controls how much memory can be used + // to hold the compiled form of the regexp (the Prog) and + // its cached DFA graphs. Code Search placed limits on the number + // of Prog instructions and DFA states: 10,000 for both. + // In RE2, those limits would translate to about 240 KB per Prog + // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a + // better job of keeping them small than Code Search did). + // Each RE2 has two Progs (one forward, one reverse), and each Prog + // can have two DFAs (one first match, one longest match). 
+ // That makes 4 DFAs: + // + // forward, first-match - used for UNANCHORED or ANCHOR_START searches + // if opt.longest_match() == false + // forward, longest-match - used for all ANCHOR_BOTH searches, + // and the other two kinds if + // opt.longest_match() == true + // reverse, first-match - never used + // reverse, longest-match - used as second phase for unanchored searches + // + // The RE2 memory budget is statically divided between the two + // Progs and then the DFAs: two thirds to the forward Prog + // and one third to the reverse Prog. The forward Prog gives half + // of what it has left over to each of its DFAs. The reverse Prog + // gives it all to its longest-match DFA. + // + // Once a DFA fills its budget, it flushes its cache and starts over. + // If this happens too often, RE2 falls back on the NFA implementation. + + // For now, make the default budget something close to Code Search. + static const int kDefaultMaxMem = 8<<20; + + enum Encoding { + EncodingUTF8 = 1, + EncodingLatin1 + }; + + Options() : + encoding_(EncodingUTF8), + posix_syntax_(false), + longest_match_(false), + log_errors_(true), + max_mem_(kDefaultMaxMem), + literal_(false), + never_nl_(false), + dot_nl_(false), + never_capture_(false), + case_sensitive_(true), + perl_classes_(false), + word_boundary_(false), + one_line_(false) { + } + + /*implicit*/ Options(CannedOptions); + + Encoding encoding() const { return encoding_; } + void set_encoding(Encoding encoding) { encoding_ = encoding; } + + bool posix_syntax() const { return posix_syntax_; } + void set_posix_syntax(bool b) { posix_syntax_ = b; } + + bool longest_match() const { return longest_match_; } + void set_longest_match(bool b) { longest_match_ = b; } + + bool log_errors() const { return log_errors_; } + void set_log_errors(bool b) { log_errors_ = b; } + + int64_t max_mem() const { return max_mem_; } + void set_max_mem(int64_t m) { max_mem_ = m; } + + bool literal() const { return literal_; } + void set_literal(bool 
b) { literal_ = b; } + + bool never_nl() const { return never_nl_; } + void set_never_nl(bool b) { never_nl_ = b; } + + bool dot_nl() const { return dot_nl_; } + void set_dot_nl(bool b) { dot_nl_ = b; } + + bool never_capture() const { return never_capture_; } + void set_never_capture(bool b) { never_capture_ = b; } + + bool case_sensitive() const { return case_sensitive_; } + void set_case_sensitive(bool b) { case_sensitive_ = b; } + + bool perl_classes() const { return perl_classes_; } + void set_perl_classes(bool b) { perl_classes_ = b; } + + bool word_boundary() const { return word_boundary_; } + void set_word_boundary(bool b) { word_boundary_ = b; } + + bool one_line() const { return one_line_; } + void set_one_line(bool b) { one_line_ = b; } + + void Copy(const Options& src) { + *this = src; + } + + int ParseFlags() const; + + private: + Encoding encoding_; + bool posix_syntax_; + bool longest_match_; + bool log_errors_; + int64_t max_mem_; + bool literal_; + bool never_nl_; + bool dot_nl_; + bool never_capture_; + bool case_sensitive_; + bool perl_classes_; + bool word_boundary_; + bool one_line_; + }; + + // Returns the options set in the constructor. + const Options& options() const { return options_; } + + // Argument converters; see below. 
+ template + static Arg CRadix(T* ptr); + template + static Arg Hex(T* ptr); + template + static Arg Octal(T* ptr); + + private: + void Init(const StringPiece& pattern, const Options& options); + + bool DoMatch(const StringPiece& text, + Anchor re_anchor, + size_t* consumed, + const Arg* const args[], + int n) const; + + re2::Prog* ReverseProg() const; + + std::string pattern_; // string regular expression + Options options_; // option flags + re2::Regexp* entire_regexp_; // parsed regular expression + const std::string* error_; // error indicator (or points to empty string) + ErrorCode error_code_; // error code + std::string error_arg_; // fragment of regexp showing error + std::string prefix_; // required prefix (before suffix_regexp_) + bool prefix_foldcase_; // prefix_ is ASCII case-insensitive + re2::Regexp* suffix_regexp_; // parsed regular expression, prefix_ removed + re2::Prog* prog_; // compiled program for regexp + int num_captures_; // number of capturing groups + bool is_one_pass_; // can use prog_->SearchOnePass? + + // Reverse Prog for DFA execution only + mutable re2::Prog* rprog_; + // Map from capture names to indices + mutable const std::map* named_groups_; + // Map from capture indices to names + mutable const std::map* group_names_; + + mutable std::once_flag rprog_once_; + mutable std::once_flag named_groups_once_; + mutable std::once_flag group_names_once_; + + RE2(const RE2&) = delete; + RE2& operator=(const RE2&) = delete; +}; + +/***** Implementation details *****/ + +namespace re2_internal { + +// Types for which the 3-ary Parse() function template has specializations. 
+template struct Parse3ary : public std::false_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; + +template +bool Parse(const char* str, size_t n, T* dest); + +// Types for which the 4-ary Parse() function template has specializations. +template struct Parse4ary : public std::false_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; + +template +bool Parse(const char* str, size_t n, T* dest, int radix); + +} // namespace re2_internal + +class RE2::Arg { + private: + template + using CanParse3ary = typename std::enable_if< + re2_internal::Parse3ary::value, + int>::type; + + template + using CanParse4ary = typename std::enable_if< + re2_internal::Parse4ary::value, + int>::type; + +#if !defined(_MSC_VER) + template + using CanParseFrom = typename std::enable_if< + std::is_member_function_pointer< + decltype(static_cast( + &T::ParseFrom))>::value, + int>::type; +#endif + + public: + Arg() : Arg(nullptr) {} + Arg(std::nullptr_t ptr) : arg_(ptr), parser_(DoNothing) {} + + template = 0> + Arg(T* ptr) : arg_(ptr), parser_(DoParse3ary) {} + + template = 0> + Arg(T* ptr) : arg_(ptr), parser_(DoParse4ary) {} + +#if !defined(_MSC_VER) + template = 
0> + Arg(T* ptr) : arg_(ptr), parser_(DoParseFrom) {} +#endif + + typedef bool (*Parser)(const char* str, size_t n, void* dest); + + template + Arg(T* ptr, Parser parser) : arg_(ptr), parser_(parser) {} + + bool Parse(const char* str, size_t n) const { + return (*parser_)(str, n, arg_); + } + + private: + static bool DoNothing(const char* /*str*/, size_t /*n*/, void* /*dest*/) { + return true; + } + + template + static bool DoParse3ary(const char* str, size_t n, void* dest) { + return re2_internal::Parse(str, n, reinterpret_cast(dest)); + } + + template + static bool DoParse4ary(const char* str, size_t n, void* dest) { + return re2_internal::Parse(str, n, reinterpret_cast(dest), 10); + } + +#if !defined(_MSC_VER) + template + static bool DoParseFrom(const char* str, size_t n, void* dest) { + if (dest == NULL) return true; + return reinterpret_cast(dest)->ParseFrom(str, n); + } +#endif + + void* arg_; + Parser parser_; +}; + +template +inline RE2::Arg RE2::CRadix(T* ptr) { + return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { + return re2_internal::Parse(str, n, reinterpret_cast(dest), 0); + }); +} + +template +inline RE2::Arg RE2::Hex(T* ptr) { + return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { + return re2_internal::Parse(str, n, reinterpret_cast(dest), 16); + }); +} + +template +inline RE2::Arg RE2::Octal(T* ptr) { + return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { + return re2_internal::Parse(str, n, reinterpret_cast(dest), 8); + }); +} + +#ifndef SWIG +// Silence warnings about missing initializers for members of LazyRE2. +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#endif + +// Helper for writing global or static RE2s safely. +// Write +// static LazyRE2 re = {".*"}; +// and then use *re instead of writing +// static RE2 re(".*"); +// The former is more careful about multithreaded +// situations than the latter. 
+// +// N.B. This class never deletes the RE2 object that +// it constructs: that's a feature, so that it can be used +// for global and function static variables. +class LazyRE2 { + private: + struct NoArg {}; + + public: + typedef RE2 element_type; // support std::pointer_traits + + // Constructor omitted to preserve braced initialization in C++98. + + // Pretend to be a pointer to Type (never NULL due to on-demand creation): + RE2& operator*() const { return *get(); } + RE2* operator->() const { return get(); } + + // Named accessor/initializer: + RE2* get() const { + std::call_once(once_, &LazyRE2::Init, this); + return ptr_; + } + + // All data fields must be public to support {"foo"} initialization. + const char* pattern_; + RE2::CannedOptions options_; + NoArg barrier_against_excess_initializers_; + + mutable RE2* ptr_; + mutable std::once_flag once_; + + private: + static void Init(const LazyRE2* lazy_re2) { + lazy_re2->ptr_ = new RE2(lazy_re2->pattern_, lazy_re2->options_); + } + + void operator=(const LazyRE2&); // disallowed +}; +#endif + +namespace hooks { + +// Most platforms support thread_local. Older versions of iOS don't support +// thread_local, but for the sake of brevity, we lump together all versions +// of Apple platforms that aren't macOS. If an iOS application really needs +// the context pointee someday, we can get more specific then... +// +// As per https://github.com/google/re2/issues/325, thread_local support in +// MinGW seems to be buggy. (FWIW, Abseil folks also avoid it.) +#define RE2_HAVE_THREAD_LOCAL +#if (defined(__APPLE__) && !TARGET_OS_OSX) || defined(__MINGW32__) +#undef RE2_HAVE_THREAD_LOCAL +#endif + +// A hook must not make any assumptions regarding the lifetime of the context +// pointee beyond the current invocation of the hook. Pointers and references +// obtained via the context pointee should be considered invalidated when the +// hook returns. Hence, any data about the context pointee (e.g. 
its pattern) +// would have to be copied in order for it to be kept for an indefinite time. +// +// A hook must not use RE2 for matching. Control flow reentering RE2::Match() +// could result in infinite mutual recursion. To discourage that possibility, +// RE2 will not maintain the context pointer correctly when used in that way. +#ifdef RE2_HAVE_THREAD_LOCAL +extern thread_local const RE2* context; +#endif + +struct DFAStateCacheReset { + int64_t state_budget; + size_t state_cache_size; +}; + +struct DFASearchFailure { + // Nothing yet... +}; + +#define DECLARE_HOOK(type) \ + using type##Callback = void(const type&); \ + void Set##type##Hook(type##Callback* cb); \ + type##Callback* Get##type##Hook(); + +DECLARE_HOOK(DFAStateCacheReset) +DECLARE_HOOK(DFASearchFailure) + +#undef DECLARE_HOOK + +} // namespace hooks + +} // namespace re2 + +using re2::RE2; +using re2::LazyRE2; + +#endif // RE2_RE2_H_ diff --git a/re2/regex_internal.h b/re2/regex_internal.h new file mode 100644 index 0000000000000000000000000000000000000000..5824c8073bb55e07548a7d5f206f0f217cc13a51 --- /dev/null +++ b/re2/regex_internal.h @@ -0,0 +1,57 @@ +// #include "re2/sparse_array.h" +#include +namespace re2 { +// #include "re2/prog.h" +// Compiled form of regexp program. + class Prog { + // rure renamed to Prog +}; + +// #include "re2/regexp.h" +class Regexp { + public: + + // Flags for parsing. Can be ORed together. + enum ParseFlags { + NoParseFlags = 0, + FoldCase = 1<<0, // Fold case during matching (case-insensitive). + Literal = 1<<1, // Treat s as literal string instead of a regexp. + ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s + // and [[:space:]] to match newline. + DotNL = 1<<3, // Allow . to match newline. + MatchNL = ClassNL | DotNL, + OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and + // end of text, not around embedded newlines. + // (Perl's default) + Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8. 
+ NonGreedy = 1<<6, // Repetition operators are non-greedy by default. + PerlClasses = 1<<7, // Allow Perl character classes like \d. + PerlB = 1<<8, // Allow Perl's \b and \B. + PerlX = 1<<9, // Perl extensions: + // non-capturing parens - (?: ) + // non-greedy operators - *? +? ?? {}? + // flag edits - (?i) (?-i) (?i: ) + // i - FoldCase + // m - !OneLine + // s - DotNL + // U - NonGreedy + // line ends: \A \z + // \Q and \E to disable/enable metacharacters + // (?P<name>expr) for named captures + // \C to match any single byte + UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group + // and \P{Han} for its negation. + NeverNL = 1<<11, // Never match NL, even if the regexp mentions + // it explicitly. + NeverCapture = 1<<12, // Parse all parens as non-capturing. + + // As close to Perl as we can get. + LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX | + UnicodeGroups, + + // Internal use only. + WasDollar = 1<<13, // on kRegexpEndText: was $ in regexp text + AllParseFlags = (1<<14)-1, + }; +}; +}; \ No newline at end of file diff --git a/re2/set.cc b/re2/set.cc new file mode 100644 index 0000000000000000000000000000000000000000..f62fcd7088a00f8eefac0b57e0fe14c5878e95a8 --- /dev/null +++ b/re2/set.cc @@ -0,0 +1,178 @@ +// Copyright 2010 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "re2/set.h" + +#include +#include +#include +#include + +#include "util/util.h" +#include "util/logging.h" +// #include "re2/pod_array.h" +// #include "re2/prog.h" +#include "re2/re2.h" +// #include "re2/regexp.h" +#include "regex_internal.h" +#include "re2/stringpiece.h" + +namespace re2 { + +RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) + : options_(options), + anchor_(anchor), + compiled_(false), + size_(0) { + options_.set_never_capture(true); // might unblock some optimisations +} + +RE2::Set::~Set() { + for (size_t i = 0; i < elem_.size(); i++) + ;// elem_[i].second->Decref(); +} + +RE2::Set::Set(Set&& other) + : options_(other.options_), + anchor_(other.anchor_), + elem_(std::move(other.elem_)), + compiled_(other.compiled_), + size_(other.size_), + prog_(std::move(other.prog_)) { + other.elem_.clear(); + other.elem_.shrink_to_fit(); + other.compiled_ = false; + other.size_ = 0; + other.prog_.reset(); +} + +RE2::Set& RE2::Set::operator=(Set&& other) { + this->~Set(); + (void) new (this) Set(std::move(other)); + return *this; +} + +int RE2::Set::Add(const StringPiece& pattern, std::string* error) { + // if (compiled_) { + // LOG(DFATAL) << "RE2::Set::Add() called after compiling"; + // return -1; + // } + + // Regexp::ParseFlags pf = static_cast( + // options_.ParseFlags()); + // RegexpStatus status; + // re2::Regexp* re = Regexp::Parse(pattern, pf, &status); + // if (re == NULL) { + // if (error != NULL) + // *error = status.Text(); + // if (options_.log_errors()) + // LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text(); + // return -1; + // } + + // // Concatenate with match index and push on vector. 
+ // int n = static_cast(elem_.size()); + // re2::Regexp* m = re2::Regexp::HaveMatch(n, pf); + // if (re->op() == kRegexpConcat) { + // int nsub = re->nsub(); + // PODArray sub(nsub + 1); + // for (int i = 0; i < nsub; i++) + // sub[i] = re->sub()[i]->Incref(); + // sub[nsub] = m; + // re->Decref(); + // re = re2::Regexp::Concat(sub.data(), nsub + 1, pf); + // } else { + // re2::Regexp* sub[2]; + // sub[0] = re; + // sub[1] = m; + // re = re2::Regexp::Concat(sub, 2, pf); + // } + // elem_.emplace_back(std::string(pattern), re); + // return n; + return 0; +} + +bool RE2::Set::Compile() { + // if (compiled_) { + // LOG(DFATAL) << "RE2::Set::Compile() called more than once"; + // return false; + // } + // compiled_ = true; + // size_ = static_cast(elem_.size()); + + // // Sort the elements by their patterns. This is good enough for now + // // until we have a Regexp comparison function. (Maybe someday...) + // std::sort(elem_.begin(), elem_.end(), + // [](const Elem& a, const Elem& b) -> bool { + // return a.first < b.first; + // }); + + // PODArray sub(size_); + // for (int i = 0; i < size_; i++) + // sub[i] = elem_[i].second; + // elem_.clear(); + // elem_.shrink_to_fit(); + + // Regexp::ParseFlags pf = static_cast( + // options_.ParseFlags()); + // re2::Regexp* re = re2::Regexp::Alternate(sub.data(), size_, pf); + + // prog_.reset(Prog::CompileSet(re, anchor_, options_.max_mem())); + // re->Decref(); + return prog_ != nullptr; +} + +bool RE2::Set::Match(const StringPiece& text, std::vector* v) const { + return Match(text, v, NULL); +} + +bool RE2::Set::Match(const StringPiece& text, std::vector* v, + ErrorInfo* error_info) const { +// if (!compiled_) { +// LOG(DFATAL) << "RE2::Set::Match() called before compiling"; +// if (error_info != NULL) +// error_info->kind = kNotCompiled; +// return false; +// } +// #ifdef RE2_HAVE_THREAD_LOCAL +// hooks::context = NULL; +// #endif +// bool dfa_failed = false; +// std::unique_ptr matches; +// if (v != NULL) { +// 
matches.reset(new SparseSet(size_)); +// v->clear(); +// } +// bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, Prog::kManyMatch, +// NULL, &dfa_failed, matches.get()); +// if (dfa_failed) { +// if (options_.log_errors()) +// LOG(ERROR) << "DFA out of memory: " +// << "program size " << prog_->size() << ", " +// << "list count " << prog_->list_count() << ", " +// << "bytemap range " << prog_->bytemap_range(); +// if (error_info != NULL) +// error_info->kind = kOutOfMemory; +// return false; +// } +// if (ret == false) { +// if (error_info != NULL) +// error_info->kind = kNoError; +// return false; +// } +// if (v != NULL) { +// if (matches->empty()) { +// LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!"; +// if (error_info != NULL) +// error_info->kind = kInconsistent; +// return false; +// } +// v->assign(matches->begin(), matches->end()); +// } +// if (error_info != NULL) +// error_info->kind = kNoError; + return true; +} + +} // namespace re2 diff --git a/re2/set.h b/re2/set.h new file mode 100644 index 0000000000000000000000000000000000000000..8d64f30ccd94073058de740e22fb110d013de506 --- /dev/null +++ b/re2/set.h @@ -0,0 +1,85 @@ +// Copyright 2010 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef RE2_SET_H_ +#define RE2_SET_H_ + +#include +#include +#include +#include + +#include "re2/re2.h" + +namespace re2 { +class Prog; +class Regexp; +} // namespace re2 + +namespace re2 { + +// An RE2::Set represents a collection of regexps that can +// be searched for simultaneously. +class RE2::Set { + public: + enum ErrorKind { + kNoError = 0, + kNotCompiled, // The set is not compiled. + kOutOfMemory, // The DFA ran out of memory. + kInconsistent, // The result is inconsistent. This should never happen. 
+ }; + + struct ErrorInfo { + ErrorKind kind; + }; + + Set(const RE2::Options& options, RE2::Anchor anchor); + ~Set(); + + // Not copyable. + Set(const Set&) = delete; + Set& operator=(const Set&) = delete; + // Movable. + Set(Set&& other); + Set& operator=(Set&& other); + + // Adds pattern to the set using the options passed to the constructor. + // Returns the index that will identify the regexp in the output of Match(), + // or -1 if the regexp cannot be parsed. + // Indices are assigned in sequential order starting from 0. + // Errors do not increment the index; if error is not NULL, *error will hold + // the error message from the parser. + int Add(const StringPiece& pattern, std::string* error); + + // Compiles the set in preparation for matching. + // Returns false if the compiler runs out of memory. + // Add() must not be called again after Compile(). + // Compile() must be called before Match(). + bool Compile(); + + // Returns true if text matches at least one of the regexps in the set. + // Fills v (if not NULL) with the indices of the matching regexps. + // Callers must not expect v to be sorted. + bool Match(const StringPiece& text, std::vector* v) const; + + // As above, but populates error_info (if not NULL) when none of the regexps + // in the set matched. This can inform callers when DFA execution fails, for + // example, because they might wish to handle that case differently. + bool Match(const StringPiece& text, std::vector* v, + ErrorInfo* error_info) const; + + private: + typedef std::pair Elem; + + RE2::Options options_; + RE2::Anchor anchor_; + std::vector elem_; + bool compiled_; + int size_; + std::unique_ptr prog_; +}; + +} // namespace re2 + +#endif // RE2_SET_H_ diff --git a/re2/stringpiece.cc b/re2/stringpiece.cc new file mode 100644 index 0000000000000000000000000000000000000000..ef2e2874ead91d1324d08bf6a0e7c2c528e6e2d8 --- /dev/null +++ b/re2/stringpiece.cc @@ -0,0 +1,65 @@ +// Copyright 2004 The RE2 Authors. All Rights Reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "re2/stringpiece.h" + +#include + +#include "util/util.h" + +namespace re2 { + +const StringPiece::size_type StringPiece::npos; // initialized in stringpiece.h + +StringPiece::size_type StringPiece::copy(char* buf, size_type n, + size_type pos) const { + size_type ret = std::min(size_ - pos, n); + memcpy(buf, data_ + pos, ret); + return ret; +} + +StringPiece StringPiece::substr(size_type pos, size_type n) const { + if (pos > size_) pos = size_; + if (n > size_ - pos) n = size_ - pos; + return StringPiece(data_ + pos, n); +} + +StringPiece::size_type StringPiece::find(const StringPiece& s, + size_type pos) const { + if (pos > size_) return npos; + const_pointer result = std::search(data_ + pos, data_ + size_, + s.data_, s.data_ + s.size_); + size_type xpos = result - data_; + return xpos + s.size_ <= size_ ? xpos : npos; +} + +StringPiece::size_type StringPiece::find(char c, size_type pos) const { + if (size_ <= 0 || pos >= size_) return npos; + const_pointer result = std::find(data_ + pos, data_ + size_, c); + return result != data_ + size_ ? result - data_ : npos; +} + +StringPiece::size_type StringPiece::rfind(const StringPiece& s, + size_type pos) const { + if (size_ < s.size_) return npos; + if (s.size_ == 0) return std::min(size_, pos); + const_pointer last = data_ + std::min(size_ - s.size_, pos) + s.size_; + const_pointer result = std::find_end(data_, last, s.data_, s.data_ + s.size_); + return result != last ? 
result - data_ : npos; +} + +StringPiece::size_type StringPiece::rfind(char c, size_type pos) const { + if (size_ <= 0) return npos; + for (size_t i = std::min(pos + 1, size_); i != 0;) { + if (data_[--i] == c) return i; + } + return npos; +} + +std::ostream& operator<<(std::ostream& o, const StringPiece& p) { + o.write(p.data(), p.size()); + return o; +} + +} // namespace re2 diff --git a/re2/stringpiece.h b/re2/stringpiece.h new file mode 100644 index 0000000000000000000000000000000000000000..1d9c2d3d2c34d245d4dd78978c363e6b694041c5 --- /dev/null +++ b/re2/stringpiece.h @@ -0,0 +1,210 @@ +// Copyright 2001-2010 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef RE2_STRINGPIECE_H_ +#define RE2_STRINGPIECE_H_ + +// A string-like object that points to a sized piece of memory. +// +// Functions or methods may use const StringPiece& parameters to accept either +// a "const char*" or a "string" value that will be implicitly converted to +// a StringPiece. The implicit conversion means that it is often appropriate +// to include this .h file in other files rather than forward-declaring +// StringPiece as would be appropriate for most other Google classes. +// +// Systematic usage of StringPiece is encouraged as it will reduce unnecessary +// conversions from "const char*" to "string" and back again. +// +// +// Arghh! I wish C++ literals were "string". + +// Doing this simplifies the logic below. 
+#ifndef __has_include +#define __has_include(x) 0 +#endif + +#include +#include +#include +#include +#include +#include +#if __has_include() && __cplusplus >= 201703L +#include +#endif + +namespace re2 { + +class StringPiece { + public: + typedef std::char_traits traits_type; + typedef char value_type; + typedef char* pointer; + typedef const char* const_pointer; + typedef char& reference; + typedef const char& const_reference; + typedef const char* const_iterator; + typedef const_iterator iterator; + typedef std::reverse_iterator const_reverse_iterator; + typedef const_reverse_iterator reverse_iterator; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + static const size_type npos = static_cast(-1); + + // We provide non-explicit singleton constructors so users can pass + // in a "const char*" or a "string" wherever a "StringPiece" is + // expected. + StringPiece() + : data_(NULL), size_(0) {} +#if __has_include() && __cplusplus >= 201703L + StringPiece(const std::string_view& str) + : data_(str.data()), size_(str.size()) {} +#endif + StringPiece(const std::string& str) + : data_(str.data()), size_(str.size()) {} + StringPiece(const char* str) + : data_(str), size_(str == NULL ? 
0 : strlen(str)) {} + StringPiece(const char* str, size_type len) + : data_(str), size_(len) {} + + const_iterator begin() const { return data_; } + const_iterator end() const { return data_ + size_; } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(data_ + size_); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(data_); + } + + size_type size() const { return size_; } + size_type length() const { return size_; } + bool empty() const { return size_ == 0; } + + const_reference operator[](size_type i) const { return data_[i]; } + const_pointer data() const { return data_; } + + void remove_prefix(size_type n) { + data_ += n; + size_ -= n; + } + + void remove_suffix(size_type n) { + size_ -= n; + } + + void set(const char* str) { + data_ = str; + size_ = str == NULL ? 0 : strlen(str); + } + + void set(const char* str, size_type len) { + data_ = str; + size_ = len; + } + + // Converts to `std::basic_string`. + template + explicit operator std::basic_string() const { + if (!data_) return {}; + return std::basic_string(data_, size_); + } + + std::string as_string() const { + return std::string(data_, size_); + } + + // We also define ToString() here, since many other string-like + // interfaces name the routine that converts to a C++ string + // "ToString", and it's confusing to have the method that does that + // for a StringPiece be called "as_string()". We also leave the + // "as_string()" method defined here for existing code. 
+ std::string ToString() const { + return std::string(data_, size_); + } + + void CopyToString(std::string* target) const { + target->assign(data_, size_); + } + + void AppendToString(std::string* target) const { + target->append(data_, size_); + } + + size_type copy(char* buf, size_type n, size_type pos = 0) const; + StringPiece substr(size_type pos = 0, size_type n = npos) const; + + int compare(const StringPiece& x) const { + size_type min_size = std::min(size(), x.size()); + if (min_size > 0) { + int r = memcmp(data(), x.data(), min_size); + if (r < 0) return -1; + if (r > 0) return 1; + } + if (size() < x.size()) return -1; + if (size() > x.size()) return 1; + return 0; + } + + // Does "this" start with "x"? + bool starts_with(const StringPiece& x) const { + return x.empty() || + (size() >= x.size() && memcmp(data(), x.data(), x.size()) == 0); + } + + // Does "this" end with "x"? + bool ends_with(const StringPiece& x) const { + return x.empty() || + (size() >= x.size() && + memcmp(data() + (size() - x.size()), x.data(), x.size()) == 0); + } + + bool contains(const StringPiece& s) const { + return find(s) != npos; + } + + size_type find(const StringPiece& s, size_type pos = 0) const; + size_type find(char c, size_type pos = 0) const; + size_type rfind(const StringPiece& s, size_type pos = npos) const; + size_type rfind(char c, size_type pos = npos) const; + + private: + const_pointer data_; + size_type size_; +}; + +inline bool operator==(const StringPiece& x, const StringPiece& y) { + StringPiece::size_type len = x.size(); + if (len != y.size()) return false; + return x.data() == y.data() || len == 0 || + memcmp(x.data(), y.data(), len) == 0; +} + +inline bool operator!=(const StringPiece& x, const StringPiece& y) { + return !(x == y); +} + +inline bool operator<(const StringPiece& x, const StringPiece& y) { + StringPiece::size_type min_size = std::min(x.size(), y.size()); + int r = min_size == 0 ? 
0 : memcmp(x.data(), y.data(), min_size); + return (r < 0) || (r == 0 && x.size() < y.size()); +} + +inline bool operator>(const StringPiece& x, const StringPiece& y) { + return y < x; +} + +inline bool operator<=(const StringPiece& x, const StringPiece& y) { + return !(x > y); +} + +inline bool operator>=(const StringPiece& x, const StringPiece& y) { + return !(x < y); +} + +// Allow StringPiece to be logged. +std::ostream& operator<<(std::ostream& o, const StringPiece& p); + +} // namespace re2 + +#endif // RE2_STRINGPIECE_H_ diff --git a/re2/testing/filtered_re2_test.cc b/re2/testing/filtered_re2_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..c788fdadc49b2f7ae280fef9289f79a5ee172fde --- /dev/null +++ b/re2/testing/filtered_re2_test.cc @@ -0,0 +1,340 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include +#include +#include +#include + +#include "util/test.h" +#include "util/logging.h" +#include "re2/filtered_re2.h" +#include "re2/re2.h" + +namespace re2 { + +struct FilterTestVars { + FilterTestVars() {} + explicit FilterTestVars(int min_atom_len) : f(min_atom_len) {} + + std::vector atoms; + std::vector atom_indices; + std::vector matches; + RE2::Options opts; + FilteredRE2 f; +}; + +TEST(FilteredRE2Test, EmptyTest) { + FilterTestVars v; + + v.f.Compile(&v.atoms); + EXPECT_EQ(0, v.atoms.size()); + + // Compile has no effect at all when called before Add: it will not + // record that it has been called and it will not clear the vector. + // The second point does not matter here, but the first point means + // that an error will be logged during the call to AllMatches. 
+ v.f.AllMatches("foo", v.atom_indices, &v.matches); + EXPECT_EQ(0, v.matches.size()); +} + +TEST(FilteredRE2Test, SmallOrTest) { + FilterTestVars v(4); // override the minimum atom length + int id; + v.f.Add("(foo|bar)", v.opts, &id); + + v.f.Compile(&v.atoms); + EXPECT_EQ(0, v.atoms.size()); + + v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches); + EXPECT_EQ(1, v.matches.size()); + EXPECT_EQ(id, v.matches[0]); +} + +TEST(FilteredRE2Test, SmallLatinTest) { + FilterTestVars v; + int id; + + v.opts.set_encoding(RE2::Options::EncodingLatin1); + v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id); + v.f.Compile(&v.atoms); + EXPECT_EQ(1, v.atoms.size()); + EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef"); + + v.atom_indices.push_back(0); + v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches); + EXPECT_EQ(1, v.matches.size()); + EXPECT_EQ(id, v.matches[0]); +} + +struct AtomTest { + const char* testname; + // If any test needs more than this many regexps or atoms, increase + // the size of the corresponding array. + const char* regexps[20]; + const char* atoms[20]; +}; + +AtomTest atom_tests[] = { + { + // This test checks to make sure empty patterns are allowed. + "CheckEmptyPattern", + {""}, + {} + }, { + // This test checks that all atoms of length greater than min length + // are found, and no atoms that are of smaller length are found. + "AllAtomsGtMinLengthFound", { + "(abc123|def456|ghi789).*mnop[x-z]+", + "abc..yyy..zz", + "mnmnpp[a-z]+PPP" + }, { + "abc123", + "def456", + "ghi789", + "mnop", + "abc", + "yyy", + "mnmnpp", + "ppp" + } + }, { + // Test to make sure that any atoms that have another atom as a + // substring in an OR are removed; that is, only the shortest + // substring is kept. + "SubstrAtomRemovesSuperStrInOr", { + "(abc123|abc|ghi789|abc1234).*[x-z]+", + "abcd..yyy..yyyzzz", + "mnmnpp[a-z]+PPP" + }, { + "abc", + "ghi789", + "abcd", + "yyy", + "yyyzzz", + "mnmnpp", + "ppp" + } + }, { + // Test character class expansion. 
+ "CharClassExpansion", { + "m[a-c][d-f]n.*[x-z]+", + "[x-y]bcde[ab]" + }, { + "madn", "maen", "mafn", + "mbdn", "mben", "mbfn", + "mcdn", "mcen", "mcfn", + "xbcdea", "xbcdeb", + "ybcdea", "ybcdeb" + } + }, { + // Test upper/lower of non-ASCII. + "UnicodeLower", { + "(?i)ΔδΠϖπΣςσ", + "ΛΜΝΟΠ", + "ψρστυ", + }, { + "δδπππσσσ", + "λμνοπ", + "ψρστυ", + }, + }, +}; + +void AddRegexpsAndCompile(const char* regexps[], + size_t n, + struct FilterTestVars* v) { + for (size_t i = 0; i < n; i++) { + int id; + v->f.Add(regexps[i], v->opts, &id); + } + v->f.Compile(&v->atoms); +} + +bool CheckExpectedAtoms(const char* atoms[], + size_t n, + const char* testname, + struct FilterTestVars* v) { + std::vector expected; + for (size_t i = 0; i < n; i++) + expected.push_back(atoms[i]); + + bool pass = expected.size() == v->atoms.size(); + + std::sort(v->atoms.begin(), v->atoms.end()); + std::sort(expected.begin(), expected.end()); + for (size_t i = 0; pass && i < n; i++) + pass = pass && expected[i] == v->atoms[i]; + + if (!pass) { + LOG(ERROR) << "Failed " << testname; + LOG(ERROR) << "Expected #atoms = " << expected.size(); + for (size_t i = 0; i < expected.size(); i++) + LOG(ERROR) << expected[i]; + LOG(ERROR) << "Found #atoms = " << v->atoms.size(); + for (size_t i = 0; i < v->atoms.size(); i++) + LOG(ERROR) << v->atoms[i]; + } + + return pass; +} + +TEST(FilteredRE2Test, AtomTests) { + int nfail = 0; + for (size_t i = 0; i < arraysize(atom_tests); i++) { + FilterTestVars v; + AtomTest* t = &atom_tests[i]; + size_t nregexp, natom; + for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) + if (t->regexps[nregexp] == NULL) + break; + for (natom = 0; natom < arraysize(t->atoms); natom++) + if (t->atoms[natom] == NULL) + break; + AddRegexpsAndCompile(t->regexps, nregexp, &v); + if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &v)) + nfail++; + } + EXPECT_EQ(0, nfail); +} + +void FindAtomIndices(const std::vector& atoms, + const std::vector& matched_atoms, + std::vector* 
atom_indices) { + atom_indices->clear(); + for (size_t i = 0; i < matched_atoms.size(); i++) { + for (size_t j = 0; j < atoms.size(); j++) { + if (matched_atoms[i] == atoms[j]) { + atom_indices->push_back(static_cast(j)); + break; + } + } + } +} + +TEST(FilteredRE2Test, MatchEmptyPattern) { + FilterTestVars v; + AtomTest* t = &atom_tests[0]; + // We are using the regexps used in one of the atom tests + // for this test. Adding the EXPECT here to make sure + // the index we use for the test is for the correct test. + EXPECT_EQ("CheckEmptyPattern", std::string(t->testname)); + size_t nregexp; + for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) + if (t->regexps[nregexp] == NULL) + break; + AddRegexpsAndCompile(t->regexps, nregexp, &v); + std::string text = "0123"; + std::vector atom_ids; + std::vector matching_regexps; + EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids)); +} + +TEST(FilteredRE2Test, MatchTests) { + FilterTestVars v; + AtomTest* t = &atom_tests[2]; + // We are using the regexps used in one of the atom tests + // for this test. 
+ EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", std::string(t->testname)); + size_t nregexp; + for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) + if (t->regexps[nregexp] == NULL) + break; + AddRegexpsAndCompile(t->regexps, nregexp, &v); + + std::string text = "abc121212xyz"; + // atoms = abc + std::vector atom_ids; + std::vector atoms; + atoms.push_back("abc"); + FindAtomIndices(v.atoms, atoms, &atom_ids); + std::vector matching_regexps; + v.f.AllMatches(text, atom_ids, &matching_regexps); + EXPECT_EQ(1, matching_regexps.size()); + + text = "abc12312yyyzzz"; + atoms.clear(); + atoms.push_back("abc"); + atoms.push_back("yyy"); + atoms.push_back("yyyzzz"); + FindAtomIndices(v.atoms, atoms, &atom_ids); + v.f.AllMatches(text, atom_ids, &matching_regexps); + EXPECT_EQ(1, matching_regexps.size()); + + text = "abcd12yyy32yyyzzz"; + atoms.clear(); + atoms.push_back("abc"); + atoms.push_back("abcd"); + atoms.push_back("yyy"); + atoms.push_back("yyyzzz"); + FindAtomIndices(v.atoms, atoms, &atom_ids); + LOG(INFO) << "S: " << atom_ids.size(); + for (size_t i = 0; i < atom_ids.size(); i++) + LOG(INFO) << "i: " << i << " : " << atom_ids[i]; + v.f.AllMatches(text, atom_ids, &matching_regexps); + EXPECT_EQ(2, matching_regexps.size()); +} + +TEST(FilteredRE2Test, EmptyStringInStringSetBug) { + // Bug due to find() finding "" at the start of everything in a string + // set and thus SimplifyStringSet() would end up erasing everything. + // In order to test this, we have to keep PrefilterTree from discarding + // the OR entirely, so we have to make the minimum atom length zero. 
+ + FilterTestVars v(0); // override the minimum atom length + const char* regexps[] = {"-R.+(|ADD=;AA){12}}"}; + const char* atoms[] = {"", "-r", "add=;aa", "}"}; + AddRegexpsAndCompile(regexps, arraysize(regexps), &v); + EXPECT_TRUE(CheckExpectedAtoms(atoms, arraysize(atoms), + "EmptyStringInStringSetBug", &v)); +} + +TEST(FilteredRE2Test, MoveSemantics) { + FilterTestVars v1; + int id; + v1.f.Add("foo\\d+", v1.opts, &id); + EXPECT_EQ(0, id); + v1.f.Compile(&v1.atoms); + EXPECT_EQ(1, v1.atoms.size()); + EXPECT_EQ("foo", v1.atoms[0]); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); + + // The moved-to object should do what the moved-from object did. + FilterTestVars v2; + v2.f = std::move(v1.f); + v2.f.AllMatches("abc foo1 xyz", {0}, &v2.matches); + EXPECT_EQ(1, v2.matches.size()); + EXPECT_EQ(0, v2.matches[0]); + v2.f.AllMatches("abc bar2 xyz", {0}, &v2.matches); + EXPECT_EQ(0, v2.matches.size()); + + // The moved-from object should have been reset and be reusable. + v1.f.Add("bar\\d+", v1.opts, &id); + EXPECT_EQ(0, id); + v1.f.Compile(&v1.atoms); + EXPECT_EQ(1, v1.atoms.size()); + EXPECT_EQ("bar", v1.atoms[0]); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + + // Verify that "overwriting" works and also doesn't leak memory. + // (The latter will need a leak detector such as LeakSanitizer.) 
+ v1.f = std::move(v2.f); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); +} + +} // namespace re2 diff --git a/re2/testing/re2_arg_test.cc b/re2/testing/re2_arg_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f62e17cf4772c688a661cc262007c462ae373dd2 --- /dev/null +++ b/re2/testing/re2_arg_test.cc @@ -0,0 +1,160 @@ +// Copyright 2005 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This tests to make sure numbers are parsed from strings +// correctly. +// Todo: Expand the test to validate strings parsed to the other types +// supported by RE2::Arg class + +#include +#include + +#include "util/test.h" +#include "util/logging.h" +#include "re2/re2.h" + +namespace re2 { + +struct SuccessTable { + const char * value_string; + int64_t value; + bool success[6]; +}; + +// Test boundary cases for different integral sizes. +// Specifically I want to make sure that values outside the boundries +// of an integral type will fail and that negative numbers will fail +// for unsigned types. The following table contains the boundaries for +// the various integral types and has entries for whether or not each +// type can contain the given value. 
+const SuccessTable kSuccessTable[] = { +// string integer value i16 u16 i32 u32 i64 u64 +// 0 to 2^7-1 +{ "0", 0, { true, true, true, true, true, true }}, +{ "127", 127, { true, true, true, true, true, true }}, + +// -1 to -2^7 +{ "-1", -1, { true, false, true, false, true, false }}, +{ "-128", -128, { true, false, true, false, true, false }}, + +// 2^7 to 2^8-1 +{ "128", 128, { true, true, true, true, true, true }}, +{ "255", 255, { true, true, true, true, true, true }}, + +// 2^8 to 2^15-1 +{ "256", 256, { true, true, true, true, true, true }}, +{ "32767", 32767, { true, true, true, true, true, true }}, + +// -2^7-1 to -2^15 +{ "-129", -129, { true, false, true, false, true, false }}, +{ "-32768", -32768, { true, false, true, false, true, false }}, + +// 2^15 to 2^16-1 +{ "32768", 32768, { false, true, true, true, true, true }}, +{ "65535", 65535, { false, true, true, true, true, true }}, + +// 2^16 to 2^31-1 +{ "65536", 65536, { false, false, true, true, true, true }}, +{ "2147483647", 2147483647, { false, false, true, true, true, true }}, + +// -2^15-1 to -2^31 +{ "-32769", -32769, { false, false, true, false, true, false }}, +{ "-2147483648", static_cast(0xFFFFFFFF80000000LL), + { false, false, true, false, true, false }}, + +// 2^31 to 2^32-1 +{ "2147483648", 2147483648U, { false, false, false, true, true, true }}, +{ "4294967295", 4294967295U, { false, false, false, true, true, true }}, + +// 2^32 to 2^63-1 +{ "4294967296", 4294967296LL, { false, false, false, false, true, true }}, +{ "9223372036854775807", + 9223372036854775807LL, { false, false, false, false, true, true }}, + +// -2^31-1 to -2^63 +{ "-2147483649", -2147483649LL, { false, false, false, false, true, false }}, +{ "-9223372036854775808", static_cast(0x8000000000000000LL), + { false, false, false, false, true, false }}, + +// 2^63 to 2^64-1 +{ "9223372036854775808", static_cast(9223372036854775808ULL), + { false, false, false, false, false, true }}, +{ "18446744073709551615", 
static_cast(18446744073709551615ULL), + { false, false, false, false, false, true }}, + +// >= 2^64 +{ "18446744073709551616", 0, { false, false, false, false, false, false }}, +}; + +const int kNumStrings = arraysize(kSuccessTable); + +// It's ugly to use a macro, but we apparently can't use the EXPECT_EQ +// macro outside of a TEST block and this seems to be the only way to +// avoid code duplication. I can also pull off a couple nice tricks +// using concatenation for the type I'm checking against. +#define PARSE_FOR_TYPE(type, column) { \ + type r; \ + for (int i = 0; i < kNumStrings; ++i) { \ + RE2::Arg arg(&r); \ + const char* const p = kSuccessTable[i].value_string; \ + bool retval = arg.Parse(p, strlen(p)); \ + bool success = kSuccessTable[i].success[column]; \ + EXPECT_EQ(retval, success) \ + << "Parsing '" << p << "' for type " #type " should return " \ + << success; \ + if (success) { \ + EXPECT_EQ(r, (type)kSuccessTable[i].value); \ + } \ + } \ +} + +TEST(RE2ArgTest, Int16Test) { + PARSE_FOR_TYPE(int16_t, 0); +} + +TEST(RE2ArgTest, Uint16Test) { + PARSE_FOR_TYPE(uint16_t, 1); +} + +TEST(RE2ArgTest, Int32Test) { + PARSE_FOR_TYPE(int32_t, 2); +} + +TEST(RE2ArgTest, Uint32Test) { + PARSE_FOR_TYPE(uint32_t, 3); +} + +TEST(RE2ArgTest, Int64Test) { + PARSE_FOR_TYPE(int64_t, 4); +} + +TEST(RE2ArgTest, Uint64Test) { + PARSE_FOR_TYPE(uint64_t, 5); +} + +TEST(RE2ArgTest, ParseFromTest) { +#if !defined(_MSC_VER) + struct { + bool ParseFrom(const char* str, size_t n) { + LOG(INFO) << "str = " << str << ", n = " << n; + return true; + } + } obj1; + RE2::Arg arg1(&obj1); + EXPECT_TRUE(arg1.Parse("one", 3)); + + struct { + bool ParseFrom(const char* str, size_t n) { + LOG(INFO) << "str = " << str << ", n = " << n; + return false; + } + // Ensure that RE2::Arg works even with overloaded ParseFrom(). 
+ void ParseFrom(const char* str) {} + } obj2; + RE2::Arg arg2(&obj2); + EXPECT_FALSE(arg2.Parse("two", 3)); +#endif +} + +} // namespace re2 diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3ee20531e4f0b2f5d3900702d216e33c4a03b691 --- /dev/null +++ b/re2/testing/re2_test.cc @@ -0,0 +1,1660 @@ +// -*- coding: utf-8 -*- +// Copyright 2002-2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// TODO: Test extractions for PartialMatch/Consume + +#include +#include +#include +#include +#include +#include +#include +#include +#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__) +#include +#include /* for sysconf */ +#endif + +#include "util/test.h" +#include "util/logging.h" +#include "util/strutil.h" +#include "re2/re2.h" + + +namespace re2 { + +TEST(RE2, HexTests) { +#define ASSERT_HEX(type, value) \ + do { \ + type v; \ + ASSERT_TRUE( \ + RE2::FullMatch(#value, "([0-9a-fA-F]+)[uUlL]*", RE2::Hex(&v))); \ + ASSERT_EQ(v, 0x##value); \ + ASSERT_TRUE(RE2::FullMatch("0x" #value, "([0-9a-fA-FxX]+)[uUlL]*", \ + RE2::CRadix(&v))); \ + ASSERT_EQ(v, 0x##value); \ + } while (0) + + ASSERT_HEX(short, 2bad); + ASSERT_HEX(unsigned short, 2badU); + ASSERT_HEX(int, dead); + ASSERT_HEX(unsigned int, deadU); + ASSERT_HEX(long, 7eadbeefL); + ASSERT_HEX(unsigned long, deadbeefUL); + ASSERT_HEX(long long, 12345678deadbeefLL); + ASSERT_HEX(unsigned long long, cafebabedeadbeefULL); + +#undef ASSERT_HEX +} + +TEST(RE2, OctalTests) { +#define ASSERT_OCTAL(type, value) \ + do { \ + type v; \ + ASSERT_TRUE(RE2::FullMatch(#value, "([0-7]+)[uUlL]*", RE2::Octal(&v))); \ + ASSERT_EQ(v, 0##value); \ + ASSERT_TRUE(RE2::FullMatch("0" #value, "([0-9a-fA-FxX]+)[uUlL]*", \ + RE2::CRadix(&v))); \ + ASSERT_EQ(v, 0##value); \ + } while (0) + + ASSERT_OCTAL(short, 77777); + ASSERT_OCTAL(unsigned short, 
177777U); + ASSERT_OCTAL(int, 17777777777); + ASSERT_OCTAL(unsigned int, 37777777777U); + ASSERT_OCTAL(long, 17777777777L); + ASSERT_OCTAL(unsigned long, 37777777777UL); + ASSERT_OCTAL(long long, 777777777777777777777LL); + ASSERT_OCTAL(unsigned long long, 1777777777777777777777ULL); + +#undef ASSERT_OCTAL +} + +TEST(RE2, DecimalTests) { +#define ASSERT_DECIMAL(type, value) \ + do { \ + type v; \ + ASSERT_TRUE(RE2::FullMatch(#value, "(-?[0-9]+)[uUlL]*", &v)); \ + ASSERT_EQ(v, value); \ + ASSERT_TRUE( \ + RE2::FullMatch(#value, "(-?[0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \ + ASSERT_EQ(v, value); \ + } while (0) + + ASSERT_DECIMAL(short, -1); + ASSERT_DECIMAL(unsigned short, 9999); + ASSERT_DECIMAL(int, -1000); + ASSERT_DECIMAL(unsigned int, 12345U); + ASSERT_DECIMAL(long, -10000000L); + ASSERT_DECIMAL(unsigned long, 3083324652U); + ASSERT_DECIMAL(long long, -100000000000000LL); + ASSERT_DECIMAL(unsigned long long, 1234567890987654321ULL); + +#undef ASSERT_DECIMAL +} + +// TEST(RE2, Replace) { +// struct ReplaceTest { +// const char *regexp; +// const char *rewrite; +// const char *original; +// const char *single; +// const char *global; +// int greplace_count; +// }; +// static const ReplaceTest tests[] = { +// { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)", +// "\\2\\1ay", +// "the quick brown fox jumps over the lazy dogs.", +// "ethay quick brown fox jumps over the lazy dogs.", +// "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.", +// 9 }, +// { "\\w+", +// "\\0-NOSPAM", +// "abcd.efghi@google.com", +// "abcd-NOSPAM.efghi@google.com", +// "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM", +// 4 }, +// { "^", +// "(START)", +// "foo", +// "(START)foo", +// "(START)foo", +// 1 }, +// { "^", +// "(START)", +// "", +// "(START)", +// "(START)", +// 1 }, +// { "$", +// "(END)", +// "", +// "(END)", +// "(END)", +// 1 }, +// { "b", +// "bb", +// "ababababab", +// "abbabababab", +// "abbabbabbabbabb", +// 5 }, +// { "b", +// "bb", +// "bbbbbb", +// 
"bbbbbbb", +// "bbbbbbbbbbbb", +// 6 }, +// { "b+", +// "bb", +// "bbbbbb", +// "bb", +// "bb", +// 1 }, +// { "b*", +// "bb", +// "bbbbbb", +// "bb", +// "bb", +// 1 }, +// { "b*", +// "bb", +// "aaaaa", +// "bbaaaaa", +// "bbabbabbabbabbabb", +// 6 }, +// // Check newline handling +// { "a.*a", +// "(\\0)", +// "aba\naba", +// "(aba)\naba", +// "(aba)\n(aba)", +// 2 }, +// { "", NULL, NULL, NULL, NULL, 0 } +// }; + +// for (const ReplaceTest* t = tests; t->original != NULL; t++) { +// std::string one(t->original); +// ASSERT_TRUE(RE2::Replace(&one, t->regexp, t->rewrite)); +// ASSERT_EQ(one, t->single); +// std::string all(t->original); +// ASSERT_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count) +// << "Got: " << all; +// ASSERT_EQ(all, t->global); +// } +// } + +// static void TestCheckRewriteString(const char* regexp, const char* rewrite, +// bool expect_ok) { +// std::string error; +// RE2 exp(regexp); +// bool actual_ok = exp.CheckRewriteString(rewrite, &error); +// EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error; +// } + +// TEST(CheckRewriteString, all) { +// TestCheckRewriteString("abc", "foo", true); +// TestCheckRewriteString("abc", "foo\\", false); +// TestCheckRewriteString("abc", "foo\\0bar", true); + +// TestCheckRewriteString("a(b)c", "foo", true); +// TestCheckRewriteString("a(b)c", "foo\\0bar", true); +// TestCheckRewriteString("a(b)c", "foo\\1bar", true); +// TestCheckRewriteString("a(b)c", "foo\\2bar", false); +// TestCheckRewriteString("a(b)c", "f\\\\2o\\1o", true); + +// TestCheckRewriteString("a(b)(c)", "foo\\12", true); +// TestCheckRewriteString("a(b)(c)", "f\\2o\\1o", true); +// TestCheckRewriteString("a(b)(c)", "f\\oo\\1", false); +// } + +// TEST(RE2, Extract) { +// std::string s; + +// ASSERT_TRUE(RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &s)); +// ASSERT_EQ(s, "kremvax!boris"); + +// ASSERT_TRUE(RE2::Extract("foo", ".*", "'\\0'", &s)); +// ASSERT_EQ(s, "'foo'"); +// 
// check that false match doesn't overwrite +// ASSERT_FALSE(RE2::Extract("baz", "bar", "'\\0'", &s)); +// ASSERT_EQ(s, "'foo'"); +// } + +// TEST(RE2, MaxSubmatchTooLarge) { +// std::string s; +// ASSERT_FALSE(RE2::Extract("foo", "f(o+)", "\\1\\2", &s)); +// s = "foo"; +// ASSERT_FALSE(RE2::Replace(&s, "f(o+)", "\\1\\2")); +// s = "foo"; +// ASSERT_FALSE(RE2::GlobalReplace(&s, "f(o+)", "\\1\\2")); +// } + +TEST(RE2, Consume) { + RE2 r("\\s*(\\w+)"); // matches a word, possibly proceeded by whitespace + std::string word; + + std::string s(" aaa b!@#$@#$cccc"); + StringPiece input(s); + + ASSERT_TRUE(RE2::Consume(&input, r, &word)); + ASSERT_EQ(word, "aaa") << " input: " << input; + ASSERT_TRUE(RE2::Consume(&input, r, &word)); + ASSERT_EQ(word, "b") << " input: " << input; + ASSERT_FALSE(RE2::Consume(&input, r, &word)) << " input: " << input; +} + +TEST(RE2, ConsumeN) { + const std::string s(" one two three 4"); + StringPiece input(s); + + RE2::Arg argv[2]; + const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; + + // 0 arg + EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 0)); // Skips "one". + + // 1 arg + std::string word; + argv[0] = &word; + EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 1)); + EXPECT_EQ("two", word); + + // Multi-args + int n; + argv[1] = &n; + EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)\\s*(\\d+)", args, 2)); + EXPECT_EQ("three", word); + EXPECT_EQ(4, n); +} + +TEST(RE2, FindAndConsume) { + RE2 r("(\\w+)"); // matches a word + std::string word; + + std::string s(" aaa b!@#$@#$cccc"); + StringPiece input(s); + + ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word)); + ASSERT_EQ(word, "aaa"); + ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word)); + ASSERT_EQ(word, "b"); + ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word)); + ASSERT_EQ(word, "cccc"); + ASSERT_FALSE(RE2::FindAndConsume(&input, r, &word)); + + // Check that FindAndConsume works without any submatches. 
+ // Earlier version used uninitialized data for + // length to consume. + input = "aaa"; + ASSERT_TRUE(RE2::FindAndConsume(&input, "aaa")); + ASSERT_EQ(input, ""); +} + +TEST(RE2, FindAndConsumeN) { + const std::string s(" one two three 4"); + StringPiece input(s); + + RE2::Arg argv[2]; + const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; + + // 0 arg + EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 0)); // Skips "one". + + // 1 arg + std::string word; + argv[0] = &word; + EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 1)); + EXPECT_EQ("two", word); + + // Multi-args + int n; + argv[1] = &n; + EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)\\s*(\\d+)", args, 2)); + EXPECT_EQ("three", word); + EXPECT_EQ(4, n); +} + +TEST(RE2, MatchNumberPeculiarity) { + RE2 r("(foo)|(bar)|(baz)"); + std::string word1; + std::string word2; + std::string word3; + + ASSERT_TRUE(RE2::PartialMatch("foo", r, &word1, &word2, &word3)); + ASSERT_EQ(word1, "foo"); + ASSERT_EQ(word2, ""); + ASSERT_EQ(word3, ""); + ASSERT_TRUE(RE2::PartialMatch("bar", r, &word1, &word2, &word3)); + ASSERT_EQ(word1, ""); + ASSERT_EQ(word2, "bar"); + ASSERT_EQ(word3, ""); + ASSERT_TRUE(RE2::PartialMatch("baz", r, &word1, &word2, &word3)); + ASSERT_EQ(word1, ""); + ASSERT_EQ(word2, ""); + ASSERT_EQ(word3, "baz"); + ASSERT_FALSE(RE2::PartialMatch("f", r, &word1, &word2, &word3)); + + std::string a; + ASSERT_TRUE(RE2::FullMatch("hello", "(foo)|hello", &a)); + ASSERT_EQ(a, ""); +} + +TEST(RE2, Match) { + RE2 re("((\\w+):([0-9]+))"); // extracts host and port + StringPiece group[4]; + + // No match. + StringPiece s = "zyzzyva"; + ASSERT_FALSE( + re.Match(s, 0, s.size(), RE2::UNANCHORED, group, arraysize(group))); + + // Matches and extracts. 
+ s = "a chrisr:9000 here"; + ASSERT_TRUE( + re.Match(s, 0, s.size(), RE2::UNANCHORED, group, arraysize(group))); + ASSERT_EQ(group[0], "chrisr:9000"); + ASSERT_EQ(group[1], "chrisr:9000"); + ASSERT_EQ(group[2], "chrisr"); + ASSERT_EQ(group[3], "9000"); + + std::string all, host; + int port; + ASSERT_TRUE(RE2::PartialMatch("a chrisr:9000 here", re, &all, &host, &port)); + ASSERT_EQ(all, "chrisr:9000"); + ASSERT_EQ(host, "chrisr"); + ASSERT_EQ(port, 9000); +} + +static void TestRecursion(int size, const char* pattern) { + // Fill up a string repeating the pattern given + std::string domain; + domain.resize(size); + size_t patlen = strlen(pattern); + for (int i = 0; i < size; i++) { + domain[i] = pattern[i % patlen]; + } + // Just make sure it doesn't crash due to too much recursion. + RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet); + RE2::FullMatch(domain, re); +} + +// A meta-quoted string, interpreted as a pattern, should always match +// the original unquoted string. +static void TestQuoteMeta(const std::string& unquoted, + const RE2::Options& options = RE2::DefaultOptions) { + std::string quoted = RE2::QuoteMeta(unquoted); + RE2 re(quoted, options); + EXPECT_TRUE(RE2::FullMatch(unquoted, re)) + << "Unquoted='" << unquoted << "', quoted='" << quoted << "'."; +} + +// A meta-quoted string, interpreted as a pattern, should always match +// the original unquoted string. +static void NegativeTestQuoteMeta( + const std::string& unquoted, const std::string& should_not_match, + const RE2::Options& options = RE2::DefaultOptions) { + std::string quoted = RE2::QuoteMeta(unquoted); + RE2 re(quoted, options); + EXPECT_FALSE(RE2::FullMatch(should_not_match, re)) + << "Unquoted='" << unquoted << "', quoted='" << quoted << "'."; +} + +// Tests that quoted meta characters match their original strings, +// and that a few things that shouldn't match indeed do not. 
+TEST(QuoteMeta, Simple) { + TestQuoteMeta("foo"); + TestQuoteMeta("foo.bar"); + TestQuoteMeta("foo\\.bar"); + TestQuoteMeta("[1-9]"); + TestQuoteMeta("1.5-2.0?"); + TestQuoteMeta("\\d"); + TestQuoteMeta("Who doesn't like ice cream?"); + TestQuoteMeta("((a|b)c?d*e+[f-h]i)"); + TestQuoteMeta("((?!)xxx).*yyy"); + TestQuoteMeta("(["); +} +TEST(QuoteMeta, SimpleNegative) { + NegativeTestQuoteMeta("foo", "bar"); + NegativeTestQuoteMeta("...", "bar"); + NegativeTestQuoteMeta("\\.", "."); + NegativeTestQuoteMeta("\\.", ".."); + NegativeTestQuoteMeta("(a)", "a"); + NegativeTestQuoteMeta("(a|b)", "a"); + NegativeTestQuoteMeta("(a|b)", "(a)"); + NegativeTestQuoteMeta("(a|b)", "a|b"); + NegativeTestQuoteMeta("[0-9]", "0"); + NegativeTestQuoteMeta("[0-9]", "0-9"); + NegativeTestQuoteMeta("[0-9]", "[9]"); + NegativeTestQuoteMeta("((?!)xxx)", "xxx"); +} + +TEST(QuoteMeta, Latin1) { + TestQuoteMeta("3\xb2 = 9", RE2::Latin1); +} + +TEST(QuoteMeta, UTF8) { + TestQuoteMeta("Plácido Domingo"); + TestQuoteMeta("xyz"); // No fancy utf8. + TestQuoteMeta("\xc2\xb0"); // 2-byte utf8 -- a degree symbol. + TestQuoteMeta("27\xc2\xb0 degrees"); // As a middle character. + TestQuoteMeta("\xe2\x80\xb3"); // 3-byte utf8 -- a double prime. + TestQuoteMeta("\xf0\x9d\x85\x9f"); // 4-byte utf8 -- a music note. + TestQuoteMeta("27\xc2\xb0"); // Interpreted as Latin-1, this should + // still work. + NegativeTestQuoteMeta("27\xc2\xb0", + "27\\\xc2\\\xb0"); // 2-byte utf8 -- a degree symbol. +} + +// TEST(QuoteMeta, HasNull) { +// std::string has_null; + +// // string with one null character +// has_null += '\0'; +// TestQuoteMeta(has_null); +// NegativeTestQuoteMeta(has_null, ""); + +// // Don't want null-followed-by-'1' to be interpreted as '\01'. 
+// has_null += '1'; +// TestQuoteMeta(has_null); +// NegativeTestQuoteMeta(has_null, "\1"); +// } + +// TEST(ProgramSize, BigProgram) { +// RE2 re_simple("simple regexp"); +// RE2 re_medium("medium.*regexp"); +// RE2 re_complex("complex.{1,128}regexp"); + +// ASSERT_GT(re_simple.ProgramSize(), 0); +// ASSERT_GT(re_medium.ProgramSize(), re_simple.ProgramSize()); +// ASSERT_GT(re_complex.ProgramSize(), re_medium.ProgramSize()); + +// ASSERT_GT(re_simple.ReverseProgramSize(), 0); +// ASSERT_GT(re_medium.ReverseProgramSize(), re_simple.ReverseProgramSize()); +// ASSERT_GT(re_complex.ReverseProgramSize(), re_medium.ReverseProgramSize()); +// } + +// TEST(ProgramFanout, BigProgram) { +// RE2 re1("(?:(?:(?:(?:(?:.)?){1})*)+)"); +// RE2 re10("(?:(?:(?:(?:(?:.)?){10})*)+)"); +// RE2 re100("(?:(?:(?:(?:(?:.)?){100})*)+)"); +// RE2 re1000("(?:(?:(?:(?:(?:.)?){1000})*)+)"); + +// std::vector histogram; + +// // 3 is the largest non-empty bucket and has 2 element. +// ASSERT_EQ(3, re1.ProgramFanout(&histogram)); +// ASSERT_EQ(2, histogram[3]); + +// // 6 is the largest non-empty bucket and has 11 elements. +// ASSERT_EQ(6, re10.ProgramFanout(&histogram)); +// ASSERT_EQ(11, histogram[6]); + +// // 9 is the largest non-empty bucket and has 101 elements. +// ASSERT_EQ(9, re100.ProgramFanout(&histogram)); +// ASSERT_EQ(101, histogram[9]); + +// // 13 is the largest non-empty bucket and has 1001 elements. +// ASSERT_EQ(13, re1000.ProgramFanout(&histogram)); +// ASSERT_EQ(1001, histogram[13]); + +// // 2 is the largest non-empty bucket and has 2 element. +// ASSERT_EQ(2, re1.ReverseProgramFanout(&histogram)); +// ASSERT_EQ(2, histogram[2]); + +// // 5 is the largest non-empty bucket and has 11 elements. +// ASSERT_EQ(5, re10.ReverseProgramFanout(&histogram)); +// ASSERT_EQ(11, histogram[5]); + +// // 9 is the largest non-empty bucket and has 101 elements. 
+// ASSERT_EQ(9, re100.ReverseProgramFanout(&histogram)); +// ASSERT_EQ(101, histogram[9]); + +// // 12 is the largest non-empty bucket and has 1001 elements. +// ASSERT_EQ(12, re1000.ReverseProgramFanout(&histogram)); +// ASSERT_EQ(1001, histogram[12]); +// } + +// Issue 956519: handling empty character sets was +// causing NULL dereference. This tests a few empty character sets. +// (The way to get an empty character set is to negate a full one.) +TEST(EmptyCharset, Fuzz) { + static const char *empties[] = { + "[^\\S\\s]", + "[^\\S[:space:]]", + "[^\\D\\d]", + "[^\\D[:digit:]]" + }; + for (size_t i = 0; i < arraysize(empties); i++) + ASSERT_FALSE(RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0)); +} + +// Bitstate assumes that kInstFail instructions in +// alternations or capture groups have been "compiled away". +TEST(EmptyCharset, BitstateAssumptions) { + // Captures trigger use of Bitstate. + static const char *nop_empties[] = { + "((((()))))" "[^\\S\\s]?", + "((((()))))" "([^\\S\\s])?", + "((((()))))" "([^\\S\\s]|[^\\S\\s])?", + "((((()))))" "(([^\\S\\s]|[^\\S\\s])|)" + }; + StringPiece group[6]; + for (size_t i = 0; i < arraysize(nop_empties); i++) + ASSERT_TRUE(RE2(nop_empties[i]).Match("", 0, 0, RE2::UNANCHORED, group, 6)); +} + +// Test that named groups work correctly. 
+TEST(Capture, NamedGroups) { + { + RE2 re("(hello world)"); + ASSERT_EQ(re.NumberOfCapturingGroups(), 1); + const std::map& m = re.NamedCapturingGroups(); + ASSERT_EQ(m.size(), 0); + } + + { + RE2 re("(?Pexpr(?Pexpr)(?Pexpr))((expr)(?Pexpr))"); + ASSERT_EQ(re.NumberOfCapturingGroups(), 6); + const std::map& m = re.NamedCapturingGroups(); + ASSERT_EQ(m.size(), 4); + ASSERT_EQ(m.find("A")->second, 1); + ASSERT_EQ(m.find("B")->second, 2); + ASSERT_EQ(m.find("C")->second, 3); + ASSERT_EQ(m.find("D")->second, 6); // $4 and $5 are anonymous + } +} + +TEST(RE2, CapturedGroupTest) { + RE2 re("directions from (?P.*) to (?P.*)"); + int num_groups = re.NumberOfCapturingGroups(); + EXPECT_EQ(2, num_groups); + std::string args[4]; + RE2::Arg arg0(&args[0]); + RE2::Arg arg1(&args[1]); + RE2::Arg arg2(&args[2]); + RE2::Arg arg3(&args[3]); + + const RE2::Arg* const matches[4] = {&arg0, &arg1, &arg2, &arg3}; + EXPECT_TRUE(RE2::FullMatchN("directions from mountain view to san jose", + re, matches, num_groups)); + const std::map& named_groups = re.NamedCapturingGroups(); + EXPECT_TRUE(named_groups.find("S") != named_groups.end()); + EXPECT_TRUE(named_groups.find("D") != named_groups.end()); + + // The named group index is 1-based. + int source_group_index = named_groups.find("S")->second; + int destination_group_index = named_groups.find("D")->second; + EXPECT_EQ(1, source_group_index); + EXPECT_EQ(2, destination_group_index); + + // The args is zero-based. 
+ EXPECT_EQ("mountain view", args[source_group_index - 1]); + EXPECT_EQ("san jose", args[destination_group_index - 1]); +} + +TEST(RE2, FullMatchWithNoArgs) { + ASSERT_TRUE(RE2::FullMatch("h", "h")); + ASSERT_TRUE(RE2::FullMatch("hello", "hello")); + ASSERT_TRUE(RE2::FullMatch("hello", "h.*o")); + ASSERT_FALSE(RE2::FullMatch("othello", "h.*o")); // Must be anchored at front + ASSERT_FALSE(RE2::FullMatch("hello!", "h.*o")); // Must be anchored at end +} + +TEST(RE2, PartialMatch) { + ASSERT_TRUE(RE2::PartialMatch("x", "x")); + ASSERT_TRUE(RE2::PartialMatch("hello", "h.*o")); + ASSERT_TRUE(RE2::PartialMatch("othello", "h.*o")); + ASSERT_TRUE(RE2::PartialMatch("hello!", "h.*o")); + ASSERT_TRUE(RE2::PartialMatch("x", "((((((((((((((((((((x))))))))))))))))))))")); +} + +TEST(RE2, PartialMatchN) { + RE2::Arg argv[2]; + const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; + + // 0 arg + EXPECT_TRUE(RE2::PartialMatchN("hello", "e.*o", args, 0)); + EXPECT_FALSE(RE2::PartialMatchN("othello", "a.*o", args, 0)); + + // 1 arg + int i; + argv[0] = &i; + EXPECT_TRUE(RE2::PartialMatchN("1001 nights", "(\\d+)", args, 1)); + EXPECT_EQ(1001, i); + EXPECT_FALSE(RE2::PartialMatchN("three", "(\\d+)", args, 1)); + + // Multi-arg + std::string s; + argv[1] = &s; + EXPECT_TRUE(RE2::PartialMatchN("answer: 42:life", "(\\d+):(\\w+)", args, 2)); + EXPECT_EQ(42, i); + EXPECT_EQ("life", s); + EXPECT_FALSE(RE2::PartialMatchN("hi1", "(\\w+)(1)", args, 2)); +} + +TEST(RE2, FullMatchZeroArg) { + // Zero-arg + ASSERT_TRUE(RE2::FullMatch("1001", "\\d+")); +} + +TEST(RE2, FullMatchOneArg) { + int i; + + // Single-arg + ASSERT_TRUE(RE2::FullMatch("1001", "(\\d+)", &i)); + ASSERT_EQ(i, 1001); + ASSERT_TRUE(RE2::FullMatch("-123", "(-?\\d+)", &i)); + ASSERT_EQ(i, -123); + ASSERT_FALSE(RE2::FullMatch("10", "()\\d+", &i)); + ASSERT_FALSE( + RE2::FullMatch("1234567890123456789012345678901234567890", "(\\d+)", &i)); +} + +TEST(RE2, FullMatchIntegerArg) { + int i; + + // Digits surrounding integer-arg + 
ASSERT_TRUE(RE2::FullMatch("1234", "1(\\d*)4", &i)); + ASSERT_EQ(i, 23); + ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)\\d+", &i)); + ASSERT_EQ(i, 1); + ASSERT_TRUE(RE2::FullMatch("-1234", "(-\\d)\\d+", &i)); + ASSERT_EQ(i, -1); + ASSERT_TRUE(RE2::PartialMatch("1234", "(\\d)", &i)); + ASSERT_EQ(i, 1); + ASSERT_TRUE(RE2::PartialMatch("-1234", "(-\\d)", &i)); + ASSERT_EQ(i, -1); +} + +TEST(RE2, FullMatchStringArg) { + std::string s; + // String-arg + ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", &s)); + ASSERT_EQ(s, std::string("ell")); +} + +TEST(RE2, FullMatchStringPieceArg) { + int i; + // StringPiece-arg + StringPiece sp; + ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &sp, &i)); + ASSERT_EQ(sp.size(), 4); + ASSERT_TRUE(memcmp(sp.data(), "ruby", 4) == 0); + ASSERT_EQ(i, 1234); +} + +TEST(RE2, FullMatchMultiArg) { + int i; + std::string s; + // Multi-arg + ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); + ASSERT_EQ(s, std::string("ruby")); + ASSERT_EQ(i, 1234); +} + +TEST(RE2, FullMatchN) { + RE2::Arg argv[2]; + const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; + + // 0 arg + EXPECT_TRUE(RE2::FullMatchN("hello", "h.*o", args, 0)); + EXPECT_FALSE(RE2::FullMatchN("othello", "h.*o", args, 0)); + + // 1 arg + int i; + argv[0] = &i; + EXPECT_TRUE(RE2::FullMatchN("1001", "(\\d+)", args, 1)); + EXPECT_EQ(1001, i); + EXPECT_FALSE(RE2::FullMatchN("three", "(\\d+)", args, 1)); + + // Multi-arg + std::string s; + argv[1] = &s; + EXPECT_TRUE(RE2::FullMatchN("42:life", "(\\d+):(\\w+)", args, 2)); + EXPECT_EQ(42, i); + EXPECT_EQ("life", s); + EXPECT_FALSE(RE2::FullMatchN("hi1", "(\\w+)(1)", args, 2)); +} + +TEST(RE2, FullMatchIgnoredArg) { + int i; + std::string s; + + // Old-school NULL should be ignored. + ASSERT_TRUE( + RE2::FullMatch("ruby:1234", "(\\w+)(:)(\\d+)", &s, (void*)NULL, &i)); + ASSERT_EQ(s, std::string("ruby")); + ASSERT_EQ(i, 1234); + + // C++11 nullptr should also be ignored. 
+ ASSERT_TRUE(RE2::FullMatch("rubz:1235", "(\\w+)(:)(\\d+)", &s, nullptr, &i)); + ASSERT_EQ(s, std::string("rubz")); + ASSERT_EQ(i, 1235); +} + +TEST(RE2, FullMatchTypedNullArg) { + std::string s; + + // Ignore non-void* NULL arg + ASSERT_TRUE(RE2::FullMatch("hello", "he(.*)lo", (char*)NULL)); + ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (std::string*)NULL)); + ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (StringPiece*)NULL)); + ASSERT_TRUE(RE2::FullMatch("1234", "(.*)", (int*)NULL)); + ASSERT_TRUE(RE2::FullMatch("1234567890123456", "(.*)", (long long*)NULL)); + ASSERT_TRUE(RE2::FullMatch("123.4567890123456", "(.*)", (double*)NULL)); + ASSERT_TRUE(RE2::FullMatch("123.4567890123456", "(.*)", (float*)NULL)); + + // Fail on non-void* NULL arg if the match doesn't parse for the given type. + ASSERT_FALSE(RE2::FullMatch("hello", "h(.*)lo", &s, (char*)NULL)); + ASSERT_FALSE(RE2::FullMatch("hello", "(.*)", (int*)NULL)); + ASSERT_FALSE(RE2::FullMatch("1234567890123456", "(.*)", (int*)NULL)); + ASSERT_FALSE(RE2::FullMatch("hello", "(.*)", (double*)NULL)); + ASSERT_FALSE(RE2::FullMatch("hello", "(.*)", (float*)NULL)); +} + +// // Check that numeric parsing code does not read past the end of +// // the number being parsed. +// // This implementation requires mmap(2) et al. and thus cannot +// // be used unless they are available. 
// TEST(RE2, NULTerminated) {
// #if defined(_POSIX_MAPPED_FILES) && _POSIX_MAPPED_FILES > 0
//   char *v;
//   int x;
//   long pagesize = sysconf(_SC_PAGE_SIZE);

// #ifndef MAP_ANONYMOUS
// #define MAP_ANONYMOUS MAP_ANON
// #endif
//   v = static_cast<char*>(mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE,
//                               MAP_ANONYMOUS|MAP_PRIVATE, -1, 0));
//   ASSERT_TRUE(v != reinterpret_cast<char*>(-1));
//   LOG(INFO) << "Memory at " << (void*)v;
//   ASSERT_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno;
//   v[pagesize - 1] = '1';

//   x = 0;
//   ASSERT_TRUE(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x));
//   ASSERT_EQ(x, 1);
// #endif
// }

// Parses captures into each supported character and integer type and
// checks the limits of every type: the extreme values must parse, and
// the values just outside the range must be rejected.
TEST(RE2, FullMatchTypeTests) {
  // Type tests
  std::string zeros(1000, '0');  // long run of leading zeros for padded inputs
  {
    char c;
    ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &c));
    ASSERT_EQ(c, 'H');
  }
  {
    unsigned char c;
    ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &c));
    ASSERT_EQ(c, static_cast<unsigned char>('H'));
  }
  {
    int16_t v;
    ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v));
    ASSERT_EQ(v, 100);
    ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v));
    ASSERT_EQ(v, -100);
    ASSERT_TRUE(RE2::FullMatch("32767", "(-?\\d+)", &v));
    ASSERT_EQ(v, 32767);
    ASSERT_TRUE(RE2::FullMatch("-32768", "(-?\\d+)", &v));
    ASSERT_EQ(v, -32768);
    ASSERT_FALSE(RE2::FullMatch("-32769", "(-?\\d+)", &v));
    ASSERT_FALSE(RE2::FullMatch("32768", "(-?\\d+)", &v));
  }
  {
    uint16_t v;
    ASSERT_TRUE(RE2::FullMatch("100", "(\\d+)", &v));
    ASSERT_EQ(v, 100);
    ASSERT_TRUE(RE2::FullMatch("32767", "(\\d+)", &v));
    ASSERT_EQ(v, 32767);
    ASSERT_TRUE(RE2::FullMatch("65535", "(\\d+)", &v));
    ASSERT_EQ(v, 65535);
    ASSERT_FALSE(RE2::FullMatch("65536", "(\\d+)", &v));
  }
  {
    int32_t v;
    static const int32_t max = INT32_C(0x7fffffff);
    static const int32_t min = -max - 1;
    ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v));
    ASSERT_EQ(v, 100);
    ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v));
    ASSERT_EQ(v, -100);
    ASSERT_TRUE(RE2::FullMatch("2147483647", "(-?\\d+)", &v));
    ASSERT_EQ(v, max);
    ASSERT_TRUE(RE2::FullMatch("-2147483648", "(-?\\d+)", &v));
    ASSERT_EQ(v, min);
    ASSERT_FALSE(RE2::FullMatch("-2147483649", "(-?\\d+)", &v));
    ASSERT_FALSE(RE2::FullMatch("2147483648", "(-?\\d+)", &v));

    // Leading zeros must not change the parsed value or the range check.
    ASSERT_TRUE(RE2::FullMatch(zeros + "2147483647", "(-?\\d+)", &v));
    ASSERT_EQ(v, max);
    ASSERT_TRUE(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v));
    ASSERT_EQ(v, min);

    ASSERT_FALSE(RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v));
    ASSERT_TRUE(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v)));
    ASSERT_EQ(v, max);
    ASSERT_FALSE(RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v)));
  }
  {
    uint32_t v;
    static const uint32_t max = UINT32_C(0xffffffff);
    ASSERT_TRUE(RE2::FullMatch("100", "(\\d+)", &v));
    ASSERT_EQ(v, 100);
    ASSERT_TRUE(RE2::FullMatch("4294967295", "(\\d+)", &v));
    ASSERT_EQ(v, max);
    ASSERT_FALSE(RE2::FullMatch("4294967296", "(\\d+)", &v));
    ASSERT_FALSE(RE2::FullMatch("-1", "(\\d+)", &v));

    ASSERT_TRUE(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v));
    ASSERT_EQ(v, max);
  }
  {
    int64_t v;
    static const int64_t max = INT64_C(0x7fffffffffffffff);
    static const int64_t min = -max - 1;
    std::string str;

    ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v));
    ASSERT_EQ(v, 100);
    ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v));
    ASSERT_EQ(v, -100);

    str = std::to_string(max);
    ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)", &v));
    ASSERT_EQ(v, max);

    str = std::to_string(min);
    ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)", &v));
    ASSERT_EQ(v, min);

    // Bumping the last digit of a limit produces the first out-of-range
    // magnitude; the ASSERT_NE guards against a carry out of '9'.
    str = std::to_string(max);
    ASSERT_NE(str.back(), '9');
    str.back()++;
    ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)", &v));

    str = std::to_string(min);
    ASSERT_NE(str.back(), '9');
    str.back()++;
    ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)", &v));
  }
  {
    uint64_t v;
    int64_t v2;
    static const uint64_t max = UINT64_C(0xffffffffffffffff);
    std::string str;

    ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v));
    ASSERT_EQ(v, 100);
    ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v2));
    ASSERT_EQ(v2, -100);

    str = std::to_string(max);
    ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)", &v));
    ASSERT_EQ(v, max);

    // Same last-digit bump as for int64_t above.
    ASSERT_NE(str.back(), '9');
    str.back()++;
    ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)", &v));
  }
}

// Parses captures into float and double, including inputs padded with a
// thousand leading zeros and rounding edge cases.
TEST(RE2, FloatingPointFullMatchTypes) {
  std::string zeros(1000, '0');
  {
    float v;
    ASSERT_TRUE(RE2::FullMatch("100", "(.*)", &v));
    ASSERT_EQ(v, 100);
    ASSERT_TRUE(RE2::FullMatch("-100.", "(.*)", &v));
    ASSERT_EQ(v, -100);
    ASSERT_TRUE(RE2::FullMatch("1e23", "(.*)", &v));
    ASSERT_EQ(v, float(1e23));
    ASSERT_TRUE(RE2::FullMatch(" 100", "(.*)", &v));
    ASSERT_EQ(v, 100);

    ASSERT_TRUE(RE2::FullMatch(zeros + "1e23", "(.*)", &v));
    ASSERT_EQ(v, float(1e23));

    // 6700000000081920.1 is an edge case.
    // 6700000000081920 is exactly halfway between
    // two float32s, so the .1 should make it round up.
    // However, the .1 is outside the precision possible with
    // a float64: the nearest float64 is 6700000000081920.
    // So if the code uses strtod and then converts to float32,
    // round-to-even will make it round down instead of up.
    // To pass the test, the parser must call strtof directly.
    // This test case is carefully chosen to use only a 17-digit
    // number, since C does not guarantee to get the correctly
    // rounded answer for strtod and strtof unless the input is
    // short.
    //
    // This is known to fail on Cygwin and MinGW due to a broken
    // implementation of strtof(3). And apparently MSVC too. Sigh.
#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__)
    ASSERT_TRUE(RE2::FullMatch("0.1", "(.*)", &v));
    ASSERT_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g", v, 0.1f);
    ASSERT_TRUE(RE2::FullMatch("6700000000081920.1", "(.*)", &v));
    ASSERT_EQ(v, 6700000000081920.1f)
        << StringPrintf("%.8g != %.8g", v, 6700000000081920.1f);
#endif
  }
  {
    double v;
    ASSERT_TRUE(RE2::FullMatch("100", "(.*)", &v));
    ASSERT_EQ(v, 100);
    ASSERT_TRUE(RE2::FullMatch("-100.", "(.*)", &v));
    ASSERT_EQ(v, -100);
    ASSERT_TRUE(RE2::FullMatch("1e23", "(.*)", &v));
    ASSERT_EQ(v, 1e23);
    ASSERT_TRUE(RE2::FullMatch(zeros + "1e23", "(.*)", &v));
    ASSERT_EQ(v, double(1e23));

    ASSERT_TRUE(RE2::FullMatch("0.1", "(.*)", &v));
    ASSERT_EQ(v, 0.1) << StringPrintf("%.17g != %.17g", v, 0.1);
    ASSERT_TRUE(RE2::FullMatch("1.00000005960464485", "(.*)", &v));
    ASSERT_EQ(v, 1.0000000596046448)
        << StringPrintf("%.17g != %.17g", v, 1.0000000596046448);
  }
}

// A stray character before or after the number must defeat FullMatch
// unless the pattern accounts for it.
TEST(RE2, FullMatchAnchored) {
  int i;
  // Check that matching is fully anchored
  ASSERT_FALSE(RE2::FullMatch("x1001", "(\\d+)", &i));
  ASSERT_FALSE(RE2::FullMatch("1001x", "(\\d+)", &i));
  ASSERT_TRUE(RE2::FullMatch("x1001", "x(\\d+)", &i));
  ASSERT_EQ(i, 1001);
  ASSERT_TRUE(RE2::FullMatch("1001x", "(\\d+)x", &i));
  ASSERT_EQ(i, 1001);
}

// Counted repetition with an open upper bound: at least five characters.
TEST(RE2, FullMatchBraces) {
  // Braces
  ASSERT_TRUE(RE2::FullMatch("0abcd", "[0-9a-f+.-]{5,}"));
  ASSERT_TRUE(RE2::FullMatch("0abcde", "[0-9a-f+.-]{5,}"));
  ASSERT_FALSE(RE2::FullMatch("0abc", "[0-9a-f+.-]{5,}"));
}

// Alternation of literals and a character class under FullMatch.
TEST(RE2, Complicated) {
  // Complicated RE2
  ASSERT_TRUE(RE2::FullMatch("foo", "foo|bar|[A-Z]"));
  ASSERT_TRUE(RE2::FullMatch("bar", "foo|bar|[A-Z]"));
  ASSERT_TRUE(RE2::FullMatch("X", "foo|bar|[A-Z]"));
  ASSERT_FALSE(RE2::FullMatch("XY", "foo|bar|[A-Z]"));
}

// TEST(RE2, FullMatchEnd) {
//   // Check full-match handling (needs '$' tacked on internally)
//   ASSERT_TRUE(RE2::FullMatch("fo", "fo|foo"));
//   ASSERT_TRUE(RE2::FullMatch("foo", "fo|foo"));
+// ASSERT_TRUE(RE2::FullMatch("fo", "fo|foo$")); +// ASSERT_TRUE(RE2::FullMatch("foo", "fo|foo$")); +// ASSERT_TRUE(RE2::FullMatch("foo", "foo$")); +// ASSERT_FALSE(RE2::FullMatch("foo$bar", "foo\\$")); +// ASSERT_FALSE(RE2::FullMatch("fox", "fo|bar")); + +// // Uncomment the following if we change the handling of '$' to +// // prevent it from matching a trailing newline +// if (false) { +// // Check that we don't get bitten by pcre's special handling of a +// // '\n' at the end of the string matching '$' +// ASSERT_FALSE(RE2::PartialMatch("foo\n", "foo$")); +// } +// } + +TEST(RE2, FullMatchArgCount) { + // Number of args + int a[16]; + ASSERT_TRUE(RE2::FullMatch("", "")); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("1", "(\\d){1}", &a[0])); + ASSERT_EQ(a[0], 1); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("12", "(\\d)(\\d)", &a[0], &a[1])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("123", "(\\d)(\\d)(\\d)", &a[0], &a[1], &a[2])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1], + &a[2], &a[3])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + ASSERT_EQ(a[3], 4); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("12345", "(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1], + &a[2], &a[3], &a[4])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + ASSERT_EQ(a[3], 4); + ASSERT_EQ(a[4], 5); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("123456", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0], + &a[1], &a[2], &a[3], &a[4], &a[5])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + ASSERT_EQ(a[3], 4); + ASSERT_EQ(a[4], 5); + ASSERT_EQ(a[5], 6); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("1234567", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", + &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6])); + 
ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + ASSERT_EQ(a[3], 4); + ASSERT_EQ(a[4], 5); + ASSERT_EQ(a[5], 6); + ASSERT_EQ(a[6], 7); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("1234567890123456", + "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)" + "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", + &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], + &a[7], &a[8], &a[9], &a[10], &a[11], &a[12], + &a[13], &a[14], &a[15])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + ASSERT_EQ(a[3], 4); + ASSERT_EQ(a[4], 5); + ASSERT_EQ(a[5], 6); + ASSERT_EQ(a[6], 7); + ASSERT_EQ(a[7], 8); + ASSERT_EQ(a[8], 9); + ASSERT_EQ(a[9], 0); + ASSERT_EQ(a[10], 1); + ASSERT_EQ(a[11], 2); + ASSERT_EQ(a[12], 3); + ASSERT_EQ(a[13], 4); + ASSERT_EQ(a[14], 5); + ASSERT_EQ(a[15], 6); +} + +TEST(RE2, Accessors) { + // Check the pattern() accessor + { + const std::string kPattern = "http://([^/]+)/.*"; + const RE2 re(kPattern); + ASSERT_EQ(kPattern, re.pattern()); + } + + // Check RE2 error field. + { + RE2 re("foo"); + ASSERT_TRUE(re.error().empty()); // Must have no error + ASSERT_TRUE(re.ok()); + ASSERT_EQ(re.error_code(), RE2::NoError); + } +} + +// TEST(RE2, UTF8) { +// // Check UTF-8 handling +// // Three Japanese characters (nihongo) +// const char utf8_string[] = { +// (char)0xe6, (char)0x97, (char)0xa5, // 65e5 +// (char)0xe6, (char)0x9c, (char)0xac, // 627c +// (char)0xe8, (char)0xaa, (char)0x9e, // 8a9e +// 0 +// }; +// const char utf8_pattern[] = { +// '.', +// (char)0xe6, (char)0x9c, (char)0xac, // 627c +// '.', +// 0 +// }; + +// // Both should match in either mode, bytes or UTF-8 +// RE2 re_test1(".........", RE2::Latin1); +// ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test1)); +// RE2 re_test2("..."); +// ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test2)); + +// // Check that '.' matches one byte or UTF-8 character +// // according to the mode. 
+// std::string s; +// RE2 re_test3("(.)", RE2::Latin1); +// ASSERT_TRUE(RE2::PartialMatch(utf8_string, re_test3, &s)); +// ASSERT_EQ(s, std::string("\xe6")); +// RE2 re_test4("(.)"); +// ASSERT_TRUE(RE2::PartialMatch(utf8_string, re_test4, &s)); +// ASSERT_EQ(s, std::string("\xe6\x97\xa5")); + +// // Check that string matches itself in either mode +// RE2 re_test5(utf8_string, RE2::Latin1); +// ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test5)); +// RE2 re_test6(utf8_string); +// ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test6)); + +// // Check that pattern matches string only in UTF8 mode +// RE2 re_test7(utf8_pattern, RE2::Latin1); +// ASSERT_FALSE(RE2::FullMatch(utf8_string, re_test7)); +// RE2 re_test8(utf8_pattern); +// ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test8)); +// } + +TEST(RE2, UngreedyUTF8) { + // Check that ungreedy, UTF8 regular expressions don't match when they + // oughtn't -- see bug 82246. + { + // This code always worked. + const char* pattern = "\\w+X"; + const std::string target = "a aX"; + RE2 match_sentence(pattern, RE2::Latin1); + RE2 match_sentence_re(pattern); + + ASSERT_FALSE(RE2::FullMatch(target, match_sentence)); + ASSERT_FALSE(RE2::FullMatch(target, match_sentence_re)); + } + { + const char* pattern = "(?U)\\w+X"; + const std::string target = "a aX"; + RE2 match_sentence(pattern, RE2::Latin1); + ASSERT_EQ(match_sentence.error(), ""); + RE2 match_sentence_re(pattern); + + ASSERT_FALSE(RE2::FullMatch(target, match_sentence)); + ASSERT_FALSE(RE2::FullMatch(target, match_sentence_re)); + } +} + +// TEST(RE2, Rejects) { +// { +// RE2 re("a\\1", RE2::Quiet); +// ASSERT_FALSE(re.ok()); } +// { +// RE2 re("a[x", RE2::Quiet); +// ASSERT_FALSE(re.ok()); +// } +// { +// RE2 re("a[z-a]", RE2::Quiet); +// ASSERT_FALSE(re.ok()); +// } +// { +// RE2 re("a[[:foobar:]]", RE2::Quiet); +// ASSERT_FALSE(re.ok()); +// } +// { +// RE2 re("a(b", RE2::Quiet); +// ASSERT_FALSE(re.ok()); +// } +// { +// RE2 re("a\\", RE2::Quiet); +// 
//     ASSERT_FALSE(re.ok());
//   }
// }

// Invalid and oversized patterns must be reported through ok() and must
// be safe to match with; a large but valid pattern must still work.
TEST(RE2, NoCrash) {
  // A pattern ending in a lone backslash: rejected, and matching with the
  // rejected object simply returns false.
  {
    RE2 bad_re("a\\", RE2::Quiet);
    ASSERT_FALSE(bad_re.ok());
    ASSERT_FALSE(RE2::PartialMatch("a\\b", bad_re));
  }

  // An enormous nested repetition: also rejected without crashing.
  {
    RE2 huge_re("(((.{100}){100}){100}){100}", RE2::Quiet);
    ASSERT_FALSE(huge_re.ok());
    ASSERT_FALSE(RE2::PartialMatch("aaa", huge_re));
  }

  // A crazy but acceptable repetition: compiles and matches.
  {
    RE2 big_re(".{512}x", RE2::Quiet);
    ASSERT_TRUE(big_re.ok());
    std::string subject(515, 'c');
    subject += "x";
    ASSERT_TRUE(RE2::PartialMatch(subject, big_re));
  }
}

// Test that recursion is stopped.
// This test is PCRE-legacy -- there's no recursion in RE2.
TEST(RE2, Recursion) {
  const int kBytes = 15 * 1024;  // enough to crash PCRE
  static const char* const kFillers[] = { ".", "a", "a.", "ab.", "abc." };
  for (const char* filler : kFillers) {
    TestRecursion(kBytes, filler);
  }
}

// Test that counted repetition works, given tons of memory.
TEST(RE2, BigCountedRepetition) {
  RE2::Options options;
  options.set_max_mem(256<<20);

  RE2 big_re(".{512}x", options);
  ASSERT_TRUE(big_re.ok());
  std::string subject(515, 'c');
  subject += "x";
  ASSERT_TRUE(RE2::PartialMatch(subject, big_re));
}

// Test for deep stack recursion.  This would fail with a
// segmentation violation due to stack overflow before pcre was
// patched.
// Again, a PCRE legacy test.  RE2 doesn't recurse.
TEST(RE2, DeepRecursion) {
  // "x*", 131072 copies of 'a', then "*x".
  const std::string comment = "x*" + std::string(131072, 'a') + "*x";
  RE2 comment_re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)");
  ASSERT_TRUE(RE2::FullMatch(comment, comment_re));
}

// Suggested by Josh Hyman.  Failed when SearchOnePass was
// not implementing case-folding.
+TEST(CaseInsensitive, MatchAndConsume) {
+  // The same case-insensitive pattern is used both for a partial match on
+  // the string and for FindAndConsume on a StringPiece view of it.
+  static const char kPattern[] = "(?i)([wand]{5})";
+  std::string text = "A fish named *Wanda*";
+  StringPiece remaining(text);
+  StringPiece captured;
+  EXPECT_TRUE(RE2::PartialMatch(text, kPattern, &captured));
+  EXPECT_TRUE(RE2::FindAndConsume(&remaining, kPattern, &captured));
+}
+
+// RE2 should permit implicit conversions from string, StringPiece, const char*,
+// and C string literals.
+TEST(RE2, ImplicitConversions) {
+  // One pattern, spelled four ways; each must convert to RE2 implicitly.
+  std::string as_string(".");
+  StringPiece as_stringpiece(".");
+  const char* as_cstring = ".";
+  EXPECT_TRUE(RE2::PartialMatch("e", as_string));
+  EXPECT_TRUE(RE2::PartialMatch("e", as_stringpiece));
+  EXPECT_TRUE(RE2::PartialMatch("e", as_cstring));
+  EXPECT_TRUE(RE2::PartialMatch("e", "."));
+}
+
+// Bugs introduced by 8622304
+TEST(RE2, CL8622304) {
+  // reported by ingow
+  std::string captured;
+  EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])"));  // ok
+  EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])", &captured));  // fails
+
+  // reported by jacobsa
+  std::string name;
+  std::string value;
+  EXPECT_TRUE(RE2::PartialMatch("bar:1,0x2F,030,4,5;baz:true;fooby:false,true",
+                                "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?",
+                                &name,
+                                &value));
+  EXPECT_EQ(name, "bar");
+  EXPECT_EQ(value, "1,0x2F,030,4,5");
+}
+
+// // Check that RE2 returns correct regexp pieces on error.
+// // In particular, make sure it returns whole runes
+// // and that it always reports invalid UTF-8.
+// // Also check that Perl error flag piece is big enough.
+// static struct ErrorTest { +// const char *regexp; +// RE2::ErrorCode error_code; +// const char *error_arg; +// } error_tests[] = { +// { "ab\\αcd", RE2::ErrorBadEscape, "\\α" }, +// { "ef\\x☺01", RE2::ErrorBadEscape, "\\x☺0" }, +// { "gh\\x1☺01", RE2::ErrorBadEscape, "\\x1☺" }, +// { "ij\\x1", RE2::ErrorBadEscape, "\\x1" }, +// { "kl\\x", RE2::ErrorBadEscape, "\\x" }, +// { "uv\\x{0000☺}", RE2::ErrorBadEscape, "\\x{0000☺" }, +// { "wx\\p{ABC", RE2::ErrorBadCharRange, "\\p{ABC" }, +// // used to return (?s but the error is X +// { "yz(?smiUX:abc)", RE2::ErrorBadPerlOp, "(?smiUX" }, +// { "aa(?sm☺i", RE2::ErrorBadPerlOp, "(?sm☺" }, +// { "bb[abc", RE2::ErrorMissingBracket, "[abc" }, +// { "abc(def", RE2::ErrorMissingParen, "abc(def" }, +// { "abc)def", RE2::ErrorUnexpectedParen, "abc)def" }, + +// // no argument string returned for invalid UTF-8 +// { "mn\\x1\377", RE2::ErrorBadUTF8, "" }, +// { "op\377qr", RE2::ErrorBadUTF8, "" }, +// { "st\\x{00000\377", RE2::ErrorBadUTF8, "" }, +// { "zz\\p{\377}", RE2::ErrorBadUTF8, "" }, +// { "zz\\x{00\377}", RE2::ErrorBadUTF8, "" }, +// { "zz(?Pabc)", RE2::ErrorBadUTF8, "" }, +// }; +// TEST(RE2, ErrorCodeAndArg) { +// for (size_t i = 0; i < arraysize(error_tests); i++) { +// RE2 re(error_tests[i].regexp, RE2::Quiet); +// EXPECT_FALSE(re.ok()); +// EXPECT_EQ(re.error_code(), error_tests[i].error_code) << re.error(); +// EXPECT_EQ(re.error_arg(), error_tests[i].error_arg) << re.error(); +// } +// } + +// // Check that "never match \n" mode never matches \n. 
+// static struct NeverTest {
+//   const char* regexp;
+//   const char* text;
+//   const char* match;
+// } never_tests[] = {
+//   { "(.*)", "abc\ndef\nghi\n", "abc" },
+//   { "(?s)(abc.*def)", "abc\ndef\n", NULL },
+//   { "(abc(.|\n)*def)", "abc\ndef\n", NULL },
+//   { "(abc[^x]*def)", "abc\ndef\n", NULL },
+//   { "(abc[^x]*def)", "abczzzdef\ndef\n", "abczzzdef" },
+// };
+// TEST(RE2, NeverNewline) {
+//   RE2::Options opt;
+//   opt.set_never_nl(true);
+//   for (size_t i = 0; i < arraysize(never_tests); i++) {
+//     const NeverTest& t = never_tests[i];
+//     RE2 re(t.regexp, opt);
+//     if (t.match == NULL) {
+//       EXPECT_FALSE(re.PartialMatch(t.text, re));
+//     } else {
+//       StringPiece m;
+//       EXPECT_TRUE(re.PartialMatch(t.text, re, &m));
+//       EXPECT_EQ(m, t.match);
+//     }
+//   }
+// }
+
+// // Check that dot_nl option works.
+// TEST(RE2, DotNL) {
+//   RE2::Options opt;
+//   opt.set_dot_nl(true);
+//   EXPECT_TRUE(RE2::PartialMatch("\n", RE2(".", opt)));
+//   EXPECT_FALSE(RE2::PartialMatch("\n", RE2("(?-s).", opt)));
+//   opt.set_never_nl(true);
+//   EXPECT_FALSE(RE2::PartialMatch("\n", RE2(".", opt)));
+// }
+
+// // Check that there are no capturing groups in "never capture" mode.
+// TEST(RE2, NeverCapture) {
+//   RE2::Options opt;
+//   opt.set_never_capture(true);
+//   RE2 re("(r)(e)", opt);
+//   EXPECT_EQ(0, re.NumberOfCapturingGroups());
+// }
+
+// Bitstate bug was looking at submatch[0] even if nsubmatch == 0.
+// Triggered by a failed DFA search falling back to Bitstate when
+// using Match with a NULL submatch set. Bitstate tried to read
+// the submatch[0] entry even if nsubmatch was 0.
+TEST(RE2, BitstateCaptureBug) {
+  // Small memory budget, then a Match call that passes a NULL submatch
+  // array with a submatch count of zero.
+  RE2::Options tight;
+  tight.set_max_mem(20000);
+  RE2 re("(_________$)", tight);
+  StringPiece text = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x";
+  EXPECT_FALSE(re.Match(text, 0, text.size(), RE2::UNANCHORED, NULL, 0));
+}
+
+// C++ version of bug 609710.
+TEST(RE2, UnicodeClasses) {
+  // Exercises \p{..} / \P{..} Unicode classes on an ASCII upper-case letter
+  // and on three CJK letters, first one character at a time and then inside
+  // capturing patterns over the combined text.
+  const std::string text = "ABCDEFGHI譚永鋒";
+  std::string first;
+  std::string second;
+  std::string third;
+
+  EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}"));
+  EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}"));
+  EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}"));
+  EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}"));
+  EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}"));
+  EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}"));
+
+  EXPECT_TRUE(RE2::FullMatch("譚", "\\p{L}"));
+  EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Lu}"));
+  EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Ll}"));
+  EXPECT_FALSE(RE2::FullMatch("譚", "\\P{L}"));
+  EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Lu}"));
+  EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Ll}"));
+
+  EXPECT_TRUE(RE2::FullMatch("永", "\\p{L}"));
+  EXPECT_FALSE(RE2::FullMatch("永", "\\p{Lu}"));
+  EXPECT_FALSE(RE2::FullMatch("永", "\\p{Ll}"));
+  EXPECT_FALSE(RE2::FullMatch("永", "\\P{L}"));
+  EXPECT_TRUE(RE2::FullMatch("永", "\\P{Lu}"));
+  EXPECT_TRUE(RE2::FullMatch("永", "\\P{Ll}"));
+
+  EXPECT_TRUE(RE2::FullMatch("鋒", "\\p{L}"));
+  EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Lu}"));
+  EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Ll}"));
+  EXPECT_FALSE(RE2::FullMatch("鋒", "\\P{L}"));
+  EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Lu}"));
+  EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Ll}"));
+
+  EXPECT_TRUE(RE2::PartialMatch(text, "(.).*?(.).*?(.)", &first, &second, &third));
+  EXPECT_EQ("A", first);
+  EXPECT_EQ("B", second);
+  EXPECT_EQ("C", third);
+
+  EXPECT_TRUE(RE2::PartialMatch(text, "(.).*?([\\p{L}]).*?(.)", &first, &second, &third));
+  EXPECT_EQ("A", first);
+  EXPECT_EQ("B", second);
+  EXPECT_EQ("C", third);
+
+  EXPECT_FALSE(RE2::PartialMatch(text, "\\P{L}"));
+
+  EXPECT_TRUE(RE2::PartialMatch(text, "(.).*?([\\p{Lu}]).*?(.)", &first, &second, &third));
+  EXPECT_EQ("A", first);
+  EXPECT_EQ("B", second);
+  EXPECT_EQ("C", third);
+
+  EXPECT_FALSE(RE2::PartialMatch(text, "[^\\p{Lu}\\p{Lo}]"));
+
+  EXPECT_TRUE(RE2::PartialMatch(text, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)", &first, &second, &third));
+  EXPECT_EQ("譚", first);
+  EXPECT_EQ("永", second);
+  EXPECT_EQ("鋒", third);
+}
+
+TEST(RE2, LazyRE2) {
+  // One LazyRE2 without options and one with; each must report the pattern
+  // and the encoding it was declared with.
+  static LazyRE2 plain = {"a"};
+  static LazyRE2 latin1 = {"b", RE2::Latin1};
+
+  EXPECT_EQ("a", plain->pattern());
+  EXPECT_EQ(RE2::Options::EncodingUTF8, plain->options().encoding());
+
+  EXPECT_EQ("b", latin1->pattern());
+  EXPECT_EQ(RE2::Options::EncodingLatin1, latin1->options().encoding());
+}
+
+// Bug reported by saito. 2009/02/17
+TEST(RE2, NullVsEmptyString) {
+  // ".*" must fully match a default-constructed StringPiece as well as one
+  // built from "".
+  RE2 re(".*");
+  EXPECT_TRUE(re.ok());
+
+  StringPiece null_piece;
+  EXPECT_TRUE(RE2::FullMatch(null_piece, re));
+
+  StringPiece empty_piece("");
+  EXPECT_TRUE(RE2::FullMatch(empty_piece, re));
+}
+
+// // Similar to the previous test, check that the null string and the empty
+// // string both match, but also that the null string can only provide null
+// // submatches whereas the empty string can also provide empty submatches.
+// TEST(RE2, NullVsEmptyStringSubmatches) {
+//   RE2 re("()|(foo)");
+//   EXPECT_TRUE(re.ok());
+
+//   // matches[0] is overall match, [1] is (), [2] is (foo), [3] is nonexistent.
+//   StringPiece matches[4];
+
+//   for (size_t i = 0; i < arraysize(matches); i++)
+//     matches[i] = "bar";
+
+//   StringPiece null;
+//   EXPECT_TRUE(re.Match(null, 0, null.size(), RE2::UNANCHORED,
+//                        matches, arraysize(matches)));
+//   for (size_t i = 0; i < arraysize(matches); i++) {
+//     EXPECT_TRUE(matches[i].data() == NULL);  // always null
+//     EXPECT_TRUE(matches[i].empty());
+//   }
+
+//   for (size_t i = 0; i < arraysize(matches); i++)
+//     matches[i] = "bar";
+
+//   StringPiece empty("");
+//   EXPECT_TRUE(re.Match(empty, 0, empty.size(), RE2::UNANCHORED,
+//                        matches, arraysize(matches)));
+//   EXPECT_TRUE(matches[0].data() != NULL);  // empty, not null
+//   EXPECT_TRUE(matches[0].empty());
+//   EXPECT_TRUE(matches[1].data() != NULL);  // empty, not null
+//   EXPECT_TRUE(matches[1].empty());
+//   EXPECT_TRUE(matches[2].data() == NULL);
+//   EXPECT_TRUE(matches[2].empty());
+//   EXPECT_TRUE(matches[3].data() == NULL);
+//   EXPECT_TRUE(matches[3].empty());
+// }
+
+// Issue 1816809
+TEST(RE2, Bug1816809) {
+  // Consume with a deeply nested pattern, capturing into a std::string.
+  RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))");
+  StringPiece input("llx-3;llx4");
+  std::string captured;
+  EXPECT_TRUE(RE2::Consume(&input, re, &captured));
+}
+
+// Issue 3061120
+TEST(RE2, Bug3061120) {
+  // Case-insensitive \W must not match plain ASCII letters.
+  RE2 re("(?i)\\W");
+  EXPECT_FALSE(RE2::PartialMatch("x", re));  // always worked
+  EXPECT_FALSE(RE2::PartialMatch("k", re));  // broke because of kelvin
+  EXPECT_FALSE(RE2::PartialMatch("s", re));  // broke because of latin long s
+}
+
+// TEST(RE2, CapturingGroupNames) {
+//   // Opening parentheses annotated with group IDs:
+//   //      12    3        45   6         7
+//   RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))");
+//   EXPECT_TRUE(re.ok());
+//   const std::map<int, std::string>& have = re.CapturingGroupNames();
+//   std::map<int, std::string> want;
+//   want[3] = "G2";
+//   want[6] = "G2";
+//   want[7] = "G1";
+//   EXPECT_EQ(want, have);
+// }
+
+// TEST(RE2, RegexpToStringLossOfAnchor) {
+//   EXPECT_EQ(RE2("^[a-c]at", RE2::POSIX).Regexp()->ToString(), "^[a-c]at");
+//   EXPECT_EQ(RE2("^[a-c]at").Regexp()->ToString(), "(?-m:^)[a-c]at");
+//   EXPECT_EQ(RE2("ca[t-z]$", RE2::POSIX).Regexp()->ToString(), "ca[t-z]$");
+//   EXPECT_EQ(RE2("ca[t-z]$").Regexp()->ToString(), "ca[t-z](?-m:$)");
+// }
+
+// // Issue 10131674
+// TEST(RE2, Bug10131674) {
+//   // Some of these escapes describe values that do not fit in a byte.
+//   RE2 re("\\140\\440\\174\\271\\150\\656\\106\\201\\004\\332", RE2::Latin1);
+//   EXPECT_FALSE(re.ok());
+//   EXPECT_FALSE(RE2::FullMatch("hello world", re));
+// }
+
+// TEST(RE2, Bug18391750) {
+//   // Stray write past end of match_ in nfa.cc, caught by fuzzing + address sanitizer.
+//   const char t[] = {
+//       (char)0x28, (char)0x28, (char)0xfc, (char)0xfc, (char)0x08, (char)0x08,
+//       (char)0x26, (char)0x26, (char)0x28, (char)0xc2, (char)0x9b, (char)0xc5,
+//       (char)0xc5, (char)0xd4, (char)0x8f, (char)0x8f, (char)0x69, (char)0x69,
+//       (char)0xe7, (char)0x29, (char)0x7b, (char)0x37, (char)0x31, (char)0x31,
+//       (char)0x7d, (char)0xae, (char)0x7c, (char)0x7c, (char)0xf3, (char)0x29,
+//       (char)0xae, (char)0xae, (char)0x2e, (char)0x2a, (char)0x29, (char)0x00,
+//   };
+//   RE2::Options opt;
+//   opt.set_encoding(RE2::Options::EncodingLatin1);
+//   opt.set_longest_match(true);
+//   opt.set_dot_nl(true);
+//   opt.set_case_sensitive(false);
+//   RE2 re(t, opt);
+//   ASSERT_TRUE(re.ok());
+//   RE2::PartialMatch(t, re);
+// }
+
+TEST(RE2, Bug18458852) {
+  // The parser used to accept an invalid (too large) rune, which then made
+  // the compiler fail a DCHECK in the UTF-8 character class code. The
+  // pattern below must be rejected.
+  const char pattern_bytes[] = {
+      (char)0x28, (char)0x05, (char)0x05, (char)0x41, (char)0x41, (char)0x28,
+      (char)0x24, (char)0x5b, (char)0x5e, (char)0xf5, (char)0x87, (char)0x87,
+      (char)0x90, (char)0x29, (char)0x5d, (char)0x29, (char)0x29, (char)0x00,
+  };
+  RE2 re(pattern_bytes);
+  ASSERT_FALSE(re.ok());
+}
+
+TEST(RE2, Bug18523943) {
+  // Bug in BitState: case kFailInst failed the match entirely.
+
+  // NUL-terminated text and pattern, spelled out byte by byte.
+  const char text_bytes[] = {
+      (char)0x29, (char)0x29, (char)0x24, (char)0x00,
+  };
+  const char pattern_bytes[] = {
+      (char)0x28, (char)0x0a, (char)0x2a, (char)0x2a, (char)0x29, (char)0x00,
+  };
+
+  RE2::Options opt;
+  opt.set_log_errors(false);
+  opt.set_encoding(RE2::Options::EncodingLatin1);
+  opt.set_posix_syntax(true);
+  opt.set_longest_match(true);
+  opt.set_literal(false);
+  opt.set_never_nl(true);
+
+  RE2 re((const char*)pattern_bytes, opt);
+  ASSERT_TRUE(re.ok());
+  std::string captured;
+  ASSERT_TRUE(RE2::PartialMatch((const char*)text_bytes, re, &captured));
+}
+
+TEST(RE2, Bug21371806) {
+  // The parser used to accept Unicode groups in Latin-1 mode, which made
+  // the compiler fail a DCHECK in prog.cc.
+
+  RE2::Options latin1;
+  latin1.set_encoding(RE2::Options::EncodingLatin1);
+
+  RE2 re("g\\p{Zl}]", latin1);
+  ASSERT_TRUE(re.ok());
+}
+
+// TEST(RE2, Bug26356109) {
+//   // Bug in parser caused by factoring of common prefixes in alternations.
+
+//   // In the past, this was factored to "a\\C*?[bc]". Thus, the automaton would
+//   // consume "ab" and then stop (when unanchored) whereas it should consume all
+//   // of "abc" as per first-match semantics.
+//   RE2 re("a\\C*?c|a\\C*?b");
+//   ASSERT_TRUE(re.ok());
+
+//   std::string s = "abc";
+//   StringPiece m;
+
+//   ASSERT_TRUE(re.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1));
+//   ASSERT_EQ(m, s) << " (UNANCHORED) got m='" << m << "', want '" << s << "'";
+
+//   ASSERT_TRUE(re.Match(s, 0, s.size(), RE2::ANCHOR_BOTH, &m, 1));
+//   ASSERT_EQ(m, s) << " (ANCHOR_BOTH) got m='" << m << "', want '" << s << "'";
+// }
+
+// TEST(RE2, Issue104) {
+//   // RE2::GlobalReplace always advanced by one byte when the empty string was
+//   // matched, which would clobber any rune that is longer than one byte.
+
+//   std::string s = "bc";
+//   ASSERT_EQ(3, RE2::GlobalReplace(&s, "a*", "d"));
+//   ASSERT_EQ("dbdcd", s);
+
+//   s = "ąć";
+//   ASSERT_EQ(3, RE2::GlobalReplace(&s, "Ć*", "Ĉ"));
+//   ASSERT_EQ("ĈąĈćĈ", s);
+
+
+//   s = "人类";
+//   ASSERT_EQ(3, RE2::GlobalReplace(&s, "大*", "小"));
+//   ASSERT_EQ("小人小类小", s);
+// }
+
+// TEST(RE2, Issue310) {
+//   // (?:|a)* matched more text than (?:|a)+ did.
+ +// std::string s = "aaa"; +// StringPiece m; + +// RE2 star("(?:|a)*"); +// ASSERT_TRUE(star.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); +// ASSERT_EQ(m, "") << " got m='" << m << "', want ''"; + +// RE2 plus("(?:|a)+"); +// ASSERT_TRUE(plus.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); +// ASSERT_EQ(m, "") << " got m='" << m << "', want ''"; +// } + +} // namespace re2 diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..3eeb09889ed893724d671df08bf70e3c827925b9 --- /dev/null +++ b/re2/testing/regexp_benchmark.cc @@ -0,0 +1,1570 @@ +// Copyright 2006-2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Benchmarks for regular expression implementations. + +#include +#include +#include +#include +#include +#include +#include + +#include "util/benchmark.h" +#include "util/test.h" +#include "util/flags.h" +#include "util/logging.h" +#include "util/malloc_counter.h" +#include "util/strutil.h" +#include "re2/prog.h" +#include "re2/re2.h" +#include "re2/regexp.h" +#include "util/mutex.h" +#include "util/pcre.h" + +namespace re2 { +void Test(); +void MemoryUsage(); +} // namespace re2 + +typedef testing::MallocCounter MallocCounter; + +namespace re2 { + +void Test() { + Regexp* re = Regexp::Parse("(\\d+)-(\\d+)-(\\d+)", Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + CHECK(prog->CanBitState()); + const char* text = "650-253-0001"; + StringPiece sp[4]; + CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + CHECK_EQ(sp[0], "650-253-0001"); + CHECK_EQ(sp[1], "650"); + CHECK_EQ(sp[2], "253"); + CHECK_EQ(sp[3], "0001"); + delete prog; + re->Decref(); + LOG(INFO) << "test passed\n"; +} + +void MemoryUsage() { + const char* regexp = "(\\d+)-(\\d+)-(\\d+)"; + const char* text = 
"650-253-0001"; + { + MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + // Can't pass mc.HeapGrowth() and mc.PeakHeapGrowth() to LOG(INFO) directly, + // because LOG(INFO) might do a big allocation before they get evaluated. + fprintf(stderr, "Regexp: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); + mc.Reset(); + + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + CHECK(prog->CanBitState()); + fprintf(stderr, "Prog: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); + mc.Reset(); + + StringPiece sp[4]; + CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + fprintf(stderr, "Search: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); + delete prog; + re->Decref(); + } + + { + MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); + + PCRE re(regexp, PCRE::UTF8); + fprintf(stderr, "RE: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); + PCRE::FullMatch(text, re); + fprintf(stderr, "RE: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); + } + + { + MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); + + PCRE* re = new PCRE(regexp, PCRE::UTF8); + fprintf(stderr, "PCRE*: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); + PCRE::FullMatch(text, *re); + fprintf(stderr, "PCRE*: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); + delete re; + } + + { + MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); + + RE2 re(regexp); + fprintf(stderr, "RE2: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); + RE2::FullMatch(text, re); + fprintf(stderr, "RE2: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); + } + + fprintf(stderr, "sizeof: PCRE=%zd RE2=%zd Prog=%zd Inst=%zd\n", + sizeof(PCRE), sizeof(RE2), sizeof(Prog), sizeof(Prog::Inst)); +} + +int NumCPUs() { + return 
static_cast(std::thread::hardware_concurrency()); +} + +// Regular expression implementation wrappers. +// Defined at bottom of file, but they are repetitive +// and not interesting. + +typedef void SearchImpl(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match); + +SearchImpl SearchDFA, SearchNFA, SearchOnePass, SearchBitState, SearchPCRE, + SearchRE2, SearchCachedDFA, SearchCachedNFA, SearchCachedOnePass, + SearchCachedBitState, SearchCachedPCRE, SearchCachedRE2; + +typedef void ParseImpl(benchmark::State& state, const char* regexp, + const StringPiece& text); + +ParseImpl Parse1NFA, Parse1OnePass, Parse1BitState, Parse1PCRE, Parse1RE2, + Parse1Backtrack, Parse1CachedNFA, Parse1CachedOnePass, Parse1CachedBitState, + Parse1CachedPCRE, Parse1CachedRE2, Parse1CachedBacktrack; + +ParseImpl Parse3NFA, Parse3OnePass, Parse3BitState, Parse3PCRE, Parse3RE2, + Parse3Backtrack, Parse3CachedNFA, Parse3CachedOnePass, Parse3CachedBitState, + Parse3CachedPCRE, Parse3CachedRE2, Parse3CachedBacktrack; + +ParseImpl SearchParse2CachedPCRE, SearchParse2CachedRE2; + +ParseImpl SearchParse1CachedPCRE, SearchParse1CachedRE2; + +// Benchmark: failed search for regexp in random text. + +// Generate random text that won't contain the search string, +// to test worst-case search behavior. +std::string RandomText(int64_t nbytes) { + static const std::string* const text = []() { + std::string* text = new std::string; + srand(1); + text->resize(16<<20); + for (int64_t i = 0; i < 16<<20; i++) { + // Generate a one-byte rune that isn't a control character (e.g. '\n'). + // Clipping to 0x20 introduces some bias, but we don't need uniformity. + int byte = rand() & 0x7F; + if (byte < 0x20) + byte = 0x20; + (*text)[i] = byte; + } + return text; + }(); + CHECK_LE(nbytes, 16<<20); + return text->substr(0, nbytes); +} + +// Makes text of size nbytes, then calls run to search +// the text for regexp iters times. 
+void Search(benchmark::State& state, const char* regexp, SearchImpl* search) { + std::string s = RandomText(state.range(0)); + search(state, regexp, s, Prog::kUnanchored, false); + state.SetBytesProcessed(state.iterations() * state.range(0)); +} + +// These three are easy because they have prefixes, +// giving the search loop something to prefix accel. +#define EASY0 "ABCDEFGHIJKLMNOPQRSTUVWXYZ$" +#define EASY1 "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$" +#define EASY2 "(?i)" EASY0 + +// This is a little harder, since it starts with a character class +// and thus can't be memchr'ed. Could look for ABC and work backward, +// but no one does that. +#define MEDIUM "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$" + +// This is a fair amount harder, because of the leading [ -~]*. +// A bad backtracking implementation will take O(text^2) time to +// figure out there's no match. +#define HARD "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$" + +// This has quite a high degree of fanout. +// NFA execution will be particularly slow. +#define FANOUT "(?:[\\x{80}-\\x{10FFFF}]?){100}[\\x{80}-\\x{10FFFF}]" + +// This stresses engines that are trying to track parentheses. 
+#define PARENS "([ -~])*(A)(B)(C)(D)(E)(F)(G)(H)(I)(J)(K)(L)(M)" \
+  "(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$"
+
+// Each wrapper below forwards to Search() with one pattern macro and one
+// cached engine; the BENCHMARK_RANGE lines that follow register it.
+// PCRE variants are registered only when USEPCRE is defined.
+
+// EASY0
+void Search_Easy0_CachedDFA(benchmark::State& state) { Search(state, EASY0, SearchCachedDFA); }
+void Search_Easy0_CachedNFA(benchmark::State& state) { Search(state, EASY0, SearchCachedNFA); }
+void Search_Easy0_CachedPCRE(benchmark::State& state) { Search(state, EASY0, SearchCachedPCRE); }
+void Search_Easy0_CachedRE2(benchmark::State& state) { Search(state, EASY0, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Easy0_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Easy0_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Easy0_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Easy0_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+// EASY1
+void Search_Easy1_CachedDFA(benchmark::State& state) { Search(state, EASY1, SearchCachedDFA); }
+void Search_Easy1_CachedNFA(benchmark::State& state) { Search(state, EASY1, SearchCachedNFA); }
+void Search_Easy1_CachedPCRE(benchmark::State& state) { Search(state, EASY1, SearchCachedPCRE); }
+void Search_Easy1_CachedRE2(benchmark::State& state) { Search(state, EASY1, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Easy1_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Easy1_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Easy1_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Easy1_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+// EASY2
+void Search_Easy2_CachedDFA(benchmark::State& state) { Search(state, EASY2, SearchCachedDFA); }
+void Search_Easy2_CachedNFA(benchmark::State& state) { Search(state, EASY2, SearchCachedNFA); }
+void Search_Easy2_CachedPCRE(benchmark::State& state) { Search(state, EASY2, SearchCachedPCRE); }
+void Search_Easy2_CachedRE2(benchmark::State& state) { Search(state, EASY2, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Easy2_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Easy2_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Easy2_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Easy2_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+// MEDIUM
+void Search_Medium_CachedDFA(benchmark::State& state) { Search(state, MEDIUM, SearchCachedDFA); }
+void Search_Medium_CachedNFA(benchmark::State& state) { Search(state, MEDIUM, SearchCachedNFA); }
+void Search_Medium_CachedPCRE(benchmark::State& state) { Search(state, MEDIUM, SearchCachedPCRE); }
+void Search_Medium_CachedRE2(benchmark::State& state) { Search(state, MEDIUM, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Medium_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Medium_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Medium_CachedPCRE, 8, 256<<10)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Medium_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+// HARD
+void Search_Hard_CachedDFA(benchmark::State& state) { Search(state, HARD, SearchCachedDFA); }
+void Search_Hard_CachedNFA(benchmark::State& state) { Search(state, HARD, SearchCachedNFA); }
+void Search_Hard_CachedPCRE(benchmark::State& state) { Search(state, HARD, SearchCachedPCRE); }
+void Search_Hard_CachedRE2(benchmark::State& state) { Search(state, HARD, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Hard_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Hard_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Hard_CachedPCRE, 8, 4<<10)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Hard_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+// FANOUT
+void Search_Fanout_CachedDFA(benchmark::State& state) { Search(state, FANOUT, SearchCachedDFA); }
+void Search_Fanout_CachedNFA(benchmark::State& state) { Search(state, FANOUT, SearchCachedNFA); }
+void Search_Fanout_CachedPCRE(benchmark::State& state) { Search(state, FANOUT, SearchCachedPCRE); }
+void Search_Fanout_CachedRE2(benchmark::State& state) { Search(state, FANOUT, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Fanout_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Fanout_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Fanout_CachedPCRE, 8, 4<<10)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Fanout_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+// PARENS
+void Search_Parens_CachedDFA(benchmark::State& state) { Search(state, PARENS, SearchCachedDFA); }
+void Search_Parens_CachedNFA(benchmark::State& state) { Search(state, PARENS, SearchCachedNFA); }
+void Search_Parens_CachedPCRE(benchmark::State& state) { Search(state, PARENS, SearchCachedPCRE); }
+void Search_Parens_CachedRE2(benchmark::State& state) { Search(state, PARENS, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Parens_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Parens_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Parens_CachedPCRE, 8, 8)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Parens_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+// Builds a text whose first half is a run of 'x' and whose second half is
+// random text, and a pattern "^" + that run + ".*$"; then runs an
+// unanchored search that is expected to match.
+void SearchBigFixed(benchmark::State& state, SearchImpl* search) {
+  std::string s;
+  s.append(state.range(0)/2, 'x');
+  // The pattern is built before the random half is appended to s.
+  std::string regexp = "^" + s + ".*$";
+  std::string t = RandomText(state.range(0)/2);
+  s += t;
+  search(state, regexp.c_str(), s, Prog::kUnanchored, true);
+  state.SetBytesProcessed(state.iterations() * state.range(0));
+}
+
+void Search_BigFixed_CachedDFA(benchmark::State& state) { SearchBigFixed(state, SearchCachedDFA); }
+void Search_BigFixed_CachedNFA(benchmark::State& state) { SearchBigFixed(state, SearchCachedNFA); }
+void Search_BigFixed_CachedPCRE(benchmark::State& state) { SearchBigFixed(state, SearchCachedPCRE); }
+void Search_BigFixed_CachedRE2(benchmark::State& state) { SearchBigFixed(state, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_BigFixed_CachedDFA, 8, 1<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_BigFixed_CachedNFA, 8, 32<<10)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_BigFixed_CachedPCRE, 8, 32<<10)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_BigFixed_CachedRE2, 8, 1<<20)->ThreadRange(1, NumCPUs());
+
+// Benchmark: FindAndConsume
+
+// Appends "Hello World" to random text and, on every iteration, checks that
+// RE2::FindAndConsume finds it and captures exactly that string.
+void FindAndConsume(benchmark::State& state) {
+  std::string s = RandomText(state.range(0));
+  s.append("Hello World");
+  RE2 re("((Hello World))");
+  for (auto _ : state) {
+    // Fresh view each iteration: FindAndConsume is handed a pointer to t.
+    StringPiece t = s;
+    StringPiece u;
+    CHECK(RE2::FindAndConsume(&t, re, &u));
+    CHECK_EQ(u, "Hello World");
+  }
+  state.SetBytesProcessed(state.iterations() * state.range(0));
+}
+
+BENCHMARK_RANGE(FindAndConsume, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+// Benchmark: successful anchored search.
+
+// Runs `search` anchored over random text of state.range(0) bytes with a
+// pattern that is expected to match.
+void SearchSuccess(benchmark::State& state, const char* regexp,
+                   SearchImpl* search) {
+  std::string s = RandomText(state.range(0));
+  search(state, regexp, s, Prog::kAnchored, true);
+  state.SetBytesProcessed(state.iterations() * state.range(0));
+}
+
+// Unambiguous search (RE2 can use OnePass).
+
+// Uncached engines: each wrapper passes ".*$" and one engine to
+// SearchSuccess().
+void Search_Success_DFA(benchmark::State& state) { SearchSuccess(state, ".*$", SearchDFA); }
+void Search_Success_NFA(benchmark::State& state) { SearchSuccess(state, ".*$", SearchNFA); }
+void Search_Success_PCRE(benchmark::State& state) { SearchSuccess(state, ".*$", SearchPCRE); }
+void Search_Success_RE2(benchmark::State& state) { SearchSuccess(state, ".*$", SearchRE2); }
+void Search_Success_OnePass(benchmark::State& state) { SearchSuccess(state, ".*$", SearchOnePass); }
+
+// The PCRE variant is registered only when USEPCRE is defined.
+BENCHMARK_RANGE(Search_Success_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success_NFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Success_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Success_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success_OnePass, 8, 2<<20)->ThreadRange(1, NumCPUs());
+
+// Cached engines, same pattern.
+void Search_Success_CachedDFA(benchmark::State& state) { SearchSuccess(state, ".*$", SearchCachedDFA); }
+void Search_Success_CachedNFA(benchmark::State& state) { SearchSuccess(state, ".*$", SearchCachedNFA); }
+void Search_Success_CachedPCRE(benchmark::State& state) { SearchSuccess(state, ".*$", SearchCachedPCRE); }
+void Search_Success_CachedRE2(benchmark::State& state) { SearchSuccess(state, ".*$", SearchCachedRE2); }
+void Search_Success_CachedOnePass(benchmark::State& state) { SearchSuccess(state, ".*$", SearchCachedOnePass); }
+
+BENCHMARK_RANGE(Search_Success_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success_CachedNFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Success_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Success_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success_CachedOnePass, 8, 2<<20)->ThreadRange(1, NumCPUs());
+
+// Ambiguous search (RE2 cannot use OnePass).
+// Used to be ".*.$", but that is coalesced to ".+$" these days.
+
+// Uncached engines: each wrapper passes ".*\\C$" and one engine to
+// SearchSuccess().
+void Search_Success1_DFA(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchDFA); }
+void Search_Success1_NFA(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchNFA); }
+void Search_Success1_PCRE(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchPCRE); }
+void Search_Success1_RE2(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchRE2); }
+void Search_Success1_BitState(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchBitState); }
+
+// The PCRE variant is registered only when USEPCRE is defined.
+BENCHMARK_RANGE(Search_Success1_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success1_NFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Success1_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Success1_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success1_BitState, 8, 2<<20)->ThreadRange(1, NumCPUs());
+
+// Cached engines, same pattern.
+void Search_Success1_CachedDFA(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchCachedDFA); }
+void Search_Success1_CachedNFA(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchCachedNFA); }
+void Search_Success1_CachedPCRE(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchCachedPCRE); }
+void Search_Success1_CachedRE2(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchCachedRE2); }
+void Search_Success1_CachedBitState(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchCachedBitState); }
+
+BENCHMARK_RANGE(Search_Success1_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success1_CachedNFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Success1_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Success1_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success1_CachedBitState, 8, 2<<20)->ThreadRange(1, NumCPUs());
+
+// Benchmark: AltMatch optimisation (just to verify that it works)
+// Note that OnePass doesn't implement it!
+
+// Runs `search` anchored with the pattern "\\C*" over random text of
+// state.range(0) bytes; a match is expected.
+void SearchAltMatch(benchmark::State& state, SearchImpl* search) {
+  std::string s = RandomText(state.range(0));
+  search(state, "\\C*", s, Prog::kAnchored, true);
+  state.SetBytesProcessed(state.iterations() * state.range(0));
+}
+
+// Uncached engines.
+void Search_AltMatch_DFA(benchmark::State& state) { SearchAltMatch(state, SearchDFA); }
+void Search_AltMatch_NFA(benchmark::State& state) { SearchAltMatch(state, SearchNFA); }
+void Search_AltMatch_OnePass(benchmark::State& state) { SearchAltMatch(state, SearchOnePass); }
+void Search_AltMatch_BitState(benchmark::State& state) { SearchAltMatch(state, SearchBitState); }
+void Search_AltMatch_PCRE(benchmark::State& state) { SearchAltMatch(state, SearchPCRE); }
+void Search_AltMatch_RE2(benchmark::State& state) { SearchAltMatch(state, SearchRE2); }
+
+// The PCRE variant is registered only when USEPCRE is defined.
+BENCHMARK_RANGE(Search_AltMatch_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_AltMatch_NFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_AltMatch_OnePass, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_AltMatch_BitState, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_AltMatch_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_AltMatch_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+// Cached engines.
+void Search_AltMatch_CachedDFA(benchmark::State& state) { SearchAltMatch(state, SearchCachedDFA); }
+void Search_AltMatch_CachedNFA(benchmark::State& state) { SearchAltMatch(state, SearchCachedNFA); }
+void Search_AltMatch_CachedOnePass(benchmark::State& state) { SearchAltMatch(state, SearchCachedOnePass); }
+void Search_AltMatch_CachedBitState(benchmark::State& state) { SearchAltMatch(state, SearchCachedBitState); }
+void Search_AltMatch_CachedPCRE(benchmark::State& state) { SearchAltMatch(state, SearchCachedPCRE); }
+void Search_AltMatch_CachedRE2(benchmark::State& state) { SearchAltMatch(state, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_AltMatch_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_AltMatch_CachedNFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_AltMatch_CachedOnePass, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_AltMatch_CachedBitState, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_AltMatch_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_AltMatch_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +// Benchmark: use regexp to find phone number. + +void SearchDigits(benchmark::State& state, SearchImpl* search) { + StringPiece s("650-253-0001"); + search(state, "([0-9]+)-([0-9]+)-([0-9]+)", s, Prog::kAnchored, true); + state.SetItemsProcessed(state.iterations()); +} + +void Search_Digits_DFA(benchmark::State& state) { SearchDigits(state, SearchDFA); } +void Search_Digits_NFA(benchmark::State& state) { SearchDigits(state, SearchNFA); } +void Search_Digits_OnePass(benchmark::State& state) { SearchDigits(state, SearchOnePass); } +void Search_Digits_PCRE(benchmark::State& state) { SearchDigits(state, SearchPCRE); } +void Search_Digits_RE2(benchmark::State& state) { SearchDigits(state, SearchRE2); } +void Search_Digits_BitState(benchmark::State& state) { SearchDigits(state, SearchBitState); } + +BENCHMARK(Search_Digits_DFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Search_Digits_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Search_Digits_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Search_Digits_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Search_Digits_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Search_Digits_BitState)->ThreadRange(1, NumCPUs()); + +// Benchmark: use regexp to parse digit fields in phone number. 
+ +void Parse3Digits(benchmark::State& state, + void (*parse3)(benchmark::State&, const char*, + const StringPiece&)) { + parse3(state, "([0-9]+)-([0-9]+)-([0-9]+)", "650-253-0001"); + state.SetItemsProcessed(state.iterations()); +} + +void Parse_Digits_NFA(benchmark::State& state) { Parse3Digits(state, Parse3NFA); } +void Parse_Digits_OnePass(benchmark::State& state) { Parse3Digits(state, Parse3OnePass); } +void Parse_Digits_PCRE(benchmark::State& state) { Parse3Digits(state, Parse3PCRE); } +void Parse_Digits_RE2(benchmark::State& state) { Parse3Digits(state, Parse3RE2); } +void Parse_Digits_Backtrack(benchmark::State& state) { Parse3Digits(state, Parse3Backtrack); } +void Parse_Digits_BitState(benchmark::State& state) { Parse3Digits(state, Parse3BitState); } + +BENCHMARK(Parse_Digits_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_Digits_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_Digits_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_Digits_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_Digits_Backtrack)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_Digits_BitState)->ThreadRange(1, NumCPUs()); + +void Parse_CachedDigits_NFA(benchmark::State& state) { Parse3Digits(state, Parse3CachedNFA); } +void Parse_CachedDigits_OnePass(benchmark::State& state) { Parse3Digits(state, Parse3CachedOnePass); } +void Parse_CachedDigits_PCRE(benchmark::State& state) { Parse3Digits(state, Parse3CachedPCRE); } +void Parse_CachedDigits_RE2(benchmark::State& state) { Parse3Digits(state, Parse3CachedRE2); } +void Parse_CachedDigits_Backtrack(benchmark::State& state) { Parse3Digits(state, Parse3CachedBacktrack); } +void Parse_CachedDigits_BitState(benchmark::State& state) { Parse3Digits(state, Parse3CachedBitState); } + +BENCHMARK(Parse_CachedDigits_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigits_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_CachedDigits_PCRE)->ThreadRange(1, NumCPUs()); +#endif 
+BENCHMARK(Parse_CachedDigits_Backtrack)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigits_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigits_BitState)->ThreadRange(1, NumCPUs()); + +void Parse3DigitDs(benchmark::State& state, + void (*parse3)(benchmark::State&, const char*, + const StringPiece&)) { + parse3(state, "(\\d+)-(\\d+)-(\\d+)", "650-253-0001"); + state.SetItemsProcessed(state.iterations()); +} + +void Parse_DigitDs_NFA(benchmark::State& state) { Parse3DigitDs(state, Parse3NFA); } +void Parse_DigitDs_OnePass(benchmark::State& state) { Parse3DigitDs(state, Parse3OnePass); } +void Parse_DigitDs_PCRE(benchmark::State& state) { Parse3DigitDs(state, Parse3PCRE); } +void Parse_DigitDs_RE2(benchmark::State& state) { Parse3DigitDs(state, Parse3RE2); } +void Parse_DigitDs_Backtrack(benchmark::State& state) { Parse3DigitDs(state, Parse3CachedBacktrack); } +void Parse_DigitDs_BitState(benchmark::State& state) { Parse3DigitDs(state, Parse3CachedBitState); } + +BENCHMARK(Parse_DigitDs_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_DigitDs_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_DigitDs_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_DigitDs_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_DigitDs_Backtrack)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_DigitDs_BitState)->ThreadRange(1, NumCPUs()); + +void Parse_CachedDigitDs_NFA(benchmark::State& state) { Parse3DigitDs(state, Parse3CachedNFA); } +void Parse_CachedDigitDs_OnePass(benchmark::State& state) { Parse3DigitDs(state, Parse3CachedOnePass); } +void Parse_CachedDigitDs_PCRE(benchmark::State& state) { Parse3DigitDs(state, Parse3CachedPCRE); } +void Parse_CachedDigitDs_RE2(benchmark::State& state) { Parse3DigitDs(state, Parse3CachedRE2); } +void Parse_CachedDigitDs_Backtrack(benchmark::State& state) { Parse3DigitDs(state, Parse3CachedBacktrack); } +void Parse_CachedDigitDs_BitState(benchmark::State& state) { Parse3DigitDs(state, Parse3CachedBitState); 
} + +BENCHMARK(Parse_CachedDigitDs_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigitDs_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_CachedDigitDs_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_CachedDigitDs_Backtrack)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigitDs_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigitDs_BitState)->ThreadRange(1, NumCPUs()); + +// Benchmark: splitting off leading number field. + +void Parse1Split(benchmark::State& state, + void (*parse1)(benchmark::State&, const char*, + const StringPiece&)) { + parse1(state, "[0-9]+-(.*)", "650-253-0001"); + state.SetItemsProcessed(state.iterations()); +} + +void Parse_Split_NFA(benchmark::State& state) { Parse1Split(state, Parse1NFA); } +void Parse_Split_OnePass(benchmark::State& state) { Parse1Split(state, Parse1OnePass); } +void Parse_Split_PCRE(benchmark::State& state) { Parse1Split(state, Parse1PCRE); } +void Parse_Split_RE2(benchmark::State& state) { Parse1Split(state, Parse1RE2); } +void Parse_Split_BitState(benchmark::State& state) { Parse1Split(state, Parse1BitState); } + +BENCHMARK(Parse_Split_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_Split_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_Split_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_Split_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_Split_BitState)->ThreadRange(1, NumCPUs()); + +void Parse_CachedSplit_NFA(benchmark::State& state) { Parse1Split(state, Parse1CachedNFA); } +void Parse_CachedSplit_OnePass(benchmark::State& state) { Parse1Split(state, Parse1CachedOnePass); } +void Parse_CachedSplit_PCRE(benchmark::State& state) { Parse1Split(state, Parse1CachedPCRE); } +void Parse_CachedSplit_RE2(benchmark::State& state) { Parse1Split(state, Parse1CachedRE2); } +void Parse_CachedSplit_BitState(benchmark::State& state) { Parse1Split(state, Parse1CachedBitState); } + +BENCHMARK(Parse_CachedSplit_NFA)->ThreadRange(1, 
NumCPUs()); +BENCHMARK(Parse_CachedSplit_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_CachedSplit_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_CachedSplit_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedSplit_BitState)->ThreadRange(1, NumCPUs()); + +// Benchmark: splitting off leading number field but harder (ambiguous regexp). + +void Parse1SplitHard(benchmark::State& state, + void (*run)(benchmark::State&, const char*, + const StringPiece&)) { + run(state, "[0-9]+.(.*)", "650-253-0001"); + state.SetItemsProcessed(state.iterations()); +} + +void Parse_SplitHard_NFA(benchmark::State& state) { Parse1SplitHard(state, Parse1NFA); } +void Parse_SplitHard_PCRE(benchmark::State& state) { Parse1SplitHard(state, Parse1PCRE); } +void Parse_SplitHard_RE2(benchmark::State& state) { Parse1SplitHard(state, Parse1RE2); } +void Parse_SplitHard_BitState(benchmark::State& state) { Parse1SplitHard(state, Parse1BitState); } + +#ifdef USEPCRE +BENCHMARK(Parse_SplitHard_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_SplitHard_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_SplitHard_BitState)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_SplitHard_NFA)->ThreadRange(1, NumCPUs()); + +void Parse_CachedSplitHard_NFA(benchmark::State& state) { Parse1SplitHard(state, Parse1CachedNFA); } +void Parse_CachedSplitHard_PCRE(benchmark::State& state) { Parse1SplitHard(state, Parse1CachedPCRE); } +void Parse_CachedSplitHard_RE2(benchmark::State& state) { Parse1SplitHard(state, Parse1CachedRE2); } +void Parse_CachedSplitHard_BitState(benchmark::State& state) { Parse1SplitHard(state, Parse1CachedBitState); } +void Parse_CachedSplitHard_Backtrack(benchmark::State& state) { Parse1SplitHard(state, Parse1CachedBacktrack); } + +#ifdef USEPCRE +BENCHMARK(Parse_CachedSplitHard_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_CachedSplitHard_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedSplitHard_BitState)->ThreadRange(1, 
NumCPUs()); +BENCHMARK(Parse_CachedSplitHard_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedSplitHard_Backtrack)->ThreadRange(1, NumCPUs()); + +// Benchmark: Parse1SplitHard, big text, small match. + +void Parse1SplitBig1(benchmark::State& state, + void (*run)(benchmark::State&, const char*, + const StringPiece&)) { + std::string s; + s.append(100000, 'x'); + s.append("650-253-0001"); + run(state, "[0-9]+.(.*)", s); + state.SetItemsProcessed(state.iterations()); +} + +void Parse_CachedSplitBig1_PCRE(benchmark::State& state) { Parse1SplitBig1(state, SearchParse1CachedPCRE); } +void Parse_CachedSplitBig1_RE2(benchmark::State& state) { Parse1SplitBig1(state, SearchParse1CachedRE2); } + +#ifdef USEPCRE +BENCHMARK(Parse_CachedSplitBig1_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_CachedSplitBig1_RE2)->ThreadRange(1, NumCPUs()); + +// Benchmark: Parse1SplitHard, big text, big match. + +void Parse1SplitBig2(benchmark::State& state, + void (*run)(benchmark::State&, const char*, + const StringPiece&)) { + std::string s; + s.append("650-253-"); + s.append(100000, '0'); + run(state, "[0-9]+.(.*)", s); + state.SetItemsProcessed(state.iterations()); +} + +void Parse_CachedSplitBig2_PCRE(benchmark::State& state) { Parse1SplitBig2(state, SearchParse1CachedPCRE); } +void Parse_CachedSplitBig2_RE2(benchmark::State& state) { Parse1SplitBig2(state, SearchParse1CachedRE2); } + +#ifdef USEPCRE +BENCHMARK(Parse_CachedSplitBig2_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_CachedSplitBig2_RE2)->ThreadRange(1, NumCPUs()); + +// Benchmark: measure time required to parse (but not execute) +// a simple regular expression. 
+ +void ParseRegexp(benchmark::State& state, const std::string& regexp) { + for (auto _ : state) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + re->Decref(); + } +} + +void SimplifyRegexp(benchmark::State& state, const std::string& regexp) { + for (auto _ : state) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Regexp* sre = re->Simplify(); + CHECK(sre); + sre->Decref(); + re->Decref(); + } +} + +void NullWalkRegexp(benchmark::State& state, const std::string& regexp) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + for (auto _ : state) { + re->NullWalk(); + } + re->Decref(); +} + +void SimplifyCompileRegexp(benchmark::State& state, const std::string& regexp) { + for (auto _ : state) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Regexp* sre = re->Simplify(); + CHECK(sre); + Prog* prog = sre->CompileToProg(0); + CHECK(prog); + delete prog; + sre->Decref(); + re->Decref(); + } +} + +void CompileRegexp(benchmark::State& state, const std::string& regexp) { + for (auto _ : state) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + delete prog; + re->Decref(); + } +} + +void CompileToProg(benchmark::State& state, const std::string& regexp) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + for (auto _ : state) { + Prog* prog = re->CompileToProg(0); + CHECK(prog); + delete prog; + } + re->Decref(); +} + +void CompileByteMap(benchmark::State& state, const std::string& regexp) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + for (auto _ : state) { + prog->ComputeByteMap(); + } + delete prog; + re->Decref(); +} + +void CompilePCRE(benchmark::State& state, const std::string& regexp) { + for (auto _ : state) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); 
+ } +} + +void CompileRE2(benchmark::State& state, const std::string& regexp) { + for (auto _ : state) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + } +} + +void RunBuild(benchmark::State& state, const std::string& regexp, + void (*run)(benchmark::State&, const std::string&)) { + run(state, regexp); + state.SetItemsProcessed(state.iterations()); +} + +} // namespace re2 + +DEFINE_FLAG(std::string, compile_regexp, "(.*)-(\\d+)-of-(\\d+)", + "regexp for compile benchmarks"); + +namespace re2 { + +void BM_PCRE_Compile(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompilePCRE); } +void BM_Regexp_Parse(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), ParseRegexp); } +void BM_Regexp_Simplify(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), SimplifyRegexp); } +void BM_CompileToProg(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompileToProg); } +void BM_CompileByteMap(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompileByteMap); } +void BM_Regexp_Compile(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompileRegexp); } +void BM_Regexp_SimplifyCompile(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), SimplifyCompileRegexp); } +void BM_Regexp_NullWalk(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), NullWalkRegexp); } +void BM_RE2_Compile(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompileRE2); } + +#ifdef USEPCRE +BENCHMARK(BM_PCRE_Compile)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(BM_Regexp_Parse)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_Regexp_Simplify)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_CompileToProg)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_CompileByteMap)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_Regexp_Compile)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_Regexp_SimplifyCompile)->ThreadRange(1, NumCPUs()); 
+BENCHMARK(BM_Regexp_NullWalk)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_RE2_Compile)->ThreadRange(1, NumCPUs()); + +// Makes text of size nbytes, then calls run to search +// the text for regexp iters times. +void SearchPhone(benchmark::State& state, ParseImpl* search) { + std::string s = RandomText(state.range(0)); + s.append("(650) 253-0001"); + search(state, "(\\d{3}-|\\(\\d{3}\\)\\s+)(\\d{3}-\\d{4})", s); + state.SetBytesProcessed(state.iterations() * state.range(0)); +} + +void SearchPhone_CachedPCRE(benchmark::State& state) { + SearchPhone(state, SearchParse2CachedPCRE); +} + +void SearchPhone_CachedRE2(benchmark::State& state) { + SearchPhone(state, SearchParse2CachedRE2); +} + +#ifdef USEPCRE +BENCHMARK_RANGE(SearchPhone_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(SearchPhone_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +/* +TODO(rsc): Make this work again. +void CacheFill(int iters, int n, SearchImpl *srch) { + std::string s = DeBruijnString(n+1); + std::string t; + for (int i = n+1; i < 20; i++) { + t = s + s; + using std::swap; + swap(s, t); + } + srch(iters, StringPrintf("0[01]{%d}$", n).c_str(), s, + Prog::kUnanchored, true); + SetBenchmarkBytesProcessed(static_cast(iters)*s.size()); +} + +void CacheFillPCRE(int i, int n) { CacheFill(i, n, SearchCachedPCRE); } +void CacheFillRE2(int i, int n) { CacheFill(i, n, SearchCachedRE2); } +void CacheFillNFA(int i, int n) { CacheFill(i, n, SearchCachedNFA); } +void CacheFillDFA(int i, int n) { CacheFill(i, n, SearchCachedDFA); } + +// BENCHMARK_WITH_ARG uses __LINE__ to generate distinct identifiers +// for the static BenchmarkRegisterer, which makes it unusable inside +// a macro like DO24 below. MY_BENCHMARK_WITH_ARG uses the argument a +// to make the identifiers distinct (only possible when 'a' is a simple +// expression like 2, not like 1+1). 
+#define MY_BENCHMARK_WITH_ARG(n, a) \ + bool __benchmark_ ## n ## a = \ + (new ::testing::Benchmark(#n, NewPermanentCallback(&n)))->ThreadRange(1, NumCPUs()); + +#define DO24(A, B) \ + A(B, 1); A(B, 2); A(B, 3); A(B, 4); A(B, 5); A(B, 6); \ + A(B, 7); A(B, 8); A(B, 9); A(B, 10); A(B, 11); A(B, 12); \ + A(B, 13); A(B, 14); A(B, 15); A(B, 16); A(B, 17); A(B, 18); \ + A(B, 19); A(B, 20); A(B, 21); A(B, 22); A(B, 23); A(B, 24); + +DO24(MY_BENCHMARK_WITH_ARG, CacheFillPCRE) +DO24(MY_BENCHMARK_WITH_ARG, CacheFillNFA) +DO24(MY_BENCHMARK_WITH_ARG, CacheFillRE2) +DO24(MY_BENCHMARK_WITH_ARG, CacheFillDFA) + +#undef DO24 +#undef MY_BENCHMARK_WITH_ARG +*/ + +//////////////////////////////////////////////////////////////////////// +// +// Implementation routines. Sad that there are so many, +// but all the interfaces are slightly different. + +// Runs implementation to search for regexp in text, iters times. +// Expect_match says whether the regexp should be found. +// Anchored says whether to run an anchored search. 
+ +void SearchDFA(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + for (auto _ : state) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + bool failed = false; + CHECK_EQ(prog->SearchDFA(text, StringPiece(), anchor, Prog::kFirstMatch, + NULL, &failed, NULL), + expect_match); + CHECK(!failed); + delete prog; + re->Decref(); + } +} + +void SearchNFA(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + for (auto _ : state) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK_EQ(prog->SearchNFA(text, StringPiece(), anchor, Prog::kFirstMatch, + NULL, 0), + expect_match); + delete prog; + re->Decref(); + } +} + +void SearchOnePass(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + for (auto _ : state) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + CHECK_EQ(prog->SearchOnePass(text, text, anchor, Prog::kFirstMatch, NULL, 0), + expect_match); + delete prog; + re->Decref(); + } +} + +void SearchBitState(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + for (auto _ : state) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->CanBitState()); + CHECK_EQ(prog->SearchBitState(text, text, anchor, Prog::kFirstMatch, NULL, 0), + expect_match); + delete prog; + re->Decref(); + } +} + +void SearchPCRE(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + for (auto _ : state) { + PCRE re(regexp, 
PCRE::UTF8); + CHECK_EQ(re.error(), ""); + if (anchor == Prog::kAnchored) + CHECK_EQ(PCRE::FullMatch(text, re), expect_match); + else + CHECK_EQ(PCRE::PartialMatch(text, re), expect_match); + } +} + +void SearchRE2(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + for (auto _ : state) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + if (anchor == Prog::kAnchored) + CHECK_EQ(RE2::FullMatch(text, re), expect_match); + else + CHECK_EQ(RE2::PartialMatch(text, re), expect_match); + } +} + +// SearchCachedXXX is like SearchXXX but only does the +// regexp parsing and compiling once. This lets us measure +// search time without the per-regexp overhead. + +Prog* GetCachedProg(const char* regexp) { + static auto& mutex = *new Mutex; + MutexLock lock(&mutex); + static auto& cache = *new std::unordered_map; + Prog* prog = cache[regexp]; + if (prog == NULL) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + prog = re->CompileToProg(int64_t{1}<<31); // mostly for the DFA + CHECK(prog); + cache[regexp] = prog; + re->Decref(); + // We must call this here - while we have exclusive access. 
+ prog->IsOnePass(); + } + return prog; +} + +PCRE* GetCachedPCRE(const char* regexp) { + static auto& mutex = *new Mutex; + MutexLock lock(&mutex); + static auto& cache = *new std::unordered_map; + PCRE* re = cache[regexp]; + if (re == NULL) { + re = new PCRE(regexp, PCRE::UTF8); + CHECK_EQ(re->error(), ""); + cache[regexp] = re; + } + return re; +} + +RE2* GetCachedRE2(const char* regexp) { + static auto& mutex = *new Mutex; + MutexLock lock(&mutex); + static auto& cache = *new std::unordered_map; + RE2* re = cache[regexp]; + if (re == NULL) { + re = new RE2(regexp); + CHECK_EQ(re->error(), ""); + cache[regexp] = re; + } + return re; +} + +void SearchCachedDFA(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + Prog* prog = GetCachedProg(regexp); + for (auto _ : state) { + bool failed = false; + CHECK_EQ(prog->SearchDFA(text, StringPiece(), anchor, Prog::kFirstMatch, + NULL, &failed, NULL), + expect_match); + CHECK(!failed); + } +} + +void SearchCachedNFA(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + Prog* prog = GetCachedProg(regexp); + for (auto _ : state) { + CHECK_EQ(prog->SearchNFA(text, StringPiece(), anchor, Prog::kFirstMatch, + NULL, 0), + expect_match); + } +} + +void SearchCachedOnePass(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + Prog* prog = GetCachedProg(regexp); + CHECK(prog->IsOnePass()); + for (auto _ : state) { + CHECK_EQ(prog->SearchOnePass(text, text, anchor, Prog::kFirstMatch, NULL, 0), + expect_match); + } +} + +void SearchCachedBitState(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + Prog* prog = GetCachedProg(regexp); + CHECK(prog->CanBitState()); + for (auto _ : state) { + CHECK_EQ(prog->SearchBitState(text, text, anchor, Prog::kFirstMatch, NULL, 0), + 
expect_match); + } +} + +void SearchCachedPCRE(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + PCRE& re = *GetCachedPCRE(regexp); + for (auto _ : state) { + if (anchor == Prog::kAnchored) + CHECK_EQ(PCRE::FullMatch(text, re), expect_match); + else + CHECK_EQ(PCRE::PartialMatch(text, re), expect_match); + } +} + +void SearchCachedRE2(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + RE2& re = *GetCachedRE2(regexp); + for (auto _ : state) { + if (anchor == Prog::kAnchored) + CHECK_EQ(RE2::FullMatch(text, re), expect_match); + else + CHECK_EQ(RE2::PartialMatch(text, re), expect_match); + } +} + +// Runs implementation to full match regexp against text, +// extracting three submatches. Expects match always. + +void Parse3NFA(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[4]; // 4 because sp[0] is whole match. + CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored, + Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); + } +} + +void Parse3OnePass(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + StringPiece sp[4]; // 4 because sp[0] is whole match. 
+ CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); + } +} + +void Parse3BitState(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->CanBitState()); + StringPiece sp[4]; // 4 because sp[0] is whole match. + CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); + } +} + +void Parse3Backtrack(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[4]; // 4 because sp[0] is whole match. + CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); + } +} + +void Parse3PCRE(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + StringPiece sp1, sp2, sp3; + CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3)); + } +} + +void Parse3RE2(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + StringPiece sp1, sp2, sp3; + CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3)); + } +} + +void Parse3CachedNFA(benchmark::State& state, const char* regexp, + const StringPiece& text) { + Prog* prog = GetCachedProg(regexp); + StringPiece sp[4]; // 4 because sp[0] is whole match. 
+ for (auto _ : state) { + CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored, + Prog::kFullMatch, sp, 4)); + } +} + +void Parse3CachedOnePass(benchmark::State& state, const char* regexp, + const StringPiece& text) { + Prog* prog = GetCachedProg(regexp); + CHECK(prog->IsOnePass()); + StringPiece sp[4]; // 4 because sp[0] is whole match. + for (auto _ : state) { + CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + } +} + +void Parse3CachedBitState(benchmark::State& state, const char* regexp, + const StringPiece& text) { + Prog* prog = GetCachedProg(regexp); + CHECK(prog->CanBitState()); + StringPiece sp[4]; // 4 because sp[0] is whole match. + for (auto _ : state) { + CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + } +} + +void Parse3CachedBacktrack(benchmark::State& state, const char* regexp, + const StringPiece& text) { + Prog* prog = GetCachedProg(regexp); + StringPiece sp[4]; // 4 because sp[0] is whole match. + for (auto _ : state) { + CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + } +} + +void Parse3CachedPCRE(benchmark::State& state, const char* regexp, + const StringPiece& text) { + PCRE& re = *GetCachedPCRE(regexp); + StringPiece sp1, sp2, sp3; + for (auto _ : state) { + CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3)); + } +} + +void Parse3CachedRE2(benchmark::State& state, const char* regexp, + const StringPiece& text) { + RE2& re = *GetCachedRE2(regexp); + StringPiece sp1, sp2, sp3; + for (auto _ : state) { + CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3)); + } +} + +// Runs implementation to full match regexp against text, +// extracting three submatches. Expects match always. 
+ +void Parse1NFA(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[2]; // 2 because sp[0] is whole match. + CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored, + Prog::kFullMatch, sp, 2)); + delete prog; + re->Decref(); + } +} + +void Parse1OnePass(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + StringPiece sp[2]; // 2 because sp[0] is whole match. + CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + delete prog; + re->Decref(); + } +} + +void Parse1BitState(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->CanBitState()); + StringPiece sp[2]; // 2 because sp[0] is whole match. 
+ CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + delete prog; + re->Decref(); + } +} + +void Parse1PCRE(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + StringPiece sp1; + CHECK(PCRE::FullMatch(text, re, &sp1)); + } +} + +void Parse1RE2(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + StringPiece sp1; + CHECK(RE2::FullMatch(text, re, &sp1)); + } +} + +void Parse1CachedNFA(benchmark::State& state, const char* regexp, + const StringPiece& text) { + Prog* prog = GetCachedProg(regexp); + StringPiece sp[2]; // 2 because sp[0] is whole match. + for (auto _ : state) { + CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored, + Prog::kFullMatch, sp, 2)); + } +} + +void Parse1CachedOnePass(benchmark::State& state, const char* regexp, + const StringPiece& text) { + Prog* prog = GetCachedProg(regexp); + CHECK(prog->IsOnePass()); + StringPiece sp[2]; // 2 because sp[0] is whole match. + for (auto _ : state) { + CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + } +} + +void Parse1CachedBitState(benchmark::State& state, const char* regexp, + const StringPiece& text) { + Prog* prog = GetCachedProg(regexp); + CHECK(prog->CanBitState()); + StringPiece sp[2]; // 2 because sp[0] is whole match. + for (auto _ : state) { + CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + } +} + +void Parse1CachedBacktrack(benchmark::State& state, const char* regexp, + const StringPiece& text) { + Prog* prog = GetCachedProg(regexp); + StringPiece sp[2]; // 2 because sp[0] is whole match. 
+ for (auto _ : state) { + CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + } +} + +void Parse1CachedPCRE(benchmark::State& state, const char* regexp, + const StringPiece& text) { + PCRE& re = *GetCachedPCRE(regexp); + StringPiece sp1; + for (auto _ : state) { + CHECK(PCRE::FullMatch(text, re, &sp1)); + } +} + +void Parse1CachedRE2(benchmark::State& state, const char* regexp, + const StringPiece& text) { + RE2& re = *GetCachedRE2(regexp); + StringPiece sp1; + for (auto _ : state) { + CHECK(RE2::FullMatch(text, re, &sp1)); + } +} + +void SearchParse2CachedPCRE(benchmark::State& state, const char* regexp, + const StringPiece& text) { + PCRE& re = *GetCachedPCRE(regexp); + for (auto _ : state) { + StringPiece sp1, sp2; + CHECK(PCRE::PartialMatch(text, re, &sp1, &sp2)); + } +} + +void SearchParse2CachedRE2(benchmark::State& state, const char* regexp, + const StringPiece& text) { + RE2& re = *GetCachedRE2(regexp); + for (auto _ : state) { + StringPiece sp1, sp2; + CHECK(RE2::PartialMatch(text, re, &sp1, &sp2)); + } +} + +void SearchParse1CachedPCRE(benchmark::State& state, const char* regexp, + const StringPiece& text) { + PCRE& re = *GetCachedPCRE(regexp); + for (auto _ : state) { + StringPiece sp1; + CHECK(PCRE::PartialMatch(text, re, &sp1)); + } +} + +void SearchParse1CachedRE2(benchmark::State& state, const char* regexp, + const StringPiece& text) { + RE2& re = *GetCachedRE2(regexp); + for (auto _ : state) { + StringPiece sp1; + CHECK(RE2::PartialMatch(text, re, &sp1)); + } +} + +void EmptyPartialMatchPCRE(benchmark::State& state) { + PCRE re(""); + for (auto _ : state) { + PCRE::PartialMatch("", re); + } +} + +void EmptyPartialMatchRE2(benchmark::State& state) { + RE2 re(""); + for (auto _ : state) { + RE2::PartialMatch("", re); + } +} +#ifdef USEPCRE +BENCHMARK(EmptyPartialMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(EmptyPartialMatchRE2)->ThreadRange(1, NumCPUs()); + +void 
SimplePartialMatchPCRE(benchmark::State& state) { + PCRE re("abcdefg"); + for (auto _ : state) { + PCRE::PartialMatch("abcdefg", re); + } +} + +void SimplePartialMatchRE2(benchmark::State& state) { + RE2 re("abcdefg"); + for (auto _ : state) { + RE2::PartialMatch("abcdefg", re); + } +} +#ifdef USEPCRE +BENCHMARK(SimplePartialMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(SimplePartialMatchRE2)->ThreadRange(1, NumCPUs()); + +static std::string http_text = + "GET /asdfhjasdhfasdlfhasdflkjasdfkljasdhflaskdjhf" + "alksdjfhasdlkfhasdlkjfhasdljkfhadsjklf HTTP/1.1"; + +void HTTPPartialMatchPCRE(benchmark::State& state) { + StringPiece a; + PCRE re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); + for (auto _ : state) { + PCRE::PartialMatch(http_text, re, &a); + } +} + +void HTTPPartialMatchRE2(benchmark::State& state) { + StringPiece a; + RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); + for (auto _ : state) { + RE2::PartialMatch(http_text, re, &a); + } +} + +#ifdef USEPCRE +BENCHMARK(HTTPPartialMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(HTTPPartialMatchRE2)->ThreadRange(1, NumCPUs()); + +static std::string smallhttp_text = + "GET /abc HTTP/1.1"; + +void SmallHTTPPartialMatchPCRE(benchmark::State& state) { + StringPiece a; + PCRE re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); + for (auto _ : state) { + PCRE::PartialMatch(smallhttp_text, re, &a); + } +} + +void SmallHTTPPartialMatchRE2(benchmark::State& state) { + StringPiece a; + RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); + for (auto _ : state) { + RE2::PartialMatch(smallhttp_text, re, &a); + } +} + +#ifdef USEPCRE +BENCHMARK(SmallHTTPPartialMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(SmallHTTPPartialMatchRE2)->ThreadRange(1, NumCPUs()); + +void DotMatchPCRE(benchmark::State& state) { + StringPiece a; + PCRE re("(?-s)^(.+)"); + for (auto _ : state) { + PCRE::PartialMatch(http_text, re, &a); + } +} + +void DotMatchRE2(benchmark::State& state) { + StringPiece a; + RE2 re("(?-s)^(.+)"); + for (auto _ : 
state) { + RE2::PartialMatch(http_text, re, &a); + } +} + +#ifdef USEPCRE +BENCHMARK(DotMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(DotMatchRE2)->ThreadRange(1, NumCPUs()); + +void ASCIIMatchPCRE(benchmark::State& state) { + StringPiece a; + PCRE re("(?-s)^([ -~]+)"); + for (auto _ : state) { + PCRE::PartialMatch(http_text, re, &a); + } +} + +void ASCIIMatchRE2(benchmark::State& state) { + StringPiece a; + RE2 re("(?-s)^([ -~]+)"); + for (auto _ : state) { + RE2::PartialMatch(http_text, re, &a); + } +} + +#ifdef USEPCRE +BENCHMARK(ASCIIMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(ASCIIMatchRE2)->ThreadRange(1, NumCPUs()); + +void FullMatchPCRE(benchmark::State& state, const char *regexp) { + std::string s = RandomText(state.range(0)); + s += "ABCDEFGHIJ"; + PCRE re(regexp); + for (auto _ : state) { + CHECK(PCRE::FullMatch(s, re)); + } + state.SetBytesProcessed(state.iterations() * state.range(0)); +} + +void FullMatchRE2(benchmark::State& state, const char *regexp) { + std::string s = RandomText(state.range(0)); + s += "ABCDEFGHIJ"; + RE2 re(regexp, RE2::Latin1); + for (auto _ : state) { + CHECK(RE2::FullMatch(s, re)); + } + state.SetBytesProcessed(state.iterations() * state.range(0)); +} + +void FullMatch_DotStar_CachedPCRE(benchmark::State& state) { FullMatchPCRE(state, "(?s).*"); } +void FullMatch_DotStar_CachedRE2(benchmark::State& state) { FullMatchRE2(state, "(?s).*"); } + +void FullMatch_DotStarDollar_CachedPCRE(benchmark::State& state) { FullMatchPCRE(state, "(?s).*$"); } +void FullMatch_DotStarDollar_CachedRE2(benchmark::State& state) { FullMatchRE2(state, "(?s).*$"); } + +void FullMatch_DotStarCapture_CachedPCRE(benchmark::State& state) { FullMatchPCRE(state, "(?s)((.*)()()($))"); } +void FullMatch_DotStarCapture_CachedRE2(benchmark::State& state) { FullMatchRE2(state, "(?s)((.*)()()($))"); } + +#ifdef USEPCRE +BENCHMARK_RANGE(FullMatch_DotStar_CachedPCRE, 8, 2<<20); +#endif +BENCHMARK_RANGE(FullMatch_DotStar_CachedRE2, 8, 
2<<20); + +#ifdef USEPCRE +BENCHMARK_RANGE(FullMatch_DotStarDollar_CachedPCRE, 8, 2<<20); +#endif +BENCHMARK_RANGE(FullMatch_DotStarDollar_CachedRE2, 8, 2<<20); + +#ifdef USEPCRE +BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedPCRE, 8, 2<<20); +#endif +BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedRE2, 8, 2<<20); + +void PossibleMatchRangeCommon(benchmark::State& state, const char* regexp) { + RE2 re(regexp); + std::string min; + std::string max; + const int kMaxLen = 16; + for (auto _ : state) { + CHECK(re.PossibleMatchRange(&min, &max, kMaxLen)); + } +} + +void PossibleMatchRange_Trivial(benchmark::State& state) { + PossibleMatchRangeCommon(state, ".*"); +} +void PossibleMatchRange_Complex(benchmark::State& state) { + PossibleMatchRangeCommon(state, "^abc[def]?[gh]{1,2}.*"); +} +void PossibleMatchRange_Prefix(benchmark::State& state) { + PossibleMatchRangeCommon(state, "^some_random_prefix.*"); +} +void PossibleMatchRange_NoProg(benchmark::State& state) { + PossibleMatchRangeCommon(state, "^some_random_string$"); +} + +BENCHMARK(PossibleMatchRange_Trivial); +BENCHMARK(PossibleMatchRange_Complex); +BENCHMARK(PossibleMatchRange_Prefix); +BENCHMARK(PossibleMatchRange_NoProg); + +} // namespace re2 diff --git a/re2/testing/set_test.cc b/re2/testing/set_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..5a760c4b5e27d986ec582bf512f7968e6906194c --- /dev/null +++ b/re2/testing/set_test.cc @@ -0,0 +1,230 @@ +// Copyright 2010 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include +#include +#include +#include + +#include "util/test.h" +#include "util/logging.h" +#include "re2/re2.h" +#include "re2/set.h" + +namespace re2 { + +TEST(Set, Unanchored) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + ASSERT_EQ(s.Add("foo", NULL), 0); + ASSERT_EQ(s.Add("(", NULL), -1); + ASSERT_EQ(s.Add("bar", NULL), 1); + ASSERT_EQ(s.Compile(), true); + + ASSERT_EQ(s.Match("foobar", NULL), true); + ASSERT_EQ(s.Match("fooba", NULL), true); + ASSERT_EQ(s.Match("oobar", NULL), true); + + std::vector v; + ASSERT_EQ(s.Match("foobar", &v), true); + ASSERT_EQ(v.size(), 2); + ASSERT_EQ(v[0], 0); + ASSERT_EQ(v[1], 1); + + ASSERT_EQ(s.Match("fooba", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 0); + + ASSERT_EQ(s.Match("oobar", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 1); +} + +TEST(Set, UnanchoredFactored) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + ASSERT_EQ(s.Add("foo", NULL), 0); + ASSERT_EQ(s.Add("(", NULL), -1); + ASSERT_EQ(s.Add("foobar", NULL), 1); + ASSERT_EQ(s.Compile(), true); + + ASSERT_EQ(s.Match("foobar", NULL), true); + ASSERT_EQ(s.Match("obarfoobaroo", NULL), true); + ASSERT_EQ(s.Match("fooba", NULL), true); + ASSERT_EQ(s.Match("oobar", NULL), false); + + std::vector v; + ASSERT_EQ(s.Match("foobar", &v), true); + ASSERT_EQ(v.size(), 2); + ASSERT_EQ(v[0], 0); + ASSERT_EQ(v[1], 1); + + ASSERT_EQ(s.Match("obarfoobaroo", &v), true); + ASSERT_EQ(v.size(), 2); + ASSERT_EQ(v[0], 0); + ASSERT_EQ(v[1], 1); + + ASSERT_EQ(s.Match("fooba", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 0); + + ASSERT_EQ(s.Match("oobar", &v), false); + ASSERT_EQ(v.size(), 0); +} + +TEST(Set, UnanchoredDollar) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + ASSERT_EQ(s.Add("foo$", NULL), 0); + ASSERT_EQ(s.Compile(), true); + + ASSERT_EQ(s.Match("foo", NULL), true); + ASSERT_EQ(s.Match("foobar", NULL), false); + + std::vector v; + ASSERT_EQ(s.Match("foo", &v), true); + ASSERT_EQ(v.size(), 1); + 
ASSERT_EQ(v[0], 0); + + ASSERT_EQ(s.Match("foobar", &v), false); + ASSERT_EQ(v.size(), 0); +} + +TEST(Set, UnanchoredWordBoundary) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + ASSERT_EQ(s.Add("foo\\b", NULL), 0); + ASSERT_EQ(s.Compile(), true); + + ASSERT_EQ(s.Match("foo", NULL), true); + ASSERT_EQ(s.Match("foobar", NULL), false); + ASSERT_EQ(s.Match("foo bar", NULL), true); + + std::vector v; + ASSERT_EQ(s.Match("foo", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 0); + + ASSERT_EQ(s.Match("foobar", &v), false); + ASSERT_EQ(v.size(), 0); + + ASSERT_EQ(s.Match("foo bar", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 0); +} + +TEST(Set, Anchored) { + RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH); + + ASSERT_EQ(s.Add("foo", NULL), 0); + ASSERT_EQ(s.Add("(", NULL), -1); + ASSERT_EQ(s.Add("bar", NULL), 1); + ASSERT_EQ(s.Compile(), true); + + ASSERT_EQ(s.Match("foobar", NULL), false); + ASSERT_EQ(s.Match("fooba", NULL), false); + ASSERT_EQ(s.Match("oobar", NULL), false); + ASSERT_EQ(s.Match("foo", NULL), true); + ASSERT_EQ(s.Match("bar", NULL), true); + + std::vector v; + ASSERT_EQ(s.Match("foobar", &v), false); + ASSERT_EQ(v.size(), 0); + + ASSERT_EQ(s.Match("fooba", &v), false); + ASSERT_EQ(v.size(), 0); + + ASSERT_EQ(s.Match("oobar", &v), false); + ASSERT_EQ(v.size(), 0); + + ASSERT_EQ(s.Match("foo", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 0); + + ASSERT_EQ(s.Match("bar", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 1); +} + +TEST(Set, EmptyUnanchored) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + ASSERT_EQ(s.Compile(), true); + + ASSERT_EQ(s.Match("", NULL), false); + ASSERT_EQ(s.Match("foobar", NULL), false); + + std::vector v; + ASSERT_EQ(s.Match("", &v), false); + ASSERT_EQ(v.size(), 0); + + ASSERT_EQ(s.Match("foobar", &v), false); + ASSERT_EQ(v.size(), 0); +} + +TEST(Set, EmptyAnchored) { + RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH); + + ASSERT_EQ(s.Compile(), true); + + 
ASSERT_EQ(s.Match("", NULL), false); + ASSERT_EQ(s.Match("foobar", NULL), false); + + std::vector v; + ASSERT_EQ(s.Match("", &v), false); + ASSERT_EQ(v.size(), 0); + + ASSERT_EQ(s.Match("foobar", &v), false); + ASSERT_EQ(v.size(), 0); +} + +TEST(Set, Prefix) { + RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH); + + ASSERT_EQ(s.Add("/prefix/\\d*", NULL), 0); + ASSERT_EQ(s.Compile(), true); + + ASSERT_EQ(s.Match("/prefix", NULL), false); + ASSERT_EQ(s.Match("/prefix/", NULL), true); + ASSERT_EQ(s.Match("/prefix/42", NULL), true); + + std::vector v; + ASSERT_EQ(s.Match("/prefix", &v), false); + ASSERT_EQ(v.size(), 0); + + ASSERT_EQ(s.Match("/prefix/", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 0); + + ASSERT_EQ(s.Match("/prefix/42", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 0); +} + +TEST(Set, MoveSemantics) { + RE2::Set s1(RE2::DefaultOptions, RE2::UNANCHORED); + ASSERT_EQ(s1.Add("foo\\d+", NULL), 0); + ASSERT_EQ(s1.Compile(), true); + ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), true); + ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), false); + + // The moved-to object should do what the moved-from object did. + RE2::Set s2 = std::move(s1); + ASSERT_EQ(s2.Match("abc foo1 xyz", NULL), true); + ASSERT_EQ(s2.Match("abc bar2 xyz", NULL), false); + + // The moved-from object should have been reset and be reusable. + ASSERT_EQ(s1.Add("bar\\d+", NULL), 0); + ASSERT_EQ(s1.Compile(), true); + ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), false); + ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), true); + + // Verify that "overwriting" works and also doesn't leak memory. + // (The latter will need a leak detector such as LeakSanitizer.) 
+ s1 = std::move(s2); + ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), true); + ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), false); +} + +} // namespace re2 diff --git a/runtests b/runtests new file mode 100644 index 0000000000000000000000000000000000000000..94584a660df481773179052b250579aa3ef2c2b9 --- /dev/null +++ b/runtests @@ -0,0 +1,33 @@ +#!/usr/bin/env sh + +# System Integrity Protection on Darwin complicated these matters somewhat. +# See https://github.com/google/re2/issues/175 for details. +if [ "x$1" = "x-shared-library-path" ]; then + if [ "x$(uname)" = "xDarwin" ]; then + DYLD_LIBRARY_PATH="$2:$DYLD_LIBRARY_PATH" + export DYLD_LIBRARY_PATH + else + LD_LIBRARY_PATH="$2:$LD_LIBRARY_PATH" + export LD_LIBRARY_PATH + fi + shift 2 +fi + +success=true +for i; do + printf "%-40s" $i + if $($i >$i.log 2>&1) 2>/dev/null; then + echo PASS + else + echo FAIL';' output in $i.log + success=false + fi +done + +if $success; then + echo 'ALL TESTS PASSED.' + exit 0 +else + echo 'TESTS FAILED.' + exit 1 +fi diff --git a/testinstall.cc b/testinstall.cc new file mode 100644 index 0000000000000000000000000000000000000000..19cc9003bf8decc18a7be4dacfb0bb4f181a39b0 --- /dev/null +++ b/testinstall.cc @@ -0,0 +1,27 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include +#include +#include + +int main() { + re2::FilteredRE2 f; + int id; + f.Add("a.*b.*c", RE2::DefaultOptions, &id); + std::vector v; + f.Compile(&v); + std::vector ids; + f.FirstMatch("abbccc", ids); + + int n; + if (RE2::FullMatch("axbyc", "a.*b.*c") && + RE2::PartialMatch("foo123bar", "(\\d+)", &n) && n == 123) { + printf("PASS\n"); + return 0; + } + + printf("FAIL\n"); + return 2; +} diff --git a/util/benchmark.cc b/util/benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..e39c3349abd3818694e3ecb7cbcca40dfbb22734 --- /dev/null +++ b/util/benchmark.cc @@ -0,0 +1,131 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include +#include +#include + +#include "util/benchmark.h" +#include "util/flags.h" +#include "re2/re2.h" + +#ifdef _WIN32 +#define snprintf _snprintf +#endif + +using ::testing::Benchmark; + +static Benchmark* benchmarks[10000]; +static int nbenchmarks; + +void Benchmark::Register() { + lo_ = std::max(1, lo_); + hi_ = std::max(lo_, hi_); + benchmarks[nbenchmarks++] = this; +} + +static int64_t nsec() { + return std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); +} + +static int64_t t0; +static int64_t ns; +static int64_t bytes; +static int64_t items; + +void StartBenchmarkTiming() { + if (t0 == 0) { + t0 = nsec(); + } +} + +void StopBenchmarkTiming() { + if (t0 != 0) { + ns += nsec() - t0; + t0 = 0; + } +} + +void SetBenchmarkBytesProcessed(int64_t b) { bytes = b; } + +void SetBenchmarkItemsProcessed(int64_t i) { items = i; } + +static void RunFunc(Benchmark* b, int iters, int arg) { + t0 = nsec(); + ns = 0; + bytes = 0; + items = 0; + b->func()(iters, arg); + StopBenchmarkTiming(); +} + +static int round(int n) { + int base = 1; + while (base * 10 < n) base *= 10; + if (n < 2 * base) return 2 * base; + if (n < 5 * base) return 5 * 
base; + return 10 * base; +} + +static void RunBench(Benchmark* b, int arg) { + int iters, last; + + // Run once just in case it's expensive. + iters = 1; + RunFunc(b, iters, arg); + while (ns < (int)1e9 && iters < (int)1e9) { + last = iters; + if (ns / iters == 0) { + iters = (int)1e9; + } else { + iters = (int)1e9 / static_cast(ns / iters); + } + iters = std::max(last + 1, std::min(iters + iters / 2, 100 * last)); + iters = round(iters); + RunFunc(b, iters, arg); + } + + char mb[100]; + char suf[100]; + mb[0] = '\0'; + suf[0] = '\0'; + if (ns > 0 && bytes > 0) + snprintf(mb, sizeof mb, "\t%7.2f MB/s", + ((double)bytes / 1e6) / ((double)ns / 1e9)); + if (b->has_arg()) { + if (arg >= (1 << 20)) { + snprintf(suf, sizeof suf, "/%dM", arg / (1 << 20)); + } else if (arg >= (1 << 10)) { + snprintf(suf, sizeof suf, "/%dK", arg / (1 << 10)); + } else { + snprintf(suf, sizeof suf, "/%d", arg); + } + } + printf("%s%s\t%8d\t%10lld ns/op%s\n", b->name(), suf, iters, + (long long)ns / iters, mb); + fflush(stdout); +} + +static bool WantBench(const char* name, int argc, const char** argv) { + if (argc == 1) return true; + for (int i = 1; i < argc; i++) { + if (RE2::PartialMatch(name, argv[i])) + return true; + } + return false; +} + +int main(int argc, const char** argv) { + for (int i = 0; i < nbenchmarks; i++) { + Benchmark* b = benchmarks[i]; + if (!WantBench(b->name(), argc, argv)) + continue; + for (int arg = b->lo(); arg <= b->hi(); arg <<= 1) + RunBench(b, arg); + } +} diff --git a/util/benchmark.h b/util/benchmark.h new file mode 100644 index 0000000000000000000000000000000000000000..d97b49e17f4cb091455d06cd8717e081119caf27 --- /dev/null +++ b/util/benchmark.h @@ -0,0 +1,156 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef UTIL_BENCHMARK_H_ +#define UTIL_BENCHMARK_H_ + +#include +#include + +#include "util/logging.h" +#include "util/util.h" + +// Globals for the old benchmark API. +void StartBenchmarkTiming(); +void StopBenchmarkTiming(); +void SetBenchmarkBytesProcessed(int64_t b); +void SetBenchmarkItemsProcessed(int64_t i); + +namespace benchmark { + +// The new benchmark API implemented as a layer over the old benchmark API. +// (Please refer to https://github.com/google/benchmark for documentation.) +class State { + private: + class Iterator { + public: + // Benchmark code looks like this: + // + // for (auto _ : state) { + // // ... + // } + // + // We try to avoid compiler warnings about such variables being unused. + struct ATTRIBUTE_UNUSED Value {}; + + explicit Iterator(int64_t iters) : iters_(iters) {} + + bool operator!=(const Iterator& that) const { + if (iters_ != that.iters_) { + return true; + } else { + // We are about to stop the loop, so stop timing. + StopBenchmarkTiming(); + return false; + } + } + + Value operator*() const { + return Value(); + } + + Iterator& operator++() { + --iters_; + return *this; + } + + private: + int64_t iters_; + }; + + public: + explicit State(int64_t iters) + : iters_(iters), arg_(0), has_arg_(false) {} + + State(int64_t iters, int64_t arg) + : iters_(iters), arg_(arg), has_arg_(true) {} + + Iterator begin() { + // We are about to start the loop, so start timing. + StartBenchmarkTiming(); + return Iterator(iters_); + } + + Iterator end() { + return Iterator(0); + } + + void SetBytesProcessed(int64_t b) { SetBenchmarkBytesProcessed(b); } + void SetItemsProcessed(int64_t i) { SetBenchmarkItemsProcessed(i); } + int64_t iterations() const { return iters_; } + // Pretend to support multiple arguments. 
+ int64_t range(int pos) const { CHECK(has_arg_); return arg_; } + + private: + int64_t iters_; + int64_t arg_; + bool has_arg_; + + State(const State&) = delete; + State& operator=(const State&) = delete; +}; + +} // namespace benchmark + +namespace testing { + +class Benchmark { + public: + Benchmark(const char* name, void (*func)(benchmark::State&)) + : name_(name), + func_([func](int iters, int arg) { + benchmark::State state(iters); + func(state); + }), + lo_(0), + hi_(0), + has_arg_(false) { + Register(); + } + + Benchmark(const char* name, void (*func)(benchmark::State&), int lo, int hi) + : name_(name), + func_([func](int iters, int arg) { + benchmark::State state(iters, arg); + func(state); + }), + lo_(lo), + hi_(hi), + has_arg_(true) { + Register(); + } + + // Pretend to support multiple threads. + Benchmark* ThreadRange(int lo, int hi) { return this; } + + const char* name() const { return name_; } + const std::function& func() const { return func_; } + int lo() const { return lo_; } + int hi() const { return hi_; } + bool has_arg() const { return has_arg_; } + + private: + void Register(); + + const char* name_; + std::function func_; + int lo_; + int hi_; + bool has_arg_; + + Benchmark(const Benchmark&) = delete; + Benchmark& operator=(const Benchmark&) = delete; +}; + +} // namespace testing + +#define BENCHMARK(f) \ + ::testing::Benchmark* _benchmark_##f = \ + (new ::testing::Benchmark(#f, f)) + +#define BENCHMARK_RANGE(f, lo, hi) \ + ::testing::Benchmark* _benchmark_##f = \ + (new ::testing::Benchmark(#f, f, lo, hi)) + +#endif // UTIL_BENCHMARK_H_ diff --git a/util/flags.h b/util/flags.h new file mode 100644 index 0000000000000000000000000000000000000000..3386b729d4319882ed096d2f44dceab8946f88fa --- /dev/null +++ b/util/flags.h @@ -0,0 +1,26 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef UTIL_FLAGS_H_ +#define UTIL_FLAGS_H_ + +// Simplified version of Google's command line flags. +// Does not support parsing the command line. +// If you want to do that, see +// https://gflags.github.io/gflags/ + +#define DEFINE_FLAG(type, name, deflt, desc) \ + namespace re2 { type FLAGS_##name = deflt; } + +#define DECLARE_FLAG(type, name) \ + namespace re2 { extern type FLAGS_##name; } + +namespace re2 { +template +T GetFlag(const T& flag) { + return flag; +} +} // namespace re2 + +#endif // UTIL_FLAGS_H_ diff --git a/util/fuzz.cc b/util/fuzz.cc new file mode 100644 index 0000000000000000000000000000000000000000..9cac1185ac65106e53ba7178d828bce82e8f163d --- /dev/null +++ b/util/fuzz.cc @@ -0,0 +1,21 @@ +// Copyright 2016 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include + +// Entry point for libFuzzer. +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size); + +int main(int argc, char** argv) { + uint8_t data[32]; + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 32; j++) { + data[j] = random() & 0xFF; + } + LLVMFuzzerTestOneInput(data, 32); + } + return 0; +} diff --git a/util/logging.h b/util/logging.h new file mode 100644 index 0000000000000000000000000000000000000000..5b2217f29ca4c79c3696aa66f6dbdef6be01f95d --- /dev/null +++ b/util/logging.h @@ -0,0 +1,109 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_LOGGING_H_ +#define UTIL_LOGGING_H_ + +// Simplified version of Google's logging. + +#include +#include +#include +#include +#include + +#include "util/util.h" + +// Debug-only checking. 
+#define DCHECK(condition) assert(condition) +#define DCHECK_EQ(val1, val2) assert((val1) == (val2)) +#define DCHECK_NE(val1, val2) assert((val1) != (val2)) +#define DCHECK_LE(val1, val2) assert((val1) <= (val2)) +#define DCHECK_LT(val1, val2) assert((val1) < (val2)) +#define DCHECK_GE(val1, val2) assert((val1) >= (val2)) +#define DCHECK_GT(val1, val2) assert((val1) > (val2)) + +// Always-on checking +#define CHECK(x) if(x){}else LogMessageFatal(__FILE__, __LINE__).stream() << "Check failed: " #x +#define CHECK_LT(x, y) CHECK((x) < (y)) +#define CHECK_GT(x, y) CHECK((x) > (y)) +#define CHECK_LE(x, y) CHECK((x) <= (y)) +#define CHECK_GE(x, y) CHECK((x) >= (y)) +#define CHECK_EQ(x, y) CHECK((x) == (y)) +#define CHECK_NE(x, y) CHECK((x) != (y)) + +#define LOG_INFO LogMessage(__FILE__, __LINE__) +#define LOG_WARNING LogMessage(__FILE__, __LINE__) +#define LOG_ERROR LogMessage(__FILE__, __LINE__) +#define LOG_FATAL LogMessageFatal(__FILE__, __LINE__) +#define LOG_QFATAL LOG_FATAL + +// It seems that one of the Windows header files defines ERROR as 0. +#ifdef _WIN32 +#define LOG_0 LOG_INFO +#endif + +#ifdef NDEBUG +#define LOG_DFATAL LOG_ERROR +#else +#define LOG_DFATAL LOG_FATAL +#endif + +#define LOG(severity) LOG_ ## severity.stream() + +#define VLOG(x) if((x)>0){}else LOG_INFO.stream() + +class LogMessage { + public: + LogMessage(const char* file, int line) + : flushed_(false) { + stream() << file << ":" << line << ": "; + } + void Flush() { + stream() << "\n"; + std::string s = str_.str(); + size_t n = s.size(); + if (fwrite(s.data(), 1, n, stderr) < n) {} // shut up gcc + flushed_ = true; + } + ~LogMessage() { + if (!flushed_) { + Flush(); + } + } + std::ostream& stream() { return str_; } + + private: + bool flushed_; + std::ostringstream str_; + + LogMessage(const LogMessage&) = delete; + LogMessage& operator=(const LogMessage&) = delete; +}; + +// Silence "destructor never returns" warning for ~LogMessageFatal(). 
+// Since this is a header file, push and then pop to limit the scope. +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4722) +#endif + +class LogMessageFatal : public LogMessage { + public: + LogMessageFatal(const char* file, int line) + : LogMessage(file, line) {} + ATTRIBUTE_NORETURN ~LogMessageFatal() { + Flush(); + abort(); + } + private: + LogMessageFatal(const LogMessageFatal&) = delete; + LogMessageFatal& operator=(const LogMessageFatal&) = delete; +}; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#endif // UTIL_LOGGING_H_ diff --git a/util/malloc_counter.h b/util/malloc_counter.h new file mode 100644 index 0000000000000000000000000000000000000000..81b564ff9868bbbd19c0fa1ef8ae8cd08a88abb0 --- /dev/null +++ b/util/malloc_counter.h @@ -0,0 +1,19 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_MALLOC_COUNTER_H_ +#define UTIL_MALLOC_COUNTER_H_ + +namespace testing { +class MallocCounter { + public: + MallocCounter(int x) {} + static const int THIS_THREAD_ONLY = 0; + long long HeapGrowth() { return 0; } + long long PeakHeapGrowth() { return 0; } + void Reset() {} +}; +} // namespace testing + +#endif // UTIL_MALLOC_COUNTER_H_ diff --git a/util/mix.h b/util/mix.h new file mode 100644 index 0000000000000000000000000000000000000000..d85c172ab0e3fb95a67ea2ab315f178596fca869 --- /dev/null +++ b/util/mix.h @@ -0,0 +1,41 @@ +// Copyright 2016 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_MIX_H_ +#define UTIL_MIX_H_ + +#include +#include + +namespace re2 { + +// Silence "truncation of constant value" warning for kMul in 32-bit mode. +// Since this is a header file, push and then pop to limit the scope. 
+#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4309) +#endif + +class HashMix { + public: + HashMix() : hash_(1) {} + explicit HashMix(size_t val) : hash_(val + 83) {} + void Mix(size_t val) { + static const size_t kMul = static_cast(0xdc3eb94af8ab4c93ULL); + hash_ *= kMul; + hash_ = ((hash_ << 19) | + (hash_ >> (std::numeric_limits::digits - 19))) + val; + } + size_t get() const { return hash_; } + private: + size_t hash_; +}; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +} // namespace re2 + +#endif // UTIL_MIX_H_ diff --git a/util/mutex.h b/util/mutex.h new file mode 100644 index 0000000000000000000000000000000000000000..158046bb5c9f40b7bbcbebe2c5ebf8dae23979d2 --- /dev/null +++ b/util/mutex.h @@ -0,0 +1,148 @@ +// Copyright 2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_MUTEX_H_ +#define UTIL_MUTEX_H_ + +/* + * A simple mutex wrapper, supporting locks and read-write locks. + * You should assume the locks are *not* re-entrant. + */ + +#ifdef _WIN32 +// Requires Windows Vista or Windows Server 2008 at minimum. 
+#include +#if defined(WINVER) && WINVER >= 0x0600 +#define MUTEX_IS_WIN32_SRWLOCK +#endif +#else +#ifndef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200809L +#endif +#include +#if defined(_POSIX_READER_WRITER_LOCKS) && _POSIX_READER_WRITER_LOCKS > 0 +#define MUTEX_IS_PTHREAD_RWLOCK +#endif +#endif + +#if defined(MUTEX_IS_WIN32_SRWLOCK) +typedef SRWLOCK MutexType; +#elif defined(MUTEX_IS_PTHREAD_RWLOCK) +#include +#include +typedef pthread_rwlock_t MutexType; +#else +#include +typedef std::mutex MutexType; +#endif + +namespace re2 { + +class Mutex { + public: + inline Mutex(); + inline ~Mutex(); + inline void Lock(); // Block if needed until free then acquire exclusively + inline void Unlock(); // Release a lock acquired via Lock() + // Note that on systems that don't support read-write locks, these may + // be implemented as synonyms to Lock() and Unlock(). So you can use + // these for efficiency, but don't use them anyplace where being able + // to do shared reads is necessary to avoid deadlock. + inline void ReaderLock(); // Block until free or shared then acquire a share + inline void ReaderUnlock(); // Release a read share of this Mutex + inline void WriterLock() { Lock(); } // Acquire an exclusive lock + inline void WriterUnlock() { Unlock(); } // Release a lock from WriterLock() + + private: + MutexType mutex_; + + // Catch the error of writing Mutex when intending MutexLock. 
+ Mutex(Mutex *ignored); + + Mutex(const Mutex&) = delete; + Mutex& operator=(const Mutex&) = delete; +}; + +#if defined(MUTEX_IS_WIN32_SRWLOCK) + +Mutex::Mutex() : mutex_(SRWLOCK_INIT) { } +Mutex::~Mutex() { } +void Mutex::Lock() { AcquireSRWLockExclusive(&mutex_); } +void Mutex::Unlock() { ReleaseSRWLockExclusive(&mutex_); } +void Mutex::ReaderLock() { AcquireSRWLockShared(&mutex_); } +void Mutex::ReaderUnlock() { ReleaseSRWLockShared(&mutex_); } + +#elif defined(MUTEX_IS_PTHREAD_RWLOCK) + +#define SAFE_PTHREAD(fncall) \ + do { \ + if ((fncall) != 0) abort(); \ + } while (0) + +Mutex::Mutex() { SAFE_PTHREAD(pthread_rwlock_init(&mutex_, NULL)); } +Mutex::~Mutex() { SAFE_PTHREAD(pthread_rwlock_destroy(&mutex_)); } +void Mutex::Lock() { SAFE_PTHREAD(pthread_rwlock_wrlock(&mutex_)); } +void Mutex::Unlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); } +void Mutex::ReaderLock() { SAFE_PTHREAD(pthread_rwlock_rdlock(&mutex_)); } +void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); } + +#undef SAFE_PTHREAD + +#else + +Mutex::Mutex() { } +Mutex::~Mutex() { } +void Mutex::Lock() { mutex_.lock(); } +void Mutex::Unlock() { mutex_.unlock(); } +void Mutex::ReaderLock() { Lock(); } // C++11 doesn't have std::shared_mutex. +void Mutex::ReaderUnlock() { Unlock(); } + +#endif + +// -------------------------------------------------------------------------- +// Some helper classes + +// MutexLock(mu) acquires mu when constructed and releases it when destroyed. 
+class MutexLock { + public: + explicit MutexLock(Mutex *mu) : mu_(mu) { mu_->Lock(); } + ~MutexLock() { mu_->Unlock(); } + private: + Mutex * const mu_; + + MutexLock(const MutexLock&) = delete; + MutexLock& operator=(const MutexLock&) = delete; +}; + +// ReaderMutexLock and WriterMutexLock do the same, for rwlocks +class ReaderMutexLock { + public: + explicit ReaderMutexLock(Mutex *mu) : mu_(mu) { mu_->ReaderLock(); } + ~ReaderMutexLock() { mu_->ReaderUnlock(); } + private: + Mutex * const mu_; + + ReaderMutexLock(const ReaderMutexLock&) = delete; + ReaderMutexLock& operator=(const ReaderMutexLock&) = delete; +}; + +class WriterMutexLock { + public: + explicit WriterMutexLock(Mutex *mu) : mu_(mu) { mu_->WriterLock(); } + ~WriterMutexLock() { mu_->WriterUnlock(); } + private: + Mutex * const mu_; + + WriterMutexLock(const WriterMutexLock&) = delete; + WriterMutexLock& operator=(const WriterMutexLock&) = delete; +}; + +// Catch bug where variable name is omitted, e.g. MutexLock (&mu); +#define MutexLock(x) static_assert(false, "MutexLock declaration missing variable name") +#define ReaderMutexLock(x) static_assert(false, "ReaderMutexLock declaration missing variable name") +#define WriterMutexLock(x) static_assert(false, "WriterMutexLock declaration missing variable name") + +} // namespace re2 + +#endif // UTIL_MUTEX_H_ diff --git a/util/pcre.cc b/util/pcre.cc new file mode 100644 index 0000000000000000000000000000000000000000..b68985144ff6439182e849c485636b9fe697732b --- /dev/null +++ b/util/pcre.cc @@ -0,0 +1,1025 @@ +// Copyright 2003-2009 Google Inc. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This is a variant of PCRE's pcrecpp.cc, originally written at Google. +// The main changes are the addition of the HitLimit method and +// compilation as PCRE in namespace re2. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/util.h" +#include "util/flags.h" +#include "util/logging.h" +#include "util/pcre.h" +#include "util/strutil.h" + +// Silence warnings about the wacky formatting in the operator() functions. +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 +#pragma GCC diagnostic ignored "-Wmisleading-indentation" +#endif + +#define PCREPORT(level) LOG(level) + +// Default PCRE limits. +// Defaults chosen to allow a plausible amount of CPU and +// not exceed main thread stacks. Note that other threads +// often have smaller stacks, and therefore tightening +// regexp_stack_limit may frequently be necessary. +DEFINE_FLAG(int, regexp_stack_limit, 256 << 10, + "default PCRE stack limit (bytes)"); +DEFINE_FLAG(int, regexp_match_limit, 1000000, + "default PCRE match limit (function calls)"); + +#ifndef USEPCRE + +// Fake just enough of the PCRE API to allow this file to build. :) + +struct pcre_extra { + int flags; + int match_limit; + int match_limit_recursion; +}; + +#define PCRE_EXTRA_MATCH_LIMIT 0 +#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0 +#define PCRE_ANCHORED 0 +#define PCRE_NOTEMPTY 0 +#define PCRE_ERROR_NOMATCH 1 +#define PCRE_ERROR_MATCHLIMIT 2 +#define PCRE_ERROR_RECURSIONLIMIT 3 +#define PCRE_INFO_CAPTURECOUNT 0 + +void pcre_free(void*) { +} + +pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) { + return NULL; +} + +int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*, int) { + return 0; +} + +int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) { + return 0; +} + +#endif + +namespace re2 { + +// Maximum number of args we can set +static const int kMaxArgs = 16; +static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace + +// Approximate size of a recursive invocation of PCRE's +// internal "match()" frame. 
This varies depending on the +// compiler and architecture, of course, so the constant is +// just a conservative estimate. To find the exact number, +// run regexp_unittest with --regexp_stack_limit=0 under +// a debugger and look at the frames when it crashes. +// The exact frame size was 656 in production on 2008/02/03. +static const int kPCREFrameSize = 700; + +// Special name for missing C++ arguments. +PCRE::Arg PCRE::no_more_args((void*)NULL); + +const PCRE::PartialMatchFunctor PCRE::PartialMatch = { }; +const PCRE::FullMatchFunctor PCRE::FullMatch = { } ; +const PCRE::ConsumeFunctor PCRE::Consume = { }; +const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { }; + +// If a regular expression has no error, its error_ field points here +static const std::string empty_string; + +void PCRE::Init(const char* pattern, Option options, int match_limit, + int stack_limit, bool report_errors) { + pattern_ = pattern; + options_ = options; + match_limit_ = match_limit; + stack_limit_ = stack_limit; + hit_limit_ = false; + error_ = &empty_string; + report_errors_ = report_errors; + re_full_ = NULL; + re_partial_ = NULL; + + if (options & ~(EnabledCompileOptions | EnabledExecOptions)) { + error_ = new std::string("illegal regexp option"); + PCREPORT(ERROR) + << "Error compiling '" << pattern << "': illegal regexp option"; + } else { + re_partial_ = Compile(UNANCHORED); + if (re_partial_ != NULL) { + re_full_ = Compile(ANCHOR_BOTH); + } + } +} + +PCRE::PCRE(const char* pattern) { + Init(pattern, None, 0, 0, true); +} +PCRE::PCRE(const char* pattern, Option option) { + Init(pattern, option, 0, 0, true); +} +PCRE::PCRE(const std::string& pattern) { + Init(pattern.c_str(), None, 0, 0, true); +} +PCRE::PCRE(const std::string& pattern, Option option) { + Init(pattern.c_str(), option, 0, 0, true); +} +PCRE::PCRE(const std::string& pattern, const PCRE_Options& re_option) { + Init(pattern.c_str(), re_option.option(), re_option.match_limit(), + re_option.stack_limit(), 
re_option.report_errors()); +} + +PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) { + Init(pattern, re_option.option(), re_option.match_limit(), + re_option.stack_limit(), re_option.report_errors()); +} + +PCRE::~PCRE() { + if (re_full_ != NULL) pcre_free(re_full_); + if (re_partial_ != NULL) pcre_free(re_partial_); + if (error_ != &empty_string) delete error_; +} + +pcre* PCRE::Compile(Anchor anchor) { + // Special treatment for anchoring. This is needed because at + // runtime pcre only provides an option for anchoring at the + // beginning of a string. + // + // There are three types of anchoring we want: + // UNANCHORED Compile the original pattern, and use + // a pcre unanchored match. + // ANCHOR_START Compile the original pattern, and use + // a pcre anchored match. + // ANCHOR_BOTH Tack a "\z" to the end of the original pattern + // and use a pcre anchored match. + + const char* error = ""; + int eoffset; + pcre* re; + if (anchor != ANCHOR_BOTH) { + re = pcre_compile(pattern_.c_str(), + (options_ & EnabledCompileOptions), + &error, &eoffset, NULL); + } else { + // Tack a '\z' at the end of PCRE. Parenthesize it first so that + // the '\z' applies to all top-level alternatives in the regexp. 
+ std::string wrapped = "(?:"; // A non-counting grouping operator + wrapped += pattern_; + wrapped += ")\\z"; + re = pcre_compile(wrapped.c_str(), + (options_ & EnabledCompileOptions), + &error, &eoffset, NULL); + } + if (re == NULL) { + if (error_ == &empty_string) error_ = new std::string(error); + PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error; + } + return re; +} + +/***** Convenience interfaces *****/ + +bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text, + const PCRE& re, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + size_t consumed; + int vec[kVecSize] = {}; + return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); +} + +bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text, + const PCRE& re, + const Arg& a0, 
+ const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + size_t consumed; + int vec[kVecSize] = {}; + return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); +} + +bool PCRE::ConsumeFunctor::operator ()(StringPiece* input, + const PCRE& pattern, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if 
(&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + size_t consumed; + int vec[kVecSize] = {}; + if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed, + args, n, vec, kVecSize)) { + input->remove_prefix(consumed); + return true; + } else { + return false; + } +} + +bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input, + const PCRE& pattern, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; 
+ if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + size_t consumed; + int vec[kVecSize] = {}; + if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed, + args, n, vec, kVecSize)) { + input->remove_prefix(consumed); + return true; + } else { + return false; + } +} + +bool PCRE::Replace(std::string *str, + const PCRE& pattern, + const StringPiece& rewrite) { + int vec[kVecSize] = {}; + int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); + if (matches == 0) + return false; + + std::string s; + if (!pattern.Rewrite(&s, rewrite, *str, vec, matches)) + return false; + + assert(vec[0] >= 0); + assert(vec[1] >= 0); + str->replace(vec[0], vec[1] - vec[0], s); + return true; +} + +int PCRE::GlobalReplace(std::string *str, + const PCRE& pattern, + const StringPiece& rewrite) { + int count = 0; + int vec[kVecSize] = {}; + std::string out; + size_t start = 0; + bool last_match_was_empty_string = false; + + while (start <= str->size()) { + // If the previous match was for the empty string, we shouldn't + // just match again: we'll match in the same way and get an + // infinite loop. Instead, we do the match in a special way: + // anchored -- to force another try at the same position -- + // and with a flag saying that this time, ignore empty matches. + // If this special match returns, that means there's a non-empty + // match at this position as well, and we can continue. If not, + // we do what perl does, and just advance by one. 
+ // Notice that perl prints '@@@' for this; + // perl -le '$_ = "aa"; s/b*|aa/@/g; print' + int matches; + if (last_match_was_empty_string) { + matches = pattern.TryMatch(*str, start, ANCHOR_START, false, + vec, kVecSize); + if (matches <= 0) { + if (start < str->size()) + out.push_back((*str)[start]); + start++; + last_match_was_empty_string = false; + continue; + } + } else { + matches = pattern.TryMatch(*str, start, UNANCHORED, true, + vec, kVecSize); + if (matches <= 0) + break; + } + size_t matchstart = vec[0], matchend = vec[1]; + assert(matchstart >= start); + assert(matchend >= matchstart); + + out.append(*str, start, matchstart - start); + pattern.Rewrite(&out, rewrite, *str, vec, matches); + start = matchend; + count++; + last_match_was_empty_string = (matchstart == matchend); + } + + if (count == 0) + return 0; + + if (start < str->size()) + out.append(*str, start, str->size() - start); + using std::swap; + swap(out, *str); + return count; +} + +bool PCRE::Extract(const StringPiece &text, + const PCRE& pattern, + const StringPiece &rewrite, + std::string *out) { + int vec[kVecSize] = {}; + int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); + if (matches == 0) + return false; + out->clear(); + return pattern.Rewrite(out, rewrite, text, vec, matches); +} + +std::string PCRE::QuoteMeta(const StringPiece& unquoted) { + std::string result; + result.reserve(unquoted.size() << 1); + + // Escape any ascii character not in [A-Za-z_0-9]. + // + // Note that it's legal to escape a character even if it has no + // special meaning in a regular expression -- so this function does + // that. (This also makes it identical to the perl function of the + // same name except for the null-character special case; + // see `perldoc -f quotemeta`.) 
+ for (size_t ii = 0; ii < unquoted.size(); ++ii) { + // Note that using 'isalnum' here raises the benchmark time from + // 32ns to 58ns: + if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && + (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && + (unquoted[ii] < '0' || unquoted[ii] > '9') && + unquoted[ii] != '_' && + // If this is the part of a UTF8 or Latin1 character, we need + // to copy this byte without escaping. Experimentally this is + // what works correctly with the regexp library. + !(unquoted[ii] & 128)) { + if (unquoted[ii] == '\0') { // Special handling for null chars. + // Can't use "\\0" since the next character might be a digit. + result += "\\x00"; + continue; + } + result += '\\'; + } + result += unquoted[ii]; + } + + return result; +} + +/***** Actual matching and rewriting code *****/ + +bool PCRE::HitLimit() { + return hit_limit_ != 0; +} + +void PCRE::ClearHitLimit() { + hit_limit_ = 0; +} + +int PCRE::TryMatch(const StringPiece& text, + size_t startpos, + Anchor anchor, + bool empty_ok, + int *vec, + int vecsize) const { + pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_; + if (re == NULL) { + PCREPORT(ERROR) << "Matching against invalid re: " << *error_; + return 0; + } + + int match_limit = match_limit_; + if (match_limit <= 0) { + match_limit = GetFlag(FLAGS_regexp_match_limit); + } + + int stack_limit = stack_limit_; + if (stack_limit <= 0) { + stack_limit = GetFlag(FLAGS_regexp_stack_limit); + } + + pcre_extra extra = { 0 }; + if (match_limit > 0) { + extra.flags |= PCRE_EXTRA_MATCH_LIMIT; + extra.match_limit = match_limit; + } + if (stack_limit > 0) { + extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; + extra.match_limit_recursion = stack_limit / kPCREFrameSize; + } + + int options = 0; + if (anchor != UNANCHORED) + options |= PCRE_ANCHORED; + if (!empty_ok) + options |= PCRE_NOTEMPTY; + + int rc = pcre_exec(re, // The regular expression object + &extra, + (text.data() == NULL) ?
"" : text.data(), + static_cast<int>(text.size()), // pcre_exec takes the length as int + static_cast<int>(startpos), // ... and the start offset as int + options, + vec, + vecsize); + + // Handle errors + if (rc == 0) { + // pcre_exec() returns 0 as a special case when the number of + // capturing subpatterns exceeds the size of the vector. + // When this happens, there is a match and the output vector + // is filled, but we miss out on the positions of the extra subpatterns. + rc = vecsize / 2; + } else if (rc < 0) { + switch (rc) { + case PCRE_ERROR_NOMATCH: + return 0; + case PCRE_ERROR_MATCHLIMIT: + // Writing to hit_limit is not safe if multiple threads + // are using the PCRE, but the flag is only intended + // for use by unit tests anyway, so we let it go. + hit_limit_ = true; + PCREPORT(WARNING) << "Exceeded match limit of " << match_limit + << " when matching '" << pattern_ << "'" + << " against text that is " << text.size() << " bytes."; + return 0; + case PCRE_ERROR_RECURSIONLIMIT: + // See comment about hit_limit above. + hit_limit_ = true; + PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit + << " when matching '" << pattern_ << "'" + << " against text that is " << text.size() << " bytes."; + return 0; + default: + // There are other return codes from pcre.h : + // PCRE_ERROR_NULL (-2) + // PCRE_ERROR_BADOPTION (-3) + // PCRE_ERROR_BADMAGIC (-4) + // PCRE_ERROR_UNKNOWN_NODE (-5) + // PCRE_ERROR_NOMEMORY (-6) + // PCRE_ERROR_NOSUBSTRING (-7) + // ... + PCREPORT(ERROR) << "Unexpected return code: " << rc + << " when matching '" << pattern_ << "'" + << ", re=" << re + << ", text=" << text + << ", vec=" << vec + << ", vecsize=" << vecsize; + return 0; + } + } + + return rc; +} + +bool PCRE::DoMatchImpl(const StringPiece& text, + Anchor anchor, + size_t* consumed, + const Arg* const* args, + int n, + int* vec, + int vecsize) const { + assert((1 + n) * 3 <= vecsize); // results + PCRE workspace + if (NumberOfCapturingGroups() < n) { + // RE has fewer capturing groups than number of Arg pointers passed in.
+ return false; + } + + int matches = TryMatch(text, 0, anchor, true, vec, vecsize); + assert(matches >= 0); // TryMatch never returns negatives + if (matches == 0) + return false; + + *consumed = vec[1]; + + if (n == 0 || args == NULL) { + // We are not interested in results + return true; + } + + // If we got here, we must have matched the whole pattern. + // We do not need (can not do) any more checks on the value of 'matches' here + // -- see the comment for TryMatch. + for (int i = 0; i < n; i++) { + const int start = vec[2*(i+1)]; + const int limit = vec[2*(i+1)+1]; + + // Avoid invoking undefined behavior when text.data() happens + // to be null and start happens to be -1, the latter being the + // case for an unmatched subexpression. Even if text.data() is + // not null, pointing one byte before was a longstanding bug. + const char* addr = NULL; + if (start != -1) { + addr = text.data() + start; + } + + if (!args[i]->Parse(addr, limit-start)) { + // TODO: Should we indicate what the error was? + return false; + } + } + + return true; +} + +bool PCRE::DoMatch(const StringPiece& text, + Anchor anchor, + size_t* consumed, + const Arg* const args[], + int n) const { + assert(n >= 0); + const int vecsize = (1 + n) * 3; // results + PCRE workspace + // (as for kVecSize) + int* vec = new int[vecsize]; + bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize); + delete[] vec; + return b; +} + +bool PCRE::Rewrite(std::string *out, const StringPiece &rewrite, + const StringPiece &text, int *vec, int veclen) const { + int number_of_capturing_groups = NumberOfCapturingGroups(); + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + int c = *s; + if (c == '\\') { + c = *++s; + if (isdigit(c)) { + int n = (c - '0'); + if (n >= veclen) { + if (n <= number_of_capturing_groups) { + // unmatched optional capturing group. treat + // its value as empty string; i.e., nothing to append. 
+ } else { + PCREPORT(ERROR) << "requested group " << n + << " in regexp " << rewrite.data(); + return false; + } + } + int start = vec[2 * n]; + if (start >= 0) + out->append(text.data() + start, vec[2 * n + 1] - start); + } else if (c == '\\') { + out->push_back('\\'); + } else { + PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data(); + return false; + } + } else { + out->push_back(c); + } + } + return true; +} + +bool PCRE::CheckRewriteString(const StringPiece& rewrite, + std::string* error) const { + int max_token = -1; + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + int c = *s; + if (c != '\\') { + continue; + } + if (++s == end) { + *error = "Rewrite schema error: '\\' not allowed at end."; + return false; + } + c = *s; + if (c == '\\') { + continue; + } + if (!isdigit(c)) { + *error = "Rewrite schema error: " + "'\\' must be followed by a digit or '\\'."; + return false; + } + int n = (c - '0'); + if (max_token < n) { + max_token = n; + } + } + + if (max_token > NumberOfCapturingGroups()) { + *error = StringPrintf( + "Rewrite schema requests %d matches, but the regexp only has %d " + "parenthesized subexpressions.", + max_token, NumberOfCapturingGroups()); + return false; + } + return true; +} + + +// Return the number of capturing subpatterns, or -1 if the +// regexp wasn't valid on construction. 
+int PCRE::NumberOfCapturingGroups() const { + if (re_partial_ == NULL) return -1; + + int result; + int rc = pcre_fullinfo(re_partial_, // The regular expression object + NULL, // We did not study the pattern + PCRE_INFO_CAPTURECOUNT, + &result); + if (rc != 0) { + PCREPORT(ERROR) << "Unexpected return code: " << rc; + return -1; + } + return result; +} + + +/***** Parsers for various types *****/ + +bool PCRE::Arg::parse_null(const char* str, size_t n, void* dest) { + // We fail if somebody asked us to store into a non-NULL void* pointer + return (dest == NULL); +} + +bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) { + if (dest == NULL) return true; + reinterpret_cast(dest)->assign(str, n); + return true; +} + +bool PCRE::Arg::parse_stringpiece(const char* str, size_t n, void* dest) { + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = StringPiece(str, n); + return true; +} + +bool PCRE::Arg::parse_char(const char* str, size_t n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = str[0]; + return true; +} + +bool PCRE::Arg::parse_schar(const char* str, size_t n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = str[0]; + return true; +} + +bool PCRE::Arg::parse_uchar(const char* str, size_t n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = str[0]; + return true; +} + +// Largest number spec that we are willing to parse +static const int kMaxNumberLength = 32; + +// PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1 +// PCREQUIPCRES "n > 0" +// Copies "str" into "buf" and null-terminates if necessary. +// Returns one of: +// a. "str" if no termination is needed +// b. "buf" if the string was copied and null-terminated +// c. 
"" if the input was invalid and has no hope of being parsed +static const char* TerminateNumber(char* buf, const char* str, size_t n) { + if ((n > 0) && isspace(*str)) { + // We are less forgiving than the strtoxxx() routines and do not + // allow leading spaces. + return ""; + } + + // See if the character right after the input text may potentially + // look like a digit. + if (isdigit(str[n]) || + ((str[n] >= 'a') && (str[n] <= 'f')) || + ((str[n] >= 'A') && (str[n] <= 'F'))) { + if (n > kMaxNumberLength) return ""; // Input too big to be a valid number + memcpy(buf, str, n); + buf[n] = '\0'; + return buf; + } else { + // We can parse right out of the supplied string, so return it. + return str; + } +} + +bool PCRE::Arg::parse_long_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + char* end; + errno = 0; + long r = strtol(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_ulong_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + if (str[0] == '-') { + // strtoul() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. 
+ return false; + } + + char* end; + errno = 0; + unsigned long r = strtoul(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<unsigned long*>(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_short_radix(const char* str, + size_t n, + void* dest, + int radix) { + long r; + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((short)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<short*>(dest)) = (short)r; + return true; +} + +bool PCRE::Arg::parse_ushort_radix(const char* str, + size_t n, + void* dest, + int radix) { + unsigned long r; + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((unsigned short)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r; + return true; +} + +bool PCRE::Arg::parse_int_radix(const char* str, + size_t n, + void* dest, + int radix) { + long r; + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((int)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<int*>(dest)) = (int)r; + return true; +} + +bool PCRE::Arg::parse_uint_radix(const char* str, + size_t n, + void* dest, + int radix) { + unsigned long r; + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((unsigned int)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r; + return true; +} + +bool PCRE::Arg::parse_longlong_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + char* end; + errno = 0; + long long r = strtoll(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; +
*(reinterpret_cast<long long*>(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_ulonglong_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + if (str[0] == '-') { + // strtoull() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. + return false; + } + char* end; + errno = 0; + unsigned long long r = strtoull(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<unsigned long long*>(dest)) = r; + return true; +} + +static bool parse_double_float(const char* str, size_t n, bool isfloat, + void* dest) { + if (n == 0) return false; + static const int kMaxLength = 200; + char buf[kMaxLength]; + if (n >= kMaxLength) return false; + memcpy(buf, str, n); + buf[n] = '\0'; + char* end; + errno = 0; + double r; + if (isfloat) { + r = strtof(buf, &end); + } else { + r = strtod(buf, &end); + } + if (end != buf + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + if (isfloat) { + *(reinterpret_cast<float*>(dest)) = (float)r; + } else { + *(reinterpret_cast<double*>(dest)) = r; + } + return true; +} + +bool PCRE::Arg::parse_double(const char* str, size_t n, void* dest) { + return parse_double_float(str, n, false, dest); +} + +bool PCRE::Arg::parse_float(const char* str, size_t n, void* dest) { + return parse_double_float(str, n, true, dest); +} + +#define DEFINE_INTEGER_PARSER(name) \ + bool PCRE::Arg::parse_##name(const char* str, size_t n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 10); \ + } \ + bool PCRE::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 16); \ + } \ + bool PCRE::Arg::parse_##name##_octal(const char* str, size_t n, \ + void* dest) { \ + return parse_##name##_radix(str, n, dest, 8); \ + } \ + bool
PCRE::Arg::parse_##name##_cradix(const char* str, size_t n, \ + void* dest) { \ + return parse_##name##_radix(str, n, dest, 0); \ + } + +DEFINE_INTEGER_PARSER(short); +DEFINE_INTEGER_PARSER(ushort); +DEFINE_INTEGER_PARSER(int); +DEFINE_INTEGER_PARSER(uint); +DEFINE_INTEGER_PARSER(long); +DEFINE_INTEGER_PARSER(ulong); +DEFINE_INTEGER_PARSER(longlong); +DEFINE_INTEGER_PARSER(ulonglong); + +#undef DEFINE_INTEGER_PARSER + +} // namespace re2 diff --git a/util/pcre.h b/util/pcre.h new file mode 100644 index 0000000000000000000000000000000000000000..896b0bdf8935a8e901fc74a2be0fdf0786e0bbc4 --- /dev/null +++ b/util/pcre.h @@ -0,0 +1,681 @@ +// Copyright 2003-2010 Google Inc. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_PCRE_H_ +#define UTIL_PCRE_H_ + +// This is a variant of PCRE's pcrecpp.h, originally written at Google. +// The main changes are the addition of the HitLimit method and +// compilation as PCRE in namespace re2. + +// C++ interface to the pcre regular-expression library. PCRE supports +// Perl-style regular expressions (with extensions like \d, \w, \s, +// ...). +// +// ----------------------------------------------------------------------- +// REGEXP SYNTAX: +// +// This module uses the pcre library and hence supports its syntax +// for regular expressions: +// +// http://www.google.com/search?q=pcre +// +// The syntax is pretty similar to Perl's. For those not familiar +// with Perl's regular expressions, here are some examples of the most +// commonly used extensions: +// +// "hello (\\w+) world" -- \w matches a "word" character +// "version (\\d+)" -- \d matches a digit +// "hello\\s+world" -- \s matches any whitespace character +// "\\b(\\w+)\\b" -- \b matches empty string at a word boundary +// "(?i)hello" -- (?i) turns on case-insensitive matching +// "/\\*(.*?)\\*/" -- .*? matches . minimum no. 
of times possible +// +// ----------------------------------------------------------------------- +// MATCHING INTERFACE: +// +// The "FullMatch" operation checks that supplied text matches a +// supplied pattern exactly. +// +// Example: successful match +// CHECK(PCRE::FullMatch("hello", "h.*o")); +// +// Example: unsuccessful match (requires full match): +// CHECK(!PCRE::FullMatch("hello", "e")); +// +// ----------------------------------------------------------------------- +// UTF-8 AND THE MATCHING INTERFACE: +// +// By default, pattern and text are plain text, one byte per character. +// The UTF8 flag, passed to the constructor, causes both pattern +// and string to be treated as UTF-8 text, still a byte stream but +// potentially multiple bytes per character. In practice, the text +// is likelier to be UTF-8 than the pattern, but the match returned +// may depend on the UTF8 flag, so always use it when matching +// UTF8 text. E.g., "." will match one byte normally but with UTF8 +// set may match up to three bytes of a multi-byte character. +// +// Example: +// PCRE re(utf8_pattern, PCRE::UTF8); +// CHECK(PCRE::FullMatch(utf8_string, re)); +// +// ----------------------------------------------------------------------- +// MATCHING WITH SUBSTRING EXTRACTION: +// +// You can supply extra pointer arguments to extract matched substrings. 
+// +// Example: extracts "ruby" into "s" and 1234 into "i" +// int i; +// std::string s; +// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); +// +// Example: fails because string cannot be stored in integer +// CHECK(!PCRE::FullMatch("ruby", "(.*)", &i)); +// +// Example: fails because there aren't enough sub-patterns: +// CHECK(!PCRE::FullMatch("ruby:1234", "\\w+:\\d+", &s)); +// +// Example: does not try to extract any extra sub-patterns +// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s)); +// +// Example: does not try to extract into NULL +// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i)); +// +// Example: integer overflow causes failure +// CHECK(!PCRE::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i)); +// +// ----------------------------------------------------------------------- +// PARTIAL MATCHES +// +// You can use the "PartialMatch" operation when you want the pattern +// to match any substring of the text. +// +// Example: simple search for a string: +// CHECK(PCRE::PartialMatch("hello", "ell")); +// +// Example: find first number in a string +// int number; +// CHECK(PCRE::PartialMatch("x*100 + 20", "(\\d+)", &number)); +// CHECK_EQ(number, 100); +// +// ----------------------------------------------------------------------- +// PRE-COMPILED REGULAR EXPRESSIONS +// +// PCRE makes it easy to use any string as a regular expression, without +// requiring a separate compilation step. +// +// If speed is of the essence, you can create a pre-compiled "PCRE" +// object from the pattern and use it multiple times. If you do so, +// you can typically parse text faster than with sscanf.
+// +// Example: precompile pattern for faster matching: +// PCRE pattern("h.*o"); +// while (ReadLine(&str)) { +// if (PCRE::FullMatch(str, pattern)) ...; +// } +// +// ----------------------------------------------------------------------- +// SCANNING TEXT INCREMENTALLY +// +// The "Consume" operation may be useful if you want to repeatedly +// match regular expressions at the front of a string and skip over +// them as they match. This requires use of the "StringPiece" type, +// which represents a sub-range of a real string. +// +// Example: read lines of the form "var = value" from a string. +// std::string contents = ...; // Fill string somehow +// StringPiece input(contents); // Wrap a StringPiece around it +// +// std::string var; +// int value; +// while (PCRE::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) { +// ...; +// } +// +// Each successful call to "Consume" will set "var/value", and also +// advance "input" so it points past the matched text. Note that if the +// regular expression matches an empty string, input will advance +// by 0 bytes. If the regular expression being used might match +// an empty string, the loop body must check for this case and either +// advance the string or break out of the loop. +// +// The "FindAndConsume" operation is similar to "Consume" but does not +// anchor your match at the beginning of the string. For example, you +// could extract all words from a string by repeatedly calling +// PCRE::FindAndConsume(&input, "(\\w+)", &word) +// +// ----------------------------------------------------------------------- +// PARSING HEX/OCTAL/C-RADIX NUMBERS +// +// By default, if you pass a pointer to a numeric value, the +// corresponding text is interpreted as a base-10 number. You can +// instead wrap the pointer with a call to one of the operators Hex(), +// Octal(), or CRadix() to interpret the text in another base. 
The +// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16) +// prefixes, but defaults to base-10. +// +// Example: +// int a, b, c, d; +// CHECK(PCRE::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)", +// Octal(&a), Hex(&b), CRadix(&c), CRadix(&d)); +// will leave 64 in a, b, c, and d. + +#include "util/util.h" +#include "re2/stringpiece.h" + +#ifdef USEPCRE +#include +namespace re2 { +const bool UsingPCRE = true; +} // namespace re2 +#else +struct pcre; // opaque +namespace re2 { +const bool UsingPCRE = false; +} // namespace re2 +#endif + +namespace re2 { + +class PCRE_Options; + +// Interface for regular expression matching. Also corresponds to a +// pre-compiled regular expression. An "PCRE" object is safe for +// concurrent use by multiple threads. +class PCRE { + public: + // We convert user-passed pointers into special Arg objects + class Arg; + + // Marks end of arg list. + // ONLY USE IN OPTIONAL ARG DEFAULTS. + // DO NOT PASS EXPLICITLY. + static Arg no_more_args; + + // Options are same value as those in pcre. We provide them here + // to avoid users needing to include pcre.h and also to isolate + // users from pcre should we change the underlying library. + // Only those needed by Google programs are exposed here to + // avoid collision with options employed internally by regexp.cc + // Note that some options have equivalents that can be specified in + // the regexp itself. For example, prefixing your regexp with + // "(?s)" has the same effect as the PCRE_DOTALL option. + enum Option { + None = 0x0000, + UTF8 = 0x0800, // == PCRE_UTF8 + EnabledCompileOptions = UTF8, + EnabledExecOptions = 0x0000, // TODO: use to replace anchor flag + }; + + // We provide implicit conversions from strings so that users can + // pass in a string or a "const char*" wherever an "PCRE" is expected. 
+ PCRE(const char* pattern); + PCRE(const char* pattern, Option option); + PCRE(const std::string& pattern); + PCRE(const std::string& pattern, Option option); + PCRE(const char *pattern, const PCRE_Options& re_option); + PCRE(const std::string& pattern, const PCRE_Options& re_option); + + ~PCRE(); + + // The string specification for this PCRE. E.g. + // PCRE re("ab*c?d+"); + // re.pattern(); // "ab*c?d+" + const std::string& pattern() const { return pattern_; } + + // If PCRE could not be created properly, returns an error string. + // Else returns the empty string. + const std::string& error() const { return *error_; } + + // Whether the PCRE has hit a match limit during execution. + // Not thread safe. Intended only for testing. + // If hitting match limits is a problem, + // you should be using PCRE2 (re2/re2.h) + // instead of checking this flag. + bool HitLimit(); + void ClearHitLimit(); + + /***** The useful part: the matching interface *****/ + + // Matches "text" against "pattern". If pointer arguments are + // supplied, copies matched sub-patterns into them. + // + // You can pass in a "const char*" or a "std::string" for "text". + // You can pass in a "const char*" or a "std::string" or a "PCRE" for "pattern". + // + // The provided pointer arguments can be pointers to any scalar numeric + // type, or one of: + // std::string (matched piece is copied to string) + // StringPiece (StringPiece is mutated to point to matched piece) + // T (where "bool T::ParseFrom(const char*, size_t)" exists) + // (void*)NULL (the corresponding matched sub-pattern is not copied) + // + // Returns true iff all of the following conditions are satisfied: + // a. "text" matches "pattern" exactly + // b. The number of matched sub-patterns is >= number of supplied pointers + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. 
If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, "i"th captured sub-pattern is + // ignored. + // + // CAVEAT: An optional sub-pattern that does not exist in the + // matched string is assigned the empty string. Therefore, the + // following will return false (because the empty string is not a + // valid number): + // int number; + // PCRE::FullMatch("abc", "[a-z]+(\\d+)?", &number); + struct FullMatchFunctor { + bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const FullMatchFunctor FullMatch; + + // Exactly like FullMatch(), except that "pattern" is allowed to match + // a substring of "text". 
+ struct PartialMatchFunctor { + bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const PartialMatchFunctor PartialMatch; + + // Like FullMatch() and PartialMatch(), except that pattern has to + // match a prefix of "text", and "input" is advanced past the matched + // text. Note: "input" is modified iff this routine returns true. + struct ConsumeFunctor { + bool operator ()(StringPiece* input, const PCRE& pattern, // 3..16 args + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const ConsumeFunctor Consume; + + // Like Consume(..), but does not anchor the match at the beginning of the + // string. That is, "pattern" need not start its match at the beginning of + // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next + // word in "s" and stores it in "word". 
+ struct FindAndConsumeFunctor { + bool operator ()(StringPiece* input, const PCRE& pattern, + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const FindAndConsumeFunctor FindAndConsume; + + // Replace the first match of "pattern" in "str" with "rewrite". + // Within "rewrite", backslash-escaped digits (\1 to \9) can be + // used to insert text matching corresponding parenthesized group + // from the pattern. \0 in "rewrite" refers to the entire matching + // text. E.g., + // + // std::string s = "yabba dabba doo"; + // CHECK(PCRE::Replace(&s, "b+", "d")); + // + // will leave "s" containing "yada dabba doo" + // + // Returns true if the pattern matches and a replacement occurs, + // false otherwise. + static bool Replace(std::string *str, + const PCRE& pattern, + const StringPiece& rewrite); + + // Like Replace(), except replaces all occurrences of the pattern in + // the string with the rewrite. Replacements are not subject to + // re-matching. E.g., + // + // std::string s = "yabba dabba doo"; + // CHECK(PCRE::GlobalReplace(&s, "b+", "d")); + // + // will leave "s" containing "yada dada doo" + // + // Returns the number of replacements made. + static int GlobalReplace(std::string *str, + const PCRE& pattern, + const StringPiece& rewrite); + + // Like Replace, except that if the pattern matches, "rewrite" + // is copied into "out" with substitutions. The non-matching + // portions of "text" are ignored. 
+ // + // Returns true iff a match occurred and the extraction happened + // successfully; if no match occurs, the string is left unaffected. + static bool Extract(const StringPiece &text, + const PCRE& pattern, + const StringPiece &rewrite, + std::string *out); + + // Check that the given @p rewrite string is suitable for use with + // this PCRE. It checks that: + // * The PCRE has enough parenthesized subexpressions to satisfy all + // of the \N tokens in @p rewrite, and + // * The @p rewrite string doesn't have any syntax errors + // ('\' followed by anything besides [0-9] and '\'). + // Making this test will guarantee that "replace" and "extract" + // operations won't LOG(ERROR) or fail because of a bad rewrite + // string. + // @param rewrite The proposed rewrite string. + // @param error An error message is recorded here, iff we return false. + // Otherwise, it is unchanged. + // @return true, iff @p rewrite is suitable for use with the PCRE. + bool CheckRewriteString(const StringPiece& rewrite, + std::string* error) const; + + // Returns a copy of 'unquoted' with all potentially meaningful + // regexp characters backslash-escaped. The returned string, used + // as a regular expression, will exactly match the original string. + // For example, + // 1.5-2.0? + // becomes: + // 1\.5\-2\.0\? + static std::string QuoteMeta(const StringPiece& unquoted); + + /***** Generic matching interface (not so nice to use) *****/ + + // Type of match (TODO: Should be restructured as an Option) + enum Anchor { + UNANCHORED, // No anchoring + ANCHOR_START, // Anchor at start only + ANCHOR_BOTH, // Anchor at start and end + }; + + // General matching routine. Stores the length of the match in + // "*consumed" if successful. + bool DoMatch(const StringPiece& text, + Anchor anchor, + size_t* consumed, + const Arg* const* args, int n) const; + + // Return the number of capturing subpatterns, or -1 if the + // regexp wasn't valid on construction. 
+ int NumberOfCapturingGroups() const; + + private: + void Init(const char* pattern, Option option, int match_limit, + int stack_limit, bool report_errors); + + // Match against "text", filling in "vec" (up to "vecsize" * 2/3) with + // pairs of integers for the beginning and end positions of matched + // text. The first pair corresponds to the entire matched text; + // subsequent pairs correspond, in order, to parentheses-captured + // matches. Returns the number of pairs (one more than the number of + // the last subpattern with a match) if matching was successful + // and zero if the match failed. + // I.e. for PCRE("(foo)|(bar)|(baz)") it will return 2, 3, and 4 when matching + // against "foo", "bar", and "baz" respectively. + // When matching PCRE("(foo)|hello") against "hello", it will return 1. + // But the values for all subpattern are filled in into "vec". + int TryMatch(const StringPiece& text, + size_t startpos, + Anchor anchor, + bool empty_ok, + int *vec, + int vecsize) const; + + // Append the "rewrite" string, with backslash subsitutions from "text" + // and "vec", to string "out". 
+ bool Rewrite(std::string *out, + const StringPiece &rewrite, + const StringPiece &text, + int *vec, + int veclen) const; + + // internal implementation for DoMatch + bool DoMatchImpl(const StringPiece& text, + Anchor anchor, + size_t* consumed, + const Arg* const args[], + int n, + int* vec, + int vecsize) const; + + // Compile the regexp for the specified anchoring mode + pcre* Compile(Anchor anchor); + + std::string pattern_; + Option options_; + pcre* re_full_; // For full matches + pcre* re_partial_; // For partial matches + const std::string* error_; // Error indicator (or empty string) + bool report_errors_; // Silences error logging if false + int match_limit_; // Limit on execution resources + int stack_limit_; // Limit on stack resources (bytes) + mutable int32_t hit_limit_; // Hit limit during execution (bool) + + PCRE(const PCRE&) = delete; + PCRE& operator=(const PCRE&) = delete; +}; + +// PCRE_Options allow you to set the PCRE::Options, plus any pcre +// "extra" options. The only extras are match_limit, which limits +// the CPU time of a match, and stack_limit, which limits the +// stack usage. Setting a limit to <= 0 lets PCRE pick a sensible default +// that should not cause too many problems in production code. +// If PCRE hits a limit during a match, it may return a false negative, +// but (hopefully) it won't crash. +// +// NOTE: If you are handling regular expressions specified by +// (external or internal) users, rather than hard-coded ones, +// you should be using PCRE2, which uses an alternate implementation +// that avoids these issues. See http://go/re2quick. 
+class PCRE_Options { + public: + // constructor + PCRE_Options() : option_(PCRE::None), match_limit_(0), stack_limit_(0), report_errors_(true) {} + // accessors + PCRE::Option option() const { return option_; } + void set_option(PCRE::Option option) { + option_ = option; + } + int match_limit() const { return match_limit_; } + void set_match_limit(int match_limit) { + match_limit_ = match_limit; + } + int stack_limit() const { return stack_limit_; } + void set_stack_limit(int stack_limit) { + stack_limit_ = stack_limit; + } + + // If the regular expression is malformed, an error message will be printed + // iff report_errors() is true. Default: true. + bool report_errors() const { return report_errors_; } + void set_report_errors(bool report_errors) { + report_errors_ = report_errors; + } + private: + PCRE::Option option_; + int match_limit_; + int stack_limit_; + bool report_errors_; +}; + + +/***** Implementation details *****/ + +// Hex/Octal/Binary? + +// Special class for parsing into objects that define a ParseFrom() method +template +class _PCRE_MatchObject { + public: + static inline bool Parse(const char* str, size_t n, void* dest) { + if (dest == NULL) return true; + T* object = reinterpret_cast(dest); + return object->ParseFrom(str, n); + } +}; + +class PCRE::Arg { + public: + // Empty constructor so we can declare arrays of PCRE::Arg + Arg(); + + // Constructor specially designed for NULL arguments + Arg(void*); + + typedef bool (*Parser)(const char* str, size_t n, void* dest); + +// Type-specific parsers +#define MAKE_PARSER(type, name) \ + Arg(type* p) : arg_(p), parser_(name) {} \ + Arg(type* p, Parser parser) : arg_(p), parser_(parser) {} + + MAKE_PARSER(char, parse_char); + MAKE_PARSER(signed char, parse_schar); + MAKE_PARSER(unsigned char, parse_uchar); + MAKE_PARSER(float, parse_float); + MAKE_PARSER(double, parse_double); + MAKE_PARSER(std::string, parse_string); + MAKE_PARSER(StringPiece, parse_stringpiece); + + MAKE_PARSER(short, 
parse_short); + MAKE_PARSER(unsigned short, parse_ushort); + MAKE_PARSER(int, parse_int); + MAKE_PARSER(unsigned int, parse_uint); + MAKE_PARSER(long, parse_long); + MAKE_PARSER(unsigned long, parse_ulong); + MAKE_PARSER(long long, parse_longlong); + MAKE_PARSER(unsigned long long, parse_ulonglong); + +#undef MAKE_PARSER + + // Generic constructor + template Arg(T*, Parser parser); + // Generic constructor template + template Arg(T* p) + : arg_(p), parser_(_PCRE_MatchObject::Parse) { + } + + // Parse the data + bool Parse(const char* str, size_t n) const; + + private: + void* arg_; + Parser parser_; + + static bool parse_null (const char* str, size_t n, void* dest); + static bool parse_char (const char* str, size_t n, void* dest); + static bool parse_schar (const char* str, size_t n, void* dest); + static bool parse_uchar (const char* str, size_t n, void* dest); + static bool parse_float (const char* str, size_t n, void* dest); + static bool parse_double (const char* str, size_t n, void* dest); + static bool parse_string (const char* str, size_t n, void* dest); + static bool parse_stringpiece (const char* str, size_t n, void* dest); + +#define DECLARE_INTEGER_PARSER(name) \ + private: \ + static bool parse_##name(const char* str, size_t n, void* dest); \ + static bool parse_##name##_radix(const char* str, size_t n, void* dest, \ + int radix); \ + \ + public: \ + static bool parse_##name##_hex(const char* str, size_t n, void* dest); \ + static bool parse_##name##_octal(const char* str, size_t n, void* dest); \ + static bool parse_##name##_cradix(const char* str, size_t n, void* dest) + + DECLARE_INTEGER_PARSER(short); + DECLARE_INTEGER_PARSER(ushort); + DECLARE_INTEGER_PARSER(int); + DECLARE_INTEGER_PARSER(uint); + DECLARE_INTEGER_PARSER(long); + DECLARE_INTEGER_PARSER(ulong); + DECLARE_INTEGER_PARSER(longlong); + DECLARE_INTEGER_PARSER(ulonglong); + +#undef DECLARE_INTEGER_PARSER + +}; + +inline PCRE::Arg::Arg() : arg_(NULL), parser_(parse_null) { } +inline 
PCRE::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { } + +inline bool PCRE::Arg::Parse(const char* str, size_t n) const { + return (*parser_)(str, n, arg_); +} + +// This part of the parser, appropriate only for ints, deals with bases +#define MAKE_INTEGER_PARSER(type, name) \ + inline PCRE::Arg Hex(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_hex); \ + } \ + inline PCRE::Arg Octal(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_octal); \ + } \ + inline PCRE::Arg CRadix(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_cradix); \ + } + +MAKE_INTEGER_PARSER(short, short); +MAKE_INTEGER_PARSER(unsigned short, ushort); +MAKE_INTEGER_PARSER(int, int); +MAKE_INTEGER_PARSER(unsigned int, uint); +MAKE_INTEGER_PARSER(long, long); +MAKE_INTEGER_PARSER(unsigned long, ulong); +MAKE_INTEGER_PARSER(long long, longlong); +MAKE_INTEGER_PARSER(unsigned long long, ulonglong); + +#undef MAKE_INTEGER_PARSER + +} // namespace re2 + +#endif // UTIL_PCRE_H_ diff --git a/util/rune.cc b/util/rune.cc new file mode 100644 index 0000000000000000000000000000000000000000..4f625ea380f4c77e1c8f66f2caf4d4a0c67d6f7b --- /dev/null +++ b/util/rune.cc @@ -0,0 +1,260 @@ +/* + * The authors of this software are Rob Pike and Ken Thompson. + * Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. 
+ */ + +#include +#include + +#include "util/utf.h" + +namespace re2 { + +enum +{ + Bit1 = 7, + Bitx = 6, + Bit2 = 5, + Bit3 = 4, + Bit4 = 3, + Bit5 = 2, + + T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ + Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ + T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ + T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ + T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ + + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, + /* 0001 1111 1111 1111 1111 1111 */ + + Maskx = (1< T1 + */ + c = *(unsigned char*)str; + if(c < Tx) { + *rune = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + c1 = *(unsigned char*)(str+1) ^ Tx; + if(c1 & Testx) + goto bad; + if(c < T3) { + if(c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if(l <= Rune1) + goto bad; + *rune = l; + return 2; + } + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(unsigned char*)(str+2) ^ Tx; + if(c2 & Testx) + goto bad; + if(c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if(l <= Rune2) + goto bad; + *rune = l; + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + c3 = *(unsigned char*)(str+3) ^ Tx; + if (c3 & Testx) + goto bad; + if (c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if (l <= Rune3) + goto bad; + *rune = l; + return 4; + } + + /* + * Support for 5-byte or longer UTF-8 would go here, but + * since we don't have that, we'll just fall through to bad. + */ + + /* + * bad decoding + */ +bad: + *rune = Bad; + return 1; +} + +int +runetochar(char *str, const Rune *rune) +{ + /* Runes are signed, so convert to unsigned for range check. 
*/ + unsigned long c; + + /* + * one character sequence + * 00000-0007F => 00-7F + */ + c = *rune; + if(c <= Rune1) { + str[0] = static_cast(c); + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + if(c <= Rune2) { + str[0] = T2 | static_cast(c >> 1*Bitx); + str[1] = Tx | (c & Maskx); + return 2; + } + + /* + * If the Rune is out of range, convert it to the error rune. + * Do this test here because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. + */ + if (c > Runemax) + c = Runeerror; + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + if (c <= Rune3) { + str[0] = T3 | static_cast(c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | static_cast(c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; +} + +int +runelen(Rune rune) +{ + char str[10]; + + return runetochar(str, &rune); +} + +int +fullrune(const char *str, int n) +{ + if (n > 0) { + int c = *(unsigned char*)str; + if (c < Tx) + return 1; + if (n > 1) { + if (c < T3) + return 1; + if (n > 2) { + if (c < T4 || n > 3) + return 1; + } + } + } + return 0; +} + + +int +utflen(const char *s) +{ + int c; + long n; + Rune rune; + + n = 0; + for(;;) { + c = *(unsigned char*)s; + if(c < Runeself) { + if(c == 0) + return n; + s++; + } else + s += chartorune(&rune, s); + n++; + } + return 0; +} + +char* +utfrune(const char *s, Rune c) +{ + long c1; + Rune r; + int n; + + if(c < Runesync) /* not part of utf sequence */ + return strchr((char*)s, c); + + for(;;) { + c1 = *(unsigned char*)s; + if(c1 < Runeself) { /* one byte rune */ + if(c1 == 0) + return 0; + if(c1 == c) + return (char*)s; + s++; + continue; + } + n = chartorune(&r, s); + if(r == 
c) + return (char*)s; + s += n; + } + return 0; +} + +} // namespace re2 diff --git a/util/strutil.cc b/util/strutil.cc new file mode 100644 index 0000000000000000000000000000000000000000..fb7e6b1b0c776f86338845aa92cde1db2dc5c6cb --- /dev/null +++ b/util/strutil.cc @@ -0,0 +1,149 @@ +// Copyright 1999-2005 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include + +#include "util/strutil.h" + +#ifdef _WIN32 +#define snprintf _snprintf +#define vsnprintf _vsnprintf +#endif + +namespace re2 { + +// ---------------------------------------------------------------------- +// CEscapeString() +// Copies 'src' to 'dest', escaping dangerous characters using +// C-style escape sequences. 'src' and 'dest' should not overlap. +// Returns the number of bytes written to 'dest' (not including the \0) +// or (size_t)-1 if there was insufficient space. +// ---------------------------------------------------------------------- +static size_t CEscapeString(const char* src, size_t src_len, + char* dest, size_t dest_len) { + const char* src_end = src + src_len; + size_t used = 0; + + for (; src < src_end; src++) { + if (dest_len - used < 2) // space for two-character escape + return (size_t)-1; + + unsigned char c = *src; + switch (c) { + case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break; + case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break; + case '\t': dest[used++] = '\\'; dest[used++] = 't'; break; + case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break; + case '\'': dest[used++] = '\\'; dest[used++] = '\''; break; + case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break; + default: + // Note that if we emit \xNN and the src character after that is a hex + // digit then that digit must be escaped too to prevent it being + // interpreted as part of the character code by C. 
+ if (c < ' ' || c > '~') { + if (dest_len - used < 5) // space for four-character escape + \0 + return (size_t)-1; + snprintf(dest + used, 5, "\\%03o", c); + used += 4; + } else { + dest[used++] = c; break; + } + } + } + + if (dest_len - used < 1) // make sure that there is room for \0 + return (size_t)-1; + + dest[used] = '\0'; // doesn't count towards return value though + return used; +} + +// ---------------------------------------------------------------------- +// CEscape() +// Copies 'src' to result, escaping dangerous characters using +// C-style escape sequences. 'src' and 'dest' should not overlap. +// ---------------------------------------------------------------------- +std::string CEscape(const StringPiece& src) { + const size_t dest_len = src.size() * 4 + 1; // Maximum possible expansion + char* dest = new char[dest_len]; + const size_t used = CEscapeString(src.data(), src.size(), + dest, dest_len); + std::string s = std::string(dest, used); + delete[] dest; + return s; +} + +void PrefixSuccessor(std::string* prefix) { + // We can increment the last character in the string and be done + // unless that character is 255, in which case we have to erase the + // last character and increment the previous character, unless that + // is 255, etc. If the string is empty or consists entirely of + // 255's, we just return the empty string. + while (!prefix->empty()) { + char& c = prefix->back(); + if (c == '\xff') { // char literal avoids signed/unsigned. + prefix->pop_back(); + } else { + ++c; + break; + } + } +} + +static void StringAppendV(std::string* dst, const char* format, va_list ap) { + // First try with a small fixed size buffer + char space[1024]; + + // It's possible for methods that use a va_list to invalidate + // the data in it upon use. The fix is to make a copy + // of the structure before using it and use that copy instead. 
+ va_list backup_ap; + va_copy(backup_ap, ap); + int result = vsnprintf(space, sizeof(space), format, backup_ap); + va_end(backup_ap); + + if ((result >= 0) && (static_cast(result) < sizeof(space))) { + // It fit + dst->append(space, result); + return; + } + + // Repeatedly increase buffer size until it fits + int length = sizeof(space); + while (true) { + if (result < 0) { + // Older behavior: just try doubling the buffer size + length *= 2; + } else { + // We need exactly "result+1" characters + length = result+1; + } + char* buf = new char[length]; + + // Restore the va_list before we use it again + va_copy(backup_ap, ap); + result = vsnprintf(buf, length, format, backup_ap); + va_end(backup_ap); + + if ((result >= 0) && (result < length)) { + // It fit + dst->append(buf, result); + delete[] buf; + return; + } + delete[] buf; + } +} + +std::string StringPrintf(const char* format, ...) { + va_list ap; + va_start(ap, format); + std::string result; + StringAppendV(&result, format, ap); + va_end(ap); + return result; +} + +} // namespace re2 diff --git a/util/strutil.h b/util/strutil.h new file mode 100644 index 0000000000000000000000000000000000000000..a69908a0dd94108de70d0b8a8401262ae070645a --- /dev/null +++ b/util/strutil.h @@ -0,0 +1,21 @@ +// Copyright 2016 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_STRUTIL_H_ +#define UTIL_STRUTIL_H_ + +#include + +#include "re2/stringpiece.h" +#include "util/util.h" + +namespace re2 { + +std::string CEscape(const StringPiece& src); +void PrefixSuccessor(std::string* prefix); +std::string StringPrintf(const char* format, ...); + +} // namespace re2 + +#endif // UTIL_STRUTIL_H_ diff --git a/util/test.cc b/util/test.cc new file mode 100644 index 0000000000000000000000000000000000000000..028616b359ac4d803d1a018420e1e2b43f49dbc9 --- /dev/null +++ b/util/test.cc @@ -0,0 +1,34 @@ +// Copyright 2009 The RE2 Authors. 
All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include + +#include "util/test.h" + +namespace testing { +std::string TempDir() { return "/tmp/"; } +} // namespace testing + +struct Test { + void (*fn)(void); + const char *name; +}; + +static Test tests[10000]; +static int ntests; + +void RegisterTest(void (*fn)(void), const char *name) { + tests[ntests].fn = fn; + tests[ntests++].name = name; +} + +int main(int argc, char** argv) { + for (int i = 0; i < ntests; i++) { + printf("%s\n", tests[i].name); + tests[i].fn(); + } + printf("PASS\n"); + return 0; +} diff --git a/util/test.h b/util/test.h new file mode 100644 index 0000000000000000000000000000000000000000..54e6f8fbbbc845ede96eaacd59f27d400fe3b971 --- /dev/null +++ b/util/test.h @@ -0,0 +1,50 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_TEST_H_ +#define UTIL_TEST_H_ + +#include "util/util.h" +#include "util/logging.h" + +namespace testing { +std::string TempDir(); +} // namespace testing + +#define TEST(x, y) \ + void x##y(void); \ + TestRegisterer r##x##y(x##y, # x "." # y); \ + void x##y(void) + +void RegisterTest(void (*)(void), const char*); + +class TestRegisterer { + public: + TestRegisterer(void (*fn)(void), const char *s) { + RegisterTest(fn, s); + } +}; + +// fatal assertions +#define ASSERT_TRUE CHECK +#define ASSERT_FALSE(x) CHECK(!(x)) +#define ASSERT_EQ CHECK_EQ +#define ASSERT_NE CHECK_NE +#define ASSERT_LT CHECK_LT +#define ASSERT_LE CHECK_LE +#define ASSERT_GT CHECK_GT +#define ASSERT_GE CHECK_GE + +// nonfatal assertions +// TODO(rsc): Do a better job? 
+#define EXPECT_TRUE CHECK +#define EXPECT_FALSE(x) CHECK(!(x)) +#define EXPECT_EQ CHECK_EQ +#define EXPECT_NE CHECK_NE +#define EXPECT_LT CHECK_LT +#define EXPECT_LE CHECK_LE +#define EXPECT_GT CHECK_GT +#define EXPECT_GE CHECK_GE + +#endif // UTIL_TEST_H_ diff --git a/util/utf.h b/util/utf.h new file mode 100644 index 0000000000000000000000000000000000000000..85b42972390159f8ef9286417485f0d252ce7c74 --- /dev/null +++ b/util/utf.h @@ -0,0 +1,44 @@ +/* + * The authors of this software are Rob Pike and Ken Thompson. + * Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + * + * This file and rune.cc have been converted to compile as C++ code + * in name space re2. 
+ */ + +#ifndef UTIL_UTF_H_ +#define UTIL_UTF_H_ + +#include + +namespace re2 { + +typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/ + +enum +{ + UTFmax = 4, /* maximum bytes per rune */ + Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ + Runeself = 0x80, /* rune and UTF sequences are the same (<) */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0x10FFFF, /* maximum rune value */ +}; + +int runetochar(char* s, const Rune* r); +int chartorune(Rune* r, const char* s); +int fullrune(const char* s, int n); +int utflen(const char* s); +char* utfrune(const char*, Rune); + +} // namespace re2 + +#endif // UTIL_UTF_H_ diff --git a/util/util.h b/util/util.h new file mode 100644 index 0000000000000000000000000000000000000000..56e46c1a3385bbd7e21fcf321281f4f42ec81dd9 --- /dev/null +++ b/util/util.h @@ -0,0 +1,42 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_UTIL_H_ +#define UTIL_UTIL_H_ + +#define arraysize(array) (sizeof(array)/sizeof((array)[0])) + +#ifndef ATTRIBUTE_NORETURN +#if defined(__GNUC__) +#define ATTRIBUTE_NORETURN __attribute__((noreturn)) +#elif defined(_MSC_VER) +#define ATTRIBUTE_NORETURN __declspec(noreturn) +#else +#define ATTRIBUTE_NORETURN +#endif +#endif + +#ifndef ATTRIBUTE_UNUSED +#if defined(__GNUC__) +#define ATTRIBUTE_UNUSED __attribute__((unused)) +#else +#define ATTRIBUTE_UNUSED +#endif +#endif + +#ifndef FALLTHROUGH_INTENDED +#if defined(__clang__) +#define FALLTHROUGH_INTENDED [[clang::fallthrough]] +#elif defined(__GNUC__) && __GNUC__ >= 7 +#define FALLTHROUGH_INTENDED [[gnu::fallthrough]] +#else +#define FALLTHROUGH_INTENDED do {} while (0) +#endif +#endif + +#ifndef NO_THREAD_SAFETY_ANALYSIS +#define NO_THREAD_SAFETY_ANALYSIS +#endif + +#endif // UTIL_UTIL_H_