From 0d239f0ddbaf7ead30919879219e0c27a263d9b3 Mon Sep 17 00:00:00 2001 From: yangwentong <425822674@qq.com> Date: Fri, 30 Sep 2022 08:48:46 +0800 Subject: [PATCH] =?UTF-8?q?fuzz=E6=98=AFre2=E9=81=97=E7=95=99=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=EF=BC=8C=E6=9C=AA=E4=BD=BF=E7=94=A8=EF=BC=8C=E5=B7=B2?= =?UTF-8?q?=E7=A7=BB=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- re2/fuzzing/compiler-rt/LICENSE | 219 ------------- .../include/fuzzer/FuzzedDataProvider.h | 305 ------------------ re2/fuzzing/re2_fuzzer.cc | 247 -------------- 3 files changed, 771 deletions(-) delete mode 100644 re2/fuzzing/compiler-rt/LICENSE delete mode 100644 re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h delete mode 100644 re2/fuzzing/re2_fuzzer.cc diff --git a/re2/fuzzing/compiler-rt/LICENSE b/re2/fuzzing/compiler-rt/LICENSE deleted file mode 100644 index f9dc506..0000000 --- a/re2/fuzzing/compiler-rt/LICENSE +++ /dev/null @@ -1,219 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - ---- LLVM Exceptions to the Apache 2.0 License ---- - -As an exception, if, as a result of your compiling your source code, portions -of this Software are embedded into an Object form of such source code, you -may redistribute such embedded portions in such Object form without complying -with the conditions of Sections 4(a), 4(b) and 4(d) of the License. - -In addition, if you combine or link compiled forms of this Software with -software that is licensed under the GPLv2 ("Combined Software") and if a -court of competent jurisdiction determines that the patent provision (Section -3), the indemnity provision (Section 9) or other Section of the License -conflicts with the conditions of the GPLv2, you may retroactively and -prospectively choose to deem waived or otherwise exclude such Section(s) of -the License, but only in their entirety and only with respect to the Combined -Software. - diff --git a/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h b/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h deleted file mode 100644 index 3e069eb..0000000 --- a/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h +++ /dev/null @@ -1,305 +0,0 @@ -//===- FuzzedDataProvider.h - Utility header for fuzz targets ---*- C++ -* ===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// A single header library providing an utility class to break up an array of -// bytes. Whenever run on the same input, provides the same output, as long as -// its methods are called in the same order, with the same arguments. -//===----------------------------------------------------------------------===// - -#ifndef LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ -#define LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// In addition to the comments below, the API is also briefly documented at -// https://github.com/google/fuzzing/blob/master/docs/split-inputs.md#fuzzed-data-provider -class FuzzedDataProvider { - public: - // |data| is an array of length |size| that the FuzzedDataProvider wraps to - // provide more granular access. |data| must outlive the FuzzedDataProvider. - FuzzedDataProvider(const uint8_t *data, size_t size) - : data_ptr_(data), remaining_bytes_(size) {} - ~FuzzedDataProvider() = default; - - // Returns a std::vector containing |num_bytes| of input data. If fewer than - // |num_bytes| of data remain, returns a shorter std::vector containing all - // of the data that's left. Can be used with any byte sized type, such as - // char, unsigned char, uint8_t, etc. - template std::vector ConsumeBytes(size_t num_bytes) { - num_bytes = std::min(num_bytes, remaining_bytes_); - return ConsumeBytes(num_bytes, num_bytes); - } - - // Similar to |ConsumeBytes|, but also appends the terminator value at the end - // of the resulting vector. Useful, when a mutable null-terminated C-string is - // needed, for example. But that is a rare case. Better avoid it, if possible, - // and prefer using |ConsumeBytes| or |ConsumeBytesAsString| methods. - template - std::vector ConsumeBytesWithTerminator(size_t num_bytes, - T terminator = 0) { - num_bytes = std::min(num_bytes, remaining_bytes_); - std::vector result = ConsumeBytes(num_bytes + 1, num_bytes); - result.back() = terminator; - return result; - } - - // Returns a std::string containing |num_bytes| of input data. Using this and - // |.c_str()| on the resulting string is the best way to get an immutable - // null-terminated C string. If fewer than |num_bytes| of data remain, returns - // a shorter std::string containing all of the data that's left. - std::string ConsumeBytesAsString(size_t num_bytes) { - static_assert(sizeof(std::string::value_type) == sizeof(uint8_t), - "ConsumeBytesAsString cannot convert the data to a string."); - - num_bytes = std::min(num_bytes, remaining_bytes_); - std::string result( - reinterpret_cast(data_ptr_), - num_bytes); - Advance(num_bytes); - return result; - } - - // Returns a number in the range [min, max] by consuming bytes from the - // input data. The value might not be uniformly distributed in the given - // range. If there's no input data left, always returns |min|. |min| must - // be less than or equal to |max|. - template T ConsumeIntegralInRange(T min, T max) { - static_assert(std::is_integral::value, "An integral type is required."); - static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type."); - - if (min > max) - abort(); - - // Use the biggest type possible to hold the range and the result. - uint64_t range = static_cast(max) - min; - uint64_t result = 0; - size_t offset = 0; - - while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 && - remaining_bytes_ != 0) { - // Pull bytes off the end of the seed data. Experimentally, this seems to - // allow the fuzzer to more easily explore the input space. This makes - // sense, since it works by modifying inputs that caused new code to run, - // and this data is often used to encode length of data read by - // |ConsumeBytes|. Separating out read lengths makes it easier modify the - // contents of the data that is actually read. - --remaining_bytes_; - result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_]; - offset += CHAR_BIT; - } - - // Avoid division by 0, in case |range + 1| results in overflow. - if (range != std::numeric_limits::max()) - result = result % (range + 1); - - return static_cast(min + result); - } - - // Returns a std::string of length from 0 to |max_length|. When it runs out of - // input data, returns what remains of the input. Designed to be more stable - // with respect to a fuzzer inserting characters than just picking a random - // length and then consuming that many bytes with |ConsumeBytes|. - std::string ConsumeRandomLengthString(size_t max_length) { - // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\" - // followed by anything else to the end of the string. As a result of this - // logic, a fuzzer can insert characters into the string, and the string - // will be lengthened to include those new characters, resulting in a more - // stable fuzzer than picking the length of a string independently from - // picking its contents. - std::string result; - - // Reserve the anticipated capaticity to prevent several reallocations. - result.reserve(std::min(max_length, remaining_bytes_)); - for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) { - char next = ConvertUnsignedToSigned(data_ptr_[0]); - Advance(1); - if (next == '\\' && remaining_bytes_ != 0) { - next = ConvertUnsignedToSigned(data_ptr_[0]); - Advance(1); - if (next != '\\') - break; - } - result += next; - } - - result.shrink_to_fit(); - return result; - } - - // Returns a std::vector containing all remaining bytes of the input data. - template std::vector ConsumeRemainingBytes() { - return ConsumeBytes(remaining_bytes_); - } - - // Returns a std::string containing all remaining bytes of the input data. - // Prefer using |ConsumeRemainingBytes| unless you actually need a std::string - // object. - std::string ConsumeRemainingBytesAsString() { - return ConsumeBytesAsString(remaining_bytes_); - } - - // Returns a number in the range [Type's min, Type's max]. The value might - // not be uniformly distributed in the given range. If there's no input data - // left, always returns |min|. - template T ConsumeIntegral() { - return ConsumeIntegralInRange(std::numeric_limits::min(), - std::numeric_limits::max()); - } - - // Reads one byte and returns a bool, or false when no data remains. - bool ConsumeBool() { return 1 & ConsumeIntegral(); } - - // Returns a copy of the value selected from the given fixed-size |array|. - template - T PickValueInArray(const T (&array)[size]) { - static_assert(size > 0, "The array must be non empty."); - return array[ConsumeIntegralInRange(0, size - 1)]; - } - - template - T PickValueInArray(std::initializer_list list) { - // TODO(Dor1s): switch to static_assert once C++14 is allowed. - if (!list.size()) - abort(); - - return *(list.begin() + ConsumeIntegralInRange(0, list.size() - 1)); - } - - // Returns an enum value. The enum must start at 0 and be contiguous. It must - // also contain |kMaxValue| aliased to its largest (inclusive) value. Such as: - // enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue }; - template T ConsumeEnum() { - static_assert(std::is_enum::value, "|T| must be an enum type."); - return static_cast(ConsumeIntegralInRange( - 0, static_cast(T::kMaxValue))); - } - - // Returns a floating point number in the range [0.0, 1.0]. If there's no - // input data left, always returns 0. - template T ConsumeProbability() { - static_assert(std::is_floating_point::value, - "A floating point type is required."); - - // Use different integral types for different floating point types in order - // to provide better density of the resulting values. - using IntegralType = - typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), uint32_t, - uint64_t>::type; - - T result = static_cast(ConsumeIntegral()); - result /= static_cast(std::numeric_limits::max()); - return result; - } - - // Returns a floating point value in the range [Type's lowest, Type's max] by - // consuming bytes from the input data. If there's no input data left, always - // returns approximately 0. - template T ConsumeFloatingPoint() { - return ConsumeFloatingPointInRange(std::numeric_limits::lowest(), - std::numeric_limits::max()); - } - - // Returns a floating point value in the given range by consuming bytes from - // the input data. If there's no input data left, returns |min|. Note that - // |min| must be less than or equal to |max|. - template T ConsumeFloatingPointInRange(T min, T max) { - if (min > max) - abort(); - - T range = .0; - T result = min; - constexpr T zero(.0); - if (max > zero && min < zero && max > min + std::numeric_limits::max()) { - // The diff |max - min| would overflow the given floating point type. Use - // the half of the diff as the range and consume a bool to decide whether - // the result is in the first of the second part of the diff. - range = (max / 2.0) - (min / 2.0); - if (ConsumeBool()) { - result += range; - } - } else { - range = max - min; - } - - return result + range * ConsumeProbability(); - } - - // Reports the remaining bytes available for fuzzed input. - size_t remaining_bytes() { return remaining_bytes_; } - - private: - FuzzedDataProvider(const FuzzedDataProvider &) = delete; - FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete; - - void Advance(size_t num_bytes) { - if (num_bytes > remaining_bytes_) - abort(); - - data_ptr_ += num_bytes; - remaining_bytes_ -= num_bytes; - } - - template - std::vector ConsumeBytes(size_t size, size_t num_bytes_to_consume) { - static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type."); - - // The point of using the size-based constructor below is to increase the - // odds of having a vector object with capacity being equal to the length. - // That part is always implementation specific, but at least both libc++ and - // libstdc++ allocate the requested number of bytes in that constructor, - // which seems to be a natural choice for other implementations as well. - // To increase the odds even more, we also call |shrink_to_fit| below. - std::vector result(size); - if (size == 0) { - if (num_bytes_to_consume != 0) - abort(); - return result; - } - - std::memcpy(result.data(), data_ptr_, num_bytes_to_consume); - Advance(num_bytes_to_consume); - - // Even though |shrink_to_fit| is also implementation specific, we expect it - // to provide an additional assurance in case vector's constructor allocated - // a buffer which is larger than the actual amount of data we put inside it. - result.shrink_to_fit(); - return result; - } - - template TS ConvertUnsignedToSigned(TU value) { - static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types."); - static_assert(!std::numeric_limits::is_signed, - "Source type must be unsigned."); - - // TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream. - if (std::numeric_limits::is_modulo) - return static_cast(value); - - // Avoid using implementation-defined unsigned to signer conversions. - // To learn more, see https://stackoverflow.com/questions/13150449. - if (value <= std::numeric_limits::max()) { - return static_cast(value); - } else { - constexpr auto TS_min = std::numeric_limits::min(); - return TS_min + static_cast(value - TS_min); - } - } - - const uint8_t *data_ptr_; - size_t remaining_bytes_; -}; - -#endif // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc deleted file mode 100644 index 3082a76..0000000 --- a/re2/fuzzing/re2_fuzzer.cc +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright 2016 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include -#include -#include -#include -#include -#include - -#include "re2/re2.h" -#include "re2/regexp.h" -#include "re2/walker-inl.h" - -using re2::StringPiece; - -// NOT static, NOT signed. -uint8_t dummy = 0; - -// Walks kRegexpConcat and kRegexpAlternate subexpressions -// to determine their maximum length. -class SubexpressionWalker : public re2::Regexp::Walker { - public: - SubexpressionWalker() = default; - ~SubexpressionWalker() override = default; - - int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg, - int* child_args, int nchild_args) override { - switch (re->op()) { - case re2::kRegexpConcat: - case re2::kRegexpAlternate: { - int max = nchild_args; - for (int i = 0; i < nchild_args; i++) - max = std::max(max, child_args[i]); - return max; - } - - default: - break; - } - return -1; - } - - // Should never be called: we use Walk(), not WalkExponential(). - int ShortVisit(re2::Regexp* re, int parent_arg) override { - return parent_arg; - } - - private: - SubexpressionWalker(const SubexpressionWalker&) = delete; - SubexpressionWalker& operator=(const SubexpressionWalker&) = delete; -}; - -// Walks substrings (i.e. kRegexpLiteralString subexpressions) -// to determine their maximum length... in runes, but avoiding -// overheads due to UTF-8 encoding is worthwhile when fuzzing. -class SubstringWalker : public re2::Regexp::Walker { - public: - SubstringWalker() = default; - ~SubstringWalker() override = default; - - int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg, - int* child_args, int nchild_args) override { - switch (re->op()) { - case re2::kRegexpConcat: - case re2::kRegexpAlternate: - case re2::kRegexpStar: - case re2::kRegexpPlus: - case re2::kRegexpQuest: - case re2::kRegexpRepeat: - case re2::kRegexpCapture: { - int max = -1; - for (int i = 0; i < nchild_args; i++) - max = std::max(max, child_args[i]); - return max; - } - - case re2::kRegexpLiteralString: - return re->nrunes(); - - default: - break; - } - return -1; - } - - // Should never be called: we use Walk(), not WalkExponential(). - int ShortVisit(re2::Regexp* re, int parent_arg) override { - return parent_arg; - } - - private: - SubstringWalker(const SubstringWalker&) = delete; - SubstringWalker& operator=(const SubstringWalker&) = delete; -}; - -void TestOneInput(StringPiece pattern, const RE2::Options& options, - StringPiece text) { - // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W. - // Otherwise, we will waste time on inputs that have long runs of various - // character classes. The fuzzer has shown itself to be easily capable of - // generating such patterns that fall within the other limits, but result - // in timeouts nonetheless. The marginal cost is high - even more so when - // counted repetition is involved - whereas the marginal benefit is zero. - // Crudely limit the use of 'k', 'K', 's' and 'S' too because they become - // three-element character classes when case-insensitive and using UTF-8. - // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain. - int char_class = 0; - int backslash_p = 0; // very expensive, so handle specially - for (size_t i = 0; i < pattern.size(); i++) { - if (pattern[i] == '.' || - pattern[i] == 'k' || pattern[i] == 'K' || - pattern[i] == 's' || pattern[i] == 'S') - char_class++; - if (pattern[i] != '\\') - continue; - i++; - if (i >= pattern.size()) - break; - if (pattern[i] == 'p' || pattern[i] == 'P' || - pattern[i] == 'd' || pattern[i] == 'D' || - pattern[i] == 's' || pattern[i] == 'S' || - pattern[i] == 'w' || pattern[i] == 'W') - char_class++; - if (pattern[i] == 'p' || pattern[i] == 'P') - backslash_p++; - } - if (char_class > 9) - return; - if (backslash_p > 1) - return; - - // The default is 1000. Even 100 turned out to be too generous - // for fuzzing, empirically speaking, so let's try 10 instead. - re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10); - - RE2 re(pattern, options); - if (!re.ok()) - return; - - // Don't waste time fuzzing programs with large subexpressions. - // They can cause bug reports due to fuzzer timeouts. And they - // aren't interesting for fuzzing purposes. - if (SubexpressionWalker().Walk(re.Regexp(), -1) > 9) - return; - - // Don't waste time fuzzing programs with large substrings. - // They can cause bug reports due to fuzzer timeouts when they - // are repetitions (e.g. hundreds of NUL bytes) and matching is - // unanchored. And they aren't interesting for fuzzing purposes. - if (SubstringWalker().Walk(re.Regexp(), -1) > 9) - return; - - // Don't waste time fuzzing high-size programs. - // They can cause bug reports due to fuzzer timeouts. - int size = re.ProgramSize(); - if (size > 9999) - return; - int rsize = re.ReverseProgramSize(); - if (rsize > 9999) - return; - - // Don't waste time fuzzing high-fanout programs. - // They can cause bug reports due to fuzzer timeouts. - std::vector histogram; - int fanout = re.ProgramFanout(&histogram); - if (fanout > 9) - return; - int rfanout = re.ReverseProgramFanout(&histogram); - if (rfanout > 9) - return; - - if (re.NumberOfCapturingGroups() == 0) { - // Avoid early return due to too many arguments. - StringPiece sp = text; - RE2::FullMatch(sp, re); - RE2::PartialMatch(sp, re); - RE2::Consume(&sp, re); - sp = text; // Reset. - RE2::FindAndConsume(&sp, re); - } else { - // Okay, we have at least one capturing group... - // Try conversion for variously typed arguments. - StringPiece sp = text; - short s; - RE2::FullMatch(sp, re, &s); - long l; - RE2::PartialMatch(sp, re, &l); - float f; - RE2::Consume(&sp, re, &f); - sp = text; // Reset. - double d; - RE2::FindAndConsume(&sp, re, &d); - } - - std::string s = std::string(text); - RE2::Replace(&s, re, ""); - s = std::string(text); // Reset. - RE2::GlobalReplace(&s, re, ""); - - std::string min, max; - re.PossibleMatchRange(&min, &max, /*maxlen=*/9); - - // Exercise some other API functionality. - dummy += re.NamedCapturingGroups().size(); - dummy += re.CapturingGroupNames().size(); - dummy += RE2::QuoteMeta(pattern).size(); -} - -// Entry point for libFuzzer. -extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { - // An input larger than 4 KiB probably isn't interesting. (This limit - // allows for fdp.ConsumeRandomLengthString()'s backslash behaviour.) - if (size == 0 || size > 4096) - return 0; - - FuzzedDataProvider fdp(data, size); - - // The convention here is that fdp.ConsumeBool() returning false sets - // the default value whereas returning true sets the alternate value: - // most options default to false and so can be set directly; encoding - // defaults to UTF-8; case_sensitive defaults to true. We do NOT want - // to log errors. max_mem is 64 MiB because we can afford to use more - // RAM in exchange for (hopefully) faster fuzzing. - RE2::Options options; - options.set_encoding(fdp.ConsumeBool() ? RE2::Options::EncodingLatin1 - : RE2::Options::EncodingUTF8); - options.set_posix_syntax(fdp.ConsumeBool()); - options.set_longest_match(fdp.ConsumeBool()); - options.set_log_errors(false); - options.set_max_mem(64 << 20); - options.set_literal(fdp.ConsumeBool()); - options.set_never_nl(fdp.ConsumeBool()); - options.set_dot_nl(fdp.ConsumeBool()); - options.set_never_capture(fdp.ConsumeBool()); - options.set_case_sensitive(!fdp.ConsumeBool()); - options.set_perl_classes(fdp.ConsumeBool()); - options.set_word_boundary(fdp.ConsumeBool()); - options.set_one_line(fdp.ConsumeBool()); - - std::string pattern = fdp.ConsumeRandomLengthString(999); - std::string text = fdp.ConsumeRandomLengthString(999); - - TestOneInput(pattern, options, text); - return 0; -} -- Gitee