From 6c098c5d443a8e12860defac4ab62f919f1aa394 Mon Sep 17 00:00:00 2001 From: yangwentong <425822674@qq.com> Date: Fri, 25 Nov 2022 22:15:39 +0800 Subject: [PATCH] =?UTF-8?q?=E5=B0=86=E5=90=8CC=E7=B1=BB=E5=9E=8B=E8=BD=AC?= =?UTF-8?q?=E6=8D=A2=E7=9A=84=E9=80=BB=E8=BE=91=E4=B8=8E=E4=B8=9A=E5=8A=A1?= =?UTF-8?q?=E5=AE=9E=E7=8E=B0=E9=80=BB=E8=BE=91=E5=88=86=E7=A6=BB,=20rure?= =?UTF-8?q?=E7=9A=84=E6=8E=A5=E5=8F=A3=E4=BB=A3=E7=A0=81=E6=94=BE=E5=9C=A8?= =?UTF-8?q?lib.rs=E4=B8=AD,=20rure.rs=E6=94=B9=E4=B8=BAlib=5Finternal.rs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- regex-capi/ctest/test.c | 171 ------ regex-capi/include/rure.h | 129 ---- regex-capi/src/lib.rs | 520 +++++++++++++++- regex-capi/src/{rure.rs => lib_internal.rs} | 644 ++------------------ 4 files changed, 564 insertions(+), 900 deletions(-) rename regex-capi/src/{rure.rs => lib_internal.rs} (51%) diff --git a/regex-capi/ctest/test.c b/regex-capi/ctest/test.c index 86bf1dc..a75e23b 100644 --- a/regex-capi/ctest/test.c +++ b/regex-capi/ctest/test.c @@ -28,34 +28,6 @@ bool test_is_match() { return passed; } -bool test_shortest_match() { - bool passed = true; - const char *haystack = "aaaaa"; - - rure *re = rure_compile_must("a+"); - size_t end = 0; - bool matched = rure_shortest_match(re, (const uint8_t *)haystack, - strlen(haystack), 0, &end); - if (!matched) { - if (DEBUG) { - fprintf(stderr, - "[test_shortest_match] expected match, " - "but got no match\n"); - } - passed = false; - } - size_t expect_end = 1; - if (end != expect_end) { - if (DEBUG) { - fprintf(stderr, - "[test_shortest_match] expected match end location %zu " - "but got %zu\n", expect_end, end); - } - passed = false; - } - rure_free(re); - return passed; -} bool test_find() { bool passed = true; @@ -115,19 +87,6 @@ bool test_captures() { passed = false; goto done; } - int32_t expect_capture_index = 2; - int32_t capture_index = rure_capture_name_index(re, "snowman"); - if (capture_index != expect_capture_index) { - if (DEBUG) { - fprintf(stderr, - "[test_captures] " - "expected capture index %d for name 'snowman', but " - "got %d\n", - expect_capture_index, capture_index); - } - passed = false; - goto done; - } size_t expect_start = 9; size_t expect_end = 12; rure_captures_at(caps, 2, &match); @@ -147,67 +106,6 @@ done: return passed; } -bool test_iter() { - bool passed = true; - const uint8_t *haystack = (const uint8_t *)"abc xyz"; - size_t haystack_len = strlen((const char *)haystack); - - rure *re = rure_compile_must("\\w+(\\w)"); - rure_match match = {0}; - rure_captures *caps = rure_captures_new(re); - rure_iter *it = rure_iter_new(re); - - bool matched = rure_iter_next(it, haystack, haystack_len, &match); - if (!matched) { - if (DEBUG) { - fprintf(stderr, - "[test_iter] expected first match, but got no match\n"); - } - passed = false; - goto done; - } - size_t expect_start = 0; - size_t expect_end = 3; - if (match.start != expect_start || match.end != expect_end) { - if (DEBUG) { - fprintf(stderr, - "[test_iter] expected first match at (%zu, %zu), but " - "got match at (%zu, %zu)\n", - expect_start, expect_end, match.start, match.end); - } - passed = false; - goto done; - } - - matched = rure_iter_next_captures(it, haystack, haystack_len, caps); - if (!matched) { - if (DEBUG) { - fprintf(stderr, - "[test_iter] expected second match, but got no match\n"); - } - passed = false; - goto done; - } - rure_captures_at(caps, 1, &match); - expect_start = 6; - expect_end = 7; - if (match.start != expect_start || match.end != expect_end) { - if (DEBUG) { - fprintf(stderr, - "[test_iter] expected second match at (%zu, %zu), but " - "got match at (%zu, %zu)\n", - expect_start, expect_end, match.start, match.end); - } - passed = false; - goto done; - } -done: - rure_iter_free(it); - rure_captures_free(caps); - rure_free(re); - return passed; -} - bool test_iter_capture_name(char *expect, char *given) { bool passed = true; if (strcmp(expect, given)) { @@ -316,35 +214,6 @@ bool test_compile_error() { return passed; } -bool test_compile_error_size_limit() { - bool passed = true; - rure_options *opts = rure_options_new(); - rure_options_size_limit(opts, 0); - rure_error *err = rure_error_new(); - rure *re = rure_compile((const uint8_t *)"\\w{100}", 8, 0, opts, err); - if (re != NULL) { - if (DEBUG) { - fprintf(stderr, - "[test_compile_error_size_limit] " - "expected NULL regex pointer, but got non-NULL pointer\n"); - } - passed = false; - rure_free(re); - } - const char *msg = rure_error_message(err); - if (NULL == strstr(msg, "exceeds size")) { - if (DEBUG) { - fprintf(stderr, - "[test_compile_error] " - "expected an 'exceeds size' error message, but " - "got this instead: '%s'\n", msg); - } - passed = false; - } - rure_options_free(opts); - rure_error_free(err); - return passed; -} bool test_regex_set_matches() { @@ -495,41 +364,6 @@ done2: #undef PAT_COUNT } -bool test_regex_set_options() { - - bool passed = true; - rure_options *opts = rure_options_new(); - rure_options_size_limit(opts, 0); - rure_error *err = rure_error_new(); - - const char *patterns[] = { "\\w{100}" }; - const size_t patterns_lengths[] = { 8 }; - - rure_set *re = rure_compile_set( - (const uint8_t **) patterns, patterns_lengths, 1, 0, opts, err); - if (re != NULL) { - if (DEBUG) { - fprintf(stderr, - "[test_compile_error_size_limit] " - "expected NULL regex pointer, but got non-NULL pointer\n"); - } - passed = false; - rure_set_free(re); - } - const char *msg = rure_error_message(err); - if (NULL == strstr(msg, "exceeds size")) { - if (DEBUG) { - fprintf(stderr, - "[test_compile_error] " - "expected an 'exceeds size' error message, but " - "got this instead: '%s'\n", msg); - } - passed = false; - } - rure_options_free(opts); - rure_error_free(err); - return passed; -} bool test_escape() { bool passed = true; @@ -673,17 +507,12 @@ int main() { bool passed = true; run_test(test_is_match, "test_is_match", &passed); - run_test(test_shortest_match, "test_shortest_match", &passed); run_test(test_find, "test_find", &passed); run_test(test_captures, "test_captures", &passed); - run_test(test_iter, "test_iter", &passed); run_test(test_iter_capture_names, "test_iter_capture_names", &passed); run_test(test_flags, "test_flags", &passed); run_test(test_compile_error, "test_compile_error", &passed); - run_test(test_compile_error_size_limit, "test_compile_error_size_limit", - &passed); run_test(test_regex_set_matches, "test_regex_set_match", &passed); - run_test(test_regex_set_options, "test_regex_set_options", &passed); run_test(test_regex_set_match_start, "test_regex_set_match_start", &passed); run_test(test_escape, "test_escape", &passed); diff --git a/regex-capi/include/rure.h b/regex-capi/include/rure.h index 286421e..5ba573f 100644 --- a/regex-capi/include/rure.h +++ b/regex-capi/include/rure.h @@ -262,41 +262,7 @@ bool rure_find(rure *re, const uint8_t *haystack, size_t length, bool rure_find_captures(rure *re, const uint8_t *haystack, size_t length, size_t start, rure_captures *captures); -/* - * rure_shortest_match returns true if and only if re matches anywhere in - * haystack. If a match is found, then its end location is stored in the - * pointer given. The end location is the place at which the regex engine - * determined that a match exists, but may occur before the end of the proper - * leftmost-first match. - * - * haystack may contain arbitrary bytes, but ASCII compatible text is more - * useful. UTF-8 is even more useful. Other text encodings aren't supported. - * length should be the number of bytes in haystack. - * - * start is the position at which to start searching. Note that setting the - * start position is distinct from incrementing the pointer, since the regex - * engine may look at bytes before the start position to determine match - * information. For example, if the start position is greater than 0, then the - * \A ("begin text") anchor can never match. - * - * rure_shortest_match should be preferred to rure_find since it may be faster. - * - * N.B. The performance of this search is not impacted by the presence of - * capturing groups in your regular expression. - */ -bool rure_shortest_match(rure *re, const uint8_t *haystack, size_t length, - size_t start, size_t *end); -/* - * rure_capture_name_index returns the capture index for the name given. If - * no such named capturing group exists in re, then -1 is returned. - * - * The capture index may be used with rure_captures_at. - * - * This function never returns 0 since the first capture group always - * corresponds to the entire match and is always unnamed. - */ -int32_t rure_capture_name_index(rure *re, const char *name); /* * rure_iter_capture_names_new creates a new capture_names iterator. @@ -320,15 +286,6 @@ void rure_iter_capture_names_free(rure_iter_capture_names *it); */ bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name); -/* - * rure_iter_new creates a new iterator. - * - * An iterator will report all successive non-overlapping matches of re. - * When calling iterator functions, the same haystack and length must be - * supplied to all invocations. (Strict pointer equality is, however, not - * required.) - */ -rure_iter *rure_iter_new(rure *re); /* * rure_iter_free frees the iterator given. @@ -337,52 +294,7 @@ rure_iter *rure_iter_new(rure *re); */ void rure_iter_free(rure_iter *it); -/* - * rure_iter_next advances the iterator and returns true if and only if a - * match was found. If a match is found, then the match pointer is set with the - * start and end location of the match, in bytes. - * - * If no match is found, then subsequent calls will return false indefinitely. - * - * haystack may contain arbitrary bytes, but ASCII compatible text is more - * useful. UTF-8 is even more useful. Other text encodings aren't supported. - * length should be the number of bytes in haystack. The given haystack must - * be logically equivalent to all other haystacks given to this iterator. - * - * rure_iter_next should be preferred to rure_iter_next_captures since it may - * be faster. - * - * N.B. The performance of this search is not impacted by the presence of - * capturing groups in your regular expression. - */ -bool rure_iter_next(rure_iter *it, const uint8_t *haystack, size_t length, - rure_match *match); -/* - * rure_iter_next_captures advances the iterator and returns true if and only if a - * match was found. If a match is found, then all of its capture locations are - * stored in the captures pointer given. - * - * If no match is found, then subsequent calls will return false indefinitely. - * - * haystack may contain arbitrary bytes, but ASCII compatible text is more - * useful. UTF-8 is even more useful. Other text encodings aren't supported. - * length should be the number of bytes in haystack. The given haystack must - * be logically equivalent to all other haystacks given to this iterator. - * - * Only use this function if you specifically need access to capture locations. - * It is not necessary to use this function just because your regular - * expression contains capturing groups. - * - * Capture locations can be accessed using the rure_captures_* functions. - * - * N.B. The performance of this search can be impacted by the number of - * capturing groups. If you're using this function, it may be beneficial to - * use non-capturing groups (e.g., `(?:re)`) where possible. - */ -bool rure_iter_next_captures(rure_iter *it, - const uint8_t *haystack, size_t length, - rure_captures *captures); /* * rure_captures_new allocates storage for all capturing groups in re. @@ -424,48 +336,7 @@ bool rure_captures_at(rure_captures *captures, size_t i, rure_match *match); */ size_t rure_captures_len(rure_captures *captures); -/* - * rure_options_new allocates space for options. - * - * Options may be freed immediately after a call to rure_compile, but otherwise - * may be freely used in multiple calls to rure_compile. - * - * It is not safe to set options from multiple threads simultaneously. It is - * safe to call rure_compile from multiple threads simultaneously using the - * same options pointer. - */ -rure_options *rure_options_new(); - -/* - * rure_options_free frees the given options. - * - * This must be called at most once. - */ -void rure_options_free(rure_options *options); -/* - * rure_options_size_limit sets the appoximate size limit of the compiled - * regular expression. - * - * This size limit roughly corresponds to the number of bytes occupied by a - * single compiled program. If the program would exceed this number, then a - * compilation error will be returned from rure_compile. - */ -void rure_options_size_limit(rure_options *options, size_t limit); - -/* - * rure_options_dfa_size_limit sets the approximate size of the cache used by - * the DFA during search. - * - * This roughly corresponds to the number of bytes that the DFA will use while - * searching. - * - * Note that this is a *per thread* limit. There is no way to set a global - * limit. In particular, if a regular expression is used from multiple threads - * simultaneously, then each thread may use up to the number of bytes - * specified here. - */ -void rure_options_dfa_size_limit(rure_options *options, size_t limit); /* * rure_compile_set compiles the given list of patterns into a single regular diff --git a/regex-capi/src/lib.rs b/regex-capi/src/lib.rs index 59f9681..d28bae5 100644 --- a/regex-capi/src/lib.rs +++ b/regex-capi/src/lib.rs @@ -1,6 +1,516 @@ -#[macro_use] -mod error; -mod rure; +/****************************************************************************** + * Copyright (c) USTC(Suzhou) & Huawei Technologies Co., Ltd. 2022. All rights reserved. + * re2-rust licensed under the Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR + * PURPOSE. + * See the Mulan PSL v2 for more details. + * Author: mengning, liuzhitao, yangwentong + * Create: 2022-11-25 + * Description: Rure is a C API to Rust's regex library. + ******************************************************************************/ + #[macro_use] + mod error; + pub use crate::error::*; + + use std::ffi::{CStr, CString}; + use std::ops::Deref; + use std::ptr; + use std::slice; + use std::str; + + use libc::{c_char, size_t}; + use regex::bytes::CaptureLocations; + use regex::{bytes, Regex}; + + use crate::error::{Error, ErrorKind}; + use std::io; + use std::io::Write; + use std::process::abort; + + include!("lib_internal.rs"); + + const RURE_FLAG_CASEI: u32 = 1 << 0; + const RURE_FLAG_MULTI: u32 = 1 << 1; + const RURE_FLAG_DOTNL: u32 = 1 << 2; + const RURE_FLAG_SWAP_GREED: u32 = 1 << 3; + const RURE_FLAG_SPACE: u32 = 1 << 4; + const RURE_FLAG_UNICODE: u32 = 1 << 5; + const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE; + + pub struct RegexBytes { + re: bytes::Regex, + // capture_names: HashMap, + } + + pub struct RegexUnicode { + re: Regex, + } + + pub struct Options { + size_limit: usize, + dfa_size_limit: usize, + } + + // The `RegexSet` is not exposed with option support or matching at an + // arbitrary position with a crate just yet. To circumvent this, we use + // the `Exec` structure directly. + pub struct RegexSet { + re: bytes::RegexSet, + } + + #[repr(C)] + pub struct rure_match { + pub start: size_t, + pub end: size_t, + } + + pub struct Captures(bytes::Locations); + + pub struct IterCaptureNames { + capture_names: bytes::CaptureNames<'static>, + name_ptrs: Vec<*mut c_char>, + } + + #[repr(C)] + pub struct Atoms { + atom: *mut c_char, + } + + #[repr(C)] + pub struct MyVec { + data: *mut Atoms, + len: i32, + } + + impl Deref for RegexBytes { + type Target = bytes::Regex; + fn deref(&self) -> &bytes::Regex { + &self.re + } + } + + impl Deref for RegexUnicode { + type Target = Regex; + fn deref(&self) -> &Regex { + &self.re + } + } + + impl Deref for RegexSet { + type Target = bytes::RegexSet; + fn deref(&self) -> &bytes::RegexSet { + &self.re + } + } + + impl Default for Options { + fn default() -> Options { + Options { + size_limit: 10 * (1 << 20), + dfa_size_limit: 2 * (1 << 20), + } + } + } + + #[no_mangle] + extern "C" fn rure_compile_must(pattern: *const c_char) -> *const RegexBytes { + let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; + let pat = pattern as *const u8; + let mut err = Error::new(ErrorKind::None); + let re = rure_compile(pat, len, RURE_DEFAULT_FLAGS, ptr::null(), &mut err); + if err.is_err() { + let _ = writeln!(&mut io::stderr(), "{}", err); + let _ = writeln!(&mut io::stderr(), "aborting from rure_compile_must"); + abort() + } + re + } + + #[no_mangle] + extern "C" fn rure_compile( + pattern: *const u8, + length: size_t, + flags: u32, + options: *const Options, + error: *mut Error, + ) -> *const RegexBytes { + let pat = unsafe { slice::from_raw_parts(pattern, length) }; + let pat = match str::from_utf8(pat) { + Ok(pat) => pat, + Err(err) => unsafe { + if !error.is_null() { + *error = Error::new(ErrorKind::Str(err)); + } + return ptr::null(); + }, + }; + rure_compile_internal(pat, flags, options, error) + } + + #[no_mangle] + extern "C" fn rure_free(re: *const RegexBytes) { + unsafe { + drop(Box::from_raw(re as *mut Regex)); + } + } + + #[no_mangle] + extern "C" fn rure_is_match( + re: *const RegexBytes, + haystack: *const u8, + len: size_t, + _start: size_t, + ) -> bool { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + re.is_match(haystack) + } + + #[no_mangle] + extern "C" fn rure_find( + re: *const RegexBytes, + haystack: *const u8, + len: size_t, + start: size_t, + match_info: *mut rure_match, + ) -> bool { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + rure_find_internal(re, haystack, start, match_info) + } + + #[no_mangle] + extern "C" fn rure_find_captures( + re: *const RegexBytes, + haystack: *const u8, + len: size_t, + start: size_t, + captures: *mut Captures, + ) -> bool { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + let slots = unsafe { &mut (*captures).0 }; + re.read_captures_at(slots, haystack, start).is_some() + } + + #[no_mangle] + extern "C" fn rure_iter_capture_names_new(re: *const RegexBytes) -> *mut IterCaptureNames { + let re = unsafe { &*re }; + Box::into_raw(Box::new(IterCaptureNames { + capture_names: re.re.capture_names(), + name_ptrs: Vec::new(), + })) + } + + #[no_mangle] + extern "C" fn rure_iter_capture_names_free(it: *mut IterCaptureNames) { + unsafe { + let it = &mut *it; + while let Some(ptr) = it.name_ptrs.pop() { + drop(CString::from_raw(ptr)); + } + drop(Box::from_raw(it)); + } + } + + #[no_mangle] + extern "C" fn rure_iter_capture_names_next( + it: *mut IterCaptureNames, + capture_name: *mut *mut c_char, + ) -> bool { + if capture_name.is_null() { + return false; + } + let it = unsafe { &mut *it }; + let cn = match it.capture_names.next() { + // Top-level iterator ran out of capture groups + None => return false, + Some(val) => { + match val { + // inner Option didn't have a name + None => "", + Some(name) => name, + } + } + }; + unsafe { + let cs = match CString::new(cn.as_bytes()) { + Result::Ok(val) => val, + Result::Err(_) => return false, + }; + let ptr = cs.into_raw(); + it.name_ptrs.push(ptr); + *capture_name = ptr; + } + true + } + + #[no_mangle] + extern "C" fn rure_captures_new(re: *const RegexBytes) -> *mut Captures { + let re = unsafe { &*re }; + let captures = Captures(re.locations()); + Box::into_raw(Box::new(captures)) + } + + #[no_mangle] + extern "C" fn rure_captures_free(captures: *const Captures) { + unsafe { + drop(Box::from_raw(captures as *mut Captures)); + } + } + + #[no_mangle] + extern "C" fn rure_captures_at( + captures: *const Captures, + i: size_t, + match_info: *mut rure_match, + ) -> bool { + let locs = unsafe { &(*captures).0 }; + rure_captures_at_internal(locs, i, match_info) + } + + #[no_mangle] + extern "C" fn rure_captures_len(captures: *const Captures) -> size_t { + unsafe { (*captures).0.len() } + } + + #[no_mangle] + extern "C" fn rure_compile_set( + patterns: *const *const u8, + patterns_lengths: *const size_t, + patterns_count: size_t, + flags: u32, + options: *const Options, + error: *mut Error, + ) -> *const RegexSet { + let (raw_pats, raw_patsl) = unsafe { + ( + slice::from_raw_parts(patterns, patterns_count), + slice::from_raw_parts(patterns_lengths, patterns_count), + ) + }; + rure_compile_set_internal(raw_pats, raw_patsl, patterns_count, flags, options, error) + } + + #[no_mangle] + extern "C" fn rure_set_free(re: *const RegexSet) { + unsafe { + drop(Box::from_raw(re as *mut RegexSet)); + } + } + + #[no_mangle] + extern "C" fn rure_set_is_match( + re: *const RegexSet, + haystack: *const u8, + len: size_t, + start: size_t, + ) -> bool { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + re.is_match_at(haystack, start) + } + + #[no_mangle] + extern "C" fn rure_set_matches( + re: *const RegexSet, + haystack: *const u8, + len: size_t, + start: size_t, + matches: *mut bool, + ) -> bool { + let re = unsafe { &*re }; + let matches = unsafe { slice::from_raw_parts_mut(matches, re.len()) }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + + rure_set_matches_internal(re, matches, haystack, start) + } + + #[no_mangle] + extern "C" fn rure_set_len(re: *const RegexSet) -> size_t { + unsafe { (*re).len() } + } + + #[no_mangle] + extern "C" fn rure_escape_must(pattern: *const c_char) -> *const c_char { + let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; + let pat = pattern as *const u8; + let mut err = Error::new(ErrorKind::None); + let esc = rure_escape(pat, len, &mut err); + if err.is_err() { + println!("{}", "aborting from rure_escape_must"); + let _ = writeln!(&mut io::stderr(), "{}", err); + let _ = writeln!(&mut io::stderr(), "aborting from rure_escape_must"); + abort() + } + esc + } + + /// A helper function that implements fallible escaping in a way that returns + /// an error if escaping failed. + /// + /// This should ideally be exposed, but it needs API design work. In + /// particular, this should not return a C string, but a `const uint8_t *` + /// instead, since it may contain a NUL byte. + fn rure_escape(pattern: *const u8, length: size_t, error: *mut Error) -> *const c_char { + let pat: &[u8] = unsafe { slice::from_raw_parts(pattern, length) }; + let str_pat = match str::from_utf8(pat) { + Ok(val) => val, + Err(err) => unsafe { + if !error.is_null() { + *error = Error::new(ErrorKind::Str(err)); + } + return ptr::null(); + }, + }; + let esc_pat = regex::escape(str_pat); + let c_esc_pat = match CString::new(esc_pat) { + Ok(val) => val, + Err(err) => unsafe { + if !error.is_null() { + *error = Error::new(ErrorKind::Nul(err)); + } + return ptr::null(); + }, + }; + c_esc_pat.into_raw() as *const c_char + } + + #[no_mangle] + extern "C" fn rure_cstring_free(s: *mut c_char) { + unsafe { + drop(CString::from_raw(s)); + } + } + + #[no_mangle] + extern "C" fn rure_replace( + re: *const RegexUnicode, + haystack: *const u8, + len_h: size_t, + rewrite: *const u8, + len_r: size_t, + ) -> *const u8 { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len_h) }; + let rewrite = unsafe { slice::from_raw_parts(rewrite, len_r) }; + rure_replace_internal(re, haystack, rewrite) + } + + #[no_mangle] + extern "C" fn rure_replace_all( + re: *const RegexUnicode, + haystack: *const u8, + len_h: size_t, + rewrite: *const u8, + len_r: size_t, + ) -> *const u8 { + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len_h) }; + let rewrite = unsafe { slice::from_raw_parts(rewrite, len_r) }; + rure_replace_all_internal(re, haystack, rewrite) + } + + /* + * Simple way to use regex + */ + + #[no_mangle] + extern "C" fn rure_new(pattern: *const u8, length: size_t) -> *const RegexBytes { + let pat = unsafe { slice::from_raw_parts(pattern, length) }; + rure_new_internal(pat) + } + + #[no_mangle] + extern "C" fn rure_consume( + re: *const RegexBytes, + haystack: *const u8, + len: size_t, + match_info: *mut rure_match, + ) -> bool { + let exp = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(haystack, len) }; + rure_consume_internal(exp, haystack, match_info) + } + + #[no_mangle] + extern "C" fn rure_max_submatch(rewrite: *const c_char) -> i32 { + let len = unsafe { CStr::from_ptr(rewrite).to_bytes().len() }; + let pat = rewrite as *const u8; + let text = unsafe { slice::from_raw_parts(pat, len) }; + + rure_max_submatch_internal(text) + } + + #[no_mangle] + extern "C" fn rure_check_rewrite_string(rewrite: *const c_char, cap_num: i32) -> bool { + let len = unsafe { CStr::from_ptr(rewrite).to_bytes().len() }; + let pat = rewrite as *const u8; + let text = unsafe { slice::from_raw_parts(pat, len) }; + + rure_check_rewrite_string_internal(text, cap_num) + } + + #[no_mangle] + extern "C" fn rure_rewrite_str_convert(rewrite: *const u8, length: size_t) -> *const c_char { + let rewrite = unsafe { slice::from_raw_parts(rewrite, length) }; + + rure_rewrite_str_convert_internal(rewrite) + } + + #[no_mangle] + extern "C" fn rure_rewrite( + rewrite: *const u8, + length: size_t, + vecs: *const *const u8, + vecs_lengths: *const size_t, + vecs_count: size_t, + ) -> *const c_char { + // 获取rewrite + let rewrite = unsafe { slice::from_raw_parts(rewrite, length) }; + let rewrite_str = std::str::from_utf8(rewrite).unwrap(); + + //获取vecs中的内容 + let (raw_vecs, raw_vecsl) = unsafe { + ( + slice::from_raw_parts(vecs, vecs_count), + slice::from_raw_parts(vecs_lengths, vecs_count), + ) + }; + + let mut rure_vecs = Vec::with_capacity(vecs_count); + for (&raw_vec, &raw_vecl) in raw_vecs.iter().zip(raw_vecsl) { + let rure_vec = unsafe { slice::from_raw_parts(raw_vec, raw_vecl) }; + rure_vecs.push(str::from_utf8(rure_vec).unwrap()); + } + + rure_rewrite_internal(rewrite_str, vecs_count, rure_vecs) + } + + #[no_mangle] + extern "C" fn rure_replace_count(re: *const RegexUnicode, haystack: *const c_char) -> size_t { + let len = unsafe { CStr::from_ptr(haystack).to_bytes().len() }; + let hay = haystack as *const u8; + + let re = unsafe { &*re }; + let haystack = unsafe { slice::from_raw_parts(hay, len) }; + rure_replace_count_internal(haystack, re) + } + + #[no_mangle] + extern "C" fn rure_filter_compile( + regex_str: *const u8, + regex_len: size_t, + min_atoms_len: size_t, + ) -> MyVec { + let r = unsafe { slice::from_raw_parts(regex_str, regex_len) }; + let regex_str = str::from_utf8(r).unwrap(); + let atoms = my_compile(regex_str, min_atoms_len as i32); + atoms + } + -pub use crate::error::*; -pub use crate::rure::*; diff --git a/regex-capi/src/rure.rs b/regex-capi/src/lib_internal.rs similarity index 51% rename from regex-capi/src/rure.rs rename to regex-capi/src/lib_internal.rs index 84f93e9..cec5ec6 100644 --- a/regex-capi/src/rure.rs +++ b/regex-capi/src/lib_internal.rs @@ -9,142 +9,15 @@ * PURPOSE. * See the Mulan PSL v2 for more details. * Author: mengning, liuzhitao, yangwentong - * Create: 2022-06-21 - * Description: Rure is a C API to Rust's regex library. + * Create: 2022-11-25 + * Description: The business logic implementation layer uses pure rust. ******************************************************************************/ -use std::collections::HashMap; -use std::ffi::{CStr, CString}; -use std::ops::Deref; -use std::ptr; -use std::slice; -use std::str; - -use libc::{c_char, size_t}; -use regex::{bytes, Regex}; - -use crate::error::{Error, ErrorKind}; - -const RURE_FLAG_CASEI: u32 = 1 << 0; -const RURE_FLAG_MULTI: u32 = 1 << 1; -const RURE_FLAG_DOTNL: u32 = 1 << 2; -const RURE_FLAG_SWAP_GREED: u32 = 1 << 3; -const RURE_FLAG_SPACE: u32 = 1 << 4; -const RURE_FLAG_UNICODE: u32 = 1 << 5; -const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE; - -pub struct RegexBytes { - re: bytes::Regex, - capture_names: HashMap, -} - -pub struct RegexUnicode { - re: Regex, -} - -pub struct Options { - size_limit: usize, - dfa_size_limit: usize, -} - -// The `RegexSet` is not exposed with option support or matching at an -// arbitrary position with a crate just yet. To circumvent this, we use -// the `Exec` structure directly. -pub struct RegexSet { - re: bytes::RegexSet, -} - -#[repr(C)] -pub struct rure_match { - pub start: size_t, - pub end: size_t, -} - -pub struct Captures(bytes::Locations); - -pub struct Iter { - re: *const RegexBytes, - last_end: usize, - last_match: Option, -} - -pub struct IterCaptureNames { - capture_names: bytes::CaptureNames<'static>, - name_ptrs: Vec<*mut c_char>, -} - -#[repr(C)] -pub struct Atoms { - atom: *mut c_char, -} - -#[repr(C)] -pub struct MyVec { - data: *mut Atoms, - len: i32, -} - -impl Deref for RegexBytes { - type Target = bytes::Regex; - fn deref(&self) -> &bytes::Regex { - &self.re - } -} - -impl Deref for RegexUnicode { - type Target = Regex; - fn deref(&self) -> &Regex { - &self.re - } -} - -impl Deref for RegexSet { - type Target = bytes::RegexSet; - fn deref(&self) -> &bytes::RegexSet { - &self.re - } -} - -impl Default for Options { - fn default() -> Options { - Options { - size_limit: 10 * (1 << 20), - dfa_size_limit: 2 * (1 << 20), - } - } -} - -#[no_mangle] -extern "C" fn rure_compile_must(pattern: *const c_char) -> *const RegexBytes { - let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; - let pat = pattern as *const u8; - let mut err = Error::new(ErrorKind::None); - let re = rure_compile(pat, len, RURE_DEFAULT_FLAGS, ptr::null(), &mut err); - // if err.is_err() { - // let _ = writeln!(&mut io::stderr(), "{}", err); - // let _ = writeln!(&mut io::stderr(), "aborting from rure_compile_must"); - // unsafe { abort() } - // } - re -} - -#[no_mangle] -extern "C" fn rure_compile( - pattern: *const u8, - length: size_t, +fn rure_compile_internal( + pat: &str, flags: u32, options: *const Options, error: *mut Error, ) -> *const RegexBytes { - let pat = unsafe { slice::from_raw_parts(pattern, length) }; - let pat = match str::from_utf8(pat) { - Ok(pat) => pat, - Err(err) => unsafe { - if !error.is_null() { - *error = Error::new(ErrorKind::Str(err)); - } - return ptr::null(); - }, - }; let mut builder = bytes::RegexBuilder::new(pat); if !options.is_null() { let options = unsafe { &*options }; @@ -159,13 +32,15 @@ extern "C" fn rure_compile( builder.unicode(flags & RURE_FLAG_UNICODE > 0); match builder.build() { Ok(re) => { - let mut capture_names = HashMap::new(); - for (i, name) in re.capture_names().enumerate() { - if let Some(name) = name { - capture_names.insert(name.to_owned(), i as i32); - } - } - let re = RegexBytes { re, capture_names }; + // let mut capture_names = HashMap::new(); + // for (i, name) in re.capture_names().enumerate() { + // if let Some(name) = name { + // capture_names.insert(name.to_owned(), i as i32); + // } + // } + // let re = RegexBytes { re, capture_names }; + let re = RegexBytes { re }; + Box::into_raw(Box::new(re)) } Err(err) => unsafe { @@ -177,33 +52,12 @@ extern "C" fn rure_compile( } } -#[no_mangle] -extern "C" fn rure_free(re: *const RegexBytes) { - unsafe { drop(Box::from_raw(re as *mut Regex)); } -} - -#[no_mangle] -extern "C" fn rure_is_match( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, - _start: size_t, -) -> bool { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - re.is_match(haystack) -} - -#[no_mangle] -extern "C" fn rure_find( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, +fn rure_find_internal( + re: &RegexBytes, + haystack: &[u8], start: size_t, match_info: *mut rure_match, ) -> bool { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; re.find_at(haystack, start) .map(|m| unsafe { if !match_info.is_null() { @@ -214,214 +68,11 @@ extern "C" fn rure_find( .is_some() } -#[no_mangle] -extern "C" fn rure_find_captures( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, - start: size_t, - captures: *mut Captures, -) -> bool { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - let slots = unsafe { &mut (*captures).0 }; - re.read_captures_at(slots, haystack, start).is_some() -} - -#[no_mangle] -extern "C" fn rure_shortest_match( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, - start: size_t, - end: *mut usize, -) -> bool { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - match re.shortest_match_at(haystack, start) { - None => false, - Some(i) => { - if !end.is_null() { - unsafe { - *end = i; - } - } - true - } - } -} - -#[no_mangle] -extern "C" fn rure_capture_name_index(re: *const RegexBytes, name: *const c_char) -> i32 { - let re = unsafe { &*re }; - let name = unsafe { CStr::from_ptr(name) }; - let name = match name.to_str() { - Err(_) => return -1, - Ok(name) => name, - }; - re.capture_names.get(name).copied().unwrap_or(-1) -} - -#[no_mangle] -extern "C" fn rure_iter_capture_names_new(re: *const RegexBytes) -> *mut IterCaptureNames { - let re = unsafe { &*re }; - Box::into_raw(Box::new(IterCaptureNames { - capture_names: re.re.capture_names(), - name_ptrs: Vec::new(), - })) -} - -#[no_mangle] -extern "C" fn rure_iter_capture_names_free(it: *mut IterCaptureNames) { - unsafe { - let it = &mut *it; - while let Some(ptr) = it.name_ptrs.pop() { - drop(CString::from_raw(ptr)); - } - drop(Box::from_raw(it)); - } -} - -#[no_mangle] -extern "C" fn rure_iter_capture_names_next( - it: *mut IterCaptureNames, - capture_name: *mut *mut c_char, -) -> bool { - if capture_name.is_null() { - return false; - } - let it = unsafe { &mut *it }; - let cn = match it.capture_names.next() { - // Top-level iterator ran out of capture groups - None => return false, - Some(val) => { - match val { - // inner Option didn't have a name - None => "", - Some(name) => name, - } - } - }; - unsafe { - let cs = match CString::new(cn.as_bytes()) { - Result::Ok(val) => val, - Result::Err(_) => return false, - }; - let ptr = cs.into_raw(); - it.name_ptrs.push(ptr); - *capture_name = ptr; - } - true -} - -#[no_mangle] -extern "C" fn rure_iter_new(re: *const RegexBytes) -> *mut Iter { - Box::into_raw(Box::new(Iter { - re, - last_end: 0, - last_match: None, - })) -} - -#[no_mangle] -extern "C" fn rure_iter_free(it: *mut Iter) { - unsafe { drop(Box::from_raw(it)); } -} - -#[no_mangle] -extern "C" fn rure_iter_next( - it: *mut Iter, - haystack: *const u8, - len: size_t, - match_info: *mut rure_match, -) -> bool { - let it = unsafe { &mut *it }; - let re = unsafe { &*it.re }; - let text = unsafe { slice::from_raw_parts(haystack, len) }; - if it.last_end > text.len() { - return false; - } - let (s, e) = match re.find_at(text, it.last_end) { - None => return false, - Some(m) => (m.start(), m.end()), - }; - if s == e { - // This is an empty match. To ensure we make progress, start - // the next search at the smallest possible starting position - // of the next match following this one. - it.last_end += 1; - // Don't accept empty matches immediately following a match. - // Just move on to the next match. - if Some(e) == it.last_match { - return rure_iter_next(it, haystack, len, match_info); - } - } else { - it.last_end = e; - } - it.last_match = Some(e); - if !match_info.is_null() { - unsafe { - (*match_info).start = s; - (*match_info).end = e; - } - } - true -} - -#[no_mangle] -extern "C" fn rure_iter_next_captures( - it: *mut Iter, - haystack: *const u8, - len: size_t, - captures: *mut Captures, -) -> bool { - let it = unsafe { &mut *it }; - let re = unsafe { &*it.re }; - let slots = unsafe { &mut (*captures).0 }; - let text = unsafe { slice::from_raw_parts(haystack, len) }; - if it.last_end > text.len() { - return false; - } - let (s, e) = match re.read_captures_at(slots, text, it.last_end) { - None => return false, - Some(m) => (m.start(), m.end()), - }; - if s == e { - // This is an empty match. To ensure we make progress, start - // the next search at the smallest possible starting position - // of the next match following this one. - it.last_end += 1; - // Don't accept empty matches immediately following a match. - // Just move on to the next match. - if Some(e) == it.last_match { - return rure_iter_next_captures(it, haystack, len, captures); - } - } else { - it.last_end = e; - } - it.last_match = Some(e); - true -} - -#[no_mangle] -extern "C" fn rure_captures_new(re: *const RegexBytes) -> *mut Captures { - let re = unsafe { &*re }; - let captures = Captures(re.locations()); - Box::into_raw(Box::new(captures)) -} - -#[no_mangle] -extern "C" fn rure_captures_free(captures: *const Captures) { - unsafe { drop(Box::from_raw(captures as *mut Captures)); } -} - -#[no_mangle] -extern "C" fn rure_captures_at( - captures: *const Captures, +fn rure_captures_at_internal( + locs: &CaptureLocations, i: size_t, match_info: *mut rure_match, ) -> bool { - let locs = unsafe { &(*captures).0 }; match locs.pos(i) { Some((start, end)) => { if !match_info.is_null() { @@ -436,49 +87,14 @@ extern "C" fn rure_captures_at( } } -#[no_mangle] -extern "C" fn rure_captures_len(captures: *const Captures) -> size_t { - unsafe { (*captures).0.len() } -} - -#[no_mangle] -extern "C" fn rure_options_new() -> *mut Options { - Box::into_raw(Box::new(Options::default())) -} - -#[no_mangle] -extern "C" fn rure_options_free(options: *mut Options) { - unsafe { drop(Box::from_raw(options)); } -} - -#[no_mangle] -extern "C" fn rure_options_size_limit(options: *mut Options, limit: size_t) { - let options = unsafe { &mut *options }; - options.size_limit = limit; -} - -#[no_mangle] -extern "C" fn rure_options_dfa_size_limit(options: *mut Options, limit: size_t) { - let options = unsafe { &mut *options }; - options.dfa_size_limit = limit; -} - -#[no_mangle] -extern "C" fn rure_compile_set( - patterns: *const *const u8, - patterns_lengths: *const size_t, +fn rure_compile_set_internal( + raw_pats: &[*const u8], + raw_patsl: &[usize], patterns_count: size_t, flags: u32, options: *const Options, error: *mut Error, ) -> *const RegexSet { - let (raw_pats, raw_patsl) = unsafe { - ( - slice::from_raw_parts(patterns, patterns_count), - slice::from_raw_parts(patterns_lengths, patterns_count), - ) - }; - let mut pats = Vec::with_capacity(patterns_count); for (&raw_pat, &raw_patl) in raw_pats.iter().zip(raw_patsl) { let pat = unsafe { slice::from_raw_parts(raw_pat, raw_patl) }; @@ -516,35 +132,12 @@ extern "C" fn rure_compile_set( } } -#[no_mangle] -extern "C" fn rure_set_free(re: *const RegexSet) { - unsafe { drop(Box::from_raw(re as *mut RegexSet)); } -} - -#[no_mangle] -extern "C" fn rure_set_is_match( - re: *const RegexSet, - haystack: *const u8, - len: size_t, - start: size_t, -) -> bool { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - re.is_match_at(haystack, start) -} - -#[no_mangle] -extern "C" fn rure_set_matches( - re: *const RegexSet, - haystack: *const u8, - len: size_t, +fn rure_set_matches_internal( + re: &RegexSet, + matches: &mut [bool], + haystack: &[u8], start: size_t, - matches: *mut bool, ) -> bool { - let re = unsafe { &*re }; - let matches = unsafe { slice::from_raw_parts_mut(matches, re.len()) }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - // read_matches_at isn't guaranteed to set non-matches to false for item in matches.iter_mut() { *item = false; @@ -552,74 +145,7 @@ extern "C" fn rure_set_matches( re.read_matches_at(matches, haystack, start) } -#[no_mangle] -extern "C" fn rure_set_len(re: *const RegexSet) -> size_t { - unsafe { (*re).len() } -} - -#[no_mangle] -extern "C" fn rure_escape_must(pattern: *const c_char) -> *const c_char { - let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; - let pat = pattern as *const u8; - let mut err = Error::new(ErrorKind::None); - let esc = rure_escape(pat, len, &mut err); - if err.is_err() { - println!("{}", "aborting from rure_escape_must"); - // let _ = writeln!(&mut io::stderr(), "{}", err); - // let _ = writeln!( - // &mut io::stderr(), "aborting from rure_escape_must"); - // unsafe { abort() } - } - esc - } - - -/// A helper function that implements fallible escaping in a way that returns -/// an error if escaping failed. -/// -/// This should ideally be exposed, but it needs API design work. In -/// particular, this should not return a C string, but a `const uint8_t *` -/// instead, since it may contain a NUL byte. -fn rure_escape(pattern: *const u8, length: size_t, error: *mut Error) -> *const c_char { - let pat: &[u8] = unsafe { slice::from_raw_parts(pattern, length) }; - let str_pat = match str::from_utf8(pat) { - Ok(val) => val, - Err(err) => unsafe { - if !error.is_null() { - *error = Error::new(ErrorKind::Str(err)); - } - return ptr::null(); - }, - }; - let esc_pat = regex::escape(str_pat); - let c_esc_pat = match CString::new(esc_pat) { - Ok(val) => val, - Err(err) => unsafe { - if !error.is_null() { - *error = Error::new(ErrorKind::Nul(err)); - } - return ptr::null(); - }, - }; - c_esc_pat.into_raw() as *const c_char -} - -#[no_mangle] -extern "C" fn rure_cstring_free(s: *mut c_char) { - unsafe { drop(CString::from_raw(s)); } -} - -#[no_mangle] -extern "C" fn rure_replace( - re: *const RegexUnicode, - haystack: *const u8, - len_h: size_t, - rewrite: *const u8, - len_r: size_t, -) -> *const u8 { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len_h) }; - let rewrite = unsafe { slice::from_raw_parts(rewrite, len_r) }; +fn rure_replace_internal(re: &RegexUnicode, haystack: &[u8], rewrite: &[u8]) -> *const u8 { let haystack = match str::from_utf8(haystack) { Ok(haystack) => haystack, Err(_err) => { @@ -643,17 +169,7 @@ extern "C" fn rure_replace( c_esc_pat.into_raw() as *const u8 } -#[no_mangle] -extern "C" fn rure_replace_all( - re: *const RegexUnicode, - haystack: *const u8, - len_h: size_t, - rewrite: *const u8, - len_r: size_t, -) -> *const u8 { - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len_h) }; - let rewrite = unsafe { slice::from_raw_parts(rewrite, len_r) }; +fn rure_replace_all_internal(re: &RegexUnicode, haystack: &[u8], rewrite: &[u8]) -> *const u8 { let haystack = match str::from_utf8(haystack) { Ok(haystack) => haystack, Err(_err) => { @@ -677,13 +193,7 @@ extern "C" fn rure_replace_all( c_esc_pat.into_raw() as *const u8 } -/* - * Simple way to use regex - */ - -#[no_mangle] -extern "C" fn rure_new(pattern: *const u8, length: size_t) -> *const RegexBytes { - let pat = unsafe { slice::from_raw_parts(pattern, length) }; +fn rure_new_internal(pat: &[u8]) -> *const RegexBytes { let pat = match str::from_utf8(pat) { Ok(pat) => pat, Err(_err) => { @@ -697,15 +207,7 @@ extern "C" fn rure_new(pattern: *const u8, length: size_t) -> *const RegexBytes exp as *const RegexBytes } -#[no_mangle] -extern "C" fn rure_consume( - re: *const RegexBytes, - haystack: *const u8, - len: size_t, - match_info: *mut rure_match, -) -> bool { - let exp = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(haystack, len) }; +fn rure_consume_internal(exp: &RegexBytes, haystack: &[u8], match_info: *mut rure_match) -> bool { exp.find(haystack) .map(|m| unsafe { if !match_info.is_null() { @@ -716,14 +218,10 @@ extern "C" fn rure_consume( .is_some() } -#[no_mangle] -extern "C" fn rure_max_submatch(rewrite: *const c_char) -> i32 { +fn rure_max_submatch_internal(text: &[u8]) -> i32 { let mut max: i32 = 0; let mut flag = 0; let zero_number = '0' as i32; - let len = unsafe { CStr::from_ptr(rewrite).to_bytes().len() }; - let pat = rewrite as *const u8; - let text = unsafe { slice::from_raw_parts(pat, len) }; let rewrite = std::str::from_utf8(text).unwrap(); for s in rewrite.chars() { if s == '\\' { @@ -741,11 +239,7 @@ extern "C" fn rure_max_submatch(rewrite: *const c_char) -> i32 { max } -#[no_mangle] -extern "C" fn rure_check_rewrite_string(rewrite: *const c_char, cap_num: i32) -> bool { - let len = unsafe { CStr::from_ptr(rewrite).to_bytes().len() }; - let pat = rewrite as *const u8; - let text = unsafe { slice::from_raw_parts(pat, len) }; +fn rure_check_rewrite_string_internal(text: &[u8], cap_num: i32) -> bool { let s = std::str::from_utf8(text).unwrap(); let mut max_token = -1; let chars = s.chars().collect::>(); @@ -786,9 +280,7 @@ extern "C" fn rure_check_rewrite_string(rewrite: *const c_char, cap_num: i32) -> return true; } -#[no_mangle] -extern "C" fn rure_rewrite_str_convert(rewrite: *const u8, length: size_t) -> *const c_char { - let rewrite = unsafe { slice::from_raw_parts(rewrite, length) }; +fn rure_rewrite_str_convert_internal(rewrite: &[u8]) -> *const c_char { let rewrite_str = std::str::from_utf8(rewrite).unwrap(); let rewrite_chars = rewrite_str.chars().collect::>(); let mut i = 0; @@ -826,32 +318,11 @@ extern "C" fn rure_rewrite_str_convert(rewrite: *const u8, length: size_t) -> *c rure_str.into_raw() as *const c_char } -#[no_mangle] -extern "C" fn rure_rewrite( - rewrite: *const u8, - length: size_t, - vecs: *const *const u8, - vecs_lengths: *const size_t, +fn rure_rewrite_internal( + rewrite_str: &str, vecs_count: size_t, + rure_vecs: Vec<&str>, ) -> *const c_char { - // 获取rewrite - let rewrite = unsafe { slice::from_raw_parts(rewrite, length) }; - let rewrite_str = std::str::from_utf8(rewrite).unwrap(); - - //获取vecs中的内容 - let (raw_vecs, raw_vecsl) = unsafe { - ( - slice::from_raw_parts(vecs, vecs_count), - slice::from_raw_parts(vecs_lengths, vecs_count), - ) - }; - - let mut rure_vecs = Vec::with_capacity(vecs_count); - for (&raw_vec, &raw_vecl) in raw_vecs.iter().zip(raw_vecsl) { - let rure_vec = unsafe { slice::from_raw_parts(raw_vec, raw_vecl) }; - rure_vecs.push(str::from_utf8(rure_vec).unwrap()); - } - let rewrite_chars = rewrite_str.chars().collect::>(); let mut i = 0; let mut out = String::new(); @@ -897,24 +368,19 @@ extern "C" fn rure_rewrite( out.into_raw() as *const c_char } -#[no_mangle] -extern "C" fn rure_replace_count(re: *const RegexUnicode, haystack: *const c_char) -> size_t { - let len = unsafe { CStr::from_ptr(haystack).to_bytes().len() }; - let hay = haystack as *const u8; +fn rure_replace_count_internal(haystack: &[u8], re: &RegexUnicode) -> size_t { let mut count = 0; - let re = unsafe { &*re }; - let haystack = unsafe { slice::from_raw_parts(hay, len) }; let haystack = str::from_utf8(haystack).unwrap(); for _mat in re.find_iter(haystack) { count += 1; } - return count; + count } /** - * 负责对字符集进行连接操作 - * - */ +* 负责对字符集进行连接操作 +* +*/ fn connection(str: &str, vec1: Vec, vec2: Vec) -> Vec { let mut vec_tmp = Vec::new(); if str.len() > 0 { @@ -939,10 +405,10 @@ fn connection(str: &str, vec1: Vec, vec2: Vec) -> Vec { /** * (abc123|abc|ghi789|abc1234) - 3-abc - 6-abc123 - 6-ghi789 - 7-abc1234 + 3-abc + 6-abc123 + 6-ghi789 + 7-abc1234 * abc abc123 ghi789 abc1234 */ fn group_multiple_selection(str: &str, min_atoms_len: i32) -> Vec { @@ -978,11 +444,11 @@ fn group_multiple_selection(str: &str, min_atoms_len: i32) -> Vec { } /** - * 处理 - * a[a-c]a[zv] - * [abc] - * [a-c]+ - */ +* 处理 +* a[a-c]a[zv] +* [abc] +* [a-c]+ +*/ fn char_class_expansion(str: &str) -> Vec { let mut flag_connect = 0; @@ -1165,15 +631,3 @@ fn my_compile(str: &str, min_atoms_len: i32) -> MyVec { std::mem::forget(a); MyVec { data, len } } - -#[no_mangle] -extern "C" fn rure_filter_compile( - regex_str: *const u8, - regex_len: size_t, - min_atoms_len: size_t, -) -> MyVec { - let r = unsafe { slice::from_raw_parts(regex_str, regex_len) }; - let regex_str = str::from_utf8(r).unwrap(); - let atoms = my_compile(regex_str, min_atoms_len as i32); - atoms -} -- Gitee