diff --git a/engine/src/ast/field_expr.rs b/engine/src/ast/field_expr.rs index cf0e00de..e0456c06 100644 --- a/engine/src/ast/field_expr.rs +++ b/engine/src/ast/field_expr.rs @@ -11,7 +11,7 @@ use crate::{ filter::CompiledExpr, lex::{Lex, LexErrorKind, LexResult, LexWith, expect, skip_space, span}, range_set::RangeSet, - rhs_types::{Bytes, ExplicitIpRange, ListName, Regex, Wildcard}, + rhs_types::{Bytes, ExplicitIpRange, ListName, RegexExpr, Wildcard}, scheme::{Field, Identifier, List}, searcher::{EmptySearcher, TwoWaySearcher}, strict_partial_ord::StrictPartialOrd, @@ -152,7 +152,7 @@ pub enum ComparisonOpExpr { /// "matches / ~" comparison #[serde(serialize_with = "serialize_matches")] - Matches(Regex), + Matches(RegexExpr), /// "wildcard" comparison #[serde(serialize_with = "serialize_wildcard")] @@ -205,7 +205,7 @@ fn serialize_contains(rhs: &Bytes, ser: S) -> Result(rhs: &Regex, ser: S) -> Result { +fn serialize_matches(rhs: &RegexExpr, ser: S) -> Result { serialize_op_rhs("Matches", rhs, ser) } @@ -376,7 +376,7 @@ impl ComparisonExpr { (ComparisonOpExpr::Contains(bytes), input) } BytesOp::Matches => { - let (regex, input) = Regex::lex_with(input, parser)?; + let (regex, input) = RegexExpr::lex_with(input, parser)?; (ComparisonOpExpr::Matches(regex), input) } BytesOp::Wildcard => { @@ -688,7 +688,9 @@ impl Expr for ComparisonExpr { search!(TwoWaySearcher::new(bytes)) } - ComparisonOpExpr::Matches(regex) => lhs.compile_with(compiler, false, regex), + ComparisonOpExpr::Matches(regex) => { + lhs.compile_with(compiler, false, regex.into_regex()) + } ComparisonOpExpr::Wildcard(wildcard) => lhs.compile_with(compiler, false, wildcard), ComparisonOpExpr::StrictWildcard(wildcard) => { lhs.compile_with(compiler, false, wildcard) @@ -2806,7 +2808,12 @@ mod tests { // Matches operator let parser = FilterParser::new(&SCHEME); - let r = Regex::new("a.b", RegexFormat::Literal, parser.settings()).unwrap(); + let r = RegexExpr::new( + "a.b", + RegexFormat::Literal, + 
&parser.settings().regex_provider, + ) + .unwrap(); let expr = assert_ok!( parser.lex_as("http.host matches r###\"a.b\"###"), ComparisonExpr { diff --git a/engine/src/ast/parse.rs b/engine/src/ast/parse.rs index e9984294..e6e1fea9 100644 --- a/engine/src/ast/parse.rs +++ b/engine/src/ast/parse.rs @@ -1,11 +1,14 @@ use super::{FilterAst, FilterValueAst}; use crate::{ + RegexProvider, lex::{LexErrorKind, LexResult, LexWith, complete}, + rhs_types::RegexDefaultProvider, scheme::Scheme, }; use std::cmp::{max, min}; use std::error::Error; use std::fmt::{self, Debug, Display, Formatter}; +use std::sync::{Arc, LazyLock}; /// An opaque filter parsing error associated with the original input. /// @@ -96,28 +99,37 @@ impl Display for ParseError<'_> { } } +static DEFAULT_REGEX_PROVIDER: LazyLock> = + LazyLock::new(|| Arc::new(RegexDefaultProvider::default())); + /// Parser settings. -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug)] pub struct ParserSettings { - /// Approximate size of the cache used by the DFA of a regex. - /// Default: 10MB - pub regex_dfa_size_limit: usize, - /// Approximate size limit of the compiled regular expression. - /// Default: 2MB - pub regex_compiled_size_limit: usize, + /// Regex provider. + pub regex_provider: Arc, /// Maximum number of star metacharacters allowed in a wildcard. /// Default: unlimited pub wildcard_star_limit: usize, } +impl PartialEq for ParserSettings { + fn eq(&self, other: &Self) -> bool { + let Self { + regex_provider, + wildcard_star_limit, + } = self; + Arc::ptr_eq(regex_provider, &other.regex_provider) + && *wildcard_star_limit == other.wildcard_star_limit + } +} + +impl Eq for ParserSettings {} + impl Default for ParserSettings { #[inline] fn default() -> Self { Self { - // Default value extracted from the regex crate. - regex_compiled_size_limit: 10 * (1 << 20), - // Default value extracted from the regex crate. 
- regex_dfa_size_limit: 2 * (1 << 20), + regex_provider: DEFAULT_REGEX_PROVIDER.clone(), wildcard_star_limit: usize::MAX, } } @@ -176,30 +188,6 @@ impl<'s> FilterParser<'s> { &self.settings } - /// Set the approximate size limit of the compiled regular expression. - #[inline] - pub fn regex_set_compiled_size_limit(&mut self, regex_compiled_size_limit: usize) { - self.settings.regex_compiled_size_limit = regex_compiled_size_limit; - } - - /// Get the approximate size limit of the compiled regular expression. - #[inline] - pub fn regex_get_compiled_size_limit(&self) -> usize { - self.settings.regex_compiled_size_limit - } - - /// Set the approximate size of the cache used by the DFA of a regex. - #[inline] - pub fn regex_set_dfa_size_limit(&mut self, regex_dfa_size_limit: usize) { - self.settings.regex_dfa_size_limit = regex_dfa_size_limit; - } - - /// Get the approximate size of the cache used by the DFA of a regex. - #[inline] - pub fn regex_get_dfa_size_limit(&self) -> usize { - self.settings.regex_dfa_size_limit - } - /// Set the maximum number of star metacharacters allowed in a wildcard. 
#[inline] pub fn wildcard_set_star_limit(&mut self, wildcard_star_limit: usize) { diff --git a/engine/src/lib.rs b/engine/src/lib.rs index 64eb4f2c..b435936c 100644 --- a/engine/src/lib.rs +++ b/engine/src/lib.rs @@ -112,8 +112,8 @@ pub use self::{ panic_catcher_get_backtrace, panic_catcher_set_fallback_mode, panic_catcher_set_hook, }, rhs_types::{ - Bytes, BytesFormat, ExplicitIpRange, IntRange, IpCidr, IpRange, Regex, RegexError, - RegexFormat, + Bytes, BytesFormat, ExplicitIpRange, IntRange, IpCidr, IpRange, Regex, + RegexDefaultProvider, RegexError, RegexExpr, RegexFormat, RegexProvider, }, scheme::{ Field, FieldIndex, FieldRedefinitionError, FieldRef, Function, FunctionRedefinitionError, @@ -125,3 +125,6 @@ pub use self::{ TypeMismatchError, }, }; + +#[cfg(feature = "regex")] +pub use self::rhs_types::RegexSettings; diff --git a/engine/src/rhs_types/mod.rs b/engine/src/rhs_types/mod.rs index 18d805af..e6844ca6 100644 --- a/engine/src/rhs_types/mod.rs +++ b/engine/src/rhs_types/mod.rs @@ -16,6 +16,11 @@ pub use self::{ ip::{ExplicitIpRange, IpCidr, IpRange}, list::ListName, map::UninhabitedMap, - regex::{Error as RegexError, Regex, RegexFormat}, + regex::{ + Error as RegexError, Regex, RegexDefaultProvider, RegexExpr, RegexFormat, RegexProvider, + }, wildcard::{Wildcard, WildcardError}, }; + +#[cfg(feature = "regex")] +pub use self::regex::RegexSettings; diff --git a/engine/src/rhs_types/regex/imp_real.rs b/engine/src/rhs_types/regex/imp_real.rs index ebfa74f9..b51d4534 100644 --- a/engine/src/rhs_types/regex/imp_real.rs +++ b/engine/src/rhs_types/regex/imp_real.rs @@ -1,22 +1,54 @@ -use regex_automata::MatchKind; - -use super::Error; -use crate::{ParserSettings, RegexFormat}; -use std::ops::Deref; +use super::{Error, Regex}; +use crate::RegexProvider; use std::sync::Arc; -/// Wrapper around [`regex_automata::meta::Regex`] -#[derive(Clone)] -pub struct Regex { - pattern: Arc, - regex: regex_automata::meta::Regex, - format: RegexFormat, +pub(crate) type 
MetaRegex = regex_automata::meta::Regex; + +impl Regex for MetaRegex { + #[inline] + fn is_match(&self, input: &[u8]) -> bool { + MetaRegex::is_match(self, input) + } +} + +/// Regex settings. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct RegexSettings { + /// Approximate size of the cache used by the DFA of a regex. + /// Default: 2MB + pub dfa_size_limit: usize, + /// Approximate size limit of the compiled regular expression. + /// Default: 10MB + pub compiled_size_limit: usize, +} + +impl Default for RegexSettings { + #[inline] + fn default() -> Self { + Self { + // Default value extracted from the regex crate. + compiled_size_limit: 10 * (1 << 20), + // Default value extracted from the regex crate. + dfa_size_limit: 2 * (1 << 20), + } + } } -impl Regex { +/// Default regex provider. +#[derive(Debug, Default)] +pub struct RegexDefaultProvider { + settings: RegexSettings, +} + +impl RegexDefaultProvider { + /// Creates a new default regex provider. + pub const fn new(settings: RegexSettings) -> Self { + Self { settings } + } + /// Retrieves the syntax configuration that will be used to build the regex. #[inline] - fn syntax_config() -> regex_automata::util::syntax::Config { + pub fn syntax_config() -> regex_automata::util::syntax::Config { regex_automata::util::syntax::Config::new() .unicode(false) .utf8(false) @@ -24,32 +56,23 @@ impl Regex { /// Retrieves the meta configuration that will be used to build the regex. 
#[inline] - fn meta_config(settings: &ParserSettings) -> regex_automata::meta::Config { + pub fn meta_config(settings: &RegexSettings) -> regex_automata::meta::Config { regex_automata::meta::Config::new() - .match_kind(MatchKind::LeftmostFirst) + .match_kind(regex_automata::MatchKind::LeftmostFirst) .utf8_empty(false) .dfa(false) - .nfa_size_limit(Some(settings.regex_compiled_size_limit)) - .onepass_size_limit(Some(settings.regex_compiled_size_limit)) - .dfa_size_limit(Some(settings.regex_compiled_size_limit)) - .hybrid_cache_capacity(settings.regex_dfa_size_limit) + .nfa_size_limit(Some(settings.compiled_size_limit)) + .onepass_size_limit(Some(settings.compiled_size_limit)) + .dfa_size_limit(Some(settings.compiled_size_limit)) + .hybrid_cache_capacity(settings.dfa_size_limit) } - /// Compiles a regular expression. - pub fn new( - pattern: &str, - format: RegexFormat, - settings: &ParserSettings, - ) -> Result { + /// Builds a new regex object from the provided pattern. + pub fn build(&self, pattern: &str) -> Result { ::regex_automata::meta::Builder::new() - .configure(Self::meta_config(settings)) + .configure(Self::meta_config(&self.settings)) .syntax(Self::syntax_config()) .build(pattern) - .map(|regex| Regex { - pattern: Arc::from(pattern), - regex, - format, - }) .map_err(|err| { if let Some(limit) = err.size_limit() { Error::CompiledTooBig(limit) @@ -60,45 +83,27 @@ impl Regex { } }) } - - /// Returns the pattern of this regex. - #[inline] - pub fn as_str(&self) -> &str { - &self.pattern - } - - /// Returns the format used by the pattern. 
- #[inline] - pub fn format(&self) -> RegexFormat { - self.format - } } -impl From for regex_automata::meta::Regex { - #[inline] - fn from(regex: Regex) -> Self { - regex.regex - } -} - -impl Deref for Regex { - type Target = regex_automata::meta::Regex; - - #[inline] - fn deref(&self) -> &Self::Target { - &self.regex +impl RegexProvider for RegexDefaultProvider { + fn lookup_regex(&self, pattern: &str) -> Result, Error> { + self.build(pattern).map(|re| Arc::new(re) as Arc) } } #[test] fn test_compiled_size_limit() { + use super::{RegexDefaultProvider, RegexSettings}; + use crate::{RegexExpr, RegexFormat}; + const COMPILED_SIZE_LIMIT: usize = 1024 * 1024; - let settings = ParserSettings { - regex_compiled_size_limit: COMPILED_SIZE_LIMIT, + let settings = RegexSettings { + compiled_size_limit: COMPILED_SIZE_LIMIT, ..Default::default() }; + let regex_provider = RegexDefaultProvider::new(settings); assert_eq!( - Regex::new(".{4079,65535}", RegexFormat::Literal, &settings), + RegexExpr::new(".{4079,65535}", RegexFormat::Literal, ®ex_provider), Err(Error::CompiledTooBig(COMPILED_SIZE_LIMIT)) ); } diff --git a/engine/src/rhs_types/regex/imp_stub.rs b/engine/src/rhs_types/regex/imp_stub.rs index 0f5a1081..1860754c 100644 --- a/engine/src/rhs_types/regex/imp_stub.rs +++ b/engine/src/rhs_types/regex/imp_stub.rs @@ -1,40 +1,21 @@ -use thiserror::Error; - use crate::{FilterParser, RegexFormat}; +use thiserror::Error; -/// Dummy regex error. -#[derive(Debug, PartialEq, Error)] -pub enum Error {} - -/// Dummy regex wrapper that can only store a pattern -/// but not actually be used for matching. -#[derive(Clone)] -pub struct Regex { - pattern: String, - format: RegexFormat, -} - -impl Regex { - /// Creates a new dummy regex. - pub fn new(pattern: &str, format: RegexFormat, _: &FilterParser<'_>) -> Result { - Ok(Self { - pattern: pattern.to_string(), - format, - }) - } +pub(crate) struct StubRegex {} +impl Regex for StubRegex { /// Not implemented and will panic if called. 
- pub fn is_match(&self, _text: &[u8]) -> bool { + fn is_match(&self, _text: &[u8]) -> bool { unimplemented!("Engine was built without regex support") } +} - /// Returns the original string of this dummy regex wrapper. - pub fn as_str(&self) -> &str { - self.pattern.as_str() - } +/// Default regex provider. +#[derive(Debug, Default)] +pub struct DefaultRegexProvider; - /// Returns the format behind the regex - pub fn format(&self) -> RegexFormat { - self.format +impl RegexProvider for DefaultRegexProvider { + fn lookup(&self, pattern: &str) -> Result, Error> { + Ok(Arc::new(StubRegex {})) } } diff --git a/engine/src/rhs_types/regex/mod.rs b/engine/src/rhs_types/regex/mod.rs index 3ed5b096..3f82c2e5 100644 --- a/engine/src/rhs_types/regex/mod.rs +++ b/engine/src/rhs_types/regex/mod.rs @@ -5,6 +5,7 @@ use cfg_if::cfg_if; use serde::{Serialize, Serializer}; use std::fmt::{self, Debug, Display, Formatter}; use std::hash::{Hash, Hasher}; +use std::sync::Arc; use thiserror::Error; cfg_if! { @@ -26,28 +27,74 @@ pub enum RegexFormat { Raw(u8), } -impl PartialEq for Regex { - fn eq(&self, other: &Regex) -> bool { +/// Regex expression. +#[derive(Clone)] +pub struct RegexExpr { + pattern: Arc, + format: RegexFormat, + regex: Arc, +} + +impl RegexExpr { + /// Create a new regex expression. + pub fn new( + pattern: &str, + format: RegexFormat, + provider: &impl RegexProvider, + ) -> Result { + provider.lookup_regex(pattern).map(|regex| Self { + pattern: pattern.to_owned().into(), + format, + regex, + }) + } + /// Returns the pattern of this regex. + #[inline] + pub fn as_str(&self) -> &str { + &self.pattern + } + + /// Returns the format used by the pattern. + #[inline] + pub fn format(&self) -> RegexFormat { + self.format + } + + /// Returns the associated regex object. + #[inline] + pub fn as_regex(&self) -> &Arc { + &self.regex + } + + /// Converts the regex expression into its underlying regex object. 
+ #[inline] + pub fn into_regex(self) -> Arc { + self.regex + } +} + +impl PartialEq for RegexExpr { + fn eq(&self, other: &RegexExpr) -> bool { self.as_str() == other.as_str() } } -impl Eq for Regex {} +impl Eq for RegexExpr {} -impl Hash for Regex { +impl Hash for RegexExpr { fn hash(&self, state: &mut H) { self.as_str().hash(state); } } -impl Display for Regex { +impl Display for RegexExpr { /// Shows the original regular expression. fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.as_str()) + Display::fmt(self.as_str(), f) } } -impl Debug for Regex { +impl Debug for RegexExpr { /// Shows the original regular expression. fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { f.debug_struct("Regex") @@ -60,15 +107,22 @@ impl Debug for Regex { fn lex_regex_from_raw_string<'i>( input: &'i str, parser: &FilterParser<'_>, -) -> LexResult<'i, Regex> { +) -> LexResult<'i, RegexExpr> { let ((lexed, hashes), input) = lex_raw_string_as_str(input)?; - match Regex::new(lexed, RegexFormat::Raw(hashes), parser.settings()) { + match RegexExpr::new( + lexed, + RegexFormat::Raw(hashes), + &parser.settings().regex_provider, + ) { Ok(regex) => Ok((regex, input)), Err(err) => Err((LexErrorKind::ParseRegex(err), input)), } } -fn lex_regex_from_literal<'i>(input: &'i str, parser: &FilterParser<'_>) -> LexResult<'i, Regex> { +fn lex_regex_from_literal<'i>( + input: &'i str, + parser: &FilterParser<'_>, +) -> LexResult<'i, RegexExpr> { let mut regex_buf = String::new(); let mut in_char_class = false; let (regex_str, input) = { @@ -104,13 +158,17 @@ fn lex_regex_from_literal<'i>(input: &'i str, parser: &FilterParser<'_>) -> LexR }; } }; - match Regex::new(®ex_buf, RegexFormat::Literal, parser.settings()) { + match RegexExpr::new( + ®ex_buf, + RegexFormat::Literal, + &parser.settings().regex_provider, + ) { Ok(regex) => Ok((regex, input)), Err(err) => Err((LexErrorKind::ParseRegex(err), regex_str)), } } -impl<'i, 's> LexWith<'i, &FilterParser<'s>> for Regex { 
+impl<'i, 's> LexWith<'i, &FilterParser<'s>> for RegexExpr { fn lex_with(input: &'i str, parser: &FilterParser<'s>) -> LexResult<'i, Self> { if let Some(c) = input.as_bytes().first() { match c { @@ -124,7 +182,7 @@ impl<'i, 's> LexWith<'i, &FilterParser<'s>> for Regex { } } -impl Serialize for Regex { +impl Serialize for RegexExpr { fn serialize(&self, ser: S) -> Result { self.as_str().serialize(ser) } @@ -146,7 +204,13 @@ pub enum Error { Other(String), } -impl Compare for Regex { +/// Trait representing a compiled regex object. +pub trait Regex: Send + Sync { + /// Matches the regex against some input. + fn is_match(&self, input: &[u8]) -> bool; +} + +impl Compare for Arc { #[inline] fn compare<'e>(&self, value: &LhsValue<'e>, _: &'e ExecutionContext<'e, U>) -> bool { self.is_match(match value { @@ -156,10 +220,23 @@ impl Compare for Regex { } } +/// Regex provider. +pub trait RegexProvider: Debug + Send + Sync { + /// Attempts to retrieve a regex from the provider. + fn lookup_regex(&self, pattern: &str) -> Result, Error>; +} + +impl RegexProvider for Arc { + #[inline] + fn lookup_regex(&self, pattern: &str) -> Result, Error> { + (**self).lookup_regex(pattern) + } +} + #[cfg(test)] mod test { use super::*; - use crate::{ParserSettings, SchemeBuilder}; + use crate::SchemeBuilder; #[test] fn test() { @@ -167,11 +244,11 @@ mod test { let parser = FilterParser::new(&scheme); let expr = assert_ok!( - Regex::lex_with(r#""[a-z"\]]+\d{1,10}\"";"#, &parser), - Regex::new( + RegexExpr::lex_with(r#""[a-z"\]]+\d{1,10}\"";"#, &parser), + RegexExpr::new( r#"[a-z"\]]+\d{1,10}""#, RegexFormat::Literal, - &ParserSettings::default(), + &parser.settings().regex_provider, ) .unwrap(), ";" @@ -180,7 +257,7 @@ mod test { assert_json!(expr, r#"[a-z"\]]+\d{1,10}""#); assert_err!( - Regex::lex_with(r#""abcd\"#, &parser), + RegexExpr::lex_with(r#""abcd\"#, &parser), LexErrorKind::MissingEndingQuote, "abcd\\" ); @@ -192,14 +269,14 @@ mod test { let parser = FilterParser::new(&scheme); let 
expr = assert_ok!( - Regex::lex_with( + RegexExpr::lex_with( r###"r#"[a-z"\]]+\d{1,10}""#;"###, &FilterParser::new(&scheme) ), - Regex::new( + RegexExpr::new( r#"[a-z"\]]+\d{1,10}""#, RegexFormat::Raw(1), - parser.settings(), + &parser.settings().regex_provider, ) .unwrap(), ";" @@ -208,14 +285,14 @@ mod test { assert_json!(expr, r#"[a-z"\]]+\d{1,10}""#); let expr = assert_ok!( - Regex::lex_with( + RegexExpr::lex_with( r##"r#"(?u)\*\a\f\t\n\r\v\x7F\x{10FFFF}\u007F\u{7F}\U0000007F\U{7F}"#"##, &parser, ), - Regex::new( + RegexExpr::new( r#"(?u)\*\a\f\t\n\r\v\x7F\x{10FFFF}\u007F\u{7F}\U0000007F\U{7F}"#, RegexFormat::Raw(1), - parser.settings(), + &parser.settings().regex_provider, ) .unwrap(), "" @@ -227,7 +304,7 @@ mod test { ); assert_err!( - Regex::lex_with("x", &FilterParser::new(&scheme)), + RegexExpr::lex_with("x", &FilterParser::new(&scheme)), LexErrorKind::ExpectedName("\" or r"), "x" );