regex_automata/dfa/
regex.rs

1/*!
2A DFA-backed `Regex`.
3
4This module provides [`Regex`], which is defined generically over the
5[`Automaton`] trait. A `Regex` implements convenience routines you might have
6come to expect, such as finding the start/end of a match and iterating over
7all non-overlapping matches. This `Regex` type is limited in its capabilities
8to what a DFA can provide. Therefore, APIs involving capturing groups, for
9example, are not provided.
10
11Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
12finds the end offset of a match, where as the other is a "reverse" DFA that
13find the start offset of a match.
14
15See the [parent module](crate::dfa) for examples.
16*/
17
18#[cfg(feature = "alloc")]
19use alloc::vec::Vec;
20
21#[cfg(feature = "dfa-build")]
22use crate::dfa::dense::BuildError;
23use crate::{
24    dfa::{automaton::Automaton, dense},
25    util::{iter, search::Input},
26    Anchored, Match, MatchError,
27};
28#[cfg(feature = "alloc")]
29use crate::{
30    dfa::{sparse, StartKind},
31    util::search::MatchKind,
32};
33
34// When the alloc feature is enabled, the regex type sets its A type parameter
35// to default to an owned dense DFA. But without alloc, we set no default. This
36// makes things a lot more convenient in the common case, since writing out the
37// DFA types is pretty annoying.
38//
39// Since we have two different definitions but only want to write one doc
40// string, we use a macro to capture the doc and other attributes once and then
41// repeat them for each definition.
42macro_rules! define_regex_type {
43    ($(#[$doc:meta])*) => {
44        #[cfg(feature = "alloc")]
45        $(#[$doc])*
46        pub struct Regex<A = dense::OwnedDFA> {
47            forward: A,
48            reverse: A,
49        }
50
51        #[cfg(not(feature = "alloc"))]
52        $(#[$doc])*
53        pub struct Regex<A> {
54            forward: A,
55            reverse: A,
56        }
57    };
58}
59
60define_regex_type!(
61    /// A regular expression that uses deterministic finite automata for fast
62    /// searching.
63    ///
64    /// A regular expression is comprised of two DFAs, a "forward" DFA and a
65    /// "reverse" DFA. The forward DFA is responsible for detecting the end of
66    /// a match while the reverse DFA is responsible for detecting the start
67    /// of a match. Thus, in order to find the bounds of any given match, a
68    /// forward search must first be run followed by a reverse search. A match
69    /// found by the forward DFA guarantees that the reverse DFA will also find
70    /// a match.
71    ///
72    /// The type of the DFA used by a `Regex` corresponds to the `A` type
73    /// parameter, which must satisfy the [`Automaton`] trait. Typically, `A`
74    /// is either a [`dense::DFA`] or a [`sparse::DFA`], where dense DFAs use
75    /// more memory but search faster, while sparse DFAs use less memory but
76    /// search more slowly.
77    ///
78    /// # Crate features
79    ///
80    /// Note that despite what the documentation auto-generates, the _only_
81    /// crate feature needed to use this type is `dfa-search`. You do _not_
82    /// need to enable the `alloc` feature.
83    ///
84    /// By default, a regex's automaton type parameter is set to
85    /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
86    /// in-memory work loads, this is the most convenient type that gives the
87    /// best search performance. When the `alloc` feature is disabled, no
88    /// default type is used.
89    ///
90    /// # When should I use this?
91    ///
92    /// Generally speaking, if you can afford the overhead of building a full
93    /// DFA for your regex, and you don't need things like capturing groups,
94    /// then this is a good choice if you're looking to optimize for matching
95    /// speed. Note however that its speed may be worse than a general purpose
96    /// regex engine if you don't provide a [`dense::Config::prefilter`] to the
97    /// underlying DFA.
98    ///
99    /// # Sparse DFAs
100    ///
101    /// Since a `Regex` is generic over the [`Automaton`] trait, it can be
102    /// used with any kind of DFA. While this crate constructs dense DFAs by
103    /// default, it is easy enough to build corresponding sparse DFAs, and then
104    /// build a regex from them:
105    ///
106    /// ```
107    /// use regex_automata::dfa::regex::Regex;
108    ///
109    /// // First, build a regex that uses dense DFAs.
110    /// let dense_re = Regex::new("foo[0-9]+")?;
111    ///
112    /// // Second, build sparse DFAs from the forward and reverse dense DFAs.
113    /// let fwd = dense_re.forward().to_sparse()?;
114    /// let rev = dense_re.reverse().to_sparse()?;
115    ///
116    /// // Third, build a new regex from the constituent sparse DFAs.
117    /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);
118    ///
119    /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
120    /// assert_eq!(true, sparse_re.is_match(b"foo123"));
121    ///
122    /// # Ok::<(), Box<dyn std::error::Error>>(())
123    /// ```
124    ///
125    /// Alternatively, one can use a [`Builder`] to construct a sparse DFA
126    /// more succinctly. (Note though that dense DFAs are still constructed
127    /// first internally, and then converted to sparse DFAs, as in the example
128    /// above.)
129    ///
130    /// ```
131    /// use regex_automata::dfa::regex::Regex;
132    ///
133    /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;
134    /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
135    /// assert!(sparse_re.is_match(b"foo123"));
136    ///
137    /// # Ok::<(), Box<dyn std::error::Error>>(())
138    /// ```
139    ///
140    /// # Fallibility
141    ///
142    /// Most of the search routines defined on this type will _panic_ when the
143    /// underlying search fails. This might be because the DFA gave up because
144    /// it saw a quit byte, whether configured explicitly or via heuristic
145    /// Unicode word boundary support, although neither are enabled by default.
146    /// Or it might fail because an invalid `Input` configuration is given,
147    /// for example, with an unsupported [`Anchored`] mode.
148    ///
149    /// If you need to handle these error cases instead of allowing them to
150    /// trigger a panic, then the lower level [`Regex::try_search`] provides
151    /// a fallible API that never panics.
152    ///
153    /// # Example
154    ///
155    /// This example shows how to cause a search to terminate if it sees a
156    /// `\n` byte, and handle the error returned. This could be useful if, for
157    /// example, you wanted to prevent a user supplied pattern from matching
158    /// across a line boundary.
159    ///
160    /// ```
161    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
162    /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError};
163    ///
164    /// let re = Regex::builder()
165    ///     .dense(dfa::dense::Config::new().quit(b'\n', true))
166    ///     .build(r"foo\p{any}+bar")?;
167    ///
168    /// let input = Input::new("foo\nbar");
169    /// // Normally this would produce a match, since \p{any} contains '\n'.
170    /// // But since we instructed the automaton to enter a quit state if a
171    /// // '\n' is observed, this produces a match error instead.
172    /// let expected = MatchError::quit(b'\n', 3);
173    /// let got = re.try_search(&input).unwrap_err();
174    /// assert_eq!(expected, got);
175    ///
176    /// # Ok::<(), Box<dyn std::error::Error>>(())
177    /// ```
178    #[derive(Clone, Debug)]
179);
180
181#[cfg(all(feature = "syntax", feature = "dfa-build"))]
182impl Regex {
183    /// Parse the given regular expression using the default configuration and
184    /// return the corresponding regex.
185    ///
186    /// If you want a non-default configuration, then use the [`Builder`] to
187    /// set your own configuration.
188    ///
189    /// # Example
190    ///
191    /// ```
192    /// use regex_automata::{Match, dfa::regex::Regex};
193    ///
194    /// let re = Regex::new("foo[0-9]+bar")?;
195    /// assert_eq!(
196    ///     Some(Match::must(0, 3..14)),
197    ///     re.find(b"zzzfoo12345barzzz"),
198    /// );
199    /// # Ok::<(), Box<dyn std::error::Error>>(())
200    /// ```
201    pub fn new(pattern: &str) -> Result<Regex, BuildError> {
202        Builder::new().build(pattern)
203    }
204
205    /// Like `new`, but parses multiple patterns into a single "regex set."
206    /// This similarly uses the default regex configuration.
207    ///
208    /// # Example
209    ///
210    /// ```
211    /// use regex_automata::{Match, dfa::regex::Regex};
212    ///
213    /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
214    ///
215    /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
216    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
217    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
218    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
219    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
220    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
221    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
222    /// assert_eq!(None, it.next());
223    /// # Ok::<(), Box<dyn std::error::Error>>(())
224    /// ```
225    pub fn new_many<P: AsRef<str>>(
226        patterns: &[P],
227    ) -> Result<Regex, BuildError> {
228        Builder::new().build_many(patterns)
229    }
230}
231
232#[cfg(all(feature = "syntax", feature = "dfa-build"))]
233impl Regex<sparse::DFA<Vec<u8>>> {
234    /// Parse the given regular expression using the default configuration,
235    /// except using sparse DFAs, and return the corresponding regex.
236    ///
237    /// If you want a non-default configuration, then use the [`Builder`] to
238    /// set your own configuration.
239    ///
240    /// # Example
241    ///
242    /// ```
243    /// use regex_automata::{Match, dfa::regex::Regex};
244    ///
245    /// let re = Regex::new_sparse("foo[0-9]+bar")?;
246    /// assert_eq!(
247    ///     Some(Match::must(0, 3..14)),
248    ///     re.find(b"zzzfoo12345barzzz"),
249    /// );
250    /// # Ok::<(), Box<dyn std::error::Error>>(())
251    /// ```
252    pub fn new_sparse(
253        pattern: &str,
254    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
255        Builder::new().build_sparse(pattern)
256    }
257
258    /// Like `new`, but parses multiple patterns into a single "regex set"
259    /// using sparse DFAs. This otherwise similarly uses the default regex
260    /// configuration.
261    ///
262    /// # Example
263    ///
264    /// ```
265    /// use regex_automata::{Match, dfa::regex::Regex};
266    ///
267    /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
268    ///
269    /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
270    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
271    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
272    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
273    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
274    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
275    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
276    /// assert_eq!(None, it.next());
277    /// # Ok::<(), Box<dyn std::error::Error>>(())
278    /// ```
279    pub fn new_many_sparse<P: AsRef<str>>(
280        patterns: &[P],
281    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
282        Builder::new().build_many_sparse(patterns)
283    }
284}
285
286/// Convenience routines for regex construction.
287impl Regex<dense::DFA<&'static [u32]>> {
288    /// Return a builder for configuring the construction of a `Regex`.
289    ///
290    /// This is a convenience routine to avoid needing to import the
291    /// [`Builder`] type in common cases.
292    ///
293    /// # Example
294    ///
295    /// This example shows how to use the builder to disable UTF-8 mode
296    /// everywhere.
297    ///
298    /// ```
299    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
300    /// use regex_automata::{
301    ///     dfa::regex::Regex, nfa::thompson, util::syntax, Match,
302    /// };
303    ///
304    /// let re = Regex::builder()
305    ///     .syntax(syntax::Config::new().utf8(false))
306    ///     .thompson(thompson::Config::new().utf8(false))
307    ///     .build(r"foo(?-u:[^b])ar.*")?;
308    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
309    /// let expected = Some(Match::must(0, 1..9));
310    /// let got = re.find(haystack);
311    /// assert_eq!(expected, got);
312    ///
313    /// # Ok::<(), Box<dyn std::error::Error>>(())
314    /// ```
315    pub fn builder() -> Builder {
316        Builder::new()
317    }
318}
319
320/// Standard search routines for finding and iterating over matches.
321impl<A: Automaton> Regex<A> {
322    /// Returns true if and only if this regex matches the given haystack.
323    ///
324    /// This routine may short circuit if it knows that scanning future input
325    /// will never lead to a different result. In particular, if the underlying
326    /// DFA enters a match state or a dead state, then this routine will return
327    /// `true` or `false`, respectively, without inspecting any future input.
328    ///
329    /// # Panics
330    ///
331    /// This routine panics if the search could not complete. This can occur
332    /// in a number of circumstances:
333    ///
334    /// * The configuration of the DFA may permit it to "quit" the search.
335    /// For example, setting quit bytes or enabling heuristic support for
336    /// Unicode word boundaries. The default configuration does not enable any
337    /// option that could result in the DFA quitting.
338    /// * When the provided `Input` configuration is not supported. For
339    /// example, by providing an unsupported anchor mode.
340    ///
341    /// When a search panics, callers cannot know whether a match exists or
342    /// not.
343    ///
344    /// Use [`Regex::try_search`] if you want to handle these error conditions.
345    ///
346    /// # Example
347    ///
348    /// ```
349    /// use regex_automata::dfa::regex::Regex;
350    ///
351    /// let re = Regex::new("foo[0-9]+bar")?;
352    /// assert_eq!(true, re.is_match("foo12345bar"));
353    /// assert_eq!(false, re.is_match("foobar"));
354    /// # Ok::<(), Box<dyn std::error::Error>>(())
355    /// ```
356    #[inline]
357    pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
358        // Not only can we do an "earliest" search, but we can avoid doing a
359        // reverse scan too.
360        let input = input.into().earliest(true);
361        self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap()
362    }
363
364    /// Returns the start and end offset of the leftmost match. If no match
365    /// exists, then `None` is returned.
366    ///
367    /// # Panics
368    ///
369    /// This routine panics if the search could not complete. This can occur
370    /// in a number of circumstances:
371    ///
372    /// * The configuration of the DFA may permit it to "quit" the search.
373    /// For example, setting quit bytes or enabling heuristic support for
374    /// Unicode word boundaries. The default configuration does not enable any
375    /// option that could result in the DFA quitting.
376    /// * When the provided `Input` configuration is not supported. For
377    /// example, by providing an unsupported anchor mode.
378    ///
379    /// When a search panics, callers cannot know whether a match exists or
380    /// not.
381    ///
382    /// Use [`Regex::try_search`] if you want to handle these error conditions.
383    ///
384    /// # Example
385    ///
386    /// ```
387    /// use regex_automata::{Match, dfa::regex::Regex};
388    ///
389    /// // Greediness is applied appropriately.
390    /// let re = Regex::new("foo[0-9]+")?;
391    /// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz"));
392    ///
393    /// // Even though a match is found after reading the first byte (`a`),
394    /// // the default leftmost-first match semantics demand that we find the
395    /// // earliest match that prefers earlier parts of the pattern over latter
396    /// // parts.
397    /// let re = Regex::new("abc|a")?;
398    /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc"));
399    /// # Ok::<(), Box<dyn std::error::Error>>(())
400    /// ```
401    #[inline]
402    pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
403        self.try_search(&input.into()).unwrap()
404    }
405
406    /// Returns an iterator over all non-overlapping leftmost matches in the
407    /// given bytes. If no match exists, then the iterator yields no elements.
408    ///
409    /// This corresponds to the "standard" regex search iterator.
410    ///
411    /// # Panics
412    ///
413    /// If the search returns an error during iteration, then iteration
414    /// panics. See [`Regex::find`] for the panic conditions.
415    ///
416    /// Use [`Regex::try_search`] with
417    /// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to
418    /// handle these error conditions.
419    ///
420    /// # Example
421    ///
422    /// ```
423    /// use regex_automata::{Match, dfa::regex::Regex};
424    ///
425    /// let re = Regex::new("foo[0-9]+")?;
426    /// let text = "foo1 foo12 foo123";
427    /// let matches: Vec<Match> = re.find_iter(text).collect();
428    /// assert_eq!(matches, vec![
429    ///     Match::must(0, 0..4),
430    ///     Match::must(0, 5..10),
431    ///     Match::must(0, 11..17),
432    /// ]);
433    /// # Ok::<(), Box<dyn std::error::Error>>(())
434    /// ```
435    #[inline]
436    pub fn find_iter<'r, 'h, I: Into<Input<'h>>>(
437        &'r self,
438        input: I,
439    ) -> FindMatches<'r, 'h, A> {
440        let it = iter::Searcher::new(input.into());
441        FindMatches { re: self, it }
442    }
443}
444
445/// Lower level fallible search routines that permit controlling where the
446/// search starts and ends in a particular sequence.
447impl<A: Automaton> Regex<A> {
448    /// Returns the start and end offset of the leftmost match. If no match
449    /// exists, then `None` is returned.
450    ///
451    /// This is like [`Regex::find`] but with two differences:
452    ///
453    /// 1. It is not generic over `Into<Input>` and instead accepts a
454    /// `&Input`. This permits reusing the same `Input` for multiple searches
455    /// without needing to create a new one. This _may_ help with latency.
456    /// 2. It returns an error if the search could not complete where as
457    /// [`Regex::find`] will panic.
458    ///
459    /// # Errors
460    ///
461    /// This routine errors if the search could not complete. This can occur
462    /// in the following circumstances:
463    ///
464    /// * The configuration of the DFA may permit it to "quit" the search.
465    /// For example, setting quit bytes or enabling heuristic support for
466    /// Unicode word boundaries. The default configuration does not enable any
467    /// option that could result in the DFA quitting.
468    /// * When the provided `Input` configuration is not supported. For
469    /// example, by providing an unsupported anchor mode.
470    ///
471    /// When a search returns an error, callers cannot know whether a match
472    /// exists or not.
473    #[inline]
474    pub fn try_search(
475        &self,
476        input: &Input<'_>,
477    ) -> Result<Option<Match>, MatchError> {
478        let (fwd, rev) = (self.forward(), self.reverse());
479        let end = match fwd.try_search_fwd(input)? {
480            None => return Ok(None),
481            Some(end) => end,
482        };
483        // This special cases an empty match at the beginning of the search. If
484        // our end matches our start, then since a reverse DFA can't match past
485        // the start, it must follow that our starting position is also our end
486        // position. So short circuit and skip the reverse search.
487        if input.start() == end.offset() {
488            return Ok(Some(Match::new(
489                end.pattern(),
490                end.offset()..end.offset(),
491            )));
492        }
493        // We can also skip the reverse search if we know our search was
494        // anchored. This occurs either when the input config is anchored or
495        // when we know the regex itself is anchored. In this case, we know the
496        // start of the match, if one is found, must be the start of the
497        // search.
498        if self.is_anchored(input) {
499            return Ok(Some(Match::new(
500                end.pattern(),
501                input.start()..end.offset(),
502            )));
503        }
504        // N.B. I have tentatively convinced myself that it isn't necessary
505        // to specify the specific pattern for the reverse search since the
506        // reverse search will always find the same pattern to match as the
507        // forward search. But I lack a rigorous proof. Why not just provide
508        // the pattern anyway? Well, if it is needed, then leaving it out
509        // gives us a chance to find a witness. (Also, if we don't need to
510        // specify the pattern, then we don't need to build the reverse DFA
511        // with 'starts_for_each_pattern' enabled.)
512        //
513        // We also need to be careful to disable 'earliest' for the reverse
514        // search, since it could be enabled for the forward search. In the
515        // reverse case, to satisfy "leftmost" criteria, we need to match
516        // as much as we can. We also need to be careful to make the search
517        // anchored. We don't want the reverse search to report any matches
518        // other than the one beginning at the end of our forward search.
519        let revsearch = input
520            .clone()
521            .span(input.start()..end.offset())
522            .anchored(Anchored::Yes)
523            .earliest(false);
524        let start = rev
525            .try_search_rev(&revsearch)?
526            .expect("reverse search must match if forward search does");
527        assert_eq!(
528            start.pattern(),
529            end.pattern(),
530            "forward and reverse search must match same pattern",
531        );
532        assert!(start.offset() <= end.offset());
533        Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
534    }
535
536    /// Returns true if either the given input specifies an anchored search
537    /// or if the underlying DFA is always anchored.
538    fn is_anchored(&self, input: &Input<'_>) -> bool {
539        match input.get_anchored() {
540            Anchored::No => self.forward().is_always_start_anchored(),
541            Anchored::Yes | Anchored::Pattern(_) => true,
542        }
543    }
544}
545
546/// Non-search APIs for querying information about the regex and setting a
547/// prefilter.
548impl<A: Automaton> Regex<A> {
549    /// Return the underlying DFA responsible for forward matching.
550    ///
551    /// This is useful for accessing the underlying DFA and converting it to
552    /// some other format or size. See the [`Builder::build_from_dfas`] docs
553    /// for an example of where this might be useful.
554    pub fn forward(&self) -> &A {
555        &self.forward
556    }
557
558    /// Return the underlying DFA responsible for reverse matching.
559    ///
560    /// This is useful for accessing the underlying DFA and converting it to
561    /// some other format or size. See the [`Builder::build_from_dfas`] docs
562    /// for an example of where this might be useful.
563    pub fn reverse(&self) -> &A {
564        &self.reverse
565    }
566
567    /// Returns the total number of patterns matched by this regex.
568    ///
569    /// # Example
570    ///
571    /// ```
572    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
573    /// use regex_automata::dfa::regex::Regex;
574    ///
575    /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
576    /// assert_eq!(3, re.pattern_len());
577    /// # Ok::<(), Box<dyn std::error::Error>>(())
578    /// ```
579    pub fn pattern_len(&self) -> usize {
580        assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
581        self.forward().pattern_len()
582    }
583}
584
585/// An iterator over all non-overlapping matches for an infallible search.
586///
587/// The iterator yields a [`Match`] value until no more matches could be found.
588/// If the underlying regex engine returns an error, then a panic occurs.
589///
590/// The type parameters are as follows:
591///
592/// * `A` represents the type of the underlying DFA that implements the
593/// [`Automaton`] trait.
594///
595/// The lifetime parameters are as follows:
596///
597/// * `'h` represents the lifetime of the haystack being searched.
598/// * `'r` represents the lifetime of the regex object itself.
599///
600/// This iterator can be created with the [`Regex::find_iter`] method.
601#[derive(Debug)]
602pub struct FindMatches<'r, 'h, A> {
603    re: &'r Regex<A>,
604    it: iter::Searcher<'h>,
605}
606
607impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> {
608    type Item = Match;
609
610    #[inline]
611    fn next(&mut self) -> Option<Match> {
612        let FindMatches { re, ref mut it } = *self;
613        it.advance(|input| re.try_search(input))
614    }
615}
616
617/// A builder for a regex based on deterministic finite automatons.
618///
619/// This builder permits configuring options for the syntax of a pattern, the
620/// NFA construction, the DFA construction and finally the regex searching
621/// itself. This builder is different from a general purpose regex builder in
622/// that it permits fine grain configuration of the construction process. The
623/// trade off for this is complexity, and the possibility of setting a
624/// configuration that might not make sense. For example, there are two
625/// different UTF-8 modes:
626///
627/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
628/// whether the pattern itself can contain sub-expressions that match invalid
629/// UTF-8.
630/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls
631/// how the regex iterators themselves advance the starting position of the
632/// next search when a match with zero length is found.
633///
634/// Generally speaking, callers will want to either enable all of these or
635/// disable all of these.
636///
637/// Internally, building a regex requires building two DFAs, where one is
638/// responsible for finding the end of a match and the other is responsible
639/// for finding the start of a match. If you only need to detect whether
640/// something matched, or only the end of a match, then you should use a
641/// [`dense::Builder`] to construct a single DFA, which is cheaper than
642/// building two DFAs.
643///
644/// # Build methods
645///
646/// This builder has a few "build" methods. In general, it's the result of
647/// combining the following parameters:
648///
649/// * Building one or many regexes.
650/// * Building a regex with dense or sparse DFAs.
651///
652/// The simplest "build" method is [`Builder::build`]. It accepts a single
653/// pattern and builds a dense DFA using `usize` for the state identifier
654/// representation.
655///
656/// The most general "build" method is [`Builder::build_many`], which permits
657/// building a regex that searches for multiple patterns simultaneously while
658/// using a specific state identifier representation.
659///
660/// The most flexible "build" method, but hardest to use, is
661/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is
662/// just a pair of DFAs, and this method allows you to specify those DFAs
663/// exactly.
664///
665/// # Example
666///
667/// This example shows how to disable UTF-8 mode in the syntax and the regex
668/// itself. This is generally what you want for matching on arbitrary bytes.
669///
670/// ```
671/// # if cfg!(miri) { return Ok(()); } // miri takes too long
672/// use regex_automata::{
673///     dfa::regex::Regex, nfa::thompson, util::syntax, Match,
674/// };
675///
676/// let re = Regex::builder()
677///     .syntax(syntax::Config::new().utf8(false))
678///     .thompson(thompson::Config::new().utf8(false))
679///     .build(r"foo(?-u:[^b])ar.*")?;
680/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
681/// let expected = Some(Match::must(0, 1..9));
682/// let got = re.find(haystack);
683/// assert_eq!(expected, got);
684/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
685/// // but the subsequent `.*` does not! Disabling UTF-8
686/// // on the syntax permits this.
687/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
688///
689/// # Ok::<(), Box<dyn std::error::Error>>(())
690/// ```
691#[derive(Clone, Debug)]
692pub struct Builder {
693    #[cfg(feature = "dfa-build")]
694    dfa: dense::Builder,
695}
696
697impl Builder {
698    /// Create a new regex builder with the default configuration.
699    pub fn new() -> Builder {
700        Builder {
701            #[cfg(feature = "dfa-build")]
702            dfa: dense::Builder::new(),
703        }
704    }
705
706    /// Build a regex from the given pattern.
707    ///
708    /// If there was a problem parsing or compiling the pattern, then an error
709    /// is returned.
710    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
711    pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
712        self.build_many(&[pattern])
713    }
714
715    /// Build a regex from the given pattern using sparse DFAs.
716    ///
717    /// If there was a problem parsing or compiling the pattern, then an error
718    /// is returned.
719    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
720    pub fn build_sparse(
721        &self,
722        pattern: &str,
723    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
724        self.build_many_sparse(&[pattern])
725    }
726
727    /// Build a regex from the given patterns.
728    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
729    pub fn build_many<P: AsRef<str>>(
730        &self,
731        patterns: &[P],
732    ) -> Result<Regex, BuildError> {
733        let forward = self.dfa.build_many(patterns)?;
734        let reverse = self
735            .dfa
736            .clone()
737            .configure(
738                dense::Config::new()
739                    .prefilter(None)
740                    .specialize_start_states(false)
741                    .start_kind(StartKind::Anchored)
742                    .match_kind(MatchKind::All),
743            )
744            .thompson(crate::nfa::thompson::Config::new().reverse(true))
745            .build_many(patterns)?;
746        Ok(self.build_from_dfas(forward, reverse))
747    }
748
749    /// Build a sparse regex from the given patterns.
750    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
751    pub fn build_many_sparse<P: AsRef<str>>(
752        &self,
753        patterns: &[P],
754    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
755        let re = self.build_many(patterns)?;
756        let forward = re.forward().to_sparse()?;
757        let reverse = re.reverse().to_sparse()?;
758        Ok(self.build_from_dfas(forward, reverse))
759    }
760
761    /// Build a regex from its component forward and reverse DFAs.
762    ///
763    /// This is useful when deserializing a regex from some arbitrary
764    /// memory region. This is also useful for building regexes from other
765    /// types of DFAs.
766    ///
767    /// If you're building the DFAs from scratch instead of building new DFAs
768    /// from other DFAs, then you'll need to make sure that the reverse DFA is
769    /// configured correctly to match the intended semantics. Namely:
770    ///
771    /// * It should be anchored.
772    /// * It should use [`MatchKind::All`] semantics.
773    /// * It should match in reverse.
774    /// * Otherwise, its configuration should match the forward DFA.
775    ///
776    /// If these conditions aren't satisfied, then the behavior of searches is
777    /// unspecified.
778    ///
779    /// Note that when using this constructor, no configuration is applied.
780    /// Since this routine provides the DFAs to the builder, there is no
781    /// opportunity to apply other configuration options.
782    ///
783    /// # Example
784    ///
785    /// This example is a bit a contrived. The usual use of these methods
786    /// would involve serializing `initial_re` somewhere and then deserializing
787    /// it later to build a regex. But in this case, we do everything in
788    /// memory.
789    ///
790    /// ```
791    /// use regex_automata::dfa::regex::Regex;
792    ///
793    /// let initial_re = Regex::new("foo[0-9]+")?;
794    /// assert_eq!(true, initial_re.is_match(b"foo123"));
795    ///
796    /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
797    /// let re = Regex::builder().build_from_dfas(fwd, rev);
798    /// assert_eq!(true, re.is_match(b"foo123"));
799    /// # Ok::<(), Box<dyn std::error::Error>>(())
800    /// ```
801    ///
802    /// This example shows how to build a `Regex` that uses sparse DFAs instead
803    /// of dense DFAs without using one of the convenience `build_sparse`
804    /// routines:
805    ///
806    /// ```
807    /// use regex_automata::dfa::regex::Regex;
808    ///
809    /// let initial_re = Regex::new("foo[0-9]+")?;
810    /// assert_eq!(true, initial_re.is_match(b"foo123"));
811    ///
812    /// let fwd = initial_re.forward().to_sparse()?;
813    /// let rev = initial_re.reverse().to_sparse()?;
814    /// let re = Regex::builder().build_from_dfas(fwd, rev);
815    /// assert_eq!(true, re.is_match(b"foo123"));
816    /// # Ok::<(), Box<dyn std::error::Error>>(())
817    /// ```
818    pub fn build_from_dfas<A: Automaton>(
819        &self,
820        forward: A,
821        reverse: A,
822    ) -> Regex<A> {
823        Regex { forward, reverse }
824    }
825
826    /// Set the syntax configuration for this builder using
827    /// [`syntax::Config`](crate::util::syntax::Config).
828    ///
829    /// This permits setting things like case insensitivity, Unicode and multi
830    /// line mode.
831    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
832    pub fn syntax(
833        &mut self,
834        config: crate::util::syntax::Config,
835    ) -> &mut Builder {
836        self.dfa.syntax(config);
837        self
838    }
839
840    /// Set the Thompson NFA configuration for this builder using
841    /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
842    ///
843    /// This permits setting things like whether additional time should be
844    /// spent shrinking the size of the NFA.
845    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
846    pub fn thompson(
847        &mut self,
848        config: crate::nfa::thompson::Config,
849    ) -> &mut Builder {
850        self.dfa.thompson(config);
851        self
852    }
853
854    /// Set the dense DFA compilation configuration for this builder using
855    /// [`dense::Config`].
856    ///
857    /// This permits setting things like whether the underlying DFAs should
858    /// be minimized.
859    #[cfg(feature = "dfa-build")]
860    pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
861        self.dfa.configure(config);
862        self
863    }
864}
865
866impl Default for Builder {
867    fn default() -> Builder {
868        Builder::new()
869    }
870}
regex_automata/dfa/regex.rs

regex_automata/dfa/
regex.rs