regex_automata/dfa/regex.rs
1/*!
2A DFA-backed `Regex`.
3
4This module provides [`Regex`], which is defined generically over the
5[`Automaton`] trait. A `Regex` implements convenience routines you might have
6come to expect, such as finding the start/end of a match and iterating over
7all non-overlapping matches. This `Regex` type is limited in its capabilities
8to what a DFA can provide. Therefore, APIs involving capturing groups, for
9example, are not provided.
10
11Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
12finds the end offset of a match, where as the other is a "reverse" DFA that
13find the start offset of a match.
14
15See the [parent module](crate::dfa) for examples.
16*/
17
18#[cfg(feature = "alloc")]
19use alloc::vec::Vec;
20
21#[cfg(feature = "dfa-build")]
22use crate::dfa::dense::BuildError;
23use crate::{
24 dfa::{automaton::Automaton, dense},
25 util::{iter, search::Input},
26 Anchored, Match, MatchError,
27};
28#[cfg(feature = "alloc")]
29use crate::{
30 dfa::{sparse, StartKind},
31 util::search::MatchKind,
32};
33
34// When the alloc feature is enabled, the regex type sets its A type parameter
35// to default to an owned dense DFA. But without alloc, we set no default. This
36// makes things a lot more convenient in the common case, since writing out the
37// DFA types is pretty annoying.
38//
39// Since we have two different definitions but only want to write one doc
40// string, we use a macro to capture the doc and other attributes once and then
41// repeat them for each definition.
42macro_rules! define_regex_type {
43 ($(#[$doc:meta])*) => {
44 #[cfg(feature = "alloc")]
45 $(#[$doc])*
46 pub struct Regex<A = dense::OwnedDFA> {
47 forward: A,
48 reverse: A,
49 }
50
51 #[cfg(not(feature = "alloc"))]
52 $(#[$doc])*
53 pub struct Regex<A> {
54 forward: A,
55 reverse: A,
56 }
57 };
58}
59
60define_regex_type!(
61 /// A regular expression that uses deterministic finite automata for fast
62 /// searching.
63 ///
64 /// A regular expression is comprised of two DFAs, a "forward" DFA and a
65 /// "reverse" DFA. The forward DFA is responsible for detecting the end of
66 /// a match while the reverse DFA is responsible for detecting the start
67 /// of a match. Thus, in order to find the bounds of any given match, a
68 /// forward search must first be run followed by a reverse search. A match
69 /// found by the forward DFA guarantees that the reverse DFA will also find
70 /// a match.
71 ///
72 /// The type of the DFA used by a `Regex` corresponds to the `A` type
73 /// parameter, which must satisfy the [`Automaton`] trait. Typically, `A`
74 /// is either a [`dense::DFA`] or a [`sparse::DFA`], where dense DFAs use
75 /// more memory but search faster, while sparse DFAs use less memory but
76 /// search more slowly.
77 ///
78 /// # Crate features
79 ///
80 /// Note that despite what the documentation auto-generates, the _only_
81 /// crate feature needed to use this type is `dfa-search`. You do _not_
82 /// need to enable the `alloc` feature.
83 ///
84 /// By default, a regex's automaton type parameter is set to
85 /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
86 /// in-memory work loads, this is the most convenient type that gives the
87 /// best search performance. When the `alloc` feature is disabled, no
88 /// default type is used.
89 ///
90 /// # When should I use this?
91 ///
92 /// Generally speaking, if you can afford the overhead of building a full
93 /// DFA for your regex, and you don't need things like capturing groups,
94 /// then this is a good choice if you're looking to optimize for matching
95 /// speed. Note however that its speed may be worse than a general purpose
96 /// regex engine if you don't provide a [`dense::Config::prefilter`] to the
97 /// underlying DFA.
98 ///
99 /// # Sparse DFAs
100 ///
101 /// Since a `Regex` is generic over the [`Automaton`] trait, it can be
102 /// used with any kind of DFA. While this crate constructs dense DFAs by
103 /// default, it is easy enough to build corresponding sparse DFAs, and then
104 /// build a regex from them:
105 ///
106 /// ```
107 /// use regex_automata::dfa::regex::Regex;
108 ///
109 /// // First, build a regex that uses dense DFAs.
110 /// let dense_re = Regex::new("foo[0-9]+")?;
111 ///
112 /// // Second, build sparse DFAs from the forward and reverse dense DFAs.
113 /// let fwd = dense_re.forward().to_sparse()?;
114 /// let rev = dense_re.reverse().to_sparse()?;
115 ///
116 /// // Third, build a new regex from the constituent sparse DFAs.
117 /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);
118 ///
119 /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
120 /// assert_eq!(true, sparse_re.is_match(b"foo123"));
121 ///
122 /// # Ok::<(), Box<dyn std::error::Error>>(())
123 /// ```
124 ///
125 /// Alternatively, one can use a [`Builder`] to construct a sparse DFA
126 /// more succinctly. (Note though that dense DFAs are still constructed
127 /// first internally, and then converted to sparse DFAs, as in the example
128 /// above.)
129 ///
130 /// ```
131 /// use regex_automata::dfa::regex::Regex;
132 ///
133 /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;
134 /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
135 /// assert!(sparse_re.is_match(b"foo123"));
136 ///
137 /// # Ok::<(), Box<dyn std::error::Error>>(())
138 /// ```
139 ///
140 /// # Fallibility
141 ///
142 /// Most of the search routines defined on this type will _panic_ when the
143 /// underlying search fails. This might be because the DFA gave up because
144 /// it saw a quit byte, whether configured explicitly or via heuristic
145 /// Unicode word boundary support, although neither are enabled by default.
146 /// Or it might fail because an invalid `Input` configuration is given,
147 /// for example, with an unsupported [`Anchored`] mode.
148 ///
149 /// If you need to handle these error cases instead of allowing them to
150 /// trigger a panic, then the lower level [`Regex::try_search`] provides
151 /// a fallible API that never panics.
152 ///
153 /// # Example
154 ///
155 /// This example shows how to cause a search to terminate if it sees a
156 /// `\n` byte, and handle the error returned. This could be useful if, for
157 /// example, you wanted to prevent a user supplied pattern from matching
158 /// across a line boundary.
159 ///
160 /// ```
161 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
162 /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError};
163 ///
164 /// let re = Regex::builder()
165 /// .dense(dfa::dense::Config::new().quit(b'\n', true))
166 /// .build(r"foo\p{any}+bar")?;
167 ///
168 /// let input = Input::new("foo\nbar");
169 /// // Normally this would produce a match, since \p{any} contains '\n'.
170 /// // But since we instructed the automaton to enter a quit state if a
171 /// // '\n' is observed, this produces a match error instead.
172 /// let expected = MatchError::quit(b'\n', 3);
173 /// let got = re.try_search(&input).unwrap_err();
174 /// assert_eq!(expected, got);
175 ///
176 /// # Ok::<(), Box<dyn std::error::Error>>(())
177 /// ```
178 #[derive(Clone, Debug)]
179);
180
181#[cfg(all(feature = "syntax", feature = "dfa-build"))]
182impl Regex {
183 /// Parse the given regular expression using the default configuration and
184 /// return the corresponding regex.
185 ///
186 /// If you want a non-default configuration, then use the [`Builder`] to
187 /// set your own configuration.
188 ///
189 /// # Example
190 ///
191 /// ```
192 /// use regex_automata::{Match, dfa::regex::Regex};
193 ///
194 /// let re = Regex::new("foo[0-9]+bar")?;
195 /// assert_eq!(
196 /// Some(Match::must(0, 3..14)),
197 /// re.find(b"zzzfoo12345barzzz"),
198 /// );
199 /// # Ok::<(), Box<dyn std::error::Error>>(())
200 /// ```
201 pub fn new(pattern: &str) -> Result<Regex, BuildError> {
202 Builder::new().build(pattern)
203 }
204
205 /// Like `new`, but parses multiple patterns into a single "regex set."
206 /// This similarly uses the default regex configuration.
207 ///
208 /// # Example
209 ///
210 /// ```
211 /// use regex_automata::{Match, dfa::regex::Regex};
212 ///
213 /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
214 ///
215 /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
216 /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
217 /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
218 /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
219 /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
220 /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
221 /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
222 /// assert_eq!(None, it.next());
223 /// # Ok::<(), Box<dyn std::error::Error>>(())
224 /// ```
225 pub fn new_many<P: AsRef<str>>(
226 patterns: &[P],
227 ) -> Result<Regex, BuildError> {
228 Builder::new().build_many(patterns)
229 }
230}
231
232#[cfg(all(feature = "syntax", feature = "dfa-build"))]
233impl Regex<sparse::DFA<Vec<u8>>> {
234 /// Parse the given regular expression using the default configuration,
235 /// except using sparse DFAs, and return the corresponding regex.
236 ///
237 /// If you want a non-default configuration, then use the [`Builder`] to
238 /// set your own configuration.
239 ///
240 /// # Example
241 ///
242 /// ```
243 /// use regex_automata::{Match, dfa::regex::Regex};
244 ///
245 /// let re = Regex::new_sparse("foo[0-9]+bar")?;
246 /// assert_eq!(
247 /// Some(Match::must(0, 3..14)),
248 /// re.find(b"zzzfoo12345barzzz"),
249 /// );
250 /// # Ok::<(), Box<dyn std::error::Error>>(())
251 /// ```
252 pub fn new_sparse(
253 pattern: &str,
254 ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
255 Builder::new().build_sparse(pattern)
256 }
257
258 /// Like `new`, but parses multiple patterns into a single "regex set"
259 /// using sparse DFAs. This otherwise similarly uses the default regex
260 /// configuration.
261 ///
262 /// # Example
263 ///
264 /// ```
265 /// use regex_automata::{Match, dfa::regex::Regex};
266 ///
267 /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
268 ///
269 /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
270 /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
271 /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
272 /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
273 /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
274 /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
275 /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
276 /// assert_eq!(None, it.next());
277 /// # Ok::<(), Box<dyn std::error::Error>>(())
278 /// ```
279 pub fn new_many_sparse<P: AsRef<str>>(
280 patterns: &[P],
281 ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
282 Builder::new().build_many_sparse(patterns)
283 }
284}
285
286/// Convenience routines for regex construction.
287impl Regex<dense::DFA<&'static [u32]>> {
288 /// Return a builder for configuring the construction of a `Regex`.
289 ///
290 /// This is a convenience routine to avoid needing to import the
291 /// [`Builder`] type in common cases.
292 ///
293 /// # Example
294 ///
295 /// This example shows how to use the builder to disable UTF-8 mode
296 /// everywhere.
297 ///
298 /// ```
299 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
300 /// use regex_automata::{
301 /// dfa::regex::Regex, nfa::thompson, util::syntax, Match,
302 /// };
303 ///
304 /// let re = Regex::builder()
305 /// .syntax(syntax::Config::new().utf8(false))
306 /// .thompson(thompson::Config::new().utf8(false))
307 /// .build(r"foo(?-u:[^b])ar.*")?;
308 /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
309 /// let expected = Some(Match::must(0, 1..9));
310 /// let got = re.find(haystack);
311 /// assert_eq!(expected, got);
312 ///
313 /// # Ok::<(), Box<dyn std::error::Error>>(())
314 /// ```
315 pub fn builder() -> Builder {
316 Builder::new()
317 }
318}
319
320/// Standard search routines for finding and iterating over matches.
321impl<A: Automaton> Regex<A> {
322 /// Returns true if and only if this regex matches the given haystack.
323 ///
324 /// This routine may short circuit if it knows that scanning future input
325 /// will never lead to a different result. In particular, if the underlying
326 /// DFA enters a match state or a dead state, then this routine will return
327 /// `true` or `false`, respectively, without inspecting any future input.
328 ///
329 /// # Panics
330 ///
331 /// This routine panics if the search could not complete. This can occur
332 /// in a number of circumstances:
333 ///
334 /// * The configuration of the DFA may permit it to "quit" the search.
335 /// For example, setting quit bytes or enabling heuristic support for
336 /// Unicode word boundaries. The default configuration does not enable any
337 /// option that could result in the DFA quitting.
338 /// * When the provided `Input` configuration is not supported. For
339 /// example, by providing an unsupported anchor mode.
340 ///
341 /// When a search panics, callers cannot know whether a match exists or
342 /// not.
343 ///
344 /// Use [`Regex::try_search`] if you want to handle these error conditions.
345 ///
346 /// # Example
347 ///
348 /// ```
349 /// use regex_automata::dfa::regex::Regex;
350 ///
351 /// let re = Regex::new("foo[0-9]+bar")?;
352 /// assert_eq!(true, re.is_match("foo12345bar"));
353 /// assert_eq!(false, re.is_match("foobar"));
354 /// # Ok::<(), Box<dyn std::error::Error>>(())
355 /// ```
356 #[inline]
357 pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
358 // Not only can we do an "earliest" search, but we can avoid doing a
359 // reverse scan too.
360 let input = input.into().earliest(true);
361 self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap()
362 }
363
364 /// Returns the start and end offset of the leftmost match. If no match
365 /// exists, then `None` is returned.
366 ///
367 /// # Panics
368 ///
369 /// This routine panics if the search could not complete. This can occur
370 /// in a number of circumstances:
371 ///
372 /// * The configuration of the DFA may permit it to "quit" the search.
373 /// For example, setting quit bytes or enabling heuristic support for
374 /// Unicode word boundaries. The default configuration does not enable any
375 /// option that could result in the DFA quitting.
376 /// * When the provided `Input` configuration is not supported. For
377 /// example, by providing an unsupported anchor mode.
378 ///
379 /// When a search panics, callers cannot know whether a match exists or
380 /// not.
381 ///
382 /// Use [`Regex::try_search`] if you want to handle these error conditions.
383 ///
384 /// # Example
385 ///
386 /// ```
387 /// use regex_automata::{Match, dfa::regex::Regex};
388 ///
389 /// // Greediness is applied appropriately.
390 /// let re = Regex::new("foo[0-9]+")?;
391 /// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz"));
392 ///
393 /// // Even though a match is found after reading the first byte (`a`),
394 /// // the default leftmost-first match semantics demand that we find the
395 /// // earliest match that prefers earlier parts of the pattern over latter
396 /// // parts.
397 /// let re = Regex::new("abc|a")?;
398 /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc"));
399 /// # Ok::<(), Box<dyn std::error::Error>>(())
400 /// ```
401 #[inline]
402 pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
403 self.try_search(&input.into()).unwrap()
404 }
405
406 /// Returns an iterator over all non-overlapping leftmost matches in the
407 /// given bytes. If no match exists, then the iterator yields no elements.
408 ///
409 /// This corresponds to the "standard" regex search iterator.
410 ///
411 /// # Panics
412 ///
413 /// If the search returns an error during iteration, then iteration
414 /// panics. See [`Regex::find`] for the panic conditions.
415 ///
416 /// Use [`Regex::try_search`] with
417 /// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to
418 /// handle these error conditions.
419 ///
420 /// # Example
421 ///
422 /// ```
423 /// use regex_automata::{Match, dfa::regex::Regex};
424 ///
425 /// let re = Regex::new("foo[0-9]+")?;
426 /// let text = "foo1 foo12 foo123";
427 /// let matches: Vec<Match> = re.find_iter(text).collect();
428 /// assert_eq!(matches, vec![
429 /// Match::must(0, 0..4),
430 /// Match::must(0, 5..10),
431 /// Match::must(0, 11..17),
432 /// ]);
433 /// # Ok::<(), Box<dyn std::error::Error>>(())
434 /// ```
435 #[inline]
436 pub fn find_iter<'r, 'h, I: Into<Input<'h>>>(
437 &'r self,
438 input: I,
439 ) -> FindMatches<'r, 'h, A> {
440 let it = iter::Searcher::new(input.into());
441 FindMatches { re: self, it }
442 }
443}
444
445/// Lower level fallible search routines that permit controlling where the
446/// search starts and ends in a particular sequence.
447impl<A: Automaton> Regex<A> {
448 /// Returns the start and end offset of the leftmost match. If no match
449 /// exists, then `None` is returned.
450 ///
451 /// This is like [`Regex::find`] but with two differences:
452 ///
453 /// 1. It is not generic over `Into<Input>` and instead accepts a
454 /// `&Input`. This permits reusing the same `Input` for multiple searches
455 /// without needing to create a new one. This _may_ help with latency.
456 /// 2. It returns an error if the search could not complete where as
457 /// [`Regex::find`] will panic.
458 ///
459 /// # Errors
460 ///
461 /// This routine errors if the search could not complete. This can occur
462 /// in the following circumstances:
463 ///
464 /// * The configuration of the DFA may permit it to "quit" the search.
465 /// For example, setting quit bytes or enabling heuristic support for
466 /// Unicode word boundaries. The default configuration does not enable any
467 /// option that could result in the DFA quitting.
468 /// * When the provided `Input` configuration is not supported. For
469 /// example, by providing an unsupported anchor mode.
470 ///
471 /// When a search returns an error, callers cannot know whether a match
472 /// exists or not.
473 #[inline]
474 pub fn try_search(
475 &self,
476 input: &Input<'_>,
477 ) -> Result<Option<Match>, MatchError> {
478 let (fwd, rev) = (self.forward(), self.reverse());
479 let end = match fwd.try_search_fwd(input)? {
480 None => return Ok(None),
481 Some(end) => end,
482 };
483 // This special cases an empty match at the beginning of the search. If
484 // our end matches our start, then since a reverse DFA can't match past
485 // the start, it must follow that our starting position is also our end
486 // position. So short circuit and skip the reverse search.
487 if input.start() == end.offset() {
488 return Ok(Some(Match::new(
489 end.pattern(),
490 end.offset()..end.offset(),
491 )));
492 }
493 // We can also skip the reverse search if we know our search was
494 // anchored. This occurs either when the input config is anchored or
495 // when we know the regex itself is anchored. In this case, we know the
496 // start of the match, if one is found, must be the start of the
497 // search.
498 if self.is_anchored(input) {
499 return Ok(Some(Match::new(
500 end.pattern(),
501 input.start()..end.offset(),
502 )));
503 }
504 // N.B. I have tentatively convinced myself that it isn't necessary
505 // to specify the specific pattern for the reverse search since the
506 // reverse search will always find the same pattern to match as the
507 // forward search. But I lack a rigorous proof. Why not just provide
508 // the pattern anyway? Well, if it is needed, then leaving it out
509 // gives us a chance to find a witness. (Also, if we don't need to
510 // specify the pattern, then we don't need to build the reverse DFA
511 // with 'starts_for_each_pattern' enabled.)
512 //
513 // We also need to be careful to disable 'earliest' for the reverse
514 // search, since it could be enabled for the forward search. In the
515 // reverse case, to satisfy "leftmost" criteria, we need to match
516 // as much as we can. We also need to be careful to make the search
517 // anchored. We don't want the reverse search to report any matches
518 // other than the one beginning at the end of our forward search.
519 let revsearch = input
520 .clone()
521 .span(input.start()..end.offset())
522 .anchored(Anchored::Yes)
523 .earliest(false);
524 let start = rev
525 .try_search_rev(&revsearch)?
526 .expect("reverse search must match if forward search does");
527 assert_eq!(
528 start.pattern(),
529 end.pattern(),
530 "forward and reverse search must match same pattern",
531 );
532 assert!(start.offset() <= end.offset());
533 Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
534 }
535
536 /// Returns true if either the given input specifies an anchored search
537 /// or if the underlying DFA is always anchored.
538 fn is_anchored(&self, input: &Input<'_>) -> bool {
539 match input.get_anchored() {
540 Anchored::No => self.forward().is_always_start_anchored(),
541 Anchored::Yes | Anchored::Pattern(_) => true,
542 }
543 }
544}
545
546/// Non-search APIs for querying information about the regex and setting a
547/// prefilter.
548impl<A: Automaton> Regex<A> {
549 /// Return the underlying DFA responsible for forward matching.
550 ///
551 /// This is useful for accessing the underlying DFA and converting it to
552 /// some other format or size. See the [`Builder::build_from_dfas`] docs
553 /// for an example of where this might be useful.
554 pub fn forward(&self) -> &A {
555 &self.forward
556 }
557
558 /// Return the underlying DFA responsible for reverse matching.
559 ///
560 /// This is useful for accessing the underlying DFA and converting it to
561 /// some other format or size. See the [`Builder::build_from_dfas`] docs
562 /// for an example of where this might be useful.
563 pub fn reverse(&self) -> &A {
564 &self.reverse
565 }
566
567 /// Returns the total number of patterns matched by this regex.
568 ///
569 /// # Example
570 ///
571 /// ```
572 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
573 /// use regex_automata::dfa::regex::Regex;
574 ///
575 /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
576 /// assert_eq!(3, re.pattern_len());
577 /// # Ok::<(), Box<dyn std::error::Error>>(())
578 /// ```
579 pub fn pattern_len(&self) -> usize {
580 assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
581 self.forward().pattern_len()
582 }
583}
584
585/// An iterator over all non-overlapping matches for an infallible search.
586///
587/// The iterator yields a [`Match`] value until no more matches could be found.
588/// If the underlying regex engine returns an error, then a panic occurs.
589///
590/// The type parameters are as follows:
591///
592/// * `A` represents the type of the underlying DFA that implements the
593/// [`Automaton`] trait.
594///
595/// The lifetime parameters are as follows:
596///
597/// * `'h` represents the lifetime of the haystack being searched.
598/// * `'r` represents the lifetime of the regex object itself.
599///
600/// This iterator can be created with the [`Regex::find_iter`] method.
601#[derive(Debug)]
602pub struct FindMatches<'r, 'h, A> {
603 re: &'r Regex<A>,
604 it: iter::Searcher<'h>,
605}
606
607impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> {
608 type Item = Match;
609
610 #[inline]
611 fn next(&mut self) -> Option<Match> {
612 let FindMatches { re, ref mut it } = *self;
613 it.advance(|input| re.try_search(input))
614 }
615}
616
617/// A builder for a regex based on deterministic finite automatons.
618///
619/// This builder permits configuring options for the syntax of a pattern, the
620/// NFA construction, the DFA construction and finally the regex searching
621/// itself. This builder is different from a general purpose regex builder in
622/// that it permits fine grain configuration of the construction process. The
623/// trade off for this is complexity, and the possibility of setting a
624/// configuration that might not make sense. For example, there are two
625/// different UTF-8 modes:
626///
627/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
628/// whether the pattern itself can contain sub-expressions that match invalid
629/// UTF-8.
630/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls
631/// how the regex iterators themselves advance the starting position of the
632/// next search when a match with zero length is found.
633///
634/// Generally speaking, callers will want to either enable all of these or
635/// disable all of these.
636///
637/// Internally, building a regex requires building two DFAs, where one is
638/// responsible for finding the end of a match and the other is responsible
639/// for finding the start of a match. If you only need to detect whether
640/// something matched, or only the end of a match, then you should use a
641/// [`dense::Builder`] to construct a single DFA, which is cheaper than
642/// building two DFAs.
643///
644/// # Build methods
645///
646/// This builder has a few "build" methods. In general, it's the result of
647/// combining the following parameters:
648///
649/// * Building one or many regexes.
650/// * Building a regex with dense or sparse DFAs.
651///
652/// The simplest "build" method is [`Builder::build`]. It accepts a single
653/// pattern and builds a dense DFA using `usize` for the state identifier
654/// representation.
655///
656/// The most general "build" method is [`Builder::build_many`], which permits
657/// building a regex that searches for multiple patterns simultaneously while
658/// using a specific state identifier representation.
659///
660/// The most flexible "build" method, but hardest to use, is
661/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is
662/// just a pair of DFAs, and this method allows you to specify those DFAs
663/// exactly.
664///
665/// # Example
666///
667/// This example shows how to disable UTF-8 mode in the syntax and the regex
668/// itself. This is generally what you want for matching on arbitrary bytes.
669///
670/// ```
671/// # if cfg!(miri) { return Ok(()); } // miri takes too long
672/// use regex_automata::{
673/// dfa::regex::Regex, nfa::thompson, util::syntax, Match,
674/// };
675///
676/// let re = Regex::builder()
677/// .syntax(syntax::Config::new().utf8(false))
678/// .thompson(thompson::Config::new().utf8(false))
679/// .build(r"foo(?-u:[^b])ar.*")?;
680/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
681/// let expected = Some(Match::must(0, 1..9));
682/// let got = re.find(haystack);
683/// assert_eq!(expected, got);
684/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
685/// // but the subsequent `.*` does not! Disabling UTF-8
686/// // on the syntax permits this.
687/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
688///
689/// # Ok::<(), Box<dyn std::error::Error>>(())
690/// ```
691#[derive(Clone, Debug)]
692pub struct Builder {
693 #[cfg(feature = "dfa-build")]
694 dfa: dense::Builder,
695}
696
697impl Builder {
698 /// Create a new regex builder with the default configuration.
699 pub fn new() -> Builder {
700 Builder {
701 #[cfg(feature = "dfa-build")]
702 dfa: dense::Builder::new(),
703 }
704 }
705
706 /// Build a regex from the given pattern.
707 ///
708 /// If there was a problem parsing or compiling the pattern, then an error
709 /// is returned.
710 #[cfg(all(feature = "syntax", feature = "dfa-build"))]
711 pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
712 self.build_many(&[pattern])
713 }
714
715 /// Build a regex from the given pattern using sparse DFAs.
716 ///
717 /// If there was a problem parsing or compiling the pattern, then an error
718 /// is returned.
719 #[cfg(all(feature = "syntax", feature = "dfa-build"))]
720 pub fn build_sparse(
721 &self,
722 pattern: &str,
723 ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
724 self.build_many_sparse(&[pattern])
725 }
726
727 /// Build a regex from the given patterns.
728 #[cfg(all(feature = "syntax", feature = "dfa-build"))]
729 pub fn build_many<P: AsRef<str>>(
730 &self,
731 patterns: &[P],
732 ) -> Result<Regex, BuildError> {
733 let forward = self.dfa.build_many(patterns)?;
734 let reverse = self
735 .dfa
736 .clone()
737 .configure(
738 dense::Config::new()
739 .prefilter(None)
740 .specialize_start_states(false)
741 .start_kind(StartKind::Anchored)
742 .match_kind(MatchKind::All),
743 )
744 .thompson(crate::nfa::thompson::Config::new().reverse(true))
745 .build_many(patterns)?;
746 Ok(self.build_from_dfas(forward, reverse))
747 }
748
749 /// Build a sparse regex from the given patterns.
750 #[cfg(all(feature = "syntax", feature = "dfa-build"))]
751 pub fn build_many_sparse<P: AsRef<str>>(
752 &self,
753 patterns: &[P],
754 ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
755 let re = self.build_many(patterns)?;
756 let forward = re.forward().to_sparse()?;
757 let reverse = re.reverse().to_sparse()?;
758 Ok(self.build_from_dfas(forward, reverse))
759 }
760
761 /// Build a regex from its component forward and reverse DFAs.
762 ///
763 /// This is useful when deserializing a regex from some arbitrary
764 /// memory region. This is also useful for building regexes from other
765 /// types of DFAs.
766 ///
767 /// If you're building the DFAs from scratch instead of building new DFAs
768 /// from other DFAs, then you'll need to make sure that the reverse DFA is
769 /// configured correctly to match the intended semantics. Namely:
770 ///
771 /// * It should be anchored.
772 /// * It should use [`MatchKind::All`] semantics.
773 /// * It should match in reverse.
774 /// * Otherwise, its configuration should match the forward DFA.
775 ///
776 /// If these conditions aren't satisfied, then the behavior of searches is
777 /// unspecified.
778 ///
779 /// Note that when using this constructor, no configuration is applied.
780 /// Since this routine provides the DFAs to the builder, there is no
781 /// opportunity to apply other configuration options.
782 ///
783 /// # Example
784 ///
785 /// This example is a bit a contrived. The usual use of these methods
786 /// would involve serializing `initial_re` somewhere and then deserializing
787 /// it later to build a regex. But in this case, we do everything in
788 /// memory.
789 ///
790 /// ```
791 /// use regex_automata::dfa::regex::Regex;
792 ///
793 /// let initial_re = Regex::new("foo[0-9]+")?;
794 /// assert_eq!(true, initial_re.is_match(b"foo123"));
795 ///
796 /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
797 /// let re = Regex::builder().build_from_dfas(fwd, rev);
798 /// assert_eq!(true, re.is_match(b"foo123"));
799 /// # Ok::<(), Box<dyn std::error::Error>>(())
800 /// ```
801 ///
802 /// This example shows how to build a `Regex` that uses sparse DFAs instead
803 /// of dense DFAs without using one of the convenience `build_sparse`
804 /// routines:
805 ///
806 /// ```
807 /// use regex_automata::dfa::regex::Regex;
808 ///
809 /// let initial_re = Regex::new("foo[0-9]+")?;
810 /// assert_eq!(true, initial_re.is_match(b"foo123"));
811 ///
812 /// let fwd = initial_re.forward().to_sparse()?;
813 /// let rev = initial_re.reverse().to_sparse()?;
814 /// let re = Regex::builder().build_from_dfas(fwd, rev);
815 /// assert_eq!(true, re.is_match(b"foo123"));
816 /// # Ok::<(), Box<dyn std::error::Error>>(())
817 /// ```
818 pub fn build_from_dfas<A: Automaton>(
819 &self,
820 forward: A,
821 reverse: A,
822 ) -> Regex<A> {
823 Regex { forward, reverse }
824 }
825
826 /// Set the syntax configuration for this builder using
827 /// [`syntax::Config`](crate::util::syntax::Config).
828 ///
829 /// This permits setting things like case insensitivity, Unicode and multi
830 /// line mode.
831 #[cfg(all(feature = "syntax", feature = "dfa-build"))]
832 pub fn syntax(
833 &mut self,
834 config: crate::util::syntax::Config,
835 ) -> &mut Builder {
836 self.dfa.syntax(config);
837 self
838 }
839
840 /// Set the Thompson NFA configuration for this builder using
841 /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
842 ///
843 /// This permits setting things like whether additional time should be
844 /// spent shrinking the size of the NFA.
845 #[cfg(all(feature = "syntax", feature = "dfa-build"))]
846 pub fn thompson(
847 &mut self,
848 config: crate::nfa::thompson::Config,
849 ) -> &mut Builder {
850 self.dfa.thompson(config);
851 self
852 }
853
854 /// Set the dense DFA compilation configuration for this builder using
855 /// [`dense::Config`].
856 ///
857 /// This permits setting things like whether the underlying DFAs should
858 /// be minimized.
859 #[cfg(feature = "dfa-build")]
860 pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
861 self.dfa.configure(config);
862 self
863 }
864}
865
866impl Default for Builder {
867 fn default() -> Builder {
868 Builder::new()
869 }
870}