Coverage Report

Created: 2025-06-23 13:53

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/build/cargo-vendor-dir/regex-automata-0.4.9/src/hybrid/regex.rs
Line
Count
Source
1
/*!
2
A lazy DFA backed `Regex`.
3
4
This module provides a [`Regex`] backed by a lazy DFA. A `Regex` implements
5
convenience routines you might have come to expect, such as finding a match
6
and iterating over all non-overlapping matches. This `Regex` type is limited
7
in its capabilities to what a lazy DFA can provide. Therefore, APIs involving
8
capturing groups, for example, are not provided.
9
10
Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
11
finds the end offset of a match, where as the other is a "reverse" DFA that
12
find the start offset of a match.
13
14
See the [parent module](crate::hybrid) for examples.
15
*/
16
17
use crate::{
18
    hybrid::{
19
        dfa::{self, DFA},
20
        error::BuildError,
21
    },
22
    nfa::thompson,
23
    util::{
24
        iter,
25
        search::{Anchored, Input, Match, MatchError, MatchKind},
26
    },
27
};
28
29
/// A regular expression that uses hybrid NFA/DFAs (also called "lazy DFAs")
30
/// for searching.
31
///
32
/// A regular expression is comprised of two lazy DFAs, a "forward" DFA and a
33
/// "reverse" DFA. The forward DFA is responsible for detecting the end of
34
/// a match while the reverse DFA is responsible for detecting the start
35
/// of a match. Thus, in order to find the bounds of any given match, a
36
/// forward search must first be run followed by a reverse search. A match
37
/// found by the forward DFA guarantees that the reverse DFA will also find
38
/// a match.
39
///
40
/// # Fallibility
41
///
42
/// Most of the search routines defined on this type will _panic_ when the
43
/// underlying search fails. This might be because the DFA gave up because it
44
/// saw a quit byte, whether configured explicitly or via heuristic Unicode
45
/// word boundary support, although neither are enabled by default. It might
46
/// also fail if the underlying DFA determines it isn't making effective use of
47
/// the cache (which also never happens by default). Or it might fail because
48
/// an invalid `Input` configuration is given, for example, with an unsupported
49
/// [`Anchored`] mode.
50
///
51
/// If you need to handle these error cases instead of allowing them to trigger
52
/// a panic, then the lower level [`Regex::try_search`] provides a fallible API
53
/// that never panics.
54
///
55
/// # Example
56
///
57
/// This example shows how to cause a search to terminate if it sees a
58
/// `\n` byte, and handle the error returned. This could be useful if, for
59
/// example, you wanted to prevent a user supplied pattern from matching
60
/// across a line boundary.
61
///
62
/// ```
63
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
64
/// use regex_automata::{hybrid::{dfa, regex::Regex}, Input, MatchError};
65
///
66
/// let re = Regex::builder()
67
///     .dfa(dfa::Config::new().quit(b'\n', true))
68
///     .build(r"foo\p{any}+bar")?;
69
/// let mut cache = re.create_cache();
70
///
71
/// let input = Input::new("foo\nbar");
72
/// // Normally this would produce a match, since \p{any} contains '\n'.
73
/// // But since we instructed the automaton to enter a quit state if a
74
/// // '\n' is observed, this produces a match error instead.
75
/// let expected = MatchError::quit(b'\n', 3);
76
/// let got = re.try_search(&mut cache, &input).unwrap_err();
77
/// assert_eq!(expected, got);
78
///
79
/// # Ok::<(), Box<dyn std::error::Error>>(())
80
/// ```
81
#[derive(Debug)]
82
pub struct Regex {
83
    /// The forward lazy DFA. This can only find the end of a match.
84
    forward: DFA,
85
    /// The reverse lazy DFA. This can only find the start of a match.
86
    ///
87
    /// This is built with 'all' match semantics (instead of leftmost-first)
88
    /// so that it always finds the longest possible match (which corresponds
89
    /// to the leftmost starting position). It is also compiled as an anchored
90
    /// matcher and has 'starts_for_each_pattern' enabled. Including starting
91
    /// states for each pattern is necessary to ensure that we only look for
92
    /// matches of a pattern that matched in the forward direction. Otherwise,
93
    /// we might wind up finding the "leftmost" starting position of a totally
94
    /// different pattern!
95
    reverse: DFA,
96
}
97
98
/// Convenience routines for regex and cache construction.
99
impl Regex {
100
    /// Parse the given regular expression using the default configuration and
101
    /// return the corresponding regex.
102
    ///
103
    /// If you want a non-default configuration, then use the [`Builder`] to
104
    /// set your own configuration.
105
    ///
106
    /// # Example
107
    ///
108
    /// ```
109
    /// use regex_automata::{hybrid::regex::Regex, Match};
110
    ///
111
    /// let re = Regex::new("foo[0-9]+bar")?;
112
    /// let mut cache = re.create_cache();
113
    /// assert_eq!(
114
    ///     Some(Match::must(0, 3..14)),
115
    ///     re.find(&mut cache, "zzzfoo12345barzzz"),
116
    /// );
117
    /// # Ok::<(), Box<dyn std::error::Error>>(())
118
    /// ```
119
    #[cfg(feature = "syntax")]
120
0
    pub fn new(pattern: &str) -> Result<Regex, BuildError> {
121
0
        Regex::builder().build(pattern)
122
0
    }
123
124
    /// Like `new`, but parses multiple patterns into a single "multi regex."
125
    /// This similarly uses the default regex configuration.
126
    ///
127
    /// # Example
128
    ///
129
    /// ```
130
    /// use regex_automata::{hybrid::regex::Regex, Match};
131
    ///
132
    /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
133
    /// let mut cache = re.create_cache();
134
    ///
135
    /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux");
136
    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
137
    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
138
    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
139
    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
140
    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
141
    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
142
    /// assert_eq!(None, it.next());
143
    /// # Ok::<(), Box<dyn std::error::Error>>(())
144
    /// ```
145
    #[cfg(feature = "syntax")]
146
0
    pub fn new_many<P: AsRef<str>>(
147
0
        patterns: &[P],
148
0
    ) -> Result<Regex, BuildError> {
149
0
        Regex::builder().build_many(patterns)
150
0
    }
151
152
    /// Return a builder for configuring the construction of a `Regex`.
153
    ///
154
    /// This is a convenience routine to avoid needing to import the
155
    /// [`Builder`] type in common cases.
156
    ///
157
    /// # Example
158
    ///
159
    /// This example shows how to use the builder to disable UTF-8 mode
160
    /// everywhere.
161
    ///
162
    /// ```
163
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
164
    /// use regex_automata::{
165
    ///     hybrid::regex::Regex, nfa::thompson, util::syntax, Match,
166
    /// };
167
    ///
168
    /// let re = Regex::builder()
169
    ///     .syntax(syntax::Config::new().utf8(false))
170
    ///     .thompson(thompson::Config::new().utf8(false))
171
    ///     .build(r"foo(?-u:[^b])ar.*")?;
172
    /// let mut cache = re.create_cache();
173
    ///
174
    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
175
    /// let expected = Some(Match::must(0, 1..9));
176
    /// let got = re.find(&mut cache, haystack);
177
    /// assert_eq!(expected, got);
178
    ///
179
    /// # Ok::<(), Box<dyn std::error::Error>>(())
180
    /// ```
181
0
    pub fn builder() -> Builder {
182
0
        Builder::new()
183
0
    }
184
185
    /// Create a new cache for this `Regex`.
186
    ///
187
    /// The cache returned should only be used for searches for this
188
    /// `Regex`. If you want to reuse the cache for another `Regex`, then
189
    /// you must call [`Cache::reset`] with that `Regex` (or, equivalently,
190
    /// [`Regex::reset_cache`]).
191
2
    pub fn create_cache(&self) -> Cache {
192
2
        Cache::new(self)
193
2
    }
194
195
    /// Reset the given cache such that it can be used for searching with the
196
    /// this `Regex` (and only this `Regex`).
197
    ///
198
    /// A cache reset permits reusing memory already allocated in this cache
199
    /// with a different `Regex`.
200
    ///
201
    /// Resetting a cache sets its "clear count" to 0. This is relevant if the
202
    /// `Regex` has been configured to "give up" after it has cleared the cache
203
    /// a certain number of times.
204
    ///
205
    /// # Example
206
    ///
207
    /// This shows how to re-purpose a cache for use with a different `Regex`.
208
    ///
209
    /// ```
210
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
211
    /// use regex_automata::{hybrid::regex::Regex, Match};
212
    ///
213
    /// let re1 = Regex::new(r"\w")?;
214
    /// let re2 = Regex::new(r"\W")?;
215
    ///
216
    /// let mut cache = re1.create_cache();
217
    /// assert_eq!(
218
    ///     Some(Match::must(0, 0..2)),
219
    ///     re1.find(&mut cache, "Δ"),
220
    /// );
221
    ///
222
    /// // Using 'cache' with re2 is not allowed. It may result in panics or
223
    /// // incorrect results. In order to re-purpose the cache, we must reset
224
    /// // it with the Regex we'd like to use it with.
225
    /// //
226
    /// // Similarly, after this reset, using the cache with 're1' is also not
227
    /// // allowed.
228
    /// re2.reset_cache(&mut cache);
229
    /// assert_eq!(
230
    ///     Some(Match::must(0, 0..3)),
231
    ///     re2.find(&mut cache, "☃"),
232
    /// );
233
    ///
234
    /// # Ok::<(), Box<dyn std::error::Error>>(())
235
    /// ```
236
0
    pub fn reset_cache(&self, cache: &mut Cache) {
237
0
        self.forward().reset_cache(&mut cache.forward);
238
0
        self.reverse().reset_cache(&mut cache.reverse);
239
0
    }
240
}
241
242
/// Standard infallible search routines for finding and iterating over matches.
243
impl Regex {
244
    /// Returns true if and only if this regex matches the given haystack.
245
    ///
246
    /// This routine may short circuit if it knows that scanning future input
247
    /// will never lead to a different result. In particular, if the underlying
248
    /// DFA enters a match state or a dead state, then this routine will return
249
    /// `true` or `false`, respectively, without inspecting any future input.
250
    ///
251
    /// # Panics
252
    ///
253
    /// This routine panics if the search could not complete. This can occur
254
    /// in a number of circumstances:
255
    ///
256
    /// * The configuration of the lazy DFA may permit it to "quit" the search.
257
    /// For example, setting quit bytes or enabling heuristic support for
258
    /// Unicode word boundaries. The default configuration does not enable any
259
    /// option that could result in the lazy DFA quitting.
260
    /// * The configuration of the lazy DFA may also permit it to "give up"
261
    /// on a search if it makes ineffective use of its transition table
262
    /// cache. The default configuration does not enable this by default,
263
    /// although it is typically a good idea to.
264
    /// * When the provided `Input` configuration is not supported. For
265
    /// example, by providing an unsupported anchor mode.
266
    ///
267
    /// When a search panics, callers cannot know whether a match exists or
268
    /// not.
269
    ///
270
    /// Use [`Regex::try_search`] if you want to handle these error conditions.
271
    ///
272
    /// # Example
273
    ///
274
    /// ```
275
    /// use regex_automata::hybrid::regex::Regex;
276
    ///
277
    /// let re = Regex::new("foo[0-9]+bar")?;
278
    /// let mut cache = re.create_cache();
279
    ///
280
    /// assert!(re.is_match(&mut cache, "foo12345bar"));
281
    /// assert!(!re.is_match(&mut cache, "foobar"));
282
    /// # Ok::<(), Box<dyn std::error::Error>>(())
283
    /// ```
284
    #[inline]
285
0
    pub fn is_match<'h, I: Into<Input<'h>>>(
286
0
        &self,
287
0
        cache: &mut Cache,
288
0
        input: I,
289
0
    ) -> bool {
290
0
        // Not only can we do an "earliest" search, but we can avoid doing a
291
0
        // reverse scan too.
292
0
        self.forward()
293
0
            .try_search_fwd(&mut cache.forward, &input.into().earliest(true))
294
0
            .unwrap()
295
0
            .is_some()
296
0
    }
297
298
    /// Returns the start and end offset of the leftmost match. If no match
299
    /// exists, then `None` is returned.
300
    ///
301
    /// # Panics
302
    ///
303
    /// This routine panics if the search could not complete. This can occur
304
    /// in a number of circumstances:
305
    ///
306
    /// * The configuration of the lazy DFA may permit it to "quit" the search.
307
    /// For example, setting quit bytes or enabling heuristic support for
308
    /// Unicode word boundaries. The default configuration does not enable any
309
    /// option that could result in the lazy DFA quitting.
310
    /// * The configuration of the lazy DFA may also permit it to "give up"
311
    /// on a search if it makes ineffective use of its transition table
312
    /// cache. The default configuration does not enable this by default,
313
    /// although it is typically a good idea to.
314
    /// * When the provided `Input` configuration is not supported. For
315
    /// example, by providing an unsupported anchor mode.
316
    ///
317
    /// When a search panics, callers cannot know whether a match exists or
318
    /// not.
319
    ///
320
    /// Use [`Regex::try_search`] if you want to handle these error conditions.
321
    ///
322
    /// # Example
323
    ///
324
    /// ```
325
    /// use regex_automata::{Match, hybrid::regex::Regex};
326
    ///
327
    /// let re = Regex::new("foo[0-9]+")?;
328
    /// let mut cache = re.create_cache();
329
    /// assert_eq!(
330
    ///     Some(Match::must(0, 3..11)),
331
    ///     re.find(&mut cache, "zzzfoo12345zzz"),
332
    /// );
333
    ///
334
    /// // Even though a match is found after reading the first byte (`a`),
335
    /// // the default leftmost-first match semantics demand that we find the
336
    /// // earliest match that prefers earlier parts of the pattern over latter
337
    /// // parts.
338
    /// let re = Regex::new("abc|a")?;
339
    /// let mut cache = re.create_cache();
340
    /// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc"));
341
    /// # Ok::<(), Box<dyn std::error::Error>>(())
342
    /// ```
343
    #[inline]
344
0
    pub fn find<'h, I: Into<Input<'h>>>(
345
0
        &self,
346
0
        cache: &mut Cache,
347
0
        input: I,
348
0
    ) -> Option<Match> {
349
0
        self.try_search(cache, &input.into()).unwrap()
350
0
    }
351
352
    /// Returns an iterator over all non-overlapping leftmost matches in the
353
    /// given bytes. If no match exists, then the iterator yields no elements.
354
    ///
355
    /// # Panics
356
    ///
357
    /// This routine panics if the search could not complete. This can occur
358
    /// in a number of circumstances:
359
    ///
360
    /// * The configuration of the lazy DFA may permit it to "quit" the search.
361
    /// For example, setting quit bytes or enabling heuristic support for
362
    /// Unicode word boundaries. The default configuration does not enable any
363
    /// option that could result in the lazy DFA quitting.
364
    /// * The configuration of the lazy DFA may also permit it to "give up"
365
    /// on a search if it makes ineffective use of its transition table
366
    /// cache. The default configuration does not enable this by default,
367
    /// although it is typically a good idea to.
368
    /// * When the provided `Input` configuration is not supported. For
369
    /// example, by providing an unsupported anchor mode.
370
    ///
371
    /// When a search panics, callers cannot know whether a match exists or
372
    /// not.
373
    ///
374
    /// The above conditions also apply to the iterator returned as well. For
375
    /// example, if the lazy DFA gives up or quits during a search using this
376
    /// method, then a panic will occur during iteration.
377
    ///
378
    /// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher)
379
    /// if you want to handle these error conditions.
380
    ///
381
    /// # Example
382
    ///
383
    /// ```
384
    /// use regex_automata::{hybrid::regex::Regex, Match};
385
    ///
386
    /// let re = Regex::new("foo[0-9]+")?;
387
    /// let mut cache = re.create_cache();
388
    ///
389
    /// let text = "foo1 foo12 foo123";
390
    /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
391
    /// assert_eq!(matches, vec![
392
    ///     Match::must(0, 0..4),
393
    ///     Match::must(0, 5..10),
394
    ///     Match::must(0, 11..17),
395
    /// ]);
396
    /// # Ok::<(), Box<dyn std::error::Error>>(())
397
    /// ```
398
    #[inline]
399
0
    pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
400
0
        &'r self,
401
0
        cache: &'c mut Cache,
402
0
        input: I,
403
0
    ) -> FindMatches<'r, 'c, 'h> {
404
0
        let it = iter::Searcher::new(input.into());
405
0
        FindMatches { re: self, cache, it }
406
0
    }
407
}
408
409
/// Lower level "search" primitives that accept a `&Input` for cheap reuse
410
/// and return an error if one occurs instead of panicking.
411
impl Regex {
412
    /// Returns the start and end offset of the leftmost match. If no match
413
    /// exists, then `None` is returned.
414
    ///
415
    /// This is like [`Regex::find`] but with two differences:
416
    ///
417
    /// 1. It is not generic over `Into<Input>` and instead accepts a
418
    /// `&Input`. This permits reusing the same `Input` for multiple searches
419
    /// without needing to create a new one. This _may_ help with latency.
420
    /// 2. It returns an error if the search could not complete where as
421
    /// [`Regex::find`] will panic.
422
    ///
423
    /// # Errors
424
    ///
425
    /// This routine errors if the search could not complete. This can occur
426
    /// in a number of circumstances:
427
    ///
428
    /// * The configuration of the lazy DFA may permit it to "quit" the search.
429
    /// For example, setting quit bytes or enabling heuristic support for
430
    /// Unicode word boundaries. The default configuration does not enable any
431
    /// option that could result in the lazy DFA quitting.
432
    /// * The configuration of the lazy DFA may also permit it to "give up"
433
    /// on a search if it makes ineffective use of its transition table
434
    /// cache. The default configuration does not enable this by default,
435
    /// although it is typically a good idea to.
436
    /// * When the provided `Input` configuration is not supported. For
437
    /// example, by providing an unsupported anchor mode.
438
    ///
439
    /// When a search returns an error, callers cannot know whether a match
440
    /// exists or not.
441
    #[inline]
442
0
    pub fn try_search(
443
0
        &self,
444
0
        cache: &mut Cache,
445
0
        input: &Input<'_>,
446
0
    ) -> Result<Option<Match>, MatchError> {
447
0
        let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
448
0
        let end = match self.forward().try_search_fwd(fcache, input)? {
449
0
            None => return Ok(None),
450
0
            Some(end) => end,
451
0
        };
452
0
        // This special cases an empty match at the beginning of the search. If
453
0
        // our end matches our start, then since a reverse DFA can't match past
454
0
        // the start, it must follow that our starting position is also our end
455
0
        // position. So short circuit and skip the reverse search.
456
0
        if input.start() == end.offset() {
457
0
            return Ok(Some(Match::new(
458
0
                end.pattern(),
459
0
                end.offset()..end.offset(),
460
0
            )));
461
0
        }
462
0
        // We can also skip the reverse search if we know our search was
463
0
        // anchored. This occurs either when the input config is anchored or
464
0
        // when we know the regex itself is anchored. In this case, we know the
465
0
        // start of the match, if one is found, must be the start of the
466
0
        // search.
467
0
        if self.is_anchored(input) {
468
0
            return Ok(Some(Match::new(
469
0
                end.pattern(),
470
0
                input.start()..end.offset(),
471
0
            )));
472
0
        }
473
0
        // N.B. I have tentatively convinced myself that it isn't necessary
474
0
        // to specify the specific pattern for the reverse search since the
475
0
        // reverse search will always find the same pattern to match as the
476
0
        // forward search. But I lack a rigorous proof. Why not just provide
477
0
        // the pattern anyway? Well, if it is needed, then leaving it out
478
0
        // gives us a chance to find a witness. (Also, if we don't need to
479
0
        // specify the pattern, then we don't need to build the reverse DFA
480
0
        // with 'starts_for_each_pattern' enabled. It doesn't matter too much
481
0
        // for the lazy DFA, but does make the overall DFA bigger.)
482
0
        //
483
0
        // We also need to be careful to disable 'earliest' for the reverse
484
0
        // search, since it could be enabled for the forward search. In the
485
0
        // reverse case, to satisfy "leftmost" criteria, we need to match as
486
0
        // much as we can. We also need to be careful to make the search
487
0
        // anchored. We don't want the reverse search to report any matches
488
0
        // other than the one beginning at the end of our forward search.
489
0
        let revsearch = input
490
0
            .clone()
491
0
            .span(input.start()..end.offset())
492
0
            .anchored(Anchored::Yes)
493
0
            .earliest(false);
494
0
        let start = self
495
0
            .reverse()
496
0
            .try_search_rev(rcache, &revsearch)?
497
0
            .expect("reverse search must match if forward search does");
498
0
        debug_assert_eq!(
499
0
            start.pattern(),
500
0
            end.pattern(),
501
0
            "forward and reverse search must match same pattern",
502
        );
503
0
        debug_assert!(start.offset() <= end.offset());
504
0
        Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
505
0
    }
506
507
    /// Returns true if either the given input specifies an anchored search
508
    /// or if the underlying NFA is always anchored.
509
0
    fn is_anchored(&self, input: &Input<'_>) -> bool {
510
0
        match input.get_anchored() {
511
            Anchored::No => {
512
0
                self.forward().get_nfa().is_always_start_anchored()
513
            }
514
0
            Anchored::Yes | Anchored::Pattern(_) => true,
515
        }
516
0
    }
517
}
518
519
/// Non-search APIs for querying information about the regex and setting a
520
/// prefilter.
521
impl Regex {
522
    /// Return the underlying lazy DFA responsible for forward matching.
523
    ///
524
    /// This is useful for accessing the underlying lazy DFA and using it
525
    /// directly if the situation calls for it.
526
302
    pub fn forward(&self) -> &DFA {
527
302
        &self.forward
528
302
    }
529
530
    /// Return the underlying lazy DFA responsible for reverse matching.
531
    ///
532
    /// This is useful for accessing the underlying lazy DFA and using it
533
    /// directly if the situation calls for it.
534
2
    pub fn reverse(&self) -> &DFA {
535
2
        &self.reverse
536
2
    }
537
538
    /// Returns the total number of patterns matched by this regex.
539
    ///
540
    /// # Example
541
    ///
542
    /// ```
543
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
544
    /// use regex_automata::hybrid::regex::Regex;
545
    ///
546
    /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
547
    /// assert_eq!(3, re.pattern_len());
548
    /// # Ok::<(), Box<dyn std::error::Error>>(())
549
    /// ```
550
0
    pub fn pattern_len(&self) -> usize {
551
0
        assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
552
0
        self.forward().pattern_len()
553
0
    }
554
}
555
556
/// An iterator over all non-overlapping matches for an infallible search.
557
///
558
/// The iterator yields a [`Match`] value until no more matches could be found.
559
/// If the underlying regex engine returns an error, then a panic occurs.
560
///
561
/// The lifetime parameters are as follows:
562
///
563
/// * `'r` represents the lifetime of the regex object.
564
/// * `'h` represents the lifetime of the haystack being searched.
565
/// * `'c` represents the lifetime of the regex cache.
566
///
567
/// This iterator can be created with the [`Regex::find_iter`] method.
568
#[derive(Debug)]
569
pub struct FindMatches<'r, 'c, 'h> {
570
    re: &'r Regex,
571
    cache: &'c mut Cache,
572
    it: iter::Searcher<'h>,
573
}
574
575
impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> {
576
    type Item = Match;
577
578
    #[inline]
579
0
    fn next(&mut self) -> Option<Match> {
580
0
        let FindMatches { re, ref mut cache, ref mut it } = *self;
581
0
        it.advance(|input| re.try_search(cache, input))
582
0
    }
583
}
584
585
/// A cache represents a partially computed forward and reverse DFA.
586
///
587
/// A cache is the key component that differentiates a classical DFA and a
588
/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
589
/// complete transition table that can handle all possible inputs, a hybrid
590
/// NFA/DFA starts with an empty transition table and builds only the parts
591
/// required during search. The parts that are built are stored in a cache. For
592
/// this reason, a cache is a required parameter for nearly every operation on
593
/// a [`Regex`].
594
///
595
/// Caches can be created from their corresponding `Regex` via
596
/// [`Regex::create_cache`]. A cache can only be used with either the `Regex`
597
/// that created it, or the `Regex` that was most recently used to reset it
598
/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in
599
/// panics or incorrect results.
600
#[derive(Debug, Clone)]
601
pub struct Cache {
602
    forward: dfa::Cache,
603
    reverse: dfa::Cache,
604
}
605
606
impl Cache {
607
    /// Create a new cache for the given `Regex`.
608
    ///
609
    /// The cache returned should only be used for searches for the given
610
    /// `Regex`. If you want to reuse the cache for another `Regex`, then you
611
    /// must call [`Cache::reset`] with that `Regex`.
612
2
    pub fn new(re: &Regex) -> Cache {
613
2
        let forward = dfa::Cache::new(re.forward());
614
2
        let reverse = dfa::Cache::new(re.reverse());
615
2
        Cache { forward, reverse }
616
2
    }
617
618
    /// Reset this cache such that it can be used for searching with the given
619
    /// `Regex` (and only that `Regex`).
620
    ///
621
    /// A cache reset permits reusing memory already allocated in this cache
622
    /// with a different `Regex`.
623
    ///
624
    /// Resetting a cache sets its "clear count" to 0. This is relevant if the
625
    /// `Regex` has been configured to "give up" after it has cleared the cache
626
    /// a certain number of times.
627
    ///
628
    /// # Example
629
    ///
630
    /// This shows how to re-purpose a cache for use with a different `Regex`.
631
    ///
632
    /// ```
633
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
634
    /// use regex_automata::{hybrid::regex::Regex, Match};
635
    ///
636
    /// let re1 = Regex::new(r"\w")?;
637
    /// let re2 = Regex::new(r"\W")?;
638
    ///
639
    /// let mut cache = re1.create_cache();
640
    /// assert_eq!(
641
    ///     Some(Match::must(0, 0..2)),
642
    ///     re1.find(&mut cache, "Δ"),
643
    /// );
644
    ///
645
    /// // Using 'cache' with re2 is not allowed. It may result in panics or
646
    /// // incorrect results. In order to re-purpose the cache, we must reset
647
    /// // it with the Regex we'd like to use it with.
648
    /// //
649
    /// // Similarly, after this reset, using the cache with 're1' is also not
650
    /// // allowed.
651
    /// cache.reset(&re2);
652
    /// assert_eq!(
653
    ///     Some(Match::must(0, 0..3)),
654
    ///     re2.find(&mut cache, "☃"),
655
    /// );
656
    ///
657
    /// # Ok::<(), Box<dyn std::error::Error>>(())
658
    /// ```
659
0
    pub fn reset(&mut self, re: &Regex) {
660
0
        self.forward.reset(re.forward());
661
0
        self.reverse.reset(re.reverse());
662
0
    }
663
664
    /// Return a reference to the forward cache.
665
0
    pub fn forward(&mut self) -> &dfa::Cache {
666
0
        &self.forward
667
0
    }
668
669
    /// Return a reference to the reverse cache.
670
0
    pub fn reverse(&mut self) -> &dfa::Cache {
671
0
        &self.reverse
672
0
    }
673
674
    /// Return a mutable reference to the forward cache.
675
    ///
676
    /// If you need mutable references to both the forward and reverse caches,
677
    /// then use [`Cache::as_parts_mut`].
678
0
    pub fn forward_mut(&mut self) -> &mut dfa::Cache {
679
0
        &mut self.forward
680
0
    }
681
682
    /// Return a mutable reference to the reverse cache.
683
    ///
684
    /// If you need mutable references to both the forward and reverse caches,
685
    /// then use [`Cache::as_parts_mut`].
686
0
    pub fn reverse_mut(&mut self) -> &mut dfa::Cache {
687
0
        &mut self.reverse
688
0
    }
689
690
    /// Return references to the forward and reverse caches, respectively.
691
0
    pub fn as_parts(&self) -> (&dfa::Cache, &dfa::Cache) {
692
0
        (&self.forward, &self.reverse)
693
0
    }
694
695
    /// Return mutable references to the forward and reverse caches,
696
    /// respectively.
697
300
    pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) {
698
300
        (&mut self.forward, &mut self.reverse)
699
300
    }
700
701
    /// Returns the heap memory usage, in bytes, as a sum of the forward and
702
    /// reverse lazy DFA caches.
703
    ///
704
    /// This does **not** include the stack size used up by this cache. To
705
    /// compute that, use `std::mem::size_of::<Cache>()`.
706
0
    pub fn memory_usage(&self) -> usize {
707
0
        self.forward.memory_usage() + self.reverse.memory_usage()
708
0
    }
709
}
710
711
/// A builder for a regex based on a hybrid NFA/DFA.
712
///
713
/// This builder permits configuring options for the syntax of a pattern, the
714
/// NFA construction, the lazy DFA construction and finally the regex searching
715
/// itself. This builder is different from a general purpose regex builder
716
/// in that it permits fine grain configuration of the construction process.
717
/// The trade off for this is complexity, and the possibility of setting a
718
/// configuration that might not make sense. For example, there are two
719
/// different UTF-8 modes:
720
///
721
/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
722
/// whether the pattern itself can contain sub-expressions that match invalid
723
/// UTF-8.
724
/// * [`thompson::Config::utf8`] controls how the regex iterators themselves
725
/// advance the starting position of the next search when a match with zero
726
/// length is found.
727
///
728
/// Generally speaking, callers will want to either enable all of these or
729
/// disable all of these.
730
///
731
/// Internally, building a regex requires building two hybrid NFA/DFAs,
732
/// where one is responsible for finding the end of a match and the other is
733
/// responsible for finding the start of a match. If you only need to detect
734
/// whether something matched, or only the end of a match, then you should use
735
/// a [`dfa::Builder`] to construct a single hybrid NFA/DFA, which is cheaper
736
/// than building two of them.
737
///
738
/// # Example
739
///
740
/// This example shows how to disable UTF-8 mode in the syntax and the regex
741
/// itself. This is generally what you want for matching on arbitrary bytes.
742
///
743
/// ```
744
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
745
/// use regex_automata::{
746
///     hybrid::regex::Regex, nfa::thompson, util::syntax, Match,
747
/// };
748
///
749
/// let re = Regex::builder()
750
///     .syntax(syntax::Config::new().utf8(false))
751
///     .thompson(thompson::Config::new().utf8(false))
752
///     .build(r"foo(?-u:[^b])ar.*")?;
753
/// let mut cache = re.create_cache();
754
///
755
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
756
/// let expected = Some(Match::must(0, 1..9));
757
/// let got = re.find(&mut cache, haystack);
758
/// assert_eq!(expected, got);
759
/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
760
/// // but the subsequent `.*` does not! Disabling UTF-8
761
/// // on the syntax permits this.
762
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
763
///
764
/// # Ok::<(), Box<dyn std::error::Error>>(())
765
/// ```
766
#[derive(Clone, Debug)]
767
pub struct Builder {
768
    dfa: dfa::Builder,
769
}
770
771
impl Builder {
772
    /// Create a new regex builder with the default configuration.
773
2
    pub fn new() -> Builder {
774
2
        Builder { dfa: DFA::builder() }
775
2
    }
776
777
    /// Build a regex from the given pattern.
778
    ///
779
    /// If there was a problem parsing or compiling the pattern, then an error
780
    /// is returned.
781
    #[cfg(feature = "syntax")]
782
0
    pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
783
0
        self.build_many(&[pattern])
784
0
    }
785
786
    /// Build a regex from the given patterns.
787
    #[cfg(feature = "syntax")]
788
0
    pub fn build_many<P: AsRef<str>>(
789
0
        &self,
790
0
        patterns: &[P],
791
0
    ) -> Result<Regex, BuildError> {
792
0
        let forward = self.dfa.build_many(patterns)?;
793
0
        let reverse = self
794
0
            .dfa
795
0
            .clone()
796
0
            .configure(
797
0
                DFA::config()
798
0
                    .prefilter(None)
799
0
                    .specialize_start_states(false)
800
0
                    .match_kind(MatchKind::All),
801
0
            )
802
0
            .thompson(thompson::Config::new().reverse(true))
803
0
            .build_many(patterns)?;
804
0
        Ok(self.build_from_dfas(forward, reverse))
805
0
    }
806
807
    /// Build a regex from its component forward and reverse hybrid NFA/DFAs.
808
    ///
809
    /// This is useful when you've built a forward and reverse lazy DFA
810
    /// separately, and want to combine them into a single regex. Once build,
811
    /// the individual DFAs given can still be accessed via [`Regex::forward`]
812
    /// and [`Regex::reverse`].
813
    ///
814
    /// It is important that the reverse lazy DFA be compiled under the
815
    /// following conditions:
816
    ///
817
    /// * It should use [`MatchKind::All`] semantics.
818
    /// * It should match in reverse.
819
    /// * Otherwise, its configuration should match the forward DFA.
820
    ///
821
    /// If these conditions aren't satisfied, then the behavior of searches is
822
    /// unspecified.
823
    ///
824
    /// Note that when using this constructor, no configuration is applied.
825
    /// Since this routine provides the DFAs to the builder, there is no
826
    /// opportunity to apply other configuration options.
827
    ///
828
    /// # Example
829
    ///
830
    /// This shows how to build individual lazy forward and reverse DFAs, and
831
    /// then combine them into a single `Regex`.
832
    ///
833
    /// ```
834
    /// use regex_automata::{
835
    ///     hybrid::{dfa::DFA, regex::Regex},
836
    ///     nfa::thompson,
837
    ///     MatchKind,
838
    /// };
839
    ///
840
    /// let fwd = DFA::new(r"foo[0-9]+")?;
841
    /// let rev = DFA::builder()
842
    ///     .configure(DFA::config().match_kind(MatchKind::All))
843
    ///     .thompson(thompson::Config::new().reverse(true))
844
    ///     .build(r"foo[0-9]+")?;
845
    ///
846
    /// let re = Regex::builder().build_from_dfas(fwd, rev);
847
    /// let mut cache = re.create_cache();
848
    /// assert_eq!(true, re.is_match(&mut cache, "foo123"));
849
    /// # Ok::<(), Box<dyn std::error::Error>>(())
850
    /// ```
851
2
    pub fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex {
852
2
        Regex { forward, reverse }
853
2
    }
854
855
    /// Set the syntax configuration for this builder using
856
    /// [`syntax::Config`](crate::util::syntax::Config).
857
    ///
858
    /// This permits setting things like case insensitivity, Unicode and multi
859
    /// line mode.
860
    #[cfg(feature = "syntax")]
861
0
    pub fn syntax(
862
0
        &mut self,
863
0
        config: crate::util::syntax::Config,
864
0
    ) -> &mut Builder {
865
0
        self.dfa.syntax(config);
866
0
        self
867
0
    }
868
869
    /// Set the Thompson NFA configuration for this builder using
870
    /// [`nfa::thompson::Config`](thompson::Config).
871
    ///
872
    /// This permits setting things like whether additional time should be
873
    /// spent shrinking the size of the NFA.
874
    #[cfg(feature = "syntax")]
875
0
    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
876
0
        self.dfa.thompson(config);
877
0
        self
878
0
    }
879
880
    /// Set the lazy DFA compilation configuration for this builder using
881
    /// [`dfa::Config`].
882
    ///
883
    /// This permits setting things like whether Unicode word boundaries should
884
    /// be heuristically supported or settings how the behavior of the cache.
885
0
    pub fn dfa(&mut self, config: dfa::Config) -> &mut Builder {
886
0
        self.dfa.configure(config);
887
0
        self
888
0
    }
889
}
890
891
impl Default for Builder {
892
0
    fn default() -> Builder {
893
0
        Builder::new()
894
0
    }
895
}