Coverage Report

Created: 2025-01-23 14:27

/build/cargo-vendor-dir/regex-1.10.3/src/builders.rs
Line
Count
Source (jump to first uncovered line)
1
#![allow(warnings)]
2
3
// This module defines an internal builder that encapsulates all interaction
4
// with meta::Regex construction, and then 4 public API builders that wrap
5
// around it. The docs are essentially repeated on each of the 4 public
6
// builders, with tweaks to the examples as needed.
7
//
8
// The reason why there are so many builders is partially because of a misstep
9
// in the initial API design: the builder constructor takes in the pattern
10
// strings instead of using the `build` method to accept the pattern strings.
11
// This means `new` has a different signature for each builder. It probably
12
// would have been nicer to use one builder with `fn new()`, and then add
13
// `build(pat)` and `build_many(pats)` constructors.
14
//
15
// The other reason is because I think the `bytes` module should probably
16
// have its own builder type. That way, it is completely isolated from the
17
// top-level API.
18
//
19
// If I could do it again, I'd probably have a `regex::Builder` and a
20
// `regex::bytes::Builder`. Each would have `build` and `build_set` (or
21
// `build_many`) methods for constructing a single pattern `Regex` and a
22
// multi-pattern `RegexSet`, respectively.
23
24
use alloc::{
25
    string::{String, ToString},
26
    sync::Arc,
27
    vec,
28
    vec::Vec,
29
};
30
31
use regex_automata::{
32
    meta, nfa::thompson::WhichCaptures, util::syntax, MatchKind,
33
};
34
35
use crate::error::Error;
36
37
/// A builder for constructing a `Regex`, `bytes::Regex`, `RegexSet` or a
38
/// `bytes::RegexSet`.
39
///
40
/// This is essentially the implementation of the four different builder types
41
/// in the public API: `RegexBuilder`, `bytes::RegexBuilder`, `RegexSetBuilder`
42
/// and `bytes::RegexSetBuilder`.
43
#[derive(Clone, Debug)]
44
struct Builder {
45
    pats: Vec<String>,
46
    metac: meta::Config,
47
    syntaxc: syntax::Config,
48
}
49
50
impl Default for Builder {
51
0
    fn default() -> Builder {
52
0
        let metac = meta::Config::new()
53
0
            .nfa_size_limit(Some(10 * (1 << 20)))
54
0
            .hybrid_cache_capacity(2 * (1 << 20));
55
0
        Builder { pats: vec![], metac, syntaxc: syntax::Config::default() }
56
0
    }
57
}
58
59
impl Builder {
60
0
    fn new<I, S>(patterns: I) -> Builder
61
0
    where
62
0
        S: AsRef<str>,
63
0
        I: IntoIterator<Item = S>,
64
0
    {
65
0
        let mut b = Builder::default();
66
0
        b.pats.extend(patterns.into_iter().map(|p| p.as_ref().to_string()));
67
0
        b
68
0
    }
69
70
0
    fn build_one_string(&self) -> Result<crate::Regex, Error> {
71
0
        assert_eq!(1, self.pats.len());
72
0
        let metac = self
73
0
            .metac
74
0
            .clone()
75
0
            .match_kind(MatchKind::LeftmostFirst)
76
0
            .utf8_empty(true);
77
0
        let syntaxc = self.syntaxc.clone().utf8(true);
78
0
        let pattern = Arc::from(self.pats[0].as_str());
79
0
        meta::Builder::new()
80
0
            .configure(metac)
81
0
            .syntax(syntaxc)
82
0
            .build(&pattern)
83
0
            .map(|meta| crate::Regex { meta, pattern })
84
0
            .map_err(Error::from_meta_build_error)
85
0
    }
86
87
0
    fn build_one_bytes(&self) -> Result<crate::bytes::Regex, Error> {
88
0
        assert_eq!(1, self.pats.len());
89
0
        let metac = self
90
0
            .metac
91
0
            .clone()
92
0
            .match_kind(MatchKind::LeftmostFirst)
93
0
            .utf8_empty(false);
94
0
        let syntaxc = self.syntaxc.clone().utf8(false);
95
0
        let pattern = Arc::from(self.pats[0].as_str());
96
0
        meta::Builder::new()
97
0
            .configure(metac)
98
0
            .syntax(syntaxc)
99
0
            .build(&pattern)
100
0
            .map(|meta| crate::bytes::Regex { meta, pattern })
101
0
            .map_err(Error::from_meta_build_error)
102
0
    }
103
104
0
    fn build_many_string(&self) -> Result<crate::RegexSet, Error> {
105
0
        let metac = self
106
0
            .metac
107
0
            .clone()
108
0
            .match_kind(MatchKind::All)
109
0
            .utf8_empty(true)
110
0
            .which_captures(WhichCaptures::None);
111
0
        let syntaxc = self.syntaxc.clone().utf8(true);
112
0
        let patterns = Arc::from(self.pats.as_slice());
113
0
        meta::Builder::new()
114
0
            .configure(metac)
115
0
            .syntax(syntaxc)
116
0
            .build_many(&patterns)
117
0
            .map(|meta| crate::RegexSet { meta, patterns })
118
0
            .map_err(Error::from_meta_build_error)
119
0
    }
120
121
0
    fn build_many_bytes(&self) -> Result<crate::bytes::RegexSet, Error> {
122
0
        let metac = self
123
0
            .metac
124
0
            .clone()
125
0
            .match_kind(MatchKind::All)
126
0
            .utf8_empty(false)
127
0
            .which_captures(WhichCaptures::None);
128
0
        let syntaxc = self.syntaxc.clone().utf8(false);
129
0
        let patterns = Arc::from(self.pats.as_slice());
130
0
        meta::Builder::new()
131
0
            .configure(metac)
132
0
            .syntax(syntaxc)
133
0
            .build_many(&patterns)
134
0
            .map(|meta| crate::bytes::RegexSet { meta, patterns })
135
0
            .map_err(Error::from_meta_build_error)
136
0
    }
137
138
0
    fn case_insensitive(&mut self, yes: bool) -> &mut Builder {
139
0
        self.syntaxc = self.syntaxc.case_insensitive(yes);
140
0
        self
141
0
    }
142
143
0
    fn multi_line(&mut self, yes: bool) -> &mut Builder {
144
0
        self.syntaxc = self.syntaxc.multi_line(yes);
145
0
        self
146
0
    }
147
148
0
    fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder {
149
0
        self.syntaxc = self.syntaxc.dot_matches_new_line(yes);
150
0
        self
151
0
    }
152
153
0
    fn crlf(&mut self, yes: bool) -> &mut Builder {
154
0
        self.syntaxc = self.syntaxc.crlf(yes);
155
0
        self
156
0
    }
157
158
0
    fn line_terminator(&mut self, byte: u8) -> &mut Builder {
159
0
        self.metac = self.metac.clone().line_terminator(byte);
160
0
        self.syntaxc = self.syntaxc.line_terminator(byte);
161
0
        self
162
0
    }
163
164
0
    fn swap_greed(&mut self, yes: bool) -> &mut Builder {
165
0
        self.syntaxc = self.syntaxc.swap_greed(yes);
166
0
        self
167
0
    }
168
169
0
    fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder {
170
0
        self.syntaxc = self.syntaxc.ignore_whitespace(yes);
171
0
        self
172
0
    }
173
174
0
    fn unicode(&mut self, yes: bool) -> &mut Builder {
175
0
        self.syntaxc = self.syntaxc.unicode(yes);
176
0
        self
177
0
    }
178
179
0
    fn octal(&mut self, yes: bool) -> &mut Builder {
180
0
        self.syntaxc = self.syntaxc.octal(yes);
181
0
        self
182
0
    }
183
184
0
    fn size_limit(&mut self, limit: usize) -> &mut Builder {
185
0
        self.metac = self.metac.clone().nfa_size_limit(Some(limit));
186
0
        self
187
0
    }
188
189
0
    fn dfa_size_limit(&mut self, limit: usize) -> &mut Builder {
190
0
        self.metac = self.metac.clone().hybrid_cache_capacity(limit);
191
0
        self
192
0
    }
193
194
0
    fn nest_limit(&mut self, limit: u32) -> &mut Builder {
195
0
        self.syntaxc = self.syntaxc.nest_limit(limit);
196
0
        self
197
0
    }
198
}
199
200
pub(crate) mod string {
201
    use crate::{error::Error, Regex, RegexSet};
202
203
    use super::Builder;
204
205
    /// A configurable builder for a [`Regex`].
206
    ///
207
    /// This builder can be used to programmatically set flags such as `i`
208
    /// (case insensitive) and `x` (for verbose mode). This builder can also be
209
    /// used to configure things like the line terminator and a size limit on
210
    /// the compiled regular expression.
211
    #[derive(Clone, Debug)]
212
    pub struct RegexBuilder {
213
        builder: Builder,
214
    }
215
216
    impl RegexBuilder {
217
        /// Create a new builder with a default configuration for the given
218
        /// pattern.
219
        ///
220
        /// If the pattern is invalid or exceeds the configured size limits,
221
        /// then an error will be returned when [`RegexBuilder::build`] is
222
        /// called.
223
0
        pub fn new(pattern: &str) -> RegexBuilder {
224
0
            RegexBuilder { builder: Builder::new([pattern]) }
225
0
        }
226
227
        /// Compiles the pattern given to `RegexBuilder::new` with the
228
        /// configuration set on this builder.
229
        ///
230
        /// If the pattern isn't a valid regex or if a configured size limit
231
        /// was exceeded, then an error is returned.
232
0
        pub fn build(&self) -> Result<Regex, Error> {
233
0
            self.builder.build_one_string()
234
0
        }
235
236
        /// This configures Unicode mode for the entire pattern.
237
        ///
238
        /// Enabling Unicode mode does a number of things:
239
        ///
240
        /// * Most fundamentally, it causes the fundamental atom of matching
241
        /// to be a single codepoint. When Unicode mode is disabled, it's a
242
        /// single byte. For example, when Unicode mode is enabled, `.` will
243
        /// match `💩` once, whereas it will match 4 times when Unicode mode
244
        /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
245
        /// * Case insensitive matching uses Unicode simple case folding rules.
246
        /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
247
        /// available.
248
        /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
249
        /// `\d`.
250
        /// * The word boundary assertions, `\b` and `\B`, use the Unicode
251
        /// definition of a word character.
252
        ///
253
        /// Note that if Unicode mode is disabled, then the regex will fail to
254
        /// compile if it could match invalid UTF-8. For example, when Unicode
255
        /// mode is disabled, then since `.` matches any byte (except for
256
        /// `\n`), then it can match invalid UTF-8 and thus building a regex
257
        /// from it will fail. Another example is `\w` and `\W`. Since `\w` can
258
        /// only match ASCII bytes when Unicode mode is disabled, it's allowed.
259
        /// But `\W` can match more than ASCII bytes, including invalid UTF-8,
260
        /// and so it is not allowed. This restriction can be lifted only by
261
        /// using a [`bytes::Regex`](crate::bytes::Regex).
262
        ///
263
        /// For more details on the Unicode support in this crate, see the
264
        /// [Unicode section](crate#unicode) in this crate's top-level
265
        /// documentation.
266
        ///
267
        /// The default for this is `true`.
268
        ///
269
        /// # Example
270
        ///
271
        /// ```
272
        /// use regex::RegexBuilder;
273
        ///
274
        /// let re = RegexBuilder::new(r"\w")
275
        ///     .unicode(false)
276
        ///     .build()
277
        ///     .unwrap();
278
        /// // Normally greek letters would be included in \w, but since
279
        /// // Unicode mode is disabled, it only matches ASCII letters.
280
        /// assert!(!re.is_match("δ"));
281
        ///
282
        /// let re = RegexBuilder::new(r"s")
283
        ///     .case_insensitive(true)
284
        ///     .unicode(false)
285
        ///     .build()
286
        ///     .unwrap();
287
        /// // Normally 'ſ' is included when searching for 's' case
288
        /// // insensitively due to Unicode's simple case folding rules. But
289
        /// // when Unicode mode is disabled, only ASCII case insensitive rules
290
        /// // are used.
291
        /// assert!(!re.is_match("ſ"));
292
        /// ```
293
0
        pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
294
0
            self.builder.unicode(yes);
295
0
            self
296
0
        }
297
298
        /// This configures whether to enable case insensitive matching for the
299
        /// entire pattern.
300
        ///
301
        /// This setting can also be configured using the inline flag `i`
302
        /// in the pattern. For example, `(?i:foo)` matches `foo` case
303
        /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
304
        ///
305
        /// The default for this is `false`.
306
        ///
307
        /// # Example
308
        ///
309
        /// ```
310
        /// use regex::RegexBuilder;
311
        ///
312
        /// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
313
        ///     .case_insensitive(true)
314
        ///     .build()
315
        ///     .unwrap();
316
        /// assert!(re.is_match("FoObarQuUx"));
317
        /// // Even though case insensitive matching is enabled in the builder,
318
        /// // it can be locally disabled within the pattern. In this case,
319
        /// // `bar` is matched case sensitively.
320
        /// assert!(!re.is_match("fooBARquux"));
321
        /// ```
322
0
        pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
323
0
            self.builder.case_insensitive(yes);
324
0
            self
325
0
        }
326
327
        /// This configures multi-line mode for the entire pattern.
328
        ///
329
        /// Enabling multi-line mode changes the behavior of the `^` and `$`
330
        /// anchor assertions. Instead of only matching at the beginning and
331
        /// end of a haystack, respectively, multi-line mode causes them to
332
        /// match at the beginning and end of a line *in addition* to the
333
        /// beginning and end of a haystack. More precisely, `^` will match at
334
        /// the position immediately following a `\n` and `$` will match at the
335
        /// position immediately preceding a `\n`.
336
        ///
337
        /// The behavior of this option can be impacted by other settings too:
338
        ///
339
        /// * The [`RegexBuilder::line_terminator`] option changes `\n` above
340
        /// to any ASCII byte.
341
        /// * The [`RegexBuilder::crlf`] option changes the line terminator to
342
        /// be either `\r` or `\n`, but never at the position between a `\r`
343
        /// and `\n`.
344
        ///
345
        /// This setting can also be configured using the inline flag `m` in
346
        /// the pattern.
347
        ///
348
        /// The default for this is `false`.
349
        ///
350
        /// # Example
351
        ///
352
        /// ```
353
        /// use regex::RegexBuilder;
354
        ///
355
        /// let re = RegexBuilder::new(r"^foo$")
356
        ///     .multi_line(true)
357
        ///     .build()
358
        ///     .unwrap();
359
        /// assert_eq!(Some(1..4), re.find("\nfoo\n").map(|m| m.range()));
360
        /// ```
361
0
        pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
362
0
            self.builder.multi_line(yes);
363
0
            self
364
0
        }
365
366
        /// This configures dot-matches-new-line mode for the entire pattern.
367
        ///
368
        /// Perhaps surprisingly, the default behavior for `.` is not to match
369
        /// any character, but rather, to match any character except for the
370
        /// line terminator (which is `\n` by default). When this mode is
371
        /// enabled, the behavior changes such that `.` truly matches any
372
        /// character.
373
        ///
374
        /// This setting can also be configured using the inline flag `s` in
375
        /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
376
        /// regexes.
377
        ///
378
        /// The default for this is `false`.
379
        ///
380
        /// # Example
381
        ///
382
        /// ```
383
        /// use regex::RegexBuilder;
384
        ///
385
        /// let re = RegexBuilder::new(r"foo.bar")
386
        ///     .dot_matches_new_line(true)
387
        ///     .build()
388
        ///     .unwrap();
389
        /// let hay = "foo\nbar";
390
        /// assert_eq!(Some("foo\nbar"), re.find(hay).map(|m| m.as_str()));
391
        /// ```
392
0
        pub fn dot_matches_new_line(
393
0
            &mut self,
394
0
            yes: bool,
395
0
        ) -> &mut RegexBuilder {
396
0
            self.builder.dot_matches_new_line(yes);
397
0
            self
398
0
        }
399
400
        /// This configures CRLF mode for the entire pattern.
401
        ///
402
        /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
403
        /// short) and `\n` ("line feed" or LF for short) are treated as line
404
        /// terminators. This results in the following:
405
        ///
406
        /// * Unless dot-matches-new-line mode is enabled, `.` will now match
407
        /// any character except for `\n` and `\r`.
408
        /// * When multi-line mode is enabled, `^` will match immediately
409
        /// following a `\n` or a `\r`. Similarly, `$` will match immediately
410
        /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
411
        /// between `\r` and `\n`.
412
        ///
413
        /// This setting can also be configured using the inline flag `R` in
414
        /// the pattern.
415
        ///
416
        /// The default for this is `false`.
417
        ///
418
        /// # Example
419
        ///
420
        /// ```
421
        /// use regex::RegexBuilder;
422
        ///
423
        /// let re = RegexBuilder::new(r"^foo$")
424
        ///     .multi_line(true)
425
        ///     .crlf(true)
426
        ///     .build()
427
        ///     .unwrap();
428
        /// let hay = "\r\nfoo\r\n";
429
        /// // If CRLF mode weren't enabled here, then '$' wouldn't match
430
        /// // immediately after 'foo', and thus no match would be found.
431
        /// assert_eq!(Some("foo"), re.find(hay).map(|m| m.as_str()));
432
        /// ```
433
        ///
434
        /// This example demonstrates that `^` will never match at a position
435
        /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
436
        /// and a `\n`.)
437
        ///
438
        /// ```
439
        /// use regex::RegexBuilder;
440
        ///
441
        /// let re = RegexBuilder::new(r"^")
442
        ///     .multi_line(true)
443
        ///     .crlf(true)
444
        ///     .build()
445
        ///     .unwrap();
446
        /// let hay = "\r\n\r\n";
447
        /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
448
        /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]);
449
        /// ```
450
0
        pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
451
0
            self.builder.crlf(yes);
452
0
            self
453
0
        }
454
455
        /// Configures the line terminator to be used by the regex.
456
        ///
457
        /// The line terminator is relevant in two ways for a particular regex:
458
        ///
459
        /// * When dot-matches-new-line mode is *not* enabled (the default),
460
        /// then `.` will match any character except for the configured line
461
        /// terminator.
462
        /// * When multi-line mode is enabled (not the default), then `^` and
463
        /// `$` will match immediately after and before, respectively, a line
464
        /// terminator.
465
        ///
466
        /// In both cases, if CRLF mode is enabled in a particular context,
467
        /// then it takes precedence over any configured line terminator.
468
        ///
469
        /// This option cannot be configured from within the pattern.
470
        ///
471
        /// The default line terminator is `\n`.
472
        ///
473
        /// # Example
474
        ///
475
        /// This shows how to treat the NUL byte as a line terminator. This can
476
        /// be a useful heuristic when searching binary data.
477
        ///
478
        /// ```
479
        /// use regex::RegexBuilder;
480
        ///
481
        /// let re = RegexBuilder::new(r"^foo$")
482
        ///     .multi_line(true)
483
        ///     .line_terminator(b'\x00')
484
        ///     .build()
485
        ///     .unwrap();
486
        /// let hay = "\x00foo\x00";
487
        /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range()));
488
        /// ```
489
        ///
490
        /// This example shows that the behavior of `.` is impacted by this
491
        /// setting as well:
492
        ///
493
        /// ```
494
        /// use regex::RegexBuilder;
495
        ///
496
        /// let re = RegexBuilder::new(r".")
497
        ///     .line_terminator(b'\x00')
498
        ///     .build()
499
        ///     .unwrap();
500
        /// assert!(re.is_match("\n"));
501
        /// assert!(!re.is_match("\x00"));
502
        /// ```
503
        ///
504
        /// This shows that building a regex will fail if the byte given
505
        /// is not ASCII and the pattern could result in matching invalid
506
        /// UTF-8. This is because any singular non-ASCII byte is not valid
507
        /// UTF-8, and it is not permitted for a [`Regex`] to match invalid
508
        /// UTF-8. (It is permissible to use a non-ASCII byte when building a
509
        /// [`bytes::Regex`](crate::bytes::Regex).)
510
        ///
511
        /// ```
512
        /// use regex::RegexBuilder;
513
        ///
514
        /// assert!(RegexBuilder::new(r".").line_terminator(0x80).build().is_err());
515
        /// // Note that using a non-ASCII byte isn't enough on its own to
516
        /// // cause regex compilation to fail. You actually have to make use
517
        /// // of it in the regex in a way that leads to matching invalid
518
        /// // UTF-8. If you don't, then regex compilation will succeed!
519
        /// assert!(RegexBuilder::new(r"a").line_terminator(0x80).build().is_ok());
520
        /// ```
521
0
        pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
522
0
            self.builder.line_terminator(byte);
523
0
            self
524
0
        }
525
526
        /// This configures swap-greed mode for the entire pattern.
527
        ///
528
        /// When swap-greed mode is enabled, patterns like `a+` will become
529
        /// non-greedy and patterns like `a+?` will become greedy. In other
530
        /// words, the meanings of `a+` and `a+?` are switched.
531
        ///
532
        /// This setting can also be configured using the inline flag `U` in
533
        /// the pattern.
534
        ///
535
        /// The default for this is `false`.
536
        ///
537
        /// # Example
538
        ///
539
        /// ```
540
        /// use regex::RegexBuilder;
541
        ///
542
        /// let re = RegexBuilder::new(r"a+")
543
        ///     .swap_greed(true)
544
        ///     .build()
545
        ///     .unwrap();
546
        /// assert_eq!(Some("a"), re.find("aaa").map(|m| m.as_str()));
547
        /// ```
548
0
        pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
549
0
            self.builder.swap_greed(yes);
550
0
            self
551
0
        }
552
553
        /// This configures verbose mode for the entire pattern.
554
        ///
555
        /// When enabled, whitespace will be treated as insignificant in the
556
        /// pattern and `#` can be used to start a comment until the next new
557
        /// line.
558
        ///
559
        /// Normally, in most places in a pattern, whitespace is treated
560
        /// literally. For example ` +` will match one or more ASCII whitespace
561
        /// characters.
562
        ///
563
        /// When verbose mode is enabled, `\#` can be used to match a literal
564
        /// `#` and `\ ` can be used to match a literal ASCII whitespace
565
        /// character.
566
        ///
567
        /// Verbose mode is useful for permitting regexes to be formatted and
568
        /// broken up more nicely. This may make them more easily readable.
569
        ///
570
        /// This setting can also be configured using the inline flag `x` in
571
        /// the pattern.
572
        ///
573
        /// The default for this is `false`.
574
        ///
575
        /// # Example
576
        ///
577
        /// ```
578
        /// use regex::RegexBuilder;
579
        ///
580
        /// let pat = r"
581
        ///     \b
582
        ///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
583
        ///     [\s--\n]+                   # whitespace should separate names
584
        ///     (?: # middle name can be an initial!
585
        ///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
586
        ///         [\s--\n]+
587
        ///     )?
588
        ///     (?<last>\p{Uppercase}\w*)
589
        ///     \b
590
        /// ";
591
        /// let re = RegexBuilder::new(pat)
592
        ///     .ignore_whitespace(true)
593
        ///     .build()
594
        ///     .unwrap();
595
        ///
596
        /// let caps = re.captures("Harry Potter").unwrap();
597
        /// assert_eq!("Harry", &caps["first"]);
598
        /// assert_eq!("Potter", &caps["last"]);
599
        ///
600
        /// let caps = re.captures("Harry J. Potter").unwrap();
601
        /// assert_eq!("Harry", &caps["first"]);
602
        /// // Since a middle name/initial isn't required for an overall match,
603
        /// // we can't assume that 'initial' or 'middle' will be populated!
604
        /// assert_eq!(Some("J"), caps.name("initial").map(|m| m.as_str()));
605
        /// assert_eq!(None, caps.name("middle").map(|m| m.as_str()));
606
        /// assert_eq!("Potter", &caps["last"]);
607
        ///
608
        /// let caps = re.captures("Harry James Potter").unwrap();
609
        /// assert_eq!("Harry", &caps["first"]);
610
        /// // Since a middle name/initial isn't required for an overall match,
611
        /// // we can't assume that 'initial' or 'middle' will be populated!
612
        /// assert_eq!(None, caps.name("initial").map(|m| m.as_str()));
613
        /// assert_eq!(Some("James"), caps.name("middle").map(|m| m.as_str()));
614
        /// assert_eq!("Potter", &caps["last"]);
615
        /// ```
616
0
        pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
617
0
            self.builder.ignore_whitespace(yes);
618
0
            self
619
0
        }
620
621
        /// This configures octal mode for the entire pattern.
622
        ///
623
        /// Octal syntax is a little-known way of uttering Unicode codepoints
624
        /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
625
        /// equivalent patterns, where the last example shows octal syntax.
626
        ///
627
        /// While supporting octal syntax isn't in and of itself a problem,
628
        /// it does make good error messages harder. That is, in PCRE based
629
        /// regex engines, syntax like `\1` invokes a backreference, which is
630
        /// explicitly unsupported by this library. However, many users expect
631
        /// backreferences to be supported. Therefore, when octal support
632
        /// is disabled, the error message will explicitly mention that
633
        /// backreferences aren't supported.
634
        ///
635
        /// The default for this is `false`.
636
        ///
637
        /// # Example
638
        ///
639
        /// ```
640
        /// use regex::RegexBuilder;
641
        ///
642
        /// // Normally this pattern would not compile, with an error message
643
        /// // about backreferences not being supported. But with octal mode
644
        /// // enabled, octal escape sequences work.
645
        /// let re = RegexBuilder::new(r"\141")
646
        ///     .octal(true)
647
        ///     .build()
648
        ///     .unwrap();
649
        /// assert!(re.is_match("a"));
650
        /// ```
651
0
        pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
652
0
            self.builder.octal(yes);
653
0
            self
654
0
        }
655
656
        /// Sets the approximate size limit, in bytes, of the compiled regex.
657
        ///
658
        /// This roughly corresponds to the amount of heap memory, in
659
        /// bytes, occupied by a single regex. If the regex would otherwise
660
        /// approximately exceed this limit, then compiling that regex will
661
        /// fail.
662
        ///
663
        /// The main utility of a method like this is to avoid compiling
664
        /// regexes that use an unexpected amount of resources, such as
665
        /// time and memory. Even if the memory usage of a large regex is
666
        /// acceptable, its search time may not be. Namely, worst case time
667
        /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
668
        /// `n ~ len(haystack)`. That is, search time depends, in part, on the
669
        /// size of the compiled regex. This means that putting a limit on the
670
        /// size of the regex limits how much a regex can impact search time.
671
        ///
672
        /// For more information about regex size limits, see the section on
673
        /// [untrusted inputs](crate#untrusted-input) in the top-level crate
674
        /// documentation.
675
        ///
676
        /// The default for this is some reasonable number that permits most
677
        /// patterns to compile successfully.
678
        ///
679
        /// # Example
680
        ///
681
        /// ```
682
        /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
683
        /// use regex::RegexBuilder;
684
        ///
685
        /// // It may surprise you how big some seemingly small patterns can
686
        /// // be! Since \w is Unicode aware, this generates a regex that can
687
        /// // match approximately 140,000 distinct codepoints.
688
        /// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err());
689
        /// ```
690
0
        pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
691
0
            self.builder.size_limit(bytes);
692
0
            self
693
0
        }
694
695
        /// Set the approximate capacity, in bytes, of the cache of transitions
696
        /// used by the lazy DFA.
697
        ///
698
        /// While the lazy DFA isn't always used, it tends to be the most
699
        /// commonly used regex engine in default configurations. It tends to
700
        /// adopt the performance profile of a fully built DFA, but without the
701
        /// downside of taking worst case exponential time to build.
702
        ///
703
        /// The downside is that it needs to keep a cache of transitions and
704
        /// states that are built while running a search, and this cache
705
        /// can fill up. When it fills up, the cache will reset itself. Any
706
        /// previously generated states and transitions will then need to be
707
        /// re-generated. If this happens too many times, then this library
708
        /// will bail out of using the lazy DFA and switch to a different regex
709
        /// engine.
710
        ///
711
        /// If your regex provokes this particular downside of the lazy DFA,
712
        /// then it may be beneficial to increase its cache capacity. This will
713
        /// potentially reduce the frequency of cache resetting (ideally to
714
        /// `0`). While it won't fix all potential performance problems with
715
        /// the lazy DFA, increasing the cache capacity does fix some.
716
        ///
717
        /// There is no easy way to determine, a priori, whether increasing
718
        /// this cache capacity will help. In general, the larger your regex,
719
        /// the more cache it's likely to use. But that isn't an ironclad rule.
720
        /// For example, a regex like `[01]*1[01]{N}` would normally produce a
721
        /// fully built DFA that is exponential in size with respect to `N`.
722
        /// The lazy DFA will prevent exponential space blow-up, but its cache
723
        /// is likely to fill up, even when it's large and even for smallish
724
        /// values of `N`.
725
        ///
726
        /// If you aren't sure whether this helps or not, it is sensible to
727
        /// set this to some arbitrarily large number in testing, such as
728
        /// `usize::MAX`. Namely, this represents the amount of capacity that
729
        /// *may* be used. It's probably not a good idea to use `usize::MAX` in
730
        /// production though, since it implies there are no controls on heap
731
        /// memory used by this library during a search. In effect, set it to
732
        /// whatever you're willing to allocate for a single regex search.
733
0
        pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
734
0
            // Pass the requested capacity through to the wrapped internal
            // `Builder`; `self` is returned to allow call chaining.
            self.builder.dfa_size_limit(bytes);
735
0
            self
736
0
        }
737
738
        /// Set the nesting limit for this parser.
739
        ///
740
        /// The nesting limit controls how deep the abstract syntax tree is
741
        /// allowed to be. If the AST exceeds the given limit (e.g., with too
742
        /// many nested groups), then an error is returned by the parser.
743
        ///
744
        /// The purpose of this limit is to act as a heuristic to prevent stack
745
        /// overflow for consumers that do structural induction on an AST using
746
        /// explicit recursion. While this crate never does this (instead using
747
        /// constant stack space and moving the call stack to the heap), other
748
        /// crates may.
749
        ///
750
        /// This limit is not checked until the entire AST is parsed.
751
        /// Therefore, if callers want to put a limit on the amount of heap
752
        /// space used, then they should impose a limit on the length, in
753
        /// bytes, of the concrete pattern string. In particular, this is
754
        /// viable since this parser implementation will limit itself to heap
755
        /// space proportional to the length of the pattern string. See also
756
        /// the [untrusted inputs](crate#untrusted-input) section in the
757
        /// top-level crate documentation for more information about this.
758
        ///
759
        /// Note that a nest limit of `0` will return a nest limit error for
760
        /// most patterns but not all. For example, a nest limit of `0` permits
761
        /// `a` but not `ab`, since `ab` requires an explicit concatenation,
762
        /// which results in a nest depth of `1`. In general, a nest limit is
763
        /// not something that manifests in an obvious way in the concrete
764
        /// syntax, therefore, it should not be used in a granular way.
765
        ///
766
        /// # Example
767
        ///
768
        /// ```
769
        /// use regex::RegexBuilder;
770
        ///
771
        /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
772
        /// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
773
        /// ```
774
0
        pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
775
0
            // Hand the nesting limit to the wrapped internal `Builder` and
            // return `self` so the caller can keep chaining setters.
            self.builder.nest_limit(limit);
776
0
            self
777
0
        }
778
    }
779
780
    /// A configurable builder for a [`RegexSet`].
781
    ///
782
    /// This builder can be used to programmatically set flags such as
783
    /// `i` (case insensitive) and `x` (for verbose mode). This builder
784
    /// can also be used to configure things like the line terminator
785
    /// and a size limit on the compiled regular expression.
786
    #[derive(Clone, Debug)]
787
    pub struct RegexSetBuilder {
788
        // The shared internal builder; the methods on this type forward
        // their arguments to it.
        builder: Builder,
789
    }
790
791
    impl RegexSetBuilder {
792
        /// Create a new builder with a default configuration for the given
793
        /// patterns.
794
        ///
795
        /// If the patterns are invalid or exceed the configured size limits,
796
        /// then an error will be returned when [`RegexSetBuilder::build`] is
797
        /// called.
798
0
        pub fn new<I, S>(patterns: I) -> RegexSetBuilder
799
0
        where
800
0
            I: IntoIterator<Item = S>,
801
0
            S: AsRef<str>,
802
0
        {
803
0
            // The pattern iterator is handed unchanged to the internal
            // `Builder`, which this type wraps.
            RegexSetBuilder { builder: Builder::new(patterns) }
804
0
        }
805
806
        /// Compiles the patterns given to `RegexSetBuilder::new` with the
807
        /// configuration set on this builder.
808
        ///
809
        /// If the patterns aren't valid regexes or if a configured size limit
810
        /// was exceeded, then an error is returned.
811
0
        pub fn build(&self) -> Result<RegexSet, Error> {
812
0
            // Delegate to the internal builder's multi-pattern, `&str`-based
            // constructor; its result is returned as-is.
            self.builder.build_many_string()
813
0
        }
814
815
        /// This configures Unicode mode for all of the patterns.
816
        ///
817
        /// Enabling Unicode mode does a number of things:
818
        ///
819
        /// * Most fundamentally, it causes the fundamental atom of matching
820
        /// to be a single codepoint. When Unicode mode is disabled, it's a
821
        /// single byte. For example, when Unicode mode is enabled, `.` will
822
        /// match `💩` once, whereas it will match 4 times when Unicode mode
823
        /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
824
        /// * Case insensitive matching uses Unicode simple case folding rules.
825
        /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
826
        /// available.
827
        /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
828
        /// `\d`.
829
        /// * The word boundary assertions, `\b` and `\B`, use the Unicode
830
        /// definition of a word character.
831
        ///
832
        /// Note that if Unicode mode is disabled, then the regex will fail to
833
        /// compile if it could match invalid UTF-8. For example, when Unicode
834
        /// mode is disabled, then since `.` matches any byte (except for
835
        /// `\n`), then it can match invalid UTF-8 and thus building a regex
836
        /// from it will fail. Another example is `\w` and `\W`. Since `\w` can
837
        /// only match ASCII bytes when Unicode mode is disabled, it's allowed.
838
        /// But `\W` can match more than ASCII bytes, including invalid UTF-8,
839
        /// and so it is not allowed. This restriction can be lifted only by
840
        /// using a [`bytes::RegexSet`](crate::bytes::RegexSet).
841
        ///
842
        /// For more details on the Unicode support in this crate, see the
843
        /// [Unicode section](crate#unicode) in this crate's top-level
844
        /// documentation.
845
        ///
846
        /// The default for this is `true`.
847
        ///
848
        /// # Example
849
        ///
850
        /// ```
851
        /// use regex::RegexSetBuilder;
852
        ///
853
        /// let re = RegexSetBuilder::new([r"\w"])
854
        ///     .unicode(false)
855
        ///     .build()
856
        ///     .unwrap();
857
        /// // Normally greek letters would be included in \w, but since
858
        /// // Unicode mode is disabled, it only matches ASCII letters.
859
        /// assert!(!re.is_match("δ"));
860
        ///
861
        /// let re = RegexSetBuilder::new([r"s"])
862
        ///     .case_insensitive(true)
863
        ///     .unicode(false)
864
        ///     .build()
865
        ///     .unwrap();
866
        /// // Normally 'ſ' is included when searching for 's' case
867
        /// // insensitively due to Unicode's simple case folding rules. But
868
        /// // when Unicode mode is disabled, only ASCII case insensitive rules
869
        /// // are used.
870
        /// assert!(!re.is_match("ſ"));
871
        /// ```
872
0
        pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
873
0
            // Forward the flag to the wrapped internal `Builder`, then return
            // `self` so that setter calls can be chained.
            self.builder.unicode(yes);
874
0
            self
875
0
        }
876
877
        /// This configures whether to enable case insensitive matching for all
878
        /// of the patterns.
879
        ///
880
        /// This setting can also be configured using the inline flag `i`
881
        /// in the pattern. For example, `(?i:foo)` matches `foo` case
882
        /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
883
        ///
884
        /// The default for this is `false`.
885
        ///
886
        /// # Example
887
        ///
888
        /// ```
889
        /// use regex::RegexSetBuilder;
890
        ///
891
        /// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
892
        ///     .case_insensitive(true)
893
        ///     .build()
894
        ///     .unwrap();
895
        /// assert!(re.is_match("FoObarQuUx"));
896
        /// // Even though case insensitive matching is enabled in the builder,
897
        /// // it can be locally disabled within the pattern. In this case,
898
        /// // `bar` is matched case sensitively.
899
        /// assert!(!re.is_match("fooBARquux"));
900
        /// ```
901
0
        pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
902
0
            // Forward the flag to the wrapped internal `Builder`, then return
            // `self` so that setter calls can be chained.
            self.builder.case_insensitive(yes);
903
0
            self
904
0
        }
905
906
        /// This configures multi-line mode for all of the patterns.
907
        ///
908
        /// Enabling multi-line mode changes the behavior of the `^` and `$`
909
        /// anchor assertions. Instead of only matching at the beginning and
910
        /// end of a haystack, respectively, multi-line mode causes them to
911
        /// match at the beginning and end of a line *in addition* to the
912
        /// beginning and end of a haystack. More precisely, `^` will match at
913
        /// the position immediately following a `\n` and `$` will match at the
914
        /// position immediately preceding a `\n`.
915
        ///
916
        /// The behavior of this option can be impacted by other settings too:
917
        ///
918
        /// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
919
        /// above to any ASCII byte.
920
        /// * The [`RegexSetBuilder::crlf`] option changes the line terminator
921
        /// to be either `\r` or `\n`, but never at the position between a `\r`
922
        /// and `\n`.
923
        ///
924
        /// This setting can also be configured using the inline flag `m` in
925
        /// the pattern.
926
        ///
927
        /// The default for this is `false`.
928
        ///
929
        /// # Example
930
        ///
931
        /// ```
932
        /// use regex::RegexSetBuilder;
933
        ///
934
        /// let re = RegexSetBuilder::new([r"^foo$"])
935
        ///     .multi_line(true)
936
        ///     .build()
937
        ///     .unwrap();
938
        /// assert!(re.is_match("\nfoo\n"));
939
        /// ```
940
0
        pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
941
0
            // Forward the flag to the wrapped internal `Builder`, then return
            // `self` so that setter calls can be chained.
            self.builder.multi_line(yes);
942
0
            self
943
0
        }
944
945
        /// This configures dot-matches-new-line mode for all of the patterns.
946
        ///
947
        /// Perhaps surprisingly, the default behavior for `.` is not to match
948
        /// any character, but rather, to match any character except for the
949
        /// line terminator (which is `\n` by default). When this mode is
950
        /// enabled, the behavior changes such that `.` truly matches any
951
        /// character.
952
        ///
953
        /// This setting can also be configured using the inline flag `s` in
954
        /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
955
        /// regexes.
956
        ///
957
        /// The default for this is `false`.
958
        ///
959
        /// # Example
960
        ///
961
        /// ```
962
        /// use regex::RegexSetBuilder;
963
        ///
964
        /// let re = RegexSetBuilder::new([r"foo.bar"])
965
        ///     .dot_matches_new_line(true)
966
        ///     .build()
967
        ///     .unwrap();
968
        /// let hay = "foo\nbar";
969
        /// assert!(re.is_match(hay));
970
        /// ```
971
0
        pub fn dot_matches_new_line(
972
0
            &mut self,
973
0
            yes: bool,
974
0
        ) -> &mut RegexSetBuilder {
975
0
            // Forward the flag to the wrapped internal `Builder`, then return
            // `self` so that setter calls can be chained.
            self.builder.dot_matches_new_line(yes);
976
0
            self
977
0
        }
978
979
        /// This configures CRLF mode for all of the patterns.
980
        ///
981
        /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
982
        /// short) and `\n` ("line feed" or LF for short) are treated as line
983
        /// terminators. This results in the following:
984
        ///
985
        /// * Unless dot-matches-new-line mode is enabled, `.` will now match
986
        /// any character except for `\n` and `\r`.
987
        /// * When multi-line mode is enabled, `^` will match immediately
988
        /// following a `\n` or a `\r`. Similarly, `$` will match immediately
989
        /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
990
        /// between `\r` and `\n`.
991
        ///
992
        /// This setting can also be configured using the inline flag `R` in
993
        /// the pattern.
994
        ///
995
        /// The default for this is `false`.
996
        ///
997
        /// # Example
998
        ///
999
        /// ```
1000
        /// use regex::RegexSetBuilder;
1001
        ///
1002
        /// let re = RegexSetBuilder::new([r"^foo$"])
1003
        ///     .multi_line(true)
1004
        ///     .crlf(true)
1005
        ///     .build()
1006
        ///     .unwrap();
1007
        /// let hay = "\r\nfoo\r\n";
1008
        /// // If CRLF mode weren't enabled here, then '$' wouldn't match
1009
        /// // immediately after 'foo', and thus no match would be found.
1010
        /// assert!(re.is_match(hay));
1011
        /// ```
1012
        ///
1013
        /// This example demonstrates that `^` will never match at a position
1014
        /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
1015
        /// and a `\n`.)
1016
        ///
1017
        /// ```
1018
        /// use regex::RegexSetBuilder;
1019
        ///
1020
        /// let re = RegexSetBuilder::new([r"^\n"])
1021
        ///     .multi_line(true)
1022
        ///     .crlf(true)
1023
        ///     .build()
1024
        ///     .unwrap();
1025
        /// assert!(!re.is_match("\r\n"));
1026
        /// ```
1027
0
        pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
1028
0
            // Forward the flag to the wrapped internal `Builder`, then return
            // `self` so that setter calls can be chained.
            self.builder.crlf(yes);
1029
0
            self
1030
0
        }
1031
1032
        /// Configures the line terminator to be used by the regex.
1033
        ///
1034
        /// The line terminator is relevant in two ways for a particular regex:
1035
        ///
1036
        /// * When dot-matches-new-line mode is *not* enabled (the default),
1037
        /// then `.` will match any character except for the configured line
1038
        /// terminator.
1039
        /// * When multi-line mode is enabled (not the default), then `^` and
1040
        /// `$` will match immediately after and before, respectively, a line
1041
        /// terminator.
1042
        ///
1043
        /// In both cases, if CRLF mode is enabled in a particular context,
1044
        /// then it takes precedence over any configured line terminator.
1045
        ///
1046
        /// This option cannot be configured from within the pattern.
1047
        ///
1048
        /// The default line terminator is `\n`.
1049
        ///
1050
        /// # Example
1051
        ///
1052
        /// This shows how to treat the NUL byte as a line terminator. This can
1053
        /// be a useful heuristic when searching binary data.
1054
        ///
1055
        /// ```
1056
        /// use regex::RegexSetBuilder;
1057
        ///
1058
        /// let re = RegexSetBuilder::new([r"^foo$"])
1059
        ///     .multi_line(true)
1060
        ///     .line_terminator(b'\x00')
1061
        ///     .build()
1062
        ///     .unwrap();
1063
        /// let hay = "\x00foo\x00";
1064
        /// assert!(re.is_match(hay));
1065
        /// ```
1066
        ///
1067
        /// This example shows that the behavior of `.` is impacted by this
1068
        /// setting as well:
1069
        ///
1070
        /// ```
1071
        /// use regex::RegexSetBuilder;
1072
        ///
1073
        /// let re = RegexSetBuilder::new([r"."])
1074
        ///     .line_terminator(b'\x00')
1075
        ///     .build()
1076
        ///     .unwrap();
1077
        /// assert!(re.is_match("\n"));
1078
        /// assert!(!re.is_match("\x00"));
1079
        /// ```
1080
        ///
1081
        /// This shows that building a regex will fail if the byte given
1082
        /// is not ASCII and the pattern could result in matching invalid
1083
        /// UTF-8. This is because any singular non-ASCII byte is not valid
1084
        /// UTF-8, and it is not permitted for a [`RegexSet`] to match invalid
1085
        /// UTF-8. (It is permissible to use a non-ASCII byte when building a
1086
        /// [`bytes::RegexSet`](crate::bytes::RegexSet).)
1087
        ///
1088
        /// ```
1089
        /// use regex::RegexSetBuilder;
1090
        ///
1091
        /// assert!(
1092
        ///     RegexSetBuilder::new([r"."])
1093
        ///         .line_terminator(0x80)
1094
        ///         .build()
1095
        ///         .is_err()
1096
        /// );
1097
        /// // Note that using a non-ASCII byte isn't enough on its own to
1098
        /// // cause regex compilation to fail. You actually have to make use
1099
        /// // of it in the regex in a way that leads to matching invalid
1100
        /// // UTF-8. If you don't, then regex compilation will succeed!
1101
        /// assert!(
1102
        ///     RegexSetBuilder::new([r"a"])
1103
        ///         .line_terminator(0x80)
1104
        ///         .build()
1105
        ///         .is_ok()
1106
        /// );
1107
        /// ```
1108
0
        pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
1109
0
            // Forward the byte to the wrapped internal `Builder` without any
            // checking here, then return `self` for chaining.
            self.builder.line_terminator(byte);
1110
0
            self
1111
0
        }
1112
1113
        /// This configures swap-greed mode for all of the patterns.
1114
        ///
1115
        /// When swap-greed mode is enabled, patterns like `a+` will become
1116
        /// non-greedy and patterns like `a+?` will become greedy. In other
1117
        /// words, the meanings of `a+` and `a+?` are switched.
1118
        ///
1119
        /// This setting can also be configured using the inline flag `U` in
1120
        /// the pattern.
1121
        ///
1122
        /// Note that this is generally not useful for a `RegexSet` since a
1123
        /// `RegexSet` can only report whether a pattern matches or not. Since
1124
        /// greediness never impacts whether a match is found or not (only the
1125
        /// offsets of the match), it follows that whether parts of a pattern
1126
        /// are greedy or not doesn't matter for a `RegexSet`.
1127
        ///
1128
        /// The default for this is `false`.
1129
0
        pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
1130
0
            // Forward the flag to the wrapped internal `Builder`, then return
            // `self` so that setter calls can be chained.
            self.builder.swap_greed(yes);
1131
0
            self
1132
0
        }
1133
1134
        /// This configures verbose mode for all of the patterns.
1135
        ///
1136
        /// When enabled, whitespace will be treated as insignificant in the
1137
        /// pattern and `#` can be used to start a comment until the next new
1138
        /// line.
1139
        ///
1140
        /// Normally, in most places in a pattern, whitespace is treated
1141
        /// literally. For example ` +` will match one or more ASCII whitespace
1142
        /// characters.
1143
        ///
1144
        /// When verbose mode is enabled, `\#` can be used to match a literal
1145
        /// `#` and `\ ` can be used to match a literal ASCII whitespace
1146
        /// character.
1147
        ///
1148
        /// Verbose mode is useful for permitting regexes to be formatted and
1149
        /// broken up more nicely. This may make them more easily readable.
1150
        ///
1151
        /// This setting can also be configured using the inline flag `x` in
1152
        /// the pattern.
1153
        ///
1154
        /// The default for this is `false`.
1155
        ///
1156
        /// # Example
1157
        ///
1158
        /// ```
1159
        /// use regex::RegexSetBuilder;
1160
        ///
1161
        /// let pat = r"
1162
        ///     \b
1163
        ///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
1164
        ///     [\s--\n]+                   # whitespace should separate names
1165
        ///     (?: # middle name can be an initial!
1166
        ///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
1167
        ///         [\s--\n]+
1168
        ///     )?
1169
        ///     (?<last>\p{Uppercase}\w*)
1170
        ///     \b
1171
        /// ";
1172
        /// let re = RegexSetBuilder::new([pat])
1173
        ///     .ignore_whitespace(true)
1174
        ///     .build()
1175
        ///     .unwrap();
1176
        /// assert!(re.is_match("Harry Potter"));
1177
        /// assert!(re.is_match("Harry J. Potter"));
1178
        /// assert!(re.is_match("Harry James Potter"));
1179
        /// assert!(!re.is_match("harry J. Potter"));
1180
        /// ```
1181
0
        pub fn ignore_whitespace(
1182
0
            &mut self,
1183
0
            yes: bool,
1184
0
        ) -> &mut RegexSetBuilder {
1185
0
            // Forward the flag to the wrapped internal `Builder`, then return
            // `self` so that setter calls can be chained.
            self.builder.ignore_whitespace(yes);
1186
0
            self
1187
0
        }
1188
1189
        /// This configures octal mode for all of the patterns.
1190
        ///
1191
        /// Octal syntax is a little-known way of uttering Unicode codepoints
1192
        /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
1193
        /// equivalent patterns, where the last example shows octal syntax.
1194
        ///
1195
        /// While supporting octal syntax isn't in and of itself a problem,
1196
        /// it does make good error messages harder. That is, in PCRE based
1197
        /// regex engines, syntax like `\1` invokes a backreference, which is
1198
        /// explicitly unsupported by this library. However, many users expect
1199
        /// backreferences to be supported. Therefore, when octal support
1200
        /// is disabled, the error message will explicitly mention that
1201
        /// backreferences aren't supported.
1202
        ///
1203
        /// The default for this is `false`.
1204
        ///
1205
        /// # Example
1206
        ///
1207
        /// ```
1208
        /// use regex::RegexSetBuilder;
1209
        ///
1210
        /// // Normally this pattern would not compile, with an error message
1211
        /// // about backreferences not being supported. But with octal mode
1212
        /// // enabled, octal escape sequences work.
1213
        /// let re = RegexSetBuilder::new([r"\141"])
1214
        ///     .octal(true)
1215
        ///     .build()
1216
        ///     .unwrap();
1217
        /// assert!(re.is_match("a"));
1218
        /// ```
1219
0
        pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
1220
0
            // Forward the flag to the wrapped internal `Builder`, then return
            // `self` so that setter calls can be chained.
            self.builder.octal(yes);
1221
0
            self
1222
0
        }
1223
1224
        /// Sets the approximate size limit, in bytes, of the compiled regex.
1225
        ///
1226
        /// This roughly corresponds to the amount of heap memory, in
1227
        /// bytes, occupied by a single regex. If the regex would otherwise
1228
        /// approximately exceed this limit, then compiling that regex will
1229
        /// fail.
1230
        ///
1231
        /// The main utility of a method like this is to avoid compiling
1232
        /// regexes that use an unexpected amount of resources, such as
1233
        /// time and memory. Even if the memory usage of a large regex is
1234
        /// acceptable, its search time may not be. Namely, worst case time
1235
        /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
1236
        /// `n ~ len(haystack)`. That is, search time depends, in part, on the
1237
        /// size of the compiled regex. This means that putting a limit on the
1238
        /// size of the regex limits how much a regex can impact search time.
1239
        ///
1240
        /// For more information about regex size limits, see the section on
1241
        /// [untrusted inputs](crate#untrusted-input) in the top-level crate
1242
        /// documentation.
1243
        ///
1244
        /// The default for this is some reasonable number that permits most
1245
        /// patterns to compile successfully.
1246
        ///
1247
        /// # Example
1248
        ///
1249
        /// ```
1250
        /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
1251
        /// use regex::RegexSetBuilder;
1252
        ///
1253
        /// // It may surprise you how big some seemingly small patterns can
1254
        /// // be! Since \w is Unicode aware, this generates a regex that can
1255
        /// // match approximately 140,000 distinct codepoints.
1256
        /// assert!(
1257
        ///     RegexSetBuilder::new([r"\w"])
1258
        ///         .size_limit(45_000)
1259
        ///         .build()
1260
        ///         .is_err()
1261
        /// );
1262
        /// ```
1263
0
        pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
1264
0
            // Forward the limit to the wrapped internal `Builder`, then hand
            // back `self` so that further setter calls can be chained.
            self.builder.size_limit(bytes);
1265
0
            self
1266
0
        }
1267
1268
        /// Set the approximate capacity, in bytes, of the cache of transitions
1269
        /// used by the lazy DFA.
1270
        ///
1271
        /// While the lazy DFA isn't always used, it tends to be the most
1272
        /// commonly used regex engine in default configurations. It tends to
1273
        /// adopt the performance profile of a fully built DFA, but without the
1274
        /// downside of taking worst case exponential time to build.
1275
        ///
1276
        /// The downside is that it needs to keep a cache of transitions and
1277
        /// states that are built while running a search, and this cache
1278
        /// can fill up. When it fills up, the cache will reset itself. Any
1279
        /// previously generated states and transitions will then need to be
1280
        /// re-generated. If this happens too many times, then this library
1281
        /// will bail out of using the lazy DFA and switch to a different regex
1282
        /// engine.
1283
        ///
1284
        /// If your regex provokes this particular downside of the lazy DFA,
1285
        /// then it may be beneficial to increase its cache capacity. This will
1286
        /// potentially reduce the frequency of cache resetting (ideally to
1287
        /// `0`). While it won't fix all potential performance problems with
1288
        /// the lazy DFA, increasing the cache capacity does fix some.
1289
        ///
1290
        /// There is no easy way to determine, a priori, whether increasing
1291
        /// this cache capacity will help. In general, the larger your regex,
1292
        /// the more cache it's likely to use. But that isn't an ironclad rule.
1293
        /// For example, a regex like `[01]*1[01]{N}` would normally produce a
1294
        /// fully built DFA that is exponential in size with respect to `N`.
1295
        /// The lazy DFA will prevent exponential space blow-up, but its cache
1296
        /// is likely to fill up, even when it's large and even for smallish
1297
        /// values of `N`.
1298
        ///
1299
        /// If you aren't sure whether this helps or not, it is sensible to
1300
        /// set this to some arbitrarily large number in testing, such as
1301
        /// `usize::MAX`. Namely, this represents the amount of capacity that
1302
        /// *may* be used. It's probably not a good idea to use `usize::MAX` in
1303
        /// production though, since it implies there are no controls on heap
1304
        /// memory used by this library during a search. In effect, set it to
1305
        /// whatever you're willing to allocate for a single regex search.
1306
0
        pub fn dfa_size_limit(
1307
0
            &mut self,
1308
0
            bytes: usize,
1309
0
        ) -> &mut RegexSetBuilder {
1310
0
            // Pass the requested capacity through to the wrapped internal
            // `Builder`; `self` is returned to allow call chaining.
            self.builder.dfa_size_limit(bytes);
1311
0
            self
1312
0
        }
1313
1314
        /// Set the nesting limit for this parser.
1315
        ///
1316
        /// The nesting limit controls how deep the abstract syntax tree is
1317
        /// allowed to be. If the AST exceeds the given limit (e.g., with too
1318
        /// many nested groups), then an error is returned by the parser.
1319
        ///
1320
        /// The purpose of this limit is to act as a heuristic to prevent stack
1321
        /// overflow for consumers that do structural induction on an AST using
1322
        /// explicit recursion. While this crate never does this (instead using
1323
        /// constant stack space and moving the call stack to the heap), other
1324
        /// crates may.
1325
        ///
1326
        /// This limit is not checked until the entire AST is parsed.
1327
        /// Therefore, if callers want to put a limit on the amount of heap
1328
        /// space used, then they should impose a limit on the length, in
1329
        /// bytes, of the concrete pattern string. In particular, this is
1330
        /// viable since this parser implementation will limit itself to heap
1331
        /// space proportional to the length of the pattern string. See also
1332
        /// the [untrusted inputs](crate#untrusted-input) section in the
1333
        /// top-level crate documentation for more information about this.
1334
        ///
1335
        /// Note that a nest limit of `0` will return a nest limit error for
1336
        /// most patterns but not all. For example, a nest limit of `0` permits
1337
        /// `a` but not `ab`, since `ab` requires an explicit concatenation,
1338
        /// which results in a nest depth of `1`. In general, a nest limit is
1339
        /// not something that manifests in an obvious way in the concrete
1340
        /// syntax, therefore, it should not be used in a granular way.
1341
        ///
1342
        /// # Example
1343
        ///
1344
        /// ```
1345
        /// use regex::RegexSetBuilder;
1346
        ///
1347
        /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok());
1348
        /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err());
1349
        /// ```
1350
0
        pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
1351
0
            // Hand the nesting limit to the wrapped internal `Builder` and
            // return `self` so the caller can keep chaining setters.
            self.builder.nest_limit(limit);
1352
0
            self
1353
0
        }
1354
    }
1355
}
1356
1357
pub(crate) mod bytes {
1358
    use crate::{
1359
        bytes::{Regex, RegexSet},
1360
        error::Error,
1361
    };
1362
1363
    use super::Builder;
1364
1365
    /// A configurable builder for a [`Regex`].
1366
    ///
1367
    /// This builder can be used to programmatically set flags such as `i`
1368
    /// (case insensitive) and `x` (for verbose mode). This builder can also be
1369
    /// used to configure things like the line terminator and a size limit on
1370
    /// the compiled regular expression.
1371
    #[derive(Clone, Debug)]
1372
    pub struct RegexBuilder {
1373
        // The shared internal builder (`super::Builder`) that this public
        // type wraps.
        builder: Builder,
1374
    }
1375
1376
    impl RegexBuilder {
1377
        /// Create a new builder with a default configuration for the given
1378
        /// pattern.
1379
        ///
1380
        /// If the pattern is invalid or exceeds the configured size limits,
1381
        /// then an error will be returned when [`RegexBuilder::build`] is
1382
        /// called.
1383
0
        pub fn new(pattern: &str) -> RegexBuilder {
1384
0
            RegexBuilder { builder: Builder::new([pattern]) }
1385
0
        }
1386
1387
        /// Compiles the pattern given to `RegexBuilder::new` with the
1388
        /// configuration set on this builder.
1389
        ///
1390
        /// If the pattern isn't a valid regex or if a configured size limit
1391
        /// was exceeded, then an error is returned.
1392
0
        pub fn build(&self) -> Result<Regex, Error> {
1393
0
            self.builder.build_one_bytes()
1394
0
        }
1395
1396
        /// This configures Unicode mode for the entire pattern.
1397
        ///
1398
        /// Enabling Unicode mode does a number of things:
1399
        ///
1400
        /// * Most fundamentally, it causes the fundamental atom of matching
1401
        /// to be a single codepoint. When Unicode mode is disabled, it's a
1402
        /// single byte. For example, when Unicode mode is enabled, `.` will
1403
        /// match `💩` once, whereas it will match 4 times when Unicode mode
1404
        /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
1405
        /// * Case insensitive matching uses Unicode simple case folding rules.
1406
        /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
1407
        /// available.
1408
        /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
1409
        /// `\d`.
1410
        /// * The word boundary assertions, `\b` and `\B`, use the Unicode
1411
        /// definition of a word character.
1412
        ///
1413
        /// Note that unlike the top-level `Regex` for searching `&str`, it
1414
        /// is permitted to disable Unicode mode even if the resulting pattern
1415
        /// could match invalid UTF-8. For example, `(?-u:.)` is not a valid
1416
        /// pattern for a top-level `Regex`, but is valid for a `bytes::Regex`.
1417
        ///
1418
        /// For more details on the Unicode support in this crate, see the
1419
        /// [Unicode section](crate#unicode) in this crate's top-level
1420
        /// documentation.
1421
        ///
1422
        /// The default for this is `true`.
1423
        ///
1424
        /// # Example
1425
        ///
1426
        /// ```
1427
        /// use regex::bytes::RegexBuilder;
1428
        ///
1429
        /// let re = RegexBuilder::new(r"\w")
1430
        ///     .unicode(false)
1431
        ///     .build()
1432
        ///     .unwrap();
1433
        /// // Normally greek letters would be included in \w, but since
1434
        /// // Unicode mode is disabled, it only matches ASCII letters.
1435
        /// assert!(!re.is_match("δ".as_bytes()));
1436
        ///
1437
        /// let re = RegexBuilder::new(r"s")
1438
        ///     .case_insensitive(true)
1439
        ///     .unicode(false)
1440
        ///     .build()
1441
        ///     .unwrap();
1442
        /// // Normally 'ſ' is included when searching for 's' case
1443
        /// // insensitively due to Unicode's simple case folding rules. But
1444
        /// // when Unicode mode is disabled, only ASCII case insensitive rules
1445
        /// // are used.
1446
        /// assert!(!re.is_match("ſ".as_bytes()));
1447
        /// ```
1448
        ///
1449
        /// Since this builder is for constructing a [`bytes::Regex`](Regex),
1450
        /// one can disable Unicode mode even if it would match invalid UTF-8:
1451
        ///
1452
        /// ```
1453
        /// use regex::bytes::RegexBuilder;
1454
        ///
1455
        /// let re = RegexBuilder::new(r".")
1456
        ///     .unicode(false)
1457
        ///     .build()
1458
        ///     .unwrap();
1459
        /// // Since Unicode mode is disabled, `.` matches any single byte
1460
        /// // except for `\n`, including bytes that are invalid UTF-8.
1461
        /// assert!(re.is_match(b"\xFF"));
1462
        /// ```
1463
0
        pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
1464
0
            self.builder.unicode(yes);
1465
0
            self
1466
0
        }
1467
1468
        /// This configures whether to enable case insensitive matching for the
1469
        /// entire pattern.
1470
        ///
1471
        /// This setting can also be configured using the inline flag `i`
1472
        /// in the pattern. For example, `(?i:foo)` matches `foo` case
1473
        /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
1474
        ///
1475
        /// The default for this is `false`.
1476
        ///
1477
        /// # Example
1478
        ///
1479
        /// ```
1480
        /// use regex::bytes::RegexBuilder;
1481
        ///
1482
        /// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
1483
        ///     .case_insensitive(true)
1484
        ///     .build()
1485
        ///     .unwrap();
1486
        /// assert!(re.is_match(b"FoObarQuUx"));
1487
        /// // Even though case insensitive matching is enabled in the builder,
1488
        /// // it can be locally disabled within the pattern. In this case,
1489
        /// // `bar` is matched case sensitively.
1490
        /// assert!(!re.is_match(b"fooBARquux"));
1491
        /// ```
1492
0
        pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
1493
0
            self.builder.case_insensitive(yes);
1494
0
            self
1495
0
        }
1496
1497
        /// This configures multi-line mode for the entire pattern.
1498
        ///
1499
        /// Enabling multi-line mode changes the behavior of the `^` and `$`
1500
        /// anchor assertions. Instead of only matching at the beginning and
1501
        /// end of a haystack, respectively, multi-line mode causes them to
1502
        /// match at the beginning and end of a line *in addition* to the
1503
        /// beginning and end of a haystack. More precisely, `^` will match at
1504
        /// the position immediately following a `\n` and `$` will match at the
1505
        /// position immediately preceding a `\n`.
1506
        ///
1507
        /// The behavior of this option can be impacted by other settings too:
1508
        ///
1509
        /// * The [`RegexBuilder::line_terminator`] option changes `\n` above
1510
        /// to any ASCII byte.
1511
        /// * The [`RegexBuilder::crlf`] option changes the line terminator to
1512
        /// be either `\r` or `\n`, but never at the position between a `\r`
1513
        /// and `\n`.
1514
        ///
1515
        /// This setting can also be configured using the inline flag `m` in
1516
        /// the pattern.
1517
        ///
1518
        /// The default for this is `false`.
1519
        ///
1520
        /// # Example
1521
        ///
1522
        /// ```
1523
        /// use regex::bytes::RegexBuilder;
1524
        ///
1525
        /// let re = RegexBuilder::new(r"^foo$")
1526
        ///     .multi_line(true)
1527
        ///     .build()
1528
        ///     .unwrap();
1529
        /// assert_eq!(Some(1..4), re.find(b"\nfoo\n").map(|m| m.range()));
1530
        /// ```
1531
0
        pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
1532
0
            self.builder.multi_line(yes);
1533
0
            self
1534
0
        }
1535
1536
        /// This configures dot-matches-new-line mode for the entire pattern.
1537
        ///
1538
        /// Perhaps surprisingly, the default behavior for `.` is not to match
1539
        /// any character, but rather, to match any character except for the
1540
        /// line terminator (which is `\n` by default). When this mode is
1541
        /// enabled, the behavior changes such that `.` truly matches any
1542
        /// character.
1543
        ///
1544
        /// This setting can also be configured using the inline flag `s` in
1545
        /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
1546
        /// regexes.
1547
        ///
1548
        /// The default for this is `false`.
1549
        ///
1550
        /// # Example
1551
        ///
1552
        /// ```
1553
        /// use regex::bytes::RegexBuilder;
1554
        ///
1555
        /// let re = RegexBuilder::new(r"foo.bar")
1556
        ///     .dot_matches_new_line(true)
1557
        ///     .build()
1558
        ///     .unwrap();
1559
        /// let hay = b"foo\nbar";
1560
        /// assert_eq!(Some(&b"foo\nbar"[..]), re.find(hay).map(|m| m.as_bytes()));
1561
        /// ```
1562
0
        pub fn dot_matches_new_line(
1563
0
            &mut self,
1564
0
            yes: bool,
1565
0
        ) -> &mut RegexBuilder {
1566
0
            self.builder.dot_matches_new_line(yes);
1567
0
            self
1568
0
        }
1569
1570
        /// This configures CRLF mode for the entire pattern.
1571
        ///
1572
        /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
1573
        /// short) and `\n` ("line feed" or LF for short) are treated as line
1574
        /// terminators. This results in the following:
1575
        ///
1576
        /// * Unless dot-matches-new-line mode is enabled, `.` will now match
1577
        /// any character except for `\n` and `\r`.
1578
        /// * When multi-line mode is enabled, `^` will match immediately
1579
        /// following a `\n` or a `\r`. Similarly, `$` will match immediately
1580
        /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
1581
        /// between `\r` and `\n`.
1582
        ///
1583
        /// This setting can also be configured using the inline flag `R` in
1584
        /// the pattern.
1585
        ///
1586
        /// The default for this is `false`.
1587
        ///
1588
        /// # Example
1589
        ///
1590
        /// ```
1591
        /// use regex::bytes::RegexBuilder;
1592
        ///
1593
        /// let re = RegexBuilder::new(r"^foo$")
1594
        ///     .multi_line(true)
1595
        ///     .crlf(true)
1596
        ///     .build()
1597
        ///     .unwrap();
1598
        /// let hay = b"\r\nfoo\r\n";
1599
        /// // If CRLF mode weren't enabled here, then '$' wouldn't match
1600
        /// // immediately after 'foo', and thus no match would be found.
1601
        /// assert_eq!(Some(&b"foo"[..]), re.find(hay).map(|m| m.as_bytes()));
1602
        /// ```
1603
        ///
1604
        /// This example demonstrates that `^` will never match at a position
1605
        /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
1606
        /// and a `\n`.)
1607
        ///
1608
        /// ```
1609
        /// use regex::bytes::RegexBuilder;
1610
        ///
1611
        /// let re = RegexBuilder::new(r"^")
1612
        ///     .multi_line(true)
1613
        ///     .crlf(true)
1614
        ///     .build()
1615
        ///     .unwrap();
1616
        /// let hay = b"\r\n\r\n";
1617
        /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
1618
        /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]);
1619
        /// ```
1620
0
        pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
1621
0
            self.builder.crlf(yes);
1622
0
            self
1623
0
        }
1624
1625
        /// Configures the line terminator to be used by the regex.
1626
        ///
1627
        /// The line terminator is relevant in two ways for a particular regex:
1628
        ///
1629
        /// * When dot-matches-new-line mode is *not* enabled (the default),
1630
        /// then `.` will match any character except for the configured line
1631
        /// terminator.
1632
        /// * When multi-line mode is enabled (not the default), then `^` and
1633
        /// `$` will match immediately after and before, respectively, a line
1634
        /// terminator.
1635
        ///
1636
        /// In both cases, if CRLF mode is enabled in a particular context,
1637
        /// then it takes precedence over any configured line terminator.
1638
        ///
1639
        /// This option cannot be configured from within the pattern.
1640
        ///
1641
        /// The default line terminator is `\n`.
1642
        ///
1643
        /// # Example
1644
        ///
1645
        /// This shows how to treat the NUL byte as a line terminator. This can
1646
        /// be a useful heuristic when searching binary data.
1647
        ///
1648
        /// ```
1649
        /// use regex::bytes::RegexBuilder;
1650
        ///
1651
        /// let re = RegexBuilder::new(r"^foo$")
1652
        ///     .multi_line(true)
1653
        ///     .line_terminator(b'\x00')
1654
        ///     .build()
1655
        ///     .unwrap();
1656
        /// let hay = b"\x00foo\x00";
1657
        /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range()));
1658
        /// ```
1659
        ///
1660
        /// This example shows that the behavior of `.` is impacted by this
1661
        /// setting as well:
1662
        ///
1663
        /// ```
1664
        /// use regex::bytes::RegexBuilder;
1665
        ///
1666
        /// let re = RegexBuilder::new(r".")
1667
        ///     .line_terminator(b'\x00')
1668
        ///     .build()
1669
        ///     .unwrap();
1670
        /// assert!(re.is_match(b"\n"));
1671
        /// assert!(!re.is_match(b"\x00"));
1672
        /// ```
1673
        ///
1674
        /// This shows that building a regex will work even when the byte
1675
        /// given is not ASCII. This is unlike the top-level `Regex` API where
1676
        /// matching invalid UTF-8 is not allowed.
1677
        ///
1678
        /// Note though that you must disable Unicode mode. This is required
1679
        /// because Unicode mode requires matching one codepoint at a time,
1680
        /// and there is no way to match a non-ASCII byte as if it were a
1681
        /// codepoint.
1682
        ///
1683
        /// ```
1684
        /// use regex::bytes::RegexBuilder;
1685
        ///
1686
        /// assert!(
1687
        ///     RegexBuilder::new(r".")
1688
        ///         .unicode(false)
1689
        ///         .line_terminator(0x80)
1690
        ///         .build()
1691
        ///         .is_ok(),
1692
        /// );
1693
        /// ```
1694
0
        pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
1695
0
            self.builder.line_terminator(byte);
1696
0
            self
1697
0
        }
1698
1699
        /// This configures swap-greed mode for the entire pattern.
1700
        ///
1701
        /// When swap-greed mode is enabled, patterns like `a+` will become
1702
        /// non-greedy and patterns like `a+?` will become greedy. In other
1703
        /// words, the meanings of `a+` and `a+?` are switched.
1704
        ///
1705
        /// This setting can also be configured using the inline flag `U` in
1706
        /// the pattern.
1707
        ///
1708
        /// The default for this is `false`.
1709
        ///
1710
        /// # Example
1711
        ///
1712
        /// ```
1713
        /// use regex::bytes::RegexBuilder;
1714
        ///
1715
        /// let re = RegexBuilder::new(r"a+")
1716
        ///     .swap_greed(true)
1717
        ///     .build()
1718
        ///     .unwrap();
1719
        /// assert_eq!(Some(&b"a"[..]), re.find(b"aaa").map(|m| m.as_bytes()));
1720
        /// ```
1721
0
        pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
1722
0
            self.builder.swap_greed(yes);
1723
0
            self
1724
0
        }
1725
1726
        /// This configures verbose mode for the entire pattern.
1727
        ///
1728
        /// When enabled, whitespace will be treated as insignificant in the
1729
        /// pattern and `#` can be used to start a comment until the next new
1730
        /// line.
1731
        ///
1732
        /// Normally, in most places in a pattern, whitespace is treated
1733
        /// literally. For example ` +` will match one or more ASCII whitespace
1734
        /// characters.
1735
        ///
1736
        /// When verbose mode is enabled, `\#` can be used to match a literal
1737
        /// `#` and `\ ` can be used to match a literal ASCII whitespace
1738
        /// character.
1739
        ///
1740
        /// Verbose mode is useful for permitting regexes to be formatted and
1741
        /// broken up more nicely. This may make them more easily readable.
1742
        ///
1743
        /// This setting can also be configured using the inline flag `x` in
1744
        /// the pattern.
1745
        ///
1746
        /// The default for this is `false`.
1747
        ///
1748
        /// # Example
1749
        ///
1750
        /// ```
1751
        /// use regex::bytes::RegexBuilder;
1752
        ///
1753
        /// let pat = r"
1754
        ///     \b
1755
        ///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
1756
        ///     [\s--\n]+                   # whitespace should separate names
1757
        ///     (?: # middle name can be an initial!
1758
        ///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
1759
        ///         [\s--\n]+
1760
        ///     )?
1761
        ///     (?<last>\p{Uppercase}\w*)
1762
        ///     \b
1763
        /// ";
1764
        /// let re = RegexBuilder::new(pat)
1765
        ///     .ignore_whitespace(true)
1766
        ///     .build()
1767
        ///     .unwrap();
1768
        ///
1769
        /// let caps = re.captures(b"Harry Potter").unwrap();
1770
        /// assert_eq!(&b"Harry"[..], &caps["first"]);
1771
        /// assert_eq!(&b"Potter"[..], &caps["last"]);
1772
        ///
1773
        /// let caps = re.captures(b"Harry J. Potter").unwrap();
1774
        /// assert_eq!(&b"Harry"[..], &caps["first"]);
1775
        /// // Since a middle name/initial isn't required for an overall match,
1776
        /// // we can't assume that 'initial' or 'middle' will be populated!
1777
        /// assert_eq!(
1778
        ///     Some(&b"J"[..]),
1779
        ///     caps.name("initial").map(|m| m.as_bytes()),
1780
        /// );
1781
        /// assert_eq!(None, caps.name("middle").map(|m| m.as_bytes()));
1782
        /// assert_eq!(&b"Potter"[..], &caps["last"]);
1783
        ///
1784
        /// let caps = re.captures(b"Harry James Potter").unwrap();
1785
        /// assert_eq!(&b"Harry"[..], &caps["first"]);
1786
        /// // Since a middle name/initial isn't required for an overall match,
1787
        /// // we can't assume that 'initial' or 'middle' will be populated!
1788
        /// assert_eq!(None, caps.name("initial").map(|m| m.as_bytes()));
1789
        /// assert_eq!(
1790
        ///     Some(&b"James"[..]),
1791
        ///     caps.name("middle").map(|m| m.as_bytes()),
1792
        /// );
1793
        /// assert_eq!(&b"Potter"[..], &caps["last"]);
1794
        /// ```
1795
0
        pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
1796
0
            self.builder.ignore_whitespace(yes);
1797
0
            self
1798
0
        }
1799
1800
        /// This configures octal mode for the entire pattern.
1801
        ///
1802
        /// Octal syntax is a little-known way of uttering Unicode codepoints
1803
        /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
1804
        /// equivalent patterns, where the last example shows octal syntax.
1805
        ///
1806
        /// While supporting octal syntax isn't in and of itself a problem,
1807
        /// it does make good error messages harder. That is, in PCRE based
1808
        /// regex engines, syntax like `\1` invokes a backreference, which is
1809
        /// explicitly unsupported by this library. However, many users expect
1810
        /// backreferences to be supported. Therefore, when octal support
1811
        /// is disabled, the error message will explicitly mention that
1812
        /// backreferences aren't supported.
1813
        ///
1814
        /// The default for this is `false`.
1815
        ///
1816
        /// # Example
1817
        ///
1818
        /// ```
1819
        /// use regex::bytes::RegexBuilder;
1820
        ///
1821
        /// // Normally this pattern would not compile, with an error message
1822
        /// // about backreferences not being supported. But with octal mode
1823
        /// // enabled, octal escape sequences work.
1824
        /// let re = RegexBuilder::new(r"\141")
1825
        ///     .octal(true)
1826
        ///     .build()
1827
        ///     .unwrap();
1828
        /// assert!(re.is_match(b"a"));
1829
        /// ```
1830
0
        pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
1831
0
            self.builder.octal(yes);
1832
0
            self
1833
0
        }
1834
1835
        /// Sets the approximate size limit, in bytes, of the compiled regex.
1836
        ///
1837
        /// This roughly corresponds to the amount of heap memory, in
1838
        /// bytes, occupied by a single regex. If the regex would otherwise
1839
        /// approximately exceed this limit, then compiling that regex will
1840
        /// fail.
1841
        ///
1842
        /// The main utility of a method like this is to avoid compiling
1843
        /// regexes that use an unexpected amount of resources, such as
1844
        /// time and memory. Even if the memory usage of a large regex is
1845
        /// acceptable, its search time may not be. Namely, worst case time
1846
        /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
1847
        /// `n ~ len(haystack)`. That is, search time depends, in part, on the
1848
        /// size of the compiled regex. This means that putting a limit on the
1849
        /// size of the regex limits how much a regex can impact search time.
1850
        ///
1851
        /// For more information about regex size limits, see the section on
1852
        /// [untrusted inputs](crate#untrusted-input) in the top-level crate
1853
        /// documentation.
1854
        ///
1855
        /// The default for this is some reasonable number that permits most
1856
        /// patterns to compile successfully.
1857
        ///
1858
        /// # Example
1859
        ///
1860
        /// ```
1861
        /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
1862
        /// use regex::bytes::RegexBuilder;
1863
        ///
1864
        /// // It may surprise you how big some seemingly small patterns can
1865
        /// // be! Since \w is Unicode aware, this generates a regex that can
1866
        /// // match approximately 140,000 distinct codepoints.
1867
        /// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err());
1868
        /// ```
1869
0
        pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
1870
0
            self.builder.size_limit(bytes);
1871
0
            self
1872
0
        }
1873
1874
        /// Set the approximate capacity, in bytes, of the cache of transitions
1875
        /// used by the lazy DFA.
1876
        ///
1877
        /// While the lazy DFA isn't always used, it tends to be the most
1878
        /// commonly used regex engine in default configurations. It tends to
1879
        /// adopt the performance profile of a fully built DFA, but without the
1880
        /// downside of taking worst case exponential time to build.
1881
        ///
1882
        /// The downside is that it needs to keep a cache of transitions and
1883
        /// states that are built while running a search, and this cache
1884
        /// can fill up. When it fills up, the cache will reset itself. Any
1885
        /// previously generated states and transitions will then need to be
1886
        /// re-generated. If this happens too many times, then this library
1887
        /// will bail out of using the lazy DFA and switch to a different regex
1888
        /// engine.
1889
        ///
1890
        /// If your regex provokes this particular downside of the lazy DFA,
1891
        /// then it may be beneficial to increase its cache capacity. This will
1892
        /// potentially reduce the frequency of cache resetting (ideally to
1893
        /// `0`). While it won't fix all potential performance problems with
1894
        /// the lazy DFA, increasing the cache capacity does fix some.
1895
        ///
1896
        /// There is no easy way to determine, a priori, whether increasing
1897
        /// this cache capacity will help. In general, the larger your regex,
1898
        /// the more cache it's likely to use. But that isn't an ironclad rule.
1899
        /// For example, a regex like `[01]*1[01]{N}` would normally produce a
1900
        /// fully built DFA that is exponential in size with respect to `N`.
1901
        /// The lazy DFA will prevent exponential space blow-up, but its cache
1902
        /// is likely to fill up, even when it's large and even for smallish
1903
        /// values of `N`.
1904
        ///
1905
        /// If you aren't sure whether this helps or not, it is sensible to
1906
        /// set this to some arbitrarily large number in testing, such as
1907
        /// `usize::MAX`. Namely, this represents the amount of capacity that
1908
        /// *may* be used. It's probably not a good idea to use `usize::MAX` in
1909
        /// production though, since it implies there are no controls on heap
1910
        /// memory used by this library during a search. In effect, set it to
1911
        /// whatever you're willing to allocate for a single regex search.
1912
0
        pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
1913
0
            self.builder.dfa_size_limit(bytes);
1914
0
            self
1915
0
        }
1916
1917
        /// Set the nesting limit for this parser.
1918
        ///
1919
        /// The nesting limit controls how deep the abstract syntax tree is
1920
        /// allowed to be. If the AST exceeds the given limit (e.g., with too
1921
        /// many nested groups), then an error is returned by the parser.
1922
        ///
1923
        /// The purpose of this limit is to act as a heuristic to prevent stack
1924
        /// overflow for consumers that do structural induction on an AST using
1925
        /// explicit recursion. While this crate never does this (instead using
1926
        /// constant stack space and moving the call stack to the heap), other
1927
        /// crates may.
1928
        ///
1929
        /// This limit is not checked until the entire AST is parsed.
1930
        /// Therefore, if callers want to put a limit on the amount of heap
1931
        /// space used, then they should impose a limit on the length, in
1932
        /// bytes, of the concrete pattern string. In particular, this is
1933
        /// viable since this parser implementation will limit itself to heap
1934
        /// space proportional to the length of the pattern string. See also
1935
        /// the [untrusted inputs](crate#untrusted-input) section in the
1936
        /// top-level crate documentation for more information about this.
1937
        ///
1938
        /// Note that a nest limit of `0` will return a nest limit error for
1939
        /// most patterns but not all. For example, a nest limit of `0` permits
1940
        /// `a` but not `ab`, since `ab` requires an explicit concatenation,
1941
        /// which results in a nest depth of `1`. In general, a nest limit is
1942
        /// not something that manifests in an obvious way in the concrete
1943
        /// syntax, therefore, it should not be used in a granular way.
1944
        ///
1945
        /// # Example
1946
        ///
1947
        /// ```
1948
        /// use regex::bytes::RegexBuilder;
1949
        ///
1950
        /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
1951
        /// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
1952
        /// ```
1953
0
        pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
1954
0
            self.builder.nest_limit(limit);
1955
0
            self
1956
0
        }
1957
    }
1958
1959
    /// A configurable builder for a [`RegexSet`].
1960
    ///
1961
    /// This builder can be used to programmatically set flags such as `i`
1962
    /// (case insensitive) and `x` (for verbose mode). This builder can also be
1963
    /// used to configure things like the line terminator and a size limit on
1964
    /// the compiled regular expression.
1965
    #[derive(Clone, Debug)]
1966
    pub struct RegexSetBuilder {
1967
        builder: Builder,
1968
    }
1969
1970
    impl RegexSetBuilder {
1971
        /// Create a new builder with a default configuration for the given
1972
        /// patterns.
1973
        ///
1974
        /// If the patterns are invalid or exceed the configured size limits,
1975
        /// then an error will be returned when [`RegexSetBuilder::build`] is
1976
        /// called.
1977
0
        pub fn new<I, S>(patterns: I) -> RegexSetBuilder
1978
0
        where
1979
0
            I: IntoIterator<Item = S>,
1980
0
            S: AsRef<str>,
1981
0
        {
1982
0
            RegexSetBuilder { builder: Builder::new(patterns) }
1983
0
        }
1984
1985
        /// Compiles the patterns given to `RegexSetBuilder::new` with the
1986
        /// configuration set on this builder.
1987
        ///
1988
        /// If the patterns aren't valid regexes or if a configured size limit
1989
        /// was exceeded, then an error is returned.
1990
0
        pub fn build(&self) -> Result<RegexSet, Error> {
1991
0
            self.builder.build_many_bytes()
1992
0
        }
1993
1994
        /// This configures Unicode mode for all of the patterns.
1995
        ///
1996
        /// Enabling Unicode mode does a number of things:
1997
        ///
1998
        /// * Most fundamentally, it causes the fundamental atom of matching
1999
        /// to be a single codepoint. When Unicode mode is disabled, it's a
2000
        /// single byte. For example, when Unicode mode is enabled, `.` will
2001
        /// match `💩` once, whereas it will match 4 times when Unicode mode
2002
        /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
2003
        /// * Case insensitive matching uses Unicode simple case folding rules.
2004
        /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
2005
        /// available.
2006
        /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
2007
        /// `\d`.
2008
        /// * The word boundary assertions, `\b` and `\B`, use the Unicode
2009
        /// definition of a word character.
2010
        ///
2011
        /// Note that unlike the top-level `RegexSet` for searching `&str`,
2012
        /// it is permitted to disable Unicode mode even if the resulting
2013
        /// pattern could match invalid UTF-8. For example, `(?-u:.)` is not
2014
        /// a valid pattern for a top-level `RegexSet`, but is valid for a
2015
        /// `bytes::RegexSet`.
2016
        ///
2017
        /// For more details on the Unicode support in this crate, see the
2018
        /// [Unicode section](crate#unicode) in this crate's top-level
2019
        /// documentation.
2020
        ///
2021
        /// The default for this is `true`.
2022
        ///
2023
        /// # Example
2024
        ///
2025
        /// ```
2026
        /// use regex::bytes::RegexSetBuilder;
2027
        ///
2028
        /// let re = RegexSetBuilder::new([r"\w"])
2029
        ///     .unicode(false)
2030
        ///     .build()
2031
        ///     .unwrap();
2032
        /// // Normally greek letters would be included in \w, but since
2033
        /// // Unicode mode is disabled, it only matches ASCII letters.
2034
        /// assert!(!re.is_match("δ".as_bytes()));
2035
        ///
2036
        /// let re = RegexSetBuilder::new([r"s"])
2037
        ///     .case_insensitive(true)
2038
        ///     .unicode(false)
2039
        ///     .build()
2040
        ///     .unwrap();
2041
        /// // Normally 'ſ' is included when searching for 's' case
2042
        /// // insensitively due to Unicode's simple case folding rules. But
2043
        /// // when Unicode mode is disabled, only ASCII case insensitive rules
2044
        /// // are used.
2045
        /// assert!(!re.is_match("Å¿".as_bytes()));
2046
        /// ```
2047
        ///
2048
        /// Since this builder is for constructing a
2049
        /// [`bytes::RegexSet`](RegexSet), one can disable Unicode mode even if
2050
        /// it would match invalid UTF-8:
2051
        ///
2052
        /// ```
2053
        /// use regex::bytes::RegexSetBuilder;
2054
        ///
2055
        /// let re = RegexSetBuilder::new([r"."])
2056
        ///     .unicode(false)
2057
        ///     .build()
2058
        ///     .unwrap();
2059
        /// // Normally `.` could not match invalid UTF-8, but since Unicode
2060
        /// // mode is disabled, it matches any byte except `\n`, even `\xFF`.
2061
        /// assert!(re.is_match(b"\xFF"));
2062
        /// ```
2063
0
        pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
2064
0
            self.builder.unicode(yes); // delegate to the wrapped builder
2065
0
            self // hand back this builder so calls can be chained
2066
0
        }
2067
2068
        /// This configures whether to enable case insensitive matching for all
2069
        /// of the patterns.
2070
        ///
2071
        /// This setting can also be configured using the inline flag `i`
2072
        /// in the pattern. For example, `(?i:foo)` matches `foo` case
2073
        /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
2074
        ///
2075
        /// The default for this is `false`.
2076
        ///
2077
        /// # Example
2078
        ///
2079
        /// ```
2080
        /// use regex::bytes::RegexSetBuilder;
2081
        ///
2082
        /// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
2083
        ///     .case_insensitive(true)
2084
        ///     .build()
2085
        ///     .unwrap();
2086
        /// assert!(re.is_match(b"FoObarQuUx"));
2087
        /// // Even though case insensitive matching is enabled in the builder,
2088
        /// // it can be locally disabled within the pattern. In this case,
2089
        /// // `bar` is matched case sensitively.
2090
        /// assert!(!re.is_match(b"fooBARquux"));
2091
        /// ```
2092
0
        pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
2093
0
            self.builder.case_insensitive(yes); // delegate to the wrapped builder
2094
0
            self // hand back this builder so calls can be chained
2095
0
        }
2096
2097
        /// This configures multi-line mode for all of the patterns.
2098
        ///
2099
        /// Enabling multi-line mode changes the behavior of the `^` and `$`
2100
        /// anchor assertions. Instead of only matching at the beginning and
2101
        /// end of a haystack, respectively, multi-line mode causes them to
2102
        /// match at the beginning and end of a line *in addition* to the
2103
        /// beginning and end of a haystack. More precisely, `^` will match at
2104
        /// the position immediately following a `\n` and `$` will match at the
2105
        /// position immediately preceding a `\n`.
2106
        ///
2107
        /// The behavior of this option can be impacted by other settings too:
2108
        ///
2109
        /// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
2110
        /// above to any ASCII byte.
2111
        /// * The [`RegexSetBuilder::crlf`] option changes the line terminator
2112
        /// to be either `\r` or `\n`, but never at the position between a `\r`
2113
        /// and `\n`.
2114
        ///
2115
        /// This setting can also be configured using the inline flag `m` in
2116
        /// the pattern.
2117
        ///
2118
        /// The default for this is `false`.
2119
        ///
2120
        /// # Example
2121
        ///
2122
        /// ```
2123
        /// use regex::bytes::RegexSetBuilder;
2124
        ///
2125
        /// let re = RegexSetBuilder::new([r"^foo$"])
2126
        ///     .multi_line(true)
2127
        ///     .build()
2128
        ///     .unwrap();
2129
        /// assert!(re.is_match(b"\nfoo\n"));
2130
        /// ```
2131
0
        pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
2132
0
            self.builder.multi_line(yes); // delegate to the wrapped builder
2133
0
            self // hand back this builder so calls can be chained
2134
0
        }
2135
2136
        /// This configures dot-matches-new-line mode for all of the patterns.
2137
        ///
2138
        /// Perhaps surprisingly, the default behavior for `.` is not to match
2139
        /// any character, but rather, to match any character except for the
2140
        /// line terminator (which is `\n` by default). When this mode is
2141
        /// enabled, the behavior changes such that `.` truly matches any
2142
        /// character.
2143
        ///
2144
        /// This setting can also be configured using the inline flag `s` in
2145
        /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
2146
        /// regexes.
2147
        ///
2148
        /// The default for this is `false`.
2149
        ///
2150
        /// # Example
2151
        ///
2152
        /// ```
2153
        /// use regex::bytes::RegexSetBuilder;
2154
        ///
2155
        /// let re = RegexSetBuilder::new([r"foo.bar"])
2156
        ///     .dot_matches_new_line(true)
2157
        ///     .build()
2158
        ///     .unwrap();
2159
        /// let hay = b"foo\nbar";
2160
        /// assert!(re.is_match(hay));
2161
        /// ```
2162
0
        pub fn dot_matches_new_line(
2163
0
            &mut self,
2164
0
            yes: bool,
2165
0
        ) -> &mut RegexSetBuilder {
2166
0
            self.builder.dot_matches_new_line(yes); // delegate to the wrapped builder
2167
0
            self // hand back this builder so calls can be chained
2168
0
        }
2169
2170
        /// This configures CRLF mode for all of the patterns.
2171
        ///
2172
        /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
2173
        /// short) and `\n` ("line feed" or LF for short) are treated as line
2174
        /// terminators. This results in the following:
2175
        ///
2176
        /// * Unless dot-matches-new-line mode is enabled, `.` will now match
2177
        /// any character except for `\n` and `\r`.
2178
        /// * When multi-line mode is enabled, `^` will match immediately
2179
        /// following a `\n` or a `\r`. Similarly, `$` will match immediately
2180
        /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
2181
        /// between `\r` and `\n`.
2182
        ///
2183
        /// This setting can also be configured using the inline flag `R` in
2184
        /// the pattern.
2185
        ///
2186
        /// The default for this is `false`.
2187
        ///
2188
        /// # Example
2189
        ///
2190
        /// ```
2191
        /// use regex::bytes::RegexSetBuilder;
2192
        ///
2193
        /// let re = RegexSetBuilder::new([r"^foo$"])
2194
        ///     .multi_line(true)
2195
        ///     .crlf(true)
2196
        ///     .build()
2197
        ///     .unwrap();
2198
        /// let hay = b"\r\nfoo\r\n";
2199
        /// // If CRLF mode weren't enabled here, then '$' wouldn't match
2200
        /// // immediately after 'foo', and thus no match would be found.
2201
        /// assert!(re.is_match(hay));
2202
        /// ```
2203
        ///
2204
        /// This example demonstrates that `^` will never match at a position
2205
        /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
2206
        /// and a `\n`.)
2207
        ///
2208
        /// ```
2209
        /// use regex::bytes::RegexSetBuilder;
2210
        ///
2211
        /// let re = RegexSetBuilder::new([r"^\n"])
2212
        ///     .multi_line(true)
2213
        ///     .crlf(true)
2214
        ///     .build()
2215
        ///     .unwrap();
2216
        /// assert!(!re.is_match(b"\r\n"));
2217
        /// ```
2218
0
        pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
2219
0
            self.builder.crlf(yes); // delegate to the wrapped builder
2220
0
            self // hand back this builder so calls can be chained
2221
0
        }
2222
2223
        /// Configures the line terminator to be used by the regex.
2224
        ///
2225
        /// The line terminator is relevant in two ways for a particular regex:
2226
        ///
2227
        /// * When dot-matches-new-line mode is *not* enabled (the default),
2228
        /// then `.` will match any character except for the configured line
2229
        /// terminator.
2230
        /// * When multi-line mode is enabled (not the default), then `^` and
2231
        /// `$` will match immediately after and before, respectively, a line
2232
        /// terminator.
2233
        ///
2234
        /// In both cases, if CRLF mode is enabled in a particular context,
2235
        /// then it takes precedence over any configured line terminator.
2236
        ///
2237
        /// This option cannot be configured from within the pattern.
2238
        ///
2239
        /// The default line terminator is `\n`.
2240
        ///
2241
        /// # Example
2242
        ///
2243
        /// This shows how to treat the NUL byte as a line terminator. This can
2244
        /// be a useful heuristic when searching binary data.
2245
        ///
2246
        /// ```
2247
        /// use regex::bytes::RegexSetBuilder;
2248
        ///
2249
        /// let re = RegexSetBuilder::new([r"^foo$"])
2250
        ///     .multi_line(true)
2251
        ///     .line_terminator(b'\x00')
2252
        ///     .build()
2253
        ///     .unwrap();
2254
        /// let hay = b"\x00foo\x00";
2255
        /// assert!(re.is_match(hay));
2256
        /// ```
2257
        ///
2258
        /// This example shows that the behavior of `.` is impacted by this
2259
        /// setting as well:
2260
        ///
2261
        /// ```
2262
        /// use regex::bytes::RegexSetBuilder;
2263
        ///
2264
        /// let re = RegexSetBuilder::new([r"."])
2265
        ///     .line_terminator(b'\x00')
2266
        ///     .build()
2267
        ///     .unwrap();
2268
        /// assert!(re.is_match(b"\n"));
2269
        /// assert!(!re.is_match(b"\x00"));
2270
        /// ```
2271
        ///
2272
        /// This shows that building a regex will work even when the byte given
2273
        /// is not ASCII. This is unlike the top-level `RegexSet` API where
2274
        /// matching invalid UTF-8 is not allowed.
2275
        ///
2276
        /// Note though that you must disable Unicode mode. This is required
2277
        /// because Unicode mode requires matching one codepoint at a time,
2278
        /// and there is no way to match a non-ASCII byte as if it were a
2279
        /// codepoint.
2280
        ///
2281
        /// ```
2282
        /// use regex::bytes::RegexSetBuilder;
2283
        ///
2284
        /// assert!(
2285
        ///     RegexSetBuilder::new([r"."])
2286
        ///         .unicode(false)
2287
        ///         .line_terminator(0x80)
2288
        ///         .build()
2289
        ///         .is_ok(),
2290
        /// );
2291
        /// ```
2292
0
        pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
2293
0
            self.builder.line_terminator(byte); // delegate to the wrapped builder
2294
0
            self // hand back this builder so calls can be chained
2295
0
        }
2296
2297
        /// This configures swap-greed mode for all of the patterns.
2298
        ///
2299
        /// When swap-greed mode is enabled, patterns like `a+` will become
2300
        /// non-greedy and patterns like `a+?` will become greedy. In other
2301
        /// words, the meanings of `a+` and `a+?` are switched.
2302
        ///
2303
        /// This setting can also be configured using the inline flag `U` in
2304
        /// the pattern.
2305
        ///
2306
        /// Note that this is generally not useful for a `RegexSet` since a
2307
        /// `RegexSet` can only report whether a pattern matches or not. Since
2308
        /// greediness never impacts whether a match is found or not (only the
2309
        /// offsets of the match), it follows that whether parts of a pattern
2310
        /// are greedy or not doesn't matter for a `RegexSet`.
2311
        ///
2312
        /// The default for this is `false`.
2313
0
        pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
2314
0
            self.builder.swap_greed(yes); // delegate to the wrapped builder
2315
0
            self // hand back this builder so calls can be chained
2316
0
        }
2317
2318
        /// This configures verbose mode for all of the patterns.
2319
        ///
2320
        /// When enabled, whitespace will be treated as insignificant in the
2321
        /// pattern and `#` can be used to start a comment until the next new
2322
        /// line.
2323
        ///
2324
        /// Normally, in most places in a pattern, whitespace is treated
2325
        /// literally. For example ` +` will match one or more ASCII whitespace
2326
        /// characters.
2327
        ///
2328
        /// When verbose mode is enabled, `\#` can be used to match a literal
2329
        /// `#` and `\ ` can be used to match a literal ASCII whitespace
2330
        /// character.
2331
        ///
2332
        /// Verbose mode is useful for permitting regexes to be formatted and
2333
        /// broken up more nicely. This may make them more easily readable.
2334
        ///
2335
        /// This setting can also be configured using the inline flag `x` in
2336
        /// the pattern.
2337
        ///
2338
        /// The default for this is `false`.
2339
        ///
2340
        /// # Example
2341
        ///
2342
        /// ```
2343
        /// use regex::bytes::RegexSetBuilder;
2344
        ///
2345
        /// let pat = r"
2346
        ///     \b
2347
        ///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
2348
        ///     [\s--\n]+                   # whitespace should separate names
2349
        ///     (?: # middle name can be an initial!
2350
        ///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
2351
        ///         [\s--\n]+
2352
        ///     )?
2353
        ///     (?<last>\p{Uppercase}\w*)
2354
        ///     \b
2355
        /// ";
2356
        /// let re = RegexSetBuilder::new([pat])
2357
        ///     .ignore_whitespace(true)
2358
        ///     .build()
2359
        ///     .unwrap();
2360
        /// assert!(re.is_match(b"Harry Potter"));
2361
        /// assert!(re.is_match(b"Harry J. Potter"));
2362
        /// assert!(re.is_match(b"Harry James Potter"));
2363
        /// assert!(!re.is_match(b"harry J. Potter"));
2364
        /// ```
2365
0
        pub fn ignore_whitespace(
2366
0
            &mut self,
2367
0
            yes: bool,
2368
0
        ) -> &mut RegexSetBuilder {
2369
0
            self.builder.ignore_whitespace(yes); // delegate to the wrapped builder
2370
0
            self // hand back this builder so calls can be chained
2371
0
        }
2372
2373
        /// This configures octal mode for all of the patterns.
2374
        ///
2375
        /// Octal syntax is a little-known way of uttering Unicode codepoints
2376
        /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
2377
        /// equivalent patterns, where the last example shows octal syntax.
2378
        ///
2379
        /// While supporting octal syntax isn't in and of itself a problem,
2380
        /// it does make good error messages harder. That is, in PCRE based
2381
        /// regex engines, syntax like `\1` invokes a backreference, which is
2382
        /// explicitly unsupported by this library. However, many users expect
2383
        /// backreferences to be supported. Therefore, when octal support
2384
        /// is disabled, the error message will explicitly mention that
2385
        /// backreferences aren't supported.
2386
        ///
2387
        /// The default for this is `false`.
2388
        ///
2389
        /// # Example
2390
        ///
2391
        /// ```
2392
        /// use regex::bytes::RegexSetBuilder;
2393
        ///
2394
        /// // Normally this pattern would not compile, with an error message
2395
        /// // about backreferences not being supported. But with octal mode
2396
        /// // enabled, octal escape sequences work.
2397
        /// let re = RegexSetBuilder::new([r"\141"])
2398
        ///     .octal(true)
2399
        ///     .build()
2400
        ///     .unwrap();
2401
        /// assert!(re.is_match(b"a"));
2402
        /// ```
2403
0
        pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
2404
0
            self.builder.octal(yes); // delegate to the wrapped builder
2405
0
            self // hand back this builder so calls can be chained
2406
0
        }
2407
2408
        /// Sets the approximate size limit, in bytes, of the compiled regex.
2409
        ///
2410
        /// This roughly corresponds to the amount of heap memory, in
2411
        /// bytes, occupied by a single regex. If the regex would otherwise
2412
        /// approximately exceed this limit, then compiling that regex will
2413
        /// fail.
2414
        ///
2415
        /// The main utility of a method like this is to avoid compiling
2416
        /// regexes that use an unexpected amount of resources, such as
2417
        /// time and memory. Even if the memory usage of a large regex is
2418
        /// acceptable, its search time may not be. Namely, worst case time
2419
        /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
2420
        /// `n ~ len(haystack)`. That is, search time depends, in part, on the
2421
        /// size of the compiled regex. This means that putting a limit on the
2422
        /// size of the regex limits how much a regex can impact search time.
2423
        ///
2424
        /// For more information about regex size limits, see the section on
2425
        /// [untrusted inputs](crate#untrusted-input) in the top-level crate
2426
        /// documentation.
2427
        ///
2428
        /// The default for this is some reasonable number that permits most
2429
        /// patterns to compile successfully.
2430
        ///
2431
        /// # Example
2432
        ///
2433
        /// ```
2434
        /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
2435
        /// use regex::bytes::RegexSetBuilder;
2436
        ///
2437
        /// // It may surprise you how big some seemingly small patterns can
2438
        /// // be! Since \w is Unicode aware, this generates a regex that can
2439
        /// // match approximately 140,000 distinct codepoints.
2440
        /// assert!(
2441
        ///     RegexSetBuilder::new([r"\w"])
2442
        ///         .size_limit(45_000)
2443
        ///         .build()
2444
        ///         .is_err()
2445
        /// );
2446
        /// ```
2447
0
        pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
2448
0
            self.builder.size_limit(bytes); // delegate to the wrapped builder
2449
0
            self // hand back this builder so calls can be chained
2450
0
        }
2451
2452
        /// Set the approximate capacity, in bytes, of the cache of transitions
2453
        /// used by the lazy DFA.
2454
        ///
2455
        /// While the lazy DFA isn't always used, it tends to be the most
2456
        /// commonly used regex engine in default configurations. It tends to
2457
        /// adopt the performance profile of a fully built DFA, but without the
2458
        /// downside of taking worst case exponential time to build.
2459
        ///
2460
        /// The downside is that it needs to keep a cache of transitions and
2461
        /// states that are built while running a search, and this cache
2462
        /// can fill up. When it fills up, the cache will reset itself. Any
2463
        /// previously generated states and transitions will then need to be
2464
        /// re-generated. If this happens too many times, then this library
2465
        /// will bail out of using the lazy DFA and switch to a different regex
2466
        /// engine.
2467
        ///
2468
        /// If your regex provokes this particular downside of the lazy DFA,
2469
        /// then it may be beneficial to increase its cache capacity. This will
2470
        /// potentially reduce the frequency of cache resetting (ideally to
2471
        /// `0`). While it won't fix all potential performance problems with
2472
        /// the lazy DFA, increasing the cache capacity does fix some.
2473
        ///
2474
        /// There is no easy way to determine, a priori, whether increasing
2475
        /// this cache capacity will help. In general, the larger your regex,
2476
        /// the more cache it's likely to use. But that isn't an ironclad rule.
2477
        /// For example, a regex like `[01]*1[01]{N}` would normally produce a
2478
        /// fully built DFA that is exponential in size with respect to `N`.
2479
        /// The lazy DFA will prevent exponential space blow-up, but its cache
2480
        /// is likely to fill up, even when it's large and even for smallish
2481
        /// values of `N`.
2482
        ///
2483
        /// If you aren't sure whether this helps or not, it is sensible to
2484
        /// set this to some arbitrarily large number in testing, such as
2485
        /// `usize::MAX`. Namely, this represents the amount of capacity that
2486
        /// *may* be used. It's probably not a good idea to use `usize::MAX` in
2487
        /// production though, since it implies there are no controls on heap
2488
        /// memory used by this library during a search. In effect, set it to
2489
        /// whatever you're willing to allocate for a single regex search.
2490
0
        pub fn dfa_size_limit(
2491
0
            &mut self,
2492
0
            bytes: usize,
2493
0
        ) -> &mut RegexSetBuilder {
2494
0
            self.builder.dfa_size_limit(bytes); // delegate to the wrapped builder
2495
0
            self // hand back this builder so calls can be chained
2496
0
        }
2497
2498
        /// Set the nesting limit for this parser.
2499
        ///
2500
        /// The nesting limit controls how deep the abstract syntax tree is
2501
        /// allowed to be. If the AST exceeds the given limit (e.g., with too
2502
        /// many nested groups), then an error is returned by the parser.
2503
        ///
2504
        /// The purpose of this limit is to act as a heuristic to prevent stack
2505
        /// overflow for consumers that do structural induction on an AST using
2506
        /// explicit recursion. While this crate never does this (instead using
2507
        /// constant stack space and moving the call stack to the heap), other
2508
        /// crates may.
2509
        ///
2510
        /// This limit is not checked until the entire AST is parsed.
2511
        /// Therefore, if callers want to put a limit on the amount of heap
2512
        /// space used, then they should impose a limit on the length, in
2513
        /// bytes, of the concrete pattern string. In particular, this is
2514
        /// viable since this parser implementation will limit itself to heap
2515
        /// space proportional to the length of the pattern string. See also
2516
        /// the [untrusted inputs](crate#untrusted-input) section in the
2517
        /// top-level crate documentation for more information about this.
2518
        ///
2519
        /// Note that a nest limit of `0` will return a nest limit error for
2520
        /// most patterns but not all. For example, a nest limit of `0` permits
2521
        /// `a` but not `ab`, since `ab` requires an explicit concatenation,
2522
        /// which results in a nest depth of `1`. In general, a nest limit is
2523
        /// not something that manifests in an obvious way in the concrete
2524
        /// syntax, therefore, it should not be used in a granular way.
2525
        ///
2526
        /// # Example
2527
        ///
2528
        /// ```
2529
        /// use regex::bytes::RegexSetBuilder;
2530
        ///
2531
        /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok());
2532
        /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err());
2533
        /// ```
2534
0
        pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
2535
0
            self.builder.nest_limit(limit); // delegate to the wrapped builder
2536
0
            self // hand back this builder so calls can be chained
2537
0
        }
2538
    }
2539
}