/build/cargo-vendor-dir/regex-1.10.3/src/builders.rs
Line | Count | Source (jump to first uncovered line) |
1 | | #![allow(warnings)] |
2 | | |
3 | | // This module defines an internal builder that encapsulates all interaction |
4 | | // with meta::Regex construction, and then 4 public API builders that wrap |
5 | | // around it. The docs are essentially repeated on each of the 4 public |
6 | | // builders, with tweaks to the examples as needed. |
7 | | // |
8 | | // The reason why there are so many builders is partially because of a misstep |
9 | | // in the initial API design: the builder constructor takes in the pattern |
10 | | // strings instead of using the `build` method to accept the pattern strings. |
11 | | // This means `new` has a different signature for each builder. It probably |
12 | | // would have been nicer to use one builder with `fn new()`, and then add |
13 | | // `build(pat)` and `build_many(pats)` constructors. |
14 | | // |
15 | | // The other reason is because I think the `bytes` module should probably |
16 | | // have its own builder type. That way, it is completely isolated from the |
17 | | // top-level API. |
18 | | // |
19 | | // If I could do it again, I'd probably have a `regex::Builder` and a |
20 | | // `regex::bytes::Builder`. Each would have `build` and `build_set` (or |
21 | | // `build_many`) methods for constructing a single pattern `Regex` and a |
22 | | // multi-pattern `RegexSet`, respectively. |
23 | | |
24 | | use alloc::{ |
25 | | string::{String, ToString}, |
26 | | sync::Arc, |
27 | | vec, |
28 | | vec::Vec, |
29 | | }; |
30 | | |
31 | | use regex_automata::{ |
32 | | meta, nfa::thompson::WhichCaptures, util::syntax, MatchKind, |
33 | | }; |
34 | | |
35 | | use crate::error::Error; |
36 | | |
37 | | /// A builder for constructing a `Regex`, `bytes::Regex`, `RegexSet` or a |
38 | | /// `bytes::RegexSet`. |
39 | | /// |
40 | | /// This is essentially the implementation of the four different builder types |
41 | | /// in the public API: `RegexBuilder`, `bytes::RegexBuilder`, `RegexSetBuilder` |
42 | | /// and `bytes::RegexSetBuilder`. |
43 | | #[derive(Clone, Debug)] |
44 | | struct Builder { |
45 | | pats: Vec<String>, |
46 | | metac: meta::Config, |
47 | | syntaxc: syntax::Config, |
48 | | } |
49 | | |
50 | | impl Default for Builder { |
51 | 0 | fn default() -> Builder { |
52 | 0 | let metac = meta::Config::new() |
53 | 0 | .nfa_size_limit(Some(10 * (1 << 20))) |
54 | 0 | .hybrid_cache_capacity(2 * (1 << 20)); |
55 | 0 | Builder { pats: vec![], metac, syntaxc: syntax::Config::default() } |
56 | 0 | } |
57 | | } |
58 | | |
59 | | impl Builder { |
60 | 0 | fn new<I, S>(patterns: I) -> Builder |
61 | 0 | where |
62 | 0 | S: AsRef<str>, |
63 | 0 | I: IntoIterator<Item = S>, |
64 | 0 | { |
65 | 0 | let mut b = Builder::default(); |
66 | 0 | b.pats.extend(patterns.into_iter().map(|p| p.as_ref().to_string())); |
67 | 0 | b |
68 | 0 | } |
69 | | |
70 | 0 | fn build_one_string(&self) -> Result<crate::Regex, Error> { |
71 | 0 | assert_eq!(1, self.pats.len()); |
72 | 0 | let metac = self |
73 | 0 | .metac |
74 | 0 | .clone() |
75 | 0 | .match_kind(MatchKind::LeftmostFirst) |
76 | 0 | .utf8_empty(true); |
77 | 0 | let syntaxc = self.syntaxc.clone().utf8(true); |
78 | 0 | let pattern = Arc::from(self.pats[0].as_str()); |
79 | 0 | meta::Builder::new() |
80 | 0 | .configure(metac) |
81 | 0 | .syntax(syntaxc) |
82 | 0 | .build(&pattern) |
83 | 0 | .map(|meta| crate::Regex { meta, pattern }) |
84 | 0 | .map_err(Error::from_meta_build_error) |
85 | 0 | } |
86 | | |
87 | 0 | fn build_one_bytes(&self) -> Result<crate::bytes::Regex, Error> { |
88 | 0 | assert_eq!(1, self.pats.len()); |
89 | 0 | let metac = self |
90 | 0 | .metac |
91 | 0 | .clone() |
92 | 0 | .match_kind(MatchKind::LeftmostFirst) |
93 | 0 | .utf8_empty(false); |
94 | 0 | let syntaxc = self.syntaxc.clone().utf8(false); |
95 | 0 | let pattern = Arc::from(self.pats[0].as_str()); |
96 | 0 | meta::Builder::new() |
97 | 0 | .configure(metac) |
98 | 0 | .syntax(syntaxc) |
99 | 0 | .build(&pattern) |
100 | 0 | .map(|meta| crate::bytes::Regex { meta, pattern }) |
101 | 0 | .map_err(Error::from_meta_build_error) |
102 | 0 | } |
103 | | |
104 | 0 | fn build_many_string(&self) -> Result<crate::RegexSet, Error> { |
105 | 0 | let metac = self |
106 | 0 | .metac |
107 | 0 | .clone() |
108 | 0 | .match_kind(MatchKind::All) |
109 | 0 | .utf8_empty(true) |
110 | 0 | .which_captures(WhichCaptures::None); |
111 | 0 | let syntaxc = self.syntaxc.clone().utf8(true); |
112 | 0 | let patterns = Arc::from(self.pats.as_slice()); |
113 | 0 | meta::Builder::new() |
114 | 0 | .configure(metac) |
115 | 0 | .syntax(syntaxc) |
116 | 0 | .build_many(&patterns) |
117 | 0 | .map(|meta| crate::RegexSet { meta, patterns }) |
118 | 0 | .map_err(Error::from_meta_build_error) |
119 | 0 | } |
120 | | |
121 | 0 | fn build_many_bytes(&self) -> Result<crate::bytes::RegexSet, Error> { |
122 | 0 | let metac = self |
123 | 0 | .metac |
124 | 0 | .clone() |
125 | 0 | .match_kind(MatchKind::All) |
126 | 0 | .utf8_empty(false) |
127 | 0 | .which_captures(WhichCaptures::None); |
128 | 0 | let syntaxc = self.syntaxc.clone().utf8(false); |
129 | 0 | let patterns = Arc::from(self.pats.as_slice()); |
130 | 0 | meta::Builder::new() |
131 | 0 | .configure(metac) |
132 | 0 | .syntax(syntaxc) |
133 | 0 | .build_many(&patterns) |
134 | 0 | .map(|meta| crate::bytes::RegexSet { meta, patterns }) |
135 | 0 | .map_err(Error::from_meta_build_error) |
136 | 0 | } |
137 | | |
138 | 0 | fn case_insensitive(&mut self, yes: bool) -> &mut Builder { |
139 | 0 | self.syntaxc = self.syntaxc.case_insensitive(yes); |
140 | 0 | self |
141 | 0 | } |
142 | | |
143 | 0 | fn multi_line(&mut self, yes: bool) -> &mut Builder { |
144 | 0 | self.syntaxc = self.syntaxc.multi_line(yes); |
145 | 0 | self |
146 | 0 | } |
147 | | |
148 | 0 | fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder { |
149 | 0 | self.syntaxc = self.syntaxc.dot_matches_new_line(yes); |
150 | 0 | self |
151 | 0 | } |
152 | | |
153 | 0 | fn crlf(&mut self, yes: bool) -> &mut Builder { |
154 | 0 | self.syntaxc = self.syntaxc.crlf(yes); |
155 | 0 | self |
156 | 0 | } |
157 | | |
158 | 0 | fn line_terminator(&mut self, byte: u8) -> &mut Builder { |
159 | 0 | self.metac = self.metac.clone().line_terminator(byte); |
160 | 0 | self.syntaxc = self.syntaxc.line_terminator(byte); |
161 | 0 | self |
162 | 0 | } |
163 | | |
164 | 0 | fn swap_greed(&mut self, yes: bool) -> &mut Builder { |
165 | 0 | self.syntaxc = self.syntaxc.swap_greed(yes); |
166 | 0 | self |
167 | 0 | } |
168 | | |
169 | 0 | fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder { |
170 | 0 | self.syntaxc = self.syntaxc.ignore_whitespace(yes); |
171 | 0 | self |
172 | 0 | } |
173 | | |
174 | 0 | fn unicode(&mut self, yes: bool) -> &mut Builder { |
175 | 0 | self.syntaxc = self.syntaxc.unicode(yes); |
176 | 0 | self |
177 | 0 | } |
178 | | |
179 | 0 | fn octal(&mut self, yes: bool) -> &mut Builder { |
180 | 0 | self.syntaxc = self.syntaxc.octal(yes); |
181 | 0 | self |
182 | 0 | } |
183 | | |
184 | 0 | fn size_limit(&mut self, limit: usize) -> &mut Builder { |
185 | 0 | self.metac = self.metac.clone().nfa_size_limit(Some(limit)); |
186 | 0 | self |
187 | 0 | } |
188 | | |
189 | 0 | fn dfa_size_limit(&mut self, limit: usize) -> &mut Builder { |
190 | 0 | self.metac = self.metac.clone().hybrid_cache_capacity(limit); |
191 | 0 | self |
192 | 0 | } |
193 | | |
194 | 0 | fn nest_limit(&mut self, limit: u32) -> &mut Builder { |
195 | 0 | self.syntaxc = self.syntaxc.nest_limit(limit); |
196 | 0 | self |
197 | 0 | } |
198 | | } |
199 | | |
200 | | pub(crate) mod string { |
201 | | use crate::{error::Error, Regex, RegexSet}; |
202 | | |
203 | | use super::Builder; |
204 | | |
205 | | /// A configurable builder for a [`Regex`]. |
206 | | /// |
207 | | /// This builder can be used to programmatically set flags such as `i` |
208 | | /// (case insensitive) and `x` (for verbose mode). This builder can also be |
209 | | /// used to configure things like the line terminator and a size limit on |
210 | | /// the compiled regular expression. |
211 | | #[derive(Clone, Debug)] |
212 | | pub struct RegexBuilder { |
213 | | builder: Builder, |
214 | | } |
215 | | |
216 | | impl RegexBuilder { |
217 | | /// Create a new builder with a default configuration for the given |
218 | | /// pattern. |
219 | | /// |
220 | | /// If the pattern is invalid or exceeds the configured size limits, |
221 | | /// then an error will be returned when [`RegexBuilder::build`] is |
222 | | /// called. |
223 | 0 | pub fn new(pattern: &str) -> RegexBuilder { |
224 | 0 | RegexBuilder { builder: Builder::new([pattern]) } |
225 | 0 | } |
226 | | |
227 | | /// Compiles the pattern given to `RegexBuilder::new` with the |
228 | | /// configuration set on this builder. |
229 | | /// |
230 | | /// If the pattern isn't a valid regex or if a configured size limit |
231 | | /// was exceeded, then an error is returned. |
232 | 0 | pub fn build(&self) -> Result<Regex, Error> { |
233 | 0 | self.builder.build_one_string() |
234 | 0 | } |
235 | | |
236 | | /// This configures Unicode mode for the entire pattern. |
237 | | /// |
238 | | /// Enabling Unicode mode does a number of things: |
239 | | /// |
240 | | /// * Most fundamentally, it causes the fundamental atom of matching |
241 | | /// to be a single codepoint. When Unicode mode is disabled, it's a |
242 | | /// single byte. For example, when Unicode mode is enabled, `.` will |
243 | | /// match `💩` once, whereas it will match 4 times when Unicode mode |
244 | | /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.) |
245 | | /// * Case insensitive matching uses Unicode simple case folding rules. |
246 | | /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are |
247 | | /// available. |
248 | | /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and |
249 | | /// `\d`. |
250 | | /// * The word boundary assertions, `\b` and `\B`, use the Unicode |
251 | | /// definition of a word character. |
252 | | /// |
253 | | /// Note that if Unicode mode is disabled, then the regex will fail to |
254 | | /// compile if it could match invalid UTF-8. For example, when Unicode |
255 | | /// mode is disabled, then since `.` matches any byte (except for |
256 | | /// `\n`), then it can match invalid UTF-8 and thus building a regex |
257 | | /// from it will fail. Another example is `\w` and `\W`. Since `\w` can |
258 | | /// only match ASCII bytes when Unicode mode is disabled, it's allowed. |
259 | | /// But `\W` can match more than ASCII bytes, including invalid UTF-8, |
260 | | /// and so it is not allowed. This restriction can be lifted only by |
261 | | /// using a [`bytes::Regex`](crate::bytes::Regex). |
262 | | /// |
263 | | /// For more details on the Unicode support in this crate, see the |
264 | | /// [Unicode section](crate#unicode) in this crate's top-level |
265 | | /// documentation. |
266 | | /// |
267 | | /// The default for this is `true`. |
268 | | /// |
269 | | /// # Example |
270 | | /// |
271 | | /// ``` |
272 | | /// use regex::RegexBuilder; |
273 | | /// |
274 | | /// let re = RegexBuilder::new(r"\w") |
275 | | /// .unicode(false) |
276 | | /// .build() |
277 | | /// .unwrap(); |
278 | | /// // Normally greek letters would be included in \w, but since |
279 | | /// // Unicode mode is disabled, it only matches ASCII letters. |
280 | | /// assert!(!re.is_match("δ")); |
281 | | /// |
282 | | /// let re = RegexBuilder::new(r"s") |
283 | | /// .case_insensitive(true) |
284 | | /// .unicode(false) |
285 | | /// .build() |
286 | | /// .unwrap(); |
287 | | /// // Normally 'ſ' is included when searching for 's' case |
288 | | /// // insensitively due to Unicode's simple case folding rules. But |
289 | | /// // when Unicode mode is disabled, only ASCII case insensitive rules |
290 | | /// // are used. |
291 | | /// assert!(!re.is_match("ſ")); |
292 | | /// ``` |
293 | 0 | pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { |
294 | 0 | self.builder.unicode(yes); |
295 | 0 | self |
296 | 0 | } |
297 | | |
298 | | /// This configures whether to enable case insensitive matching for the |
299 | | /// entire pattern. |
300 | | /// |
301 | | /// This setting can also be configured using the inline flag `i` |
302 | | /// in the pattern. For example, `(?i:foo)` matches `foo` case |
303 | | /// insensitively while `(?-i:foo)` matches `foo` case sensitively. |
304 | | /// |
305 | | /// The default for this is `false`. |
306 | | /// |
307 | | /// # Example |
308 | | /// |
309 | | /// ``` |
310 | | /// use regex::RegexBuilder; |
311 | | /// |
312 | | /// let re = RegexBuilder::new(r"foo(?-i:bar)quux") |
313 | | /// .case_insensitive(true) |
314 | | /// .build() |
315 | | /// .unwrap(); |
316 | | /// assert!(re.is_match("FoObarQuUx")); |
317 | | /// // Even though case insensitive matching is enabled in the builder, |
318 | | /// // it can be locally disabled within the pattern. In this case, |
319 | | /// // `bar` is matched case sensitively. |
320 | | /// assert!(!re.is_match("fooBARquux")); |
321 | | /// ``` |
322 | 0 | pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { |
323 | 0 | self.builder.case_insensitive(yes); |
324 | 0 | self |
325 | 0 | } |
326 | | |
327 | | /// This configures multi-line mode for the entire pattern. |
328 | | /// |
329 | | /// Enabling multi-line mode changes the behavior of the `^` and `$` |
330 | | /// anchor assertions. Instead of only matching at the beginning and |
331 | | /// end of a haystack, respectively, multi-line mode causes them to |
332 | | /// match at the beginning and end of a line *in addition* to the |
333 | | /// beginning and end of a haystack. More precisely, `^` will match at |
334 | | /// the position immediately following a `\n` and `$` will match at the |
335 | | /// position immediately preceding a `\n`. |
336 | | /// |
337 | | /// The behavior of this option can be impacted by other settings too: |
338 | | /// |
339 | | /// * The [`RegexBuilder::line_terminator`] option changes `\n` above |
340 | | /// to any ASCII byte. |
341 | | /// * The [`RegexBuilder::crlf`] option changes the line terminator to |
342 | | /// be either `\r` or `\n`, but never at the position between a `\r` |
343 | | /// and `\n`. |
344 | | /// |
345 | | /// This setting can also be configured using the inline flag `m` in |
346 | | /// the pattern. |
347 | | /// |
348 | | /// The default for this is `false`. |
349 | | /// |
350 | | /// # Example |
351 | | /// |
352 | | /// ``` |
353 | | /// use regex::RegexBuilder; |
354 | | /// |
355 | | /// let re = RegexBuilder::new(r"^foo$") |
356 | | /// .multi_line(true) |
357 | | /// .build() |
358 | | /// .unwrap(); |
359 | | /// assert_eq!(Some(1..4), re.find("\nfoo\n").map(|m| m.range())); |
360 | | /// ``` |
361 | 0 | pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { |
362 | 0 | self.builder.multi_line(yes); |
363 | 0 | self |
364 | 0 | } |
365 | | |
366 | | /// This configures dot-matches-new-line mode for the entire pattern. |
367 | | /// |
368 | | /// Perhaps surprisingly, the default behavior for `.` is not to match |
369 | | /// any character, but rather, to match any character except for the |
370 | | /// line terminator (which is `\n` by default). When this mode is |
371 | | /// enabled, the behavior changes such that `.` truly matches any |
372 | | /// character. |
373 | | /// |
374 | | /// This setting can also be configured using the inline flag `s` in |
375 | | /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent |
376 | | /// regexes. |
377 | | /// |
378 | | /// The default for this is `false`. |
379 | | /// |
380 | | /// # Example |
381 | | /// |
382 | | /// ``` |
383 | | /// use regex::RegexBuilder; |
384 | | /// |
385 | | /// let re = RegexBuilder::new(r"foo.bar") |
386 | | /// .dot_matches_new_line(true) |
387 | | /// .build() |
388 | | /// .unwrap(); |
389 | | /// let hay = "foo\nbar"; |
390 | | /// assert_eq!(Some("foo\nbar"), re.find(hay).map(|m| m.as_str())); |
391 | | /// ``` |
392 | 0 | pub fn dot_matches_new_line( |
393 | 0 | &mut self, |
394 | 0 | yes: bool, |
395 | 0 | ) -> &mut RegexBuilder { |
396 | 0 | self.builder.dot_matches_new_line(yes); |
397 | 0 | self |
398 | 0 | } |
399 | | |
400 | | /// This configures CRLF mode for the entire pattern. |
401 | | /// |
402 | | /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for |
403 | | /// short) and `\n` ("line feed" or LF for short) are treated as line |
404 | | /// terminators. This results in the following: |
405 | | /// |
406 | | /// * Unless dot-matches-new-line mode is enabled, `.` will now match |
407 | | /// any character except for `\n` and `\r`. |
408 | | /// * When multi-line mode is enabled, `^` will match immediately |
409 | | /// following a `\n` or a `\r`. Similarly, `$` will match immediately |
410 | | /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match |
411 | | /// between `\r` and `\n`. |
412 | | /// |
413 | | /// This setting can also be configured using the inline flag `R` in |
414 | | /// the pattern. |
415 | | /// |
416 | | /// The default for this is `false`. |
417 | | /// |
418 | | /// # Example |
419 | | /// |
420 | | /// ``` |
421 | | /// use regex::RegexBuilder; |
422 | | /// |
423 | | /// let re = RegexBuilder::new(r"^foo$") |
424 | | /// .multi_line(true) |
425 | | /// .crlf(true) |
426 | | /// .build() |
427 | | /// .unwrap(); |
428 | | /// let hay = "\r\nfoo\r\n"; |
429 | | /// // If CRLF mode weren't enabled here, then '$' wouldn't match |
430 | | /// // immediately after 'foo', and thus no match would be found. |
431 | | /// assert_eq!(Some("foo"), re.find(hay).map(|m| m.as_str())); |
432 | | /// ``` |
433 | | /// |
434 | | /// This example demonstrates that `^` will never match at a position |
435 | | /// between `\r` and `\n`. (`$` will similarly not match between a `\r` |
436 | | /// and a `\n`.) |
437 | | /// |
438 | | /// ``` |
439 | | /// use regex::RegexBuilder; |
440 | | /// |
441 | | /// let re = RegexBuilder::new(r"^") |
442 | | /// .multi_line(true) |
443 | | /// .crlf(true) |
444 | | /// .build() |
445 | | /// .unwrap(); |
446 | | /// let hay = "\r\n\r\n"; |
447 | | /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect(); |
448 | | /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]); |
449 | | /// ``` |
450 | 0 | pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder { |
451 | 0 | self.builder.crlf(yes); |
452 | 0 | self |
453 | 0 | } |
454 | | |
455 | | /// Configures the line terminator to be used by the regex. |
456 | | /// |
457 | | /// The line terminator is relevant in two ways for a particular regex: |
458 | | /// |
459 | | /// * When dot-matches-new-line mode is *not* enabled (the default), |
460 | | /// then `.` will match any character except for the configured line |
461 | | /// terminator. |
462 | | /// * When multi-line mode is enabled (not the default), then `^` and |
463 | | /// `$` will match immediately after and before, respectively, a line |
464 | | /// terminator. |
465 | | /// |
466 | | /// In both cases, if CRLF mode is enabled in a particular context, |
467 | | /// then it takes precedence over any configured line terminator. |
468 | | /// |
469 | | /// This option cannot be configured from within the pattern. |
470 | | /// |
471 | | /// The default line terminator is `\n`. |
472 | | /// |
473 | | /// # Example |
474 | | /// |
475 | | /// This shows how to treat the NUL byte as a line terminator. This can |
476 | | /// be a useful heuristic when searching binary data. |
477 | | /// |
478 | | /// ``` |
479 | | /// use regex::RegexBuilder; |
480 | | /// |
481 | | /// let re = RegexBuilder::new(r"^foo$") |
482 | | /// .multi_line(true) |
483 | | /// .line_terminator(b'\x00') |
484 | | /// .build() |
485 | | /// .unwrap(); |
486 | | /// let hay = "\x00foo\x00"; |
487 | | /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range())); |
488 | | /// ``` |
489 | | /// |
490 | | /// This example shows that the behavior of `.` is impacted by this |
491 | | /// setting as well: |
492 | | /// |
493 | | /// ``` |
494 | | /// use regex::RegexBuilder; |
495 | | /// |
496 | | /// let re = RegexBuilder::new(r".") |
497 | | /// .line_terminator(b'\x00') |
498 | | /// .build() |
499 | | /// .unwrap(); |
500 | | /// assert!(re.is_match("\n")); |
501 | | /// assert!(!re.is_match("\x00")); |
502 | | /// ``` |
503 | | /// |
504 | | /// This shows that building a regex will fail if the byte given |
505 | | /// is not ASCII and the pattern could result in matching invalid |
506 | | /// UTF-8. This is because any singular non-ASCII byte is not valid |
507 | | /// UTF-8, and it is not permitted for a [`Regex`] to match invalid |
508 | | /// UTF-8. (It is permissible to use a non-ASCII byte when building a |
509 | | /// [`bytes::Regex`](crate::bytes::Regex).) |
510 | | /// |
511 | | /// ``` |
512 | | /// use regex::RegexBuilder; |
513 | | /// |
514 | | /// assert!(RegexBuilder::new(r".").line_terminator(0x80).build().is_err()); |
515 | | /// // Note that using a non-ASCII byte isn't enough on its own to |
516 | | /// // cause regex compilation to fail. You actually have to make use |
517 | | /// // of it in the regex in a way that leads to matching invalid |
518 | | /// // UTF-8. If you don't, then regex compilation will succeed! |
519 | | /// assert!(RegexBuilder::new(r"a").line_terminator(0x80).build().is_ok()); |
520 | | /// ``` |
521 | 0 | pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder { |
522 | 0 | self.builder.line_terminator(byte); |
523 | 0 | self |
524 | 0 | } |
525 | | |
526 | | /// This configures swap-greed mode for the entire pattern. |
527 | | /// |
528 | | /// When swap-greed mode is enabled, patterns like `a+` will become |
529 | | /// non-greedy and patterns like `a+?` will become greedy. In other |
530 | | /// words, the meanings of `a+` and `a+?` are switched. |
531 | | /// |
532 | | /// This setting can also be configured using the inline flag `U` in |
533 | | /// the pattern. |
534 | | /// |
535 | | /// The default for this is `false`. |
536 | | /// |
537 | | /// # Example |
538 | | /// |
539 | | /// ``` |
540 | | /// use regex::RegexBuilder; |
541 | | /// |
542 | | /// let re = RegexBuilder::new(r"a+") |
543 | | /// .swap_greed(true) |
544 | | /// .build() |
545 | | /// .unwrap(); |
546 | | /// assert_eq!(Some("a"), re.find("aaa").map(|m| m.as_str())); |
547 | | /// ``` |
548 | 0 | pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { |
549 | 0 | self.builder.swap_greed(yes); |
550 | 0 | self |
551 | 0 | } |
552 | | |
553 | | /// This configures verbose mode for the entire pattern. |
554 | | /// |
555 | | /// When enabled, whitespace will be treated as insignificant in the |
556 | | /// pattern and `#` can be used to start a comment until the next new |
557 | | /// line. |
558 | | /// |
559 | | /// Normally, in most places in a pattern, whitespace is treated |
560 | | /// literally. For example ` +` will match one or more ASCII whitespace |
561 | | /// characters. |
562 | | /// |
563 | | /// When verbose mode is enabled, `\#` can be used to match a literal |
564 | | /// `#` and `\ ` can be used to match a literal ASCII whitespace |
565 | | /// character. |
566 | | /// |
567 | | /// Verbose mode is useful for permitting regexes to be formatted and |
568 | | /// broken up more nicely. This may make them more easily readable. |
569 | | /// |
570 | | /// This setting can also be configured using the inline flag `x` in |
571 | | /// the pattern. |
572 | | /// |
573 | | /// The default for this is `false`. |
574 | | /// |
575 | | /// # Example |
576 | | /// |
577 | | /// ``` |
578 | | /// use regex::RegexBuilder; |
579 | | /// |
580 | | /// let pat = r" |
581 | | /// \b |
582 | | /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter |
583 | | /// [\s--\n]+ # whitespace should separate names |
584 | | /// (?: # middle name can be an initial! |
585 | | /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*)) |
586 | | /// [\s--\n]+ |
587 | | /// )? |
588 | | /// (?<last>\p{Uppercase}\w*) |
589 | | /// \b |
590 | | /// "; |
591 | | /// let re = RegexBuilder::new(pat) |
592 | | /// .ignore_whitespace(true) |
593 | | /// .build() |
594 | | /// .unwrap(); |
595 | | /// |
596 | | /// let caps = re.captures("Harry Potter").unwrap(); |
597 | | /// assert_eq!("Harry", &caps["first"]); |
598 | | /// assert_eq!("Potter", &caps["last"]); |
599 | | /// |
600 | | /// let caps = re.captures("Harry J. Potter").unwrap(); |
601 | | /// assert_eq!("Harry", &caps["first"]); |
602 | | /// // Since a middle name/initial isn't required for an overall match, |
603 | | /// // we can't assume that 'initial' or 'middle' will be populated! |
604 | | /// assert_eq!(Some("J"), caps.name("initial").map(|m| m.as_str())); |
605 | | /// assert_eq!(None, caps.name("middle").map(|m| m.as_str())); |
606 | | /// assert_eq!("Potter", &caps["last"]); |
607 | | /// |
608 | | /// let caps = re.captures("Harry James Potter").unwrap(); |
609 | | /// assert_eq!("Harry", &caps["first"]); |
610 | | /// // Since a middle name/initial isn't required for an overall match, |
611 | | /// // we can't assume that 'initial' or 'middle' will be populated! |
612 | | /// assert_eq!(None, caps.name("initial").map(|m| m.as_str())); |
613 | | /// assert_eq!(Some("James"), caps.name("middle").map(|m| m.as_str())); |
614 | | /// assert_eq!("Potter", &caps["last"]); |
615 | | /// ``` |
616 | 0 | pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { |
617 | 0 | self.builder.ignore_whitespace(yes); |
618 | 0 | self |
619 | 0 | } |
620 | | |
621 | | /// This configures octal mode for the entire pattern. |
622 | | /// |
623 | | /// Octal syntax is a little-known way of uttering Unicode codepoints |
624 | | /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all |
625 | | /// equivalent patterns, where the last example shows octal syntax. |
626 | | /// |
627 | | /// While supporting octal syntax isn't in and of itself a problem, |
628 | | /// it does make good error messages harder. That is, in PCRE based |
629 | | /// regex engines, syntax like `\1` invokes a backreference, which is |
630 | | /// explicitly unsupported by this library. However, many users expect |
631 | | /// backreferences to be supported. Therefore, when octal support |
632 | | /// is disabled, the error message will explicitly mention that |
633 | | /// backreferences aren't supported. |
634 | | /// |
635 | | /// The default for this is `false`. |
636 | | /// |
637 | | /// # Example |
638 | | /// |
639 | | /// ``` |
640 | | /// use regex::RegexBuilder; |
641 | | /// |
642 | | /// // Normally this pattern would not compile, with an error message |
643 | | /// // about backreferences not being supported. But with octal mode |
644 | | /// // enabled, octal escape sequences work. |
645 | | /// let re = RegexBuilder::new(r"\141") |
646 | | /// .octal(true) |
647 | | /// .build() |
648 | | /// .unwrap(); |
649 | | /// assert!(re.is_match("a")); |
650 | | /// ``` |
651 | 0 | pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder { |
652 | 0 | self.builder.octal(yes); |
653 | 0 | self |
654 | 0 | } |
655 | | |
656 | | /// Sets the approximate size limit, in bytes, of the compiled regex. |
657 | | /// |
658 | | /// This roughly corresponds to the amount of heap memory, in |
659 | | /// bytes, occupied by a single regex. If the regex would otherwise |
660 | | /// approximately exceed this limit, then compiling that regex will |
661 | | /// fail. |
662 | | /// |
663 | | /// The main utility of a method like this is to avoid compiling |
664 | | /// regexes that use an unexpected amount of resources, such as |
665 | | /// time and memory. Even if the memory usage of a large regex is |
666 | | /// acceptable, its search time may not be. Namely, worst case time |
667 | | /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and |
668 | | /// `n ~ len(haystack)`. That is, search time depends, in part, on the |
669 | | /// size of the compiled regex. This means that putting a limit on the |
670 | | /// size of the regex limits how much a regex can impact search time. |
671 | | /// |
672 | | /// For more information about regex size limits, see the section on |
673 | | /// [untrusted inputs](crate#untrusted-input) in the top-level crate |
674 | | /// documentation. |
675 | | /// |
676 | | /// The default for this is some reasonable number that permits most |
677 | | /// patterns to compile successfully. |
678 | | /// |
679 | | /// # Example |
680 | | /// |
681 | | /// ``` |
682 | | /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 |
683 | | /// use regex::RegexBuilder; |
684 | | /// |
685 | | /// // It may surprise you how big some seemingly small patterns can |
686 | | /// // be! Since \w is Unicode aware, this generates a regex that can |
687 | | /// // match approximately 140,000 distinct codepoints. |
688 | | /// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err()); |
689 | | /// ``` |
690 | 0 | pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder { |
691 | 0 | self.builder.size_limit(bytes); |
692 | 0 | self |
693 | 0 | } |
694 | | |
695 | | /// Set the approximate capacity, in bytes, of the cache of transitions |
696 | | /// used by the lazy DFA. |
697 | | /// |
698 | | /// While the lazy DFA isn't always used, it tends to be the most |
699 | | /// commonly used regex engine in default configurations. It tends to |
700 | | /// adopt the performance profile of a fully built DFA, but without the |
701 | | /// downside of taking worst case exponential time to build. |
702 | | /// |
703 | | /// The downside is that it needs to keep a cache of transitions and |
704 | | /// states that are built while running a search, and this cache |
705 | | /// can fill up. When it fills up, the cache will reset itself. Any |
706 | | /// previously generated states and transitions will then need to be |
707 | | /// re-generated. If this happens too many times, then this library |
708 | | /// will bail out of using the lazy DFA and switch to a different regex |
709 | | /// engine. |
710 | | /// |
711 | | /// If your regex provokes this particular downside of the lazy DFA, |
712 | | /// then it may be beneficial to increase its cache capacity. This will |
713 | | /// potentially reduce the frequency of cache resetting (ideally to |
714 | | /// `0`). While it won't fix all potential performance problems with |
715 | | /// the lazy DFA, increasing the cache capacity does fix some. |
716 | | /// |
717 | | /// There is no easy way to determine, a priori, whether increasing |
718 | | /// this cache capacity will help. In general, the larger your regex, |
719 | | /// the more cache it's likely to use. But that isn't an ironclad rule. |
720 | | /// For example, a regex like `[01]*1[01]{N}` would normally produce a |
721 | | /// fully built DFA that is exponential in size with respect to `N`. |
722 | | /// The lazy DFA will prevent exponential space blow-up, but its cache |
723 | | /// is likely to fill up, even when it's large and even for smallish |
724 | | /// values of `N`. |
725 | | /// |
726 | | /// If you aren't sure whether this helps or not, it is sensible to |
727 | | /// set this to some arbitrarily large number in testing, such as |
728 | | /// `usize::MAX`. Namely, this represents the amount of capacity that |
729 | | /// *may* be used. It's probably not a good idea to use `usize::MAX` in |
730 | | /// production though, since it implies there are no controls on heap |
731 | | /// memory used by this library during a search. In effect, set it to |
732 | | /// whatever you're willing to allocate for a single regex search. |
733 | 0 | pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder { |
734 | 0 | self.builder.dfa_size_limit(bytes); |
735 | 0 | self |
736 | 0 | } |
737 | | |
738 | | /// Set the nesting limit for this parser. |
739 | | /// |
740 | | /// The nesting limit controls how deep the abstract syntax tree is |
741 | | /// allowed to be. If the AST exceeds the given limit (e.g., with too |
742 | | /// many nested groups), then an error is returned by the parser. |
743 | | /// |
744 | | /// The purpose of this limit is to act as a heuristic to prevent stack |
745 | | /// overflow for consumers that do structural induction on an AST using |
746 | | /// explicit recursion. While this crate never does this (instead using |
747 | | /// constant stack space and moving the call stack to the heap), other |
748 | | /// crates may. |
749 | | /// |
750 | | /// This limit is not checked until the entire AST is parsed. |
751 | | /// Therefore, if callers want to put a limit on the amount of heap |
752 | | /// space used, then they should impose a limit on the length, in |
753 | | /// bytes, of the concrete pattern string. In particular, this is |
754 | | /// viable since this parser implementation will limit itself to heap |
755 | | /// space proportional to the length of the pattern string. See also |
756 | | /// the [untrusted inputs](crate#untrusted-input) section in the |
757 | | /// top-level crate documentation for more information about this. |
758 | | /// |
759 | | /// Note that a nest limit of `0` will return a nest limit error for |
760 | | /// most patterns but not all. For example, a nest limit of `0` permits |
761 | | /// `a` but not `ab`, since `ab` requires an explicit concatenation, |
762 | | /// which results in a nest depth of `1`. In general, a nest limit is |
763 | | /// not something that manifests in an obvious way in the concrete |
764 | | /// syntax, therefore, it should not be used in a granular way. |
765 | | /// |
766 | | /// # Example |
767 | | /// |
768 | | /// ``` |
769 | | /// use regex::RegexBuilder; |
770 | | /// |
771 | | /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok()); |
772 | | /// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err()); |
773 | | /// ``` |
774 | 0 | pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { |
775 | 0 | self.builder.nest_limit(limit); |
776 | 0 | self |
777 | 0 | } |
778 | | } |
779 | | |
780 | | /// A configurable builder for a [`RegexSet`]. |
781 | | /// |
782 | | /// This builder can be used to programmatically set flags such as |
783 | | /// `i` (case insensitive) and `x` (for verbose mode). This builder |
784 | | /// can also be used to configure things like the line terminator |
785 | | /// and a size limit on the compiled regular expression. |
786 | | #[derive(Clone, Debug)] |
787 | | pub struct RegexSetBuilder { |
788 | | builder: Builder, |
789 | | } |
790 | | |
791 | | impl RegexSetBuilder { |
792 | | /// Create a new builder with a default configuration for the given |
793 | | /// patterns. |
794 | | /// |
795 | | /// If the patterns are invalid or exceed the configured size limits, |
796 | | /// then an error will be returned when [`RegexSetBuilder::build`] is |
797 | | /// called. |
798 | 0 | pub fn new<I, S>(patterns: I) -> RegexSetBuilder |
799 | 0 | where |
800 | 0 | I: IntoIterator<Item = S>, |
801 | 0 | S: AsRef<str>, |
802 | 0 | { |
803 | 0 | RegexSetBuilder { builder: Builder::new(patterns) } |
804 | 0 | } |
805 | | |
806 | | /// Compiles the patterns given to `RegexSetBuilder::new` with the |
807 | | /// configuration set on this builder. |
808 | | /// |
809 | | /// If the patterns aren't valid regexes or if a configured size limit |
810 | | /// was exceeded, then an error is returned. |
811 | 0 | pub fn build(&self) -> Result<RegexSet, Error> { |
812 | 0 | self.builder.build_many_string() |
813 | 0 | } |
814 | | |
815 | | /// This configures Unicode mode for all of the patterns. |
816 | | /// |
817 | | /// Enabling Unicode mode does a number of things: |
818 | | /// |
819 | | /// * Most fundamentally, it causes the fundamental atom of matching |
820 | | /// to be a single codepoint. When Unicode mode is disabled, it's a |
821 | | /// single byte. For example, when Unicode mode is enabled, `.` will |
822 | | /// match `💩` once, whereas it will match 4 times when Unicode mode |
823 | | /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.) |
824 | | /// * Case insensitive matching uses Unicode simple case folding rules. |
825 | | /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are |
826 | | /// available. |
827 | | /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and |
828 | | /// `\d`. |
829 | | /// * The word boundary assertions, `\b` and `\B`, use the Unicode |
830 | | /// definition of a word character. |
831 | | /// |
832 | | /// Note that if Unicode mode is disabled, then the regex will fail to |
833 | | /// compile if it could match invalid UTF-8. For example, when Unicode |
834 | | /// mode is disabled, then since `.` matches any byte (except for |
835 | | /// `\n`), then it can match invalid UTF-8 and thus building a regex |
836 | | /// from it will fail. Another example is `\w` and `\W`. Since `\w` can |
837 | | /// only match ASCII bytes when Unicode mode is disabled, it's allowed. |
838 | | /// But `\W` can match more than ASCII bytes, including invalid UTF-8, |
839 | | /// and so it is not allowed. This restriction can be lifted only by |
840 | | /// using a [`bytes::RegexSet`](crate::bytes::RegexSet). |
841 | | /// |
842 | | /// For more details on the Unicode support in this crate, see the |
843 | | /// [Unicode section](crate#unicode) in this crate's top-level |
844 | | /// documentation. |
845 | | /// |
846 | | /// The default for this is `true`. |
847 | | /// |
848 | | /// # Example |
849 | | /// |
850 | | /// ``` |
851 | | /// use regex::RegexSetBuilder; |
852 | | /// |
853 | | /// let re = RegexSetBuilder::new([r"\w"]) |
854 | | /// .unicode(false) |
855 | | /// .build() |
856 | | /// .unwrap(); |
857 | | /// // Normally greek letters would be included in \w, but since |
858 | | /// // Unicode mode is disabled, it only matches ASCII letters. |
859 | | /// assert!(!re.is_match("δ")); |
860 | | /// |
861 | | /// let re = RegexSetBuilder::new([r"s"]) |
862 | | /// .case_insensitive(true) |
863 | | /// .unicode(false) |
864 | | /// .build() |
865 | | /// .unwrap(); |
866 | | /// // Normally 'ſ' is included when searching for 's' case |
867 | | /// // insensitively due to Unicode's simple case folding rules. But |
868 | | /// // when Unicode mode is disabled, only ASCII case insensitive rules |
869 | | /// // are used. |
870 | | /// assert!(!re.is_match("ſ")); |
871 | | /// ``` |
872 | 0 | pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder { |
873 | 0 | self.builder.unicode(yes); |
874 | 0 | self |
875 | 0 | } |
876 | | |
877 | | /// This configures whether to enable case insensitive matching for all |
878 | | /// of the patterns. |
879 | | /// |
880 | | /// This setting can also be configured using the inline flag `i` |
881 | | /// in the pattern. For example, `(?i:foo)` matches `foo` case |
882 | | /// insensitively while `(?-i:foo)` matches `foo` case sensitively. |
883 | | /// |
884 | | /// The default for this is `false`. |
885 | | /// |
886 | | /// # Example |
887 | | /// |
888 | | /// ``` |
889 | | /// use regex::RegexSetBuilder; |
890 | | /// |
891 | | /// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"]) |
892 | | /// .case_insensitive(true) |
893 | | /// .build() |
894 | | /// .unwrap(); |
895 | | /// assert!(re.is_match("FoObarQuUx")); |
896 | | /// // Even though case insensitive matching is enabled in the builder, |
897 | | /// // it can be locally disabled within the pattern. In this case, |
898 | | /// // `bar` is matched case sensitively. |
899 | | /// assert!(!re.is_match("fooBARquux")); |
900 | | /// ``` |
901 | 0 | pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder { |
902 | 0 | self.builder.case_insensitive(yes); |
903 | 0 | self |
904 | 0 | } |
905 | | |
906 | | /// This configures multi-line mode for all of the patterns. |
907 | | /// |
908 | | /// Enabling multi-line mode changes the behavior of the `^` and `$` |
909 | | /// anchor assertions. Instead of only matching at the beginning and |
910 | | /// end of a haystack, respectively, multi-line mode causes them to |
911 | | /// match at the beginning and end of a line *in addition* to the |
912 | | /// beginning and end of a haystack. More precisely, `^` will match at |
913 | | /// the position immediately following a `\n` and `$` will match at the |
914 | | /// position immediately preceding a `\n`. |
915 | | /// |
916 | | /// The behavior of this option can be impacted by other settings too: |
917 | | /// |
918 | | /// * The [`RegexSetBuilder::line_terminator`] option changes `\n` |
919 | | /// above to any ASCII byte. |
920 | | /// * The [`RegexSetBuilder::crlf`] option changes the line terminator |
921 | | /// to be either `\r` or `\n`, but never at the position between a `\r` |
922 | | /// and `\n`. |
923 | | /// |
924 | | /// This setting can also be configured using the inline flag `m` in |
925 | | /// the pattern. |
926 | | /// |
927 | | /// The default for this is `false`. |
928 | | /// |
929 | | /// # Example |
930 | | /// |
931 | | /// ``` |
932 | | /// use regex::RegexSetBuilder; |
933 | | /// |
934 | | /// let re = RegexSetBuilder::new([r"^foo$"]) |
935 | | /// .multi_line(true) |
936 | | /// .build() |
937 | | /// .unwrap(); |
938 | | /// assert!(re.is_match("\nfoo\n")); |
939 | | /// ``` |
940 | 0 | pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder { |
941 | 0 | self.builder.multi_line(yes); |
942 | 0 | self |
943 | 0 | } |
944 | | |
945 | | /// This configures dot-matches-new-line mode for all of the patterns. |
946 | | /// |
947 | | /// Perhaps surprisingly, the default behavior for `.` is not to match |
948 | | /// any character, but rather, to match any character except for the |
949 | | /// line terminator (which is `\n` by default). When this mode is |
950 | | /// enabled, the behavior changes such that `.` truly matches any |
951 | | /// character. |
952 | | /// |
953 | | /// This setting can also be configured using the inline flag `s` in |
954 | | /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent |
955 | | /// regexes. |
956 | | /// |
957 | | /// The default for this is `false`. |
958 | | /// |
959 | | /// # Example |
960 | | /// |
961 | | /// ``` |
962 | | /// use regex::RegexSetBuilder; |
963 | | /// |
964 | | /// let re = RegexSetBuilder::new([r"foo.bar"]) |
965 | | /// .dot_matches_new_line(true) |
966 | | /// .build() |
967 | | /// .unwrap(); |
968 | | /// let hay = "foo\nbar"; |
969 | | /// assert!(re.is_match(hay)); |
970 | | /// ``` |
971 | 0 | pub fn dot_matches_new_line( |
972 | 0 | &mut self, |
973 | 0 | yes: bool, |
974 | 0 | ) -> &mut RegexSetBuilder { |
975 | 0 | self.builder.dot_matches_new_line(yes); |
976 | 0 | self |
977 | 0 | } |
978 | | |
979 | | /// This configures CRLF mode for all of the patterns. |
980 | | /// |
981 | | /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for |
982 | | /// short) and `\n` ("line feed" or LF for short) are treated as line |
983 | | /// terminators. This results in the following: |
984 | | /// |
985 | | /// * Unless dot-matches-new-line mode is enabled, `.` will now match |
986 | | /// any character except for `\n` and `\r`. |
987 | | /// * When multi-line mode is enabled, `^` will match immediately |
988 | | /// following a `\n` or a `\r`. Similarly, `$` will match immediately |
989 | | /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match |
990 | | /// between `\r` and `\n`. |
991 | | /// |
992 | | /// This setting can also be configured using the inline flag `R` in |
993 | | /// the pattern. |
994 | | /// |
995 | | /// The default for this is `false`. |
996 | | /// |
997 | | /// # Example |
998 | | /// |
999 | | /// ``` |
1000 | | /// use regex::RegexSetBuilder; |
1001 | | /// |
1002 | | /// let re = RegexSetBuilder::new([r"^foo$"]) |
1003 | | /// .multi_line(true) |
1004 | | /// .crlf(true) |
1005 | | /// .build() |
1006 | | /// .unwrap(); |
1007 | | /// let hay = "\r\nfoo\r\n"; |
1008 | | /// // If CRLF mode weren't enabled here, then '$' wouldn't match |
1009 | | /// // immediately after 'foo', and thus no match would be found. |
1010 | | /// assert!(re.is_match(hay)); |
1011 | | /// ``` |
1012 | | /// |
1013 | | /// This example demonstrates that `^` will never match at a position |
1014 | | /// between `\r` and `\n`. (`$` will similarly not match between a `\r` |
1015 | | /// and a `\n`.) |
1016 | | /// |
1017 | | /// ``` |
1018 | | /// use regex::RegexSetBuilder; |
1019 | | /// |
1020 | | /// let re = RegexSetBuilder::new([r"^\n"]) |
1021 | | /// .multi_line(true) |
1022 | | /// .crlf(true) |
1023 | | /// .build() |
1024 | | /// .unwrap(); |
1025 | | /// assert!(!re.is_match("\r\n")); |
1026 | | /// ``` |
1027 | 0 | pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder { |
1028 | 0 | self.builder.crlf(yes); |
1029 | 0 | self |
1030 | 0 | } |
1031 | | |
1032 | | /// Configures the line terminator to be used by the regex. |
1033 | | /// |
1034 | | /// The line terminator is relevant in two ways for a particular regex: |
1035 | | /// |
1036 | | /// * When dot-matches-new-line mode is *not* enabled (the default), |
1037 | | /// then `.` will match any character except for the configured line |
1038 | | /// terminator. |
1039 | | /// * When multi-line mode is enabled (not the default), then `^` and |
1040 | | /// `$` will match immediately after and before, respectively, a line |
1041 | | /// terminator. |
1042 | | /// |
1043 | | /// In both cases, if CRLF mode is enabled in a particular context, |
1044 | | /// then it takes precedence over any configured line terminator. |
1045 | | /// |
1046 | | /// This option cannot be configured from within the pattern. |
1047 | | /// |
1048 | | /// The default line terminator is `\n`. |
1049 | | /// |
1050 | | /// # Example |
1051 | | /// |
1052 | | /// This shows how to treat the NUL byte as a line terminator. This can |
1053 | | /// be a useful heuristic when searching binary data. |
1054 | | /// |
1055 | | /// ``` |
1056 | | /// use regex::RegexSetBuilder; |
1057 | | /// |
1058 | | /// let re = RegexSetBuilder::new([r"^foo$"]) |
1059 | | /// .multi_line(true) |
1060 | | /// .line_terminator(b'\x00') |
1061 | | /// .build() |
1062 | | /// .unwrap(); |
1063 | | /// let hay = "\x00foo\x00"; |
1064 | | /// assert!(re.is_match(hay)); |
1065 | | /// ``` |
1066 | | /// |
1067 | | /// This example shows that the behavior of `.` is impacted by this |
1068 | | /// setting as well: |
1069 | | /// |
1070 | | /// ``` |
1071 | | /// use regex::RegexSetBuilder; |
1072 | | /// |
1073 | | /// let re = RegexSetBuilder::new([r"."]) |
1074 | | /// .line_terminator(b'\x00') |
1075 | | /// .build() |
1076 | | /// .unwrap(); |
1077 | | /// assert!(re.is_match("\n")); |
1078 | | /// assert!(!re.is_match("\x00")); |
1079 | | /// ``` |
1080 | | /// |
1081 | | /// This shows that building a regex will fail if the byte given |
1082 | | /// is not ASCII and the pattern could result in matching invalid |
1083 | | /// UTF-8. This is because any singular non-ASCII byte is not valid |
1084 | | /// UTF-8, and it is not permitted for a [`RegexSet`] to match invalid |
1085 | | /// UTF-8. (It is permissible to use a non-ASCII byte when building a |
1086 | | /// [`bytes::RegexSet`](crate::bytes::RegexSet).) |
1087 | | /// |
1088 | | /// ``` |
1089 | | /// use regex::RegexSetBuilder; |
1090 | | /// |
1091 | | /// assert!( |
1092 | | /// RegexSetBuilder::new([r"."]) |
1093 | | /// .line_terminator(0x80) |
1094 | | /// .build() |
1095 | | /// .is_err() |
1096 | | /// ); |
1097 | | /// // Note that using a non-ASCII byte isn't enough on its own to |
1098 | | /// // cause regex compilation to fail. You actually have to make use |
1099 | | /// // of it in the regex in a way that leads to matching invalid |
1100 | | /// // UTF-8. If you don't, then regex compilation will succeed! |
1101 | | /// assert!( |
1102 | | /// RegexSetBuilder::new([r"a"]) |
1103 | | /// .line_terminator(0x80) |
1104 | | /// .build() |
1105 | | /// .is_ok() |
1106 | | /// ); |
1107 | | /// ``` |
1108 | 0 | pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder { |
1109 | 0 | self.builder.line_terminator(byte); |
1110 | 0 | self |
1111 | 0 | } |
1112 | | |
1113 | | /// This configures swap-greed mode for all of the patterns. |
1114 | | /// |
1115 | | /// When swap-greed mode is enabled, patterns like `a+` will become |
1116 | | /// non-greedy and patterns like `a+?` will become greedy. In other |
1117 | | /// words, the meanings of `a+` and `a+?` are switched. |
1118 | | /// |
1119 | | /// This setting can also be configured using the inline flag `U` in |
1120 | | /// the pattern. |
1121 | | /// |
1122 | | /// Note that this is generally not useful for a `RegexSet` since a |
1123 | | /// `RegexSet` can only report whether a pattern matches or not. Since |
1124 | | /// greediness never impacts whether a match is found or not (only the |
1125 | | /// offsets of the match), it follows that whether parts of a pattern |
1126 | | /// are greedy or not doesn't matter for a `RegexSet`. |
1127 | | /// |
1128 | | /// The default for this is `false`. |
1129 | 0 | pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder { |
1130 | 0 | self.builder.swap_greed(yes); |
1131 | 0 | self |
1132 | 0 | } |
1133 | | |
1134 | | /// This configures verbose mode for all of the patterns. |
1135 | | /// |
1136 | | /// When enabled, whitespace will be treated as insignificant in the |
1137 | | /// pattern and `#` can be used to start a comment until the next new |
1138 | | /// line. |
1139 | | /// |
1140 | | /// Normally, in most places in a pattern, whitespace is treated |
1141 | | /// literally. For example ` +` will match one or more ASCII whitespace |
1142 | | /// characters. |
1143 | | /// |
1144 | | /// When verbose mode is enabled, `\#` can be used to match a literal |
1145 | | /// `#` and `\ ` can be used to match a literal ASCII whitespace |
1146 | | /// character. |
1147 | | /// |
1148 | | /// Verbose mode is useful for permitting regexes to be formatted and |
1149 | | /// broken up more nicely. This may make them more easily readable. |
1150 | | /// |
1151 | | /// This setting can also be configured using the inline flag `x` in |
1152 | | /// the pattern. |
1153 | | /// |
1154 | | /// The default for this is `false`. |
1155 | | /// |
1156 | | /// # Example |
1157 | | /// |
1158 | | /// ``` |
1159 | | /// use regex::RegexSetBuilder; |
1160 | | /// |
1161 | | /// let pat = r" |
1162 | | /// \b |
1163 | | /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter |
1164 | | /// [\s--\n]+ # whitespace should separate names |
1165 | | /// (?: # middle name can be an initial! |
1166 | | /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*)) |
1167 | | /// [\s--\n]+ |
1168 | | /// )? |
1169 | | /// (?<last>\p{Uppercase}\w*) |
1170 | | /// \b |
1171 | | /// "; |
1172 | | /// let re = RegexSetBuilder::new([pat]) |
1173 | | /// .ignore_whitespace(true) |
1174 | | /// .build() |
1175 | | /// .unwrap(); |
1176 | | /// assert!(re.is_match("Harry Potter")); |
1177 | | /// assert!(re.is_match("Harry J. Potter")); |
1178 | | /// assert!(re.is_match("Harry James Potter")); |
1179 | | /// assert!(!re.is_match("harry J. Potter")); |
1180 | | /// ``` |
1181 | 0 | pub fn ignore_whitespace( |
1182 | 0 | &mut self, |
1183 | 0 | yes: bool, |
1184 | 0 | ) -> &mut RegexSetBuilder { |
1185 | 0 | self.builder.ignore_whitespace(yes); |
1186 | 0 | self |
1187 | 0 | } |
1188 | | |
1189 | | /// This configures octal mode for all of the patterns. |
1190 | | /// |
1191 | | /// Octal syntax is a little-known way of uttering Unicode codepoints |
1192 | | /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all |
1193 | | /// equivalent patterns, where the last example shows octal syntax. |
1194 | | /// |
1195 | | /// While supporting octal syntax isn't in and of itself a problem, |
1196 | | /// it does make good error messages harder. That is, in PCRE based |
1197 | | /// regex engines, syntax like `\1` invokes a backreference, which is |
1198 | | /// explicitly unsupported by this library. However, many users expect |
1199 | | /// backreferences to be supported. Therefore, when octal support |
1200 | | /// is disabled, the error message will explicitly mention that |
1201 | | /// backreferences aren't supported. |
1202 | | /// |
1203 | | /// The default for this is `false`. |
1204 | | /// |
1205 | | /// # Example |
1206 | | /// |
1207 | | /// ``` |
1208 | | /// use regex::RegexSetBuilder; |
1209 | | /// |
1210 | | /// // Normally this pattern would not compile, with an error message |
1211 | | /// // about backreferences not being supported. But with octal mode |
1212 | | /// // enabled, octal escape sequences work. |
1213 | | /// let re = RegexSetBuilder::new([r"\141"]) |
1214 | | /// .octal(true) |
1215 | | /// .build() |
1216 | | /// .unwrap(); |
1217 | | /// assert!(re.is_match("a")); |
1218 | | /// ``` |
1219 | 0 | pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder { |
1220 | 0 | self.builder.octal(yes); |
1221 | 0 | self |
1222 | 0 | } |
1223 | | |
1224 | | /// Sets the approximate size limit, in bytes, of the compiled regex. |
1225 | | /// |
1226 | | /// This roughly corresponds to the amount of heap memory, in |
1227 | | /// bytes, occupied by a single regex. If the regex would otherwise |
1228 | | /// approximately exceed this limit, then compiling that regex will |
1229 | | /// fail. |
1230 | | /// |
1231 | | /// The main utility of a method like this is to avoid compiling |
1232 | | /// regexes that use an unexpected amount of resources, such as |
1233 | | /// time and memory. Even if the memory usage of a large regex is |
1234 | | /// acceptable, its search time may not be. Namely, worst case time |
1235 | | /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and |
1236 | | /// `n ~ len(haystack)`. That is, search time depends, in part, on the |
1237 | | /// size of the compiled regex. This means that putting a limit on the |
1238 | | /// size of the regex limits how much a regex can impact search time. |
1239 | | /// |
1240 | | /// For more information about regex size limits, see the section on |
1241 | | /// [untrusted inputs](crate#untrusted-input) in the top-level crate |
1242 | | /// documentation. |
1243 | | /// |
1244 | | /// The default for this is some reasonable number that permits most |
1245 | | /// patterns to compile successfully. |
1246 | | /// |
1247 | | /// # Example |
1248 | | /// |
1249 | | /// ``` |
1250 | | /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 |
1251 | | /// use regex::RegexSetBuilder; |
1252 | | /// |
1253 | | /// // It may surprise you how big some seemingly small patterns can |
1254 | | /// // be! Since \w is Unicode aware, this generates a regex that can |
1255 | | /// // match approximately 140,000 distinct codepoints. |
1256 | | /// assert!( |
1257 | | /// RegexSetBuilder::new([r"\w"]) |
1258 | | /// .size_limit(45_000) |
1259 | | /// .build() |
1260 | | /// .is_err() |
1261 | | /// ); |
1262 | | /// ``` |
1263 | 0 | pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder { |
1264 | 0 | self.builder.size_limit(bytes); |
1265 | 0 | self |
1266 | 0 | } |
1267 | | |
1268 | | /// Set the approximate capacity, in bytes, of the cache of transitions |
1269 | | /// used by the lazy DFA. |
1270 | | /// |
1271 | | /// While the lazy DFA isn't always used, it tends to be the most |
1272 | | /// commonly used regex engine in default configurations. It tends to |
1273 | | /// adopt the performance profile of a fully built DFA, but without the |
1274 | | /// downside of taking worst case exponential time to build. |
1275 | | /// |
1276 | | /// The downside is that it needs to keep a cache of transitions and |
1277 | | /// states that are built while running a search, and this cache |
1278 | | /// can fill up. When it fills up, the cache will reset itself. Any |
1279 | | /// previously generated states and transitions will then need to be |
1280 | | /// re-generated. If this happens too many times, then this library |
1281 | | /// will bail out of using the lazy DFA and switch to a different regex |
1282 | | /// engine. |
1283 | | /// |
1284 | | /// If your regex provokes this particular downside of the lazy DFA, |
1285 | | /// then it may be beneficial to increase its cache capacity. This will |
1286 | | /// potentially reduce the frequency of cache resetting (ideally to |
1287 | | /// `0`). While it won't fix all potential performance problems with |
1288 | | /// the lazy DFA, increasing the cache capacity does fix some. |
1289 | | /// |
1290 | | /// There is no easy way to determine, a priori, whether increasing |
1291 | | /// this cache capacity will help. In general, the larger your regex, |
1292 | | /// the more cache it's likely to use. But that isn't an ironclad rule. |
1293 | | /// For example, a regex like `[01]*1[01]{N}` would normally produce a |
1294 | | /// fully built DFA that is exponential in size with respect to `N`. |
1295 | | /// The lazy DFA will prevent exponential space blow-up, but its cache |
1296 | | /// is likely to fill up, even when it's large and even for smallish |
1297 | | /// values of `N`. |
1298 | | /// |
1299 | | /// If you aren't sure whether this helps or not, it is sensible to |
1300 | | /// set this to some arbitrarily large number in testing, such as |
1301 | | /// `usize::MAX`. Namely, this represents the amount of capacity that |
1302 | | /// *may* be used. It's probably not a good idea to use `usize::MAX` in |
1303 | | /// production though, since it implies there are no controls on heap |
1304 | | /// memory used by this library during a search. In effect, set it to |
1305 | | /// whatever you're willing to allocate for a single regex search. |
1306 | 0 | pub fn dfa_size_limit( |
1307 | 0 | &mut self, |
1308 | 0 | bytes: usize, |
1309 | 0 | ) -> &mut RegexSetBuilder { |
1310 | 0 | self.builder.dfa_size_limit(bytes); |
1311 | 0 | self |
1312 | 0 | } |
1313 | | |
1314 | | /// Set the nesting limit for this parser. |
1315 | | /// |
1316 | | /// The nesting limit controls how deep the abstract syntax tree is |
1317 | | /// allowed to be. If the AST exceeds the given limit (e.g., with too |
1318 | | /// many nested groups), then an error is returned by the parser. |
1319 | | /// |
1320 | | /// The purpose of this limit is to act as a heuristic to prevent stack |
1321 | | /// overflow for consumers that do structural induction on an AST using |
1322 | | /// explicit recursion. While this crate never does this (instead using |
1323 | | /// constant stack space and moving the call stack to the heap), other |
1324 | | /// crates may. |
1325 | | /// |
1326 | | /// This limit is not checked until the entire AST is parsed. |
1327 | | /// Therefore, if callers want to put a limit on the amount of heap |
1328 | | /// space used, then they should impose a limit on the length, in |
1329 | | /// bytes, of the concrete pattern string. In particular, this is |
1330 | | /// viable since this parser implementation will limit itself to heap |
1331 | | /// space proportional to the length of the pattern string. See also |
1332 | | /// the [untrusted inputs](crate#untrusted-input) section in the |
1333 | | /// top-level crate documentation for more information about this. |
1334 | | /// |
1335 | | /// Note that a nest limit of `0` will return a nest limit error for |
1336 | | /// most patterns but not all. For example, a nest limit of `0` permits |
1337 | | /// `a` but not `ab`, since `ab` requires an explicit concatenation, |
1338 | | /// which results in a nest depth of `1`. In general, a nest limit is |
1339 | | /// not something that manifests in an obvious way in the concrete |
1340 | | /// syntax, therefore, it should not be used in a granular way. |
1341 | | /// |
1342 | | /// # Example |
1343 | | /// |
1344 | | /// ``` |
1345 | | /// use regex::RegexSetBuilder; |
1346 | | /// |
1347 | | /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok()); |
1348 | | /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err()); |
1349 | | /// ``` |
1350 | 0 | pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder { |
1351 | 0 | self.builder.nest_limit(limit); |
1352 | 0 | self |
1353 | 0 | } |
1354 | | } |
1355 | | } |
1356 | | |
1357 | | pub(crate) mod bytes { |
1358 | | use crate::{ |
1359 | | bytes::{Regex, RegexSet}, |
1360 | | error::Error, |
1361 | | }; |
1362 | | |
1363 | | use super::Builder; |
1364 | | |
1365 | | /// A configurable builder for a [`Regex`]. |
1366 | | /// |
1367 | | /// This builder can be used to programmatically set flags such as `i` |
1368 | | /// (case insensitive) and `x` (for verbose mode). This builder can also be |
1369 | | /// used to configure things like the line terminator and a size limit on |
1370 | | /// the compiled regular expression. |
1371 | | #[derive(Clone, Debug)] |
1372 | | pub struct RegexBuilder { |
1373 | | builder: Builder, |
1374 | | } |
1375 | | |
1376 | | impl RegexBuilder { |
1377 | | /// Create a new builder with a default configuration for the given |
1378 | | /// pattern. |
1379 | | /// |
1380 | | /// If the pattern is invalid or exceeds the configured size limits, |
1381 | | /// then an error will be returned when [`RegexBuilder::build`] is |
1382 | | /// called. |
1383 | 0 | pub fn new(pattern: &str) -> RegexBuilder { |
1384 | 0 | RegexBuilder { builder: Builder::new([pattern]) } |
1385 | 0 | } |
1386 | | |
1387 | | /// Compiles the pattern given to `RegexBuilder::new` with the |
1388 | | /// configuration set on this builder. |
1389 | | /// |
1390 | | /// If the pattern isn't a valid regex or if a configured size limit |
1391 | | /// was exceeded, then an error is returned. |
1392 | 0 | pub fn build(&self) -> Result<Regex, Error> { |
1393 | 0 | self.builder.build_one_bytes() |
1394 | 0 | } |
1395 | | |
1396 | | /// This configures Unicode mode for the entire pattern. |
1397 | | /// |
1398 | | /// Enabling Unicode mode does a number of things: |
1399 | | /// |
1400 | | /// * Most fundamentally, it causes the fundamental atom of matching |
1401 | | /// to be a single codepoint. When Unicode mode is disabled, it's a |
1402 | | /// single byte. For example, when Unicode mode is enabled, `.` will |
1403 | | /// match `💩` once, whereas it will match 4 times when Unicode mode |
1404 | | /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.) |
1405 | | /// * Case insensitive matching uses Unicode simple case folding rules. |
1406 | | /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are |
1407 | | /// available. |
1408 | | /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and |
1409 | | /// `\d`. |
1410 | | /// * The word boundary assertions, `\b` and `\B`, use the Unicode |
1411 | | /// definition of a word character. |
1412 | | /// |
1413 | | /// Note that unlike the top-level `Regex` for searching `&str`, it |
1414 | | /// is permitted to disable Unicode mode even if the resulting pattern |
1415 | | /// could match invalid UTF-8. For example, `(?-u:.)` is not a valid |
1416 | | /// pattern for a top-level `Regex`, but is valid for a `bytes::Regex`. |
1417 | | /// |
1418 | | /// For more details on the Unicode support in this crate, see the |
1419 | | /// [Unicode section](crate#unicode) in this crate's top-level |
1420 | | /// documentation. |
1421 | | /// |
1422 | | /// The default for this is `true`. |
1423 | | /// |
1424 | | /// # Example |
1425 | | /// |
1426 | | /// ``` |
1427 | | /// use regex::bytes::RegexBuilder; |
1428 | | /// |
1429 | | /// let re = RegexBuilder::new(r"\w") |
1430 | | /// .unicode(false) |
1431 | | /// .build() |
1432 | | /// .unwrap(); |
1433 | | /// // Normally greek letters would be included in \w, but since |
1434 | | /// // Unicode mode is disabled, it only matches ASCII letters. |
1435 | | /// assert!(!re.is_match("δ".as_bytes())); |
1436 | | /// |
1437 | | /// let re = RegexBuilder::new(r"s") |
1438 | | /// .case_insensitive(true) |
1439 | | /// .unicode(false) |
1440 | | /// .build() |
1441 | | /// .unwrap(); |
1442 | | /// // Normally 'ſ' is included when searching for 's' case |
1443 | | /// // insensitively due to Unicode's simple case folding rules. But |
1444 | | /// // when Unicode mode is disabled, only ASCII case insensitive rules |
1445 | | /// // are used. |
1446 | | /// assert!(!re.is_match("ſ".as_bytes())); |
1447 | | /// ``` |
1448 | | /// |
1449 | | /// Since this builder is for constructing a [`bytes::Regex`](Regex), |
1450 | | /// one can disable Unicode mode even if it would match invalid UTF-8: |
1451 | | /// |
1452 | | /// ``` |
1453 | | /// use regex::bytes::RegexBuilder; |
1454 | | /// |
1455 | | /// let re = RegexBuilder::new(r".") |
1456 | | /// .unicode(false) |
1457 | | /// .build() |
1458 | | /// .unwrap(); |
1459 | | /// // Normally `.` matches one whole codepoint, but since Unicode mode |
1460 | | /// // is disabled, it matches any byte except `\n`, even invalid UTF-8. |
1461 | | /// assert!(re.is_match(b"\xFF")); |
1462 | | /// ``` |
1463 | 0 | pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { |
1464 | 0 | self.builder.unicode(yes); |
1465 | 0 | self |
1466 | 0 | } |
1467 | | |
1468 | | /// This configures whether to enable case insensitive matching for the |
1469 | | /// entire pattern. |
1470 | | /// |
1471 | | /// This setting can also be configured using the inline flag `i` |
1472 | | /// in the pattern. For example, `(?i:foo)` matches `foo` case |
1473 | | /// insensitively while `(?-i:foo)` matches `foo` case sensitively. |
1474 | | /// |
1475 | | /// The default for this is `false`. |
1476 | | /// |
1477 | | /// # Example |
1478 | | /// |
1479 | | /// ``` |
1480 | | /// use regex::bytes::RegexBuilder; |
1481 | | /// |
1482 | | /// let re = RegexBuilder::new(r"foo(?-i:bar)quux") |
1483 | | /// .case_insensitive(true) |
1484 | | /// .build() |
1485 | | /// .unwrap(); |
1486 | | /// assert!(re.is_match(b"FoObarQuUx")); |
1487 | | /// // Even though case insensitive matching is enabled in the builder, |
1488 | | /// // it can be locally disabled within the pattern. In this case, |
1489 | | /// // `bar` is matched case sensitively. |
1490 | | /// assert!(!re.is_match(b"fooBARquux")); |
1491 | | /// ``` |
1492 | 0 | pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { |
1493 | 0 | self.builder.case_insensitive(yes); |
1494 | 0 | self |
1495 | 0 | } |
1496 | | |
1497 | | /// This configures multi-line mode for the entire pattern. |
1498 | | /// |
1499 | | /// Enabling multi-line mode changes the behavior of the `^` and `$` |
1500 | | /// anchor assertions. Instead of only matching at the beginning and |
1501 | | /// end of a haystack, respectively, multi-line mode causes them to |
1502 | | /// match at the beginning and end of a line *in addition* to the |
1503 | | /// beginning and end of a haystack. More precisely, `^` will match at |
1504 | | /// the position immediately following a `\n` and `$` will match at the |
1505 | | /// position immediately preceding a `\n`. |
1506 | | /// |
1507 | | /// The behavior of this option can be impacted by other settings too: |
1508 | | /// |
1509 | | /// * The [`RegexBuilder::line_terminator`] option changes `\n` above |
1510 | | /// to any ASCII byte. |
1511 | | /// * The [`RegexBuilder::crlf`] option changes the line terminator to |
1512 | | /// be either `\r` or `\n`, but never at the position between a `\r` |
1513 | | /// and `\n`. |
1514 | | /// |
1515 | | /// This setting can also be configured using the inline flag `m` in |
1516 | | /// the pattern. |
1517 | | /// |
1518 | | /// The default for this is `false`. |
1519 | | /// |
1520 | | /// # Example |
1521 | | /// |
1522 | | /// ``` |
1523 | | /// use regex::bytes::RegexBuilder; |
1524 | | /// |
1525 | | /// let re = RegexBuilder::new(r"^foo$") |
1526 | | /// .multi_line(true) |
1527 | | /// .build() |
1528 | | /// .unwrap(); |
1529 | | /// assert_eq!(Some(1..4), re.find(b"\nfoo\n").map(|m| m.range())); |
1530 | | /// ``` |
1531 | 0 | pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { |
1532 | 0 | self.builder.multi_line(yes); |
1533 | 0 | self |
1534 | 0 | } |
1535 | | |
1536 | | /// This configures dot-matches-new-line mode for the entire pattern. |
1537 | | /// |
1538 | | /// Perhaps surprisingly, the default behavior for `.` is not to match |
1539 | | /// any character, but rather, to match any character except for the |
1540 | | /// line terminator (which is `\n` by default). When this mode is |
1541 | | /// enabled, the behavior changes such that `.` truly matches any |
1542 | | /// character. |
1543 | | /// |
1544 | | /// This setting can also be configured using the inline flag `s` in |
1545 | | /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent |
1546 | | /// regexes. |
1547 | | /// |
1548 | | /// The default for this is `false`. |
1549 | | /// |
1550 | | /// # Example |
1551 | | /// |
1552 | | /// ``` |
1553 | | /// use regex::bytes::RegexBuilder; |
1554 | | /// |
1555 | | /// let re = RegexBuilder::new(r"foo.bar") |
1556 | | /// .dot_matches_new_line(true) |
1557 | | /// .build() |
1558 | | /// .unwrap(); |
1559 | | /// let hay = b"foo\nbar"; |
1560 | | /// assert_eq!(Some(&b"foo\nbar"[..]), re.find(hay).map(|m| m.as_bytes())); |
1561 | | /// ``` |
1562 | 0 | pub fn dot_matches_new_line( |
1563 | 0 | &mut self, |
1564 | 0 | yes: bool, |
1565 | 0 | ) -> &mut RegexBuilder { |
1566 | 0 | self.builder.dot_matches_new_line(yes); |
1567 | 0 | self |
1568 | 0 | } |
1569 | | |
1570 | | /// This configures CRLF mode for the entire pattern. |
1571 | | /// |
1572 | | /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for |
1573 | | /// short) and `\n` ("line feed" or LF for short) are treated as line |
1574 | | /// terminators. This results in the following: |
1575 | | /// |
1576 | | /// * Unless dot-matches-new-line mode is enabled, `.` will now match |
1577 | | /// any character except for `\n` and `\r`. |
1578 | | /// * When multi-line mode is enabled, `^` will match immediately |
1579 | | /// following a `\n` or a `\r`. Similarly, `$` will match immediately |
1580 | | /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match |
1581 | | /// between `\r` and `\n`. |
1582 | | /// |
1583 | | /// This setting can also be configured using the inline flag `R` in |
1584 | | /// the pattern. |
1585 | | /// |
1586 | | /// The default for this is `false`. |
1587 | | /// |
1588 | | /// # Example |
1589 | | /// |
1590 | | /// ``` |
1591 | | /// use regex::bytes::RegexBuilder; |
1592 | | /// |
1593 | | /// let re = RegexBuilder::new(r"^foo$") |
1594 | | /// .multi_line(true) |
1595 | | /// .crlf(true) |
1596 | | /// .build() |
1597 | | /// .unwrap(); |
1598 | | /// let hay = b"\r\nfoo\r\n"; |
1599 | | /// // If CRLF mode weren't enabled here, then '$' wouldn't match |
1600 | | /// // immediately after 'foo', and thus no match would be found. |
1601 | | /// assert_eq!(Some(&b"foo"[..]), re.find(hay).map(|m| m.as_bytes())); |
1602 | | /// ``` |
1603 | | /// |
1604 | | /// This example demonstrates that `^` will never match at a position |
1605 | | /// between `\r` and `\n`. (`$` will similarly not match between a `\r` |
1606 | | /// and a `\n`.) |
1607 | | /// |
1608 | | /// ``` |
1609 | | /// use regex::bytes::RegexBuilder; |
1610 | | /// |
1611 | | /// let re = RegexBuilder::new(r"^") |
1612 | | /// .multi_line(true) |
1613 | | /// .crlf(true) |
1614 | | /// .build() |
1615 | | /// .unwrap(); |
1616 | | /// let hay = b"\r\n\r\n"; |
1617 | | /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect(); |
1618 | | /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]); |
1619 | | /// ``` |
1620 | 0 | pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder { |
1621 | 0 | self.builder.crlf(yes); |
1622 | 0 | self |
1623 | 0 | } |
1624 | | |
1625 | | /// Configures the line terminator to be used by the regex. |
1626 | | /// |
1627 | | /// The line terminator is relevant in two ways for a particular regex: |
1628 | | /// |
1629 | | /// * When dot-matches-new-line mode is *not* enabled (the default), |
1630 | | /// then `.` will match any character except for the configured line |
1631 | | /// terminator. |
1632 | | /// * When multi-line mode is enabled (not the default), then `^` and |
1633 | | /// `$` will match immediately after and before, respectively, a line |
1634 | | /// terminator. |
1635 | | /// |
1636 | | /// In both cases, if CRLF mode is enabled in a particular context, |
1637 | | /// then it takes precedence over any configured line terminator. |
1638 | | /// |
1639 | | /// This option cannot be configured from within the pattern. |
1640 | | /// |
1641 | | /// The default line terminator is `\n`. |
1642 | | /// |
1643 | | /// # Example |
1644 | | /// |
1645 | | /// This shows how to treat the NUL byte as a line terminator. This can |
1646 | | /// be a useful heuristic when searching binary data. |
1647 | | /// |
1648 | | /// ``` |
1649 | | /// use regex::bytes::RegexBuilder; |
1650 | | /// |
1651 | | /// let re = RegexBuilder::new(r"^foo$") |
1652 | | /// .multi_line(true) |
1653 | | /// .line_terminator(b'\x00') |
1654 | | /// .build() |
1655 | | /// .unwrap(); |
1656 | | /// let hay = b"\x00foo\x00"; |
1657 | | /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range())); |
1658 | | /// ``` |
1659 | | /// |
1660 | | /// This example shows that the behavior of `.` is impacted by this |
1661 | | /// setting as well: |
1662 | | /// |
1663 | | /// ``` |
1664 | | /// use regex::bytes::RegexBuilder; |
1665 | | /// |
1666 | | /// let re = RegexBuilder::new(r".") |
1667 | | /// .line_terminator(b'\x00') |
1668 | | /// .build() |
1669 | | /// .unwrap(); |
1670 | | /// assert!(re.is_match(b"\n")); |
1671 | | /// assert!(!re.is_match(b"\x00")); |
1672 | | /// ``` |
1673 | | /// |
1674 | | /// This shows that building a regex will work even when the byte |
1675 | | /// given is not ASCII. This is unlike the top-level `Regex` API where |
1676 | | /// matching invalid UTF-8 is not allowed. |
1677 | | /// |
1678 | | /// Note though that you must disable Unicode mode. This is required |
1679 | | /// because Unicode mode requires matching one codepoint at a time, |
1680 | | /// and there is no way to match a non-ASCII byte as if it were a |
1681 | | /// codepoint. |
1682 | | /// |
1683 | | /// ``` |
1684 | | /// use regex::bytes::RegexBuilder; |
1685 | | /// |
1686 | | /// assert!( |
1687 | | /// RegexBuilder::new(r".") |
1688 | | /// .unicode(false) |
1689 | | /// .line_terminator(0x80) |
1690 | | /// .build() |
1691 | | /// .is_ok(), |
1692 | | /// ); |
1693 | | /// ``` |
1694 | 0 | pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder { |
1695 | 0 | self.builder.line_terminator(byte); |
1696 | 0 | self |
1697 | 0 | } |
1698 | | |
1699 | | /// This configures swap-greed mode for the entire pattern. |
1700 | | /// |
1701 | | /// When swap-greed mode is enabled, patterns like `a+` will become |
1702 | | /// non-greedy and patterns like `a+?` will become greedy. In other |
1703 | | /// words, the meanings of `a+` and `a+?` are switched. |
1704 | | /// |
1705 | | /// This setting can also be configured using the inline flag `U` in |
1706 | | /// the pattern. |
1707 | | /// |
1708 | | /// The default for this is `false`. |
1709 | | /// |
1710 | | /// # Example |
1711 | | /// |
1712 | | /// ``` |
1713 | | /// use regex::bytes::RegexBuilder; |
1714 | | /// |
1715 | | /// let re = RegexBuilder::new(r"a+") |
1716 | | /// .swap_greed(true) |
1717 | | /// .build() |
1718 | | /// .unwrap(); |
1719 | | /// assert_eq!(Some(&b"a"[..]), re.find(b"aaa").map(|m| m.as_bytes())); |
1720 | | /// ``` |
1721 | 0 | pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { |
1722 | 0 | self.builder.swap_greed(yes); |
1723 | 0 | self |
1724 | 0 | } |
1725 | | |
1726 | | /// This configures verbose mode for the entire pattern. |
1727 | | /// |
1728 | | /// When enabled, whitespace will be treated as insignificant in the |
1729 | | /// pattern and `#` can be used to start a comment until the next new |
1730 | | /// line. |
1731 | | /// |
1732 | | /// Normally, in most places in a pattern, whitespace is treated |
1733 | | /// literally. For example ` +` will match one or more ASCII whitespace |
1734 | | /// characters. |
1735 | | /// |
1736 | | /// When verbose mode is enabled, `\#` can be used to match a literal |
1737 | | /// `#` and `\ ` can be used to match a literal ASCII whitespace |
1738 | | /// character. |
1739 | | /// |
1740 | | /// Verbose mode is useful for permitting regexes to be formatted and |
1741 | | /// broken up more nicely. This may make them more easily readable. |
1742 | | /// |
1743 | | /// This setting can also be configured using the inline flag `x` in |
1744 | | /// the pattern. |
1745 | | /// |
1746 | | /// The default for this is `false`. |
1747 | | /// |
1748 | | /// # Example |
1749 | | /// |
1750 | | /// ``` |
1751 | | /// use regex::bytes::RegexBuilder; |
1752 | | /// |
1753 | | /// let pat = r" |
1754 | | /// \b |
1755 | | /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter |
1756 | | /// [\s--\n]+ # whitespace should separate names |
1757 | | /// (?: # middle name can be an initial! |
1758 | | /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*)) |
1759 | | /// [\s--\n]+ |
1760 | | /// )? |
1761 | | /// (?<last>\p{Uppercase}\w*) |
1762 | | /// \b |
1763 | | /// "; |
1764 | | /// let re = RegexBuilder::new(pat) |
1765 | | /// .ignore_whitespace(true) |
1766 | | /// .build() |
1767 | | /// .unwrap(); |
1768 | | /// |
1769 | | /// let caps = re.captures(b"Harry Potter").unwrap(); |
1770 | | /// assert_eq!(&b"Harry"[..], &caps["first"]); |
1771 | | /// assert_eq!(&b"Potter"[..], &caps["last"]); |
1772 | | /// |
1773 | | /// let caps = re.captures(b"Harry J. Potter").unwrap(); |
1774 | | /// assert_eq!(&b"Harry"[..], &caps["first"]); |
1775 | | /// // Since a middle name/initial isn't required for an overall match, |
1776 | | /// // we can't assume that 'initial' or 'middle' will be populated! |
1777 | | /// assert_eq!( |
1778 | | /// Some(&b"J"[..]), |
1779 | | /// caps.name("initial").map(|m| m.as_bytes()), |
1780 | | /// ); |
1781 | | /// assert_eq!(None, caps.name("middle").map(|m| m.as_bytes())); |
1782 | | /// assert_eq!(&b"Potter"[..], &caps["last"]); |
1783 | | /// |
1784 | | /// let caps = re.captures(b"Harry James Potter").unwrap(); |
1785 | | /// assert_eq!(&b"Harry"[..], &caps["first"]); |
1786 | | /// // Since a middle name/initial isn't required for an overall match, |
1787 | | /// // we can't assume that 'initial' or 'middle' will be populated! |
1788 | | /// assert_eq!(None, caps.name("initial").map(|m| m.as_bytes())); |
1789 | | /// assert_eq!( |
1790 | | /// Some(&b"James"[..]), |
1791 | | /// caps.name("middle").map(|m| m.as_bytes()), |
1792 | | /// ); |
1793 | | /// assert_eq!(&b"Potter"[..], &caps["last"]); |
1794 | | /// ``` |
1795 | 0 | pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { |
1796 | 0 | self.builder.ignore_whitespace(yes); |
1797 | 0 | self |
1798 | 0 | } |
1799 | | |
1800 | | /// This configures octal mode for the entire pattern. |
1801 | | /// |
1802 | | /// Octal syntax is a little-known way of uttering Unicode codepoints |
1803 | | /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all |
1804 | | /// equivalent patterns, where the last example shows octal syntax. |
1805 | | /// |
1806 | | /// While supporting octal syntax isn't in and of itself a problem, |
1807 | | /// it does make good error messages harder. That is, in PCRE based |
1808 | | /// regex engines, syntax like `\1` invokes a backreference, which is |
1809 | | /// explicitly unsupported by this library. However, many users expect |
1810 | | /// backreferences to be supported. Therefore, when octal support |
1811 | | /// is disabled, the error message will explicitly mention that |
1812 | | /// backreferences aren't supported. |
1813 | | /// |
1814 | | /// The default for this is `false`. |
1815 | | /// |
1816 | | /// # Example |
1817 | | /// |
1818 | | /// ``` |
1819 | | /// use regex::bytes::RegexBuilder; |
1820 | | /// |
1821 | | /// // Normally this pattern would not compile, with an error message |
1822 | | /// // about backreferences not being supported. But with octal mode |
1823 | | /// // enabled, octal escape sequences work. |
1824 | | /// let re = RegexBuilder::new(r"\141") |
1825 | | /// .octal(true) |
1826 | | /// .build() |
1827 | | /// .unwrap(); |
1828 | | /// assert!(re.is_match(b"a")); |
1829 | | /// ``` |
1830 | 0 | pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder { |
1831 | 0 | self.builder.octal(yes); |
1832 | 0 | self |
1833 | 0 | } |
1834 | | |
1835 | | /// Sets the approximate size limit, in bytes, of the compiled regex. |
1836 | | /// |
1837 | | /// This roughly corresponds to the amount of heap memory, in |
1838 | | /// bytes, occupied by a single regex. If the regex would otherwise |
1839 | | /// approximately exceed this limit, then compiling that regex will |
1840 | | /// fail. |
1841 | | /// |
1842 | | /// The main utility of a method like this is to avoid compiling |
1843 | | /// regexes that use an unexpected amount of resources, such as |
1844 | | /// time and memory. Even if the memory usage of a large regex is |
1845 | | /// acceptable, its search time may not be. Namely, worst case time |
1846 | | /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and |
1847 | | /// `n ~ len(haystack)`. That is, search time depends, in part, on the |
1848 | | /// size of the compiled regex. This means that putting a limit on the |
1849 | | /// size of the regex limits how much a regex can impact search time. |
1850 | | /// |
1851 | | /// For more information about regex size limits, see the section on |
1852 | | /// [untrusted inputs](crate#untrusted-input) in the top-level crate |
1853 | | /// documentation. |
1854 | | /// |
1855 | | /// The default for this is some reasonable number that permits most |
1856 | | /// patterns to compile successfully. |
1857 | | /// |
1858 | | /// # Example |
1859 | | /// |
1860 | | /// ``` |
1861 | | /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 |
1862 | | /// use regex::bytes::RegexBuilder; |
1863 | | /// |
1864 | | /// // It may surprise you how big some seemingly small patterns can |
1865 | | /// // be! Since \w is Unicode aware, this generates a regex that can |
1866 | | /// // match approximately 140,000 distinct codepoints. |
1867 | | /// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err()); |
1868 | | /// ``` |
1869 | 0 | pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder { |
1870 | 0 | self.builder.size_limit(bytes); |
1871 | 0 | self |
1872 | 0 | } |
1873 | | |
1874 | | /// Set the approximate capacity, in bytes, of the cache of transitions |
1875 | | /// used by the lazy DFA. |
1876 | | /// |
1877 | | /// While the lazy DFA isn't always used, it tends to be the most |
1878 | | /// commonly used regex engine in default configurations. It tends to |
1879 | | /// adopt the performance profile of a fully built DFA, but without the |
1880 | | /// downside of taking worst case exponential time to build. |
1881 | | /// |
1882 | | /// The downside is that it needs to keep a cache of transitions and |
1883 | | /// states that are built while running a search, and this cache |
1884 | | /// can fill up. When it fills up, the cache will reset itself. Any |
1885 | | /// previously generated states and transitions will then need to be |
1886 | | /// re-generated. If this happens too many times, then this library |
1887 | | /// will bail out of using the lazy DFA and switch to a different regex |
1888 | | /// engine. |
1889 | | /// |
1890 | | /// If your regex provokes this particular downside of the lazy DFA, |
1891 | | /// then it may be beneficial to increase its cache capacity. This will |
1892 | | /// potentially reduce the frequency of cache resetting (ideally to |
1893 | | /// `0`). While it won't fix all potential performance problems with |
1894 | | /// the lazy DFA, increasing the cache capacity does fix some. |
1895 | | /// |
1896 | | /// There is no easy way to determine, a priori, whether increasing |
1897 | | /// this cache capacity will help. In general, the larger your regex, |
1898 | | /// the more cache it's likely to use. But that isn't an ironclad rule. |
1899 | | /// For example, a regex like `[01]*1[01]{N}` would normally produce a |
1900 | | /// fully built DFA that is exponential in size with respect to `N`. |
1901 | | /// The lazy DFA will prevent exponential space blow-up, but its cache |
1902 | | /// is likely to fill up, even when it's large and even for smallish |
1903 | | /// values of `N`. |
1904 | | /// |
1905 | | /// If you aren't sure whether this helps or not, it is sensible to |
1906 | | /// set this to some arbitrarily large number in testing, such as |
1907 | | /// `usize::MAX`. Namely, this represents the amount of capacity that |
1908 | | /// *may* be used. It's probably not a good idea to use `usize::MAX` in |
1909 | | /// production though, since it implies there are no controls on heap |
1910 | | /// memory used by this library during a search. In effect, set it to |
1911 | | /// whatever you're willing to allocate for a single regex search. |
1912 | 0 | pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder { |
1913 | 0 | self.builder.dfa_size_limit(bytes); |
1914 | 0 | self |
1915 | 0 | } |
1916 | | |
1917 | | /// Set the nesting limit for this parser. |
1918 | | /// |
1919 | | /// The nesting limit controls how deep the abstract syntax tree is |
1920 | | /// allowed to be. If the AST exceeds the given limit (e.g., with too |
1921 | | /// many nested groups), then an error is returned by the parser. |
1922 | | /// |
1923 | | /// The purpose of this limit is to act as a heuristic to prevent stack |
1924 | | /// overflow for consumers that do structural induction on an AST using |
1925 | | /// explicit recursion. While this crate never does this (instead using |
1926 | | /// constant stack space and moving the call stack to the heap), other |
1927 | | /// crates may. |
1928 | | /// |
1929 | | /// This limit is not checked until the entire AST is parsed. |
1930 | | /// Therefore, if callers want to put a limit on the amount of heap |
1931 | | /// space used, then they should impose a limit on the length, in |
1932 | | /// bytes, of the concrete pattern string. In particular, this is |
1933 | | /// viable since this parser implementation will limit itself to heap |
1934 | | /// space proportional to the length of the pattern string. See also |
1935 | | /// the [untrusted inputs](crate#untrusted-input) section in the |
1936 | | /// top-level crate documentation for more information about this. |
1937 | | /// |
1938 | | /// Note that a nest limit of `0` will return a nest limit error for |
1939 | | /// most patterns but not all. For example, a nest limit of `0` permits |
1940 | | /// `a` but not `ab`, since `ab` requires an explicit concatenation, |
1941 | | /// which results in a nest depth of `1`. In general, a nest limit is |
1942 | | /// not something that manifests in an obvious way in the concrete |
1943 | | /// syntax, therefore, it should not be used in a granular way. |
1944 | | /// |
1945 | | /// # Example |
1946 | | /// |
1947 | | /// ``` |
1948 | | /// use regex::bytes::RegexBuilder; |
1949 | | /// |
1950 | | /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok()); |
1951 | | /// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err()); |
1952 | | /// ``` |
1953 | 0 | pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { |
1954 | 0 | self.builder.nest_limit(limit); |
1955 | 0 | self |
1956 | 0 | } |
1957 | | } |
1958 | | |
1959 | | /// A configurable builder for a [`RegexSet`]. |
1960 | | /// |
1961 | | /// This builder can be used to programmatically set flags such as `i` |
1962 | | /// (case insensitive) and `x` (for verbose mode). This builder can also be |
1963 | | /// used to configure things like the line terminator and a size limit on |
1964 | | /// the compiled regular expression. |
1965 | | #[derive(Clone, Debug)] |
1966 | | pub struct RegexSetBuilder { |
1967 | | builder: Builder, |
1968 | | } |
1969 | | |
1970 | | impl RegexSetBuilder { |
1971 | | /// Create a new builder with a default configuration for the given |
1972 | | /// patterns. |
1973 | | /// |
1974 | | /// If the patterns are invalid or exceed the configured size limits, |
1975 | | /// then an error will be returned when [`RegexSetBuilder::build`] is |
1976 | | /// called. |
1977 | 0 | pub fn new<I, S>(patterns: I) -> RegexSetBuilder |
1978 | 0 | where |
1979 | 0 | I: IntoIterator<Item = S>, |
1980 | 0 | S: AsRef<str>, |
1981 | 0 | { |
1982 | 0 | RegexSetBuilder { builder: Builder::new(patterns) } |
1983 | 0 | } |
1984 | | |
1985 | | /// Compiles the patterns given to `RegexSetBuilder::new` with the |
1986 | | /// configuration set on this builder. |
1987 | | /// |
1988 | | /// If the patterns aren't valid regexes or if a configured size limit |
1989 | | /// was exceeded, then an error is returned. |
1990 | 0 | pub fn build(&self) -> Result<RegexSet, Error> { |
1991 | 0 | self.builder.build_many_bytes() |
1992 | 0 | } |
1993 | | |
1994 | | /// This configures Unicode mode for all of the patterns. |
1995 | | /// |
1996 | | /// Enabling Unicode mode does a number of things: |
1997 | | /// |
1998 | | /// * Most fundamentally, it causes the fundamental atom of matching |
1999 | | /// to be a single codepoint. When Unicode mode is disabled, it's a |
2000 | | /// single byte. For example, when Unicode mode is enabled, `.` will |
2001 | | /// match `💩` once, whereas it will match 4 times when Unicode mode |
2002 | | /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.) |
2003 | | /// * Case insensitive matching uses Unicode simple case folding rules. |
2004 | | /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are |
2005 | | /// available. |
2006 | | /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and |
2007 | | /// `\d`. |
2008 | | /// * The word boundary assertions, `\b` and `\B`, use the Unicode |
2009 | | /// definition of a word character. |
2010 | | /// |
2011 | | /// Note that unlike the top-level `RegexSet` for searching `&str`, |
2012 | | /// it is permitted to disable Unicode mode even if the resulting |
2013 | | /// pattern could match invalid UTF-8. For example, `(?-u:.)` is not |
2014 | | /// a valid pattern for a top-level `RegexSet`, but is valid for a |
2015 | | /// `bytes::RegexSet`. |
2016 | | /// |
2017 | | /// For more details on the Unicode support in this crate, see the |
2018 | | /// [Unicode section](crate#unicode) in this crate's top-level |
2019 | | /// documentation. |
2020 | | /// |
2021 | | /// The default for this is `true`. |
2022 | | /// |
2023 | | /// # Example |
2024 | | /// |
2025 | | /// ``` |
2026 | | /// use regex::bytes::RegexSetBuilder; |
2027 | | /// |
2028 | | /// let re = RegexSetBuilder::new([r"\w"]) |
2029 | | /// .unicode(false) |
2030 | | /// .build() |
2031 | | /// .unwrap(); |
2032 | | /// // Normally greek letters would be included in \w, but since |
2033 | | /// // Unicode mode is disabled, it only matches ASCII letters. |
2034 | | /// assert!(!re.is_match("δ".as_bytes())); |
2035 | | /// |
2036 | | /// let re = RegexSetBuilder::new([r"s"]) |
2037 | | /// .case_insensitive(true) |
2038 | | /// .unicode(false) |
2039 | | /// .build() |
2040 | | /// .unwrap(); |
2041 | | /// // Normally 'ſ' is included when searching for 's' case |
2042 | | /// // insensitively due to Unicode's simple case folding rules. But |
2043 | | /// // when Unicode mode is disabled, only ASCII case insensitive rules |
2044 | | /// // are used. |
2045 | | /// assert!(!re.is_match("ſ".as_bytes())); |
2046 | | /// ``` |
2047 | | /// |
2048 | | /// Since this builder is for constructing a |
2049 | | /// [`bytes::RegexSet`](RegexSet), one can disable Unicode mode even if |
2050 | | /// it would match invalid UTF-8: |
2051 | | /// |
2052 | | /// ``` |
2053 | | /// use regex::bytes::RegexSetBuilder; |
2054 | | /// |
2055 | | /// let re = RegexSetBuilder::new([r"."]) |
2056 | | /// .unicode(false) |
2057 | | /// .build() |
2058 | | /// .unwrap(); |
2059 | | /// // Normally `.` matches one whole codepoint, but since Unicode mode |
2060 | | /// // is disabled, it matches any byte except `\n`, even invalid UTF-8. |
2061 | | /// assert!(re.is_match(b"\xFF")); |
2062 | | /// ``` |
2063 | 0 | pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder { |
2064 | 0 | self.builder.unicode(yes); |
2065 | 0 | self |
2066 | 0 | } |
2067 | | |
2068 | | /// This configures whether to enable case insensitive matching for all |
2069 | | /// of the patterns. |
2070 | | /// |
2071 | | /// This setting can also be configured using the inline flag `i` |
2072 | | /// in the pattern. For example, `(?i:foo)` matches `foo` case |
2073 | | /// insensitively while `(?-i:foo)` matches `foo` case sensitively. |
2074 | | /// |
2075 | | /// The default for this is `false`. |
2076 | | /// |
2077 | | /// # Example |
2078 | | /// |
2079 | | /// ``` |
2080 | | /// use regex::bytes::RegexSetBuilder; |
2081 | | /// |
2082 | | /// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"]) |
2083 | | /// .case_insensitive(true) |
2084 | | /// .build() |
2085 | | /// .unwrap(); |
2086 | | /// assert!(re.is_match(b"FoObarQuUx")); |
2087 | | /// // Even though case insensitive matching is enabled in the builder, |
2088 | | /// // it can be locally disabled within the pattern. In this case, |
2089 | | /// // `bar` is matched case sensitively. |
2090 | | /// assert!(!re.is_match(b"fooBARquux")); |
2091 | | /// ``` |
2092 | 0 | pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder { |
2093 | 0 | self.builder.case_insensitive(yes); |
2094 | 0 | self |
2095 | 0 | } |
2096 | | |
2097 | | /// This configures multi-line mode for all of the patterns. |
2098 | | /// |
2099 | | /// Enabling multi-line mode changes the behavior of the `^` and `$` |
2100 | | /// anchor assertions. Instead of only matching at the beginning and |
2101 | | /// end of a haystack, respectively, multi-line mode causes them to |
2102 | | /// match at the beginning and end of a line *in addition* to the |
2103 | | /// beginning and end of a haystack. More precisely, `^` will match at |
2104 | | /// the position immediately following a `\n` and `$` will match at the |
2105 | | /// position immediately preceding a `\n`. |
2106 | | /// |
2107 | | /// The behavior of this option can be impacted by other settings too: |
2108 | | /// |
2109 | | /// * The [`RegexSetBuilder::line_terminator`] option changes `\n` |
2110 | | /// above to any ASCII byte. |
2111 | | /// * The [`RegexSetBuilder::crlf`] option changes the line terminator |
2112 | | /// to be either `\r` or `\n`, but never at the position between a `\r` |
2113 | | /// and `\n`. |
2114 | | /// |
2115 | | /// This setting can also be configured using the inline flag `m` in |
2116 | | /// the pattern. |
2117 | | /// |
2118 | | /// The default for this is `false`. |
2119 | | /// |
2120 | | /// # Example |
2121 | | /// |
2122 | | /// ``` |
2123 | | /// use regex::bytes::RegexSetBuilder; |
2124 | | /// |
2125 | | /// let re = RegexSetBuilder::new([r"^foo$"]) |
2126 | | /// .multi_line(true) |
2127 | | /// .build() |
2128 | | /// .unwrap(); |
2129 | | /// assert!(re.is_match(b"\nfoo\n")); |
2130 | | /// ``` |
2131 | 0 | pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder { |
2132 | 0 | self.builder.multi_line(yes); |
2133 | 0 | self |
2134 | 0 | } |
2135 | | |
2136 | | /// This configures dot-matches-new-line mode for all of the patterns. |
2137 | | /// |
2138 | | /// Perhaps surprisingly, the default behavior for `.` is not to match |
2139 | | /// any character, but rather, to match any character except for the |
2140 | | /// line terminator (which is `\n` by default). When this mode is |
2141 | | /// enabled, the behavior changes such that `.` truly matches any |
2142 | | /// character. |
2143 | | /// |
2144 | | /// This setting can also be configured using the inline flag `s` in |
2145 | | /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent |
2146 | | /// regexes. |
2147 | | /// |
2148 | | /// The default for this is `false`. |
2149 | | /// |
2150 | | /// # Example |
2151 | | /// |
2152 | | /// ``` |
2153 | | /// use regex::bytes::RegexSetBuilder; |
2154 | | /// |
2155 | | /// let re = RegexSetBuilder::new([r"foo.bar"]) |
2156 | | /// .dot_matches_new_line(true) |
2157 | | /// .build() |
2158 | | /// .unwrap(); |
2159 | | /// let hay = b"foo\nbar"; |
2160 | | /// assert!(re.is_match(hay)); |
2161 | | /// ``` |
2162 | 0 | pub fn dot_matches_new_line( |
2163 | 0 | &mut self, |
2164 | 0 | yes: bool, |
2165 | 0 | ) -> &mut RegexSetBuilder { |
2166 | 0 | self.builder.dot_matches_new_line(yes); |
2167 | 0 | self |
2168 | 0 | } |
2169 | | |
2170 | | /// This configures CRLF mode for all of the patterns. |
2171 | | /// |
2172 | | /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for |
2173 | | /// short) and `\n` ("line feed" or LF for short) are treated as line |
2174 | | /// terminators. This results in the following: |
2175 | | /// |
2176 | | /// * Unless dot-matches-new-line mode is enabled, `.` will now match |
2177 | | /// any character except for `\n` and `\r`. |
2178 | | /// * When multi-line mode is enabled, `^` will match immediately |
2179 | | /// following a `\n` or a `\r`. Similarly, `$` will match immediately |
2180 | | /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match |
2181 | | /// between `\r` and `\n`. |
2182 | | /// |
2183 | | /// This setting can also be configured using the inline flag `R` in |
2184 | | /// the pattern. |
2185 | | /// |
2186 | | /// The default for this is `false`. The builder itself is returned so that calls can be chained. |
2187 | | /// |
2188 | | /// # Example |
2189 | | /// |
2190 | | /// ``` |
2191 | | /// use regex::bytes::RegexSetBuilder; |
2192 | | /// |
2193 | | /// let re = RegexSetBuilder::new([r"^foo$"]) |
2194 | | /// .multi_line(true) |
2195 | | /// .crlf(true) |
2196 | | /// .build() |
2197 | | /// .unwrap(); |
2198 | | /// let hay = b"\r\nfoo\r\n"; |
2199 | | /// // If CRLF mode weren't enabled here, then '$' wouldn't match |
2200 | | /// // immediately after 'foo', and thus no match would be found. |
2201 | | /// assert!(re.is_match(hay)); |
2202 | | /// ``` |
2203 | | /// |
2204 | | /// This example demonstrates that `^` will never match at a position |
2205 | | /// between `\r` and `\n`. (`$` will similarly not match between a `\r` |
2206 | | /// and a `\n`.) |
2207 | | /// |
2208 | | /// ``` |
2209 | | /// use regex::bytes::RegexSetBuilder; |
2210 | | /// |
2211 | | /// let re = RegexSetBuilder::new([r"^\n"]) |
2212 | | /// .multi_line(true) |
2213 | | /// .crlf(true) |
2214 | | /// .build() |
2215 | | /// .unwrap(); |
2216 | | /// assert!(!re.is_match(b"\r\n")); |
2217 | | /// ``` |
2218 | 0 | pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder { |
2219 | 0 | self.builder.crlf(yes); |
2220 | 0 | self |
2221 | 0 | } |
2222 | | |
2223 | | /// Configures the line terminator to be used by all of the patterns. |
2224 | | /// |
2225 | | /// The line terminator is relevant in two ways for a particular regex: |
2226 | | /// |
2227 | | /// * When dot-matches-new-line mode is *not* enabled (the default), |
2228 | | /// then `.` will match any character except for the configured line |
2229 | | /// terminator. |
2230 | | /// * When multi-line mode is enabled (not the default), then `^` and |
2231 | | /// `$` will match immediately after and before, respectively, a line |
2232 | | /// terminator. |
2233 | | /// |
2234 | | /// In both cases, if CRLF mode is enabled in a particular context, |
2235 | | /// then it takes precedence over any configured line terminator. |
2236 | | /// |
2237 | | /// This option cannot be configured from within the pattern. |
2238 | | /// |
2239 | | /// The default line terminator is `\n`. |
2240 | | /// |
2241 | | /// # Example |
2242 | | /// |
2243 | | /// This shows how to treat the NUL byte as a line terminator. This can |
2244 | | /// be a useful heuristic when searching binary data. |
2245 | | /// |
2246 | | /// ``` |
2247 | | /// use regex::bytes::RegexSetBuilder; |
2248 | | /// |
2249 | | /// let re = RegexSetBuilder::new([r"^foo$"]) |
2250 | | /// .multi_line(true) |
2251 | | /// .line_terminator(b'\x00') |
2252 | | /// .build() |
2253 | | /// .unwrap(); |
2254 | | /// let hay = b"\x00foo\x00"; |
2255 | | /// assert!(re.is_match(hay)); |
2256 | | /// ``` |
2257 | | /// |
2258 | | /// This example shows that the behavior of `.` is impacted by this |
2259 | | /// setting as well: |
2260 | | /// |
2261 | | /// ``` |
2262 | | /// use regex::bytes::RegexSetBuilder; |
2263 | | /// |
2264 | | /// let re = RegexSetBuilder::new([r"."]) |
2265 | | /// .line_terminator(b'\x00') |
2266 | | /// .build() |
2267 | | /// .unwrap(); |
2268 | | /// assert!(re.is_match(b"\n")); |
2269 | | /// assert!(!re.is_match(b"\x00")); |
2270 | | /// ``` |
2271 | | /// |
2272 | | /// This shows that building a regex will work even when the byte given |
2273 | | /// is not ASCII. This is unlike the top-level `RegexSet` API where |
2274 | | /// matching invalid UTF-8 is not allowed. |
2275 | | /// |
2276 | | /// Note though that you must disable Unicode mode. This is required |
2277 | | /// because Unicode mode requires matching one codepoint at a time, |
2278 | | /// and there is no way to match a non-ASCII byte as if it were a |
2279 | | /// codepoint. |
2280 | | /// |
2281 | | /// ``` |
2282 | | /// use regex::bytes::RegexSetBuilder; |
2283 | | /// |
2284 | | /// assert!( |
2285 | | /// RegexSetBuilder::new([r"."]) |
2286 | | /// .unicode(false) |
2287 | | /// .line_terminator(0x80) |
2288 | | /// .build() |
2289 | | /// .is_ok(), |
2290 | | /// ); |
2291 | | /// ``` |
2292 | 0 | pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder { |
2293 | 0 | self.builder.line_terminator(byte); |
2294 | 0 | self |
2295 | 0 | } |
2296 | | |
2297 | | /// This configures swap-greed mode for all of the patterns. |
2298 | | /// |
2299 | | /// When swap-greed mode is enabled, patterns like `a+` will become |
2300 | | /// non-greedy and patterns like `a+?` will become greedy. In other |
2301 | | /// words, the meanings of `a+` and `a+?` are switched. |
2302 | | /// |
2303 | | /// This setting can also be configured using the inline flag `U` in |
2304 | | /// the pattern. |
2305 | | /// |
2306 | | /// Note that this is generally not useful for a `RegexSet` since a |
2307 | | /// `RegexSet` can only report whether a pattern matches or not. Since |
2308 | | /// greediness never impacts whether a match is found or not (only the |
2309 | | /// offsets of the match), it follows that whether parts of a pattern |
2310 | | /// are greedy or not doesn't matter for a `RegexSet`. |
2311 | | /// |
2312 | | /// The default for this is `false`. The builder itself is returned so that calls can be chained. |
2313 | 0 | pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder { |
2314 | 0 | self.builder.swap_greed(yes); |
2315 | 0 | self |
2316 | 0 | } |
2317 | | |
2318 | | /// This configures verbose mode for all of the patterns. |
2319 | | /// |
2320 | | /// When enabled, whitespace will be treated as insignificant in the |
2321 | | /// pattern and `#` can be used to start a comment until the next new |
2322 | | /// line. |
2323 | | /// |
2324 | | /// Normally, in most places in a pattern, whitespace is treated |
2325 | | /// literally. For example ` +` will match one or more ASCII whitespace |
2326 | | /// characters. |
2327 | | /// |
2328 | | /// When verbose mode is enabled, `\#` can be used to match a literal |
2329 | | /// `#` and `\ ` can be used to match a literal ASCII whitespace |
2330 | | /// character. |
2331 | | /// |
2332 | | /// Verbose mode is useful for permitting regexes to be formatted and |
2333 | | /// broken up more nicely. This may make them more easily readable. |
2334 | | /// |
2335 | | /// This setting can also be configured using the inline flag `x` in |
2336 | | /// the pattern. |
2337 | | /// |
2338 | | /// The default for this is `false`. |
2339 | | /// |
2340 | | /// # Example |
2341 | | /// |
2342 | | /// ``` |
2343 | | /// use regex::bytes::RegexSetBuilder; |
2344 | | /// |
2345 | | /// let pat = r" |
2346 | | /// \b |
2347 | | /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter |
2348 | | /// [\s--\n]+ # whitespace should separate names |
2349 | | /// (?: # middle name can be an initial! |
2350 | | /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*)) |
2351 | | /// [\s--\n]+ |
2352 | | /// )? |
2353 | | /// (?<last>\p{Uppercase}\w*) |
2354 | | /// \b |
2355 | | /// "; |
2356 | | /// let re = RegexSetBuilder::new([pat]) |
2357 | | /// .ignore_whitespace(true) |
2358 | | /// .build() |
2359 | | /// .unwrap(); |
2360 | | /// assert!(re.is_match(b"Harry Potter")); |
2361 | | /// assert!(re.is_match(b"Harry J. Potter")); |
2362 | | /// assert!(re.is_match(b"Harry James Potter")); |
2363 | | /// assert!(!re.is_match(b"harry J. Potter")); |
2364 | | /// ``` |
2365 | 0 | pub fn ignore_whitespace( |
2366 | 0 | &mut self, |
2367 | 0 | yes: bool, |
2368 | 0 | ) -> &mut RegexSetBuilder { |
2369 | 0 | self.builder.ignore_whitespace(yes); |
2370 | 0 | self |
2371 | 0 | } |
2372 | | |
2373 | | /// This configures octal mode for all of the patterns. |
2374 | | /// |
2375 | | /// Octal syntax is a little-known way of uttering Unicode codepoints |
2376 | | /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all |
2377 | | /// equivalent patterns, where the last example shows octal syntax. |
2378 | | /// |
2379 | | /// While supporting octal syntax isn't in and of itself a problem, |
2380 | | /// it does make good error messages harder. That is, in PCRE based |
2381 | | /// regex engines, syntax like `\1` invokes a backreference, which is |
2382 | | /// explicitly unsupported by this library. However, many users expect |
2383 | | /// backreferences to be supported. Therefore, when octal support |
2384 | | /// is disabled, the error message will explicitly mention that |
2385 | | /// backreferences aren't supported. |
2386 | | /// |
2387 | | /// The default for this is `false`. |
2388 | | /// |
2389 | | /// # Example |
2390 | | /// |
2391 | | /// ``` |
2392 | | /// use regex::bytes::RegexSetBuilder; |
2393 | | /// |
2394 | | /// // Normally this pattern would not compile, with an error message |
2395 | | /// // about backreferences not being supported. But with octal mode |
2396 | | /// // enabled, octal escape sequences work. |
2397 | | /// let re = RegexSetBuilder::new([r"\141"]) |
2398 | | /// .octal(true) |
2399 | | /// .build() |
2400 | | /// .unwrap(); |
2401 | | /// assert!(re.is_match(b"a")); |
2402 | | /// ``` |
2403 | 0 | pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder { |
2404 | 0 | self.builder.octal(yes); |
2405 | 0 | self |
2406 | 0 | } |
2407 | | |
2408 | | /// Sets the approximate size limit, in bytes, of the compiled regex. |
2409 | | /// |
2410 | | /// This roughly corresponds to the amount of heap memory, in |
2411 | | /// bytes, occupied by a single regex. If the regex would otherwise |
2412 | | /// approximately exceed this limit, then compiling that regex will |
2413 | | /// fail. |
2414 | | /// |
2415 | | /// The main utility of a method like this is to avoid compiling |
2416 | | /// regexes that use an unexpected amount of resources, such as |
2417 | | /// time and memory. Even if the memory usage of a large regex is |
2418 | | /// acceptable, its search time may not be. Namely, worst case time |
2419 | | /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and |
2420 | | /// `n ~ len(haystack)`. That is, search time depends, in part, on the |
2421 | | /// size of the compiled regex. This means that putting a limit on the |
2422 | | /// size of the regex limits how much a regex can impact search time. |
2423 | | /// |
2424 | | /// For more information about regex size limits, see the section on |
2425 | | /// [untrusted inputs](crate#untrusted-input) in the top-level crate |
2426 | | /// documentation. |
2427 | | /// |
2428 | | /// The default for this is some reasonable number that permits most |
2429 | | /// patterns to compile successfully. |
2430 | | /// |
2431 | | /// # Example |
2432 | | /// |
2433 | | /// ``` |
2434 | | /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 |
2435 | | /// use regex::bytes::RegexSetBuilder; |
2436 | | /// |
2437 | | /// // It may surprise you how big some seemingly small patterns can |
2438 | | /// // be! Since \w is Unicode aware, this generates a regex that can |
2439 | | /// // match approximately 140,000 distinct codepoints. |
2440 | | /// assert!( |
2441 | | /// RegexSetBuilder::new([r"\w"]) |
2442 | | /// .size_limit(45_000) |
2443 | | /// .build() |
2444 | | /// .is_err() |
2445 | | /// ); |
2446 | | /// ``` |
2447 | 0 | pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder { |
2448 | 0 | self.builder.size_limit(bytes); |
2449 | 0 | self |
2450 | 0 | } |
2451 | | |
2452 | | /// Set the approximate capacity, in bytes, of the cache of transitions |
2453 | | /// used by the lazy DFA. |
2454 | | /// |
2455 | | /// While the lazy DFA isn't always used, it tends to be the most |
2456 | | /// commonly used regex engine in default configurations. It tends to |
2457 | | /// adopt the performance profile of a fully built DFA, but without the |
2458 | | /// downside of taking worst case exponential time to build. |
2459 | | /// |
2460 | | /// The downside is that it needs to keep a cache of transitions and |
2461 | | /// states that are built while running a search, and this cache |
2462 | | /// can fill up. When it fills up, the cache will reset itself. Any |
2463 | | /// previously generated states and transitions will then need to be |
2464 | | /// re-generated. If this happens too many times, then this library |
2465 | | /// will bail out of using the lazy DFA and switch to a different regex |
2466 | | /// engine. |
2467 | | /// |
2468 | | /// If your regex provokes this particular downside of the lazy DFA, |
2469 | | /// then it may be beneficial to increase its cache capacity. This will |
2470 | | /// potentially reduce the frequency of cache resetting (ideally to |
2471 | | /// `0`). While it won't fix all potential performance problems with |
2472 | | /// the lazy DFA, increasing the cache capacity does fix some. |
2473 | | /// |
2474 | | /// There is no easy way to determine, a priori, whether increasing |
2475 | | /// this cache capacity will help. In general, the larger your regex, |
2476 | | /// the more cache it's likely to use. But that isn't an ironclad rule. |
2477 | | /// For example, a regex like `[01]*1[01]{N}` would normally produce a |
2478 | | /// fully built DFA that is exponential in size with respect to `N`. |
2479 | | /// The lazy DFA will prevent exponential space blow-up, but its cache |
2480 | | /// is likely to fill up, even when it's large and even for smallish |
2481 | | /// values of `N`. |
2482 | | /// |
2483 | | /// If you aren't sure whether this helps or not, it is sensible to |
2484 | | /// set this to some arbitrarily large number in testing, such as |
2485 | | /// `usize::MAX`. Namely, this represents the amount of capacity that |
2486 | | /// *may* be used. It's probably not a good idea to use `usize::MAX` in |
2487 | | /// production though, since it implies there are no controls on heap |
2488 | | /// memory used by this library during a search. In effect, set it to |
2489 | | /// whatever you're willing to allocate for a single regex search. |
2490 | 0 | pub fn dfa_size_limit( |
2491 | 0 | &mut self, |
2492 | 0 | bytes: usize, |
2493 | 0 | ) -> &mut RegexSetBuilder { |
2494 | 0 | self.builder.dfa_size_limit(bytes); |
2495 | 0 | self |
2496 | 0 | } |
2497 | | |
2498 | | /// Set the nesting limit for the pattern parser. |
2499 | | /// |
2500 | | /// The nesting limit controls how deep the abstract syntax tree is |
2501 | | /// allowed to be. If the AST exceeds the given limit (e.g., with too |
2502 | | /// many nested groups), then an error is returned by the parser. |
2503 | | /// |
2504 | | /// The purpose of this limit is to act as a heuristic to prevent stack |
2505 | | /// overflow for consumers that do structural induction on an AST using |
2506 | | /// explicit recursion. While this crate never does this (instead using |
2507 | | /// constant stack space and moving the call stack to the heap), other |
2508 | | /// crates may. |
2509 | | /// |
2510 | | /// This limit is not checked until the entire AST is parsed. |
2511 | | /// Therefore, if callers want to put a limit on the amount of heap |
2512 | | /// space used, then they should impose a limit on the length, in |
2513 | | /// bytes, of the concrete pattern string. In particular, this is |
2514 | | /// viable since this parser implementation will limit itself to heap |
2515 | | /// space proportional to the length of the pattern string. See also |
2516 | | /// the [untrusted inputs](crate#untrusted-input) section in the |
2517 | | /// top-level crate documentation for more information about this. |
2518 | | /// |
2519 | | /// Note that a nest limit of `0` will return a nest limit error for |
2520 | | /// most patterns but not all. For example, a nest limit of `0` permits |
2521 | | /// `a` but not `ab`, since `ab` requires an explicit concatenation, |
2522 | | /// which results in a nest depth of `1`. In general, a nest limit is |
2523 | | /// not something that manifests in an obvious way in the concrete |
2524 | | /// syntax, therefore, it should not be used in a granular way. |
2525 | | /// |
2526 | | /// # Example |
2527 | | /// |
2528 | | /// ``` |
2529 | | /// use regex::bytes::RegexSetBuilder; |
2530 | | /// |
2531 | | /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok()); |
2532 | | /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err()); |
2533 | | /// ``` |
2534 | 0 | pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder { |
2535 | 0 | self.builder.nest_limit(limit); |
2536 | 0 | self |
2537 | 0 | } |
2538 | | } |
2539 | | } |