/build/cargo-vendor-dir/regex-automata-0.4.9/src/meta/strategy.rs
Line | Count | Source |
1 | | use core::{ |
2 | | fmt::Debug, |
3 | | panic::{RefUnwindSafe, UnwindSafe}, |
4 | | }; |
5 | | |
6 | | use alloc::sync::Arc; |
7 | | |
8 | | use regex_syntax::hir::{literal, Hir}; |
9 | | |
10 | | use crate::{ |
11 | | meta::{ |
12 | | error::{BuildError, RetryError, RetryFailError, RetryQuadraticError}, |
13 | | regex::{Cache, RegexInfo}, |
14 | | reverse_inner, wrappers, |
15 | | }, |
16 | | nfa::thompson::{self, WhichCaptures, NFA}, |
17 | | util::{ |
18 | | captures::{Captures, GroupInfo}, |
19 | | look::LookMatcher, |
20 | | prefilter::{self, Prefilter, PrefilterI}, |
21 | | primitives::{NonMaxUsize, PatternID}, |
22 | | search::{Anchored, HalfMatch, Input, Match, MatchKind, PatternSet}, |
23 | | }, |
24 | | }; |
25 | | |
26 | | /// A trait that represents a single meta strategy. Its main utility is in |
27 | | /// providing a way to do dynamic dispatch over a few choices. |
28 | | /// |
29 | | /// Why dynamic dispatch? I actually don't have a super compelling reason, and |
30 | | /// importantly, I have not benchmarked it with the main alternative: an enum. |
31 | | /// I went with dynamic dispatch initially because the regex engine search code |
32 | | /// really can't be inlined into caller code in most cases because it's just |
33 | | /// too big. In other words, it is already expected that every regex search |
34 | | /// will entail at least the cost of a function call. |
35 | | /// |
36 | | /// I do wonder whether using enums would result in better codegen overall |
37 | | /// though. It's a worthwhile experiment to try. Probably the most interesting |
38 | | /// benchmark to run in such a case would be one with a high match count. That |
39 | | /// is, a benchmark to test the overall latency of a search call. |
40 | | pub(super) trait Strategy: |
41 | | Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static |
42 | | { |
43 | | fn group_info(&self) -> &GroupInfo; |
44 | | |
45 | | fn create_cache(&self) -> Cache; |
46 | | |
47 | | fn reset_cache(&self, cache: &mut Cache); |
48 | | |
49 | | fn is_accelerated(&self) -> bool; |
50 | | |
51 | | fn memory_usage(&self) -> usize; |
52 | | |
53 | | fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match>; |
54 | | |
55 | | fn search_half( |
56 | | &self, |
57 | | cache: &mut Cache, |
58 | | input: &Input<'_>, |
59 | | ) -> Option<HalfMatch>; |
60 | | |
61 | | fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool; |
62 | | |
63 | | fn search_slots( |
64 | | &self, |
65 | | cache: &mut Cache, |
66 | | input: &Input<'_>, |
67 | | slots: &mut [Option<NonMaxUsize>], |
68 | | ) -> Option<PatternID>; |
69 | | |
70 | | fn which_overlapping_matches( |
71 | | &self, |
72 | | cache: &mut Cache, |
73 | | input: &Input<'_>, |
74 | | patset: &mut PatternSet, |
75 | | ); |
76 | | } |
77 | | |
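An illustrative sketch (not part of strategy.rs) of how a caller drives a Strategy through dynamic dispatch; the run_search helper is hypothetical and only shows that every search already pays for one non-inlined call into whichever engine was selected:

    fn run_search(
        strategy: &Arc<dyn Strategy>,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<Match> {
        // One virtual call per search; the engine bodies are too large to
        // inline anyway, so dynamic dispatch adds little beyond the call.
        strategy.search(cache, input)
    }
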
78 | 2 | pub(super) fn new( |
79 | 2 | info: &RegexInfo, |
80 | 2 | hirs: &[&Hir], |
81 | 2 | ) -> Result<Arc<dyn Strategy>, BuildError> { |
82 | | // At this point, we're committed to a regex engine of some kind. So pull |
83 | | // out a prefilter if we can, which will feed to each of the constituent |
84 | | // regex engines. |
85 | 2 | let pre = if info.is_always_anchored_start() { |
86 | | // PERF: I'm not sure we necessarily want to do this... We may want to |
87 | | // run a prefilter for quickly rejecting a haystack in some cases. The problem |
88 | | // is that anchored searches overlap quite a bit with the use case |
89 | | // of "run a regex on every line to extract data." In that case, the |
90 | | // regex always matches, so running a prefilter doesn't really help us |
91 | | // there. The main place where a prefilter helps in an anchored search |
92 | | // is if the anchored search is not expected to match frequently. That |
93 | | // is, the prefilter gives us a way to possibly reject a haystack very |
94 | | // quickly. |
95 | | // |
96 | | // Maybe we should use a prefilter, but only for longer haystacks? |
97 | | // Or maybe we should only use a prefilter when we think it's "fast"? |
98 | | // |
99 | | // Interestingly, I think we currently lack the infrastructure for |
100 | | // disabling a prefilter based on haystack length. That would probably |
101 | | // need to be a new 'Input' option. (Interestingly, an 'Input' used to |
102 | | // carry a 'Prefilter' with it, but I moved away from that.) |
103 | | debug!("skipping literal extraction since regex is anchored"); |
104 | 1 | None |
105 | 1 | } else if let Some(pre) = info.config().get_prefilter() {
106 | | debug!( |
107 | | "skipping literal extraction since the caller provided a prefilter" |
108 | | ); |
109 | 0 | Some(pre.clone()) |
110 | 1 | } else if info.config().get_auto_prefilter() { |
111 | 1 | let kind = info.config().get_match_kind(); |
112 | 1 | let prefixes = crate::util::prefilter::prefixes(kind, hirs); |
113 | | // If we can build a full `Strategy` from just the extracted prefixes, |
114 | | // then we can short-circuit and avoid building a regex engine at all. |
115 | 1 | if let Some(pre) = Pre::from_prefixes(info, &prefixes) { |
116 | | debug!( |
117 | | "found that the regex can be broken down to a literal \ |
118 | | search, avoiding the regex engine entirely", |
119 | | ); |
120 | 0 | return Ok(pre); |
121 | 1 | } |
122 | | // This now attempts another short-circuit of the regex engine: if we |
123 | | // have a huge alternation of just plain literals, then we can just use |
124 | | // Aho-Corasick for that and avoid the regex engine entirely. |
125 | | // |
126 | | // You might think this case would just be handled by |
127 | | // `Pre::from_prefixes`, but that technique relies on heuristic literal |
128 | | // extraction from the corresponding `Hir`. That works, but part of |
129 | | // the heuristics limit the size and number of literals returned. This case |
130 | | // will specifically handle patterns with very large alternations. |
131 | | // |
132 | | // One wonders if we should just roll this into our heuristic literal |
133 | | // extraction, and then I think this case could disappear entirely. |
134 | 1 | if let Some(pre) = Pre::from_alternation_literals(info, hirs) { |
135 | | debug!( |
136 | | "found plain alternation of literals, \ |
137 | | avoiding regex engine entirely and using Aho-Corasick" |
138 | | ); |
139 | 0 | return Ok(pre); |
140 | 1 | } |
141 | 1 | prefixes.literals().and_then(|strings| { |
142 | 0 | debug!( |
143 | 0 | "creating prefilter from {} literals: {:?}", |
144 | 0 | strings.len(), |
145 | 0 | strings, |
146 | 0 | ); |
147 | 0 | Prefilter::new(kind, strings) |
148 | 1 | }) |
149 | | } else { |
150 | | debug!("skipping literal extraction since prefilters were disabled"); |
151 | 0 | None |
152 | | }; |
153 | 2 | let mut core = Core::new(info.clone(), pre.clone(), hirs)?;
154 | | // Now that we have our core regex engines built, there are a few cases |
155 | | // where we can do a little bit better than just a normal "search forward |
156 | | // and maybe use a prefilter when in a start state." However, these cases |
157 | | // may not always work or otherwise build on top of the Core searcher. |
158 | | // For example, the reverse anchored optimization seems like it might |
159 | | // always work, but only the DFAs support reverse searching and the DFAs |
160 | | // might give up or quit for reasons. If we had, e.g., a PikeVM that |
161 | | // supported reverse searching, then we could avoid building a full Core |
162 | | // engine for this case. |
163 | 2 | core = match ReverseAnchored::new(core) { |
164 | 2 | Err(core) => core, |
165 | 0 | Ok(ra) => { |
166 | 0 | debug!("using reverse anchored strategy"); |
167 | 0 | return Ok(Arc::new(ra)); |
168 | | } |
169 | | }; |
170 | 2 | core = match ReverseSuffix::new(core, hirs) { |
171 | 2 | Err(core) => core, |
172 | 0 | Ok(rs) => { |
173 | 0 | debug!("using reverse suffix strategy"); |
174 | 0 | return Ok(Arc::new(rs)); |
175 | | } |
176 | | }; |
177 | 2 | core = match ReverseInner::new(core, hirs) { |
178 | 2 | Err(core) => core, |
179 | 0 | Ok(ri) => { |
180 | 0 | debug!("using reverse inner strategy"); |
181 | 0 | return Ok(Arc::new(ri)); |
182 | | } |
183 | | }; |
184 | | debug!("using core strategy"); |
185 | 2 | Ok(Arc::new(core)) |
186 | 2 | } |
187 | | |
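For orientation, a hedged usage sketch (public API, separate from this file) of the entry point that ultimately calls `new` above; a plain alternation of literals like this is expected to take one of the literal short-circuits rather than building the full core engines:

    use regex_automata::meta::Regex;

    fn main() {
        // Strategy selection runs inside Regex::new.
        let re = Regex::new("foo|bar|quux").unwrap();
        assert!(re.is_match("xxx quux yyy"));
    }
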
188 | | #[derive(Clone, Debug)] |
189 | | struct Pre<P> { |
190 | | pre: P, |
191 | | group_info: GroupInfo, |
192 | | } |
193 | | |
194 | | impl<P: PrefilterI> Pre<P> { |
195 | 0 | fn new(pre: P) -> Arc<dyn Strategy> { |
196 | 0 | // The only thing we support when we use prefilters directly as a |
197 | 0 | // strategy is the start and end of the overall match for a single |
198 | 0 | // pattern. In other words, exactly one implicit capturing group. Which |
199 | 0 | // is exactly what we use here for a GroupInfo. |
200 | 0 | let group_info = GroupInfo::new([[None::<&str>]]).unwrap(); |
201 | 0 | Arc::new(Pre { pre, group_info }) |
202 | 0 | } |
203 | | } |
204 | | |
205 | | // This is a little weird, but we don't actually care about the type parameter |
206 | | // here because we're selecting which underlying prefilter to use. So we just |
207 | | // define it on an arbitrary type. |
208 | | impl Pre<()> { |
209 | | /// Given a sequence of prefixes, attempt to return a full `Strategy` using |
210 | | /// just the prefixes. |
211 | | /// |
212 | | /// Basically, this occurs when the prefixes given are not just prefixes, |
213 | | /// but an enumeration of the entire language matched by the regular |
214 | | /// expression. |
215 | | /// |
216 | | /// A number of other conditions need to be true too. For example, there |
217 | | /// can be only one pattern, the number of explicit capture groups is 0, no |
218 | | /// look-around assertions and so on. |
219 | | /// |
220 | | /// Note that this ignores `Config::get_auto_prefilter` because if this |
221 | | /// returns something, then it isn't a prefilter but a matcher itself. |
222 | | /// Therefore, it shouldn't suffer from the problems typical to prefilters |
223 | | /// (such as a high false positive rate). |
224 | 1 | fn from_prefixes( |
225 | 1 | info: &RegexInfo, |
226 | 1 | prefixes: &literal::Seq, |
227 | 1 | ) -> Option<Arc<dyn Strategy>> { |
228 | 1 | let kind = info.config().get_match_kind(); |
229 | 1 | // Check to see if our prefixes are exact, which means we might be |
230 | 1 | // able to bypass the regex engine entirely and just rely on literal |
231 | 1 | // searches. |
232 | 1 | if !prefixes.is_exact() { |
233 | 1 | return None; |
234 | 0 | } |
235 | 0 | // We also require that we have a single regex pattern. Namely, |
236 | 0 | // we reuse the prefilter infrastructure to implement search and |
237 | 0 | // prefilters only report spans. Prefilters don't know about pattern |
238 | 0 | // IDs. The multi-regex case isn't a lost cause; we might still use |
239 | 0 | // Aho-Corasick and we might still just use a regular prefilter, but |
240 | 0 | // that's done below. |
241 | 0 | if info.pattern_len() != 1 { |
242 | 0 | return None; |
243 | 0 | } |
244 | 0 | // We can't have any capture groups either. The literal engines don't |
245 | 0 | // know how to deal with things like '(foo)(bar)'. In that case, a |
246 | 0 | // prefilter will just be used and then the regex engine will resolve |
247 | 0 | // the capture groups. |
248 | 0 | if info.props()[0].explicit_captures_len() != 0 { |
249 | 0 | return None; |
250 | 0 | } |
251 | 0 | // We also require that it has zero look-around assertions. Namely, |
252 | 0 | // literal extraction treats look-around assertions as if they match |
253 | 0 | // *every* empty string. But of course, that isn't true. So for |
254 | 0 | // example, 'foo\bquux' never matches anything, but 'fooquux' is |
255 | 0 | // extracted from that as an exact literal. Such cases should just run |
256 | 0 | // the regex engine. 'fooquux' will be used as a normal prefilter, and |
257 | 0 | // then the regex engine will try to look for an actual match. |
258 | 0 | if !info.props()[0].look_set().is_empty() { |
259 | 0 | return None; |
260 | 0 | } |
261 | 0 | // Finally, currently, our prefilters are all oriented around |
262 | 0 | // leftmost-first match semantics, so don't try to use them if the |
263 | 0 | // caller asked for anything else. |
264 | 0 | if kind != MatchKind::LeftmostFirst { |
265 | 0 | return None; |
266 | 0 | } |
267 | 0 | // The above seems like a lot of requirements to meet, but it applies |
268 | 0 | // to a lot of cases. 'foo', '[abc][123]' and 'foo|bar|quux' all meet |
269 | 0 | // the above criteria, for example. |
270 | 0 | // |
271 | 0 | // Note that this is effectively a latency optimization. If we didn't |
272 | 0 | // do this, then the extracted literals would still get bundled into |
273 | 0 | // a prefilter, and every regex engine capable of running unanchored |
274 | 0 | // searches supports prefilters. So this optimization merely sidesteps |
275 | 0 | // having to run the regex engine at all to confirm the match. Thus, it |
276 | 0 | // decreases the latency of a match. |
277 | 0 |
278 | 0 | // OK because we know the set is exact and thus finite. |
279 | 0 | let prefixes = prefixes.literals().unwrap(); |
280 | | debug!( |
281 | | "trying to bypass regex engine by creating \ |
282 | | prefilter from {} literals: {:?}", |
283 | | prefixes.len(), |
284 | | prefixes, |
285 | | ); |
286 | 0 | let choice = match prefilter::Choice::new(kind, prefixes) { |
287 | 0 | Some(choice) => choice, |
288 | | None => { |
289 | | debug!( |
290 | | "regex bypass failed because no prefilter could be built" |
291 | | ); |
292 | 0 | return None; |
293 | | } |
294 | | }; |
295 | 0 | let strat: Arc<dyn Strategy> = match choice { |
296 | 0 | prefilter::Choice::Memchr(pre) => Pre::new(pre), |
297 | 0 | prefilter::Choice::Memchr2(pre) => Pre::new(pre), |
298 | 0 | prefilter::Choice::Memchr3(pre) => Pre::new(pre), |
299 | 0 | prefilter::Choice::Memmem(pre) => Pre::new(pre), |
300 | 0 | prefilter::Choice::Teddy(pre) => Pre::new(pre), |
301 | 0 | prefilter::Choice::ByteSet(pre) => Pre::new(pre), |
302 | 0 | prefilter::Choice::AhoCorasick(pre) => Pre::new(pre), |
303 | | }; |
304 | 0 | Some(strat) |
305 | 1 | } |
306 | | |
307 | | /// Attempts to extract an alternation of literals, and if it's deemed |
308 | | /// worth doing, returns an Aho-Corasick prefilter as a strategy. |
309 | | /// |
310 | | /// And currently, this only returns something when 'hirs.len() == 1'. This |
311 | | /// could in theory do something if there are multiple HIRs where all of |
312 | | /// them are alternation of literals, but I haven't had the time to go down |
313 | | /// that path yet. |
314 | 1 | fn from_alternation_literals( |
315 | 1 | info: &RegexInfo, |
316 | 1 | hirs: &[&Hir], |
317 | 1 | ) -> Option<Arc<dyn Strategy>> { |
318 | | use crate::util::prefilter::AhoCorasick; |
319 | | |
320 | 1 | let lits = crate::meta::literal::alternation_literals(info, hirs)?;
321 | 0 | let ac = AhoCorasick::new(MatchKind::LeftmostFirst, &lits)?; |
322 | 0 | Some(Pre::new(ac)) |
323 | 1 | } |
324 | | } |
325 | | |
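A hedged illustration (separate from this file) of the exactness condition that `from_prefixes` checks first, using regex-syntax's literal extractor directly:

    use regex_syntax::{hir::literal::Extractor, parse};

    fn main() {
        // 'foo|bar|quux' is fully described by its literals, so the extracted
        // sequence is exact and the regex engine can be bypassed.
        let seq = Extractor::new().extract(&parse("foo|bar|quux").unwrap());
        assert!(seq.is_exact());

        // 'foo.*' is not: the extracted prefix 'foo' is inexact, so a literal
        // hit is only a candidate that still needs the regex engine.
        let seq = Extractor::new().extract(&parse("foo.*").unwrap());
        assert!(!seq.is_exact());
    }
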
326 | | // This implements Strategy for anything that implements PrefilterI. |
327 | | // |
328 | | // Note that this must only be used for regexes of length 1. Multi-regexes |
329 | | // don't work here. The prefilter interface only provides the span of a match |
330 | | // and not the pattern ID. (I did consider making it more expressive, but I |
331 | | // couldn't figure out how to tie everything together elegantly.) Thus, so long |
332 | | // as the regex only contains one pattern, we can simply assume that a match |
333 | | // corresponds to PatternID::ZERO. And indeed, that's what we do here. |
334 | | // |
335 | | // In practice, since this impl is used to report matches directly and thus |
336 | | // completely bypasses the regex engine, we only wind up using this under the |
337 | | // following restrictions: |
338 | | // |
339 | | // * There must be only one pattern. As explained above. |
340 | | // * The literal sequence must be finite and only contain exact literals. |
341 | | // * There must not be any look-around assertions. If there are, the literals |
342 | | // extracted might be exact, but a match doesn't necessarily imply an overall |
343 | | // match. As a trivial example, 'foo\bbar' does not match 'foobar'. |
344 | | // * The pattern must not have any explicit capturing groups. If it does, the |
345 | | // caller might expect them to be resolved. e.g., 'foo(bar)'. |
346 | | // |
347 | | // So when all of those things are true, we use a prefilter directly as a |
348 | | // strategy. |
349 | | // |
350 | | // In the case where the number of patterns is more than 1, we don't use this |
351 | | // but do use a special Aho-Corasick strategy if all of the regexes are just |
352 | | // simple literals or alternations of literals. (We also use the Aho-Corasick |
353 | | // strategy when len(patterns)==1 if the number of literals is large. In that |
354 | | // case, literal extraction gives up and will return an infinite set.) |
355 | | impl<P: PrefilterI> Strategy for Pre<P> { |
356 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
357 | 0 | fn group_info(&self) -> &GroupInfo { |
358 | 0 | &self.group_info |
359 | 0 | } |
360 | | |
361 | 0 | fn create_cache(&self) -> Cache { |
362 | 0 | Cache { |
363 | 0 | capmatches: Captures::all(self.group_info().clone()), |
364 | 0 | pikevm: wrappers::PikeVMCache::none(), |
365 | 0 | backtrack: wrappers::BoundedBacktrackerCache::none(), |
366 | 0 | onepass: wrappers::OnePassCache::none(), |
367 | 0 | hybrid: wrappers::HybridCache::none(), |
368 | 0 | revhybrid: wrappers::ReverseHybridCache::none(), |
369 | 0 | } |
370 | 0 | } |
371 | | |
372 | 0 | fn reset_cache(&self, _cache: &mut Cache) {} |
373 | | |
374 | 0 | fn is_accelerated(&self) -> bool { |
375 | 0 | self.pre.is_fast() |
376 | 0 | } |
377 | | |
378 | 0 | fn memory_usage(&self) -> usize { |
379 | 0 | self.pre.memory_usage() |
380 | 0 | } |
381 | | |
382 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
383 | 0 | fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option<Match> { |
384 | 0 | if input.is_done() { |
385 | 0 | return None; |
386 | 0 | } |
387 | 0 | if input.get_anchored().is_anchored() { |
388 | 0 | return self |
389 | 0 | .pre |
390 | 0 | .prefix(input.haystack(), input.get_span()) |
391 | 0 | .map(|sp| Match::new(PatternID::ZERO, sp)); |
392 | 0 | } |
393 | 0 | self.pre |
394 | 0 | .find(input.haystack(), input.get_span()) |
395 | 0 | .map(|sp| Match::new(PatternID::ZERO, sp)) |
396 | 0 | } |
397 | | |
398 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
399 | 0 | fn search_half( |
400 | 0 | &self, |
401 | 0 | cache: &mut Cache, |
402 | 0 | input: &Input<'_>, |
403 | 0 | ) -> Option<HalfMatch> { |
404 | 0 | self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end())) |
405 | 0 | } |
406 | | |
407 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
408 | 0 | fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { |
409 | 0 | self.search(cache, input).is_some() |
410 | 0 | } |
411 | | |
412 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
413 | 0 | fn search_slots( |
414 | 0 | &self, |
415 | 0 | cache: &mut Cache, |
416 | 0 | input: &Input<'_>, |
417 | 0 | slots: &mut [Option<NonMaxUsize>], |
418 | 0 | ) -> Option<PatternID> { |
419 | 0 | let m = self.search(cache, input)?; |
420 | 0 | if let Some(slot) = slots.get_mut(0) { |
421 | 0 | *slot = NonMaxUsize::new(m.start()); |
422 | 0 | } |
423 | 0 | if let Some(slot) = slots.get_mut(1) { |
424 | 0 | *slot = NonMaxUsize::new(m.end()); |
425 | 0 | } |
426 | 0 | Some(m.pattern()) |
427 | 0 | } |
428 | | |
429 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
430 | 0 | fn which_overlapping_matches( |
431 | 0 | &self, |
432 | 0 | cache: &mut Cache, |
433 | 0 | input: &Input<'_>, |
434 | 0 | patset: &mut PatternSet, |
435 | 0 | ) { |
436 | 0 | if self.search(cache, input).is_some() { |
437 | 0 | patset.insert(PatternID::ZERO); |
438 | 0 | } |
439 | 0 | } |
440 | | } |
441 | | |
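A hedged example (separate from this file) of the look-around restriction described above: literal extraction sees the exact literal 'fooquux' in 'foo\bquux', but the regex itself can never match, so a literal-only strategy would report a false positive:

    use regex_automata::meta::Regex;

    fn main() {
        let re = Regex::new(r"foo\bquux").unwrap();
        // The full engine checks the word boundary and correctly finds
        // nothing; a bare literal search for "fooquux" would claim a match.
        assert!(!re.is_match("fooquux"));
    }
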
442 | | #[derive(Debug)] |
443 | | struct Core { |
444 | | info: RegexInfo, |
445 | | pre: Option<Prefilter>, |
446 | | nfa: NFA, |
447 | | nfarev: Option<NFA>, |
448 | | pikevm: wrappers::PikeVM, |
449 | | backtrack: wrappers::BoundedBacktracker, |
450 | | onepass: wrappers::OnePass, |
451 | | hybrid: wrappers::Hybrid, |
452 | | dfa: wrappers::DFA, |
453 | | } |
454 | | |
455 | | impl Core { |
456 | 2 | fn new( |
457 | 2 | info: RegexInfo, |
458 | 2 | pre: Option<Prefilter>, |
459 | 2 | hirs: &[&Hir], |
460 | 2 | ) -> Result<Core, BuildError> { |
461 | 2 | let mut lookm = LookMatcher::new(); |
462 | 2 | lookm.set_line_terminator(info.config().get_line_terminator()); |
463 | 2 | let thompson_config = thompson::Config::new() |
464 | 2 | .utf8(info.config().get_utf8_empty()) |
465 | 2 | .nfa_size_limit(info.config().get_nfa_size_limit()) |
466 | 2 | .shrink(false) |
467 | 2 | .which_captures(info.config().get_which_captures()) |
468 | 2 | .look_matcher(lookm); |
469 | 2 | let nfa = thompson::Compiler::new() |
470 | 2 | .configure(thompson_config.clone()) |
471 | 2 | .build_many_from_hir(hirs) |
472 | 2 | .map_err(BuildError::nfa)?;
473 | | // It's possible for the PikeVM or the BB to fail to build, even though |
474 | | // at this point, we already have a full NFA in hand. They can fail |
475 | | // when a Unicode word boundary is used but where Unicode word boundary |
476 | | // support is disabled at compile time, thus making it impossible to |
477 | | // match. (Construction can also fail if the NFA was compiled without |
478 | | // captures, but we always enable that above.) |
479 | 2 | let pikevm = wrappers::PikeVM::new(&info, pre.clone(), &nfa)?;
480 | 2 | let backtrack = |
481 | 2 | wrappers::BoundedBacktracker::new(&info, pre.clone(), &nfa)?;
482 | | // The onepass engine can of course fail to build, but we expect it to |
483 | | // fail in many cases because it is an optimization that doesn't apply |
484 | | // to all regexes. The 'OnePass' wrapper encapsulates this failure (and |
485 | | // logs a message if it occurs). |
486 | 2 | let onepass = wrappers::OnePass::new(&info, &nfa); |
487 | | // We try to encapsulate whether a particular regex engine should be |
488 | | // used within each respective wrapper, but the DFAs need a reverse NFA |
489 | | // to build itself, and we really do not want to build a reverse NFA if |
490 | | // we know we aren't going to use the lazy DFA. So we do a config check |
491 | | // up front, which is in practice the only way we won't try to use the |
492 | | // DFA. |
493 | 2 | let (nfarev, hybrid, dfa) = |
494 | 2 | if !info.config().get_hybrid() && !info.config().get_dfa() {
495 | 0 | (None, wrappers::Hybrid::none(), wrappers::DFA::none()) |
496 | | } else { |
497 | | // FIXME: Technically, we don't quite yet KNOW that we need |
498 | | // a reverse NFA. It's possible for the DFAs below to both |
499 | | // fail to build just based on the forward NFA. In which case, |
500 | | // building the reverse NFA was totally wasted work. But... |
501 | | // fixing this requires breaking DFA construction apart into |
502 | | // two pieces: one for the forward part and another for the |
503 | | // reverse part. Quite annoying. Making it worse, when building |
504 | | // both DFAs fails, it's quite likely that the NFA is large and |
505 | | // that it will take quite some time to build the reverse NFA |
506 | | // too. So... it's really probably worth it to do this! |
507 | 2 | let nfarev = thompson::Compiler::new() |
508 | 2 | // Currently, reverse NFAs don't support capturing groups, |
509 | 2 | // so we MUST disable them. But even if we didn't have to, |
510 | 2 | // we would, because nothing in this crate does anything |
511 | 2 | // useful with capturing groups in reverse. And of course, |
512 | 2 | // the lazy DFA ignores capturing groups in all cases. |
513 | 2 | .configure( |
514 | 2 | thompson_config |
515 | 2 | .clone() |
516 | 2 | .which_captures(WhichCaptures::None) |
517 | 2 | .reverse(true), |
518 | 2 | ) |
519 | 2 | .build_many_from_hir(hirs) |
520 | 2 | .map_err(BuildError::nfa)?;
521 | 2 | let dfa = if !info.config().get_dfa() { |
522 | 2 | wrappers::DFA::none() |
523 | | } else { |
524 | 0 | wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev) |
525 | | }; |
526 | 2 | let hybrid = if !info.config().get_hybrid() { |
527 | 0 | wrappers::Hybrid::none() |
528 | 2 | } else if dfa.is_some() { |
529 | | debug!("skipping lazy DFA because we have a full DFA"); |
530 | 0 | wrappers::Hybrid::none() |
531 | | } else { |
532 | 2 | wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev) |
533 | | }; |
534 | 2 | (Some(nfarev), hybrid, dfa) |
535 | | }; |
536 | 2 | Ok(Core { |
537 | 2 | info, |
538 | 2 | pre, |
539 | 2 | nfa, |
540 | 2 | nfarev, |
541 | 2 | pikevm, |
542 | 2 | backtrack, |
543 | 2 | onepass, |
544 | 2 | hybrid, |
545 | 2 | dfa, |
546 | 2 | }) |
547 | 2 | } |
548 | | |
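A hedged sketch (public API usage, separate from this file) of the configuration check that decides whether Core::new builds a reverse NFA at all; with both the lazy and full DFAs disabled, no reverse NFA is built and searches fall back to the NFA engines:

    use regex_automata::meta::{Config, Regex};

    fn main() {
        let re = Regex::builder()
            .configure(Config::new().hybrid(false).dfa(false))
            .build(r"\w+ing")
            .unwrap();
        // No lazy or full DFA here, so the match comes from the one-pass
        // DFA, the bounded backtracker or the PikeVM.
        assert!(re.is_match("tingling"));
    }
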
549 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
550 | 0 | fn try_search_mayfail( |
551 | 0 | &self, |
552 | 0 | cache: &mut Cache, |
553 | 0 | input: &Input<'_>, |
554 | 0 | ) -> Option<Result<Option<Match>, RetryFailError>> { |
555 | 0 | if let Some(e) = self.dfa.get(input) { |
556 | | trace!("using full DFA for search at {:?}", input.get_span()); |
557 | 0 | Some(e.try_search(input)) |
558 | 0 | } else if let Some(e) = self.hybrid.get(input) { |
559 | | trace!("using lazy DFA for search at {:?}", input.get_span()); |
560 | 0 | Some(e.try_search(&mut cache.hybrid, input)) |
561 | | } else { |
562 | 0 | None |
563 | | } |
564 | 0 | } |
565 | | |
566 | 0 | fn search_nofail( |
567 | 0 | &self, |
568 | 0 | cache: &mut Cache, |
569 | 0 | input: &Input<'_>, |
570 | 0 | ) -> Option<Match> { |
571 | 0 | let caps = &mut cache.capmatches; |
572 | 0 | caps.set_pattern(None); |
573 | | // We manually inline 'try_search_slots_nofail' here because we need to |
574 | | // borrow from 'cache.capmatches' in this method, but if we do, then |
575 | | // we can't pass 'cache' wholesale to 'try_slots_no_hybrid'. It's a |
576 | | // classic example of how the borrow checker inhibits decomposition. |
577 | | // There are of course work-arounds (more types and/or interior |
578 | | // mutability), but that's more annoying than this IMO. |
579 | 0 | let pid = if let Some(ref e) = self.onepass.get(input) { |
580 | | trace!("using OnePass for search at {:?}", input.get_span()); |
581 | 0 | e.search_slots(&mut cache.onepass, input, caps.slots_mut()) |
582 | 0 | } else if let Some(ref e) = self.backtrack.get(input) { |
583 | | trace!( |
584 | | "using BoundedBacktracker for search at {:?}", |
585 | | input.get_span() |
586 | | ); |
587 | 0 | e.search_slots(&mut cache.backtrack, input, caps.slots_mut()) |
588 | | } else { |
589 | | trace!("using PikeVM for search at {:?}", input.get_span()); |
590 | 0 | let e = self.pikevm.get(); |
591 | 0 | e.search_slots(&mut cache.pikevm, input, caps.slots_mut()) |
592 | | }; |
593 | 0 | caps.set_pattern(pid); |
594 | 0 | caps.get_match() |
595 | 0 | } |
596 | | |
597 | 0 | fn search_half_nofail( |
598 | 0 | &self, |
599 | 0 | cache: &mut Cache, |
600 | 0 | input: &Input<'_>, |
601 | 0 | ) -> Option<HalfMatch> { |
602 | | // Only the lazy/full DFA returns half-matches, since the DFA requires |
603 | | // a reverse scan to find the start position. These fallback regex |
604 | | // engines can find the start and end in a single pass, so we just do |
605 | | // that and throw away the start offset to conform to the API. |
606 | 0 | let m = self.search_nofail(cache, input)?; |
607 | 0 | Some(HalfMatch::new(m.pattern(), m.end())) |
608 | 0 | } |
609 | | |
610 | 0 | fn search_slots_nofail( |
611 | 0 | &self, |
612 | 0 | cache: &mut Cache, |
613 | 0 | input: &Input<'_>, |
614 | 0 | slots: &mut [Option<NonMaxUsize>], |
615 | 0 | ) -> Option<PatternID> { |
616 | 0 | if let Some(ref e) = self.onepass.get(input) { |
617 | | trace!( |
618 | | "using OnePass for capture search at {:?}", |
619 | | input.get_span() |
620 | | ); |
621 | 0 | e.search_slots(&mut cache.onepass, input, slots) |
622 | 0 | } else if let Some(ref e) = self.backtrack.get(input) { |
623 | | trace!( |
624 | | "using BoundedBacktracker for capture search at {:?}", |
625 | | input.get_span() |
626 | | ); |
627 | 0 | e.search_slots(&mut cache.backtrack, input, slots) |
628 | | } else { |
629 | | trace!( |
630 | | "using PikeVM for capture search at {:?}", |
631 | | input.get_span() |
632 | | ); |
633 | 0 | let e = self.pikevm.get(); |
634 | 0 | e.search_slots(&mut cache.pikevm, input, slots) |
635 | | } |
636 | 0 | } |
637 | | |
638 | 0 | fn is_match_nofail(&self, cache: &mut Cache, input: &Input<'_>) -> bool { |
639 | 0 | if let Some(ref e) = self.onepass.get(input) { |
640 | | trace!( |
641 | | "using OnePass for is-match search at {:?}", |
642 | | input.get_span() |
643 | | ); |
644 | 0 | e.search_slots(&mut cache.onepass, input, &mut []).is_some() |
645 | 0 | } else if let Some(ref e) = self.backtrack.get(input) { |
646 | | trace!( |
647 | | "using BoundedBacktracker for is-match search at {:?}", |
648 | | input.get_span() |
649 | | ); |
650 | 0 | e.is_match(&mut cache.backtrack, input) |
651 | | } else { |
652 | | trace!( |
653 | | "using PikeVM for is-match search at {:?}", |
654 | | input.get_span() |
655 | | ); |
656 | 0 | let e = self.pikevm.get(); |
657 | 0 | e.is_match(&mut cache.pikevm, input) |
658 | | } |
659 | 0 | } |
660 | | |
661 | 0 | fn is_capture_search_needed(&self, slots_len: usize) -> bool { |
662 | 0 | slots_len > self.nfa.group_info().implicit_slot_len() |
663 | 0 | } |
664 | | } |
665 | | |
666 | | impl Strategy for Core { |
667 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
668 | 2 | fn group_info(&self) -> &GroupInfo { |
669 | 2 | self.nfa.group_info() |
670 | 2 | } |
671 | | |
672 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
673 | 2 | fn create_cache(&self) -> Cache { |
674 | 2 | Cache { |
675 | 2 | capmatches: Captures::all(self.group_info().clone()), |
676 | 2 | pikevm: self.pikevm.create_cache(), |
677 | 2 | backtrack: self.backtrack.create_cache(), |
678 | 2 | onepass: self.onepass.create_cache(), |
679 | 2 | hybrid: self.hybrid.create_cache(), |
680 | 2 | revhybrid: wrappers::ReverseHybridCache::none(), |
681 | 2 | } |
682 | 2 | } |
683 | | |
684 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
685 | 0 | fn reset_cache(&self, cache: &mut Cache) { |
686 | 0 | cache.pikevm.reset(&self.pikevm); |
687 | 0 | cache.backtrack.reset(&self.backtrack); |
688 | 0 | cache.onepass.reset(&self.onepass); |
689 | 0 | cache.hybrid.reset(&self.hybrid); |
690 | 0 | } |
691 | | |
692 | 0 | fn is_accelerated(&self) -> bool { |
693 | 0 | self.pre.as_ref().map_or(false, |pre| pre.is_fast()) |
694 | 0 | } |
695 | | |
696 | 0 | fn memory_usage(&self) -> usize { |
697 | 0 | self.info.memory_usage() |
698 | 0 | + self.pre.as_ref().map_or(0, |pre| pre.memory_usage()) |
699 | 0 | + self.nfa.memory_usage() |
700 | 0 | + self.nfarev.as_ref().map_or(0, |nfa| nfa.memory_usage()) |
701 | 0 | + self.onepass.memory_usage() |
702 | 0 | + self.dfa.memory_usage() |
703 | 0 | } |
704 | | |
705 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
706 | 0 | fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> { |
707 | | // We manually inline try_search_mayfail here because letting the |
708 | | // compiler do it seems to produce pretty crappy codegen. |
709 | 0 | return if let Some(e) = self.dfa.get(input) { |
710 | | trace!("using full DFA for full search at {:?}", input.get_span()); |
711 | 0 | match e.try_search(input) { |
712 | 0 | Ok(x) => x, |
713 | 0 | Err(_err) => { |
714 | 0 | trace!("full DFA search failed: {}", _err); |
715 | 0 | self.search_nofail(cache, input) |
716 | | } |
717 | | } |
718 | 0 | } else if let Some(e) = self.hybrid.get(input) { |
719 | | trace!("using lazy DFA for full search at {:?}", input.get_span()); |
720 | 0 | match e.try_search(&mut cache.hybrid, input) { |
721 | 0 | Ok(x) => x, |
722 | 0 | Err(_err) => { |
723 | 0 | trace!("lazy DFA search failed: {}", _err); |
724 | 0 | self.search_nofail(cache, input) |
725 | | } |
726 | | } |
727 | | } else { |
728 | 0 | self.search_nofail(cache, input) |
729 | | }; |
730 | 0 | } |
731 | | |
732 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
733 | 300 | fn search_half( |
734 | 300 | &self, |
735 | 300 | cache: &mut Cache, |
736 | 300 | input: &Input<'_>, |
737 | 300 | ) -> Option<HalfMatch> { |
738 | | // The main difference with 'search' is that if we're using a DFA, we |
739 | | // can use a single forward scan without needing to run the reverse |
740 | | // DFA. |
741 | 300 | if let Some(e) = self.dfa.get(input) {
742 | | trace!("using full DFA for half search at {:?}", input.get_span()); |
743 | 0 | match e.try_search_half_fwd(input) { |
744 | 0 | Ok(x) => x, |
745 | 0 | Err(_err) => { |
746 | 0 | trace!("full DFA half search failed: {}", _err); |
747 | 0 | self.search_half_nofail(cache, input) |
748 | | } |
749 | | } |
750 | 300 | } else if let Some(e) = self.hybrid.get(input) { |
751 | | trace!("using lazy DFA for half search at {:?}", input.get_span()); |
752 | 300 | match e.try_search_half_fwd(&mut cache.hybrid, input) { |
753 | 300 | Ok(x) => x, |
754 | 0 | Err(_err) => { |
755 | 0 | trace!("lazy DFA half search failed: {}", _err); |
756 | 0 | self.search_half_nofail(cache, input) |
757 | | } |
758 | | } |
759 | | } else { |
760 | 0 | self.search_half_nofail(cache, input) |
761 | | } |
762 | 300 | } |
763 | | |
764 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
765 | 0 | fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { |
766 | 0 | if let Some(e) = self.dfa.get(input) { |
767 | | trace!( |
768 | | "using full DFA for is-match search at {:?}", |
769 | | input.get_span() |
770 | | ); |
771 | 0 | match e.try_search_half_fwd(input) { |
772 | 0 | Ok(x) => x.is_some(), |
773 | 0 | Err(_err) => { |
774 | 0 | trace!("full DFA half search failed: {}", _err); |
775 | 0 | self.is_match_nofail(cache, input) |
776 | | } |
777 | | } |
778 | 0 | } else if let Some(e) = self.hybrid.get(input) { |
779 | | trace!( |
780 | | "using lazy DFA for is-match search at {:?}", |
781 | | input.get_span() |
782 | | ); |
783 | 0 | match e.try_search_half_fwd(&mut cache.hybrid, input) { |
784 | 0 | Ok(x) => x.is_some(), |
785 | 0 | Err(_err) => { |
786 | 0 | trace!("lazy DFA half search failed: {}", _err); |
787 | 0 | self.is_match_nofail(cache, input) |
788 | | } |
789 | | } |
790 | | } else { |
791 | 0 | self.is_match_nofail(cache, input) |
792 | | } |
793 | 0 | } |
794 | | |
795 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
796 | 0 | fn search_slots( |
797 | 0 | &self, |
798 | 0 | cache: &mut Cache, |
799 | 0 | input: &Input<'_>, |
800 | 0 | slots: &mut [Option<NonMaxUsize>], |
801 | 0 | ) -> Option<PatternID> { |
802 | 0 | // Even if the regex has explicit capture groups, if the caller didn't |
803 | 0 | // provide any explicit slots, then it doesn't make sense to try and do |
804 | 0 | // extra work to get offsets for those slots. Ideally the caller should |
805 | 0 | // realize this and not call this routine in the first place, but alas, |
806 | 0 | // we try to save the caller from themselves if they do. |
807 | 0 | if !self.is_capture_search_needed(slots.len()) { |
808 | | trace!("asked for slots unnecessarily, trying fast path"); |
809 | 0 | let m = self.search(cache, input)?; |
810 | 0 | copy_match_to_slots(m, slots); |
811 | 0 | return Some(m.pattern()); |
812 | 0 | } |
813 | 0 | // If the onepass DFA is available for this search (which only happens |
814 | 0 | // when it's anchored), then skip running a fallible DFA. The onepass |
815 | 0 | // DFA isn't as fast as a full or lazy DFA, but it is typically quite |
816 | 0 | // a bit faster than the backtracker or the PikeVM. So it isn't as |
817 | 0 | // advantageous to try and do a full/lazy DFA scan first. |
818 | 0 | // |
819 | 0 | // We still theorize that it's better to do a full/lazy DFA scan, even |
820 | 0 | // when it's anchored, because it's usually much faster and permits us |
821 | 0 | // to say "no match" much more quickly. This does hurt the case of, |
822 | 0 | // say, parsing each line in a log file into capture groups, because |
823 | 0 | // in that case, the line always matches. So the lazy DFA scan is |
824 | 0 | // usually just wasted work. But, the lazy DFA is usually quite fast |
825 | 0 | // and doesn't cost too much here. |
826 | 0 | if self.onepass.get(&input).is_some() { |
827 | 0 | return self.search_slots_nofail(cache, &input, slots); |
828 | 0 | } |
829 | 0 | let m = match self.try_search_mayfail(cache, input) { |
830 | 0 | Some(Ok(Some(m))) => m, |
831 | 0 | Some(Ok(None)) => return None, |
832 | 0 | Some(Err(_err)) => { |
833 | 0 | trace!("fast capture search failed: {}", _err); |
834 | 0 | return self.search_slots_nofail(cache, input, slots); |
835 | | } |
836 | | None => { |
837 | 0 | return self.search_slots_nofail(cache, input, slots); |
838 | | } |
839 | | }; |
840 | | // At this point, now that we've found the bounds of the |
841 | | // match, we need to re-run something that can resolve |
842 | | // capturing groups. But we only need to run on it on the |
843 | | // match bounds and not the entire haystack. |
844 | | trace!( |
845 | | "match found at {}..{} in capture search, \ |
846 | | using another engine to find captures", |
847 | | m.start(), |
848 | | m.end(), |
849 | | ); |
850 | 0 | let input = input |
851 | 0 | .clone() |
852 | 0 | .span(m.start()..m.end()) |
853 | 0 | .anchored(Anchored::Pattern(m.pattern())); |
854 | 0 | Some( |
855 | 0 | self.search_slots_nofail(cache, &input, slots) |
856 | 0 | .expect("should find a match"), |
857 | 0 | ) |
858 | 0 | } |
859 | | |
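A hedged sketch (public API usage, separate from this file) of a capture search that flows through search_slots above: a DFA pass finds the match bounds, then an NFA engine re-runs over just those bounds to fill in the group offsets:

    use regex_automata::meta::Regex;

    fn main() {
        let hay = "released on 2023-07-05";
        let re =
            Regex::new(r"(?<y>[0-9]{4})-(?<m>[0-9]{2})-(?<d>[0-9]{2})").unwrap();
        let mut caps = re.create_captures();
        re.captures(hay, &mut caps);
        let year = caps.get_group_by_name("y").unwrap();
        assert_eq!("2023", &hay[year.start..year.end]);
    }
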
860 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
861 | 0 | fn which_overlapping_matches( |
862 | 0 | &self, |
863 | 0 | cache: &mut Cache, |
864 | 0 | input: &Input<'_>, |
865 | 0 | patset: &mut PatternSet, |
866 | 0 | ) { |
867 | 0 | if let Some(e) = self.dfa.get(input) { |
868 | | trace!( |
869 | | "using full DFA for overlapping search at {:?}", |
870 | | input.get_span() |
871 | | ); |
872 | 0 | let _err = match e.try_which_overlapping_matches(input, patset) { |
873 | 0 | Ok(()) => return, |
874 | 0 | Err(err) => err, |
875 | | }; |
876 | | trace!("fast overlapping search failed: {}", _err); |
877 | 0 | } else if let Some(e) = self.hybrid.get(input) { |
878 | | trace!( |
879 | | "using lazy DFA for overlapping search at {:?}", |
880 | | input.get_span() |
881 | | ); |
882 | 0 | let _err = match e.try_which_overlapping_matches( |
883 | 0 | &mut cache.hybrid, |
884 | 0 | input, |
885 | 0 | patset, |
886 | 0 | ) { |
887 | | Ok(()) => { |
888 | 0 | return; |
889 | | } |
890 | 0 | Err(err) => err, |
891 | | }; |
892 | | trace!("fast overlapping search failed: {}", _err); |
893 | 0 | } |
894 | | trace!( |
895 | | "using PikeVM for overlapping search at {:?}", |
896 | | input.get_span() |
897 | | ); |
898 | 0 | let e = self.pikevm.get(); |
899 | 0 | e.which_overlapping_matches(&mut cache.pikevm, input, patset) |
900 | 0 | } |
901 | | } |
902 | | |
903 | | #[derive(Debug)] |
904 | | struct ReverseAnchored { |
905 | | core: Core, |
906 | | } |
907 | | |
908 | | impl ReverseAnchored { |
909 | 2 | fn new(core: Core) -> Result<ReverseAnchored, Core> { |
910 | 2 | if !core.info.is_always_anchored_end() { |
911 | | debug!( |
912 | | "skipping reverse anchored optimization because \ |
913 | | the regex is not always anchored at the end" |
914 | | ); |
915 | 1 | return Err(core); |
916 | 1 | } |
917 | 1 | // Note that the caller can still request an anchored search even when |
918 | 1 | // the regex isn't anchored at the start. We detect that case in the |
919 | 1 | // search routines below and just fallback to the core engine. This |
920 | 1 | // is fine because both searches are anchored. It's just a matter of |
921 | 1 | // picking one. Falling back to the core engine is a little simpler, |
922 | 1 | // since if we used the reverse anchored approach, we'd have to add an |
923 | 1 | // extra check to ensure the match reported starts at the place where |
924 | 1 | // the caller requested the search to start. |
925 | 1 | if core.info.is_always_anchored_start() { |
926 | | debug!( |
927 | | "skipping reverse anchored optimization because \ |
928 | | the regex is also anchored at the start" |
929 | | ); |
930 | 1 | return Err(core); |
931 | 0 | } |
932 | 0 | // Only DFAs can do reverse searches (currently), so we need one of |
933 | 0 | // them in order to do this optimization. It's possible (although |
934 | 0 | // pretty unlikely) that we have neither and need to give up. |
935 | 0 | if !core.hybrid.is_some() && !core.dfa.is_some() { |
936 | | debug!( |
937 | | "skipping reverse anchored optimization because \ |
938 | | we don't have a lazy DFA or a full DFA" |
939 | | ); |
940 | 0 | return Err(core); |
941 | 0 | } |
942 | 0 | Ok(ReverseAnchored { core }) |
943 | 2 | } |
944 | | |
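A hedged sketch (public API usage, separate from this file) of the kind of pattern the reverse anchored strategy targets: always anchored at the end but not at the start, so a single backwards scan from the end of the haystack finds the start of the match:

    use regex_automata::meta::Regex;

    fn main() {
        let re = Regex::new(r"[a-z]+ing$").unwrap();
        assert_eq!(Some(0..8), re.find("tingling").map(|m| m.range()));
    }
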
945 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
946 | 0 | fn try_search_half_anchored_rev( |
947 | 0 | &self, |
948 | 0 | cache: &mut Cache, |
949 | 0 | input: &Input<'_>, |
950 | 0 | ) -> Result<Option<HalfMatch>, RetryFailError> { |
951 | 0 | // We of course always want an anchored search. In theory, the |
952 | 0 | // underlying regex engines should automatically enable anchored |
953 | 0 | // searches since the regex is itself anchored, but this more clearly |
954 | 0 | // expresses intent and is always correct. |
955 | 0 | let input = input.clone().anchored(Anchored::Yes); |
956 | 0 | if let Some(e) = self.core.dfa.get(&input) { |
957 | | trace!( |
958 | | "using full DFA for reverse anchored search at {:?}", |
959 | | input.get_span() |
960 | | ); |
961 | 0 | e.try_search_half_rev(&input) |
962 | 0 | } else if let Some(e) = self.core.hybrid.get(&input) { |
963 | | trace!( |
964 | | "using lazy DFA for reverse anchored search at {:?}", |
965 | | input.get_span() |
966 | | ); |
967 | 0 | e.try_search_half_rev(&mut cache.hybrid, &input) |
968 | | } else { |
969 | 0 | unreachable!("ReverseAnchored always has a DFA") |
970 | | } |
971 | 0 | } |
972 | | } |
973 | | |
974 | | // Note that in this impl, we don't check that 'input.end() == |
975 | | // input.haystack().len()'. In particular, when that condition is false, a |
976 | | // match is always impossible because we know that the regex is always anchored |
977 | | // at the end (or else 'ReverseAnchored' won't be built). We don't check that |
978 | | // here because the 'Regex' wrapper actually does that for us in all cases. |
979 | | // Thus, in this impl, we can actually assume that the end position in 'input' |
980 | | // is equivalent to the length of the haystack. |
981 | | impl Strategy for ReverseAnchored { |
982 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
983 | 0 | fn group_info(&self) -> &GroupInfo { |
984 | 0 | self.core.group_info() |
985 | 0 | } |
986 | | |
987 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
988 | 0 | fn create_cache(&self) -> Cache { |
989 | 0 | self.core.create_cache() |
990 | 0 | } |
991 | | |
992 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
993 | 0 | fn reset_cache(&self, cache: &mut Cache) { |
994 | 0 | self.core.reset_cache(cache); |
995 | 0 | } |
996 | | |
997 | 0 | fn is_accelerated(&self) -> bool { |
998 | 0 | // Since this is anchored at the end, a reverse anchored search is |
999 | 0 | // almost certainly guaranteed to result in a much faster search than |
1000 | 0 | // a standard forward search. |
1001 | 0 | true |
1002 | 0 | } |
1003 | | |
1004 | 0 | fn memory_usage(&self) -> usize { |
1005 | 0 | self.core.memory_usage() |
1006 | 0 | } |
1007 | | |
1008 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1009 | 0 | fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> { |
1010 | 0 | if input.get_anchored().is_anchored() { |
1011 | 0 | return self.core.search(cache, input); |
1012 | 0 | } |
1013 | 0 | match self.try_search_half_anchored_rev(cache, input) { |
1014 | 0 | Err(_err) => { |
1015 | 0 | trace!("fast reverse anchored search failed: {}", _err); |
1016 | 0 | self.core.search_nofail(cache, input) |
1017 | | } |
1018 | 0 | Ok(None) => None, |
1019 | 0 | Ok(Some(hm)) => { |
1020 | 0 | Some(Match::new(hm.pattern(), hm.offset()..input.end())) |
1021 | | } |
1022 | | } |
1023 | 0 | } |
1024 | | |
1025 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1026 | 0 | fn search_half( |
1027 | 0 | &self, |
1028 | 0 | cache: &mut Cache, |
1029 | 0 | input: &Input<'_>, |
1030 | 0 | ) -> Option<HalfMatch> { |
1031 | 0 | if input.get_anchored().is_anchored() { |
1032 | 0 | return self.core.search_half(cache, input); |
1033 | 0 | } |
1034 | 0 | match self.try_search_half_anchored_rev(cache, input) { |
1035 | 0 | Err(_err) => { |
1036 | 0 | trace!("fast reverse anchored search failed: {}", _err); |
1037 | 0 | self.core.search_half_nofail(cache, input) |
1038 | | } |
1039 | 0 | Ok(None) => None, |
1040 | 0 | Ok(Some(hm)) => { |
1041 | 0 | // Careful here! 'try_search_half' is a *forward* search that |
1042 | 0 | // only cares about the *end* position of a match. But |
1043 | 0 | // 'hm.offset()' is actually the start of the match. So we |
1044 | 0 | // actually just throw that away here and, since we know we |
1045 | 0 | // have a match, return the only possible position at which a |
1046 | 0 | // match can occur: input.end(). |
1047 | 0 | Some(HalfMatch::new(hm.pattern(), input.end())) |
1048 | | } |
1049 | | } |
1050 | 0 | } |
1051 | | |
1052 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1053 | 0 | fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { |
1054 | 0 | if input.get_anchored().is_anchored() { |
1055 | 0 | return self.core.is_match(cache, input); |
1056 | 0 | } |
1057 | 0 | match self.try_search_half_anchored_rev(cache, input) { |
1058 | 0 | Err(_err) => { |
1059 | 0 | trace!("fast reverse anchored search failed: {}", _err); |
1060 | 0 | self.core.is_match_nofail(cache, input) |
1061 | | } |
1062 | 0 | Ok(None) => false, |
1063 | 0 | Ok(Some(_)) => true, |
1064 | | } |
1065 | 0 | } |
1066 | | |
1067 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1068 | 0 | fn search_slots( |
1069 | 0 | &self, |
1070 | 0 | cache: &mut Cache, |
1071 | 0 | input: &Input<'_>, |
1072 | 0 | slots: &mut [Option<NonMaxUsize>], |
1073 | 0 | ) -> Option<PatternID> { |
1074 | 0 | if input.get_anchored().is_anchored() { |
1075 | 0 | return self.core.search_slots(cache, input, slots); |
1076 | 0 | } |
1077 | 0 | match self.try_search_half_anchored_rev(cache, input) { |
1078 | 0 | Err(_err) => { |
1079 | 0 | trace!("fast reverse anchored search failed: {}", _err); |
1080 | 0 | self.core.search_slots_nofail(cache, input, slots) |
1081 | | } |
1082 | 0 | Ok(None) => None, |
1083 | 0 | Ok(Some(hm)) => { |
1084 | 0 | if !self.core.is_capture_search_needed(slots.len()) { |
1085 | | trace!("asked for slots unnecessarily, skipping captures"); |
1086 | 0 | let m = Match::new(hm.pattern(), hm.offset()..input.end()); |
1087 | 0 | copy_match_to_slots(m, slots); |
1088 | 0 | return Some(m.pattern()); |
1089 | 0 | } |
1090 | 0 | let start = hm.offset(); |
1091 | 0 | let input = input |
1092 | 0 | .clone() |
1093 | 0 | .span(start..input.end()) |
1094 | 0 | .anchored(Anchored::Pattern(hm.pattern())); |
1095 | 0 | self.core.search_slots_nofail(cache, &input, slots) |
1096 | | } |
1097 | | } |
1098 | 0 | } |
1099 | | |
1100 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1101 | 0 | fn which_overlapping_matches( |
1102 | 0 | &self, |
1103 | 0 | cache: &mut Cache, |
1104 | 0 | input: &Input<'_>, |
1105 | 0 | patset: &mut PatternSet, |
1106 | 0 | ) { |
1107 | 0 | // It seems like this could probably benefit from a reverse anchored |
1108 | 0 | // optimization, perhaps by doing an overlapping reverse search (which |
1109 | 0 | // the DFAs do support). I haven't given it much thought though, and |
1110 | 0 | // I'm currently focused more on the single pattern case. |
1111 | 0 | self.core.which_overlapping_matches(cache, input, patset) |
1112 | 0 | } |
1113 | | } |
1114 | | |
1115 | | #[derive(Debug)] |
1116 | | struct ReverseSuffix { |
1117 | | core: Core, |
1118 | | pre: Prefilter, |
1119 | | } |
1120 | | |
1121 | | impl ReverseSuffix { |
1122 | 2 | fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseSuffix, Core> { |
1123 | 2 | if !core.info.config().get_auto_prefilter() { |
1124 | | debug!( |
1125 | | "skipping reverse suffix optimization because \ |
1126 | | automatic prefilters are disabled" |
1127 | | ); |
1128 | 0 | return Err(core); |
1129 | 2 | } |
1130 | 2 | // Like the reverse inner optimization, we don't do this for regexes |
1131 | 2 | // that are always anchored. It could lead to scanning too much, but |
1132 | 2 | // could say "no match" much more quickly than running the regex |
1133 | 2 | // engine if the initial literal scan doesn't match. With that said, |
1134 | 2 | // the reverse suffix optimization has lower overhead, since it only |
1135 | 2 | // requires a reverse scan after a literal match to confirm or reject |
1136 | 2 | // the match. (Although, in the case of confirmation, it then needs to |
1137 | 2 | // do another forward scan to find the end position.) |
1138 | 2 | // |
1139 | 2 | // Note that the caller can still request an anchored search even |
1140 | 2 | // when the regex isn't anchored. We detect that case in the search |
1141 | 2 | // routines below and just fallback to the core engine. Currently this |
1142 | 2 | // optimization assumes all searches are unanchored, so if we do want |
1143 | 2 | // to enable this optimization for anchored searches, it will need a |
1144 | 2 | // little work to support it. |
1145 | 2 | if core.info.is_always_anchored_start() { |
1146 | | debug!( |
1147 | | "skipping reverse suffix optimization because \ |
1148 | | the regex is always anchored at the start", |
1149 | | ); |
1150 | 1 | return Err(core); |
1151 | 1 | } |
1152 | 1 | // Only DFAs can do reverse searches (currently), so we need one of |
1153 | 1 | // them in order to do this optimization. It's possible (although |
1154 | 1 | // pretty unlikely) that we have neither and need to give up. |
1155 | 1 | if !core.hybrid.is_some() && !core.dfa.is_some() {
1156 | | debug!( |
1157 | | "skipping reverse suffix optimization because \ |
1158 | | we don't have a lazy DFA or a full DFA" |
1159 | | ); |
1160 | 0 | return Err(core); |
1161 | 1 | } |
1162 | 1 | if core.pre.as_ref().map_or(false, |p| p.is_fast()) {
1163 | | debug!( |
1164 | | "skipping reverse suffix optimization because \ |
1165 | | we already have a prefilter that we think is fast" |
1166 | | ); |
1167 | 0 | return Err(core); |
1168 | 1 | } |
1169 | 1 | let kind = core.info.config().get_match_kind(); |
1170 | 1 | let suffixes = crate::util::prefilter::suffixes(kind, hirs); |
1171 | 1 | let lcs = match suffixes.longest_common_suffix() {
1172 | | None => { |
1173 | | debug!( |
1174 | | "skipping reverse suffix optimization because \ |
1175 | | a longest common suffix could not be found", |
1176 | | ); |
1177 | 1 | return Err(core); |
1178 | | } |
1179 | 0 | Some(lcs) if lcs.is_empty() => { |
1180 | 0 | debug!( |
1181 | 0 | "skipping reverse suffix optimization because \ |
1182 | 0 | the longest common suffix is the empty string", |
1183 | 0 | ); |
1184 | 0 | return Err(core); |
1185 | | } |
1186 | 0 | Some(lcs) => lcs, |
1187 | | }; |
1188 | 0 | let pre = match Prefilter::new(kind, &[lcs]) { |
1189 | 0 | Some(pre) => pre, |
1190 | | None => { |
1191 | | debug!( |
1192 | | "skipping reverse suffix optimization because \ |
1193 | | a prefilter could not be constructed from the \ |
1194 | | longest common suffix", |
1195 | | ); |
1196 | 0 | return Err(core); |
1197 | | } |
1198 | | }; |
1199 | 0 | if !pre.is_fast() { |
1200 | | debug!( |
1201 | | "skipping reverse suffix optimization because \ |
1202 | | while we have a suffix prefilter, it is not \ |
1203 | | believed to be 'fast'" |
1204 | | ); |
1205 | 0 | return Err(core); |
1206 | 0 | } |
1207 | 0 | Ok(ReverseSuffix { core, pre }) |
1208 | 2 | } |
1209 | | |
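A hedged sketch (public API usage, separate from this file) of the 'tingling' case discussed in the comments further below: the reverse suffix strategy first finds the literal suffix 'ing', scans backwards for the match start, then scans forwards again so that greediness still yields 'tingling' rather than 'ting':

    use regex_automata::meta::Regex;

    fn main() {
        let re = Regex::new(r"[a-z]+ing").unwrap();
        assert_eq!(Some(0..8), re.find("tingling").map(|m| m.range()));
    }
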
1210 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1211 | 0 | fn try_search_half_start( |
1212 | 0 | &self, |
1213 | 0 | cache: &mut Cache, |
1214 | 0 | input: &Input<'_>, |
1215 | 0 | ) -> Result<Option<HalfMatch>, RetryError> { |
1216 | 0 | let mut span = input.get_span(); |
1217 | 0 | let mut min_start = 0; |
1218 | | loop { |
1219 | 0 | let litmatch = match self.pre.find(input.haystack(), span) { |
1220 | 0 | None => return Ok(None), |
1221 | 0 | Some(span) => span, |
1222 | 0 | }; |
1223 | 0 | trace!("reverse suffix scan found suffix match at {:?}", litmatch); |
1224 | 0 | let revinput = input |
1225 | 0 | .clone() |
1226 | 0 | .anchored(Anchored::Yes) |
1227 | 0 | .span(input.start()..litmatch.end); |
1228 | 0 | match self |
1229 | 0 | .try_search_half_rev_limited(cache, &revinput, min_start)? |
1230 | | { |
1231 | | None => { |
1232 | 0 | if span.start >= span.end { |
1233 | 0 | break; |
1234 | 0 | } |
1235 | 0 | span.start = litmatch.start.checked_add(1).unwrap(); |
1236 | | } |
1237 | 0 | Some(hm) => return Ok(Some(hm)), |
1238 | | } |
1239 | 0 | min_start = litmatch.end; |
1240 | | } |
1241 | 0 | Ok(None) |
1242 | 0 | } |
1243 | | |
1244 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1245 | 0 | fn try_search_half_fwd( |
1246 | 0 | &self, |
1247 | 0 | cache: &mut Cache, |
1248 | 0 | input: &Input<'_>, |
1249 | 0 | ) -> Result<Option<HalfMatch>, RetryFailError> { |
1250 | 0 | if let Some(e) = self.core.dfa.get(&input) { |
1251 | | trace!( |
1252 | | "using full DFA for forward reverse suffix search at {:?}", |
1253 | | input.get_span() |
1254 | | ); |
1255 | 0 | e.try_search_half_fwd(&input) |
1256 | 0 | } else if let Some(e) = self.core.hybrid.get(&input) { |
1257 | | trace!( |
1258 | | "using lazy DFA for forward reverse suffix search at {:?}", |
1259 | | input.get_span() |
1260 | | ); |
1261 | 0 | e.try_search_half_fwd(&mut cache.hybrid, &input) |
1262 | | } else { |
1263 | 0 | unreachable!("ReverseSuffix always has a DFA") |
1264 | | } |
1265 | 0 | } |
1266 | | |
1267 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1268 | 0 | fn try_search_half_rev_limited( |
1269 | 0 | &self, |
1270 | 0 | cache: &mut Cache, |
1271 | 0 | input: &Input<'_>, |
1272 | 0 | min_start: usize, |
1273 | 0 | ) -> Result<Option<HalfMatch>, RetryError> { |
1274 | 0 | if let Some(e) = self.core.dfa.get(&input) { |
1275 | | trace!( |
1276 | | "using full DFA for reverse suffix search at {:?}, \ |
1277 | | but will be stopped at {} to avoid quadratic behavior", |
1278 | | input.get_span(), |
1279 | | min_start, |
1280 | | ); |
1281 | 0 | e.try_search_half_rev_limited(&input, min_start) |
1282 | 0 | } else if let Some(e) = self.core.hybrid.get(&input) { |
1283 | | trace!( |
1284 | | "using lazy DFA for reverse suffix search at {:?}, \ |
1285 | | but will be stopped at {} to avoid quadratic behavior", |
1286 | | input.get_span(), |
1287 | | min_start, |
1288 | | ); |
1289 | 0 | e.try_search_half_rev_limited(&mut cache.hybrid, &input, min_start) |
1290 | | } else { |
1291 | 0 | unreachable!("ReverseSuffix always has a DFA") |
1292 | | } |
1293 | 0 | } |
1294 | | } |
1295 | | |
1296 | | impl Strategy for ReverseSuffix { |
1297 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1298 | 0 | fn group_info(&self) -> &GroupInfo { |
1299 | 0 | self.core.group_info() |
1300 | 0 | } |
1301 | | |
1302 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1303 | 0 | fn create_cache(&self) -> Cache { |
1304 | 0 | self.core.create_cache() |
1305 | 0 | } |
1306 | | |
1307 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1308 | 0 | fn reset_cache(&self, cache: &mut Cache) { |
1309 | 0 | self.core.reset_cache(cache); |
1310 | 0 | } |
1311 | | |
1312 | 0 | fn is_accelerated(&self) -> bool { |
1313 | 0 | self.pre.is_fast() |
1314 | 0 | } |
1315 | | |
1316 | 0 | fn memory_usage(&self) -> usize { |
1317 | 0 | self.core.memory_usage() + self.pre.memory_usage() |
1318 | 0 | } |
1319 | | |
1320 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1321 | 0 | fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> { |
1322 | 0 | if input.get_anchored().is_anchored() { |
1323 | 0 | return self.core.search(cache, input); |
1324 | 0 | } |
1325 | 0 | match self.try_search_half_start(cache, input) { |
1326 | 0 | Err(RetryError::Quadratic(_err)) => { |
1327 | 0 | trace!("reverse suffix optimization failed: {}", _err); |
1328 | 0 | self.core.search(cache, input) |
1329 | | } |
1330 | 0 | Err(RetryError::Fail(_err)) => { |
1331 | 0 | trace!("reverse suffix reverse fast search failed: {}", _err); |
1332 | 0 | self.core.search_nofail(cache, input) |
1333 | | } |
1334 | 0 | Ok(None) => None, |
1335 | 0 | Ok(Some(hm_start)) => { |
1336 | 0 | let fwdinput = input |
1337 | 0 | .clone() |
1338 | 0 | .anchored(Anchored::Pattern(hm_start.pattern())) |
1339 | 0 | .span(hm_start.offset()..input.end()); |
1340 | 0 | match self.try_search_half_fwd(cache, &fwdinput) { |
1341 | 0 | Err(_err) => { |
1342 | 0 | trace!( |
1343 | 0 | "reverse suffix forward fast search failed: {}", |
1344 | 0 | _err |
1345 | 0 | ); |
1346 | 0 | self.core.search_nofail(cache, input) |
1347 | | } |
1348 | | Ok(None) => { |
1349 | 0 | unreachable!( |
1350 | 0 | "suffix match plus reverse match implies \ |
1351 | 0 | there must be a match", |
1352 | 0 | ) |
1353 | | } |
1354 | 0 | Ok(Some(hm_end)) => Some(Match::new( |
1355 | 0 | hm_start.pattern(), |
1356 | 0 | hm_start.offset()..hm_end.offset(), |
1357 | 0 | )), |
1358 | | } |
1359 | | } |
1360 | | } |
1361 | 0 | } |
1362 | | |
1363 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1364 | 0 | fn search_half( |
1365 | 0 | &self, |
1366 | 0 | cache: &mut Cache, |
1367 | 0 | input: &Input<'_>, |
1368 | 0 | ) -> Option<HalfMatch> { |
1369 | 0 | if input.get_anchored().is_anchored() { |
1370 | 0 | return self.core.search_half(cache, input); |
1371 | 0 | } |
1372 | 0 | match self.try_search_half_start(cache, input) { |
1373 | 0 | Err(RetryError::Quadratic(_err)) => { |
1374 | 0 | trace!("reverse suffix half optimization failed: {}", _err); |
1375 | 0 | self.core.search_half(cache, input) |
1376 | | } |
1377 | 0 | Err(RetryError::Fail(_err)) => { |
1378 | 0 | trace!( |
1379 | 0 | "reverse suffix reverse fast half search failed: {}", |
1380 | 0 | _err |
1381 | 0 | ); |
1382 | 0 | self.core.search_half_nofail(cache, input) |
1383 | | } |
1384 | 0 | Ok(None) => None, |
1385 | 0 | Ok(Some(hm_start)) => { |
1386 | 0 | // This is a bit subtle. It is tempting to just stop searching |
1387 | 0 | // at this point and return a half-match with an offset |
1388 | 0 | // corresponding to where the suffix was found. But the suffix |
1389 | 0 | // match does not necessarily correspond to the end of the |
1390 | 0 | // proper leftmost-first match. Consider /[a-z]+ing/ against |
1391 | 0 | // 'tingling'. The first suffix match is the first 'ing', and |
1392 | 0 | // the /[a-z]+/ matches the 't'. So if we stopped here, then |
1393 | 0 | // we'd report 'ting' as the match. But 'tingling' is the |
1394 | 0 | // correct match because of greediness. |
1395 | 0 | let fwdinput = input |
1396 | 0 | .clone() |
1397 | 0 | .anchored(Anchored::Pattern(hm_start.pattern())) |
1398 | 0 | .span(hm_start.offset()..input.end()); |
1399 | 0 | match self.try_search_half_fwd(cache, &fwdinput) { |
1400 | 0 | Err(_err) => { |
1401 | 0 | trace!( |
1402 | 0 | "reverse suffix forward fast search failed: {}", |
1403 | 0 | _err |
1404 | 0 | ); |
1405 | 0 | self.core.search_half_nofail(cache, input) |
1406 | | } |
1407 | | Ok(None) => { |
1408 | 0 | unreachable!( |
1409 | 0 | "suffix match plus reverse match implies \ |
1410 | 0 | there must be a match", |
1411 | 0 | ) |
1412 | | } |
1413 | 0 | Ok(Some(hm_end)) => Some(hm_end), |
1414 | | } |
1415 | | } |
1416 | | } |
1417 | 0 | } |
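
The /[a-z]+ing/ vs. 'tingling' case in the comment inside search_half above can be checked against the crate's public meta regex API. A minimal sketch (assuming the regex_automata::meta::Regex entry point, not part of this file) showing that the leftmost-first match spans the whole word rather than stopping at the first 'ing', which is why the forward confirmation scan is needed:

    // Minimal sketch; assumes the public regex_automata::meta::Regex API.
    use regex_automata::meta::Regex;

    fn main() {
        let re = Regex::new(r"[a-z]+ing").unwrap();
        let m = re.find("tingling").unwrap();
        // Greedy leftmost-first semantics: the match is 0..8 ("tingling"),
        // not 0..4 ("ting"), so the suffix match alone cannot determine
        // the true end offset.
        assert_eq!(0..8, m.range());
    }
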
1418 | | |
1419 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1420 | 0 | fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { |
1421 | 0 | if input.get_anchored().is_anchored() { |
1422 | 0 | return self.core.is_match(cache, input); |
1423 | 0 | } |
1424 | 0 | match self.try_search_half_start(cache, input) { |
1425 | 0 | Err(RetryError::Quadratic(_err)) => { |
1426 | 0 | trace!("reverse suffix half optimization failed: {}", _err); |
1427 | 0 | self.core.is_match_nofail(cache, input) |
1428 | | } |
1429 | 0 | Err(RetryError::Fail(_err)) => { |
1430 | 0 | trace!( |
1431 | 0 | "reverse suffix reverse fast half search failed: {}", |
1432 | 0 | _err |
1433 | 0 | ); |
1434 | 0 | self.core.is_match_nofail(cache, input) |
1435 | | } |
1436 | 0 | Ok(None) => false, |
1437 | 0 | Ok(Some(_)) => true, |
1438 | | } |
1439 | 0 | } |
1440 | | |
1441 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1442 | 0 | fn search_slots( |
1443 | 0 | &self, |
1444 | 0 | cache: &mut Cache, |
1445 | 0 | input: &Input<'_>, |
1446 | 0 | slots: &mut [Option<NonMaxUsize>], |
1447 | 0 | ) -> Option<PatternID> { |
1448 | 0 | if input.get_anchored().is_anchored() { |
1449 | 0 | return self.core.search_slots(cache, input, slots); |
1450 | 0 | } |
1451 | 0 | if !self.core.is_capture_search_needed(slots.len()) { |
1452 | | trace!("asked for slots unnecessarily, trying fast path"); |
1453 | 0 | let m = self.search(cache, input)?; |
1454 | 0 | copy_match_to_slots(m, slots); |
1455 | 0 | return Some(m.pattern()); |
1456 | 0 | } |
1457 | 0 | let hm_start = match self.try_search_half_start(cache, input) { |
1458 | 0 | Err(RetryError::Quadratic(_err)) => { |
1459 | 0 | trace!( |
1460 | 0 | "reverse suffix captures optimization failed: {}", |
1461 | 0 | _err |
1462 | 0 | ); |
1463 | 0 | return self.core.search_slots(cache, input, slots); |
1464 | | } |
1465 | 0 | Err(RetryError::Fail(_err)) => { |
1466 | 0 | trace!( |
1467 | 0 | "reverse suffix reverse fast captures search failed: {}", |
1468 | 0 | _err |
1469 | 0 | ); |
1470 | 0 | return self.core.search_slots_nofail(cache, input, slots); |
1471 | | } |
1472 | 0 | Ok(None) => return None, |
1473 | 0 | Ok(Some(hm_start)) => hm_start, |
1474 | 0 | }; |
1475 | 0 | trace!( |
1476 | 0 | "match found at {}..{} in capture search, \ |
1477 | 0 | using another engine to find captures", |
1478 | 0 | hm_start.offset(), |
1479 | 0 | input.end(), |
1480 | 0 | ); |
1481 | 0 | let start = hm_start.offset(); |
1482 | 0 | let input = input |
1483 | 0 | .clone() |
1484 | 0 | .span(start..input.end()) |
1485 | 0 | .anchored(Anchored::Pattern(hm_start.pattern())); |
1486 | 0 | self.core.search_slots_nofail(cache, &input, slots) |
1487 | 0 | } |
1488 | | |
1489 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1490 | 0 | fn which_overlapping_matches( |
1491 | 0 | &self, |
1492 | 0 | cache: &mut Cache, |
1493 | 0 | input: &Input<'_>, |
1494 | 0 | patset: &mut PatternSet, |
1495 | 0 | ) { |
1496 | 0 | self.core.which_overlapping_matches(cache, input, patset) |
1497 | 0 | } |
1498 | | } |
1499 | | |
1500 | | #[derive(Debug)] |
1501 | | struct ReverseInner { |
1502 | | core: Core, |
1503 | | preinner: Prefilter, |
1504 | | nfarev: NFA, |
1505 | | hybrid: wrappers::ReverseHybrid, |
1506 | | dfa: wrappers::ReverseDFA, |
1507 | | } |
1508 | | |
1509 | | impl ReverseInner { |
1510 | 2 | fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseInner, Core> { |
1511 | 2 | if !core.info.config().get_auto_prefilter() { |
1512 | | debug!( |
1513 | | "skipping reverse inner optimization because \ |
1514 | | automatic prefilters are disabled" |
1515 | | ); |
1516 | 0 | return Err(core); |
1517 | 2 | } |
1518 | 2 | // Currently we hard-code the assumption of leftmost-first match |
1519 | 2 | // semantics. This isn't a huge deal because 'all' semantics tend to |
1520 | 2 | // only be used for forward overlapping searches with multiple regexes, |
1521 | 2 | // and this optimization only supports a single pattern at the moment. |
1522 | 2 | if core.info.config().get_match_kind() != MatchKind::LeftmostFirst { |
1523 | | debug!( |
1524 | | "skipping reverse inner optimization because \ |
1525 | | match kind is {:?} but this only supports leftmost-first", |
1526 | | core.info.config().get_match_kind(), |
1527 | | ); |
1528 | 0 | return Err(core); |
1529 | 2 | } |
1530 | 2 | // It's likely that a reverse inner scan has too much overhead for it |
1531 | 2 | // to be worth it when the regex is anchored at the start. It is |
1532 | 2 | // possible for it to be quite a bit faster if the initial literal |
1533 | 2 | // scan fails to detect a match, in which case, we can say "no match" |
1534 | 2 | // very quickly. But this could be undesirable, e.g., scanning too far |
1535 | 2 | // or when the literal scan matches. If it matches, then confirming the |
1536 | 2 | // match requires a reverse scan followed by a forward scan to confirm |
1537 | 2 | // or reject, which is a fair bit of work. |
1538 | 2 | // |
1539 | 2 | // Note that the caller can still request an anchored search even |
1540 | 2 | // when the regex isn't anchored. We detect that case in the search |
1541 | 2 | // routines below and just fallback to the core engine. Currently this |
1542 | 2 | // optimization assumes all searches are unanchored, so if we do want |
1543 | 2 | // to enable this optimization for anchored searches, it will need a |
1544 | 2 | // little work to support it. |
1545 | 2 | if core.info.is_always_anchored_start() { |
1546 | | debug!( |
1547 | | "skipping reverse inner optimization because \ |
1548 | | the regex is always anchored at the start", |
1549 | | ); |
1550 | 1 | return Err(core); |
1551 | 1 | } |
1552 | 1 | // Only DFAs can do reverse searches (currently), so we need one of |
1553 | 1 | // them in order to do this optimization. It's possible (although |
1554 | 1 | // pretty unlikely) that we have neither and need to give up. |
1555 | 1 |         if !core.hybrid.is_some() && !core.dfa.is_some() {
1556 | | debug!( |
1557 | | "skipping reverse inner optimization because \ |
1558 | | we don't have a lazy DFA or a full DFA" |
1559 | | ); |
1560 | 0 | return Err(core); |
1561 | 1 | } |
1562 | 1 |         if core.pre.as_ref().map_or(false, |p| p.is_fast()) {
1563 | | debug!( |
1564 | | "skipping reverse inner optimization because \ |
1565 | | we already have a prefilter that we think is fast" |
1566 | | ); |
1567 | 0 | return Err(core); |
1568 | 1 | } else if core.pre.is_some() { |
1569 | 0 | debug!( |
1570 | 0 | "core engine has a prefix prefilter, but it is \ |
1571 | 0 | probably not fast, so continuing with attempt to \ |
1572 | 0 | use reverse inner prefilter" |
1573 | 0 | ); |
1574 | 1 | } |
1575 | 1 |         let (concat_prefix, preinner) = match reverse_inner::extract(hirs) {
1576 | 0 | Some(x) => x, |
1577 | | // N.B. the 'extract' function emits debug messages explaining |
1578 | | // why we bailed out here. |
1579 | 1 | None => return Err(core), |
1580 | | }; |
1581 | | debug!("building reverse NFA for prefix before inner literal"); |
1582 | 0 | let mut lookm = LookMatcher::new(); |
1583 | 0 | lookm.set_line_terminator(core.info.config().get_line_terminator()); |
1584 | 0 | let thompson_config = thompson::Config::new() |
1585 | 0 | .reverse(true) |
1586 | 0 | .utf8(core.info.config().get_utf8_empty()) |
1587 | 0 | .nfa_size_limit(core.info.config().get_nfa_size_limit()) |
1588 | 0 | .shrink(false) |
1589 | 0 | .which_captures(WhichCaptures::None) |
1590 | 0 | .look_matcher(lookm); |
1591 | 0 | let result = thompson::Compiler::new() |
1592 | 0 | .configure(thompson_config) |
1593 | 0 | .build_from_hir(&concat_prefix); |
1594 | 0 | let nfarev = match result { |
1595 | 0 | Ok(nfarev) => nfarev, |
1596 | 0 | Err(_err) => { |
1597 | 0 | debug!( |
1598 | 0 | "skipping reverse inner optimization because the \ |
1599 | 0 | reverse NFA failed to build: {}", |
1600 | 0 | _err, |
1601 | 0 | ); |
1602 | 0 | return Err(core); |
1603 | | } |
1604 | | }; |
1605 | | debug!("building reverse DFA for prefix before inner literal"); |
1606 | 0 | let dfa = if !core.info.config().get_dfa() { |
1607 | 0 | wrappers::ReverseDFA::none() |
1608 | | } else { |
1609 | 0 | wrappers::ReverseDFA::new(&core.info, &nfarev) |
1610 | | }; |
1611 | 0 | let hybrid = if !core.info.config().get_hybrid() { |
1612 | 0 | wrappers::ReverseHybrid::none() |
1613 | 0 | } else if dfa.is_some() { |
1614 | | debug!( |
1615 | | "skipping lazy DFA for reverse inner optimization \ |
1616 | | because we have a full DFA" |
1617 | | ); |
1618 | 0 | wrappers::ReverseHybrid::none() |
1619 | | } else { |
1620 | 0 | wrappers::ReverseHybrid::new(&core.info, &nfarev) |
1621 | | }; |
1622 | 0 | Ok(ReverseInner { core, preinner, nfarev, hybrid, dfa }) |
1623 | 2 | } |
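
The reverse NFA construction above can be sketched against the crate's public Thompson API. The pattern "abc" below stands in for the extracted concat_prefix (which is an Hir, not a pattern string, in the code above), so this is only an illustration of the configuration, not of reverse_inner::extract:

    // Sketch of the reverse, capture-free NFA configuration used above;
    // "abc" is a placeholder for the extracted prefix.
    use regex_automata::nfa::thompson::{self, NFA, WhichCaptures};
    use regex_automata::util::look::LookMatcher;

    fn build_reverse_prefix_nfa() -> Result<NFA, thompson::BuildError> {
        let config = thompson::Config::new()
            .reverse(true)
            .shrink(false)
            .which_captures(WhichCaptures::None)
            .look_matcher(LookMatcher::new());
        thompson::Compiler::new().configure(config).build("abc")
    }
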
1624 | | |
1625 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1626 | 0 | fn try_search_full( |
1627 | 0 | &self, |
1628 | 0 | cache: &mut Cache, |
1629 | 0 | input: &Input<'_>, |
1630 | 0 | ) -> Result<Option<Match>, RetryError> { |
1631 | 0 | let mut span = input.get_span(); |
1632 | 0 | let mut min_match_start = 0; |
1633 | 0 | let mut min_pre_start = 0; |
1634 | | loop { |
1635 | 0 | let litmatch = match self.preinner.find(input.haystack(), span) { |
1636 | 0 | None => return Ok(None), |
1637 | 0 | Some(span) => span, |
1638 | 0 | }; |
1639 | 0 | if litmatch.start < min_pre_start { |
1640 | | trace!( |
1641 | | "found inner prefilter match at {:?}, which starts \ |
1642 | | before the end of the last forward scan at {}, \ |
1643 | | quitting to avoid quadratic behavior", |
1644 | | litmatch, |
1645 | | min_pre_start, |
1646 | | ); |
1647 | 0 | return Err(RetryError::Quadratic(RetryQuadraticError::new())); |
1648 | 0 | } |
1649 | 0 | trace!("reverse inner scan found inner match at {:?}", litmatch); |
1650 | 0 | let revinput = input |
1651 | 0 | .clone() |
1652 | 0 | .anchored(Anchored::Yes) |
1653 | 0 | .span(input.start()..litmatch.start); |
1654 | 0 | // Note that in addition to the literal search above scanning past |
1655 | 0 | // our minimum start point, this routine can also return an error |
1656 | 0 | // as a result of detecting possible quadratic behavior if the |
1657 | 0 | // reverse scan goes past the minimum start point. That is, the |
1658 | 0 | // literal search might not, but the reverse regex search for the |
1659 | 0 | // prefix might! |
1660 | 0 | match self.try_search_half_rev_limited( |
1661 | 0 | cache, |
1662 | 0 | &revinput, |
1663 | 0 | min_match_start, |
1664 | 0 | )? { |
1665 | | None => { |
1666 | 0 | if span.start >= span.end { |
1667 | 0 | break; |
1668 | 0 | } |
1669 | 0 | span.start = litmatch.start.checked_add(1).unwrap(); |
1670 | | } |
1671 | 0 | Some(hm_start) => { |
1672 | 0 | let fwdinput = input |
1673 | 0 | .clone() |
1674 | 0 | .anchored(Anchored::Pattern(hm_start.pattern())) |
1675 | 0 | .span(hm_start.offset()..input.end()); |
1676 | 0 | match self.try_search_half_fwd_stopat(cache, &fwdinput)? { |
1677 | 0 | Err(stopat) => { |
1678 | 0 | min_pre_start = stopat; |
1679 | 0 | span.start = |
1680 | 0 | litmatch.start.checked_add(1).unwrap(); |
1681 | 0 | } |
1682 | 0 | Ok(hm_end) => { |
1683 | 0 | return Ok(Some(Match::new( |
1684 | 0 | hm_start.pattern(), |
1685 | 0 | hm_start.offset()..hm_end.offset(), |
1686 | 0 | ))) |
1687 | | } |
1688 | | } |
1689 | | } |
1690 | | } |
1691 | 0 | min_match_start = litmatch.end; |
1692 | | } |
1693 | 0 | Ok(None) |
1694 | 0 | } |
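
The quadratic-behavior note inside try_search_full above is easiest to see with a pathological input. A self-contained sketch (made-up numbers, no library calls) counting how much a reverse scan would revisit with and without the min_start cap used above:

    // Self-contained sketch of the min_start guard; the candidate positions
    // are invented and stand in for inner-prefilter hits.
    fn main() {
        let haystack_len = 1_000usize;
        // Worst case: the prefilter reports a candidate at every position,
        // and the reverse prefix scan could otherwise run back to 0 each time.
        let candidates: Vec<usize> = (0..haystack_len).collect();

        // Naive: every candidate rescans everything before it => O(n^2).
        let naive: usize = candidates.iter().sum();

        // Guarded: each reverse scan stops at the previous candidate, so
        // every position is revisited at most once => O(n).
        let mut min_start = 0;
        let mut guarded = 0;
        for &c in &candidates {
            guarded += c - min_start;
            min_start = c;
        }

        assert!(naive > 100 * guarded);
        println!("naive = {naive}, guarded = {guarded}");
    }
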
1695 | | |
1696 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1697 | 0 | fn try_search_half_fwd_stopat( |
1698 | 0 | &self, |
1699 | 0 | cache: &mut Cache, |
1700 | 0 | input: &Input<'_>, |
1701 | 0 | ) -> Result<Result<HalfMatch, usize>, RetryFailError> { |
1702 | 0 | if let Some(e) = self.core.dfa.get(&input) { |
1703 | | trace!( |
1704 | | "using full DFA for forward reverse inner search at {:?}", |
1705 | | input.get_span() |
1706 | | ); |
1707 | 0 | e.try_search_half_fwd_stopat(&input) |
1708 | 0 | } else if let Some(e) = self.core.hybrid.get(&input) { |
1709 | | trace!( |
1710 | | "using lazy DFA for forward reverse inner search at {:?}", |
1711 | | input.get_span() |
1712 | | ); |
1713 | 0 | e.try_search_half_fwd_stopat(&mut cache.hybrid, &input) |
1714 | | } else { |
1715 | 0 | unreachable!("ReverseInner always has a DFA") |
1716 | | } |
1717 | 0 | } |
1718 | | |
1719 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1720 | 0 | fn try_search_half_rev_limited( |
1721 | 0 | &self, |
1722 | 0 | cache: &mut Cache, |
1723 | 0 | input: &Input<'_>, |
1724 | 0 | min_start: usize, |
1725 | 0 | ) -> Result<Option<HalfMatch>, RetryError> { |
1726 | 0 | if let Some(e) = self.dfa.get(&input) { |
1727 | | trace!( |
1728 | | "using full DFA for reverse inner search at {:?}, \ |
1729 | | but will be stopped at {} to avoid quadratic behavior", |
1730 | | input.get_span(), |
1731 | | min_start, |
1732 | | ); |
1733 | 0 | e.try_search_half_rev_limited(&input, min_start) |
1734 | 0 | } else if let Some(e) = self.hybrid.get(&input) { |
1735 | | trace!( |
1736 | | "using lazy DFA for reverse inner search at {:?}, \ |
1737 | | but will be stopped at {} to avoid quadratic behavior", |
1738 | | input.get_span(), |
1739 | | min_start, |
1740 | | ); |
1741 | 0 | e.try_search_half_rev_limited( |
1742 | 0 | &mut cache.revhybrid, |
1743 | 0 | &input, |
1744 | 0 | min_start, |
1745 | 0 | ) |
1746 | | } else { |
1747 | 0 | unreachable!("ReverseInner always has a DFA") |
1748 | | } |
1749 | 0 | } |
1750 | | } |
1751 | | |
1752 | | impl Strategy for ReverseInner { |
1753 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1754 | 0 | fn group_info(&self) -> &GroupInfo { |
1755 | 0 | self.core.group_info() |
1756 | 0 | } |
1757 | | |
1758 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1759 | 0 | fn create_cache(&self) -> Cache { |
1760 | 0 | let mut cache = self.core.create_cache(); |
1761 | 0 | cache.revhybrid = self.hybrid.create_cache(); |
1762 | 0 | cache |
1763 | 0 | } |
1764 | | |
1765 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1766 | 0 | fn reset_cache(&self, cache: &mut Cache) { |
1767 | 0 | self.core.reset_cache(cache); |
1768 | 0 | cache.revhybrid.reset(&self.hybrid); |
1769 | 0 | } |
1770 | | |
1771 | 0 | fn is_accelerated(&self) -> bool { |
1772 | 0 | self.preinner.is_fast() |
1773 | 0 | } |
1774 | | |
1775 | 0 | fn memory_usage(&self) -> usize { |
1776 | 0 | self.core.memory_usage() |
1777 | 0 | + self.preinner.memory_usage() |
1778 | 0 | + self.nfarev.memory_usage() |
1779 | 0 | + self.dfa.memory_usage() |
1780 | 0 | } |
1781 | | |
1782 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1783 | 0 | fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> { |
1784 | 0 | if input.get_anchored().is_anchored() { |
1785 | 0 | return self.core.search(cache, input); |
1786 | 0 | } |
1787 | 0 | match self.try_search_full(cache, input) { |
1788 | 0 | Err(RetryError::Quadratic(_err)) => { |
1789 | 0 | trace!("reverse inner optimization failed: {}", _err); |
1790 | 0 | self.core.search(cache, input) |
1791 | | } |
1792 | 0 | Err(RetryError::Fail(_err)) => { |
1793 | 0 | trace!("reverse inner fast search failed: {}", _err); |
1794 | 0 | self.core.search_nofail(cache, input) |
1795 | | } |
1796 | 0 | Ok(matornot) => matornot, |
1797 | | } |
1798 | 0 | } |
1799 | | |
1800 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1801 | 0 | fn search_half( |
1802 | 0 | &self, |
1803 | 0 | cache: &mut Cache, |
1804 | 0 | input: &Input<'_>, |
1805 | 0 | ) -> Option<HalfMatch> { |
1806 | 0 | if input.get_anchored().is_anchored() { |
1807 | 0 | return self.core.search_half(cache, input); |
1808 | 0 | } |
1809 | 0 | match self.try_search_full(cache, input) { |
1810 | 0 | Err(RetryError::Quadratic(_err)) => { |
1811 | 0 | trace!("reverse inner half optimization failed: {}", _err); |
1812 | 0 | self.core.search_half(cache, input) |
1813 | | } |
1814 | 0 | Err(RetryError::Fail(_err)) => { |
1815 | 0 | trace!("reverse inner fast half search failed: {}", _err); |
1816 | 0 | self.core.search_half_nofail(cache, input) |
1817 | | } |
1818 | 0 | Ok(None) => None, |
1819 | 0 | Ok(Some(m)) => Some(HalfMatch::new(m.pattern(), m.end())), |
1820 | | } |
1821 | 0 | } |
1822 | | |
1823 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1824 | 0 | fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { |
1825 | 0 | if input.get_anchored().is_anchored() { |
1826 | 0 | return self.core.is_match(cache, input); |
1827 | 0 | } |
1828 | 0 | match self.try_search_full(cache, input) { |
1829 | 0 | Err(RetryError::Quadratic(_err)) => { |
1830 | 0 | trace!("reverse inner half optimization failed: {}", _err); |
1831 | 0 | self.core.is_match_nofail(cache, input) |
1832 | | } |
1833 | 0 | Err(RetryError::Fail(_err)) => { |
1834 | 0 | trace!("reverse inner fast half search failed: {}", _err); |
1835 | 0 | self.core.is_match_nofail(cache, input) |
1836 | | } |
1837 | 0 | Ok(None) => false, |
1838 | 0 | Ok(Some(_)) => true, |
1839 | | } |
1840 | 0 | } |
1841 | | |
1842 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1843 | 0 | fn search_slots( |
1844 | 0 | &self, |
1845 | 0 | cache: &mut Cache, |
1846 | 0 | input: &Input<'_>, |
1847 | 0 | slots: &mut [Option<NonMaxUsize>], |
1848 | 0 | ) -> Option<PatternID> { |
1849 | 0 | if input.get_anchored().is_anchored() { |
1850 | 0 | return self.core.search_slots(cache, input, slots); |
1851 | 0 | } |
1852 | 0 | if !self.core.is_capture_search_needed(slots.len()) { |
1853 | | trace!("asked for slots unnecessarily, trying fast path"); |
1854 | 0 | let m = self.search(cache, input)?; |
1855 | 0 | copy_match_to_slots(m, slots); |
1856 | 0 | return Some(m.pattern()); |
1857 | 0 | } |
1858 | 0 | let m = match self.try_search_full(cache, input) { |
1859 | 0 | Err(RetryError::Quadratic(_err)) => { |
1860 | 0 | trace!("reverse inner captures optimization failed: {}", _err); |
1861 | 0 | return self.core.search_slots(cache, input, slots); |
1862 | | } |
1863 | 0 | Err(RetryError::Fail(_err)) => { |
1864 | 0 | trace!("reverse inner fast captures search failed: {}", _err); |
1865 | 0 | return self.core.search_slots_nofail(cache, input, slots); |
1866 | | } |
1867 | 0 | Ok(None) => return None, |
1868 | 0 | Ok(Some(m)) => m, |
1869 | 0 | }; |
1870 | 0 | trace!( |
1871 | 0 | "match found at {}..{} in capture search, \ |
1872 | 0 | using another engine to find captures", |
1873 | 0 | m.start(), |
1874 | 0 | m.end(), |
1875 | 0 | ); |
1876 | 0 | let input = input |
1877 | 0 | .clone() |
1878 | 0 | .span(m.start()..m.end()) |
1879 | 0 | .anchored(Anchored::Pattern(m.pattern())); |
1880 | 0 | self.core.search_slots_nofail(cache, &input, slots) |
1881 | 0 | } |
1882 | | |
1883 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1884 | 0 | fn which_overlapping_matches( |
1885 | 0 | &self, |
1886 | 0 | cache: &mut Cache, |
1887 | 0 | input: &Input<'_>, |
1888 | 0 | patset: &mut PatternSet, |
1889 | 0 | ) { |
1890 | 0 | self.core.which_overlapping_matches(cache, input, patset) |
1891 | 0 | } |
1892 | | } |
1893 | | |
1894 | | /// Copies the offsets in the given match to the corresponding positions in |
1895 | | /// `slots`. |
1896 | | /// |
1897 | | /// In effect, this sets the slots corresponding to the implicit group for the |
1898 | | /// pattern in the given match. If the indices for the corresponding slots do |
1899 | | /// not exist, then no slots are set. |
1900 | | /// |
1901 | | /// This is useful when the caller provides slots (or captures), but you use a |
1902 | | /// regex engine that doesn't operate on slots (like a lazy DFA). This function |
1903 | | /// lets you map the match you get back to the slots provided by the caller. |
1904 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1905 | 0 | fn copy_match_to_slots(m: Match, slots: &mut [Option<NonMaxUsize>]) { |
1906 | 0 | let slot_start = m.pattern().as_usize() * 2; |
1907 | 0 | let slot_end = slot_start + 1; |
1908 | 0 | if let Some(slot) = slots.get_mut(slot_start) { |
1909 | 0 | *slot = NonMaxUsize::new(m.start()); |
1910 | 0 | } |
1911 | 0 | if let Some(slot) = slots.get_mut(slot_end) { |
1912 | 0 | *slot = NonMaxUsize::new(m.end()); |
1913 | 0 | } |
1914 | 0 | } |
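
The slot layout that copy_match_to_slots relies on (the implicit group of pattern p occupies slots 2*p and 2*p + 1) can be restated as a standalone sketch; plain usize stands in for NonMaxUsize purely to keep the example small:

    // Standalone illustration of the implicit-group slot layout.
    fn copy_to_slots(pattern: usize, start: usize, end: usize, slots: &mut [Option<usize>]) {
        let slot_start = pattern * 2;
        let slot_end = slot_start + 1;
        if let Some(slot) = slots.get_mut(slot_start) {
            *slot = Some(start);
        }
        if let Some(slot) = slots.get_mut(slot_end) {
            *slot = Some(end);
        }
    }

    fn main() {
        // Two patterns => four implicit slots. A match for pattern 1 at 3..7
        // lands in slots 2 and 3; pattern 0's slots are left untouched.
        let mut slots = vec![None; 4];
        copy_to_slots(1, 3, 7, &mut slots);
        assert_eq!(slots, vec![None, None, Some(3), Some(7)]);
    }
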