/build/cargo-vendor-dir/memchr-2.7.1/src/arch/all/twoway.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | An implementation of the [Two-Way substring search algorithm][two-way]. |
3 | | |
4 | | [`Finder`] can be built for forward searches, while [`FinderRev`] can be built |
5 | | for reverse searches. |
6 | | |
7 | | Two-Way makes for a nice general purpose substring search algorithm because of |
8 | | its time and space complexity properties. It also performs well in practice. |
9 | | Namely, with `m = len(needle)` and `n = len(haystack)`, Two-Way takes `O(m)` |
10 | | time to create a finder, `O(1)` space and `O(n)` search time. In other words, |
11 | | the preprocessing step is quick, doesn't require any heap memory and the worst |
12 | | case search time is guaranteed to be linear in the haystack regardless of the |
13 | | size of the needle. |
14 | | |
15 | | While vector algorithms will usually beat Two-Way handedly, vector algorithms |
16 | | also usually have pathological or edge cases that are better handled by Two-Way. |
17 | | Moreover, not all targets support vector algorithms or implementations for them |
18 | | simply may not exist yet. |
19 | | |
20 | | Two-Way can be found in the `memmem` implementations in at least [GNU libc] and |
21 | | [musl]. |
22 | | |
23 | | [two-way]: https://en.wikipedia.org/wiki/Two-way_string-matching_algorithm |
24 | | [GNU libc]: https://www.gnu.org/software/libc/ |
25 | | [musl]: https://www.musl-libc.org/ |
26 | | */ |
27 | | |
28 | | use core::cmp; |
29 | | |
30 | | use crate::{ |
31 | | arch::all::{is_prefix, is_suffix}, |
32 | | memmem::Pre, |
33 | | }; |
34 | | |
35 | | /// A forward substring searcher that uses the Two-Way algorithm. |
36 | | #[derive(Clone, Copy, Debug)] |
37 | | pub struct Finder(TwoWay); |
38 | | |
39 | | /// A reverse substring searcher that uses the Two-Way algorithm. |
40 | | #[derive(Clone, Copy, Debug)] |
41 | | pub struct FinderRev(TwoWay); |
42 | | |
43 | | /// An implementation of the TwoWay substring search algorithm. |
44 | | /// |
45 | | /// This searcher supports forward and reverse search, although not |
46 | | /// simultaneously. It runs in `O(n + m)` time and `O(1)` space, where |
47 | | /// `n ~ len(needle)` and `m ~ len(haystack)`. |
48 | | /// |
49 | | /// The implementation here roughly matches that which was developed by |
50 | | /// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The |
51 | | /// changes in this implementation are 1) the use of zero-based indices, 2) a |
52 | | /// heuristic skip table based on the last byte (borrowed from Rust's standard |
53 | | /// library) and 3) the addition of heuristics for a fast skip loop. For (3), |
54 | | /// callers can pass any kind of prefilter they want, but usually it's one |
55 | | /// based on a heuristic that uses an approximate background frequency of bytes |
56 | | /// to choose rare bytes to quickly look for candidate match positions. Note |
57 | | /// though that currently, this prefilter functionality is not exposed directly |
58 | | /// in the public API. (File an issue if you want it and provide a use case |
59 | | /// please.) |
60 | | /// |
61 | | /// The heuristic for fast skipping is automatically shut off if it's |
62 | | /// detected to be ineffective at search time. Generally, this only occurs in |
63 | | /// pathological cases. But this is generally necessary in order to preserve |
64 | | /// a `O(n + m)` time bound. |
65 | | /// |
66 | | /// The code below is fairly complex and not obviously correct at all. It's |
67 | | /// likely necessary to read the Two-Way paper cited above in order to fully |
68 | | /// grok this code. The essence of it is: |
69 | | /// |
70 | | /// 1. Do something to detect a "critical" position in the needle. |
71 | | /// 2. For the current position in the haystack, look if `needle[critical..]` |
72 | | /// matches at that position. |
73 | | /// 3. If so, look if `needle[..critical]` matches. |
74 | | /// 4. If a mismatch occurs, shift the search by some amount based on the |
75 | | /// critical position and a pre-computed shift. |
76 | | /// |
77 | | /// This type is wrapped in the forward and reverse finders that expose |
78 | | /// consistent forward or reverse APIs. |
79 | | #[derive(Clone, Copy, Debug)] |
80 | | struct TwoWay { |
81 | | /// A small bitset used as a quick prefilter (in addition to any prefilter |
82 | | /// given by the caller). Namely, a bit `i` is set if and only if `b%64==i` |
83 | | /// for any `b == needle[i]`. |
84 | | /// |
85 | | /// When used as a prefilter, if the last byte at the current candidate |
86 | | /// position is NOT in this set, then we can skip that entire candidate |
87 | | /// position (the length of the needle). This is essentially the shift |
88 | | /// trick found in Boyer-Moore, but only applied to bytes that don't appear |
89 | | /// in the needle. |
90 | | /// |
91 | | /// N.B. This trick was inspired by something similar in std's |
92 | | /// implementation of Two-Way. |
93 | | byteset: ApproximateByteSet, |
94 | | /// A critical position in needle. Specifically, this position corresponds |
95 | | /// to beginning of either the minimal or maximal suffix in needle. (N.B. |
96 | | /// See SuffixType below for why "minimal" isn't quite the correct word |
97 | | /// here.) |
98 | | /// |
99 | | /// This is the position at which every search begins. Namely, search |
100 | | /// starts by scanning text to the right of this position, and only if |
101 | | /// there's a match does the text to the left of this position get scanned. |
102 | | critical_pos: usize, |
103 | | /// The amount we shift by in the Two-Way search algorithm. This |
104 | | /// corresponds to the "small period" and "large period" cases. |
105 | | shift: Shift, |
106 | | } |
107 | | |
108 | | impl Finder { |
109 | | /// Create a searcher that finds occurrences of the given `needle`. |
110 | | /// |
111 | | /// An empty `needle` results in a match at every position in a haystack, |
112 | | /// including at `haystack.len()`. |
113 | | #[inline] |
114 | 0 | pub fn new(needle: &[u8]) -> Finder { |
115 | 0 | let byteset = ApproximateByteSet::new(needle); |
116 | 0 | let min_suffix = Suffix::forward(needle, SuffixKind::Minimal); |
117 | 0 | let max_suffix = Suffix::forward(needle, SuffixKind::Maximal); |
118 | 0 | let (period_lower_bound, critical_pos) = |
119 | 0 | if min_suffix.pos > max_suffix.pos { |
120 | 0 | (min_suffix.period, min_suffix.pos) |
121 | | } else { |
122 | 0 | (max_suffix.period, max_suffix.pos) |
123 | | }; |
124 | 0 | let shift = Shift::forward(needle, period_lower_bound, critical_pos); |
125 | 0 | Finder(TwoWay { byteset, critical_pos, shift }) |
126 | 0 | } |
127 | | |
128 | | /// Returns the first occurrence of `needle` in the given `haystack`, or |
129 | | /// `None` if no such occurrence could be found. |
130 | | /// |
131 | | /// The `needle` given must be the same as the `needle` provided to |
132 | | /// [`Finder::new`]. |
133 | | /// |
134 | | /// An empty `needle` results in a match at every position in a haystack, |
135 | | /// including at `haystack.len()`. |
136 | | #[inline] |
137 | 0 | pub fn find(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> { |
138 | 0 | self.find_with_prefilter(None, haystack, needle) |
139 | 0 | } |
140 | | |
141 | | /// This is like [`Finder::find`], but it accepts a prefilter for |
142 | | /// accelerating searches. |
143 | | /// |
144 | | /// Currently this is not exposed in the public API because, at the time |
145 | | /// of writing, I didn't want to spend time thinking about how to expose |
146 | | /// the prefilter infrastructure (if at all). If you have a compelling use |
147 | | /// case for exposing this routine, please create an issue. Do *not* open |
148 | | /// a PR that just exposes `Pre` and friends. Exporting this routine will |
149 | | /// require API design. |
150 | | #[inline(always)] |
151 | 0 | pub(crate) fn find_with_prefilter( |
152 | 0 | &self, |
153 | 0 | pre: Option<Pre<'_>>, |
154 | 0 | haystack: &[u8], |
155 | 0 | needle: &[u8], |
156 | 0 | ) -> Option<usize> { |
157 | 0 | match self.0.shift { |
158 | 0 | Shift::Small { period } => { |
159 | 0 | self.find_small_imp(pre, haystack, needle, period) |
160 | | } |
161 | 0 | Shift::Large { shift } => { |
162 | 0 | self.find_large_imp(pre, haystack, needle, shift) |
163 | | } |
164 | | } |
165 | 0 | } |
166 | | |
167 | | // Each of the two search implementations below can be accelerated by a |
168 | | // prefilter, but it is not always enabled. To avoid its overhead when |
169 | | // its disabled, we explicitly inline each search implementation based on |
170 | | // whether a prefilter will be used or not. The decision on which to use |
171 | | // is made in the parent meta searcher. |
172 | | |
173 | | #[inline(always)] |
174 | 0 | fn find_small_imp( |
175 | 0 | &self, |
176 | 0 | mut pre: Option<Pre<'_>>, |
177 | 0 | haystack: &[u8], |
178 | 0 | needle: &[u8], |
179 | 0 | period: usize, |
180 | 0 | ) -> Option<usize> { |
181 | 0 | let mut pos = 0; |
182 | 0 | let mut shift = 0; |
183 | 0 | let last_byte_pos = match needle.len().checked_sub(1) { |
184 | 0 | None => return Some(pos), |
185 | 0 | Some(last_byte) => last_byte, |
186 | | }; |
187 | 0 | while pos + needle.len() <= haystack.len() { |
188 | 0 | let mut i = cmp::max(self.0.critical_pos, shift); |
189 | 0 | if let Some(pre) = pre.as_mut() { |
190 | 0 | if pre.is_effective() { |
191 | 0 | pos += pre.find(&haystack[pos..])?; |
192 | 0 | shift = 0; |
193 | 0 | i = self.0.critical_pos; |
194 | 0 | if pos + needle.len() > haystack.len() { |
195 | 0 | return None; |
196 | 0 | } |
197 | 0 | } |
198 | 0 | } |
199 | 0 | if !self.0.byteset.contains(haystack[pos + last_byte_pos]) { |
200 | 0 | pos += needle.len(); |
201 | 0 | shift = 0; |
202 | 0 | continue; |
203 | 0 | } |
204 | 0 | while i < needle.len() && needle[i] == haystack[pos + i] { |
205 | 0 | i += 1; |
206 | 0 | } |
207 | 0 | if i < needle.len() { |
208 | 0 | pos += i - self.0.critical_pos + 1; |
209 | 0 | shift = 0; |
210 | 0 | } else { |
211 | 0 | let mut j = self.0.critical_pos; |
212 | 0 | while j > shift && needle[j] == haystack[pos + j] { |
213 | 0 | j -= 1; |
214 | 0 | } |
215 | 0 | if j <= shift && needle[shift] == haystack[pos + shift] { |
216 | 0 | return Some(pos); |
217 | 0 | } |
218 | 0 | pos += period; |
219 | 0 | shift = needle.len() - period; |
220 | | } |
221 | | } |
222 | 0 | None |
223 | 0 | } |
224 | | |
225 | | #[inline(always)] |
226 | 0 | fn find_large_imp( |
227 | 0 | &self, |
228 | 0 | mut pre: Option<Pre<'_>>, |
229 | 0 | haystack: &[u8], |
230 | 0 | needle: &[u8], |
231 | 0 | shift: usize, |
232 | 0 | ) -> Option<usize> { |
233 | 0 | let mut pos = 0; |
234 | 0 | let last_byte_pos = match needle.len().checked_sub(1) { |
235 | 0 | None => return Some(pos), |
236 | 0 | Some(last_byte) => last_byte, |
237 | | }; |
238 | 0 | 'outer: while pos + needle.len() <= haystack.len() { |
239 | 0 | if let Some(pre) = pre.as_mut() { |
240 | 0 | if pre.is_effective() { |
241 | 0 | pos += pre.find(&haystack[pos..])?; |
242 | 0 | if pos + needle.len() > haystack.len() { |
243 | 0 | return None; |
244 | 0 | } |
245 | 0 | } |
246 | 0 | } |
247 | | |
248 | 0 | if !self.0.byteset.contains(haystack[pos + last_byte_pos]) { |
249 | 0 | pos += needle.len(); |
250 | 0 | continue; |
251 | 0 | } |
252 | 0 | let mut i = self.0.critical_pos; |
253 | 0 | while i < needle.len() && needle[i] == haystack[pos + i] { |
254 | 0 | i += 1; |
255 | 0 | } |
256 | 0 | if i < needle.len() { |
257 | 0 | pos += i - self.0.critical_pos + 1; |
258 | 0 | } else { |
259 | 0 | for j in (0..self.0.critical_pos).rev() { |
260 | 0 | if needle[j] != haystack[pos + j] { |
261 | 0 | pos += shift; |
262 | 0 | continue 'outer; |
263 | 0 | } |
264 | | } |
265 | 0 | return Some(pos); |
266 | | } |
267 | | } |
268 | 0 | None |
269 | 0 | } |
270 | | } |
271 | | |
272 | | impl FinderRev { |
273 | | /// Create a searcher that finds occurrences of the given `needle`. |
274 | | /// |
275 | | /// An empty `needle` results in a match at every position in a haystack, |
276 | | /// including at `haystack.len()`. |
277 | | #[inline] |
278 | 0 | pub fn new(needle: &[u8]) -> FinderRev { |
279 | 0 | let byteset = ApproximateByteSet::new(needle); |
280 | 0 | let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal); |
281 | 0 | let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal); |
282 | 0 | let (period_lower_bound, critical_pos) = |
283 | 0 | if min_suffix.pos < max_suffix.pos { |
284 | 0 | (min_suffix.period, min_suffix.pos) |
285 | | } else { |
286 | 0 | (max_suffix.period, max_suffix.pos) |
287 | | }; |
288 | 0 | let shift = Shift::reverse(needle, period_lower_bound, critical_pos); |
289 | 0 | FinderRev(TwoWay { byteset, critical_pos, shift }) |
290 | 0 | } |
291 | | |
292 | | /// Returns the last occurrence of `needle` in the given `haystack`, or |
293 | | /// `None` if no such occurrence could be found. |
294 | | /// |
295 | | /// The `needle` given must be the same as the `needle` provided to |
296 | | /// [`FinderRev::new`]. |
297 | | /// |
298 | | /// An empty `needle` results in a match at every position in a haystack, |
299 | | /// including at `haystack.len()`. |
300 | | #[inline] |
301 | 0 | pub fn rfind(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> { |
302 | 0 | // For the reverse case, we don't use a prefilter. It's plausible that |
303 | 0 | // perhaps we should, but it's a lot of additional code to do it, and |
304 | 0 | // it's not clear that it's actually worth it. If you have a really |
305 | 0 | // compelling use case for this, please file an issue. |
306 | 0 | match self.0.shift { |
307 | 0 | Shift::Small { period } => { |
308 | 0 | self.rfind_small_imp(haystack, needle, period) |
309 | | } |
310 | 0 | Shift::Large { shift } => { |
311 | 0 | self.rfind_large_imp(haystack, needle, shift) |
312 | | } |
313 | | } |
314 | 0 | } |
315 | | |
316 | | #[inline(always)] |
317 | 0 | fn rfind_small_imp( |
318 | 0 | &self, |
319 | 0 | haystack: &[u8], |
320 | 0 | needle: &[u8], |
321 | 0 | period: usize, |
322 | 0 | ) -> Option<usize> { |
323 | 0 | let nlen = needle.len(); |
324 | 0 | let mut pos = haystack.len(); |
325 | 0 | let mut shift = nlen; |
326 | 0 | let first_byte = match needle.get(0) { |
327 | 0 | None => return Some(pos), |
328 | 0 | Some(&first_byte) => first_byte, |
329 | | }; |
330 | 0 | while pos >= nlen { |
331 | 0 | if !self.0.byteset.contains(haystack[pos - nlen]) { |
332 | 0 | pos -= nlen; |
333 | 0 | shift = nlen; |
334 | 0 | continue; |
335 | 0 | } |
336 | 0 | let mut i = cmp::min(self.0.critical_pos, shift); |
337 | 0 | while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { |
338 | 0 | i -= 1; |
339 | 0 | } |
340 | 0 | if i > 0 || first_byte != haystack[pos - nlen] { |
341 | 0 | pos -= self.0.critical_pos - i + 1; |
342 | 0 | shift = nlen; |
343 | 0 | } else { |
344 | 0 | let mut j = self.0.critical_pos; |
345 | 0 | while j < shift && needle[j] == haystack[pos - nlen + j] { |
346 | 0 | j += 1; |
347 | 0 | } |
348 | 0 | if j >= shift { |
349 | 0 | return Some(pos - nlen); |
350 | 0 | } |
351 | 0 | pos -= period; |
352 | 0 | shift = period; |
353 | | } |
354 | | } |
355 | 0 | None |
356 | 0 | } |
357 | | |
358 | | #[inline(always)] |
359 | 0 | fn rfind_large_imp( |
360 | 0 | &self, |
361 | 0 | haystack: &[u8], |
362 | 0 | needle: &[u8], |
363 | 0 | shift: usize, |
364 | 0 | ) -> Option<usize> { |
365 | 0 | let nlen = needle.len(); |
366 | 0 | let mut pos = haystack.len(); |
367 | 0 | let first_byte = match needle.get(0) { |
368 | 0 | None => return Some(pos), |
369 | 0 | Some(&first_byte) => first_byte, |
370 | | }; |
371 | 0 | while pos >= nlen { |
372 | 0 | if !self.0.byteset.contains(haystack[pos - nlen]) { |
373 | 0 | pos -= nlen; |
374 | 0 | continue; |
375 | 0 | } |
376 | 0 | let mut i = self.0.critical_pos; |
377 | 0 | while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { |
378 | 0 | i -= 1; |
379 | 0 | } |
380 | 0 | if i > 0 || first_byte != haystack[pos - nlen] { |
381 | 0 | pos -= self.0.critical_pos - i + 1; |
382 | 0 | } else { |
383 | 0 | let mut j = self.0.critical_pos; |
384 | 0 | while j < nlen && needle[j] == haystack[pos - nlen + j] { |
385 | 0 | j += 1; |
386 | 0 | } |
387 | 0 | if j == nlen { |
388 | 0 | return Some(pos - nlen); |
389 | 0 | } |
390 | 0 | pos -= shift; |
391 | | } |
392 | | } |
393 | 0 | None |
394 | 0 | } |
395 | | } |
396 | | |
397 | | /// A representation of the amount we're allowed to shift by during Two-Way |
398 | | /// search. |
399 | | /// |
400 | | /// When computing a critical factorization of the needle, we find the position |
401 | | /// of the critical factorization by finding the needle's maximal (or minimal) |
402 | | /// suffix, along with the period of that suffix. It turns out that the period |
403 | | /// of that suffix is a lower bound on the period of the needle itself. |
404 | | /// |
405 | | /// This lower bound is equivalent to the actual period of the needle in |
406 | | /// some cases. To describe that case, we denote the needle as `x` where |
407 | | /// `x = uv` and `v` is the lexicographic maximal suffix of `v`. The lower |
408 | | /// bound given here is always the period of `v`, which is `<= period(x)`. The |
409 | | /// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and |
410 | | /// where `u` is a suffix of `v[0..period(v)]`. |
411 | | /// |
412 | | /// This case is important because the search algorithm for when the |
413 | | /// periods are equivalent is slightly different than the search algorithm |
414 | | /// for when the periods are not equivalent. In particular, when they aren't |
415 | | /// equivalent, we know that the period of the needle is no less than half its |
416 | | /// length. In this case, we shift by an amount less than or equal to the |
417 | | /// period of the needle (determined by the maximum length of the components |
418 | | /// of the critical factorization of `x`, i.e., `max(len(u), len(v))`).. |
419 | | /// |
420 | | /// The above two cases are represented by the variants below. Each entails |
421 | | /// a different instantiation of the Two-Way search algorithm. |
422 | | /// |
423 | | /// N.B. If we could find a way to compute the exact period in all cases, |
424 | | /// then we could collapse this case analysis and simplify the algorithm. The |
425 | | /// Two-Way paper suggests this is possible, but more reading is required to |
426 | | /// grok why the authors didn't pursue that path. |
427 | | #[derive(Clone, Copy, Debug)] |
428 | | enum Shift { |
429 | | Small { period: usize }, |
430 | | Large { shift: usize }, |
431 | | } |
432 | | |
433 | | impl Shift { |
434 | | /// Compute the shift for a given needle in the forward direction. |
435 | | /// |
436 | | /// This requires a lower bound on the period and a critical position. |
437 | | /// These can be computed by extracting both the minimal and maximal |
438 | | /// lexicographic suffixes, and choosing the right-most starting position. |
439 | | /// The lower bound on the period is then the period of the chosen suffix. |
440 | 0 | fn forward( |
441 | 0 | needle: &[u8], |
442 | 0 | period_lower_bound: usize, |
443 | 0 | critical_pos: usize, |
444 | 0 | ) -> Shift { |
445 | 0 | let large = cmp::max(critical_pos, needle.len() - critical_pos); |
446 | 0 | if critical_pos * 2 >= needle.len() { |
447 | 0 | return Shift::Large { shift: large }; |
448 | 0 | } |
449 | 0 |
|
450 | 0 | let (u, v) = needle.split_at(critical_pos); |
451 | 0 | if !is_suffix(&v[..period_lower_bound], u) { |
452 | 0 | return Shift::Large { shift: large }; |
453 | 0 | } |
454 | 0 | Shift::Small { period: period_lower_bound } |
455 | 0 | } |
456 | | |
457 | | /// Compute the shift for a given needle in the reverse direction. |
458 | | /// |
459 | | /// This requires a lower bound on the period and a critical position. |
460 | | /// These can be computed by extracting both the minimal and maximal |
461 | | /// lexicographic suffixes, and choosing the left-most starting position. |
462 | | /// The lower bound on the period is then the period of the chosen suffix. |
463 | 0 | fn reverse( |
464 | 0 | needle: &[u8], |
465 | 0 | period_lower_bound: usize, |
466 | 0 | critical_pos: usize, |
467 | 0 | ) -> Shift { |
468 | 0 | let large = cmp::max(critical_pos, needle.len() - critical_pos); |
469 | 0 | if (needle.len() - critical_pos) * 2 >= needle.len() { |
470 | 0 | return Shift::Large { shift: large }; |
471 | 0 | } |
472 | 0 |
|
473 | 0 | let (v, u) = needle.split_at(critical_pos); |
474 | 0 | if !is_prefix(&v[v.len() - period_lower_bound..], u) { |
475 | 0 | return Shift::Large { shift: large }; |
476 | 0 | } |
477 | 0 | Shift::Small { period: period_lower_bound } |
478 | 0 | } |
479 | | } |
480 | | |
481 | | /// A suffix extracted from a needle along with its period. |
482 | | #[derive(Debug)] |
483 | | struct Suffix { |
484 | | /// The starting position of this suffix. |
485 | | /// |
486 | | /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this |
487 | | /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for |
488 | | /// forward suffixes, this is an inclusive starting position, where as for |
489 | | /// reverse suffixes, this is an exclusive ending position. |
490 | | pos: usize, |
491 | | /// The period of this suffix. |
492 | | /// |
493 | | /// Note that this is NOT necessarily the period of the string from which |
494 | | /// this suffix comes from. (It is always less than or equal to the period |
495 | | /// of the original string.) |
496 | | period: usize, |
497 | | } |
498 | | |
499 | | impl Suffix { |
500 | 0 | fn forward(needle: &[u8], kind: SuffixKind) -> Suffix { |
501 | 0 | // suffix represents our maximal (or minimal) suffix, along with |
502 | 0 | // its period. |
503 | 0 | let mut suffix = Suffix { pos: 0, period: 1 }; |
504 | 0 | // The start of a suffix in `needle` that we are considering as a |
505 | 0 | // more maximal (or minimal) suffix than what's in `suffix`. |
506 | 0 | let mut candidate_start = 1; |
507 | 0 | // The current offset of our suffixes that we're comparing. |
508 | 0 | // |
509 | 0 | // When the characters at this offset are the same, then we mush on |
510 | 0 | // to the next position since no decision is possible. When the |
511 | 0 | // candidate's character is greater (or lesser) than the corresponding |
512 | 0 | // character than our current maximal (or minimal) suffix, then the |
513 | 0 | // current suffix is changed over to the candidate and we restart our |
514 | 0 | // search. Otherwise, the candidate suffix is no good and we restart |
515 | 0 | // our search on the next candidate. |
516 | 0 | // |
517 | 0 | // The three cases above correspond to the three cases in the loop |
518 | 0 | // below. |
519 | 0 | let mut offset = 0; |
520 | | |
521 | 0 | while candidate_start + offset < needle.len() { |
522 | 0 | let current = needle[suffix.pos + offset]; |
523 | 0 | let candidate = needle[candidate_start + offset]; |
524 | 0 | match kind.cmp(current, candidate) { |
525 | 0 | SuffixOrdering::Accept => { |
526 | 0 | suffix = Suffix { pos: candidate_start, period: 1 }; |
527 | 0 | candidate_start += 1; |
528 | 0 | offset = 0; |
529 | 0 | } |
530 | 0 | SuffixOrdering::Skip => { |
531 | 0 | candidate_start += offset + 1; |
532 | 0 | offset = 0; |
533 | 0 | suffix.period = candidate_start - suffix.pos; |
534 | 0 | } |
535 | | SuffixOrdering::Push => { |
536 | 0 | if offset + 1 == suffix.period { |
537 | 0 | candidate_start += suffix.period; |
538 | 0 | offset = 0; |
539 | 0 | } else { |
540 | 0 | offset += 1; |
541 | 0 | } |
542 | | } |
543 | | } |
544 | | } |
545 | 0 | suffix |
546 | 0 | } |
547 | | |
548 | 0 | fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix { |
549 | 0 | // See the comments in `forward` for how this works. |
550 | 0 | let mut suffix = Suffix { pos: needle.len(), period: 1 }; |
551 | 0 | if needle.len() == 1 { |
552 | 0 | return suffix; |
553 | 0 | } |
554 | 0 | let mut candidate_start = match needle.len().checked_sub(1) { |
555 | 0 | None => return suffix, |
556 | 0 | Some(candidate_start) => candidate_start, |
557 | 0 | }; |
558 | 0 | let mut offset = 0; |
559 | | |
560 | 0 | while offset < candidate_start { |
561 | 0 | let current = needle[suffix.pos - offset - 1]; |
562 | 0 | let candidate = needle[candidate_start - offset - 1]; |
563 | 0 | match kind.cmp(current, candidate) { |
564 | 0 | SuffixOrdering::Accept => { |
565 | 0 | suffix = Suffix { pos: candidate_start, period: 1 }; |
566 | 0 | candidate_start -= 1; |
567 | 0 | offset = 0; |
568 | 0 | } |
569 | 0 | SuffixOrdering::Skip => { |
570 | 0 | candidate_start -= offset + 1; |
571 | 0 | offset = 0; |
572 | 0 | suffix.period = suffix.pos - candidate_start; |
573 | 0 | } |
574 | | SuffixOrdering::Push => { |
575 | 0 | if offset + 1 == suffix.period { |
576 | 0 | candidate_start -= suffix.period; |
577 | 0 | offset = 0; |
578 | 0 | } else { |
579 | 0 | offset += 1; |
580 | 0 | } |
581 | | } |
582 | | } |
583 | | } |
584 | 0 | suffix |
585 | 0 | } |
586 | | } |
587 | | |
588 | | /// The kind of suffix to extract. |
589 | | #[derive(Clone, Copy, Debug)] |
590 | | enum SuffixKind { |
591 | | /// Extract the smallest lexicographic suffix from a string. |
592 | | /// |
593 | | /// Technically, this doesn't actually pick the smallest lexicographic |
594 | | /// suffix. e.g., Given the choice between `a` and `aa`, this will choose |
595 | | /// the latter over the former, even though `a < aa`. The reasoning for |
596 | | /// this isn't clear from the paper, but it still smells like a minimal |
597 | | /// suffix. |
598 | | Minimal, |
599 | | /// Extract the largest lexicographic suffix from a string. |
600 | | /// |
601 | | /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given |
602 | | /// the choice between `z` and `zz`, this will choose the latter over the |
603 | | /// former. |
604 | | Maximal, |
605 | | } |
606 | | |
607 | | /// The result of comparing corresponding bytes between two suffixes. |
608 | | #[derive(Clone, Copy, Debug)] |
609 | | enum SuffixOrdering { |
610 | | /// This occurs when the given candidate byte indicates that the candidate |
611 | | /// suffix is better than the current maximal (or minimal) suffix. That is, |
612 | | /// the current candidate suffix should supplant the current maximal (or |
613 | | /// minimal) suffix. |
614 | | Accept, |
615 | | /// This occurs when the given candidate byte excludes the candidate suffix |
616 | | /// from being better than the current maximal (or minimal) suffix. That |
617 | | /// is, the current candidate suffix should be dropped and the next one |
618 | | /// should be considered. |
619 | | Skip, |
620 | | /// This occurs when no decision to accept or skip the candidate suffix |
621 | | /// can be made, e.g., when corresponding bytes are equivalent. In this |
622 | | /// case, the next corresponding bytes should be compared. |
623 | | Push, |
624 | | } |
625 | | |
626 | | impl SuffixKind { |
627 | | /// Returns true if and only if the given candidate byte indicates that |
628 | | /// it should replace the current suffix as the maximal (or minimal) |
629 | | /// suffix. |
630 | 0 | fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering { |
631 | | use self::SuffixOrdering::*; |
632 | | |
633 | 0 | match self { |
634 | 0 | SuffixKind::Minimal if candidate < current => Accept, |
635 | 0 | SuffixKind::Minimal if candidate > current => Skip, |
636 | 0 | SuffixKind::Minimal => Push, |
637 | 0 | SuffixKind::Maximal if candidate > current => Accept, |
638 | 0 | SuffixKind::Maximal if candidate < current => Skip, |
639 | 0 | SuffixKind::Maximal => Push, |
640 | | } |
641 | 0 | } |
642 | | } |
643 | | |
644 | | /// A bitset used to track whether a particular byte exists in a needle or not. |
645 | | /// |
646 | | /// Namely, bit 'i' is set if and only if byte%64==i for any byte in the |
647 | | /// needle. If a particular byte in the haystack is NOT in this set, then one |
648 | | /// can conclude that it is also not in the needle, and thus, one can advance |
649 | | /// in the haystack by needle.len() bytes. |
650 | | #[derive(Clone, Copy, Debug)] |
651 | | struct ApproximateByteSet(u64); |
652 | | |
653 | | impl ApproximateByteSet { |
654 | | /// Create a new set from the given needle. |
655 | 0 | fn new(needle: &[u8]) -> ApproximateByteSet { |
656 | 0 | let mut bits = 0; |
657 | 0 | for &b in needle { |
658 | 0 | bits |= 1 << (b % 64); |
659 | 0 | } |
660 | 0 | ApproximateByteSet(bits) |
661 | 0 | } |
662 | | |
663 | | /// Return true if and only if the given byte might be in this set. This |
664 | | /// may return a false positive, but will never return a false negative. |
665 | | #[inline(always)] |
666 | 0 | fn contains(&self, byte: u8) -> bool { |
667 | 0 | self.0 & (1 << (byte % 64)) != 0 |
668 | 0 | } |
669 | | } |
670 | | |
671 | | #[cfg(test)] |
672 | | mod tests { |
673 | | use alloc::vec::Vec; |
674 | | |
675 | | use super::*; |
676 | | |
677 | | /// Convenience wrapper for computing the suffix as a byte string. |
678 | | fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { |
679 | | let s = Suffix::forward(needle, kind); |
680 | | (&needle[s.pos..], s.period) |
681 | | } |
682 | | |
683 | | /// Convenience wrapper for computing the reverse suffix as a byte string. |
684 | | fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { |
685 | | let s = Suffix::reverse(needle, kind); |
686 | | (&needle[..s.pos], s.period) |
687 | | } |
688 | | |
689 | | /// Return all of the non-empty suffixes in the given byte string. |
690 | | fn suffixes(bytes: &[u8]) -> Vec<&[u8]> { |
691 | | (0..bytes.len()).map(|i| &bytes[i..]).collect() |
692 | | } |
693 | | |
694 | | /// Return the lexicographically maximal suffix of the given byte string. |
695 | | fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] { |
696 | | let mut sufs = suffixes(needle); |
697 | | sufs.sort(); |
698 | | sufs.pop().unwrap() |
699 | | } |
700 | | |
701 | | /// Return the lexicographically maximal suffix of the reverse of the given |
702 | | /// byte string. |
703 | | fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> { |
704 | | let mut reversed = needle.to_vec(); |
705 | | reversed.reverse(); |
706 | | let mut got = naive_maximal_suffix_forward(&reversed).to_vec(); |
707 | | got.reverse(); |
708 | | got |
709 | | } |
710 | | |
711 | | define_substring_forward_quickcheck!(|h, n| Some( |
712 | | Finder::new(n).find(h, n) |
713 | | )); |
714 | | define_substring_reverse_quickcheck!(|h, n| Some( |
715 | | FinderRev::new(n).rfind(h, n) |
716 | | )); |
717 | | |
718 | | #[test] |
719 | | fn forward() { |
720 | | crate::tests::substring::Runner::new() |
721 | | .fwd(|h, n| Some(Finder::new(n).find(h, n))) |
722 | | .run(); |
723 | | } |
724 | | |
725 | | #[test] |
726 | | fn reverse() { |
727 | | crate::tests::substring::Runner::new() |
728 | | .rev(|h, n| Some(FinderRev::new(n).rfind(h, n))) |
729 | | .run(); |
730 | | } |
731 | | |
732 | | #[test] |
733 | | fn suffix_forward() { |
734 | | macro_rules! assert_suffix_min { |
735 | | ($given:expr, $expected:expr, $period:expr) => { |
736 | | let (got_suffix, got_period) = |
737 | | get_suffix_forward($given.as_bytes(), SuffixKind::Minimal); |
738 | | let got_suffix = core::str::from_utf8(got_suffix).unwrap(); |
739 | | assert_eq!(($expected, $period), (got_suffix, got_period)); |
740 | | }; |
741 | | } |
742 | | |
743 | | macro_rules! assert_suffix_max { |
744 | | ($given:expr, $expected:expr, $period:expr) => { |
745 | | let (got_suffix, got_period) = |
746 | | get_suffix_forward($given.as_bytes(), SuffixKind::Maximal); |
747 | | let got_suffix = core::str::from_utf8(got_suffix).unwrap(); |
748 | | assert_eq!(($expected, $period), (got_suffix, got_period)); |
749 | | }; |
750 | | } |
751 | | |
752 | | assert_suffix_min!("a", "a", 1); |
753 | | assert_suffix_max!("a", "a", 1); |
754 | | |
755 | | assert_suffix_min!("ab", "ab", 2); |
756 | | assert_suffix_max!("ab", "b", 1); |
757 | | |
758 | | assert_suffix_min!("ba", "a", 1); |
759 | | assert_suffix_max!("ba", "ba", 2); |
760 | | |
761 | | assert_suffix_min!("abc", "abc", 3); |
762 | | assert_suffix_max!("abc", "c", 1); |
763 | | |
764 | | assert_suffix_min!("acb", "acb", 3); |
765 | | assert_suffix_max!("acb", "cb", 2); |
766 | | |
767 | | assert_suffix_min!("cba", "a", 1); |
768 | | assert_suffix_max!("cba", "cba", 3); |
769 | | |
770 | | assert_suffix_min!("abcabc", "abcabc", 3); |
771 | | assert_suffix_max!("abcabc", "cabc", 3); |
772 | | |
773 | | assert_suffix_min!("abcabcabc", "abcabcabc", 3); |
774 | | assert_suffix_max!("abcabcabc", "cabcabc", 3); |
775 | | |
776 | | assert_suffix_min!("abczz", "abczz", 5); |
777 | | assert_suffix_max!("abczz", "zz", 1); |
778 | | |
779 | | assert_suffix_min!("zzabc", "abc", 3); |
780 | | assert_suffix_max!("zzabc", "zzabc", 5); |
781 | | |
782 | | assert_suffix_min!("aaa", "aaa", 1); |
783 | | assert_suffix_max!("aaa", "aaa", 1); |
784 | | |
785 | | assert_suffix_min!("foobar", "ar", 2); |
786 | | assert_suffix_max!("foobar", "r", 1); |
787 | | } |
788 | | |
789 | | #[test] |
790 | | fn suffix_reverse() { |
791 | | macro_rules! assert_suffix_min { |
792 | | ($given:expr, $expected:expr, $period:expr) => { |
793 | | let (got_suffix, got_period) = |
794 | | get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal); |
795 | | let got_suffix = core::str::from_utf8(got_suffix).unwrap(); |
796 | | assert_eq!(($expected, $period), (got_suffix, got_period)); |
797 | | }; |
798 | | } |
799 | | |
800 | | macro_rules! assert_suffix_max { |
801 | | ($given:expr, $expected:expr, $period:expr) => { |
802 | | let (got_suffix, got_period) = |
803 | | get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal); |
804 | | let got_suffix = core::str::from_utf8(got_suffix).unwrap(); |
805 | | assert_eq!(($expected, $period), (got_suffix, got_period)); |
806 | | }; |
807 | | } |
808 | | |
809 | | assert_suffix_min!("a", "a", 1); |
810 | | assert_suffix_max!("a", "a", 1); |
811 | | |
812 | | assert_suffix_min!("ab", "a", 1); |
813 | | assert_suffix_max!("ab", "ab", 2); |
814 | | |
815 | | assert_suffix_min!("ba", "ba", 2); |
816 | | assert_suffix_max!("ba", "b", 1); |
817 | | |
818 | | assert_suffix_min!("abc", "a", 1); |
819 | | assert_suffix_max!("abc", "abc", 3); |
820 | | |
821 | | assert_suffix_min!("acb", "a", 1); |
822 | | assert_suffix_max!("acb", "ac", 2); |
823 | | |
824 | | assert_suffix_min!("cba", "cba", 3); |
825 | | assert_suffix_max!("cba", "c", 1); |
826 | | |
827 | | assert_suffix_min!("abcabc", "abca", 3); |
828 | | assert_suffix_max!("abcabc", "abcabc", 3); |
829 | | |
830 | | assert_suffix_min!("abcabcabc", "abcabca", 3); |
831 | | assert_suffix_max!("abcabcabc", "abcabcabc", 3); |
832 | | |
833 | | assert_suffix_min!("abczz", "a", 1); |
834 | | assert_suffix_max!("abczz", "abczz", 5); |
835 | | |
836 | | assert_suffix_min!("zzabc", "zza", 3); |
837 | | assert_suffix_max!("zzabc", "zz", 1); |
838 | | |
839 | | assert_suffix_min!("aaa", "aaa", 1); |
840 | | assert_suffix_max!("aaa", "aaa", 1); |
841 | | } |
842 | | |
843 | | #[cfg(not(miri))] |
844 | | quickcheck::quickcheck! { |
845 | | fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool { |
846 | | if bytes.is_empty() { |
847 | | return true; |
848 | | } |
849 | | |
850 | | let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal); |
851 | | let expected = naive_maximal_suffix_forward(&bytes); |
852 | | got == expected |
853 | | } |
854 | | |
855 | | fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool { |
856 | | if bytes.is_empty() { |
857 | | return true; |
858 | | } |
859 | | |
860 | | let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal); |
861 | | let expected = naive_maximal_suffix_reverse(&bytes); |
862 | | expected == got |
863 | | } |
864 | | } |
865 | | |
866 | | // This is a regression test caught by quickcheck that exercised a bug in |
867 | | // the reverse small period handling. The bug was that we were using 'if j |
868 | | // == shift' to determine if a match occurred, but the correct guard is 'if |
869 | | // j >= shift', which matches the corresponding guard in the forward impl. |
870 | | #[test] |
871 | | fn regression_rev_small_period() { |
872 | | let rfind = |h, n| FinderRev::new(n).rfind(h, n); |
873 | | let haystack = "ababaz"; |
874 | | let needle = "abab"; |
875 | | assert_eq!(Some(0), rfind(haystack.as_bytes(), needle.as_bytes())); |
876 | | } |
877 | | } |