/build/cargo-vendor-dir/aho-corasick-1.1.2/src/packed/teddy/generic.rs
Line | Count | Source (jump to first uncovered line) |
1 | | use core::fmt::Debug; |
2 | | |
3 | | use alloc::{ |
4 | | boxed::Box, collections::BTreeMap, format, sync::Arc, vec, vec::Vec, |
5 | | }; |
6 | | |
7 | | use crate::{ |
8 | | packed::{ |
9 | | ext::Pointer, |
10 | | pattern::Patterns, |
11 | | vector::{FatVector, Vector}, |
12 | | }, |
13 | | util::int::U32, |
14 | | PatternID, |
15 | | }; |
16 | | |
17 | | /// A match type specialized to the Teddy implementations below. |
18 | | /// |
19 | | /// Essentially, instead of representing a match at byte offsets, we use |
20 | | /// raw pointers. This is because the implementations below operate on raw |
21 | | /// pointers, and so this is a more natural return type based on how the |
22 | | /// implementation works. |
23 | | /// |
24 | | /// Also, the `PatternID` used here is a `u16`. |
25 | | #[derive(Clone, Copy, Debug)] |
26 | | pub(crate) struct Match { |
27 | | pid: PatternID, |
28 | | start: *const u8, |
29 | | end: *const u8, |
30 | | } |
31 | | |
32 | | impl Match { |
33 | | /// Returns the ID of the pattern that matched. |
34 | 0 | pub(crate) fn pattern(&self) -> PatternID { |
35 | 0 | self.pid |
36 | 0 | } |
37 | | |
38 | | /// Returns a pointer into the haystack at which the match starts. |
39 | 0 | pub(crate) fn start(&self) -> *const u8 { |
40 | 0 | self.start |
41 | 0 | } |
42 | | |
43 | | /// Returns a pointer into the haystack at which the match ends. |
44 | 0 | pub(crate) fn end(&self) -> *const u8 { |
45 | 0 | self.end |
46 | 0 | } |
47 | | } |
48 | | |
49 | | /// A "slim" Teddy implementation that is generic over both the vector type |
50 | | /// and the minimum length of the patterns being searched for. |
51 | | /// |
52 | | /// Only 1, 2, 3 and 4 bytes are supported as minimum lengths. |
53 | | #[derive(Clone, Debug)] |
54 | | pub(crate) struct Slim<V, const BYTES: usize> { |
55 | | /// A generic data structure for doing "slim" Teddy verification. |
56 | | teddy: Teddy<8>, |
57 | | /// The masks used as inputs to the shuffle operation to generate |
58 | | /// candidates (which are fed into the verification routines). |
59 | | masks: [Mask<V>; BYTES], |
60 | | } |
61 | | |
62 | | impl<V: Vector, const BYTES: usize> Slim<V, BYTES> { |
63 | | /// Create a new "slim" Teddy searcher for the given patterns. |
64 | | /// |
65 | | /// # Panics |
66 | | /// |
67 | | /// This panics when `BYTES` is any value other than 1, 2, 3 or 4. |
68 | | /// |
69 | | /// # Safety |
70 | | /// |
71 | | /// Callers must ensure that this is okay to call in the current target for |
72 | | /// the current CPU. |
73 | | #[inline(always)] |
74 | 0 | pub(crate) unsafe fn new(patterns: Arc<Patterns>) -> Slim<V, BYTES> { |
75 | 0 | assert!( |
76 | 0 | 1 <= BYTES && BYTES <= 4, |
77 | 0 | "only 1, 2, 3 or 4 bytes are supported" |
78 | | ); |
79 | 0 | let teddy = Teddy::new(patterns); |
80 | 0 | let masks = SlimMaskBuilder::from_teddy(&teddy); |
81 | 0 | Slim { teddy, masks } |
82 | 0 | } |
83 | | |
84 | | /// Returns the approximate total amount of heap used by this type, in |
85 | | /// units of bytes. |
86 | | #[inline(always)] |
87 | 0 | pub(crate) fn memory_usage(&self) -> usize { |
88 | 0 | self.teddy.memory_usage() |
89 | 0 | } |
90 | | |
91 | | /// Returns the minimum length, in bytes, that a haystack must be in order |
92 | | /// to use it with this searcher. |
93 | | #[inline(always)] |
94 | 0 | pub(crate) fn minimum_len(&self) -> usize { |
95 | 0 | V::BYTES + (BYTES - 1) |
96 | 0 | } |
97 | | } |
98 | | |
99 | | impl<V: Vector> Slim<V, 1> { |
100 | | /// Look for an occurrences of the patterns in this finder in the haystack |
101 | | /// given by the `start` and `end` pointers. |
102 | | /// |
103 | | /// If no match could be found, then `None` is returned. |
104 | | /// |
105 | | /// # Safety |
106 | | /// |
107 | | /// The given pointers representing the haystack must be valid to read |
108 | | /// from. They must also point to a region of memory that is at least the |
109 | | /// minimum length required by this searcher. |
110 | | /// |
111 | | /// Callers must ensure that this is okay to call in the current target for |
112 | | /// the current CPU. |
113 | | #[inline(always)] |
114 | 0 | pub(crate) unsafe fn find( |
115 | 0 | &self, |
116 | 0 | start: *const u8, |
117 | 0 | end: *const u8, |
118 | 0 | ) -> Option<Match> { |
119 | 0 | let len = end.distance(start); |
120 | 0 | debug_assert!(len >= self.minimum_len()); |
121 | 0 | let mut cur = start; |
122 | 0 | while cur <= end.sub(V::BYTES) { |
123 | 0 | if let Some(m) = self.find_one(cur, end) { |
124 | 0 | return Some(m); |
125 | 0 | } |
126 | 0 | cur = cur.add(V::BYTES); |
127 | | } |
128 | 0 | if cur < end { |
129 | 0 | cur = end.sub(V::BYTES); |
130 | 0 | if let Some(m) = self.find_one(cur, end) { |
131 | 0 | return Some(m); |
132 | 0 | } |
133 | 0 | } |
134 | 0 | None |
135 | 0 | } |
136 | | |
137 | | /// Look for a match starting at the `V::BYTES` at and after `cur`. If |
138 | | /// there isn't one, then `None` is returned. |
139 | | /// |
140 | | /// # Safety |
141 | | /// |
142 | | /// The given pointers representing the haystack must be valid to read |
143 | | /// from. They must also point to a region of memory that is at least the |
144 | | /// minimum length required by this searcher. |
145 | | /// |
146 | | /// Callers must ensure that this is okay to call in the current target for |
147 | | /// the current CPU. |
148 | | #[inline(always)] |
149 | 0 | unsafe fn find_one( |
150 | 0 | &self, |
151 | 0 | cur: *const u8, |
152 | 0 | end: *const u8, |
153 | 0 | ) -> Option<Match> { |
154 | 0 | let c = self.candidate(cur); |
155 | 0 | if !c.is_zero() { |
156 | 0 | if let Some(m) = self.teddy.verify(cur, end, c) { |
157 | 0 | return Some(m); |
158 | 0 | } |
159 | 0 | } |
160 | 0 | None |
161 | 0 | } |
162 | | |
163 | | /// Look for a candidate match (represented as a vector) starting at the |
164 | | /// `V::BYTES` at and after `cur`. If there isn't one, then a vector with |
165 | | /// all bits set to zero is returned. |
166 | | /// |
167 | | /// # Safety |
168 | | /// |
169 | | /// The given pointer representing the haystack must be valid to read |
170 | | /// from. |
171 | | /// |
172 | | /// Callers must ensure that this is okay to call in the current target for |
173 | | /// the current CPU. |
174 | | #[inline(always)] |
175 | 0 | unsafe fn candidate(&self, cur: *const u8) -> V { |
176 | 0 | let chunk = V::load_unaligned(cur); |
177 | 0 | Mask::members1(chunk, self.masks) |
178 | 0 | } |
179 | | } |
180 | | |
181 | | impl<V: Vector> Slim<V, 2> { |
182 | | /// See Slim<V, 1>::find. |
183 | | #[inline(always)] |
184 | 0 | pub(crate) unsafe fn find( |
185 | 0 | &self, |
186 | 0 | start: *const u8, |
187 | 0 | end: *const u8, |
188 | 0 | ) -> Option<Match> { |
189 | 0 | let len = end.distance(start); |
190 | 0 | debug_assert!(len >= self.minimum_len()); |
191 | 0 | let mut cur = start.add(1); |
192 | 0 | let mut prev0 = V::splat(0xFF); |
193 | 0 | while cur <= end.sub(V::BYTES) { |
194 | 0 | if let Some(m) = self.find_one(cur, end, &mut prev0) { |
195 | 0 | return Some(m); |
196 | 0 | } |
197 | 0 | cur = cur.add(V::BYTES); |
198 | | } |
199 | 0 | if cur < end { |
200 | 0 | cur = end.sub(V::BYTES); |
201 | 0 | prev0 = V::splat(0xFF); |
202 | 0 | if let Some(m) = self.find_one(cur, end, &mut prev0) { |
203 | 0 | return Some(m); |
204 | 0 | } |
205 | 0 | } |
206 | 0 | None |
207 | 0 | } |
208 | | |
209 | | /// See Slim<V, 1>::find_one. |
210 | | #[inline(always)] |
211 | 0 | unsafe fn find_one( |
212 | 0 | &self, |
213 | 0 | cur: *const u8, |
214 | 0 | end: *const u8, |
215 | 0 | prev0: &mut V, |
216 | 0 | ) -> Option<Match> { |
217 | 0 | let c = self.candidate(cur, prev0); |
218 | 0 | if !c.is_zero() { |
219 | 0 | if let Some(m) = self.teddy.verify(cur.sub(1), end, c) { |
220 | 0 | return Some(m); |
221 | 0 | } |
222 | 0 | } |
223 | 0 | None |
224 | 0 | } |
225 | | |
226 | | /// See Slim<V, 1>::candidate. |
227 | | #[inline(always)] |
228 | 0 | unsafe fn candidate(&self, cur: *const u8, prev0: &mut V) -> V { |
229 | 0 | let chunk = V::load_unaligned(cur); |
230 | 0 | let (res0, res1) = Mask::members2(chunk, self.masks); |
231 | 0 | let res0prev0 = res0.shift_in_one_byte(*prev0); |
232 | 0 | let res = res0prev0.and(res1); |
233 | 0 | *prev0 = res0; |
234 | 0 | res |
235 | 0 | } |
236 | | } |
237 | | |
238 | | impl<V: Vector> Slim<V, 3> { |
239 | | /// See Slim<V, 1>::find. |
240 | | #[inline(always)] |
241 | 0 | pub(crate) unsafe fn find( |
242 | 0 | &self, |
243 | 0 | start: *const u8, |
244 | 0 | end: *const u8, |
245 | 0 | ) -> Option<Match> { |
246 | 0 | let len = end.distance(start); |
247 | 0 | debug_assert!(len >= self.minimum_len()); |
248 | 0 | let mut cur = start.add(2); |
249 | 0 | let mut prev0 = V::splat(0xFF); |
250 | 0 | let mut prev1 = V::splat(0xFF); |
251 | 0 | while cur <= end.sub(V::BYTES) { |
252 | 0 | if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { |
253 | 0 | return Some(m); |
254 | 0 | } |
255 | 0 | cur = cur.add(V::BYTES); |
256 | | } |
257 | 0 | if cur < end { |
258 | 0 | cur = end.sub(V::BYTES); |
259 | 0 | prev0 = V::splat(0xFF); |
260 | 0 | prev1 = V::splat(0xFF); |
261 | 0 | if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { |
262 | 0 | return Some(m); |
263 | 0 | } |
264 | 0 | } |
265 | 0 | None |
266 | 0 | } |
267 | | |
268 | | /// See Slim<V, 1>::find_one. |
269 | | #[inline(always)] |
270 | 0 | unsafe fn find_one( |
271 | 0 | &self, |
272 | 0 | cur: *const u8, |
273 | 0 | end: *const u8, |
274 | 0 | prev0: &mut V, |
275 | 0 | prev1: &mut V, |
276 | 0 | ) -> Option<Match> { |
277 | 0 | let c = self.candidate(cur, prev0, prev1); |
278 | 0 | if !c.is_zero() { |
279 | 0 | if let Some(m) = self.teddy.verify(cur.sub(2), end, c) { |
280 | 0 | return Some(m); |
281 | 0 | } |
282 | 0 | } |
283 | 0 | None |
284 | 0 | } |
285 | | |
286 | | /// See Slim<V, 1>::candidate. |
287 | | #[inline(always)] |
288 | 0 | unsafe fn candidate( |
289 | 0 | &self, |
290 | 0 | cur: *const u8, |
291 | 0 | prev0: &mut V, |
292 | 0 | prev1: &mut V, |
293 | 0 | ) -> V { |
294 | 0 | let chunk = V::load_unaligned(cur); |
295 | 0 | let (res0, res1, res2) = Mask::members3(chunk, self.masks); |
296 | 0 | let res0prev0 = res0.shift_in_two_bytes(*prev0); |
297 | 0 | let res1prev1 = res1.shift_in_one_byte(*prev1); |
298 | 0 | let res = res0prev0.and(res1prev1).and(res2); |
299 | 0 | *prev0 = res0; |
300 | 0 | *prev1 = res1; |
301 | 0 | res |
302 | 0 | } |
303 | | } |
304 | | |
305 | | impl<V: Vector> Slim<V, 4> { |
306 | | /// See Slim<V, 1>::find. |
307 | | #[inline(always)] |
308 | 0 | pub(crate) unsafe fn find( |
309 | 0 | &self, |
310 | 0 | start: *const u8, |
311 | 0 | end: *const u8, |
312 | 0 | ) -> Option<Match> { |
313 | 0 | let len = end.distance(start); |
314 | 0 | debug_assert!(len >= self.minimum_len()); |
315 | 0 | let mut cur = start.add(3); |
316 | 0 | let mut prev0 = V::splat(0xFF); |
317 | 0 | let mut prev1 = V::splat(0xFF); |
318 | 0 | let mut prev2 = V::splat(0xFF); |
319 | 0 | while cur <= end.sub(V::BYTES) { |
320 | 0 | if let Some(m) = |
321 | 0 | self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) |
322 | | { |
323 | 0 | return Some(m); |
324 | 0 | } |
325 | 0 | cur = cur.add(V::BYTES); |
326 | | } |
327 | 0 | if cur < end { |
328 | 0 | cur = end.sub(V::BYTES); |
329 | 0 | prev0 = V::splat(0xFF); |
330 | 0 | prev1 = V::splat(0xFF); |
331 | 0 | prev2 = V::splat(0xFF); |
332 | 0 | if let Some(m) = |
333 | 0 | self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) |
334 | | { |
335 | 0 | return Some(m); |
336 | 0 | } |
337 | 0 | } |
338 | 0 | None |
339 | 0 | } |
340 | | |
341 | | /// See Slim<V, 1>::find_one. |
342 | | #[inline(always)] |
343 | 0 | unsafe fn find_one( |
344 | 0 | &self, |
345 | 0 | cur: *const u8, |
346 | 0 | end: *const u8, |
347 | 0 | prev0: &mut V, |
348 | 0 | prev1: &mut V, |
349 | 0 | prev2: &mut V, |
350 | 0 | ) -> Option<Match> { |
351 | 0 | let c = self.candidate(cur, prev0, prev1, prev2); |
352 | 0 | if !c.is_zero() { |
353 | 0 | if let Some(m) = self.teddy.verify(cur.sub(3), end, c) { |
354 | 0 | return Some(m); |
355 | 0 | } |
356 | 0 | } |
357 | 0 | None |
358 | 0 | } |
359 | | |
360 | | /// See Slim<V, 1>::candidate. |
361 | | #[inline(always)] |
362 | 0 | unsafe fn candidate( |
363 | 0 | &self, |
364 | 0 | cur: *const u8, |
365 | 0 | prev0: &mut V, |
366 | 0 | prev1: &mut V, |
367 | 0 | prev2: &mut V, |
368 | 0 | ) -> V { |
369 | 0 | let chunk = V::load_unaligned(cur); |
370 | 0 | let (res0, res1, res2, res3) = Mask::members4(chunk, self.masks); |
371 | 0 | let res0prev0 = res0.shift_in_three_bytes(*prev0); |
372 | 0 | let res1prev1 = res1.shift_in_two_bytes(*prev1); |
373 | 0 | let res2prev2 = res2.shift_in_one_byte(*prev2); |
374 | 0 | let res = res0prev0.and(res1prev1).and(res2prev2).and(res3); |
375 | 0 | *prev0 = res0; |
376 | 0 | *prev1 = res1; |
377 | 0 | *prev2 = res2; |
378 | 0 | res |
379 | 0 | } |
380 | | } |
381 | | |
382 | | /// A "fat" Teddy implementation that is generic over both the vector type |
383 | | /// and the minimum length of the patterns being searched for. |
384 | | /// |
385 | | /// Only 1, 2, 3 and 4 bytes are supported as minimum lengths. |
386 | | #[derive(Clone, Debug)] |
387 | | pub(crate) struct Fat<V, const BYTES: usize> { |
388 | | /// A generic data structure for doing "fat" Teddy verification. |
389 | | teddy: Teddy<16>, |
390 | | /// The masks used as inputs to the shuffle operation to generate |
391 | | /// candidates (which are fed into the verification routines). |
392 | | masks: [Mask<V>; BYTES], |
393 | | } |
394 | | |
395 | | impl<V: FatVector, const BYTES: usize> Fat<V, BYTES> { |
396 | | /// Create a new "fat" Teddy searcher for the given patterns. |
397 | | /// |
398 | | /// # Panics |
399 | | /// |
400 | | /// This panics when `BYTES` is any value other than 1, 2, 3 or 4. |
401 | | /// |
402 | | /// # Safety |
403 | | /// |
404 | | /// Callers must ensure that this is okay to call in the current target for |
405 | | /// the current CPU. |
406 | | #[inline(always)] |
407 | 0 | pub(crate) unsafe fn new(patterns: Arc<Patterns>) -> Fat<V, BYTES> { |
408 | 0 | assert!( |
409 | 0 | 1 <= BYTES && BYTES <= 4, |
410 | 0 | "only 1, 2, 3 or 4 bytes are supported" |
411 | | ); |
412 | 0 | let teddy = Teddy::new(patterns); |
413 | 0 | let masks = FatMaskBuilder::from_teddy(&teddy); |
414 | 0 | Fat { teddy, masks } |
415 | 0 | } |
416 | | |
417 | | /// Returns the approximate total amount of heap used by this type, in |
418 | | /// units of bytes. |
419 | | #[inline(always)] |
420 | 0 | pub(crate) fn memory_usage(&self) -> usize { |
421 | 0 | self.teddy.memory_usage() |
422 | 0 | } |
423 | | |
424 | | /// Returns the minimum length, in bytes, that a haystack must be in order |
425 | | /// to use it with this searcher. |
426 | | #[inline(always)] |
427 | 0 | pub(crate) fn minimum_len(&self) -> usize { |
428 | 0 | V::Half::BYTES + (BYTES - 1) |
429 | 0 | } |
430 | | } |
431 | | |
432 | | impl<V: FatVector> Fat<V, 1> { |
433 | | /// Look for an occurrences of the patterns in this finder in the haystack |
434 | | /// given by the `start` and `end` pointers. |
435 | | /// |
436 | | /// If no match could be found, then `None` is returned. |
437 | | /// |
438 | | /// # Safety |
439 | | /// |
440 | | /// The given pointers representing the haystack must be valid to read |
441 | | /// from. They must also point to a region of memory that is at least the |
442 | | /// minimum length required by this searcher. |
443 | | /// |
444 | | /// Callers must ensure that this is okay to call in the current target for |
445 | | /// the current CPU. |
446 | | #[inline(always)] |
447 | 0 | pub(crate) unsafe fn find( |
448 | 0 | &self, |
449 | 0 | start: *const u8, |
450 | 0 | end: *const u8, |
451 | 0 | ) -> Option<Match> { |
452 | 0 | let len = end.distance(start); |
453 | 0 | debug_assert!(len >= self.minimum_len()); |
454 | 0 | let mut cur = start; |
455 | 0 | while cur <= end.sub(V::Half::BYTES) { |
456 | 0 | if let Some(m) = self.find_one(cur, end) { |
457 | 0 | return Some(m); |
458 | 0 | } |
459 | 0 | cur = cur.add(V::Half::BYTES); |
460 | | } |
461 | 0 | if cur < end { |
462 | 0 | cur = end.sub(V::Half::BYTES); |
463 | 0 | if let Some(m) = self.find_one(cur, end) { |
464 | 0 | return Some(m); |
465 | 0 | } |
466 | 0 | } |
467 | 0 | None |
468 | 0 | } |
469 | | |
470 | | /// Look for a match starting at the `V::BYTES` at and after `cur`. If |
471 | | /// there isn't one, then `None` is returned. |
472 | | /// |
473 | | /// # Safety |
474 | | /// |
475 | | /// The given pointers representing the haystack must be valid to read |
476 | | /// from. They must also point to a region of memory that is at least the |
477 | | /// minimum length required by this searcher. |
478 | | /// |
479 | | /// Callers must ensure that this is okay to call in the current target for |
480 | | /// the current CPU. |
481 | | #[inline(always)] |
482 | 0 | unsafe fn find_one( |
483 | 0 | &self, |
484 | 0 | cur: *const u8, |
485 | 0 | end: *const u8, |
486 | 0 | ) -> Option<Match> { |
487 | 0 | let c = self.candidate(cur); |
488 | 0 | if !c.is_zero() { |
489 | 0 | if let Some(m) = self.teddy.verify(cur, end, c) { |
490 | 0 | return Some(m); |
491 | 0 | } |
492 | 0 | } |
493 | 0 | None |
494 | 0 | } |
495 | | |
496 | | /// Look for a candidate match (represented as a vector) starting at the |
497 | | /// `V::BYTES` at and after `cur`. If there isn't one, then a vector with |
498 | | /// all bits set to zero is returned. |
499 | | /// |
500 | | /// # Safety |
501 | | /// |
502 | | /// The given pointer representing the haystack must be valid to read |
503 | | /// from. |
504 | | /// |
505 | | /// Callers must ensure that this is okay to call in the current target for |
506 | | /// the current CPU. |
507 | | #[inline(always)] |
508 | 0 | unsafe fn candidate(&self, cur: *const u8) -> V { |
509 | 0 | let chunk = V::load_half_unaligned(cur); |
510 | 0 | Mask::members1(chunk, self.masks) |
511 | 0 | } |
512 | | } |
513 | | |
514 | | impl<V: FatVector> Fat<V, 2> { |
515 | | /// See `Fat<V, 1>::find`. |
516 | | #[inline(always)] |
517 | 0 | pub(crate) unsafe fn find( |
518 | 0 | &self, |
519 | 0 | start: *const u8, |
520 | 0 | end: *const u8, |
521 | 0 | ) -> Option<Match> { |
522 | 0 | let len = end.distance(start); |
523 | 0 | debug_assert!(len >= self.minimum_len()); |
524 | 0 | let mut cur = start.add(1); |
525 | 0 | let mut prev0 = V::splat(0xFF); |
526 | 0 | while cur <= end.sub(V::Half::BYTES) { |
527 | 0 | if let Some(m) = self.find_one(cur, end, &mut prev0) { |
528 | 0 | return Some(m); |
529 | 0 | } |
530 | 0 | cur = cur.add(V::Half::BYTES); |
531 | | } |
532 | 0 | if cur < end { |
533 | 0 | cur = end.sub(V::Half::BYTES); |
534 | 0 | prev0 = V::splat(0xFF); |
535 | 0 | if let Some(m) = self.find_one(cur, end, &mut prev0) { |
536 | 0 | return Some(m); |
537 | 0 | } |
538 | 0 | } |
539 | 0 | None |
540 | 0 | } |
541 | | |
542 | | /// See `Fat<V, 1>::find_one`. |
543 | | #[inline(always)] |
544 | 0 | unsafe fn find_one( |
545 | 0 | &self, |
546 | 0 | cur: *const u8, |
547 | 0 | end: *const u8, |
548 | 0 | prev0: &mut V, |
549 | 0 | ) -> Option<Match> { |
550 | 0 | let c = self.candidate(cur, prev0); |
551 | 0 | if !c.is_zero() { |
552 | 0 | if let Some(m) = self.teddy.verify(cur.sub(1), end, c) { |
553 | 0 | return Some(m); |
554 | 0 | } |
555 | 0 | } |
556 | 0 | None |
557 | 0 | } |
558 | | |
559 | | /// See `Fat<V, 1>::candidate`. |
560 | | #[inline(always)] |
561 | 0 | unsafe fn candidate(&self, cur: *const u8, prev0: &mut V) -> V { |
562 | 0 | let chunk = V::load_half_unaligned(cur); |
563 | 0 | let (res0, res1) = Mask::members2(chunk, self.masks); |
564 | 0 | let res0prev0 = res0.half_shift_in_one_byte(*prev0); |
565 | 0 | let res = res0prev0.and(res1); |
566 | 0 | *prev0 = res0; |
567 | 0 | res |
568 | 0 | } |
569 | | } |
570 | | |
571 | | impl<V: FatVector> Fat<V, 3> { |
572 | | /// See `Fat<V, 1>::find`. |
573 | | #[inline(always)] |
574 | 0 | pub(crate) unsafe fn find( |
575 | 0 | &self, |
576 | 0 | start: *const u8, |
577 | 0 | end: *const u8, |
578 | 0 | ) -> Option<Match> { |
579 | 0 | let len = end.distance(start); |
580 | 0 | debug_assert!(len >= self.minimum_len()); |
581 | 0 | let mut cur = start.add(2); |
582 | 0 | let mut prev0 = V::splat(0xFF); |
583 | 0 | let mut prev1 = V::splat(0xFF); |
584 | 0 | while cur <= end.sub(V::Half::BYTES) { |
585 | 0 | if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { |
586 | 0 | return Some(m); |
587 | 0 | } |
588 | 0 | cur = cur.add(V::Half::BYTES); |
589 | | } |
590 | 0 | if cur < end { |
591 | 0 | cur = end.sub(V::Half::BYTES); |
592 | 0 | prev0 = V::splat(0xFF); |
593 | 0 | prev1 = V::splat(0xFF); |
594 | 0 | if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { |
595 | 0 | return Some(m); |
596 | 0 | } |
597 | 0 | } |
598 | 0 | None |
599 | 0 | } |
600 | | |
601 | | /// See `Fat<V, 1>::find_one`. |
602 | | #[inline(always)] |
603 | 0 | unsafe fn find_one( |
604 | 0 | &self, |
605 | 0 | cur: *const u8, |
606 | 0 | end: *const u8, |
607 | 0 | prev0: &mut V, |
608 | 0 | prev1: &mut V, |
609 | 0 | ) -> Option<Match> { |
610 | 0 | let c = self.candidate(cur, prev0, prev1); |
611 | 0 | if !c.is_zero() { |
612 | 0 | if let Some(m) = self.teddy.verify(cur.sub(2), end, c) { |
613 | 0 | return Some(m); |
614 | 0 | } |
615 | 0 | } |
616 | 0 | None |
617 | 0 | } |
618 | | |
619 | | /// See `Fat<V, 1>::candidate`. |
620 | | #[inline(always)] |
621 | 0 | unsafe fn candidate( |
622 | 0 | &self, |
623 | 0 | cur: *const u8, |
624 | 0 | prev0: &mut V, |
625 | 0 | prev1: &mut V, |
626 | 0 | ) -> V { |
627 | 0 | let chunk = V::load_half_unaligned(cur); |
628 | 0 | let (res0, res1, res2) = Mask::members3(chunk, self.masks); |
629 | 0 | let res0prev0 = res0.half_shift_in_two_bytes(*prev0); |
630 | 0 | let res1prev1 = res1.half_shift_in_one_byte(*prev1); |
631 | 0 | let res = res0prev0.and(res1prev1).and(res2); |
632 | 0 | *prev0 = res0; |
633 | 0 | *prev1 = res1; |
634 | 0 | res |
635 | 0 | } |
636 | | } |
637 | | |
638 | | impl<V: FatVector> Fat<V, 4> { |
639 | | /// See `Fat<V, 1>::find`. |
640 | | #[inline(always)] |
641 | 0 | pub(crate) unsafe fn find( |
642 | 0 | &self, |
643 | 0 | start: *const u8, |
644 | 0 | end: *const u8, |
645 | 0 | ) -> Option<Match> { |
646 | 0 | let len = end.distance(start); |
647 | 0 | debug_assert!(len >= self.minimum_len()); |
648 | 0 | let mut cur = start.add(3); |
649 | 0 | let mut prev0 = V::splat(0xFF); |
650 | 0 | let mut prev1 = V::splat(0xFF); |
651 | 0 | let mut prev2 = V::splat(0xFF); |
652 | 0 | while cur <= end.sub(V::Half::BYTES) { |
653 | 0 | if let Some(m) = |
654 | 0 | self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) |
655 | | { |
656 | 0 | return Some(m); |
657 | 0 | } |
658 | 0 | cur = cur.add(V::Half::BYTES); |
659 | | } |
660 | 0 | if cur < end { |
661 | 0 | cur = end.sub(V::Half::BYTES); |
662 | 0 | prev0 = V::splat(0xFF); |
663 | 0 | prev1 = V::splat(0xFF); |
664 | 0 | prev2 = V::splat(0xFF); |
665 | 0 | if let Some(m) = |
666 | 0 | self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) |
667 | | { |
668 | 0 | return Some(m); |
669 | 0 | } |
670 | 0 | } |
671 | 0 | None |
672 | 0 | } |
673 | | |
674 | | /// See `Fat<V, 1>::find_one`. |
675 | | #[inline(always)] |
676 | 0 | unsafe fn find_one( |
677 | 0 | &self, |
678 | 0 | cur: *const u8, |
679 | 0 | end: *const u8, |
680 | 0 | prev0: &mut V, |
681 | 0 | prev1: &mut V, |
682 | 0 | prev2: &mut V, |
683 | 0 | ) -> Option<Match> { |
684 | 0 | let c = self.candidate(cur, prev0, prev1, prev2); |
685 | 0 | if !c.is_zero() { |
686 | 0 | if let Some(m) = self.teddy.verify(cur.sub(3), end, c) { |
687 | 0 | return Some(m); |
688 | 0 | } |
689 | 0 | } |
690 | 0 | None |
691 | 0 | } |
692 | | |
693 | | /// See `Fat<V, 1>::candidate`. |
694 | | #[inline(always)] |
695 | 0 | unsafe fn candidate( |
696 | 0 | &self, |
697 | 0 | cur: *const u8, |
698 | 0 | prev0: &mut V, |
699 | 0 | prev1: &mut V, |
700 | 0 | prev2: &mut V, |
701 | 0 | ) -> V { |
702 | 0 | let chunk = V::load_half_unaligned(cur); |
703 | 0 | let (res0, res1, res2, res3) = Mask::members4(chunk, self.masks); |
704 | 0 | let res0prev0 = res0.half_shift_in_three_bytes(*prev0); |
705 | 0 | let res1prev1 = res1.half_shift_in_two_bytes(*prev1); |
706 | 0 | let res2prev2 = res2.half_shift_in_one_byte(*prev2); |
707 | 0 | let res = res0prev0.and(res1prev1).and(res2prev2).and(res3); |
708 | 0 | *prev0 = res0; |
709 | 0 | *prev1 = res1; |
710 | 0 | *prev2 = res2; |
711 | 0 | res |
712 | 0 | } |
713 | | } |
714 | | |
715 | | /// The common elements of all "slim" and "fat" Teddy search implementations. |
716 | | /// |
717 | | /// Essentially, this contains the patterns and the buckets. Namely, it |
718 | | /// contains enough to implement the verification step after candidates are |
719 | | /// identified via the shuffle masks. |
720 | | /// |
721 | | /// It is generic over the number of buckets used. In general, the number of |
722 | | /// buckets is either 8 (for "slim" Teddy) or 16 (for "fat" Teddy). The generic |
723 | | /// parameter isn't really meant to be instantiated for any value other than |
724 | | /// 8 or 16, although it is technically possible. The main hiccup is that there |
725 | | /// is some bit-shifting done in the critical part of verification that could |
726 | | /// be quite expensive if `N` is not a multiple of 2. |
727 | | #[derive(Clone, Debug)] |
728 | | struct Teddy<const BUCKETS: usize> { |
729 | | /// The patterns we are searching for. |
730 | | /// |
731 | | /// A pattern string can be found by its `PatternID`. |
732 | | patterns: Arc<Patterns>, |
733 | | /// The allocation of patterns in buckets. This only contains the IDs of |
734 | | /// patterns. In order to do full verification, callers must provide the |
735 | | /// actual patterns when using Teddy. |
736 | | buckets: [Vec<PatternID>; BUCKETS], |
737 | | // N.B. The above representation is very simple, but it definitely results |
738 | | // in ping-ponging between different allocations during verification. I've |
739 | | // tried experimenting with other representations that flatten the pattern |
740 | | // strings into a single allocation, but it doesn't seem to help much. |
741 | | // Probably everything is small enough to fit into cache anyway, and so the |
742 | | // pointer chasing isn't a big deal? |
743 | | // |
744 | | // One other avenue I haven't explored is some kind of hashing trick |
745 | | // that let's us do another high-confidence check before launching into |
746 | | // `memcmp`. |
747 | | } |
748 | | |
749 | | impl<const BUCKETS: usize> Teddy<BUCKETS> { |
750 | | /// Create a new generic data structure for Teddy verification. |
751 | 0 | fn new(patterns: Arc<Patterns>) -> Teddy<BUCKETS> { |
752 | 0 | assert_ne!(0, patterns.len(), "Teddy requires at least one pattern"); |
753 | 0 | assert_ne!( |
754 | 0 | 0, |
755 | 0 | patterns.minimum_len(), |
756 | 0 | "Teddy does not support zero-length patterns" |
757 | | ); |
758 | 0 | assert!( |
759 | 0 | BUCKETS == 8 || BUCKETS == 16, |
760 | 0 | "Teddy only supports 8 or 16 buckets" |
761 | | ); |
762 | | // MSRV(1.63): Use core::array::from_fn below instead of allocating a |
763 | | // superfluous outer Vec. Not a big deal (especially given the BTreeMap |
764 | | // allocation below), but nice to not do it. |
765 | 0 | let buckets = |
766 | 0 | <[Vec<PatternID>; BUCKETS]>::try_from(vec![vec![]; BUCKETS]) |
767 | 0 | .unwrap(); |
768 | 0 | let mut t = Teddy { patterns, buckets }; |
769 | 0 |
|
770 | 0 | let mut map: BTreeMap<Box<[u8]>, usize> = BTreeMap::new(); |
771 | 0 | for (id, pattern) in t.patterns.iter() { |
772 | | // We try to be slightly clever in how we assign patterns into |
773 | | // buckets. Generally speaking, we want patterns with the same |
774 | | // prefix to be in the same bucket, since it minimizes the amount |
775 | | // of time we spend churning through buckets in the verification |
776 | | // step. |
777 | | // |
778 | | // So we could assign patterns with the same N-prefix (where N is |
779 | | // the size of the mask, which is one of {1, 2, 3}) to the same |
780 | | // bucket. However, case insensitive searches are fairly common, so |
781 | | // we'd for example, ideally want to treat `abc` and `ABC` as if |
782 | | // they shared the same prefix. ASCII has the nice property that |
783 | | // the lower 4 bits of A and a are the same, so we therefore group |
784 | | // patterns with the same low-nybble-N-prefix into the same bucket. |
785 | | // |
786 | | // MOREOVER, this is actually necessary for correctness! In |
787 | | // particular, by grouping patterns with the same prefix into the |
788 | | // same bucket, we ensure that we preserve correct leftmost-first |
789 | | // and leftmost-longest match semantics. In addition to the fact |
790 | | // that `patterns.iter()` iterates in the correct order, this |
791 | | // guarantees that all possible ambiguous matches will occur in |
792 | | // the same bucket. The verification routine could be adjusted to |
793 | | // support correct leftmost match semantics regardless of bucket |
794 | | // allocation, but that results in a performance hit. It's much |
795 | | // nicer to be able to just stop as soon as a match is found. |
796 | 0 | let lonybs = pattern.low_nybbles(t.mask_len()); |
797 | 0 | if let Some(&bucket) = map.get(&lonybs) { |
798 | 0 | t.buckets[bucket].push(id); |
799 | 0 | } else { |
800 | 0 | // N.B. We assign buckets in reverse because it shouldn't have |
801 | 0 | // any influence on performance, but it does make it harder to |
802 | 0 | // get leftmost match semantics accidentally correct. |
803 | 0 | let bucket = (BUCKETS - 1) - (id.as_usize() % BUCKETS); |
804 | 0 | t.buckets[bucket].push(id); |
805 | 0 | map.insert(lonybs, bucket); |
806 | 0 | } |
807 | | } |
808 | 0 | t |
809 | 0 | } |
810 | | |
811 | | /// Verify whether there are any matches starting at or after `cur` in the |
812 | | /// haystack. The candidate chunk given should correspond to 8-bit bitsets |
813 | | /// for N buckets. |
814 | | /// |
815 | | /// # Safety |
816 | | /// |
817 | | /// The given pointers representing the haystack must be valid to read |
818 | | /// from. |
819 | | #[inline(always)] |
820 | 0 | unsafe fn verify64( |
821 | 0 | &self, |
822 | 0 | cur: *const u8, |
823 | 0 | end: *const u8, |
824 | 0 | mut candidate_chunk: u64, |
825 | 0 | ) -> Option<Match> { |
826 | 0 | while candidate_chunk != 0 { |
827 | 0 | let bit = candidate_chunk.trailing_zeros().as_usize(); |
828 | 0 | candidate_chunk &= !(1 << bit); |
829 | 0 |
|
830 | 0 | let cur = cur.add(bit / BUCKETS); |
831 | 0 | let bucket = bit % BUCKETS; |
832 | 0 | if let Some(m) = self.verify_bucket(cur, end, bucket) { |
833 | 0 | return Some(m); |
834 | 0 | } |
835 | | } |
836 | 0 | None |
837 | 0 | } |
838 | | |
839 | | /// Verify whether there are any matches starting at `at` in the given |
840 | | /// `haystack` corresponding only to patterns in the given bucket. |
841 | | /// |
842 | | /// # Safety |
843 | | /// |
844 | | /// The given pointers representing the haystack must be valid to read |
845 | | /// from. |
846 | | /// |
847 | | /// The bucket index must be less than or equal to `self.buckets.len()`. |
848 | | #[inline(always)] |
849 | 0 | unsafe fn verify_bucket( |
850 | 0 | &self, |
851 | 0 | cur: *const u8, |
852 | 0 | end: *const u8, |
853 | 0 | bucket: usize, |
854 | 0 | ) -> Option<Match> { |
855 | 0 | debug_assert!(bucket < self.buckets.len()); |
856 | | // SAFETY: The caller must ensure that the bucket index is correct. |
857 | 0 | for pid in self.buckets.get_unchecked(bucket).iter().copied() { |
858 | | // SAFETY: This is safe because we are guaranteed that every |
859 | | // index in a Teddy bucket is a valid index into `pats`, by |
860 | | // construction. |
861 | 0 | debug_assert!(pid.as_usize() < self.patterns.len()); |
862 | 0 | let pat = self.patterns.get_unchecked(pid); |
863 | 0 | if pat.is_prefix_raw(cur, end) { |
864 | 0 | let start = cur; |
865 | 0 | let end = start.add(pat.len()); |
866 | 0 | return Some(Match { pid, start, end }); |
867 | 0 | } |
868 | | } |
869 | 0 | None |
870 | 0 | } |
871 | | |
872 | | /// Returns the total number of masks required by the patterns in this |
873 | | /// Teddy searcher. |
874 | | /// |
875 | | /// Basically, the mask length corresponds to the type of Teddy searcher |
876 | | /// to use: a 1-byte, 2-byte, 3-byte or 4-byte searcher. The bigger the |
877 | | /// better, typically, since searching for longer substrings usually |
878 | | /// decreases the rate of false positives. Therefore, the number of masks |
879 | | /// needed is the length of the shortest pattern in this searcher. If the |
880 | | /// length of the shortest pattern (in bytes) is bigger than 4, then the |
881 | | /// mask length is 4 since there are no Teddy searchers for more than 4 |
882 | | /// bytes. |
883 | 0 | fn mask_len(&self) -> usize { |
884 | 0 | core::cmp::min(4, self.patterns.minimum_len()) |
885 | 0 | } |
886 | | |
887 | | /// Returns the approximate total amount of heap used by this type, in |
888 | | /// units of bytes. |
889 | 0 | fn memory_usage(&self) -> usize { |
890 | 0 | // This is an upper bound rather than a precise accounting. No |
891 | 0 | // particular reason, other than it's probably very close to actual |
892 | 0 | // memory usage in practice. |
893 | 0 | self.patterns.len() * core::mem::size_of::<PatternID>() |
894 | 0 | } |
895 | | } |
896 | | |
897 | | impl Teddy<8> { |
898 | | /// Runs the verification routine for "slim" Teddy. |
899 | | /// |
900 | | /// The candidate given should be a collection of 8-bit bitsets (one bitset |
901 | | /// per lane), where the ith bit is set in the jth lane if and only if the |
902 | | /// byte occurring at `at + j` in `cur` is in the bucket `i`. |
903 | | /// |
904 | | /// # Safety |
905 | | /// |
906 | | /// Callers must ensure that this is okay to call in the current target for |
907 | | /// the current CPU. |
908 | | /// |
909 | | /// The given pointers must be valid to read from. |
910 | | #[inline(always)] |
911 | 0 | unsafe fn verify<V: Vector>( |
912 | 0 | &self, |
913 | 0 | mut cur: *const u8, |
914 | 0 | end: *const u8, |
915 | 0 | candidate: V, |
916 | 0 | ) -> Option<Match> { |
917 | 0 | debug_assert!(!candidate.is_zero()); |
918 | | // Convert the candidate into 64-bit chunks, and then verify each of |
919 | | // those chunks. |
920 | 0 | candidate.for_each_64bit_lane( |
921 | 0 | #[inline(always)] |
922 | 0 | |_, chunk| { |
923 | 0 | let result = self.verify64(cur, end, chunk); |
924 | 0 | cur = cur.add(8); |
925 | 0 | result |
926 | 0 | }, |
927 | 0 | ) |
928 | 0 | } |
929 | | } |
930 | | |
931 | | impl Teddy<16> { |
932 | | /// Runs the verification routine for "fat" Teddy. |
933 | | /// |
934 | | /// The candidate given should be a collection of 8-bit bitsets (one bitset |
935 | | /// per lane), where the ith bit is set in the jth lane if and only if the |
936 | | /// byte occurring at `at + (j < 16 ? j : j - 16)` in `cur` is in the |
937 | | /// bucket `j < 16 ? i : i + 8`. |
938 | | /// |
939 | | /// # Safety |
940 | | /// |
941 | | /// Callers must ensure that this is okay to call in the current target for |
942 | | /// the current CPU. |
943 | | /// |
944 | | /// The given pointers must be valid to read from. |
945 | | #[inline(always)] |
946 | 0 | unsafe fn verify<V: FatVector>( |
947 | 0 | &self, |
948 | 0 | mut cur: *const u8, |
949 | 0 | end: *const u8, |
950 | 0 | candidate: V, |
951 | 0 | ) -> Option<Match> { |
952 | 0 | // This is a bit tricky, but we basically want to convert our |
953 | 0 | // candidate, which looks like this (assuming a 256-bit vector): |
954 | 0 | // |
955 | 0 | // a31 a30 ... a17 a16 a15 a14 ... a01 a00 |
956 | 0 | // |
957 | 0 | // where each a(i) is an 8-bit bitset corresponding to the activated |
958 | 0 | // buckets, to this |
959 | 0 | // |
960 | 0 | // a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00 |
961 | 0 | // |
962 | 0 | // Namely, for Fat Teddy, the high 128-bits of the candidate correspond |
963 | 0 | // to the same bytes in the haystack in the low 128-bits (so we only |
964 | 0 | // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7. |
965 | 0 | // |
966 | 0 | // The verification routine wants to look at all potentially matching |
967 | 0 | // buckets before moving on to the next lane. So for example, both |
968 | 0 | // a16 and a00 both correspond to the first byte in our window; a00 |
969 | 0 | // contains buckets 0-7 and a16 contains buckets 8-15. Specifically, |
970 | 0 | // a16 should be checked before a01. So the transformation shown above |
971 | 0 | // allows us to use our normal verification procedure with one small |
972 | 0 | // change: we treat each bitset as 16 bits instead of 8 bits. |
973 | 0 | debug_assert!(!candidate.is_zero()); |
974 | | |
975 | | // Swap the 128-bit lanes in the candidate vector. |
976 | 0 | let swapped = candidate.swap_halves(); |
977 | 0 | // Interleave the bytes from the low 128-bit lanes, starting with |
978 | 0 | // cand first. |
979 | 0 | let r1 = candidate.interleave_low_8bit_lanes(swapped); |
980 | 0 | // Interleave the bytes from the high 128-bit lanes, starting with |
981 | 0 | // cand first. |
982 | 0 | let r2 = candidate.interleave_high_8bit_lanes(swapped); |
983 | 0 | // Now just take the 2 low 64-bit integers from both r1 and r2. We |
984 | 0 | // can drop the high 64-bit integers because they are a mirror image |
985 | 0 | // of the low 64-bit integers. All we care about are the low 128-bit |
986 | 0 | // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets |
987 | 0 | // laid out in the desired order, as described above. |
988 | 0 | r1.for_each_low_64bit_lane( |
989 | 0 | r2, |
990 | 0 | #[inline(always)] |
991 | 0 | |_, chunk| { |
992 | 0 | let result = self.verify64(cur, end, chunk); |
993 | 0 | cur = cur.add(4); |
994 | 0 | result |
995 | 0 | }, |
996 | 0 | ) |
997 | 0 | } |
998 | | } |
999 | | |
1000 | | /// A vector generic mask for the low and high nybbles in a set of patterns. |
1001 | | /// Each 8-bit lane `j` in a vector corresponds to a bitset where the `i`th bit |
1002 | | /// is set if and only if the nybble `j` is in the bucket `i` at a particular |
1003 | | /// position. |
1004 | | /// |
1005 | | /// This is slightly tweaked dependending on whether Slim or Fat Teddy is being |
1006 | | /// used. For Slim Teddy, the bitsets in the lower half are the same as the |
1007 | | /// bitsets in the higher half, so that we can search `V::BYTES` bytes at a |
1008 | | /// time. (Remember, the nybbles in the haystack are used as indices into these |
1009 | | /// masks, and 256-bit shuffles only operate on 128-bit lanes.) |
1010 | | /// |
1011 | | /// For Fat Teddy, the bitsets are not repeated, but instead, the high half |
1012 | | /// bits correspond to an addition 8 buckets. So that a bitset `00100010` has |
1013 | | /// buckets 1 and 5 set if it's in the lower half, but has buckets 9 and 13 set |
1014 | | /// if it's in the higher half. |
1015 | | #[derive(Clone, Copy, Debug)] |
1016 | | struct Mask<V> { |
1017 | | lo: V, |
1018 | | hi: V, |
1019 | | } |
1020 | | |
1021 | | impl<V: Vector> Mask<V> { |
1022 | | /// Return a candidate for Teddy (fat or slim) that is searching for 1-byte |
1023 | | /// candidates. |
1024 | | /// |
1025 | | /// If a candidate is returned, it will be a collection of 8-bit bitsets |
1026 | | /// (one bitset per lane), where the ith bit is set in the jth lane if and |
1027 | | /// only if the byte occurring at the jth lane in `chunk` is in the bucket |
1028 | | /// `i`. If no candidate is found, then the vector returned will have all |
1029 | | /// lanes set to zero. |
1030 | | /// |
1031 | | /// `chunk` should correspond to a `V::BYTES` window of the haystack (where |
1032 | | /// the least significant byte corresponds to the start of the window). For |
1033 | | /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with |
1034 | | /// the window repeated in each half of the vector. |
1035 | | /// |
1036 | | /// `mask1` should correspond to a low/high mask for the first byte of all |
1037 | | /// patterns that are being searched. |
1038 | | #[inline(always)] |
1039 | 0 | unsafe fn members1(chunk: V, masks: [Mask<V>; 1]) -> V { |
1040 | 0 | let lomask = V::splat(0xF); |
1041 | 0 | let hlo = chunk.and(lomask); |
1042 | 0 | let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); |
1043 | 0 | let locand = masks[0].lo.shuffle_bytes(hlo); |
1044 | 0 | let hicand = masks[0].hi.shuffle_bytes(hhi); |
1045 | 0 | locand.and(hicand) |
1046 | 0 | } |
1047 | | |
1048 | | /// Return a candidate for Teddy (fat or slim) that is searching for 2-byte |
1049 | | /// candidates. |
1050 | | /// |
1051 | | /// If candidates are returned, each will be a collection of 8-bit bitsets |
1052 | | /// (one bitset per lane), where the ith bit is set in the jth lane if and |
1053 | | /// only if the byte occurring at the jth lane in `chunk` is in the bucket |
1054 | | /// `i`. Each candidate returned corresponds to the first and second bytes |
1055 | | /// of the patterns being searched. If no candidate is found, then all of |
1056 | | /// the lanes will be set to zero in at least one of the vectors returned. |
1057 | | /// |
1058 | | /// `chunk` should correspond to a `V::BYTES` window of the haystack (where |
1059 | | /// the least significant byte corresponds to the start of the window). For |
1060 | | /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with |
1061 | | /// the window repeated in each half of the vector. |
1062 | | /// |
1063 | | /// The masks should correspond to the masks computed for the first and |
1064 | | /// second bytes of all patterns that are being searched. |
1065 | | #[inline(always)] |
1066 | 0 | unsafe fn members2(chunk: V, masks: [Mask<V>; 2]) -> (V, V) { |
1067 | 0 | let lomask = V::splat(0xF); |
1068 | 0 | let hlo = chunk.and(lomask); |
1069 | 0 | let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); |
1070 | 0 |
|
1071 | 0 | let locand1 = masks[0].lo.shuffle_bytes(hlo); |
1072 | 0 | let hicand1 = masks[0].hi.shuffle_bytes(hhi); |
1073 | 0 | let cand1 = locand1.and(hicand1); |
1074 | 0 |
|
1075 | 0 | let locand2 = masks[1].lo.shuffle_bytes(hlo); |
1076 | 0 | let hicand2 = masks[1].hi.shuffle_bytes(hhi); |
1077 | 0 | let cand2 = locand2.and(hicand2); |
1078 | 0 |
|
1079 | 0 | (cand1, cand2) |
1080 | 0 | } |
1081 | | |
1082 | | /// Return a candidate for Teddy (fat or slim) that is searching for 3-byte |
1083 | | /// candidates. |
1084 | | /// |
1085 | | /// If candidates are returned, each will be a collection of 8-bit bitsets |
1086 | | /// (one bitset per lane), where the ith bit is set in the jth lane if and |
1087 | | /// only if the byte occurring at the jth lane in `chunk` is in the bucket |
1088 | | /// `i`. Each candidate returned corresponds to the first, second and third |
1089 | | /// bytes of the patterns being searched. If no candidate is found, then |
1090 | | /// all of the lanes will be set to zero in at least one of the vectors |
1091 | | /// returned. |
1092 | | /// |
1093 | | /// `chunk` should correspond to a `V::BYTES` window of the haystack (where |
1094 | | /// the least significant byte corresponds to the start of the window). For |
1095 | | /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with |
1096 | | /// the window repeated in each half of the vector. |
1097 | | /// |
1098 | | /// The masks should correspond to the masks computed for the first, second |
1099 | | /// and third bytes of all patterns that are being searched. |
1100 | | #[inline(always)] |
1101 | 0 | unsafe fn members3(chunk: V, masks: [Mask<V>; 3]) -> (V, V, V) { |
1102 | 0 | let lomask = V::splat(0xF); |
1103 | 0 | let hlo = chunk.and(lomask); |
1104 | 0 | let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); |
1105 | 0 |
|
1106 | 0 | let locand1 = masks[0].lo.shuffle_bytes(hlo); |
1107 | 0 | let hicand1 = masks[0].hi.shuffle_bytes(hhi); |
1108 | 0 | let cand1 = locand1.and(hicand1); |
1109 | 0 |
|
1110 | 0 | let locand2 = masks[1].lo.shuffle_bytes(hlo); |
1111 | 0 | let hicand2 = masks[1].hi.shuffle_bytes(hhi); |
1112 | 0 | let cand2 = locand2.and(hicand2); |
1113 | 0 |
|
1114 | 0 | let locand3 = masks[2].lo.shuffle_bytes(hlo); |
1115 | 0 | let hicand3 = masks[2].hi.shuffle_bytes(hhi); |
1116 | 0 | let cand3 = locand3.and(hicand3); |
1117 | 0 |
|
1118 | 0 | (cand1, cand2, cand3) |
1119 | 0 | } |
1120 | | |
1121 | | /// Return a candidate for Teddy (fat or slim) that is searching for 4-byte |
1122 | | /// candidates. |
1123 | | /// |
1124 | | /// If candidates are returned, each will be a collection of 8-bit bitsets |
1125 | | /// (one bitset per lane), where the ith bit is set in the jth lane if and |
1126 | | /// only if the byte occurring at the jth lane in `chunk` is in the bucket |
1127 | | /// `i`. Each candidate returned corresponds to the first, second, third |
1128 | | /// and fourth bytes of the patterns being searched. If no candidate is |
1129 | | /// found, then all of the lanes will be set to zero in at least one of the |
1130 | | /// vectors returned. |
1131 | | /// |
1132 | | /// `chunk` should correspond to a `V::BYTES` window of the haystack (where |
1133 | | /// the least significant byte corresponds to the start of the window). For |
1134 | | /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with |
1135 | | /// the window repeated in each half of the vector. |
1136 | | /// |
1137 | | /// The masks should correspond to the masks computed for the first, |
1138 | | /// second, third and fourth bytes of all patterns that are being searched. |
1139 | | #[inline(always)] |
1140 | 0 | unsafe fn members4(chunk: V, masks: [Mask<V>; 4]) -> (V, V, V, V) { |
1141 | 0 | let lomask = V::splat(0xF); |
1142 | 0 | let hlo = chunk.and(lomask); |
1143 | 0 | let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); |
1144 | 0 |
|
1145 | 0 | let locand1 = masks[0].lo.shuffle_bytes(hlo); |
1146 | 0 | let hicand1 = masks[0].hi.shuffle_bytes(hhi); |
1147 | 0 | let cand1 = locand1.and(hicand1); |
1148 | 0 |
|
1149 | 0 | let locand2 = masks[1].lo.shuffle_bytes(hlo); |
1150 | 0 | let hicand2 = masks[1].hi.shuffle_bytes(hhi); |
1151 | 0 | let cand2 = locand2.and(hicand2); |
1152 | 0 |
|
1153 | 0 | let locand3 = masks[2].lo.shuffle_bytes(hlo); |
1154 | 0 | let hicand3 = masks[2].hi.shuffle_bytes(hhi); |
1155 | 0 | let cand3 = locand3.and(hicand3); |
1156 | 0 |
|
1157 | 0 | let locand4 = masks[3].lo.shuffle_bytes(hlo); |
1158 | 0 | let hicand4 = masks[3].hi.shuffle_bytes(hhi); |
1159 | 0 | let cand4 = locand4.and(hicand4); |
1160 | 0 |
|
1161 | 0 | (cand1, cand2, cand3, cand4) |
1162 | 0 | } |
1163 | | } |
1164 | | |
1165 | | /// Represents the low and high nybble masks that will be used during |
1166 | | /// search. Each mask is 32 bytes wide, although only the first 16 bytes are |
1167 | | /// used for 128-bit vectors. |
1168 | | /// |
1169 | | /// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set |
1170 | | /// if and only if the corresponding nybble is in the ith bucket. The index of |
1171 | | /// the byte (0-15, inclusive) corresponds to the nybble. |
1172 | | /// |
1173 | | /// Each mask is used as the target of a shuffle, where the indices for the |
1174 | | /// shuffle are taken from the haystack. AND'ing the shuffles for both the |
1175 | | /// low and high masks together also results in 8-bit bitsets, but where bit |
1176 | | /// `i` is set if and only if the correspond *byte* is in the ith bucket. |
1177 | | #[derive(Clone, Default)] |
1178 | | struct SlimMaskBuilder { |
1179 | | lo: [u8; 32], |
1180 | | hi: [u8; 32], |
1181 | | } |
1182 | | |
1183 | | impl SlimMaskBuilder { |
1184 | | /// Update this mask by adding the given byte to the given bucket. The |
1185 | | /// given bucket must be in the range 0-7. |
1186 | | /// |
1187 | | /// # Panics |
1188 | | /// |
1189 | | /// When `bucket >= 8`. |
1190 | 0 | fn add(&mut self, bucket: usize, byte: u8) { |
1191 | 0 | assert!(bucket < 8); |
1192 | | |
1193 | 0 | let bucket = u8::try_from(bucket).unwrap(); |
1194 | 0 | let byte_lo = usize::from(byte & 0xF); |
1195 | 0 | let byte_hi = usize::from((byte >> 4) & 0xF); |
1196 | 0 | // When using 256-bit vectors, we need to set this bucket assignment in |
1197 | 0 | // the low and high 128-bit portions of the mask. This allows us to |
1198 | 0 | // process 32 bytes at a time. Namely, AVX2 shuffles operate on each |
1199 | 0 | // of the 128-bit lanes, rather than the full 256-bit vector at once. |
1200 | 0 | self.lo[byte_lo] |= 1 << bucket; |
1201 | 0 | self.lo[byte_lo + 16] |= 1 << bucket; |
1202 | 0 | self.hi[byte_hi] |= 1 << bucket; |
1203 | 0 | self.hi[byte_hi + 16] |= 1 << bucket; |
1204 | 0 | } |
1205 | | |
1206 | | /// Turn this builder into a vector mask. |
1207 | | /// |
1208 | | /// # Panics |
1209 | | /// |
1210 | | /// When `V` represents a vector bigger than what `MaskBytes` can contain. |
1211 | | /// |
1212 | | /// # Safety |
1213 | | /// |
1214 | | /// Callers must ensure that this is okay to call in the current target for |
1215 | | /// the current CPU. |
1216 | | #[inline(always)] |
1217 | 0 | unsafe fn build<V: Vector>(&self) -> Mask<V> { |
1218 | 0 | assert!(V::BYTES <= self.lo.len()); |
1219 | 0 | assert!(V::BYTES <= self.hi.len()); |
1220 | 0 | Mask { |
1221 | 0 | lo: V::load_unaligned(self.lo[..].as_ptr()), |
1222 | 0 | hi: V::load_unaligned(self.hi[..].as_ptr()), |
1223 | 0 | } |
1224 | 0 | } |
1225 | | |
1226 | | /// A convenience function for building `N` vector masks from a slim |
1227 | | /// `Teddy` value. |
1228 | | /// |
1229 | | /// # Panics |
1230 | | /// |
1231 | | /// When `V` represents a vector bigger than what `MaskBytes` can contain. |
1232 | | /// |
1233 | | /// # Safety |
1234 | | /// |
1235 | | /// Callers must ensure that this is okay to call in the current target for |
1236 | | /// the current CPU. |
1237 | | #[inline(always)] |
1238 | 0 | unsafe fn from_teddy<const BYTES: usize, V: Vector>( |
1239 | 0 | teddy: &Teddy<8>, |
1240 | 0 | ) -> [Mask<V>; BYTES] { |
1241 | 0 | // MSRV(1.63): Use core::array::from_fn to just build the array here |
1242 | 0 | // instead of creating a vector and turning it into an array. |
1243 | 0 | let mut mask_builders = vec![SlimMaskBuilder::default(); BYTES]; |
1244 | 0 | for (bucket_index, bucket) in teddy.buckets.iter().enumerate() { |
1245 | 0 | for pid in bucket.iter().copied() { |
1246 | 0 | let pat = teddy.patterns.get(pid); |
1247 | 0 | for (i, builder) in mask_builders.iter_mut().enumerate() { |
1248 | 0 | builder.add(bucket_index, pat.bytes()[i]); |
1249 | 0 | } |
1250 | | } |
1251 | | } |
1252 | 0 | let array = |
1253 | 0 | <[SlimMaskBuilder; BYTES]>::try_from(mask_builders).unwrap(); |
1254 | 0 | array.map(|builder| builder.build()) |
1255 | 0 | } |
1256 | | } |
1257 | | |
1258 | | impl Debug for SlimMaskBuilder { |
1259 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
1260 | 0 | let (mut parts_lo, mut parts_hi) = (vec![], vec![]); |
1261 | 0 | for i in 0..32 { |
1262 | 0 | parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i])); |
1263 | 0 | parts_hi.push(format!("{:02}: {:08b}", i, self.hi[i])); |
1264 | 0 | } |
1265 | 0 | f.debug_struct("SlimMaskBuilder") |
1266 | 0 | .field("lo", &parts_lo) |
1267 | 0 | .field("hi", &parts_hi) |
1268 | 0 | .finish() |
1269 | 0 | } |
1270 | | } |
1271 | | |
1272 | | /// Represents the low and high nybble masks that will be used during "fat" |
1273 | | /// Teddy search. |
1274 | | /// |
1275 | | /// Each mask is 32 bytes wide, and at the time of writing, only 256-bit vectors |
1276 | | /// support fat Teddy. |
1277 | | /// |
1278 | | /// A fat Teddy mask is like a slim Teddy mask, except that instead of |
1279 | | /// repeating the bitsets in the high and low 128-bits in 256-bit vectors, the |
1280 | | /// high and low 128-bit halves each represent distinct buckets. (Bringing the |
1281 | | /// total to 16 instead of 8.) This permits spreading the patterns out a bit |
1282 | | /// more and thus putting less pressure on verification to be fast. |
1283 | | /// |
1284 | | /// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set |
1285 | | /// if and only if the corresponding nybble is in the ith bucket. The index of |
1286 | | /// the byte (0-15, inclusive) corresponds to the nybble. |
1287 | | #[derive(Clone, Copy, Default)] |
1288 | | struct FatMaskBuilder { |
1289 | | lo: [u8; 32], |
1290 | | hi: [u8; 32], |
1291 | | } |
1292 | | |
1293 | | impl FatMaskBuilder { |
1294 | | /// Update this mask by adding the given byte to the given bucket. The |
1295 | | /// given bucket must be in the range 0-15. |
1296 | | /// |
1297 | | /// # Panics |
1298 | | /// |
1299 | | /// When `bucket >= 16`. |
1300 | 0 | fn add(&mut self, bucket: usize, byte: u8) { |
1301 | 0 | assert!(bucket < 16); |
1302 | | |
1303 | 0 | let bucket = u8::try_from(bucket).unwrap(); |
1304 | 0 | let byte_lo = usize::from(byte & 0xF); |
1305 | 0 | let byte_hi = usize::from((byte >> 4) & 0xF); |
1306 | 0 | // Unlike slim teddy, fat teddy only works with AVX2. For fat teddy, |
1307 | 0 | // the high 128 bits of our mask correspond to buckets 8-15, while the |
1308 | 0 | // low 128 bits correspond to buckets 0-7. |
1309 | 0 | if bucket < 8 { |
1310 | 0 | self.lo[byte_lo] |= 1 << bucket; |
1311 | 0 | self.hi[byte_hi] |= 1 << bucket; |
1312 | 0 | } else { |
1313 | 0 | self.lo[byte_lo + 16] |= 1 << (bucket % 8); |
1314 | 0 | self.hi[byte_hi + 16] |= 1 << (bucket % 8); |
1315 | 0 | } |
1316 | 0 | } |
1317 | | |
1318 | | /// Turn this builder into a vector mask. |
1319 | | /// |
1320 | | /// # Panics |
1321 | | /// |
1322 | | /// When `V` represents a vector bigger than what `MaskBytes` can contain. |
1323 | | /// |
1324 | | /// # Safety |
1325 | | /// |
1326 | | /// Callers must ensure that this is okay to call in the current target for |
1327 | | /// the current CPU. |
1328 | | #[inline(always)] |
1329 | 0 | unsafe fn build<V: Vector>(&self) -> Mask<V> { |
1330 | 0 | assert!(V::BYTES <= self.lo.len()); |
1331 | 0 | assert!(V::BYTES <= self.hi.len()); |
1332 | 0 | Mask { |
1333 | 0 | lo: V::load_unaligned(self.lo[..].as_ptr()), |
1334 | 0 | hi: V::load_unaligned(self.hi[..].as_ptr()), |
1335 | 0 | } |
1336 | 0 | } |
1337 | | |
1338 | | /// A convenience function for building `N` vector masks from a fat |
1339 | | /// `Teddy` value. |
1340 | | /// |
1341 | | /// # Panics |
1342 | | /// |
1343 | | /// When `V` represents a vector bigger than what `MaskBytes` can contain. |
1344 | | /// |
1345 | | /// # Safety |
1346 | | /// |
1347 | | /// Callers must ensure that this is okay to call in the current target for |
1348 | | /// the current CPU. |
1349 | | #[inline(always)] |
1350 | 0 | unsafe fn from_teddy<const BYTES: usize, V: Vector>( |
1351 | 0 | teddy: &Teddy<16>, |
1352 | 0 | ) -> [Mask<V>; BYTES] { |
1353 | 0 | // MSRV(1.63): Use core::array::from_fn to just build the array here |
1354 | 0 | // instead of creating a vector and turning it into an array. |
1355 | 0 | let mut mask_builders = vec![FatMaskBuilder::default(); BYTES]; |
1356 | 0 | for (bucket_index, bucket) in teddy.buckets.iter().enumerate() { |
1357 | 0 | for pid in bucket.iter().copied() { |
1358 | 0 | let pat = teddy.patterns.get(pid); |
1359 | 0 | for (i, builder) in mask_builders.iter_mut().enumerate() { |
1360 | 0 | builder.add(bucket_index, pat.bytes()[i]); |
1361 | 0 | } |
1362 | | } |
1363 | | } |
1364 | 0 | let array = |
1365 | 0 | <[FatMaskBuilder; BYTES]>::try_from(mask_builders).unwrap(); |
1366 | 0 | array.map(|builder| builder.build()) |
1367 | 0 | } |
1368 | | } |
1369 | | |
1370 | | impl Debug for FatMaskBuilder { |
1371 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
1372 | 0 | let (mut parts_lo, mut parts_hi) = (vec![], vec![]); |
1373 | 0 | for i in 0..32 { |
1374 | 0 | parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i])); |
1375 | 0 | parts_hi.push(format!("{:02}: {:08b}", i, self.hi[i])); |
1376 | 0 | } |
1377 | 0 | f.debug_struct("FatMaskBuilder") |
1378 | 0 | .field("lo", &parts_lo) |
1379 | 0 | .field("hi", &parts_hi) |
1380 | 0 | .finish() |
1381 | 0 | } |
1382 | | } |