/build/cargo-vendor-dir/wast-231.0.0/src/lexer.rs
Line | Count | Source |
1 | | //! Definition of a lexer for the WebAssembly text format. |
2 | | //! |
3 | | //! This module provides a [`Lexer`][] type which is an iterate over the raw |
4 | | //! tokens of a WebAssembly text file. A [`Lexer`][] accounts for every single |
5 | | //! byte in a WebAssembly text field, returning tokens even for comments and |
6 | | //! whitespace. Typically you'll ignore comments and whitespace, however. |
7 | | //! |
8 | | //! If you'd like to iterate over the tokens in a file you can do so via: |
9 | | //! |
10 | | //! ``` |
11 | | //! # fn foo() -> Result<(), wast::Error> { |
12 | | //! use wast::lexer::Lexer; |
13 | | //! |
14 | | //! let wat = "(module (func $foo))"; |
15 | | //! for token in Lexer::new(wat).iter(0) { |
16 | | //! println!("{:?}", token?); |
17 | | //! } |
18 | | //! # Ok(()) |
19 | | //! # } |
20 | | //! ``` |
21 | | //! |
22 | | //! Note that you'll typically not use this module but will rather use |
23 | | //! [`ParseBuffer`](crate::parser::ParseBuffer) instead. |
24 | | //! |
25 | | //! [`Lexer`]: crate::lexer::Lexer |
26 | | |
27 | | use crate::token::Span; |
28 | | use crate::Error; |
29 | | use std::borrow::Cow; |
30 | | use std::char; |
31 | | use std::fmt; |
32 | | use std::slice; |
33 | | use std::str; |
34 | | use std::str::Utf8Error; |
35 | | |
36 | | /// A structure used to lex the s-expression syntax of WAT files. |
37 | | /// |
38 | | /// This structure is used to generate [`Token`] items, which should account for |
39 | | /// every single byte of the input as we iterate over it. A [`LexError`] is |
40 | | /// returned for any non-lexable text. |
41 | | #[derive(Clone)] |
42 | | pub struct Lexer<'a> { |
43 | | input: &'a str, |
44 | | allow_confusing_unicode: bool, |
45 | | } |
46 | | |
47 | | /// A single token parsed from a `Lexer`. |
48 | | #[derive(Copy, Clone, Debug, PartialEq)] |
49 | | pub struct Token { |
50 | | /// The kind of token this represents, such as whether it's whitespace, a |
51 | | /// keyword, etc. |
52 | | pub kind: TokenKind, |
53 | | /// The byte offset within the original source for where this token came |
54 | | /// from. |
55 | | pub offset: usize, |
56 | | /// The byte length of this token as it resides in the original source. |
57 | | // |
58 | | // NB: this is `u32` to enable packing `Token` into two pointers of size. |
59 | | // This does limit a single token to being at most 4G large, but that seems |
60 | | // probably ok. |
61 | | pub len: u32, |
62 | | } |
63 | | |
64 | | #[test] |
65 | | fn token_is_not_too_big() { |
66 | | assert!(std::mem::size_of::<Token>() <= std::mem::size_of::<u64>() * 2); |
67 | | } |
68 | | |
69 | | /// Classification of what was parsed from the input stream. |
70 | | /// |
71 | | /// This enumeration contains all kinds of fragments, including comments and |
72 | | /// whitespace. |
73 | | #[derive(Copy, Clone, Debug, PartialEq)] |
74 | | pub enum TokenKind { |
75 | | /// A line comment, preceded with `;;` |
76 | | LineComment, |
77 | | |
78 | | /// A block comment, surrounded by `(;` and `;)`. Note that these can be |
79 | | /// nested. |
80 | | BlockComment, |
81 | | |
82 | | /// A fragment of source that represents whitespace. |
83 | | Whitespace, |
84 | | |
85 | | /// A left-parenthesis, including the source text for where it comes from. |
86 | | LParen, |
87 | | /// A right-parenthesis, including the source text for where it comes from. |
88 | | RParen, |
89 | | |
90 | | /// A string literal, which is actually a list of bytes. |
91 | | String, |
92 | | |
93 | | /// An identifier (like `$foo`). |
94 | | /// |
95 | | /// All identifiers start with `$` and the payload here is the original |
96 | | /// source text. |
97 | | Id, |
98 | | |
99 | | /// A keyword, or something that starts with an alphabetic character. |
100 | | /// |
101 | | /// The payload here is the original source text. |
102 | | Keyword, |
103 | | |
104 | | /// An annotation (like `@foo`). |
105 | | /// |
106 | | /// All annotations start with `@` and the payload will be the name of the |
107 | | /// annotation. |
108 | | Annotation, |
109 | | |
110 | | /// A reserved series of `idchar` symbols. Unknown what this is meant to be |
111 | | /// used for, you'll probably generate an error about an unexpected token. |
112 | | Reserved, |
113 | | |
114 | | /// An integer. |
115 | | Integer(IntegerKind), |
116 | | |
117 | | /// A float. |
118 | | Float(FloatKind), |
119 | | } |
120 | | |
121 | | /// Description of the parsed integer from the source. |
122 | | #[derive(Copy, Clone, Debug, PartialEq)] |
123 | | pub struct IntegerKind { |
124 | | sign: Option<SignToken>, |
125 | | has_underscores: bool, |
126 | | hex: bool, |
127 | | } |
128 | | |
129 | | /// Description of a parsed float from the source. |
130 | | #[allow(missing_docs)] |
131 | | #[derive(Copy, Clone, Debug, PartialEq)] |
132 | | pub enum FloatKind { |
133 | | #[doc(hidden)] |
134 | | Inf { negative: bool }, |
135 | | #[doc(hidden)] |
136 | | Nan { negative: bool }, |
137 | | #[doc(hidden)] |
138 | | NanVal { |
139 | | negative: bool, |
140 | | has_underscores: bool, |
141 | | }, |
142 | | #[doc(hidden)] |
143 | | Normal { has_underscores: bool, hex: bool }, |
144 | | } |
145 | | |
146 | | enum ReservedKind { |
147 | | /// "..." |
148 | | String, |
149 | | /// anything that's just a sequence of `idchars!()` |
150 | | Idchars, |
151 | | /// $"..." |
152 | | IdString, |
153 | | /// @"..." |
154 | | AnnotationString, |
155 | | /// everything else (a conglomeration of strings, idchars, etc) |
156 | | Reserved, |
157 | | } |
158 | | |
159 | | /// Errors that can be generated while lexing. |
160 | | /// |
161 | | /// All lexing errors have line/colum/position information as well as a |
162 | | /// `LexError` indicating what kind of error happened while lexing. |
163 | | #[derive(Debug, Clone, PartialEq, Eq)] |
164 | | #[non_exhaustive] |
165 | | pub enum LexError { |
166 | | /// A dangling block comment was found with an unbalanced `(;` which was |
167 | | /// never terminated in the file. |
168 | | DanglingBlockComment, |
169 | | |
170 | | /// An unexpected character was encountered when generally parsing and |
171 | | /// looking for something else. |
172 | | Unexpected(char), |
173 | | |
174 | | /// An invalid `char` in a string literal was found. |
175 | | InvalidStringElement(char), |
176 | | |
177 | | /// An invalid string escape letter was found (the thing after the `\` in |
178 | | /// string literals) |
179 | | InvalidStringEscape(char), |
180 | | |
181 | | /// An invalid hexadecimal digit was found. |
182 | | InvalidHexDigit(char), |
183 | | |
184 | | /// An invalid base-10 digit was found. |
185 | | InvalidDigit(char), |
186 | | |
187 | | /// Parsing expected `wanted` but ended up finding `found` instead where the |
188 | | /// two characters aren't the same. |
189 | | Expected { |
190 | | /// The character that was expected to be found |
191 | | wanted: char, |
192 | | /// The character that was actually found |
193 | | found: char, |
194 | | }, |
195 | | |
196 | | /// We needed to parse more but EOF (or end of the string) was encountered. |
197 | | UnexpectedEof, |
198 | | |
199 | | /// A number failed to parse because it was too big to fit within the target |
200 | | /// type. |
201 | | NumberTooBig, |
202 | | |
203 | | /// An invalid unicode value was found in a `\u{...}` escape in a string, |
204 | | /// only valid unicode scalars can be escaped that way. |
205 | | InvalidUnicodeValue(u32), |
206 | | |
207 | | /// A lone underscore was found when parsing a number, since underscores |
208 | | /// should always be preceded and succeeded with a digit of some form. |
209 | | LoneUnderscore, |
210 | | |
211 | | /// A "confusing" unicode character is present in a comment or a string |
212 | | /// literal, such as a character that changes the direction text is |
213 | | /// typically displayed in editors. This could cause the human-read |
214 | | /// version to behave differently than the compiler-visible version, so |
215 | | /// these are simply rejected for now. |
216 | | ConfusingUnicode(char), |
217 | | |
218 | | /// An invalid utf-8 sequence was found in a quoted identifier, such as |
219 | | /// `$"\ff"`. |
220 | | InvalidUtf8Id(Utf8Error), |
221 | | |
222 | | /// An empty identifier was found, or a lone `$`. |
223 | | EmptyId, |
224 | | |
225 | | /// An empty identifier was found, or a lone `@`. |
226 | | EmptyAnnotation, |
227 | | } |
228 | | |
229 | | /// A sign token for an integer. |
230 | | #[derive(Clone, Copy, Debug, PartialEq, Eq)] |
231 | | pub enum SignToken { |
232 | | /// Plus sign: "+", |
233 | | Plus, |
234 | | /// Minus sign: "-", |
235 | | Minus, |
236 | | } |
237 | | |
238 | | /// A fully parsed integer from a source string with a payload ready to parse |
239 | | /// into an integral type. |
240 | | #[derive(Debug, PartialEq)] |
241 | | pub struct Integer<'a> { |
242 | | sign: Option<SignToken>, |
243 | | val: Cow<'a, str>, |
244 | | hex: bool, |
245 | | } |
246 | | |
247 | | /// Possible parsed float values |
248 | | #[derive(Debug, PartialEq, Eq)] |
249 | | pub enum Float<'a> { |
250 | | /// A float `NaN` representation |
251 | | Nan { |
252 | | /// The specific bits to encode for this float, optionally |
253 | | val: Option<Cow<'a, str>>, |
254 | | /// Whether or not this is a negative `NaN` or not. |
255 | | negative: bool, |
256 | | }, |
257 | | /// An float infinite representation, |
258 | | Inf { |
259 | | #[allow(missing_docs)] |
260 | | negative: bool, |
261 | | }, |
262 | | /// A parsed and separated floating point value |
263 | | Val { |
264 | | /// Whether or not the `integral` and `fractional` are specified in hex |
265 | | hex: bool, |
266 | | /// The float parts before the `.` |
267 | | integral: Cow<'a, str>, |
268 | | /// The float parts after the `.` |
269 | | fractional: Option<Cow<'a, str>>, |
270 | | /// The exponent to multiple this `integral.fractional` portion of the |
271 | | /// float by. If `hex` is true this is `2^exponent` and otherwise it's |
272 | | /// `10^exponent` |
273 | | exponent: Option<Cow<'a, str>>, |
274 | | }, |
275 | | } |
276 | | |
277 | | // https://webassembly.github.io/spec/core/text/values.html#text-idchar |
278 | | macro_rules! idchars { |
279 | | () => { |
280 | | b'0'..=b'9' |
281 | | | b'A'..=b'Z' |
282 | | | b'a'..=b'z' |
283 | | | b'!' |
284 | | | b'#' |
285 | | | b'$' |
286 | | | b'%' |
287 | | | b'&' |
288 | | | b'\'' |
289 | | | b'*' |
290 | | | b'+' |
291 | | | b'-' |
292 | | | b'.' |
293 | | | b'/' |
294 | | | b':' |
295 | | | b'<' |
296 | | | b'=' |
297 | | | b'>' |
298 | | | b'?' |
299 | | | b'@' |
300 | | | b'\\' |
301 | | | b'^' |
302 | | | b'_' |
303 | | | b'`' |
304 | | | b'|' |
305 | | | b'~' |
306 | | } |
307 | | } |
308 | | |
309 | | impl<'a> Lexer<'a> { |
310 | | /// Creates a new lexer which will lex the `input` source string. |
311 | 495 | pub fn new(input: &str) -> Lexer<'_> { |
312 | 495 | Lexer { |
313 | 495 | input, |
314 | 495 | allow_confusing_unicode: false, |
315 | 495 | } |
316 | 495 | } |
317 | | |
318 | | /// Returns the original source input that we're lexing. |
319 | 55.4k | pub fn input(&self) -> &'a str { |
320 | 55.4k | self.input |
321 | 55.4k | } |
322 | | |
323 | | /// Configures whether "confusing" unicode characters are allowed while |
324 | | /// lexing. |
325 | | /// |
326 | | /// If allowed then no error will happen if these characters are found, but |
327 | | /// otherwise if disallowed a lex error will be produced when these |
328 | | /// characters are found. Confusing characters are denied by default. |
329 | | /// |
330 | | /// For now "confusing characters" are primarily related to the "trojan |
331 | | /// source" problem where it refers to characters which cause humans to read |
332 | | /// text differently than this lexer, such as characters that alter the |
333 | | /// left-to-right display of the source code. |
334 | 0 | pub fn allow_confusing_unicode(&mut self, allow: bool) -> &mut Self { |
335 | 0 | self.allow_confusing_unicode = allow; |
336 | 0 | self |
337 | 0 | } |
338 | | |
339 | | /// Lexes the next at the byte position `pos` in the input. |
340 | | /// |
341 | | /// Returns `Some` if a token is found or `None` if we're at EOF. |
342 | | /// |
343 | | /// The `pos` argument will be updated to point to the next token on a |
344 | | /// successful parse. |
345 | | /// |
346 | | /// # Errors |
347 | | /// |
348 | | /// Returns an error if the input is malformed. |
349 | 158k | pub fn parse(&self, pos: &mut usize) -> Result<Option<Token>, Error> { |
350 | 158k | let offset = *pos; |
351 | 158k | Ok(match self.parse_kind(pos)? {
352 | 157k | Some(kind) => Some(Token { |
353 | 157k | kind, |
354 | 157k | offset, |
355 | 157k | len: (*pos - offset).try_into().unwrap(), |
356 | 157k | }), |
357 | 990 | None => None, |
358 | | }) |
359 | 158k | } |
360 | | |
361 | 158k | fn parse_kind(&self, pos: &mut usize) -> Result<Option<TokenKind>, Error> { |
362 | 158k | let start = *pos; |
363 | 158k | // This `match` generally parses the grammar specified at |
364 | 158k | // |
365 | 158k | // https://webassembly.github.io/spec/core/text/lexical.html#text-token |
366 | 158k | let remaining = &self.input.as_bytes()[start..]; |
367 | 158k | let byte = match remaining.first() {
368 | 157k | Some(b) => b, |
369 | 990 | None => return Ok(None), |
370 | | }; |
371 | | |
372 | 157k | match byte { |
373 | | // Open-parens check the next character to see if this is the start |
374 | | // of a block comment, otherwise it's just a bland left-paren |
375 | | // token. |
376 | 25.9k | b'(' => match remaining.get(1) { |
377 | | Some(b';') => { |
378 | 0 | let mut level = 1; |
379 | 0 | // Note that we're doing a byte-level search here for the |
380 | 0 | // close-delimiter of `;)`. The actual source text is utf-8 |
381 | 0 | // encode in `remaining` but due to how utf-8 works we |
382 | 0 | // can safely search for an ASCII byte since it'll never |
383 | 0 | // otherwise appear in the middle of a codepoint and if we |
384 | 0 | // find it then it's guaranteed to be the right byte. |
385 | 0 | // |
386 | 0 | // Mainly we're avoiding the overhead of decoding utf-8 |
387 | 0 | // characters into a Rust `char` since it's otherwise |
388 | 0 | // unnecessary work. |
389 | 0 | let mut iter = remaining[2..].iter(); |
390 | 0 | while let Some(ch) = iter.next() { |
391 | 0 | match ch { |
392 | | b'(' => { |
393 | 0 | if let Some(b';') = iter.as_slice().first() { |
394 | 0 | level += 1; |
395 | 0 | iter.next(); |
396 | 0 | } |
397 | | } |
398 | | b';' => { |
399 | 0 | if let Some(b')') = iter.as_slice().first() { |
400 | 0 | level -= 1; |
401 | 0 | iter.next(); |
402 | 0 | if level == 0 { |
403 | 0 | let len = remaining.len() - iter.as_slice().len(); |
404 | 0 | let comment = &self.input[start..][..len]; |
405 | 0 | *pos += len; |
406 | 0 | self.check_confusing_comment(*pos, comment)?; |
407 | 0 | return Ok(Some(TokenKind::BlockComment)); |
408 | 0 | } |
409 | 0 | } |
410 | | } |
411 | 0 | _ => {} |
412 | | } |
413 | | } |
414 | 0 | Err(self.error(start, LexError::DanglingBlockComment)) |
415 | | } |
416 | | _ => { |
417 | 25.9k | *pos += 1; |
418 | 25.9k | |
419 | 25.9k | Ok(Some(TokenKind::LParen)) |
420 | | } |
421 | | }, |
422 | | |
423 | | b')' => { |
424 | 16.2k | *pos += 1; |
425 | 16.2k | Ok(Some(TokenKind::RParen)) |
426 | | } |
427 | | |
428 | | // https://webassembly.github.io/spec/core/text/lexical.html#white-space |
429 | | b' ' | b'\n' | b'\r' | b'\t' => { |
430 | 59.4k | self.skip_ws(pos); |
431 | 59.4k | Ok(Some(TokenKind::Whitespace)) |
432 | | } |
433 | | |
434 | 54.1k | c @ (idchars!() | b'"') => {
435 | 54.1k | let (kind, src) = self.parse_reserved(pos)?;
436 | 54.1k | match kind { |
437 | | // If the reserved token was simply a single string then |
438 | | // that is converted to a standalone string token |
439 | 3.82k | ReservedKind::String => return Ok(Some(TokenKind::String)), |
440 | | |
441 | | // If only idchars were consumed then this could be a |
442 | | // specific kind of standalone token we're interested in. |
443 | | ReservedKind::Idchars => { |
444 | | // https://webassembly.github.io/spec/core/text/values.html#integers |
445 | 50.3k | if let Some(ret) = self.classify_number(src) {
446 | 5.72k | return Ok(Some(ret)); |
447 | | // https://webassembly.github.io/spec/core/text/values.html#text-id |
448 | 44.6k | } else if *c == b'$' { |
449 | 5.92k | return Ok(Some(TokenKind::Id)); |
450 | | // part of the WebAssembly/annotations proposal |
451 | | // (no online url yet) |
452 | 38.7k | } else if *c == b'@' { |
453 | 0 | return Ok(Some(TokenKind::Annotation)); |
454 | | // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword |
455 | 38.7k | } else if b'a' <= *c && *c <= b'z' { |
456 | 38.7k | return Ok(Some(TokenKind::Keyword)); |
457 | 0 | } |
458 | | } |
459 | | |
460 | 0 | ReservedKind::IdString => return Ok(Some(TokenKind::Id)), |
461 | 0 | ReservedKind::AnnotationString => return Ok(Some(TokenKind::Annotation)), |
462 | | |
463 | | // ... otherwise this was a conglomeration of idchars, |
464 | | // strings, or just idchars that don't match a prior rule, |
465 | | // meaning this falls through to the fallback `Reserved` |
466 | | // token. |
467 | 0 | ReservedKind::Reserved => {} |
468 | | } |
469 | | |
470 | 0 | Ok(Some(TokenKind::Reserved)) |
471 | | } |
472 | | |
473 | | // This could be a line comment, otherwise `;` is a reserved token. |
474 | | // The second byte is checked to see if it's a `;;` line comment |
475 | | // |
476 | | // Note that this character being considered as part of a |
477 | | // `reserved` token is part of the annotations proposal. |
478 | 1.21k | b';' => match remaining.get(1) { |
479 | | Some(b';') => { |
480 | 1.21k | let remaining = &self.input[*pos..]; |
481 | 1.21k | let byte_pos = memchr::memchr2(b'\n', b'\r', remaining.as_bytes()) |
482 | 1.21k | .unwrap_or(remaining.len()); |
483 | 1.21k | *pos += byte_pos; |
484 | 1.21k | let comment = &remaining[..byte_pos]; |
485 | 1.21k | self.check_confusing_comment(*pos, comment)?;
486 | 1.21k | Ok(Some(TokenKind::LineComment)) |
487 | | } |
488 | | _ => { |
489 | 0 | *pos += 1; |
490 | 0 | Ok(Some(TokenKind::Reserved)) |
491 | | } |
492 | | }, |
493 | | |
494 | | // Other known reserved tokens other than `;` |
495 | | // |
496 | | // Note that these characters being considered as part of a |
497 | | // `reserved` token is part of the annotations proposal. |
498 | | b',' | b'[' | b']' | b'{' | b'}' => { |
499 | 0 | *pos += 1; |
500 | 0 | Ok(Some(TokenKind::Reserved)) |
501 | | } |
502 | | |
503 | | _ => { |
504 | 0 | let ch = self.input[start..].chars().next().unwrap(); |
505 | 0 | Err(self.error(*pos, LexError::Unexpected(ch))) |
506 | | } |
507 | | } |
508 | 158k | } |
509 | | |
510 | 59.4k | fn skip_ws(&self, pos: &mut usize) { |
511 | | // This table is a byte lookup table to determine whether a byte is a |
512 | | // whitespace byte. There are only 4 whitespace bytes for the `*.wat` |
513 | | // format right now which are ' ', '\t', '\r', and '\n'. These 4 bytes |
514 | | // have a '1' in the table below. |
515 | | // |
516 | | // Due to how utf-8 works (our input is guaranteed to be utf-8) it is |
517 | | // known that if these bytes are found they're guaranteed to be the |
518 | | // whitespace byte, so they can be safely skipped and we don't have to |
519 | | // do full utf-8 decoding. This means that the goal of this function is |
520 | | // to find the first non-whitespace byte in `remaining`. |
521 | | // |
522 | | // For now this lookup table seems to be the fastest, but projects like |
523 | | // https://github.com/lemire/despacer show other simd algorithms which |
524 | | // can possibly accelerate this even more. Note that `*.wat` files often |
525 | | // have a lot of whitespace so this function is typically quite hot when |
526 | | // parsing inputs. |
527 | | #[rustfmt::skip] |
528 | | const WS: [u8; 256] = [ |
529 | | // \t \n \r |
530 | | /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, |
531 | | /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
532 | | // ' ' |
533 | | /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
534 | | /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
535 | | /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
536 | | /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
537 | | /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
538 | | /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
539 | | /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
540 | | /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
541 | | /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
542 | | /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
543 | | /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
544 | | /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
545 | | /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
546 | | /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
547 | | ]; |
548 | 59.4k | let remaining = &self.input[*pos..]; |
549 | 59.4k | let non_ws_pos = remaining |
550 | 59.4k | .as_bytes() |
551 | 59.4k | .iter() |
552 | 163k | .position(|b| WS[*b as usize] != 1) |
553 | 59.4k | .unwrap_or(remaining.len()); |
554 | 59.4k | *pos += non_ws_pos; |
555 | 59.4k | } |
556 | | |
557 | | /// Splits off a "reserved" token which is then further processed later on |
558 | | /// to figure out which kind of token it is `depending on `ReservedKind`. |
559 | | /// |
560 | | /// For more information on this method see the clarification at |
561 | | /// <https://github.com/WebAssembly/spec/pull/1499> but the general gist is |
562 | | /// that this is parsing the grammar: |
563 | | /// |
564 | | /// ```text |
565 | | /// reserved := (idchar | string)+ |
566 | | /// ``` |
567 | | /// |
568 | | /// which means that it is eating any number of adjacent string/idchar |
569 | | /// tokens (e.g. `a"b"c`) and returning the classification of what was |
570 | | /// eaten. The classification assists in determining what the actual token |
571 | | /// here eaten looks like. |
572 | 54.1k | fn parse_reserved(&self, pos: &mut usize) -> Result<(ReservedKind, &'a str), Error> { |
573 | 54.1k | let mut idchars = 0u32; |
574 | 54.1k | let mut strings = 0u32; |
575 | 54.1k | let start = *pos; |
576 | 296k | while let Some(byte) = self.input.as_bytes().get(*pos) { |
577 | 296k | match byte { |
578 | 296k | // Normal `idchars` production which appends to the reserved |
579 | 296k | // token that's being produced. |
580 | 296k | idchars!() => { |
581 | 238k | idchars += 1; |
582 | 238k | *pos += 1; |
583 | 238k | } |
584 | | |
585 | | // https://webassembly.github.io/spec/core/text/values.html#text-string |
586 | | b'"' => { |
587 | 3.82k | strings += 1; |
588 | 3.82k | *pos += 1; |
589 | 3.82k | let mut it = self.input[*pos..].chars(); |
590 | 3.82k | let result = Lexer::parse_str(&mut it, self.allow_confusing_unicode); |
591 | 3.82k | *pos = self.input.len() - it.as_str().len(); |
592 | 3.82k | match result { |
593 | 3.82k | Ok(_) => {} |
594 | 0 | Err(e) => { |
595 | 0 | let err_pos = match &e { |
596 | 0 | LexError::UnexpectedEof => self.input.len(), |
597 | 0 | _ => self.input[..*pos].char_indices().next_back().unwrap().0, |
598 | | }; |
599 | 0 | return Err(self.error(err_pos, e)); |
600 | | } |
601 | | } |
602 | | } |
603 | | |
604 | | // Nothing else is considered part of a reserved token |
605 | 54.1k | _ => break, |
606 | | } |
607 | | } |
608 | 54.1k | let ret = &self.input[start..*pos]; |
609 | 54.1k | Ok(match (idchars, strings) { |
610 | 0 | (0, 0) => unreachable!(), |
611 | 3.82k | (0, 1) => (ReservedKind::String, ret), |
612 | 50.3k | (_, 0) => (ReservedKind::Idchars, ret), |
613 | | // Pattern match `@"..."` and `$"..."` for string-based |
614 | | // identifiers and annotations. |
615 | 0 | (1, 1) if ret.starts_with("$") => (ReservedKind::IdString, ret), |
616 | 0 | (1, 1) if ret.starts_with("@") => (ReservedKind::AnnotationString, ret), |
617 | 0 | _ => (ReservedKind::Reserved, ret), |
618 | | }) |
619 | 54.1k | } |
620 | | |
621 | 50.3k | fn classify_number(&self, src: &str) -> Option<TokenKind> { |
622 | 50.3k | let (sign, num) = if let Some(stripped) = src.strip_prefix('+') {
623 | 0 | (Some(SignToken::Plus), stripped) |
624 | 50.3k | } else if let Some(stripped) = src.strip_prefix('-') {
625 | 3 | (Some(SignToken::Minus), stripped) |
626 | | } else { |
627 | 50.3k | (None, src) |
628 | | }; |
629 | | |
630 | 50.3k | let negative = sign == Some(SignToken::Minus); |
631 | 50.3k | |
632 | 50.3k | // Handle `inf` and `nan` which are special numbers here |
633 | 50.3k | if num == "inf" { |
634 | 0 | return Some(TokenKind::Float(FloatKind::Inf { negative })); |
635 | 50.3k | } else if num == "nan" { |
636 | 0 | return Some(TokenKind::Float(FloatKind::Nan { negative })); |
637 | 50.3k | } else if let Some(stripped) = num.strip_prefix("nan:0x") {
638 | 0 | let mut it = stripped.as_bytes().iter(); |
639 | 0 | let has_underscores = skip_underscores(&mut it, |x| char::from(x).is_ascii_hexdigit())?; |
640 | 0 | if it.next().is_some() { |
641 | 0 | return None; |
642 | 0 | } |
643 | 0 | return Some(TokenKind::Float(FloatKind::NanVal { |
644 | 0 | negative, |
645 | 0 | has_underscores, |
646 | 0 | })); |
647 | 50.3k | } |
648 | | |
649 | | // Figure out if we're a hex number or not |
650 | | let test_valid: fn(u8) -> bool; |
651 | 50.3k | let (mut it, hex) = if let Some(stripped) = num.strip_prefix("0x") {
652 | 105 | test_valid = |x: u8| char::from(x).is_ascii_hexdigit();
653 | 15 | (stripped.as_bytes().iter(), true)
654 | | } else {
655 | 53.2k | test_valid = |x: u8| char::from(x).is_ascii_digit();
656 | 50.3k | (num.as_bytes().iter(), false) |
657 | | }; |
658 | | |
659 | | // Evaluate the first part, moving out all underscores |
660 | 50.3k | let mut has_underscores = skip_underscores(&mut it, test_valid)?;
661 | | |
662 | 5.72k | match it.clone().next() { |
663 | | // If we're followed by something this may be a float so keep going. |
664 | 188 | Some(_) => {} |
665 | | |
666 | | // Otherwise this is a valid integer literal! |
667 | | None => { |
668 | 5.53k | return Some(TokenKind::Integer(IntegerKind { |
669 | 5.53k | has_underscores, |
670 | 5.53k | sign, |
671 | 5.53k | hex, |
672 | 5.53k | })) |
673 | | } |
674 | | } |
675 | | |
676 | | // A number can optionally be after the dot so only actually try to |
677 | | // parse one if it's there. |
678 | 188 | if it.clone().next() == Some(&b'.') { |
679 | 188 | it.next(); |
680 | 188 | match it.clone().next() { |
681 | 188 | Some(c) if test_valid(*c) => { |
682 | 188 | if skip_underscores(&mut it, test_valid)? {
683 | 0 | has_underscores = true; |
684 | 188 | } |
685 | | } |
686 | 0 | Some(_) | None => {} |
687 | | } |
688 | 0 | }; |
689 | | |
690 | | // Figure out if there's an exponential part here to make a float, and |
691 | | // if so parse it but defer its actual calculation until later. |
692 | 188 | match (hex, it.next()) { |
693 | | (true, Some(b'p')) | (true, Some(b'P')) | (false, Some(b'e')) | (false, Some(b'E')) => { |
694 | 0 | match it.clone().next() { |
695 | 0 | Some(b'-') => { |
696 | 0 | it.next(); |
697 | 0 | } |
698 | 0 | Some(b'+') => { |
699 | 0 | it.next(); |
700 | 0 | } |
701 | 0 | _ => {} |
702 | | } |
703 | 0 | if skip_underscores(&mut it, |x| char::from(x).is_ascii_digit())? { |
704 | 0 | has_underscores = true; |
705 | 0 | } |
706 | | } |
707 | 188 | (_, None) => {} |
708 | 0 | _ => return None, |
709 | | } |
710 | | |
711 | | // We should have eaten everything by now, if not then this is surely |
712 | | // not a float or integer literal. |
713 | 188 | if it.next().is_some() { |
714 | 0 | return None; |
715 | 188 | } |
716 | 188 | |
717 | 188 | return Some(TokenKind::Float(FloatKind::Normal { |
718 | 188 | has_underscores, |
719 | 188 | hex, |
720 | 188 | })); |
721 | | |
722 | 50.5k | fn skip_underscores(it: &mut slice::Iter<'_, u8>, good: fn(u8) -> bool) -> Option<bool> { |
723 | 50.5k | let mut last_underscore = false; |
724 | 50.5k | let mut has_underscores = false; |
725 | 50.5k | let first = *it.next()?;
726 | 50.5k | if !good(first) { |
727 | 44.6k | return None; |
728 | 5.90k | } |
729 | 8.38k | while let Some(c) = it.clone().next() {
730 | 2.66k | if *c == b'_' && !last_underscore {
731 | 9 | has_underscores = true; |
732 | 9 | it.next(); |
733 | 9 | last_underscore = true; |
734 | 9 | continue; |
735 | 2.65k | } |
736 | 2.65k | if !good(*c) { |
737 | 188 | break; |
738 | 2.46k | } |
739 | 2.46k | last_underscore = false; |
740 | 2.46k | it.next(); |
741 | | } |
742 | 5.90k | if last_underscore { |
743 | 0 | return None; |
744 | 5.90k | } |
745 | 5.90k | Some(has_underscores) |
746 | 50.5k | } |
747 | 50.3k | } |
748 | | |
749 | | /// Verifies that `comment`, which is about to be returned, has a "confusing |
750 | | /// unicode character" in it and should instead be transformed into an |
751 | | /// error. |
752 | 1.21k | fn check_confusing_comment(&self, end: usize, comment: &str) -> Result<(), Error> { |
753 | 1.21k | if self.allow_confusing_unicode { |
754 | 0 | return Ok(()); |
755 | 1.21k | } |
756 | 1.21k | |
757 | 1.21k | // In an effort to avoid utf-8 decoding the entire `comment` the search |
758 | 1.21k | // here is a bit more optimized. This checks for the `0xe2` byte because |
759 | 1.21k | // in the utf-8 encoding that's the leading encoding byte for all |
760 | 1.21k | // "confusing characters". Each instance of 0xe2 is checked to see if it |
761 | 1.21k | // starts a confusing character, and if so that's returned. |
762 | 1.21k | // |
763 | 1.21k | // Also note that 0xe2 will never be found in the middle of a codepoint, |
764 | 1.21k | // it's always the start of a codepoint. This means that if our special |
765 | 1.21k | // characters show up they're guaranteed to start with 0xe2 bytes. |
766 | 1.21k | let bytes = comment.as_bytes(); |
767 | 1.21k | for pos in memchr::Memchr::new(0xe2, bytes) {
768 | 0 | if let Some(c) = comment[pos..].chars().next() { |
769 | 0 | if is_confusing_unicode(c) { |
770 | | // Note that `self.cur()` accounts for already having |
771 | | // parsed `comment`, so we move backwards to where |
772 | | // `comment` started and then add the index within |
773 | | // `comment`. |
774 | 0 | let pos = end - comment.len() + pos; |
775 | 0 | return Err(self.error(pos, LexError::ConfusingUnicode(c))); |
776 | 0 | } |
777 | 0 | } |
778 | | } |
779 | | |
780 | 1.21k | Ok(()) |
781 | 1.21k | } |
782 | | |
783 | 7.37k | fn parse_str( |
784 | 7.37k | it: &mut str::Chars<'a>, |
785 | 7.37k | allow_confusing_unicode: bool, |
786 | 7.37k | ) -> Result<Cow<'a, [u8]>, LexError> { |
787 | | enum State { |
788 | | Start, |
789 | | String(Vec<u8>), |
790 | | } |
791 | 7.37k | let orig = it.as_str(); |
792 | 7.37k | let mut state = State::Start; |
793 | | loop { |
794 | 67.2k | match it.next().ok_or(LexError::UnexpectedEof)? {
795 | 7.37k | '"' => break, |
796 | | '\\' => { |
797 | 1.19k | match state { |
798 | 952 | State::String(_) => {} |
799 | 246 | State::Start => { |
800 | 246 | let pos = orig.len() - it.as_str().len() - 1; |
801 | 246 | state = State::String(orig[..pos].as_bytes().to_vec()); |
802 | 246 | } |
803 | | } |
804 | 1.19k | let buf = match &mut state { |
805 | 1.19k | State::String(b) => b, |
806 | 0 | State::Start => unreachable!(), |
807 | | }; |
808 | 1.19k | match it.next().ok_or(LexError::UnexpectedEof)? {
809 | 0 | '"' => buf.push(b'"'), |
810 | 0 | '\'' => buf.push(b'\''), |
811 | 0 | 't' => buf.push(b'\t'), |
812 | 0 | 'n' => buf.push(b'\n'), |
813 | 0 | 'r' => buf.push(b'\r'), |
814 | 0 | '\\' => buf.push(b'\\'), |
815 | | 'u' => { |
816 | 0 | Lexer::must_eat_char(it, '{')?; |
817 | 0 | let n = Lexer::hexnum(it)?; |
818 | 0 | let c = char::from_u32(n).ok_or(LexError::InvalidUnicodeValue(n))?; |
819 | 0 | buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()); |
820 | 0 | Lexer::must_eat_char(it, '}')?; |
821 | | } |
822 | 1.19k | c1 if c1.is_ascii_hexdigit() => { |
823 | 1.19k | let c2 = Lexer::hexdigit(it)?0 ; |
824 | 1.19k | buf.push(to_hex(c1) * 16 + c2); |
825 | | } |
826 | 0 | c => return Err(LexError::InvalidStringEscape(c)), |
827 | | } |
828 | | } |
829 | 58.7k | c0 if (c as u32) < 0x20 || c as u32 == 0x7f => { |
830 | 0 | return Err(LexError::InvalidStringElement(c)) |
831 | | } |
832 | 58.7k | c0 if !allow_confusing_unicode && is_confusing_unicode(c29.8k ) => { |
833 | 0 | return Err(LexError::ConfusingUnicode(c)) |
834 | | } |
835 | 58.7k | c => match &mut state { |
836 | 58.7k | State::Start => {} |
837 | 2 | State::String(v) => { |
838 | 2 | v.extend(c.encode_utf8(&mut [0; 4]).as_bytes()); |
839 | 2 | } |
840 | | }, |
841 | | } |
842 | | } |
843 | 7.37k | match state { |
844 | 7.13k | State::Start => Ok(orig[..orig.len() - it.as_str().len() - 1].as_bytes().into()), |
845 | 246 | State::String(s) => Ok(s.into()), |
846 | | } |
847 | 7.37k | } |
848 | | |
849 | | /// Parses an id-or-string-based name from `it`. |
850 | | /// |
851 | | /// Note that `it` should already have been lexed and this is just |
852 | | /// extracting the value. If the token lexed was `@a` then this should point |
853 | | /// to `a`. |
854 | | /// |
855 | | /// This will automatically detect quoted syntax such as `@"..."` and the |
856 | | /// byte string will be parsed and validated as utf-8. |
857 | | /// |
858 | | /// # Errors |
859 | | /// |
860 | | /// Returns an error if a quoted byte string is found and contains invalid |
861 | | /// utf-8. |
862 | 1.60k | fn parse_name(it: &mut str::Chars<'a>) -> Result<Cow<'a, str>, LexError> { |
863 | 1.60k | if it.clone().next() == Some('"') { |
864 | 0 | it.next(); |
865 | 0 | match Lexer::parse_str(it, true)? { |
866 | 0 | Cow::Borrowed(bytes) => match std::str::from_utf8(bytes) { |
867 | 0 | Ok(s) => Ok(Cow::Borrowed(s)), |
868 | 0 | Err(e) => Err(LexError::InvalidUtf8Id(e)), |
869 | | }, |
870 | 0 | Cow::Owned(bytes) => match String::from_utf8(bytes) { |
871 | 0 | Ok(s) => Ok(Cow::Owned(s)), |
872 | 0 | Err(e) => Err(LexError::InvalidUtf8Id(e.utf8_error())), |
873 | | }, |
874 | | } |
875 | | } else { |
876 | 1.60k | Ok(Cow::Borrowed(it.as_str())) |
877 | | } |
878 | 1.60k | } |
879 | | |
880 | 0 | fn hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError> { |
881 | 0 | let n = Lexer::hexdigit(it)?; |
882 | 0 | let mut last_underscore = false; |
883 | 0 | let mut n = n as u32; |
884 | 0 | while let Some(c) = it.clone().next() { |
885 | 0 | if c == '_' { |
886 | 0 | it.next(); |
887 | 0 | last_underscore = true; |
888 | 0 | continue; |
889 | 0 | } |
890 | 0 | if !c.is_ascii_hexdigit() { |
891 | 0 | break; |
892 | 0 | } |
893 | 0 | last_underscore = false; |
894 | 0 | it.next(); |
895 | 0 | n = n |
896 | 0 | .checked_mul(16) |
897 | 0 | .and_then(|n| n.checked_add(to_hex(c) as u32)) |
898 | 0 | .ok_or(LexError::NumberTooBig)?; |
899 | | } |
900 | 0 | if last_underscore { |
901 | 0 | return Err(LexError::LoneUnderscore); |
902 | 0 | } |
903 | 0 | Ok(n) |
904 | 0 | } |
905 | | |
906 | | /// Reads a hexidecimal digit from the input stream, returning where it's |
907 | | /// defined and the hex value. Returns an error on EOF or an invalid hex |
908 | | /// digit. |
909 | 1.19k | fn hexdigit(it: &mut str::Chars<'_>) -> Result<u8, LexError> { |
910 | 1.19k | let ch = Lexer::must_char(it)?0 ; |
911 | 1.19k | if ch.is_ascii_hexdigit() { |
912 | 1.19k | Ok(to_hex(ch)) |
913 | | } else { |
914 | 0 | Err(LexError::InvalidHexDigit(ch)) |
915 | | } |
916 | 1.19k | } |
917 | | |
918 | | /// Reads the next character from the input string and where it's located, |
919 | | /// returning an error if the input stream is empty. |
920 | 1.19k | fn must_char(it: &mut str::Chars<'_>) -> Result<char, LexError> { |
921 | 1.19k | it.next().ok_or(LexError::UnexpectedEof) |
922 | 1.19k | } |
923 | | |
924 | | /// Expects that a specific character must be read next |
925 | 0 | fn must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError> { |
926 | 0 | let found = Lexer::must_char(it)?; |
927 | 0 | if wanted == found { |
928 | 0 | Ok(()) |
929 | | } else { |
930 | 0 | Err(LexError::Expected { wanted, found }) |
931 | | } |
932 | 0 | } |
933 | | |
934 | | /// Creates an error at `pos` with the specified `kind` |
935 | 0 | fn error(&self, pos: usize, kind: LexError) -> Error { |
936 | 0 | Error::lex(Span { offset: pos }, self.input, kind) |
937 | 0 | } |
938 | | |
939 | | /// Returns an iterator over all tokens in the original source string |
940 | | /// starting at the `pos` specified. |
941 | 495 | pub fn iter(&self, mut pos: usize) -> impl Iterator<Item = Result<Token, Error>> + '_ { |
942 | 960 | std::iter::from_fn(move || self.parse(&mut pos).transpose()) |
943 | 495 | } |
944 | | |
945 | | /// Returns whether an annotation is present at `pos`. If it is present then |
946 | | /// `Ok(Some(token))` is returned corresponding to the token, otherwise |
947 | | /// `Ok(None)` is returned. If the next token cannot be parsed then an error |
948 | | /// is returned. |
949 | 25.4k | pub fn annotation(&self, mut pos: usize) -> Result<Option<Token>, Error> { |
950 | 25.4k | let bytes = self.input.as_bytes(); |
951 | 25.4k | // Quickly reject anything that for sure isn't an annotation since this |
952 | 25.4k | // method is used every time an lparen is parsed. |
953 | 25.4k | if bytes.get(pos) != Some(&b'@') { |
954 | 25.4k | return Ok(None); |
955 | 0 | } |
956 | 0 | match self.parse(&mut pos)? { |
957 | 0 | Some(token) => match token.kind { |
958 | 0 | TokenKind::Annotation => Ok(Some(token)), |
959 | 0 | _ => Ok(None), |
960 | | }, |
961 | 0 | None => Ok(None), |
962 | | } |
963 | 25.4k | } |
964 | | } |
965 | | |
impl Token {
    /// Returns the original source text for this token.
    pub fn src<'a>(&self, s: &'a str) -> &'a str {
        // `len` is stored in a narrower integer type than `usize`; widen it
        // here to slice. The unwrap cannot fail when widening.
        &s[self.offset..][..self.len.try_into().unwrap()]
    }

    /// Returns the identifier, without the leading `$` symbol, that this token
    /// represents.
    ///
    /// Note that this method returns the contents of the identifier. With a
    /// string-based identifier this means that escapes have been resolved to
    /// their string-based equivalent.
    ///
    /// Should only be used with `TokenKind::Id`.
    ///
    /// # Errors
    ///
    /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
    /// which is invalid utf-8.
    pub fn id<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
        let mut ch = self.src(s).chars();
        // The lexer guarantees ids start with `$`; skip it.
        let dollar = ch.next();
        debug_assert_eq!(dollar, Some('$'));
        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
        if id.is_empty() {
            return Err(self.error(s, LexError::EmptyId));
        }
        Ok(id)
    }

    /// Returns the annotation, without the leading `@` symbol, that this token
    /// represents.
    ///
    /// Note that this method returns the contents of the identifier. With a
    /// string-based identifier this means that escapes have been resolved to
    /// their string-based equivalent.
    ///
    /// Should only be used with `TokenKind::Annotation`.
    ///
    /// # Errors
    ///
    /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
    /// which is invalid utf-8.
    pub fn annotation<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
        let mut ch = self.src(s).chars();
        // The lexer guarantees annotations start with `@`; skip it.
        let at = ch.next();
        debug_assert_eq!(at, Some('@'));
        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
        if id.is_empty() {
            return Err(self.error(s, LexError::EmptyAnnotation));
        }
        Ok(id)
    }

    /// Returns the keyword this token represents.
    ///
    /// Should only be used with [`TokenKind::Keyword`].
    pub fn keyword<'a>(&self, s: &'a str) -> &'a str {
        self.src(s)
    }

    /// Returns the reserved string this token represents.
    ///
    /// Should only be used with [`TokenKind::Reserved`].
    pub fn reserved<'a>(&self, s: &'a str) -> &'a str {
        self.src(s)
    }

    /// Returns the parsed string that this token represents.
    ///
    /// This returns either a raw byte slice into the source if that's possible
    /// or an owned representation to handle escaped characters and such.
    ///
    /// Should only be used with [`TokenKind::String`].
    pub fn string<'a>(&self, s: &'a str) -> Cow<'a, [u8]> {
        let mut ch = self.src(s).chars();
        // Skip the opening quote; the token already lexed successfully so
        // re-parsing the string contents cannot fail here.
        ch.next().unwrap();
        Lexer::parse_str(&mut ch, true).unwrap()
    }

    /// Returns the decomposed float token that this represents.
    ///
    /// This will slice up the float token into its component parts and return a
    /// description of the float token in the source.
    ///
    /// Should only be used with [`TokenKind::Float`].
    pub fn float<'a>(&self, s: &'a str, kind: FloatKind) -> Float<'a> {
        match kind {
            FloatKind::Inf { negative } => Float::Inf { negative },
            FloatKind::Nan { negative } => Float::Nan {
                val: None,
                negative,
            },
            FloatKind::NanVal {
                negative,
                has_underscores,
            } => {
                let src = self.src(s);
                // Skip a leading `+`/`-` sign if present; an unsigned token
                // starts directly with `nan`.
                let src = if src.starts_with("n") { src } else { &src[1..] };
                let mut val = Cow::Borrowed(src.strip_prefix("nan:0x").unwrap());
                if has_underscores {
                    *val.to_mut() = val.replace("_", "");
                }
                Float::Nan {
                    val: Some(val),
                    negative,
                }
            }
            FloatKind::Normal {
                has_underscores,
                hex,
            } => {
                let src = self.src(s);
                // Split the token into integral / fractional / exponent
                // pieces. The exponent marker is `p`/`P` for hex floats and
                // `e`/`E` for decimal floats.
                let (integral, fractional, exponent) = match src.find('.') {
                    Some(i) => {
                        let integral = &src[..i];
                        let rest = &src[i + 1..];
                        let exponent = if hex {
                            rest.find('p').or_else(|| rest.find('P'))
                        } else {
                            rest.find('e').or_else(|| rest.find('E'))
                        };
                        match exponent {
                            Some(i) => (integral, Some(&rest[..i]), Some(&rest[i + 1..])),
                            None => (integral, Some(rest), None),
                        }
                    }
                    None => {
                        let exponent = if hex {
                            src.find('p').or_else(|| src.find('P'))
                        } else {
                            src.find('e').or_else(|| src.find('E'))
                        };
                        match exponent {
                            Some(i) => (&src[..i], None, Some(&src[i + 1..])),
                            None => (src, None, None),
                        }
                    }
                };
                // Normalize: drop redundant `+` signs, collapse `1.` style
                // empty fractions to `None`, strip `_` separators and the
                // `0x` prefix so downstream parsing sees bare digits.
                let mut integral = Cow::Borrowed(integral.strip_prefix('+').unwrap_or(integral));
                let mut fractional = fractional.and_then(|s| {
                    if s.is_empty() {
                        None
                    } else {
                        Some(Cow::Borrowed(s))
                    }
                });
                let mut exponent =
                    exponent.map(|s| Cow::Borrowed(s.strip_prefix('+').unwrap_or(s)));
                if has_underscores {
                    *integral.to_mut() = integral.replace("_", "");
                    if let Some(fractional) = &mut fractional {
                        *fractional.to_mut() = fractional.replace("_", "");
                    }
                    if let Some(exponent) = &mut exponent {
                        *exponent.to_mut() = exponent.replace("_", "");
                    }
                }
                if hex {
                    *integral.to_mut() = integral.replace("0x", "");
                }
                Float::Val {
                    hex,
                    integral,
                    fractional,
                    exponent,
                }
            }
        }
    }

    /// Returns the decomposed integer token that this represents.
    ///
    /// This will slice up the integer token into its component parts and
    /// return a description of the integer token in the source.
    ///
    /// Should only be used with [`TokenKind::Integer`].
    pub fn integer<'a>(&self, s: &'a str, kind: IntegerKind) -> Integer<'a> {
        let src = self.src(s);
        // A `-` sign is kept in the value string (parsers accept it); a `+`
        // sign is redundant and stripped.
        let val = match kind.sign {
            Some(SignToken::Plus) => src.strip_prefix('+').unwrap(),
            Some(SignToken::Minus) => src,
            None => src,
        };
        let mut val = Cow::Borrowed(val);
        if kind.has_underscores {
            *val.to_mut() = val.replace("_", "");
        }
        if kind.hex {
            *val.to_mut() = val.replace("0x", "");
        }
        Integer {
            sign: kind.sign,
            hex: kind.hex,
            val,
        }
    }

    /// Creates a lexing error `err` located at this token's offset in `src`.
    fn error(&self, src: &str, err: LexError) -> Error {
        Error::lex(
            Span {
                offset: self.offset,
            },
            src,
            err,
        )
    }
}
1174 | | |
1175 | | impl<'a> Integer<'a> { |
1176 | | /// Returns the sign token for this integer. |
1177 | 0 | pub fn sign(&self) -> Option<SignToken> { |
1178 | 0 | self.sign |
1179 | 0 | } |
1180 | | |
1181 | | /// Returns the value string that can be parsed for this integer, as well |
1182 | | /// as the base that it should be parsed in |
1183 | 2.54k | pub fn val(&self) -> (&str, u32) { |
1184 | 2.54k | (&self.val, if self.hex { 169 } else { 102.53k }) |
1185 | 2.54k | } |
1186 | | } |
1187 | | |
/// Converts a single ASCII hex digit to its numeric value (0..=15).
///
/// Callers guarantee `ch` satisfies `is_ascii_hexdigit`; other inputs are
/// not meaningful here.
fn to_hex(ch: char) -> u8 {
    match ch {
        'A'..='F' => (ch as u8) - b'A' + 10,
        'a'..='f' => (ch as u8) - b'a' + 10,
        _ => (ch as u8) - b'0',
    }
}
1195 | | |
1196 | | impl fmt::Display for LexError { |
1197 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
1198 | | use LexError::*; |
1199 | 0 | match self { |
1200 | 0 | DanglingBlockComment => f.write_str("unterminated block comment")?, |
1201 | 0 | Unexpected(c) => write!(f, "unexpected character '{}'", escape_char(*c))?, |
1202 | 0 | InvalidStringElement(c) => { |
1203 | 0 | write!(f, "invalid character in string '{}'", escape_char(*c))? |
1204 | | } |
1205 | 0 | InvalidStringEscape(c) => write!(f, "invalid string escape '{}'", escape_char(*c))?, |
1206 | 0 | InvalidHexDigit(c) => write!(f, "invalid hex digit '{}'", escape_char(*c))?, |
1207 | 0 | InvalidDigit(c) => write!(f, "invalid decimal digit '{}'", escape_char(*c))?, |
1208 | 0 | Expected { wanted, found } => write!( |
1209 | 0 | f, |
1210 | 0 | "expected '{}' but found '{}'", |
1211 | 0 | escape_char(*wanted), |
1212 | 0 | escape_char(*found) |
1213 | 0 | )?, |
1214 | 0 | UnexpectedEof => write!(f, "unexpected end-of-file")?, |
1215 | 0 | NumberTooBig => f.write_str("number is too big to parse")?, |
1216 | 0 | InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x{c:x}")?, |
1217 | 0 | LoneUnderscore => write!(f, "bare underscore in numeric literal")?, |
1218 | 0 | ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {c:?}")?, |
1219 | 0 | InvalidUtf8Id(_) => write!(f, "malformed UTF-8 encoding of string-based id")?, |
1220 | 0 | EmptyId => write!(f, "empty identifier")?, |
1221 | 0 | EmptyAnnotation => write!(f, "empty annotation id")?, |
1222 | | } |
1223 | 0 | Ok(()) |
1224 | 0 | } |
1225 | | } |
1226 | | |
/// Renders a character for inclusion in an error message: well-known escapes
/// get their backslash form, printable ASCII passes through, and everything
/// else is shown as a `\u{...}` escape.
fn escape_char(c: char) -> String {
    match c {
        '\t' => "\\t".to_string(),
        '\r' => "\\r".to_string(),
        '\n' => "\\n".to_string(),
        '\\' => "\\\\".to_string(),
        '\'' => "\\\'".to_string(),
        '\"' => "\"".to_string(),
        '\x20'..='\x7e' => c.to_string(),
        _ => c.escape_unicode().to_string(),
    }
}
1239 | | |
/// This is an attempt to protect agains the "trojan source" [1] problem where
/// unicode characters can cause editors to render source code differently
/// for humans than the compiler itself sees.
///
/// To mitigate this issue, and because it's relatively rare in practice,
/// this simply rejects characters of that form.
///
/// [1]: https://www.trojansource.codes/
fn is_confusing_unicode(ch: char) -> bool {
    // Bidirectional-override and related control codepoints that can reorder
    // how source text is displayed.
    const CONFUSING: [char; 9] = [
        '\u{202a}',
        '\u{202b}',
        '\u{202d}',
        '\u{202e}',
        '\u{2066}',
        '\u{2067}',
        '\u{2068}',
        '\u{206c}',
        '\u{2069}',
    ];
    CONFUSING.contains(&ch)
}
1262 | | |
#[cfg(test)]
mod tests {
    // Smoke tests for the lexer: each helper lexes the first token of the
    // given input, asserts its kind, and extracts its payload.
    use super::*;

    #[test]
    fn ws_smoke() {
        fn get_whitespace(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Whitespace => token.src(input),
                other => panic!("unexpected {other:?}"),
            }
        }
        assert_eq!(get_whitespace(" "), " ");
        assert_eq!(get_whitespace(" "), " ");
        assert_eq!(get_whitespace(" \n "), " \n ");
        assert_eq!(get_whitespace(" x"), " ");
        assert_eq!(get_whitespace(" ;"), " ");
    }

    #[test]
    fn line_comment_smoke() {
        fn get_line_comment(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::LineComment => token.src(input),
                other => panic!("unexpected {other:?}"),
            }
        }
        // Line comments run up to, but do not include, the line terminator.
        assert_eq!(get_line_comment(";;"), ";;");
        assert_eq!(get_line_comment(";; xyz"), ";; xyz");
        assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz");
        assert_eq!(get_line_comment(";;\nabc"), ";;");
        assert_eq!(get_line_comment(";; \nabc"), ";; ");
        assert_eq!(get_line_comment(";; \rabc"), ";; ");
        assert_eq!(get_line_comment(";; \r\nabc"), ";; ");
    }

    #[test]
    fn block_comment_smoke() {
        fn get_block_comment(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::BlockComment => token.src(input),
                other => panic!("unexpected {other:?}"),
            }
        }
        // Block comments nest, so `(; (;;) ;)` is one token.
        assert_eq!(get_block_comment("(;;)"), "(;;)");
        assert_eq!(get_block_comment("(; ;)"), "(; ;)");
        assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)");
    }

    // Lexes and returns the first token of `input`, panicking if the input
    // is empty or fails to lex.
    fn get_token(input: &str) -> Token {
        Lexer::new(input)
            .parse(&mut 0)
            .expect("no first token")
            .expect("no token")
    }

    #[test]
    fn lparen() {
        assert_eq!(get_token("((").kind, TokenKind::LParen);
    }

    #[test]
    fn rparen() {
        assert_eq!(get_token(")(").kind, TokenKind::RParen);
    }

    #[test]
    fn strings() {
        fn get_string(input: &str) -> Vec<u8> {
            let token = get_token(input);
            match token.kind {
                TokenKind::String => token.string(input).to_vec(),
                other => panic!("not keyword {other:?}"),
            }
        }
        assert_eq!(&*get_string("\"\""), b"");
        assert_eq!(&*get_string("\"a\""), b"a");
        assert_eq!(&*get_string("\"a b c d\""), b"a b c d");
        assert_eq!(&*get_string("\"\\\"\""), b"\"");
        assert_eq!(&*get_string("\"\\'\""), b"'");
        assert_eq!(&*get_string("\"\\n\""), b"\n");
        assert_eq!(&*get_string("\"\\t\""), b"\t");
        assert_eq!(&*get_string("\"\\r\""), b"\r");
        assert_eq!(&*get_string("\"\\\\\""), b"\\");
        assert_eq!(&*get_string("\"\\01\""), &[1]);
        assert_eq!(&*get_string("\"\\u{1}\""), &[1]);
        assert_eq!(
            &*get_string("\"\\u{0f3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );
        // Underscore separators are allowed inside `\u{...}` values.
        assert_eq!(
            &*get_string("\"\\u{0_f_3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );

        // Every possible `\XY` byte escape round-trips to its raw byte.
        for i in 0..=255i32 {
            let s = format!("\"\\{i:02x}\"");
            assert_eq!(&*get_string(&s), &[i as u8]);
        }
    }

    #[test]
    fn id() {
        fn get_id(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Id => token.id(input).unwrap().to_string(),
                other => panic!("not id {other:?}"),
            }
        }
        assert_eq!(get_id("$x"), "x");
        assert_eq!(get_id("$xyz"), "xyz");
        assert_eq!(get_id("$x_z"), "x_z");
        assert_eq!(get_id("$0^"), "0^");
        assert_eq!(get_id("$0^;;"), "0^");
        assert_eq!(get_id("$0^ ;;"), "0^");
        // Quoted (string-based) id syntax.
        assert_eq!(get_id("$\"x\" ;;"), "x");
    }

    #[test]
    fn annotation() {
        fn get_annotation(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Annotation => token.annotation(input).unwrap().to_string(),
                other => panic!("not annotation {other:?}"),
            }
        }
        assert_eq!(get_annotation("@foo"), "foo");
        assert_eq!(get_annotation("@foo "), "foo");
        assert_eq!(get_annotation("@f "), "f");
        assert_eq!(get_annotation("@\"x\" "), "x");
        assert_eq!(get_annotation("@0 "), "0");
    }

    #[test]
    fn keyword() {
        fn get_keyword(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Keyword => token.keyword(input),
                other => panic!("not keyword {other:?}"),
            }
        }
        assert_eq!(get_keyword("x"), "x");
        assert_eq!(get_keyword("xyz"), "xyz");
        assert_eq!(get_keyword("x_z"), "x_z");
        assert_eq!(get_keyword("x_z "), "x_z");
        assert_eq!(get_keyword("x_z "), "x_z");
    }

    #[test]
    fn reserved() {
        fn get_reserved(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Reserved => token.reserved(input),
                other => panic!("not reserved {other:?}"),
            }
        }
        assert_eq!(get_reserved("^_x "), "^_x");
    }

    #[test]
    fn integer() {
        fn get_integer(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Integer(i) => token.integer(input, i).val.to_string(),
                other => panic!("not integer {other:?}"),
            }
        }
        // `+` signs, `_` separators, and `0x` prefixes are normalized away;
        // `-` signs are kept.
        assert_eq!(get_integer("1"), "1");
        assert_eq!(get_integer("0"), "0");
        assert_eq!(get_integer("-1"), "-1");
        assert_eq!(get_integer("+1"), "1");
        assert_eq!(get_integer("+1_000"), "1000");
        assert_eq!(get_integer("+1_0_0_0"), "1000");
        assert_eq!(get_integer("+0x10"), "10");
        assert_eq!(get_integer("-0x10"), "-10");
        assert_eq!(get_integer("0x10"), "10");
    }

    #[test]
    fn float() {
        fn get_float(input: &str) -> Float<'_> {
            let token = get_token(input);
            match token.kind {
                TokenKind::Float(f) => token.float(input, f),
                other => panic!("not float {other:?}"),
            }
        }
        assert_eq!(
            get_float("nan"),
            Float::Nan {
                val: None,
                negative: false
            },
        );
        assert_eq!(
            get_float("-nan"),
            Float::Nan {
                val: None,
                negative: true,
            },
        );
        assert_eq!(
            get_float("+nan"),
            Float::Nan {
                val: None,
                negative: false,
            },
        );
        assert_eq!(
            get_float("+nan:0x1"),
            Float::Nan {
                val: Some("1".into()),
                negative: false,
            },
        );
        assert_eq!(
            get_float("nan:0x7f_ffff"),
            Float::Nan {
                val: Some("7fffff".into()),
                negative: false,
            },
        );
        assert_eq!(get_float("inf"), Float::Inf { negative: false });
        assert_eq!(get_float("-inf"), Float::Inf { negative: true });
        assert_eq!(get_float("+inf"), Float::Inf { negative: false });

        assert_eq!(
            get_float("1.2"),
            Float::Val {
                integral: "1".into(),
                fractional: Some("2".into()),
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("1.2e3"),
            Float::Val {
                integral: "1".into(),
                fractional: Some("2".into()),
                exponent: Some("3".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("-1_2.1_1E+0_1"),
            Float::Val {
                integral: "-12".into(),
                fractional: Some("11".into()),
                exponent: Some("01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("+1_2.1_1E-0_1"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("11".into()),
                exponent: Some("-01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1_2.3_4p5_6"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("34".into()),
                exponent: Some("56".into()),
                hex: true,
            },
        );
        assert_eq!(
            get_float("+0x1_2.3_4P-5_6"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("34".into()),
                exponent: Some("-56".into()),
                hex: true,
            },
        );
        // A trailing `.` with no digits yields `fractional: None`.
        assert_eq!(
            get_float("1."),
            Float::Val {
                integral: "1".into(),
                fractional: None,
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1p-24"),
            Float::Val {
                integral: "1".into(),
                fractional: None,
                exponent: Some("-24".into()),
                hex: true,
            },
        );
    }
}