Coverage Report

Created: 2025-06-23 13:53

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/build/cargo-vendor-dir/wast-231.0.0/src/lexer.rs
Line
Count
Source
1
//! Definition of a lexer for the WebAssembly text format.
2
//!
3
//! This module provides a [`Lexer`][] type which is an iterator over the raw
4
//! tokens of a WebAssembly text file. A [`Lexer`][] accounts for every single
5
//! byte in a WebAssembly text file, returning tokens even for comments and
6
//! whitespace. Typically you'll ignore comments and whitespace, however.
7
//!
8
//! If you'd like to iterate over the tokens in a file you can do so via:
9
//!
10
//! ```
11
//! # fn foo() -> Result<(), wast::Error> {
12
//! use wast::lexer::Lexer;
13
//!
14
//! let wat = "(module (func $foo))";
15
//! for token in Lexer::new(wat).iter(0) {
16
//!     println!("{:?}", token?);
17
//! }
18
//! # Ok(())
19
//! # }
20
//! ```
21
//!
22
//! Note that you'll typically not use this module but will rather use
23
//! [`ParseBuffer`](crate::parser::ParseBuffer) instead.
24
//!
25
//! [`Lexer`]: crate::lexer::Lexer
26
27
use crate::token::Span;
28
use crate::Error;
29
use std::borrow::Cow;
30
use std::char;
31
use std::fmt;
32
use std::slice;
33
use std::str;
34
use std::str::Utf8Error;
35
36
/// A structure used to lex the s-expression syntax of WAT files.
37
///
38
/// This structure is used to generate [`Token`] items, which should account for
39
/// every single byte of the input as we iterate over it. A [`LexError`] is
40
/// returned for any non-lexable text.
41
#[derive(Clone)]
42
pub struct Lexer<'a> {
43
    input: &'a str,
44
    allow_confusing_unicode: bool,
45
}
46
47
/// A single token parsed from a `Lexer`.
48
#[derive(Copy, Clone, Debug, PartialEq)]
49
pub struct Token {
50
    /// The kind of token this represents, such as whether it's whitespace, a
51
    /// keyword, etc.
52
    pub kind: TokenKind,
53
    /// The byte offset within the original source for where this token came
54
    /// from.
55
    pub offset: usize,
56
    /// The byte length of this token as it resides in the original source.
57
    //
58
    // NB: this is `u32` to enable packing `Token` into two pointers of size.
59
    // This does limit a single token to being at most 4G large, but that seems
60
    // probably ok.
61
    pub len: u32,
62
}
63
64
#[test]
65
fn token_is_not_too_big() {
66
    assert!(std::mem::size_of::<Token>() <= std::mem::size_of::<u64>() * 2);
67
}
68
69
/// Classification of what was parsed from the input stream.
70
///
71
/// This enumeration contains all kinds of fragments, including comments and
72
/// whitespace.
73
#[derive(Copy, Clone, Debug, PartialEq)]
74
pub enum TokenKind {
75
    /// A line comment, preceded with `;;`
76
    LineComment,
77
78
    /// A block comment, surrounded by `(;` and `;)`. Note that these can be
79
    /// nested.
80
    BlockComment,
81
82
    /// A fragment of source that represents whitespace.
83
    Whitespace,
84
85
    /// A left-parenthesis, including the source text for where it comes from.
86
    LParen,
87
    /// A right-parenthesis, including the source text for where it comes from.
88
    RParen,
89
90
    /// A string literal, which is actually a list of bytes.
91
    String,
92
93
    /// An identifier (like `$foo`).
94
    ///
95
    /// All identifiers start with `$` and the payload here is the original
96
    /// source text.
97
    Id,
98
99
    /// A keyword, or something that starts with an alphabetic character.
100
    ///
101
    /// The payload here is the original source text.
102
    Keyword,
103
104
    /// An annotation (like `@foo`).
105
    ///
106
    /// All annotations start with `@` and the payload will be the name of the
107
    /// annotation.
108
    Annotation,
109
110
    /// A reserved series of `idchar` symbols. Unknown what this is meant to be
111
    /// used for, you'll probably generate an error about an unexpected token.
112
    Reserved,
113
114
    /// An integer.
115
    Integer(IntegerKind),
116
117
    /// A float.
118
    Float(FloatKind),
119
}
120
121
/// Description of the parsed integer from the source.
122
#[derive(Copy, Clone, Debug, PartialEq)]
123
pub struct IntegerKind {
124
    sign: Option<SignToken>,
125
    has_underscores: bool,
126
    hex: bool,
127
}
128
129
/// Description of a parsed float from the source.
130
#[allow(missing_docs)]
131
#[derive(Copy, Clone, Debug, PartialEq)]
132
pub enum FloatKind {
133
    #[doc(hidden)]
134
    Inf { negative: bool },
135
    #[doc(hidden)]
136
    Nan { negative: bool },
137
    #[doc(hidden)]
138
    NanVal {
139
        negative: bool,
140
        has_underscores: bool,
141
    },
142
    #[doc(hidden)]
143
    Normal { has_underscores: bool, hex: bool },
144
}
145
146
enum ReservedKind {
147
    /// "..."
148
    String,
149
    /// anything that's just a sequence of `idchars!()`
150
    Idchars,
151
    /// $"..."
152
    IdString,
153
    /// @"..."
154
    AnnotationString,
155
    /// everything else (a conglomeration of strings, idchars, etc)
156
    Reserved,
157
}
158
159
/// Errors that can be generated while lexing.
160
///
161
/// All lexing errors have line/column/position information as well as a
162
/// `LexError` indicating what kind of error happened while lexing.
163
#[derive(Debug, Clone, PartialEq, Eq)]
164
#[non_exhaustive]
165
pub enum LexError {
166
    /// A dangling block comment was found with an unbalanced `(;` which was
167
    /// never terminated in the file.
168
    DanglingBlockComment,
169
170
    /// An unexpected character was encountered when generally parsing and
171
    /// looking for something else.
172
    Unexpected(char),
173
174
    /// An invalid `char` in a string literal was found.
175
    InvalidStringElement(char),
176
177
    /// An invalid string escape letter was found (the thing after the `\` in
178
    /// string literals)
179
    InvalidStringEscape(char),
180
181
    /// An invalid hexadecimal digit was found.
182
    InvalidHexDigit(char),
183
184
    /// An invalid base-10 digit was found.
185
    InvalidDigit(char),
186
187
    /// Parsing expected `wanted` but ended up finding `found` instead where the
188
    /// two characters aren't the same.
189
    Expected {
190
        /// The character that was expected to be found
191
        wanted: char,
192
        /// The character that was actually found
193
        found: char,
194
    },
195
196
    /// We needed to parse more but EOF (or end of the string) was encountered.
197
    UnexpectedEof,
198
199
    /// A number failed to parse because it was too big to fit within the target
200
    /// type.
201
    NumberTooBig,
202
203
    /// An invalid unicode value was found in a `\u{...}` escape in a string,
204
    /// only valid unicode scalars can be escaped that way.
205
    InvalidUnicodeValue(u32),
206
207
    /// A lone underscore was found when parsing a number, since underscores
208
    /// should always be preceded and succeeded with a digit of some form.
209
    LoneUnderscore,
210
211
    /// A "confusing" unicode character is present in a comment or a string
212
    /// literal, such as a character that changes the direction text is
213
    /// typically displayed in editors. This could cause the human-read
214
    /// version to behave differently than the compiler-visible version, so
215
    /// these are simply rejected for now.
216
    ConfusingUnicode(char),
217
218
    /// An invalid utf-8 sequence was found in a quoted identifier, such as
219
    /// `$"\ff"`.
220
    InvalidUtf8Id(Utf8Error),
221
222
    /// An empty identifier was found, or a lone `$`.
223
    EmptyId,
224
225
    /// An empty identifier was found, or a lone `@`.
226
    EmptyAnnotation,
227
}
228
229
/// A sign token for an integer.
230
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
231
pub enum SignToken {
232
    /// Plus sign: "+",
233
    Plus,
234
    /// Minus sign: "-",
235
    Minus,
236
}
237
238
/// A fully parsed integer from a source string with a payload ready to parse
239
/// into an integral type.
240
#[derive(Debug, PartialEq)]
241
pub struct Integer<'a> {
242
    sign: Option<SignToken>,
243
    val: Cow<'a, str>,
244
    hex: bool,
245
}
246
247
/// Possible parsed float values
248
#[derive(Debug, PartialEq, Eq)]
249
pub enum Float<'a> {
250
    /// A float `NaN` representation
251
    Nan {
252
        /// The specific bits to encode for this float, optionally
253
        val: Option<Cow<'a, str>>,
254
        /// Whether or not this is a negative `NaN` or not.
255
        negative: bool,
256
    },
257
    /// A float infinite representation,
258
    Inf {
259
        #[allow(missing_docs)]
260
        negative: bool,
261
    },
262
    /// A parsed and separated floating point value
263
    Val {
264
        /// Whether or not the `integral` and `fractional` are specified in hex
265
        hex: bool,
266
        /// The float parts before the `.`
267
        integral: Cow<'a, str>,
268
        /// The float parts after the `.`
269
        fractional: Option<Cow<'a, str>>,
270
        /// The exponent to multiply this `integral.fractional` portion of the
271
        /// float by. If `hex` is true this is `2^exponent` and otherwise it's
272
        /// `10^exponent`
273
        exponent: Option<Cow<'a, str>>,
274
    },
275
}
276
277
// https://webassembly.github.io/spec/core/text/values.html#text-idchar
278
macro_rules! idchars {
279
    () => {
280
        b'0'..=b'9'
281
        | b'A'..=b'Z'
282
        | b'a'..=b'z'
283
        | b'!'
284
        | b'#'
285
        | b'$'
286
        | b'%'
287
        | b'&'
288
        | b'\''
289
        | b'*'
290
        | b'+'
291
        | b'-'
292
        | b'.'
293
        | b'/'
294
        | b':'
295
        | b'<'
296
        | b'='
297
        | b'>'
298
        | b'?'
299
        | b'@'
300
        | b'\\'
301
        | b'^'
302
        | b'_'
303
        | b'`'
304
        | b'|'
305
        | b'~'
306
    }
307
}
308
309
impl<'a> Lexer<'a> {
310
    /// Creates a new lexer which will lex the `input` source string.
311
495
    pub fn new(input: &str) -> Lexer<'_> {
312
495
        Lexer {
313
495
            input,
314
495
            allow_confusing_unicode: false,
315
495
        }
316
495
    }
317
318
    /// Returns the original source input that we're lexing.
319
55.4k
    pub fn input(&self) -> &'a str {
320
55.4k
        self.input
321
55.4k
    }
322
323
    /// Configures whether "confusing" unicode characters are allowed while
324
    /// lexing.
325
    ///
326
    /// If allowed then no error will happen if these characters are found, but
327
    /// otherwise if disallowed a lex error will be produced when these
328
    /// characters are found. Confusing characters are denied by default.
329
    ///
330
    /// For now "confusing characters" are primarily related to the "trojan
331
    /// source" problem where it refers to characters which cause humans to read
332
    /// text differently than this lexer, such as characters that alter the
333
    /// left-to-right display of the source code.
334
0
    pub fn allow_confusing_unicode(&mut self, allow: bool) -> &mut Self {
335
0
        self.allow_confusing_unicode = allow;
336
0
        self
337
0
    }
338
339
    /// Lexes the next token at the byte position `pos` in the input.
340
    ///
341
    /// Returns `Some` if a token is found or `None` if we're at EOF.
342
    ///
343
    /// The `pos` argument will be updated to point to the next token on a
344
    /// successful parse.
345
    ///
346
    /// # Errors
347
    ///
348
    /// Returns an error if the input is malformed.
349
158k
    pub fn parse(&self, pos: &mut usize) -> Result<Option<Token>, Error> {
350
158k
        let offset = *pos;
351
158k
        Ok(match self.parse_kind(pos)
?0
{
352
157k
            Some(kind) => Some(Token {
353
157k
                kind,
354
157k
                offset,
355
157k
                len: (*pos - offset).try_into().unwrap(),
356
157k
            }),
357
990
            None => None,
358
        })
359
158k
    }
360
361
158k
    fn parse_kind(&self, pos: &mut usize) -> Result<Option<TokenKind>, Error> {
362
158k
        let start = *pos;
363
158k
        // This `match` generally parses the grammar specified at
364
158k
        //
365
158k
        // https://webassembly.github.io/spec/core/text/lexical.html#text-token
366
158k
        let remaining = &self.input.as_bytes()[start..];
367
158k
        let 
byte157k
= match remaining.first() {
368
157k
            Some(b) => b,
369
990
            None => return Ok(None),
370
        };
371
372
157k
        match byte {
373
            // Open-parens check the next character to see if this is the start
374
            // of a block comment, otherwise it's just a bland left-paren
375
            // token.
376
25.9k
            b'(' => match remaining.get(1) {
377
                Some(b';') => {
378
0
                    let mut level = 1;
379
0
                    // Note that we're doing a byte-level search here for the
380
0
                    // close-delimiter of `;)`. The actual source text is utf-8
381
0
                    // encode in `remaining` but due to how utf-8 works we
382
0
                    // can safely search for an ASCII byte since it'll never
383
0
                    // otherwise appear in the middle of a codepoint and if we
384
0
                    // find it then it's guaranteed to be the right byte.
385
0
                    //
386
0
                    // Mainly we're avoiding the overhead of decoding utf-8
387
0
                    // characters into a Rust `char` since it's otherwise
388
0
                    // unnecessary work.
389
0
                    let mut iter = remaining[2..].iter();
390
0
                    while let Some(ch) = iter.next() {
391
0
                        match ch {
392
                            b'(' => {
393
0
                                if let Some(b';') = iter.as_slice().first() {
394
0
                                    level += 1;
395
0
                                    iter.next();
396
0
                                }
397
                            }
398
                            b';' => {
399
0
                                if let Some(b')') = iter.as_slice().first() {
400
0
                                    level -= 1;
401
0
                                    iter.next();
402
0
                                    if level == 0 {
403
0
                                        let len = remaining.len() - iter.as_slice().len();
404
0
                                        let comment = &self.input[start..][..len];
405
0
                                        *pos += len;
406
0
                                        self.check_confusing_comment(*pos, comment)?;
407
0
                                        return Ok(Some(TokenKind::BlockComment));
408
0
                                    }
409
0
                                }
410
                            }
411
0
                            _ => {}
412
                        }
413
                    }
414
0
                    Err(self.error(start, LexError::DanglingBlockComment))
415
                }
416
                _ => {
417
25.9k
                    *pos += 1;
418
25.9k
419
25.9k
                    Ok(Some(TokenKind::LParen))
420
                }
421
            },
422
423
            b')' => {
424
16.2k
                *pos += 1;
425
16.2k
                Ok(Some(TokenKind::RParen))
426
            }
427
428
            // https://webassembly.github.io/spec/core/text/lexical.html#white-space
429
            b' ' | b'\n' | b'\r' | b'\t' => {
430
59.4k
                self.skip_ws(pos);
431
59.4k
                Ok(Some(TokenKind::Whitespace))
432
            }
433
434
54.1k
            c @ (
idchars!()44.4k
| b'"') => {
435
54.1k
                let (kind, src) = self.parse_reserved(pos)
?0
;
436
54.1k
                match kind {
437
                    // If the reserved token was simply a single string then
438
                    // that is converted to a standalone string token
439
3.82k
                    ReservedKind::String => return Ok(Some(TokenKind::String)),
440
441
                    // If only idchars were consumed then this could be a
442
                    // specific kind of standalone token we're interested in.
443
                    ReservedKind::Idchars => {
444
                        // https://webassembly.github.io/spec/core/text/values.html#integers
445
50.3k
                        if let Some(
ret5.72k
) = self.classify_number(src) {
446
5.72k
                            return Ok(Some(ret));
447
                        // https://webassembly.github.io/spec/core/text/values.html#text-id
448
44.6k
                        } else if *c == b'$' {
449
5.92k
                            return Ok(Some(TokenKind::Id));
450
                        // part of the WebAssembly/annotations proposal
451
                        // (no online url yet)
452
38.7k
                        } else if *c == b'@' {
453
0
                            return Ok(Some(TokenKind::Annotation));
454
                        // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword
455
38.7k
                        } else if b'a' <= *c && *c <= b'z' {
456
38.7k
                            return Ok(Some(TokenKind::Keyword));
457
0
                        }
458
                    }
459
460
0
                    ReservedKind::IdString => return Ok(Some(TokenKind::Id)),
461
0
                    ReservedKind::AnnotationString => return Ok(Some(TokenKind::Annotation)),
462
463
                    // ... otherwise this was a conglomeration of idchars,
464
                    // strings, or just idchars that don't match a prior rule,
465
                    // meaning this falls through to the fallback `Reserved`
466
                    // token.
467
0
                    ReservedKind::Reserved => {}
468
                }
469
470
0
                Ok(Some(TokenKind::Reserved))
471
            }
472
473
            // This could be a line comment, otherwise `;` is a reserved token.
474
            // The second byte is checked to see if it's a `;;` line comment
475
            //
476
            // Note that this character being considered as part of a
477
            // `reserved` token is part of the annotations proposal.
478
1.21k
            b';' => match remaining.get(1) {
479
                Some(b';') => {
480
1.21k
                    let remaining = &self.input[*pos..];
481
1.21k
                    let byte_pos = memchr::memchr2(b'\n', b'\r', remaining.as_bytes())
482
1.21k
                        .unwrap_or(remaining.len());
483
1.21k
                    *pos += byte_pos;
484
1.21k
                    let comment = &remaining[..byte_pos];
485
1.21k
                    self.check_confusing_comment(*pos, comment)
?0
;
486
1.21k
                    Ok(Some(TokenKind::LineComment))
487
                }
488
                _ => {
489
0
                    *pos += 1;
490
0
                    Ok(Some(TokenKind::Reserved))
491
                }
492
            },
493
494
            // Other known reserved tokens other than `;`
495
            //
496
            // Note that these characters being considered as part of a
497
            // `reserved` token is part of the annotations proposal.
498
            b',' | b'[' | b']' | b'{' | b'}' => {
499
0
                *pos += 1;
500
0
                Ok(Some(TokenKind::Reserved))
501
            }
502
503
            _ => {
504
0
                let ch = self.input[start..].chars().next().unwrap();
505
0
                Err(self.error(*pos, LexError::Unexpected(ch)))
506
            }
507
        }
508
158k
    }
509
510
59.4k
    fn skip_ws(&self, pos: &mut usize) {
511
        // This table is a byte lookup table to determine whether a byte is a
512
        // whitespace byte. There are only 4 whitespace bytes for the `*.wat`
513
        // format right now which are ' ', '\t', '\r', and '\n'. These 4 bytes
514
        // have a '1' in the table below.
515
        //
516
        // Due to how utf-8 works (our input is guaranteed to be utf-8) it is
517
        // known that if these bytes are found they're guaranteed to be the
518
        // whitespace byte, so they can be safely skipped and we don't have to
519
        // do full utf-8 decoding. This means that the goal of this function is
520
        // to find the first non-whitespace byte in `remaining`.
521
        //
522
        // For now this lookup table seems to be the fastest, but projects like
523
        // https://github.com/lemire/despacer show other simd algorithms which
524
        // can possibly accelerate this even more. Note that `*.wat` files often
525
        // have a lot of whitespace so this function is typically quite hot when
526
        // parsing inputs.
527
        #[rustfmt::skip]
528
        const WS: [u8; 256] = [
529
            //                                   \t \n       \r
530
            /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
531
            /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
532
            //        ' '
533
            /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
534
            /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
535
            /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
536
            /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
537
            /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
538
            /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
539
            /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
540
            /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
541
            /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
542
            /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
543
            /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
544
            /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
545
            /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
546
            /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
547
        ];
548
59.4k
        let remaining = &self.input[*pos..];
549
59.4k
        let non_ws_pos = remaining
550
59.4k
            .as_bytes()
551
59.4k
            .iter()
552
163k
            .position(|b| WS[*b as usize] != 1)
553
59.4k
            .unwrap_or(remaining.len());
554
59.4k
        *pos += non_ws_pos;
555
59.4k
    }
556
557
    /// Splits off a "reserved" token which is then further processed later on
558
    /// to figure out which kind of token it is depending on `ReservedKind`.
559
    ///
560
    /// For more information on this method see the clarification at
561
    /// <https://github.com/WebAssembly/spec/pull/1499> but the general gist is
562
    /// that this is parsing the grammar:
563
    ///
564
    /// ```text
565
    /// reserved := (idchar | string)+
566
    /// ```
567
    ///
568
    /// which means that it is eating any number of adjacent string/idchar
569
    /// tokens (e.g. `a"b"c`) and returning the classification of what was
570
    /// eaten. The classification assists in determining what the actual token
571
    /// eaten here looks like.
572
54.1k
    fn parse_reserved(&self, pos: &mut usize) -> Result<(ReservedKind, &'a str), Error> {
573
54.1k
        let mut idchars = 0u32;
574
54.1k
        let mut strings = 0u32;
575
54.1k
        let start = *pos;
576
296k
        while let Some(byte) = self.input.as_bytes().get(*pos) {
577
296k
            match byte {
578
296k
                // Normal `idchars` production which appends to the reserved
579
296k
                // token that's being produced.
580
296k
                idchars!() => {
581
238k
                    idchars += 1;
582
238k
                    *pos += 1;
583
238k
                }
584
585
                // https://webassembly.github.io/spec/core/text/values.html#text-string
586
                b'"' => {
587
3.82k
                    strings += 1;
588
3.82k
                    *pos += 1;
589
3.82k
                    let mut it = self.input[*pos..].chars();
590
3.82k
                    let result = Lexer::parse_str(&mut it, self.allow_confusing_unicode);
591
3.82k
                    *pos = self.input.len() - it.as_str().len();
592
3.82k
                    match result {
593
3.82k
                        Ok(_) => {}
594
0
                        Err(e) => {
595
0
                            let err_pos = match &e {
596
0
                                LexError::UnexpectedEof => self.input.len(),
597
0
                                _ => self.input[..*pos].char_indices().next_back().unwrap().0,
598
                            };
599
0
                            return Err(self.error(err_pos, e));
600
                        }
601
                    }
602
                }
603
604
                // Nothing else is considered part of a reserved token
605
54.1k
                _ => break,
606
            }
607
        }
608
54.1k
        let ret = &self.input[start..*pos];
609
54.1k
        Ok(match (idchars, strings) {
610
0
            (0, 0) => unreachable!(),
611
3.82k
            (0, 1) => (ReservedKind::String, ret),
612
50.3k
            (_, 0) => (ReservedKind::Idchars, ret),
613
            // Pattern match `@"..."` and `$"..."` for string-based
614
            // identifiers and annotations.
615
0
            (1, 1) if ret.starts_with("$") => (ReservedKind::IdString, ret),
616
0
            (1, 1) if ret.starts_with("@") => (ReservedKind::AnnotationString, ret),
617
0
            _ => (ReservedKind::Reserved, ret),
618
        })
619
54.1k
    }
620
621
50.3k
    fn classify_number(&self, src: &str) -> Option<TokenKind> {
622
50.3k
        let (sign, num) = if let Some(
stripped0
) = src.strip_prefix('+') {
623
0
            (Some(SignToken::Plus), stripped)
624
50.3k
        } else if let Some(
stripped3
) = src.strip_prefix('-') {
625
3
            (Some(SignToken::Minus), stripped)
626
        } else {
627
50.3k
            (None, src)
628
        };
629
630
50.3k
        let negative = sign == Some(SignToken::Minus);
631
50.3k
632
50.3k
        // Handle `inf` and `nan` which are special numbers here
633
50.3k
        if num == "inf" {
634
0
            return Some(TokenKind::Float(FloatKind::Inf { negative }));
635
50.3k
        } else if num == "nan" {
636
0
            return Some(TokenKind::Float(FloatKind::Nan { negative }));
637
50.3k
        } else if let Some(
stripped0
) = num.strip_prefix("nan:0x") {
638
0
            let mut it = stripped.as_bytes().iter();
639
0
            let has_underscores = skip_underscores(&mut it, |x| char::from(x).is_ascii_hexdigit())?;
640
0
            if it.next().is_some() {
641
0
                return None;
642
0
            }
643
0
            return Some(TokenKind::Float(FloatKind::NanVal {
644
0
                negative,
645
0
                has_underscores,
646
0
            }));
647
50.3k
        }
648
649
        // Figure out if we're a hex number or not
650
        let test_valid: fn(u8) -> bool;
651
50.3k
        let (mut it, hex) = if let Some(
stripped15
) = num.strip_prefix("0x") {
652
105
            
test_valid = 15
|x: u8| char::from(x).is_ascii_hexdigit();
653
15
            (stripped.as_bytes().iter(), true)
654
        } else {
655
53.2k
            
test_valid = 50.3k
|x: u8| char::from(x).is_ascii_digit();
656
50.3k
            (num.as_bytes().iter(), false)
657
        };
658
659
        // Evaluate the first part, moving out all underscores
660
50.3k
        let 
mut has_underscores5.72k
= skip_underscores(&mut it, test_valid)
?44.6k
;
661
662
5.72k
        match it.clone().next() {
663
            // If we're followed by something this may be a float so keep going.
664
188
            Some(_) => {}
665
666
            // Otherwise this is a valid integer literal!
667
            None => {
668
5.53k
                return Some(TokenKind::Integer(IntegerKind {
669
5.53k
                    has_underscores,
670
5.53k
                    sign,
671
5.53k
                    hex,
672
5.53k
                }))
673
            }
674
        }
675
676
        // A number can optionally be after the dot so only actually try to
677
        // parse one if it's there.
678
188
        if it.clone().next() == Some(&b'.') {
679
188
            it.next();
680
188
            match it.clone().next() {
681
188
                Some(c) if test_valid(*c) => {
682
188
                    if skip_underscores(&mut it, test_valid)
?0
{
683
0
                        has_underscores = true;
684
188
                    }
685
                }
686
0
                Some(_) | None => {}
687
            }
688
0
        };
689
690
        // Figure out if there's an exponential part here to make a float, and
691
        // if so parse it but defer its actual calculation until later.
692
188
        match (hex, it.next()) {
693
            (true, Some(b'p')) | (true, Some(b'P')) | (false, Some(b'e')) | (false, Some(b'E')) => {
694
0
                match it.clone().next() {
695
0
                    Some(b'-') => {
696
0
                        it.next();
697
0
                    }
698
0
                    Some(b'+') => {
699
0
                        it.next();
700
0
                    }
701
0
                    _ => {}
702
                }
703
0
                if skip_underscores(&mut it, |x| char::from(x).is_ascii_digit())? {
704
0
                    has_underscores = true;
705
0
                }
706
            }
707
188
            (_, None) => {}
708
0
            _ => return None,
709
        }
710
711
        // We should have eaten everything by now, if not then this is surely
712
        // not a float or integer literal.
713
188
        if it.next().is_some() {
714
0
            return None;
715
188
        }
716
188
717
188
        return Some(TokenKind::Float(FloatKind::Normal {
718
188
            has_underscores,
719
188
            hex,
720
188
        }));
721
722
50.5k
        fn skip_underscores(it: &mut slice::Iter<'_, u8>, good: fn(u8) -> bool) -> Option<bool> {
723
50.5k
            let mut last_underscore = false;
724
50.5k
            let mut has_underscores = false;
725
50.5k
            let first = *it.next()
?0
;
726
50.5k
            if !good(first) {
727
44.6k
                return None;
728
5.90k
            }
729
8.38k
            while let Some(
c2.66k
) = it.clone().next() {
730
2.66k
                if *c == b'_' && 
!last_underscore9
{
731
9
                    has_underscores = true;
732
9
                    it.next();
733
9
                    last_underscore = true;
734
9
                    continue;
735
2.65k
                }
736
2.65k
                if !good(*c) {
737
188
                    break;
738
2.46k
                }
739
2.46k
                last_underscore = false;
740
2.46k
                it.next();
741
            }
742
5.90k
            if last_underscore {
743
0
                return None;
744
5.90k
            }
745
5.90k
            Some(has_underscores)
746
50.5k
        }
747
50.3k
    }
748
749
    /// Verifies that `comment`, which is about to be returned, has a "confusing
750
    /// unicode character" in it and should instead be transformed into an
751
    /// error.
752
1.21k
    fn check_confusing_comment(&self, end: usize, comment: &str) -> Result<(), Error> {
753
1.21k
        if self.allow_confusing_unicode {
754
0
            return Ok(());
755
1.21k
        }
756
1.21k
757
1.21k
        // In an effort to avoid utf-8 decoding the entire `comment` the search
758
1.21k
        // here is a bit more optimized. This checks for the `0xe2` byte because
759
1.21k
        // in the utf-8 encoding that's the leading encoding byte for all
760
1.21k
        // "confusing characters". Each instance of 0xe2 is checked to see if it
761
1.21k
        // starts a confusing character, and if so that's returned.
762
1.21k
        //
763
1.21k
        // Also note that 0xe2 will never be found in the middle of a codepoint,
764
1.21k
        // it's always the start of a codepoint. This means that if our special
765
1.21k
        // characters show up they're guaranteed to start with 0xe2 bytes.
766
1.21k
        let bytes = comment.as_bytes();
767
1.21k
        for 
pos0
in memchr::Memchr::new(0xe2, bytes) {
768
0
            if let Some(c) = comment[pos..].chars().next() {
769
0
                if is_confusing_unicode(c) {
770
                    // Note that `self.cur()` accounts for already having
771
                    // parsed `comment`, so we move backwards to where
772
                    // `comment` started and then add the index within
773
                    // `comment`.
774
0
                    let pos = end - comment.len() + pos;
775
0
                    return Err(self.error(pos, LexError::ConfusingUnicode(c)));
776
0
                }
777
0
            }
778
        }
779
780
1.21k
        Ok(())
781
1.21k
    }
782
783
7.37k
    fn parse_str(
784
7.37k
        it: &mut str::Chars<'a>,
785
7.37k
        allow_confusing_unicode: bool,
786
7.37k
    ) -> Result<Cow<'a, [u8]>, LexError> {
787
        enum State {
788
            Start,
789
            String(Vec<u8>),
790
        }
791
7.37k
        let orig = it.as_str();
792
7.37k
        let mut state = State::Start;
793
        loop {
794
67.2k
            match it.next().ok_or(LexError::UnexpectedEof)
?0
{
795
7.37k
                '"' => break,
796
                '\\' => {
797
1.19k
                    match state {
798
952
                        State::String(_) => {}
799
246
                        State::Start => {
800
246
                            let pos = orig.len() - it.as_str().len() - 1;
801
246
                            state = State::String(orig[..pos].as_bytes().to_vec());
802
246
                        }
803
                    }
804
1.19k
                    let buf = match &mut state {
805
1.19k
                        State::String(b) => b,
806
0
                        State::Start => unreachable!(),
807
                    };
808
1.19k
                    match it.next().ok_or(LexError::UnexpectedEof)
?0
{
809
0
                        '"' => buf.push(b'"'),
810
0
                        '\'' => buf.push(b'\''),
811
0
                        't' => buf.push(b'\t'),
812
0
                        'n' => buf.push(b'\n'),
813
0
                        'r' => buf.push(b'\r'),
814
0
                        '\\' => buf.push(b'\\'),
815
                        'u' => {
816
0
                            Lexer::must_eat_char(it, '{')?;
817
0
                            let n = Lexer::hexnum(it)?;
818
0
                            let c = char::from_u32(n).ok_or(LexError::InvalidUnicodeValue(n))?;
819
0
                            buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
820
0
                            Lexer::must_eat_char(it, '}')?;
821
                        }
822
1.19k
                        c1 if c1.is_ascii_hexdigit() => {
823
1.19k
                            let c2 = Lexer::hexdigit(it)
?0
;
824
1.19k
                            buf.push(to_hex(c1) * 16 + c2);
825
                        }
826
0
                        c => return Err(LexError::InvalidStringEscape(c)),
827
                    }
828
                }
829
58.7k
                
c0
if (c as u32) < 0x20 || c as u32 == 0x7f => {
830
0
                    return Err(LexError::InvalidStringElement(c))
831
                }
832
58.7k
                
c0
if !allow_confusing_unicode && is_confusing_unicode(
c29.8k
) => {
833
0
                    return Err(LexError::ConfusingUnicode(c))
834
                }
835
58.7k
                c => match &mut state {
836
58.7k
                    State::Start => {}
837
2
                    State::String(v) => {
838
2
                        v.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
839
2
                    }
840
                },
841
            }
842
        }
843
7.37k
        match state {
844
7.13k
            State::Start => Ok(orig[..orig.len() - it.as_str().len() - 1].as_bytes().into()),
845
246
            State::String(s) => Ok(s.into()),
846
        }
847
7.37k
    }
848
849
    /// Parses an id-or-string-based name from `it`.
850
    ///
851
    /// Note that `it` should already have been lexed and this is just
852
    /// extracting the value. If the token lexed was `@a` then this should point
853
    /// to `a`.
854
    ///
855
    /// This will automatically detect quoted syntax such as `@"..."` and the
856
    /// byte string will be parsed and validated as utf-8.
857
    ///
858
    /// # Errors
859
    ///
860
    /// Returns an error if a quoted byte string is found and contains invalid
861
    /// utf-8.
862
1.60k
    fn parse_name(it: &mut str::Chars<'a>) -> Result<Cow<'a, str>, LexError> {
863
1.60k
        if it.clone().next() == Some('"') {
864
0
            it.next();
865
0
            match Lexer::parse_str(it, true)? {
866
0
                Cow::Borrowed(bytes) => match std::str::from_utf8(bytes) {
867
0
                    Ok(s) => Ok(Cow::Borrowed(s)),
868
0
                    Err(e) => Err(LexError::InvalidUtf8Id(e)),
869
                },
870
0
                Cow::Owned(bytes) => match String::from_utf8(bytes) {
871
0
                    Ok(s) => Ok(Cow::Owned(s)),
872
0
                    Err(e) => Err(LexError::InvalidUtf8Id(e.utf8_error())),
873
                },
874
            }
875
        } else {
876
1.60k
            Ok(Cow::Borrowed(it.as_str()))
877
        }
878
1.60k
    }
879
880
0
    fn hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError> {
881
0
        let n = Lexer::hexdigit(it)?;
882
0
        let mut last_underscore = false;
883
0
        let mut n = n as u32;
884
0
        while let Some(c) = it.clone().next() {
885
0
            if c == '_' {
886
0
                it.next();
887
0
                last_underscore = true;
888
0
                continue;
889
0
            }
890
0
            if !c.is_ascii_hexdigit() {
891
0
                break;
892
0
            }
893
0
            last_underscore = false;
894
0
            it.next();
895
0
            n = n
896
0
                .checked_mul(16)
897
0
                .and_then(|n| n.checked_add(to_hex(c) as u32))
898
0
                .ok_or(LexError::NumberTooBig)?;
899
        }
900
0
        if last_underscore {
901
0
            return Err(LexError::LoneUnderscore);
902
0
        }
903
0
        Ok(n)
904
0
    }
905
906
    /// Reads a hexidecimal digit from the input stream, returning where it's
907
    /// defined and the hex value. Returns an error on EOF or an invalid hex
908
    /// digit.
909
1.19k
    fn hexdigit(it: &mut str::Chars<'_>) -> Result<u8, LexError> {
910
1.19k
        let ch = Lexer::must_char(it)
?0
;
911
1.19k
        if ch.is_ascii_hexdigit() {
912
1.19k
            Ok(to_hex(ch))
913
        } else {
914
0
            Err(LexError::InvalidHexDigit(ch))
915
        }
916
1.19k
    }
917
918
    /// Reads the next character from the input string and where it's located,
919
    /// returning an error if the input stream is empty.
920
1.19k
    fn must_char(it: &mut str::Chars<'_>) -> Result<char, LexError> {
921
1.19k
        it.next().ok_or(LexError::UnexpectedEof)
922
1.19k
    }
923
924
    /// Expects that a specific character must be read next
925
0
    fn must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError> {
926
0
        let found = Lexer::must_char(it)?;
927
0
        if wanted == found {
928
0
            Ok(())
929
        } else {
930
0
            Err(LexError::Expected { wanted, found })
931
        }
932
0
    }
933
934
    /// Creates an error at `pos` with the specified `kind`
935
0
    fn error(&self, pos: usize, kind: LexError) -> Error {
936
0
        Error::lex(Span { offset: pos }, self.input, kind)
937
0
    }
938
939
    /// Returns an iterator over all tokens in the original source string
940
    /// starting at the `pos` specified.
941
495
    pub fn iter(&self, mut pos: usize) -> impl Iterator<Item = Result<Token, Error>> + '_ {
942
960
        std::iter::from_fn(move || self.parse(&mut pos).transpose())
943
495
    }
944
945
    /// Returns whether an annotation is present at `pos`. If it is present then
946
    /// `Ok(Some(token))` is returned corresponding to the token, otherwise
947
    /// `Ok(None)` is returned. If the next token cannot be parsed then an error
948
    /// is returned.
949
25.4k
    pub fn annotation(&self, mut pos: usize) -> Result<Option<Token>, Error> {
950
25.4k
        let bytes = self.input.as_bytes();
951
25.4k
        // Quickly reject anything that for sure isn't an annotation since this
952
25.4k
        // method is used every time an lparen is parsed.
953
25.4k
        if bytes.get(pos) != Some(&b'@') {
954
25.4k
            return Ok(None);
955
0
        }
956
0
        match self.parse(&mut pos)? {
957
0
            Some(token) => match token.kind {
958
0
                TokenKind::Annotation => Ok(Some(token)),
959
0
                _ => Ok(None),
960
            },
961
0
            None => Ok(None),
962
        }
963
25.4k
    }
964
}
965
966
impl Token {
967
    /// Returns the original source text for this token.
968
55.4k
    pub fn src<'a>(&self, s: &'a str) -> &'a str {
969
55.4k
        &s[self.offset..][..self.len.try_into().unwrap()]
970
55.4k
    }
971
972
    /// Returns the identifier, without the leading `$` symbol, that this token
973
    /// represents.
974
    ///
975
    /// Note that this method returns the contents of the identifier. With a
976
    /// string-based identifier this means that escapes have been resolved to
977
    /// their string-based equivalent.
978
    ///
979
    /// Should only be used with `TokenKind::Id`.
980
    ///
981
    /// # Errors
982
    ///
983
    /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
984
    /// which is invalid utf-8.
985
1.60k
    pub fn id<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
986
1.60k
        let mut ch = self.src(s).chars();
987
1.60k
        let dollar = ch.next();
988
1.60k
        debug_assert_eq!(dollar, Some('$'));
989
1.60k
        let id = Lexer::parse_name(&mut ch).map_err(|e| 
self.error(s, e)0
)
?0
;
990
1.60k
        if id.is_empty() {
991
0
            return Err(self.error(s, LexError::EmptyId));
992
1.60k
        }
993
1.60k
        Ok(id)
994
1.60k
    }
995
996
    /// Returns the annotation, without the leading `@` symbol, that this token
997
    /// represents.
998
    ///
999
    /// Note that this method returns the contents of the identifier. With a
1000
    /// string-based identifier this means that escapes have been resolved to
1001
    /// their string-based equivalent.
1002
    ///
1003
    /// Should only be used with `TokenKind::Annotation`.
1004
    ///
1005
    /// # Errors
1006
    ///
1007
    /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
1008
    /// which is invalid utf-8.
1009
0
    pub fn annotation<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
1010
0
        let mut ch = self.src(s).chars();
1011
0
        let at = ch.next();
1012
0
        debug_assert_eq!(at, Some('@'));
1013
0
        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
1014
0
        if id.is_empty() {
1015
0
            return Err(self.error(s, LexError::EmptyAnnotation));
1016
0
        }
1017
0
        Ok(id)
1018
0
    }
1019
1020
    /// Returns the keyword this token represents.
1021
    ///
1022
    /// Should only be used with [`TokenKind::Keyword`].
1023
47.6k
    pub fn keyword<'a>(&self, s: &'a str) -> &'a str {
1024
47.6k
        self.src(s)
1025
47.6k
    }
1026
1027
    /// Returns the reserved string this token represents.
1028
    ///
1029
    /// Should only be used with [`TokenKind::Reserved`].
1030
0
    pub fn reserved<'a>(&self, s: &'a str) -> &'a str {
1031
0
        self.src(s)
1032
0
    }
1033
1034
    /// Returns the parsed string that this token represents.
1035
    ///
1036
    /// This returns either a raw byte slice into the source if that's possible
1037
    /// or an owned representation to handle escaped characters and such.
1038
    ///
1039
    /// Should only be used with [`TokenKind::String`].
1040
3.54k
    pub fn string<'a>(&self, s: &'a str) -> Cow<'a, [u8]> {
1041
3.54k
        let mut ch = self.src(s).chars();
1042
3.54k
        ch.next().unwrap();
1043
3.54k
        Lexer::parse_str(&mut ch, true).unwrap()
1044
3.54k
    }
1045
1046
    /// Returns the decomposed float token that this represents.
1047
    ///
1048
    /// This will slice up the float token into its component parts and return a
1049
    /// description of the float token in the source.
1050
    ///
1051
    /// Should only be used with [`TokenKind::Float`].
1052
182
    pub fn float<'a>(&self, s: &'a str, kind: FloatKind) -> Float<'a> {
1053
182
        match kind {
1054
0
            FloatKind::Inf { negative } => Float::Inf { negative },
1055
0
            FloatKind::Nan { negative } => Float::Nan {
1056
0
                val: None,
1057
0
                negative,
1058
0
            },
1059
            FloatKind::NanVal {
1060
0
                negative,
1061
0
                has_underscores,
1062
0
            } => {
1063
0
                let src = self.src(s);
1064
0
                let src = if src.starts_with("n") { src } else { &src[1..] };
1065
0
                let mut val = Cow::Borrowed(src.strip_prefix("nan:0x").unwrap());
1066
0
                if has_underscores {
1067
0
                    *val.to_mut() = val.replace("_", "");
1068
0
                }
1069
0
                Float::Nan {
1070
0
                    val: Some(val),
1071
0
                    negative,
1072
0
                }
1073
            }
1074
            FloatKind::Normal {
1075
182
                has_underscores,
1076
182
                hex,
1077
182
            } => {
1078
182
                let src = self.src(s);
1079
182
                let (integral, fractional, exponent) = match src.find('.') {
1080
182
                    Some(i) => {
1081
182
                        let integral = &src[..i];
1082
182
                        let rest = &src[i + 1..];
1083
182
                        let exponent = if hex {
1084
0
                            rest.find('p').or_else(|| rest.find('P'))
1085
                        } else {
1086
182
                            rest.find('e').or_else(|| rest.find('E'))
1087
                        };
1088
182
                        match exponent {
1089
0
                            Some(i) => (integral, Some(&rest[..i]), Some(&rest[i + 1..])),
1090
182
                            None => (integral, Some(rest), None),
1091
                        }
1092
                    }
1093
                    None => {
1094
0
                        let exponent = if hex {
1095
0
                            src.find('p').or_else(|| src.find('P'))
1096
                        } else {
1097
0
                            src.find('e').or_else(|| src.find('E'))
1098
                        };
1099
0
                        match exponent {
1100
0
                            Some(i) => (&src[..i], None, Some(&src[i + 1..])),
1101
0
                            None => (src, None, None),
1102
                        }
1103
                    }
1104
                };
1105
182
                let mut integral = Cow::Borrowed(integral.strip_prefix('+').unwrap_or(integral));
1106
182
                let mut fractional = fractional.and_then(|s| {
1107
182
                    if s.is_empty() {
1108
0
                        None
1109
                    } else {
1110
182
                        Some(Cow::Borrowed(s))
1111
                    }
1112
182
                });
1113
182
                let mut exponent =
1114
182
                    exponent.map(|s| 
Cow::Borrowed(s.strip_prefix('+').unwrap_or(s))0
);
1115
182
                if has_underscores {
1116
0
                    *integral.to_mut() = integral.replace("_", "");
1117
0
                    if let Some(fractional) = &mut fractional {
1118
0
                        *fractional.to_mut() = fractional.replace("_", "");
1119
0
                    }
1120
0
                    if let Some(exponent) = &mut exponent {
1121
0
                        *exponent.to_mut() = exponent.replace("_", "");
1122
0
                    }
1123
182
                }
1124
182
                if hex {
1125
0
                    *integral.to_mut() = integral.replace("0x", "");
1126
182
                }
1127
182
                Float::Val {
1128
182
                    hex,
1129
182
                    integral,
1130
182
                    fractional,
1131
182
                    exponent,
1132
182
                }
1133
            }
1134
        }
1135
182
    }
1136
1137
    /// Returns the decomposed integer token that this represents.
1138
    ///
1139
    /// This will slice up the integer token into its component parts and
1140
    /// return a description of the integer token in the source.
1141
    ///
1142
    /// Should only be used with [`TokenKind::Integer`].
1143
2.54k
    pub fn integer<'a>(&self, s: &'a str, kind: IntegerKind) -> Integer<'a> {
1144
2.54k
        let src = self.src(s);
1145
2.54k
        let val = match 
kind.sign2
{
1146
0
            Some(SignToken::Plus) => src.strip_prefix('+').unwrap(),
1147
2
            Some(SignToken::Minus) => src,
1148
2.54k
            None => src,
1149
        };
1150
2.54k
        let mut val = Cow::Borrowed(val);
1151
2.54k
        if kind.has_underscores {
1152
3
            *val.to_mut() = val.replace("_", "");
1153
2.54k
        }
1154
2.54k
        if kind.hex {
1155
9
            *val.to_mut() = val.replace("0x", "");
1156
2.53k
        }
1157
2.54k
        Integer {
1158
2.54k
            sign: kind.sign,
1159
2.54k
            hex: kind.hex,
1160
2.54k
            val,
1161
2.54k
        }
1162
2.54k
    }
1163
1164
0
    fn error(&self, src: &str, err: LexError) -> Error {
1165
0
        Error::lex(
1166
0
            Span {
1167
0
                offset: self.offset,
1168
0
            },
1169
0
            src,
1170
0
            err,
1171
0
        )
1172
0
    }
1173
}
1174
1175
impl<'a> Integer<'a> {
1176
    /// Returns the sign token for this integer.
1177
0
    pub fn sign(&self) -> Option<SignToken> {
1178
0
        self.sign
1179
0
    }
1180
1181
    /// Returns the value string that can be parsed for this integer, as well
1182
    /// as the base that it should be parsed in
1183
2.54k
    pub fn val(&self) -> (&str, u32) {
1184
2.54k
        (&self.val, if self.hex { 
169
} else {
102.53k
})
1185
2.54k
    }
1186
}
1187
1188
2.39k
fn to_hex(c: char) -> u8 {
1189
2.39k
    match c {
1190
146
        'a'..='f' => c as u8 - b'a' + 10,
1191
0
        'A'..='F' => c as u8 - b'A' + 10,
1192
2.25k
        _ => c as u8 - b'0',
1193
    }
1194
2.39k
}
1195
1196
impl fmt::Display for LexError {
1197
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1198
        use LexError::*;
1199
0
        match self {
1200
0
            DanglingBlockComment => f.write_str("unterminated block comment")?,
1201
0
            Unexpected(c) => write!(f, "unexpected character '{}'", escape_char(*c))?,
1202
0
            InvalidStringElement(c) => {
1203
0
                write!(f, "invalid character in string '{}'", escape_char(*c))?
1204
            }
1205
0
            InvalidStringEscape(c) => write!(f, "invalid string escape '{}'", escape_char(*c))?,
1206
0
            InvalidHexDigit(c) => write!(f, "invalid hex digit '{}'", escape_char(*c))?,
1207
0
            InvalidDigit(c) => write!(f, "invalid decimal digit '{}'", escape_char(*c))?,
1208
0
            Expected { wanted, found } => write!(
1209
0
                f,
1210
0
                "expected '{}' but found '{}'",
1211
0
                escape_char(*wanted),
1212
0
                escape_char(*found)
1213
0
            )?,
1214
0
            UnexpectedEof => write!(f, "unexpected end-of-file")?,
1215
0
            NumberTooBig => f.write_str("number is too big to parse")?,
1216
0
            InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x{c:x}")?,
1217
0
            LoneUnderscore => write!(f, "bare underscore in numeric literal")?,
1218
0
            ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {c:?}")?,
1219
0
            InvalidUtf8Id(_) => write!(f, "malformed UTF-8 encoding of string-based id")?,
1220
0
            EmptyId => write!(f, "empty identifier")?,
1221
0
            EmptyAnnotation => write!(f, "empty annotation id")?,
1222
        }
1223
0
        Ok(())
1224
0
    }
1225
}
1226
1227
0
fn escape_char(c: char) -> String {
1228
0
    match c {
1229
0
        '\t' => String::from("\\t"),
1230
0
        '\r' => String::from("\\r"),
1231
0
        '\n' => String::from("\\n"),
1232
0
        '\\' => String::from("\\\\"),
1233
0
        '\'' => String::from("\\\'"),
1234
0
        '\"' => String::from("\""),
1235
0
        '\x20'..='\x7e' => String::from(c),
1236
0
        _ => c.escape_unicode().to_string(),
1237
    }
1238
0
}
1239
1240
/// This is an attempt to protect agains the "trojan source" [1] problem where
1241
/// unicode characters can cause editors to render source code differently
1242
/// for humans than the compiler itself sees.
1243
///
1244
/// To mitigate this issue, and because it's relatively rare in practice,
1245
/// this simply rejects characters of that form.
1246
///
1247
/// [1]: https://www.trojansource.codes/
1248
29.8k
fn is_confusing_unicode(ch: char) -> bool {
1249
29.8k
    matches!(
1250
29.8k
        ch,
1251
        '\u{202a}'
1252
            | '\u{202b}'
1253
            | '\u{202d}'
1254
            | '\u{202e}'
1255
            | '\u{2066}'
1256
            | '\u{2067}'
1257
            | '\u{2068}'
1258
            | '\u{206c}'
1259
            | '\u{2069}'
1260
    )
1261
29.8k
}
1262
1263
#[cfg(test)]
1264
mod tests {
1265
    use super::*;
1266
1267
    #[test]
1268
    fn ws_smoke() {
1269
        fn get_whitespace(input: &str) -> &str {
1270
            let token = get_token(input);
1271
            match token.kind {
1272
                TokenKind::Whitespace => token.src(input),
1273
                other => panic!("unexpected {other:?}"),
1274
            }
1275
        }
1276
        assert_eq!(get_whitespace(" "), " ");
1277
        assert_eq!(get_whitespace("  "), "  ");
1278
        assert_eq!(get_whitespace("  \n "), "  \n ");
1279
        assert_eq!(get_whitespace("  x"), "  ");
1280
        assert_eq!(get_whitespace("  ;"), "  ");
1281
    }
1282
1283
    #[test]
1284
    fn line_comment_smoke() {
1285
        fn get_line_comment(input: &str) -> &str {
1286
            let token = get_token(input);
1287
            match token.kind {
1288
                TokenKind::LineComment => token.src(input),
1289
                other => panic!("unexpected {other:?}"),
1290
            }
1291
        }
1292
        assert_eq!(get_line_comment(";;"), ";;");
1293
        assert_eq!(get_line_comment(";; xyz"), ";; xyz");
1294
        assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz");
1295
        assert_eq!(get_line_comment(";;\nabc"), ";;");
1296
        assert_eq!(get_line_comment(";;   \nabc"), ";;   ");
1297
        assert_eq!(get_line_comment(";;   \rabc"), ";;   ");
1298
        assert_eq!(get_line_comment(";;   \r\nabc"), ";;   ");
1299
    }
1300
1301
    #[test]
1302
    fn block_comment_smoke() {
1303
        fn get_block_comment(input: &str) -> &str {
1304
            let token = get_token(input);
1305
            match token.kind {
1306
                TokenKind::BlockComment => token.src(input),
1307
                other => panic!("unexpected {other:?}"),
1308
            }
1309
        }
1310
        assert_eq!(get_block_comment("(;;)"), "(;;)");
1311
        assert_eq!(get_block_comment("(; ;)"), "(; ;)");
1312
        assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)");
1313
    }
1314
1315
    fn get_token(input: &str) -> Token {
1316
        Lexer::new(input)
1317
            .parse(&mut 0)
1318
            .expect("no first token")
1319
            .expect("no token")
1320
    }
1321
1322
    #[test]
1323
    fn lparen() {
1324
        assert_eq!(get_token("((").kind, TokenKind::LParen);
1325
    }
1326
1327
    #[test]
1328
    fn rparen() {
1329
        assert_eq!(get_token(")(").kind, TokenKind::RParen);
1330
    }
1331
1332
    #[test]
1333
    fn strings() {
1334
        fn get_string(input: &str) -> Vec<u8> {
1335
            let token = get_token(input);
1336
            match token.kind {
1337
                TokenKind::String => token.string(input).to_vec(),
1338
                other => panic!("not keyword {other:?}"),
1339
            }
1340
        }
1341
        assert_eq!(&*get_string("\"\""), b"");
1342
        assert_eq!(&*get_string("\"a\""), b"a");
1343
        assert_eq!(&*get_string("\"a b c d\""), b"a b c d");
1344
        assert_eq!(&*get_string("\"\\\"\""), b"\"");
1345
        assert_eq!(&*get_string("\"\\'\""), b"'");
1346
        assert_eq!(&*get_string("\"\\n\""), b"\n");
1347
        assert_eq!(&*get_string("\"\\t\""), b"\t");
1348
        assert_eq!(&*get_string("\"\\r\""), b"\r");
1349
        assert_eq!(&*get_string("\"\\\\\""), b"\\");
1350
        assert_eq!(&*get_string("\"\\01\""), &[1]);
1351
        assert_eq!(&*get_string("\"\\u{1}\""), &[1]);
1352
        assert_eq!(
1353
            &*get_string("\"\\u{0f3}\""),
1354
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
1355
        );
1356
        assert_eq!(
1357
            &*get_string("\"\\u{0_f_3}\""),
1358
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
1359
        );
1360
1361
        for i in 0..=255i32 {
1362
            let s = format!("\"\\{i:02x}\"");
1363
            assert_eq!(&*get_string(&s), &[i as u8]);
1364
        }
1365
    }
1366
1367
    #[test]
1368
    fn id() {
1369
        fn get_id(input: &str) -> String {
1370
            let token = get_token(input);
1371
            match token.kind {
1372
                TokenKind::Id => token.id(input).unwrap().to_string(),
1373
                other => panic!("not id {other:?}"),
1374
            }
1375
        }
1376
        assert_eq!(get_id("$x"), "x");
1377
        assert_eq!(get_id("$xyz"), "xyz");
1378
        assert_eq!(get_id("$x_z"), "x_z");
1379
        assert_eq!(get_id("$0^"), "0^");
1380
        assert_eq!(get_id("$0^;;"), "0^");
1381
        assert_eq!(get_id("$0^ ;;"), "0^");
1382
        assert_eq!(get_id("$\"x\" ;;"), "x");
1383
    }
1384
1385
    #[test]
1386
    fn annotation() {
1387
        fn get_annotation(input: &str) -> String {
1388
            let token = get_token(input);
1389
            match token.kind {
1390
                TokenKind::Annotation => token.annotation(input).unwrap().to_string(),
1391
                other => panic!("not annotation {other:?}"),
1392
            }
1393
        }
1394
        assert_eq!(get_annotation("@foo"), "foo");
1395
        assert_eq!(get_annotation("@foo "), "foo");
1396
        assert_eq!(get_annotation("@f "), "f");
1397
        assert_eq!(get_annotation("@\"x\" "), "x");
1398
        assert_eq!(get_annotation("@0 "), "0");
1399
    }
1400
1401
    #[test]
1402
    fn keyword() {
1403
        fn get_keyword(input: &str) -> &str {
1404
            let token = get_token(input);
1405
            match token.kind {
1406
                TokenKind::Keyword => token.keyword(input),
1407
                other => panic!("not keyword {other:?}"),
1408
            }
1409
        }
1410
        assert_eq!(get_keyword("x"), "x");
1411
        assert_eq!(get_keyword("xyz"), "xyz");
1412
        assert_eq!(get_keyword("x_z"), "x_z");
1413
        assert_eq!(get_keyword("x_z "), "x_z");
1414
        assert_eq!(get_keyword("x_z "), "x_z");
1415
    }
1416
1417
    #[test]
1418
    fn reserved() {
1419
        fn get_reserved(input: &str) -> &str {
1420
            let token = get_token(input);
1421
            match token.kind {
1422
                TokenKind::Reserved => token.reserved(input),
1423
                other => panic!("not reserved {other:?}"),
1424
            }
1425
        }
1426
        assert_eq!(get_reserved("^_x "), "^_x");
1427
    }
1428
1429
    #[test]
1430
    fn integer() {
1431
        fn get_integer(input: &str) -> String {
1432
            let token = get_token(input);
1433
            match token.kind {
1434
                TokenKind::Integer(i) => token.integer(input, i).val.to_string(),
1435
                other => panic!("not integer {other:?}"),
1436
            }
1437
        }
1438
        assert_eq!(get_integer("1"), "1");
1439
        assert_eq!(get_integer("0"), "0");
1440
        assert_eq!(get_integer("-1"), "-1");
1441
        assert_eq!(get_integer("+1"), "1");
1442
        assert_eq!(get_integer("+1_000"), "1000");
1443
        assert_eq!(get_integer("+1_0_0_0"), "1000");
1444
        assert_eq!(get_integer("+0x10"), "10");
1445
        assert_eq!(get_integer("-0x10"), "-10");
1446
        assert_eq!(get_integer("0x10"), "10");
1447
    }
1448
1449
    #[test]
1450
    fn float() {
1451
        fn get_float(input: &str) -> Float<'_> {
1452
            let token = get_token(input);
1453
            match token.kind {
1454
                TokenKind::Float(f) => token.float(input, f),
1455
                other => panic!("not float {other:?}"),
1456
            }
1457
        }
1458
        assert_eq!(
1459
            get_float("nan"),
1460
            Float::Nan {
1461
                val: None,
1462
                negative: false
1463
            },
1464
        );
1465
        assert_eq!(
1466
            get_float("-nan"),
1467
            Float::Nan {
1468
                val: None,
1469
                negative: true,
1470
            },
1471
        );
1472
        assert_eq!(
1473
            get_float("+nan"),
1474
            Float::Nan {
1475
                val: None,
1476
                negative: false,
1477
            },
1478
        );
1479
        assert_eq!(
1480
            get_float("+nan:0x1"),
1481
            Float::Nan {
1482
                val: Some("1".into()),
1483
                negative: false,
1484
            },
1485
        );
1486
        assert_eq!(
1487
            get_float("nan:0x7f_ffff"),
1488
            Float::Nan {
1489
                val: Some("7fffff".into()),
1490
                negative: false,
1491
            },
1492
        );
1493
        assert_eq!(get_float("inf"), Float::Inf { negative: false });
1494
        assert_eq!(get_float("-inf"), Float::Inf { negative: true });
1495
        assert_eq!(get_float("+inf"), Float::Inf { negative: false });
1496
1497
        assert_eq!(
1498
            get_float("1.2"),
1499
            Float::Val {
1500
                integral: "1".into(),
1501
                fractional: Some("2".into()),
1502
                exponent: None,
1503
                hex: false,
1504
            },
1505
        );
1506
        assert_eq!(
1507
            get_float("1.2e3"),
1508
            Float::Val {
1509
                integral: "1".into(),
1510
                fractional: Some("2".into()),
1511
                exponent: Some("3".into()),
1512
                hex: false,
1513
            },
1514
        );
1515
        assert_eq!(
1516
            get_float("-1_2.1_1E+0_1"),
1517
            Float::Val {
1518
                integral: "-12".into(),
1519
                fractional: Some("11".into()),
1520
                exponent: Some("01".into()),
1521
                hex: false,
1522
            },
1523
        );
1524
        assert_eq!(
1525
            get_float("+1_2.1_1E-0_1"),
1526
            Float::Val {
1527
                integral: "12".into(),
1528
                fractional: Some("11".into()),
1529
                exponent: Some("-01".into()),
1530
                hex: false,
1531
            },
1532
        );
1533
        assert_eq!(
1534
            get_float("0x1_2.3_4p5_6"),
1535
            Float::Val {
1536
                integral: "12".into(),
1537
                fractional: Some("34".into()),
1538
                exponent: Some("56".into()),
1539
                hex: true,
1540
            },
1541
        );
1542
        assert_eq!(
1543
            get_float("+0x1_2.3_4P-5_6"),
1544
            Float::Val {
1545
                integral: "12".into(),
1546
                fractional: Some("34".into()),
1547
                exponent: Some("-56".into()),
1548
                hex: true,
1549
            },
1550
        );
1551
        assert_eq!(
1552
            get_float("1."),
1553
            Float::Val {
1554
                integral: "1".into(),
1555
                fractional: None,
1556
                exponent: None,
1557
                hex: false,
1558
            },
1559
        );
1560
        assert_eq!(
1561
            get_float("0x1p-24"),
1562
            Float::Val {
1563
                integral: "1".into(),
1564
                fractional: None,
1565
                exponent: Some("-24".into()),
1566
                hex: true,
1567
            },
1568
        );
1569
    }
1570
}