expr_parser/token/charset.rs

use std::{
    convert::Infallible,
    fmt,
    io::{self, BufRead},
    marker::PhantomData,
};

use bstr::ByteSlice;
use bytes::{Bytes, BytesMut};
use either::Either;
use unicode_xid::UnicodeXID;

use super::{Span, Token, Tokenizer};

/// A tokenizer that groups characters into tokens according to a [`CharSet`].
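///
/// `S` is the input [`Source`] (for example [`StrSource`] or [`BufReadSource`]) and `C` is the
/// [`CharSet`] that decides, character by character, whether the current token continues, ends,
/// or is invalid.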
pub struct CharSetTokenizer<S, C> {
    pub source: S,
    _marker: PhantomData<fn() -> C>,
}

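/// The decision a [`CharSet`] makes about the next character of a potential token.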
pub enum CharSetResult<T, E> {
    /// Accept the character in the token
    Continue,
    /// Reject the character and optionally produce a token
    Done(Option<T>),
    /// Reject the character and discard the current token, returning an error
    Err(E),
}

impl<T, E> CharSetResult<T, E> {
    pub fn map_into<U: From<T>, F: From<E>>(self) -> CharSetResult<U, F> {
        match self {
            CharSetResult::Continue => CharSetResult::Continue,
            CharSetResult::Done(Some(val)) => CharSetResult::Done(Some(val.into())),
            CharSetResult::Done(None) => CharSetResult::Done(None),
            CharSetResult::Err(err) => CharSetResult::Err(err.into()),
        }
    }
}

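/// A state machine that groups characters into tokens for a [`CharSetTokenizer`].
///
/// A fresh [`Default`] value is created for each new token; `next_char` is then fed one character
/// at a time until the token ends. An illustrative sketch (not part of this module) that collects
/// runs of ASCII letters into a single unit token kind and skips everything else:
///
/// ```ignore
/// #[derive(Clone, Copy, Default)]
/// enum LetterRun {
///     #[default]
///     Start,
///     Word,
///     Skip,
/// }
///
/// impl CharSet<char> for LetterRun {
///     type TokenKind = ();
///     type Error = std::convert::Infallible;
///
///     fn next_char(&mut self, c: char) -> CharSetResult<(), Self::Error> {
///         let is_letter = c.is_ascii_alphabetic();
///         match *self {
///             // The first character is always accepted; it only decides the category.
///             Self::Start => {
///                 *self = if is_letter { Self::Word } else { Self::Skip };
///                 CharSetResult::Continue
///             }
///             Self::Word if is_letter => CharSetResult::Continue,
///             Self::Word => CharSetResult::Done(Some(())),
///             Self::Skip if !is_letter => CharSetResult::Continue,
///             Self::Skip => CharSetResult::Done(None),
///         }
///     }
///
///     fn end_of_input(self) -> Result<Option<()>, Self::Error> {
///         Ok(matches!(self, Self::Word).then_some(()))
///     }
/// }
/// ```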
pub trait CharSet<C>: Default {
    type TokenKind;
    type Error;

    /// Categorize a character while continuing a potential token.
    fn next_char(&mut self, c: C) -> CharSetResult<Self::TokenKind, Self::Error>;

    /// What token kind (if any) to return if end of input is reached.
    fn end_of_input(self) -> Result<Option<Self::TokenKind>, Self::Error>;
}

type CharSetToken<S, C> = (
    <S as Source>::String,
    <C as CharSet<<S as Source>::Char>>::TokenKind,
);
type CharSetError<S, C> = Either<<S as Source>::Error, <C as CharSet<<S as Source>::Char>>::Error>;

impl<S: Source, C: CharSet<S::Char>> CharSetTokenizer<S, C> {
    pub fn new(source: S) -> Self {
        Self {
            source,
            _marker: PhantomData,
        }
    }

    /// Advances in the input as long as the character matches the character set.
    fn advance_while(&mut self) -> Result<Option<CharSetToken<S, C>>, CharSetError<S, C>> {
        let mut result = Ok(None);
        let mut state = C::default();
        let predicate = |c| match state.next_char(c) {
            CharSetResult::Continue => true,
            CharSetResult::Done(new_kind) => {
                result = Ok(new_kind);
                false
            }
            CharSetResult::Err(err) => {
                result = Err(err);
                false
            }
        };
        let token = self.source.advance_while(predicate).map_err(Either::Left)?;
        let kind = result.map_err(Either::Right)?;
        // If the source is exhausted, the whole remainder was consumed without ending the token,
        // so ask the character set what the trailing token (if any) should be.
        let kind = if self.source.is_empty() {
            state.end_of_input().map_err(Either::Right)?
        } else {
            kind
        };
        Ok(kind.map(|kind| (token, kind)))
    }
}

impl<S: Source, C: CharSet<S::Char>> Tokenizer for CharSetTokenizer<S, C> {
    type Token = CharSetToken<S, C>;
    type Position = usize;
    type Error = CharSetError<S, C>;

    fn next_token(&mut self) -> Option<Result<Token<Self::Token, Self::Position>, Self::Error>> {
        while !self.source.is_empty() {
            let start = self.source.next_index();
            match self.advance_while() {
                Ok(Some(token)) => {
                    let end = self.source.next_index();
                    return Some(Ok(Token {
                        span: Span { start, end },
                        kind: token,
                    }));
                }
                Ok(None) => continue,
                Err(err) => return Some(Err(err)),
            }
        }
        None
    }
}

impl<S: Source, C: CharSet<S::Char>> Iterator for CharSetTokenizer<S, C> {
    type Item = Result<Token<<Self as Tokenizer>::Token, usize>, <Self as Tokenizer>::Error>;

    fn next(&mut self) -> Option<Self::Item> {
        self.next_token()
    }
}

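/// A stream of characters that a [`CharSetTokenizer`] can draw tokens from.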
pub trait Source {
    type Char;
    type String;
    type Error;

    /// Returns the start index of the next character.
    fn next_index(&self) -> usize;

    /// Returns whether the source is known to be exhausted.
    fn is_empty(&self) -> bool;

    /// Advances in the input as long as the character matches the predicate, returning the
    /// matching string.
    fn advance_while(
        &mut self,
        predicate: impl FnMut(Self::Char) -> bool,
    ) -> Result<Self::String, Self::Error>;
}

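/// A [`Source`] reading from an in-memory string slice.
///
/// An illustrative sketch of how [`Source::advance_while`] behaves on this type (not a doc test;
/// trait imports elided):
///
/// ```ignore
/// let mut source = StrSource::new("abc def");
/// let word = source.advance_while(|c| c.is_ascii_alphabetic()).unwrap();
/// assert_eq!(word, "abc");
/// // The rejected character (the space) has not been consumed yet.
/// assert_eq!(source.next_index(), 3);
/// ```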
pub struct StrSource<'s> {
    /// The full source string
    source: &'s str,
    /// The remainder of the input which has not been tokenized yet
    remainder: &'s str,
}

impl<'s> StrSource<'s> {
    pub fn new(source: &'s str) -> Self {
        Self {
            source,
            remainder: source,
        }
    }
}

impl<'s> Source for StrSource<'s> {
    type Char = char;
    type String = &'s str;
    type Error = Infallible;

    fn next_index(&self) -> usize {
        self.source.len() - self.remainder.len()
    }

    fn is_empty(&self) -> bool {
        self.remainder.is_empty()
    }

    fn advance_while(
        &mut self,
        mut predicate: impl FnMut(char) -> bool,
    ) -> Result<&'s str, Infallible> {
        let start = self.next_index();
        let offset = self
            .remainder
            .char_indices()
            .skip_while(|(_, c)| predicate(*c))
            .map(|(idx, _)| idx)
            .next()
            .unwrap_or(self.remainder.len());
        self.remainder = &self.remainder[offset..];
        Ok(&self.source[start..start + offset])
    }
}

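/// A [`Source`] that reads UTF-8 text incrementally from a [`BufRead`] reader, buffering one
/// line at a time and yielding each token as [`Bytes`].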
pub struct BufReadSource<R> {
    reader: R,
    line_buffer: Vec<u8>,
    buffer: BytesMut,
    index: usize,
    is_empty: bool,
}

impl<R: BufRead> BufReadSource<R> {
    pub fn new(reader: R) -> Self {
        Self {
            reader,
            line_buffer: Vec::new(),
            buffer: BytesMut::new(),
            index: 0,
            is_empty: false,
        }
    }

    fn fill_buf(&mut self) -> io::Result<()> {
        // Only read more input once the current buffer has been fully consumed.
        if self.buffer.is_empty() {
            match self.reader.read_until(b'\n', &mut self.line_buffer) {
                Ok(0) => self.is_empty = true,
                Ok(_) => {}
                Err(error) => {
                    self.is_empty = true;
                    return Err(error);
                }
            }
            self.buffer.extend_from_slice(&self.line_buffer);
            self.line_buffer.clear();
        }
        Ok(())
    }
}

impl<R: BufRead> Source for BufReadSource<R> {
    type Char = char;
    type String = Bytes;
    type Error = io::Error;

    fn next_index(&self) -> usize {
        self.index
    }

    fn is_empty(&self) -> bool {
        self.is_empty
    }

    fn advance_while(
        &mut self,
        mut predicate: impl FnMut(char) -> bool,
    ) -> Result<Self::String, io::Error> {
        // Start from an empty split of the buffer so consumed chunks can be rejoined cheaply
        // with `unsplit`.
        let mut token = self.buffer.split_to(0);
        while !self.is_empty {
            self.fill_buf()?;
            let buffer = &self.buffer;
            let offset = buffer
                .char_indices()
                .skip_while(|(_, _, c)| predicate(*c))
                .map(|(idx, _, _)| idx)
                .next()
                .unwrap_or(buffer.len());
            self.index += offset;
            let s = self.buffer.split_to(offset);

            // The lossy char iteration above maps invalid bytes to U+FFFD, so re-check the
            // consumed bytes and report invalid UTF-8 as an I/O error.
            if let Err(e) = std::str::from_utf8(&s) {
                return Err(io::Error::new(io::ErrorKind::InvalidData, e));
            }
            token.unsplit(s);
            // A rejected character is still in the buffer, so the token is complete; otherwise
            // keep reading.
            if !self.buffer.is_empty() {
                break;
            }
        }
        Ok(token.freeze())
    }
}

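/// A [`CharSetTokenizer`] using the built-in [`SimpleCharSet`] categorization.
///
/// An illustrative usage sketch (not a doc test; imports elided; see the tests below for
/// runnable examples):
///
/// ```ignore
/// let tokens = SimpleTokenizer::new(StrSource::new("x <= 1.5"))
///     .map(|res| res.map(|tok| tok.kind))
///     .collect::<Result<Vec<_>, _>>()
///     .unwrap();
/// assert_eq!(tokens[0], ("x", SimpleCharSetTokenKind::Tag));
/// assert_eq!(tokens[1], ("<=", SimpleCharSetTokenKind::Tag));
/// assert_eq!(tokens[2], ("1.5", SimpleCharSetTokenKind::Number(NumberKind::Float)));
/// ```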
pub type SimpleTokenizer<S> = CharSetTokenizer<S, SimpleCharSet>;

#[derive(Clone, Copy, Default, Debug, Eq, PartialEq)]
pub enum SimpleCharSet {
    /// No characters have been matched for this token yet
    #[default]
    None,
    /// A number, which can be an integer or a floating-point number
    Number(NumberState),
    /// An identifier consists of an underscore or a character with the Unicode `XID_Start`
    /// property, followed by a sequence of characters with the Unicode `XID_Continue` property
    Identifier,
    /// A string delimited by double quotes (`"`). The boolean indicates whether an odd number of
    /// consecutive backslashes has just been matched (and thus the next `"` is escaped).
    String(bool),
    /// A character which forms a token on its own
    Singleton,
    /// Tokens starting with `<`, `=`, `>` can only contain other characters from that set.
    Comparison,
    /// `.` is part of a number if followed by a digit, or part of a punctuation tag otherwise.
    Dot,
    /// Whitespace isn't part of any token
    Whitespace,
    /// Any character not covered by the above categories
    Other,
    /// The next character will not be in this token
    BreakNext(Option<SimpleCharSetTokenKind>),
}

impl SimpleCharSet {
    fn categorize(ch: char) -> Self {
        match ch {
            '"' => Self::String(false),
            '.' => Self::Dot,
            ch if is_singleton_char(ch) => Self::Singleton,
            ch if is_comparison_char(ch) => Self::Comparison,
            ch if is_number_start_char(ch) => Self::Number(NumberState::Integer),
            ch if is_ident_start_char(ch) => Self::Identifier,
            ch if ch.is_whitespace() => Self::Whitespace,
            _ => Self::Other,
        }
    }
}

#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum NumberState {
    /// The first digit of the integer part has been matched
    #[default]
    Integer,
    /// The dot separating the integer and fractional parts has been matched
    Dot,
    /// The first digit of the fractional part has been matched
    Fractional,
}

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum NumberKind {
    Integer,
    Float,
}

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SimpleCharSetError {
    UnterminatedString,
}

impl std::error::Error for SimpleCharSetError {}

impl fmt::Display for SimpleCharSetError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Self::UnterminatedString => f.write_str("unterminated string"),
        }
    }
}

impl From<Infallible> for SimpleCharSetError {
    fn from(err: Infallible) -> Self {
        match err {}
    }
}

impl CharSet<char> for SimpleCharSet {
    type TokenKind = SimpleCharSetTokenKind;
    type Error = SimpleCharSetError;

    fn next_char(&mut self, ch: char) -> CharSetResult<Self::TokenKind, Self::Error> {
        match (*self, ch) {
            (Self::None, ch) => {
                *self = Self::categorize(ch);
                CharSetResult::Continue
            }
            (Self::Number(mut state), ch) => {
                let res = state.next_char(ch);
                *self = Self::Number(state);
                res.map_into()
            }
            (Self::Identifier, ch) if is_ident_char(ch) => CharSetResult::Continue,
            (Self::String(false), '"') => {
                *self = Self::BreakNext(Some(SimpleCharSetTokenKind::String));
                CharSetResult::Continue
            }
            (Self::String(escaped), '\\') => {
                *self = Self::String(!escaped);
                CharSetResult::Continue
            }
            (Self::String(_), _) => {
                *self = Self::String(false);
                CharSetResult::Continue
            }
            (Self::Comparison, ch) if is_comparison_char(ch) => CharSetResult::Continue,
            (Self::Dot, ch) if is_number_start_char(ch) => {
                *self = Self::Number(NumberState::Dot);
                CharSetResult::Continue
            }
            (Self::Dot, ch) if is_other_continuation_char(ch) => {
                *self = Self::Other;
                CharSetResult::Continue
            }
            (Self::Other, ch) if is_other_continuation_char(ch) => CharSetResult::Continue,
            (
                Self::Identifier | Self::Singleton | Self::Comparison | Self::Other | Self::Dot,
                _,
            ) => CharSetResult::Done(Some(SimpleCharSetTokenKind::Tag)),
            (Self::Whitespace, ch) if ch.is_whitespace() => CharSetResult::Continue,
            (Self::Whitespace, _) => CharSetResult::Done(None),
            (Self::BreakNext(kind), _) => CharSetResult::Done(kind),
        }
    }

    fn end_of_input(self) -> Result<Option<Self::TokenKind>, Self::Error> {
        match self {
            Self::None => Ok(None),
            Self::Number(state) => state
                .end_of_input()
                .map(|opt| opt.map(Into::into))
                .map_err(Into::into),
            Self::Identifier | Self::Singleton | Self::Comparison | Self::Dot | Self::Other => {
                Ok(Some(SimpleCharSetTokenKind::Tag))
            }
            Self::String(_) => Err(SimpleCharSetError::UnterminatedString),
            Self::BreakNext(kind) => Ok(kind),
            Self::Whitespace => Ok(None),
        }
    }
}

impl CharSet<char> for NumberState {
    type TokenKind = NumberKind;
    type Error = Infallible;

    fn next_char(&mut self, ch: char) -> CharSetResult<Self::TokenKind, Self::Error> {
        match (*self, ch) {
            (Self::Integer, '.') => {
                *self = Self::Dot;
                CharSetResult::Continue
            }
            (Self::Dot, ch) if is_number_start_char(ch) => {
                *self = Self::Fractional;
                CharSetResult::Continue
            }
            (Self::Integer | Self::Fractional, ch) if is_number_char(ch) => CharSetResult::Continue,
            (Self::Integer, _) => CharSetResult::Done(Some(NumberKind::Integer)),
            (Self::Dot | Self::Fractional, _) => CharSetResult::Done(Some(NumberKind::Float)),
        }
    }

    fn end_of_input(self) -> Result<Option<Self::TokenKind>, Self::Error> {
        match self {
            Self::Integer => Ok(Some(NumberKind::Integer)),
            Self::Dot | Self::Fractional => Ok(Some(NumberKind::Float)),
        }
    }
}

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SimpleCharSetTokenKind {
    Tag,
    Number(NumberKind),
    String,
}

impl From<NumberKind> for SimpleCharSetTokenKind {
    fn from(number: NumberKind) -> Self {
        Self::Number(number)
    }
}

#[inline]
fn is_number_start_char(ch: char) -> bool {
    ch.is_ascii_digit()
}

#[inline]
fn is_number_char(ch: char) -> bool {
    ch == '_' || ch.is_ascii_digit()
}

#[inline]
fn is_ident_start_char(ch: char) -> bool {
    ch == '_' || UnicodeXID::is_xid_start(ch)
}

#[inline]
fn is_ident_char(ch: char) -> bool {
    UnicodeXID::is_xid_continue(ch)
}

#[inline]
fn is_comparison_char(ch: char) -> bool {
    ch == '<' || ch == '=' || ch == '>'
}

#[inline]
fn is_singleton_char(ch: char) -> bool {
    ch == '(' || ch == ')'
}

#[inline]
fn is_other_continuation_char(ch: char) -> bool {
    matches!(
        SimpleCharSet::categorize(ch),
        SimpleCharSet::Comparison | SimpleCharSet::Dot | SimpleCharSet::Other
    )
}

#[cfg(test)]
mod tests {
    use bytes::Bytes;
    use test_case::test_case;

    use super::{BufReadSource, NumberKind, SimpleCharSetTokenKind, SimpleTokenizer, StrSource};

    #[test_case("abc", SimpleCharSetTokenKind::Tag, "abc" ; "tag abc")]
    #[test_case("a\u{0300}bc", SimpleCharSetTokenKind::Tag, "a\u{0300}bc" ; "tag with combining char")]
    #[test_case("_0", SimpleCharSetTokenKind::Tag, "_0" ; "tag _0")]
    #[test_case("abc+", SimpleCharSetTokenKind::Tag, "abc" ; "tag followed by other char")]
    #[test_case("   \n\t\rabc", SimpleCharSetTokenKind::Tag, "abc" ; "leading whitespace")]
    #[test_case("<>", SimpleCharSetTokenKind::Tag, "<>" ; "comparison tag")]
    #[test_case("=-", SimpleCharSetTokenKind::Tag, "=" ; "comparison followed by other char")]
    #[test_case("-=", SimpleCharSetTokenKind::Tag, "-=" ; "other char followed by comparison")]
    #[test_case("..", SimpleCharSetTokenKind::Tag, ".." ; "tag starting with dot")]
    #[test_case("..123", SimpleCharSetTokenKind::Tag, ".." ; "tag starting with dot followed by number")]
    #[test_case("123", SimpleCharSetTokenKind::Number(NumberKind::Integer), "123" ; "integer")]
    #[test_case("1_234", SimpleCharSetTokenKind::Number(NumberKind::Integer), "1_234" ; "integer with underscore")]
    #[test_case("1.234", SimpleCharSetTokenKind::Number(NumberKind::Float), "1.234" ; "simple float")]
    #[test_case(".234", SimpleCharSetTokenKind::Number(NumberKind::Float), ".234" ; "float with no integer")]
    #[test_case("1.", SimpleCharSetTokenKind::Number(NumberKind::Float), "1." ; "integer followed by dot")]
    #[test_case("1.234.5", SimpleCharSetTokenKind::Number(NumberKind::Float), "1.234" ; "float with extra dot")]
    #[test_case(".234.5", SimpleCharSetTokenKind::Number(NumberKind::Float), ".234" ; "float with no integer and extra dot")]
    #[test_case(r#""abc\"\\\"\\""#, SimpleCharSetTokenKind::String, r#""abc\"\\\"\\""# ; "string")]
    #[test_case("(((", SimpleCharSetTokenKind::Tag, "(" ; "singleton")]
    fn lex_one(source: &str, kind: SimpleCharSetTokenKind, as_str: &str) {
        let actual = SimpleTokenizer::new(StrSource::new(source))
            .next()
            .unwrap()
            .unwrap();
        assert_eq!(actual.kind, (as_str, kind));
    }

    #[test]
    fn unterminated_string() {
        let source = r#""abc\"\\\"abc"#;
        SimpleTokenizer::new(StrSource::new(source))
            .next()
            .unwrap()
            .unwrap_err();
    }

    #[test_case("abc<>+1", "abc <> + 1" ; "some tokens")]
    #[test_case("<->", "< ->" ; "arrows")]
    fn tokenize_many(source: &str, tokens: &str) {
        let actual_str = SimpleTokenizer::new(StrSource::new(source))
            .map(|res| res.map(|tok| tok.kind.0))
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        let expected = tokens.split_whitespace().collect::<Vec<_>>();
        assert_eq!(actual_str, expected);
        let actual_br = SimpleTokenizer::new(BufReadSource::new(source.as_bytes()))
            .map(|res| res.map(|tok| tok.kind.0))
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        assert_eq!(actual_br, expected);
    }

    #[test]
    fn invalid_utf8() {
        let source: &[u8] = b"abc\x80\x81def";
        let tokens = SimpleTokenizer::new(BufReadSource::new(source))
            .map(|res| res.map(|tok| tok.kind.0).map_err(|_| ()))
            .collect::<Vec<Result<_, _>>>();
        assert_eq!(
            tokens,
            &[
                Ok(Bytes::from_static("abc".as_bytes())),
                Err(()),
                Ok(Bytes::from_static("def".as_bytes()))
            ]
        );
    }
}