anillo/lexer.rs

//! Basic type definitions and implementations for the lexing of an Anillo
//! source file.

use std::{collections::VecDeque, error::Error, io::Read};

use crate::{
    error::CompilationError,
    token::{Ring, Token, TokenInfo},
};

/// The Anillo Lexer
///
/// The Anillo Lexer is purposefully simple. Lexemes (or Tokens here) are only
/// delimited by whitespace and "Special Characters" (these are defined in the
/// technical specification, but generally punctuation like brackets, or special
/// semantic tokens like '$', count as "Special Characters").
pub struct Lexer {
    file: std::io::BufReader<std::fs::File>,
}
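
// For example (a sketch; the exact token spellings live in `token.rs`), the
// input `extern Func()` lexes to:
//
//     [KeywordExtern, Identifier("Func"), LeftParen, RightParen]
//
// The space after `extern` and each bracket terminate the lexeme before them.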

impl Lexer {
    pub fn new(path: &std::path::Path) -> std::io::Result<Lexer> {
        let f = std::fs::File::open(path)?;
        Ok(Lexer {
            file: std::io::BufReader::new(f),
        })
    }

    /// Eagerly lex the ModuleSource and return a buffer of `TokenInfo`s to the
    /// caller.
    ///
    /// Lexing here is done as simply as possible (and the language was designed
    /// with this in mind). There should be absolutely no lookahead or peeking
    /// done at this level. Compilation errors at this level are supported, but
    /// rare (only possible when an unrecognized character is encountered).
    pub fn tokenize(&mut self) -> Result<VecDeque<TokenInfo>, Box<dyn Error>> {
        let mut tokens: VecDeque<TokenInfo> = VecDeque::new();
        let mut buf: [u8; 1] = [0; 1];
        let mut str_token_buf: Vec<char> = Vec::new();

        // `col` is incremented before each character is handled, so it starts
        // at 0 to match the reset to 0 at every newline below.
        let mut line: u32 = 1;
        let mut col: u32 = 0;

        // Start position of the token currently accumulating in
        // `str_token_buf`, recorded when its first character arrives.
        let mut tok_line: u32 = 1;
        let mut tok_col: u32 = 1;

        // Many conditions can trigger a "drain" of our current raw char-form
        // token buffer. This macro wraps the repetitive code into a single
        // call; the drained token is stamped with the position it began at.
        macro_rules! drain_if_needed {
            () => {
                if !str_token_buf.is_empty() {
                    let remaining: String = str_token_buf.iter().collect();
                    str_token_buf.clear();
                    let tok: Token = Self::parse_str(&remaining);
                    tokens.push_back(TokenInfo::new(tok, tok_line, tok_col));
                }
            };
        }
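
        // E.g., for the input `foo(`: reading `(` drains the buffered `foo`
        // as Identifier("foo") before LeftParen is pushed, so tokens come out
        // in source order.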

        while self.file.read_exact(&mut buf).is_ok() {
            let letter = buf[0] as char;
            match letter {
                '(' | '{' | ')' | '}' => {
                    col += 1;
                    drain_if_needed!();
                    let tok: Token = match letter {
                        '(' => Token::LeftParen,
                        '{' => Token::LeftBracket,
                        ')' => Token::RightParen,
                        '}' => Token::RightBracket,
                        _ => unreachable!(),
                    };

                    tokens.push_back(TokenInfo::new(tok, line, col));
                }
                ',' => {
                    col += 1;
                    drain_if_needed!();
                    tokens.push_back(TokenInfo::new(Token::Comma, line, col));
                }
                '$' => {
                    col += 1;
                    drain_if_needed!();
                    tokens.push_back(TokenInfo::new(Token::Dollar, line, col));
                }
                white if white.is_ascii_whitespace() => {
                    if white == '\n' {
                        line += 1;
                        col = 0;
                    } else {
                        col += 1;
                    }
                    drain_if_needed!();
                }
                asc if asc.is_ascii_alphanumeric() => {
                    col += 1;
                    if str_token_buf.is_empty() {
                        // First character of a new lexeme: remember where it
                        // begins so the drained token carries this position.
                        tok_line = line;
                        tok_col = col;
                    }
                    str_token_buf.push(letter);
                }
                other => {
                    col += 1;
                    return Err(Box::new(CompilationError::new(
                        line,
                        col,
                        format!("Unrecognized character type: {}", other),
                    )));
                }
            }
        }

        // If the file ends without a trailing delimiter, the final token is
        // still sitting in the buffer; flush it before returning.
        drain_if_needed!();

        Ok(tokens)
    }
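
    // A minimal usage sketch (the path here is hypothetical, and `TokenInfo`
    // is assumed to implement `Debug`):
    //
    //     let mut lexer = Lexer::new(std::path::Path::new("main.an"))?;
    //     for tok in lexer.tokenize()? {
    //         println!("{:?}", tok);
    //     }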

    /// Helper function for `tokenize`.
    ///
    /// Essentially anything that isn't a Special Character is lexed here.
    /// Where possible, we store the lexeme in a rich token type (like
    /// keywords). Everything else, however, is parsed as an identifier (even
    /// numbers); the parser converts these to their expected types at AST
    /// generation time in `parser.rs`.
    fn parse_str(tok: &str) -> Token {
        match tok {
            "extern" => Token::KeywordExtern,
            "WithLevel" => Token::KeywordWithLevel,
            "User" => Token::KeywordPrivilege(Ring::User),
            "Super" => Token::KeywordPrivilege(Ring::Super),
            "isr" => Token::KeywordIsr,
            "call" => Token::KeywordCall,
            // Converting a `&str` to a `String` is infallible, so there is no
            // error case to handle here (the old `String::from_str(...)
            // .expect(...)` could never fail).
            id => Token::Identifier(id.to_string()),
        }
    }
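
    // E.g., parse_str("call") yields KeywordCall, while parse_str("42")
    // yields Identifier("42"): numeric literals are deliberately deferred to
    // the parser.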
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn valid_tokens() {
        const EXAMPLE: &str = "extern Func()    ";
        let path = std::env::temp_dir().join("anillo_lexer_valid.an");
        std::fs::write(&path, EXAMPLE).expect("failed to write fixture");
        let tokens = Lexer::new(&path).unwrap().tokenize().unwrap();
        // `extern`, `Func`, `(`, `)`; the trailing whitespace yields nothing.
        assert_eq!(tokens.len(), 4);
    }

    #[test]
    fn invalid_token() {
        const EXAMPLE: &str = "extern Func() #$%^";
        let path = std::env::temp_dir().join("anillo_lexer_invalid.an");
        std::fs::write(&path, EXAMPLE).expect("failed to write fixture");
        // `#` is neither alphanumeric nor a Special Character, so the lexer
        // must report a CompilationError.
        assert!(Lexer::new(&path).unwrap().tokenize().is_err());
    }
}