anillo/lexer.rs
1//! Basic type definitions and implementations for the lexing of an Anillo
2//! source file.
3
4use std::{collections::VecDeque, error::Error, io::Read, str::FromStr};
5
6use crate::{
7 error::CompilationError,
8 token::{Ring, Token, TokenInfo},
9};
10
/// The Anillo Lexer
///
/// The Anillo Lexer is purposefully simple. Lexemes (or Tokens here) are only
/// delimited by whitespace and "Special Characters" (these are defined in the
/// technical specification, but generally punctuation like brackets, or special
/// semantic tokens like '$' count as "Special Characters")
pub struct Lexer {
    // Buffered reader over the source file; `tokenize` consumes it one byte
    // at a time.
    file: std::io::BufReader<std::fs::File>,
}
20
21impl Lexer {
22 pub fn new(path: &std::path::Path) -> std::io::Result<Lexer> {
23 let f = std::fs::File::open(path)?;
24 Ok(Lexer {
25 file: std::io::BufReader::new(f),
26 })
27 }
28
29 /// Eagerly lex the ModuleSource and return a buffer of `TokenInfo`s to the
30 /// caller.
31 ///
32 /// Lexing here is done as simply as possible (and the language was designed
33 /// with this in mind). There should be absolutely no look ahead peeking
34 /// done at this level. Compilation errors at this level are supported, but
35 /// rare (only really possible if an invalid unicode character is found).
36 pub fn tokenize(&mut self) -> Result<VecDeque<TokenInfo>, Box<dyn Error>> {
37 let mut tokens: VecDeque<TokenInfo> = VecDeque::new();
38 let mut buf: [u8; 1] = [0; 1];
39 let mut str_token_buf: Vec<char> = Vec::new();
40
41 let mut line: u32 = 1;
42 let mut col: u32 = 1;
43
44 // Many conditions can trigger a "drain" of our current raw char-form
45 // token buffer. This is just a macro that wraps up the repetitive
46 // code into a single call.
47 macro_rules! drain_if_needed {
48 () => {
49 if !str_token_buf.is_empty() {
50 let remaining: String = str_token_buf.iter().collect();
51 str_token_buf.clear();
52 let tok: Token = Self::parse_str(&remaining);
53 tokens.push_back(TokenInfo::new(tok, line, col));
54 }
55 };
56 }
57
58 while self.file.read_exact(&mut buf).is_ok() {
59 let letter = buf[0] as char;
60 match letter {
61 '(' | '{' | ')' | '}' => {
62 col += 1;
63 drain_if_needed!();
64 let tok: Token = match letter {
65 '(' => Token::LeftParen,
66 '{' => Token::LeftBracket,
67 ')' => Token::RightParen,
68 '}' => Token::RightBracket,
69 _ => unreachable!(),
70 };
71
72 tokens.push_back(TokenInfo::new(tok, line, col));
73 }
74 ',' => {
75 col += 1;
76 drain_if_needed!();
77 tokens.push_back(TokenInfo::new(Token::Comma, line, col));
78 }
79 '$' => {
80 col += 1;
81 drain_if_needed!();
82 tokens.push_back(TokenInfo::new(Token::Dollar, line, col));
83 }
84 white if white.is_ascii_whitespace() => {
85 if white == '\n' {
86 line += 1;
87 col = 0;
88 } else {
89 col += 1;
90 }
91 drain_if_needed!();
92 }
93 asc if asc.is_ascii_alphanumeric() => {
94 col += 1;
95 str_token_buf.push(letter);
96 }
97 other => {
98 col += 1;
99 return Err(Box::new(CompilationError::new(
100 line,
101 col,
102 format!("Unrecognized character type: {}", other),
103 )));
104 }
105 }
106 }
107
108 Ok(tokens)
109 }
110
111 /// Helper function for `tokenize`.
112 ///
113 /// Essentially anything that isn't a Special Character will be lexed here.
114 /// Where possible, we will store the lexeme in a rich token type (Like
115 /// keywords). Everything else however will be parsed as identifiers (even
116 /// numbers). The parser handles converting these to their expected types at
117 /// AST generation time in `parser.rs`
118 fn parse_str(tok: &str) -> Token {
119 match tok {
120 "extern" => Token::KeywordExtern,
121 "WithLevel" => Token::KeywordWithLevel,
122 "User" => Token::KeywordPrivilege(Ring::User),
123 "Super" => Token::KeywordPrivilege(Ring::Super),
124 "isr" => Token::KeywordIsr,
125 "call" => Token::KeywordCall,
126 id => Token::Identifier(String::from_str(id).expect("Invalid str for identifier")),
127 }
128 }
129}
130
#[cfg(test)]
mod tests {
    use super::Lexer;
    use std::io::Write;

    /// Write `contents` to a file named `name` in the system temp directory
    /// and return its path, so the path-based `Lexer::new` can be exercised.
    fn write_fixture(name: &str, contents: &str) -> std::path::PathBuf {
        let path = std::env::temp_dir().join(name);
        let mut f = std::fs::File::create(&path).expect("failed to create fixture file");
        f.write_all(contents.as_bytes())
            .expect("failed to write fixture file");
        path
    }

    #[test]
    fn valid_tokens() {
        // Expect: KeywordExtern, Identifier("Func"), LeftParen, RightParen.
        const EXAMPLE: &str = "extern Func() ";
        let path = write_fixture("anillo_lexer_valid_tokens.an", EXAMPLE);
        let mut lexer = Lexer::new(&path).expect("failed to open fixture");
        let tokens = lexer
            .tokenize()
            .expect("lexing valid input should succeed");
        assert_eq!(tokens.len(), 4);
    }

    #[test]
    fn invalid_token() {
        // '#' is neither a Special Character nor ascii-alphanumeric, so the
        // lexer must report a compilation error.
        const EXAMPLE: &str = "extern Func() #$%^";
        let path = write_fixture("anillo_lexer_invalid_token.an", EXAMPLE);
        let mut lexer = Lexer::new(&path).expect("failed to open fixture");
        assert!(lexer.tokenize().is_err());
    }
}
143}