From 690dfad4326293aef24c39db4be9abc19c8fb46e Mon Sep 17 00:00:00 2001
From: Mel
Date: Sun, 17 Oct 2021 23:01:03 +0200
Subject: Lexing

---
Review note: relative to the original commit, this revision fixes column
tracking in `Lexer::advance`, emits `Eof` after trailing whitespace, limits
number literals to ASCII digits with a single `.`, repairs two grammar rules
and documents the lexer types. The blob ids on the `index` lines of the
modified files are those of the original commit.

 .gitignore       |   1 +
 Cargo.lock       |   7 +++
 Cargo.toml       |   8 +++
 grammar.ebnf     |  16 ++++++
 src/lex/lexer.rs | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/lex/mod.rs   |   2 +
 src/lex/token.rs |  36 +++++++++++++
 src/main.rs      |  12 +++++
 8 files changed, 233 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.lock
 create mode 100644 Cargo.toml
 create mode 100644 grammar.ebnf
 create mode 100644 src/lex/lexer.rs
 create mode 100644 src/lex/mod.rs
 create mode 100644 src/lex/token.rs
 create mode 100644 src/main.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..41466aa
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,7 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "rabbithole"
+version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..8679632
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "rabbithole"
+version = "0.1.0"
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
diff --git a/grammar.ebnf b/grammar.ebnf
new file mode 100644
index 0000000..1805abb
--- /dev/null
+++ b/grammar.ebnf
@@ -0,0 +1,16 @@
+(* Grammar definition in EBNF format. *)
+
+Expression = TermExpression;
+
+TermExpression = FactorExpression { ("+" | "-") FactorExpression };
+
+FactorExpression = UnaryExpression { ("*" | "/") UnaryExpression };
+
+UnaryExpression = ( "-" | "!" ) UnaryExpression | UnitExpression;
+
+UnitExpression = NaturalDigit {Digit} | "(" Expression ")";
+
+(* Basics *)
+
+NaturalDigit = "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9";
+Digit = "0" | NaturalDigit ;
\ No newline at end of file
diff --git a/src/lex/lexer.rs b/src/lex/lexer.rs
new file mode 100644
index 0000000..e2ac0f4
--- /dev/null
+++ b/src/lex/lexer.rs
@@ -0,0 +1,151 @@
+use std::{iter::Peekable, str::Chars};
+
+use super::token::{Location, Token, TokenVariant};
+
+/// Streaming lexer that turns source text into [`Token`]s.
+///
+/// Iteration yields the tokens in source order followed by a single
+/// [`TokenVariant::Eof`]; after that the iterator returns `None`.
+pub struct Lexer<'source> {
+    /// Position of the next unread character.
+    location: Location,
+    /// Remaining source characters, with one character of lookahead.
+    chars: Peekable<Chars<'source>>,
+    /// Set once the `Eof` token has been emitted.
+    done: bool,
+}
+
+impl Iterator for Lexer<'_> {
+    type Item = Token;
+
+    /// Lexes the next token.
+    ///
+    /// Whitespace other than a line feed is skipped between tokens. A line
+    /// feed, like any other unrecognised character, is reported as
+    /// [`TokenVariant::Unknown`].
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.done {
+            return None;
+        }
+
+        // Skip blanks *before* testing for end of input: otherwise a source
+        // ending in whitespace stops the iteration without emitting `Eof`.
+        self.skip_whitespace();
+
+        let c = match self.chars.peek() {
+            Some(&c) => c,
+            None => {
+                self.done = true;
+                return Some(Token {
+                    location: self.location,
+                    variant: TokenVariant::Eof,
+                });
+            }
+        };
+
+        let token = match c {
+            // Only ASCII digits start a literal that `str::parse` accepts;
+            // other Unicode numerics fall through to `Unknown`.
+            '0'..='9' => self.number(),
+            '+' => self.char_token(TokenVariant::OpPlus),
+            '-' => self.char_token(TokenVariant::OpMinus),
+            '*' => self.char_token(TokenVariant::OpStar),
+            '/' => self.char_token(TokenVariant::OpSlash),
+            '!' => self.char_token(TokenVariant::OpNot),
+            '(' => self.char_token(TokenVariant::GroupOpen),
+            ')' => self.char_token(TokenVariant::GroupClose),
+            _ => self.char_token(TokenVariant::Unknown(c)),
+        };
+
+        Some(token)
+    }
+}
+
+impl<'s> Lexer<'s> {
+    /// Creates a lexer positioned at row 0, column 0 of `source`.
+    pub fn new(source: &'s str) -> Self {
+        Lexer {
+            location: Location { col: 0, row: 0 },
+            chars: source.chars().peekable(),
+            done: false,
+        }
+    }
+
+    /// Consumes one character and moves `location` past it.
+    ///
+    /// Returns the consumed character, or `None` at end of input.
+    fn advance(&mut self) -> Option<char> {
+        let next = self.chars.next();
+        if let Some(c) = next {
+            if c == '\n' {
+                self.location.row += 1;
+                self.location.col = 0;
+            } else {
+                // Any other character stays on the row and moves one column
+                // right (this branch previously incremented `row`).
+                self.location.col += 1;
+            }
+        }
+
+        next
+    }
+
+    /// Consumes whitespace up to the next token, stopping at a line feed.
+    fn skip_whitespace(&mut self) {
+        while self
+            .chars
+            .peek()
+            .map_or(false, |x| x.is_whitespace() && *x != '\n')
+        {
+            self.advance();
+        }
+    }
+
+    /// Builds a one-character token at the current location and consumes the
+    /// character.
+    fn char_token(&mut self, variant: TokenVariant) -> Token {
+        let token = Token {
+            location: self.location,
+            variant,
+        };
+        self.advance();
+        token
+    }
+
+    /// Lexes an integer or float literal starting at the current character.
+    ///
+    /// Consumes ASCII digits and at most one `.`; a second `.` ends the
+    /// literal so that input such as `1.2.3` cannot reach the float parser.
+    ///
+    /// # Panics
+    ///
+    /// Panics if an integer literal does not fit in a `u32`.
+    fn number(&mut self) -> Token {
+        let location = self.location;
+
+        let mut is_integer = true;
+        let mut buffer = String::new();
+
+        while let Some(&c) = self.chars.peek() {
+            let first_dot = c == '.' && is_integer;
+            if !c.is_ascii_digit() && !first_dot {
+                break;
+            }
+            if first_dot {
+                is_integer = false;
+            }
+            buffer.push(c);
+            self.advance();
+        }
+
+        let variant = if is_integer {
+            let int = buffer.parse().expect("Failed lexing integer token.");
+            TokenVariant::Int(int)
+        } else {
+            let float = buffer.parse().expect("Failed lexing float token.");
+            TokenVariant::Float(float)
+        };
+
+        Token { location, variant }
+    }
+}
diff --git a/src/lex/mod.rs b/src/lex/mod.rs
new file mode 100644
index 0000000..f785280
--- /dev/null
+++ b/src/lex/mod.rs
@@ -0,0 +1,2 @@
+pub mod lexer;
+pub mod token;
\ No newline at end of file
diff --git a/src/lex/token.rs b/src/lex/token.rs
new file mode 100644
index 0000000..a43cf0f
--- /dev/null
+++ b/src/lex/token.rs
@@ -0,0 +1,36 @@
+/// Zero-based position of a character in the source text.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct Location {
+    /// Column within the row; reset to 0 after every line feed.
+    pub col: usize,
+    /// Row (line) index; incremented at every line feed.
+    pub row: usize,
+}
+
+/// A lexed token together with the location of its first character.
+#[derive(Clone, Debug, PartialEq)]
+pub struct Token {
+    pub location: Location,
+    pub variant: TokenVariant,
+}
+
+/// The kinds of token produced by the lexer.
+#[derive(Clone, Debug, PartialEq)]
+pub enum TokenVariant {
+    OpPlus,
+    OpMinus,
+    OpStar,
+    OpSlash,
+    OpNot,
+
+    GroupOpen,
+    GroupClose,
+
+    Int(u32),
+    Float(f32),
+
+    /// A character the lexer does not recognise.
+    Unknown(char),
+    /// End of input; always the last token.
+    Eof,
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..3b23d8e
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,12 @@
+mod lex;
+
+use lex::lexer::Lexer;
+
+fn main() {
+    let source = "1 + 2";
+    let lexer = Lexer::new(source);
+
+    for token in lexer {
+        println!("{:?}", token);
+    }
+}
-- 
cgit 1.4.1