diff options
| author | Mel <mel@rnrd.eu> | 2025-03-14 22:48:06 +0100 |
|---|---|---|
| committer | Mel <mel@rnrd.eu> | 2025-03-14 22:48:06 +0100 |
| commit | 0a3bcb4ab7c5884e9ad6bb47b05c9a8231395c93 (patch) | |
| tree | 1a40d808d1a877f32d101c9913727de3090d5e48 /boot/lex.c | |
| parent | 742cc47bfbe67806f80bf4e821476b5efff9a9f6 (diff) | |
| download | catskill-0a3bcb4ab7c5884e9ad6bb47b05c9a8231395c93.tar.zst catskill-0a3bcb4ab7c5884e9ad6bb47b05c9a8231395c93.zip | |
Bootstrapping lexer
Signed-off-by: Mel <mel@rnrd.eu>
Diffstat (limited to 'boot/lex.c')
| -rw-r--r-- | boot/lex.c | 599 |
1 files changed, 599 insertions, 0 deletions
diff --git a/boot/lex.c b/boot/lex.c new file mode 100644 index 0000000..040dd9a --- /dev/null +++ b/boot/lex.c @@ -0,0 +1,599 @@ +/* + * simple advancing lexer + * invoked until completion by parser. + */ + +#define MAX_CHAR_BUFFER_SIZE 256 + +// byte position within a file. +typedef uint64 Pos; + +// an extent of text within a file, +// defined by its start and end byte positions. +struct Span +{ + Pos start; + Pos end; +}; + +struct Span +span_new(Pos start, Pos end) +{ + return (struct Span){ .start = start, .end = end }; +} + +struct Span +span_width(Pos start, uint width) +{ + return (struct Span){ .start = start, .end = start + width }; +} + +// a cursor position placed within a text file. +struct Cursor +{ + uint line; + uint column; +}; + +// what kind of token is it? +enum Token_Kind +{ + TOKEN_NONE, + TOKEN_SOMETHING, + TOKEN_END_OF_FILE, + TOKEN_NEWLINE, + + TOKEN_NAME, + TOKEN_LITERAL_INTEGER, + TOKEN_LITERAL_FLOAT, + TOKEN_LITERAL_STRING, + + TOKEN_WORD_FUN, + TOKEN_WORD_IF, + TOKEN_WORD_ELSE, + TOKEN_WORD_FOR, + TOKEN_WORD_LOOP, + TOKEN_WORD_BREAK, + TOKEN_WORD_CONTINUE, + TOKEN_WORD_DEFER, + TOKEN_WORD_SWITCH, + TOKEN_WORD_RETURN, + TOKEN_WORD_VAR, + TOKEN_WORD_TYPE, + + TOKEN_ROUND_OPEN, + TOKEN_ROUND_CLOSE, + TOKEN_CURLY_OPEN, + TOKEN_CURLY_CLOSE, + TOKEN_SQUARE_OPEN, + TOKEN_SQUARE_CLOSE, + + TOKEN_COMMA, + TOKEN_ASSIGN, + TOKEN_AMPERSAND, + TOKEN_DOT, + TOKEN_PIPE, + + TOKEN_PLUS, + TOKEN_MINUS, + TOKEN_STAR, + TOKEN_SLASH, + + TOKEN_AND, + TOKEN_OR, + TOKEN_NOT, + TOKEN_EQUAL, + TOKEN_NOT_EQUAL, + TOKEN_LESS, + TOKEN_LESS_EQUAL, + TOKEN_GREATER, + TOKEN_GREATER_EQUAL, +}; + +const ascii* +token_kind_to_string(enum Token_Kind kind) +{ + switch (kind) { + case TOKEN_NONE: + return "NONE"; + case TOKEN_SOMETHING: + return "SOMETHING"; + case TOKEN_END_OF_FILE: + return "END_OF_FILE"; + case TOKEN_NEWLINE: + return "NEWLINE"; + + case TOKEN_NAME: + return "NAME"; + case TOKEN_LITERAL_INTEGER: + return "LITERAL_INTEGER"; + case TOKEN_LITERAL_FLOAT: + return "LITERAL_FLOAT"; + case TOKEN_LITERAL_STRING: + return "LITERAL_STRING"; + + case TOKEN_WORD_FUN: + return "WORD_FUN"; + case TOKEN_WORD_IF: + return "WORD_IF"; + case TOKEN_WORD_ELSE: + return "WORD_ELSE"; + case TOKEN_WORD_FOR: + return "WORD_FOR"; + case TOKEN_WORD_LOOP: + return "WORD_LOOP"; + case TOKEN_WORD_BREAK: + return "WORD_BREAK"; + case TOKEN_WORD_CONTINUE: + return "WORD_CONTINUE"; + case TOKEN_WORD_DEFER: + return "WORD_DEFER"; + case TOKEN_WORD_SWITCH: + return "WORD_SWITCH"; + case TOKEN_WORD_RETURN: + return "WORD_RETURN"; + case TOKEN_WORD_VAR: + return "WORD_VAR"; + case TOKEN_WORD_TYPE: + return "WORD_TYPE"; + + case TOKEN_ROUND_OPEN: + return "ROUND_OPEN"; + case TOKEN_ROUND_CLOSE: + return "ROUND_CLOSE"; + case TOKEN_CURLY_OPEN: + return "CURLY_OPEN"; + case TOKEN_CURLY_CLOSE: + return "CURLY_CLOSE"; + case TOKEN_SQUARE_OPEN: + return "SQUARE_OPEN"; + case TOKEN_SQUARE_CLOSE: + return "SQUARE_CLOSE"; + + case TOKEN_COMMA: + return "COMMA"; + case TOKEN_ASSIGN: + return "ASSIGN"; + case TOKEN_AMPERSAND: + return "AMPERSAND"; + case TOKEN_DOT: + return "DOT"; + case TOKEN_PIPE: + return "PIPE"; + + case TOKEN_PLUS: + return "PLUS"; + case TOKEN_MINUS: + return "MINUS"; + case TOKEN_STAR: + return "STAR"; + case TOKEN_SLASH: + return "SLASH"; + + case TOKEN_AND: + return "AND"; + case TOKEN_OR: + return "OR"; + case TOKEN_NOT: + return "NOT"; + case TOKEN_EQUAL: + return "EQUAL"; + case TOKEN_NOT_EQUAL: + return "NOT_EQUAL"; + case TOKEN_LESS: + return "LESS"; + case TOKEN_LESS_EQUAL: + return "LESS_EQUAL"; + case TOKEN_GREATER: + return "GREATER"; + case TOKEN_GREATER_EQUAL: + return "GREATER_EQUAL"; + default: + return ""; + } +} + +// further value optionally carried by some tokens. +union Token_Value +{ + ascii unknown_character; + int64 literal_integer; + float64 literal_float; + struct String literal_string; + struct String name; +}; + +// a lexical token. +struct Token +{ + enum Token_Kind kind; + union Token_Value value; + struct Span span; + struct Cursor location; +}; + +struct Token +token_new(enum Token_Kind kind, struct Span span, struct Cursor location, union Token_Value value) +{ + return (struct Token){ + .kind = kind, + .span = span, + .value = value, + }; +} + +struct Token +token_wide(enum Token_Kind kind, struct Span span, struct Cursor location) +{ + return token_new(kind, span, location, (union Token_Value){}); +} + +struct Token +token_simple(enum Token_Kind kind, Pos pos, struct Cursor location) +{ + return token_wide(kind, (struct Span){ pos, pos }, location); +} + +bool +ascii_in_range(ascii c, ascii from, ascii to) +{ + return c >= from && c <= to; +} + +bool +ascii_is_number(ascii c) +{ + return ascii_in_range(c, '0', '9'); +} + +bool +ascii_is_name(ascii c) +{ + return ascii_in_range(c, 'A', 'Z') || ascii_in_range(c, 'a', 'z') || c == '_'; +} + +// the lexical analyzer. +// reads tokens from a source file, +// and spits them out to the parser, one by one. +struct Lexer +{ + struct String source; + bool eof; + Pos position; + struct Cursor cursor; +}; + +// initialize a lexer with a source file. +void +lexer_new(struct Lexer* l, struct String source) +{ + l->source = source; + l->eof = false; + l->position = 0; + l->cursor = (struct Cursor){ 1, 1 }; +} + +struct Lexer_Char +{ + Pos position; + ascii character; + bool eof; +}; + +struct Lexer_Char +lexer_advance_char(struct Lexer* l) +{ + if (l->position >= string_length(l->source)) + return (struct Lexer_Char){ .position = string_length(l->source), .eof = true }; + + ascii character = string_at(l->source, l->position); + l->position += 1; + + if (character == '\n') { + l->cursor.line += 1; + l->cursor.column = 1; + } else { + l->cursor.column += 1; + } + + return (struct Lexer_Char){ .position = l->position - 1, .character = character, .eof = false }; +} + +struct Lexer_Char +lexer_peek_char_at(const struct Lexer* l, uint offset) +{ + Pos at = l->position + offset; + if (at >= l->source.length) + return (struct Lexer_Char){ .position = string_length(l->source), .eof = true }; + return (struct Lexer_Char){ .position = at, .character = string_at(l->source, at) }; +} + +struct Lexer_Char +lexer_peek_char(const struct Lexer* l) +{ + return lexer_peek_char_at(l, 0); +} + +struct Lexer_Char +lexer_peek_char_further(const struct Lexer* l) +{ + return lexer_peek_char_at(l, 1); +} + +struct Lexer_Char_Match +{ + bool got_match; + struct Lexer_Char match; +}; + +struct Lexer_Char_Match +lexer_match_char(struct Lexer* l, ascii expected) +{ + struct Lexer_Char peek = lexer_peek_char(l); + if (peek.character == expected) return (struct Lexer_Char_Match){ true, lexer_advance_char(l) }; + + return (struct Lexer_Char_Match){}; +} + +struct Lexer_Non_Code +{ + bool had_newline; + Pos newline_position; + struct Cursor newline_cursor; +}; + +struct Lexer_Non_Code +lexer_skip_non_code(struct Lexer* l) +{ + struct Lexer_Non_Code status = { + .had_newline = false, + .newline_position = {}, + .newline_cursor = {}, + }; + + bool in_comment = false; + + for (;;) { + struct Lexer_Char lc = lexer_peek_char(l); + if (lc.eof) return status; + ascii c = lc.character; + + if (c == '\n') { + if (!status.had_newline) { + status.had_newline = true; + status.newline_position = lc.position; + status.newline_cursor = l->cursor; + } + + in_comment = false; + } + + if (c == '#') { in_comment = true; } + + bool is_whitespace = c == ' ' || c == '\t' || c == '\n'; + if (!is_whitespace && !in_comment) { break; } + + lexer_advance_char(l); + } + + return status; +} + +struct Lexer_Symbol_Token +{ + enum Token_Kind kind; + uint width; +}; + +struct Lexer_Symbol_Token +lexer_symbol_token(struct Lexer* l, struct Lexer_Char current) +{ +#define RET return (struct Lexer_Symbol_Token) + struct Lexer_Char_Match next; + switch (current.character) { + case '(': + RET{ TOKEN_ROUND_OPEN, 1 }; + case ')': + RET{ TOKEN_ROUND_CLOSE, 1 }; + case '{': + RET{ TOKEN_CURLY_OPEN, 1 }; + case '}': + RET{ TOKEN_CURLY_CLOSE, 1 }; + case '[': + RET{ TOKEN_SQUARE_OPEN, 1 }; + case ']': + RET{ TOKEN_SQUARE_CLOSE, 1 }; + + case ',': + RET{ TOKEN_COMMA, 1 }; + + case '=': + next = lexer_match_char(l, '='); + if (next.got_match) RET{ TOKEN_EQUAL, 2 }; + + RET{ TOKEN_ASSIGN, 1 }; + + case '&': + next = lexer_match_char(l, '&'); + if (next.got_match) RET{ TOKEN_AND, 2 }; + + RET{ TOKEN_AMPERSAND, 1 }; + + case '.': + RET{ TOKEN_DOT, 1 }; + + case '|': + next = lexer_match_char(l, '|'); + if (next.got_match) RET{ TOKEN_OR, 2 }; + + RET{ TOKEN_PIPE, 1 }; + + // todo: increment, decrement, +=, -=, *=, etc. + // all the special assignment operations + case '+': + RET{ TOKEN_PLUS, 1 }; + case '-': + RET{ TOKEN_MINUS, 1 }; + case '*': + RET{ TOKEN_STAR, 1 }; + case '/': + RET{ TOKEN_SLASH, 1 }; + + case '!': + next = lexer_match_char(l, '='); + if (next.got_match) RET{ TOKEN_NOT_EQUAL, 2 }; + + RET{ TOKEN_NOT, 1 }; + + case '<': + next = lexer_match_char(l, '='); + if (next.got_match) RET{ TOKEN_LESS_EQUAL, 2 }; + RET{ TOKEN_LESS }; + + case '>': + next = lexer_match_char(l, '='); + if (next.got_match) RET{ TOKEN_GREATER_EQUAL, 2 }; + RET{ TOKEN_GREATER }; + + default: + RET{ TOKEN_NONE, 0 }; + } +#undef RET +} + +struct Token +lexer_string_token(struct Lexer* l) +{ + struct Cursor cursor = l->cursor; + Pos position = l->position; + + struct Lexer_Char_Match start_match = lexer_match_char(l, '"'); + check(start_match.got_match, "`lexer_string_token` called on non-string token"); + + ascii buffer[MAX_CHAR_BUFFER_SIZE]; + uint buffer_size = 0; + for (;;) { + struct Lexer_Char c = lexer_advance_char(l); + // todo: return lexer error for these cases. + check(!c.eof, "unclosed string"); + + if (c.character == '"') break; + + check(buffer_size < MAX_CHAR_BUFFER_SIZE, "string too long to lex"); + buffer[buffer_size++] = c.character; + } + + struct String string = string_empty(); + if (buffer_size > 0) string = string_new(buffer, buffer_size); + + union Token_Value value = { .literal_string = string }; + struct Span span = span_new(position, l->position); + return token_new(TOKEN_LITERAL_STRING, span, cursor, value); +} + +struct Token +lexer_number_token(struct Lexer* l) +{ + struct Cursor cursor = l->cursor; + Pos position = l->position; + + ascii buffer[MAX_CHAR_BUFFER_SIZE]; + uint buffer_size = 0; + for (;;) { + struct Lexer_Char c = lexer_peek_char(l); + bool is_number_char = ascii_is_number(c.character) || c.character == '.'; + if (c.eof || !is_number_char) break; + + check(buffer_size < MAX_CHAR_BUFFER_SIZE, "number too long to lex"); + buffer[buffer_size++] = c.character; + + lexer_advance_char(l); + } + + check(buffer_size < MAX_CHAR_BUFFER_SIZE, "number too long to lex"); + buffer[buffer_size++] = '\0'; + + enum Token_Kind kind; + union Token_Value value; + struct Span span = span_new(position, l->position); + + ascii* dot_exists = strchr(buffer, '.'); + if (dot_exists) { + float64 literal_float = strtod(buffer, nil); + + kind = TOKEN_LITERAL_FLOAT; + value.literal_float = literal_float; + } else { + int64 literal_integer = strtoll(buffer, nil, 10); + + kind = TOKEN_LITERAL_INTEGER; + value.literal_integer = literal_integer; + } + + return token_new(kind, span, cursor, value); +} + +struct Token +lexer_name_token(struct Lexer* l) +{ + struct Cursor cursor = l->cursor; + Pos position = l->position; + + ascii buffer[MAX_CHAR_BUFFER_SIZE]; + uint buffer_size = 0; + for (;;) { + struct Lexer_Char c = lexer_peek_char(l); + bool is_name_char = ascii_is_name(c.character) || ascii_is_number(c.character); + if (c.eof || !is_name_char) break; + + check(buffer_size < MAX_CHAR_BUFFER_SIZE, "name too long to lex"); + buffer[buffer_size++] = c.character; + + lexer_advance_char(l); + } + + check(buffer_size != 0, "`lexer_name_token` called on non-name token"); + + union Token_Value value = { .name = string_new(buffer, buffer_size) }; + struct Span span = span_new(position, l->position); + return token_new(TOKEN_NAME, span, cursor, value); +} + +// return the next token. +struct Token +lexer_next(struct Lexer* l) +{ + if (l->eof) return token_simple(TOKEN_END_OF_FILE, l->position, l->cursor); + + struct Lexer_Non_Code non_code = lexer_skip_non_code(l); + if (non_code.had_newline) + return token_simple(TOKEN_NEWLINE, non_code.newline_position, non_code.newline_cursor); + + struct Lexer_Char c = lexer_peek_char(l); + if (c.eof) { + l->eof = true; + return token_simple(TOKEN_END_OF_FILE, l->position, l->cursor); + } + + if (c.character == '"') { + return lexer_string_token(l); + } else if (ascii_is_number(c.character)) { + return lexer_number_token(l); + } else if (ascii_is_name(c.character)) { + return lexer_name_token(l); + } + + Pos position = l->position; + struct Cursor cursor = l->cursor; + + c = lexer_advance_char(l); + + struct Lexer_Symbol_Token symbol_token = lexer_symbol_token(l, c); + if (symbol_token.kind != TOKEN_NONE) { + struct Span symbol_span = span_width(position, symbol_token.width); + return token_wide(symbol_token.kind, symbol_span, cursor); + } + + union Token_Value value = (union Token_Value){ .unknown_character = c.character }; + return token_new(TOKEN_SOMETHING, span_width(position, 1), cursor, value); +} \ No newline at end of file |
