diff options
| author | Mel <mel@rnrd.eu> | 2025-03-15 00:37:54 +0100 |
|---|---|---|
| committer | Mel <mel@rnrd.eu> | 2025-03-15 00:37:54 +0100 |
| commit | 7a7446a6dfdfcf5e02358fdb34c193eb67068b93 (patch) | |
| tree | 7807c003f1ba6c0eafc672130ecc5517e9ab0685 | |
| parent | a13646c5f5b0e4dd3b460f6b11226058ad3c9fe6 (diff) | |
| download | catskill-7a7446a6dfdfcf5e02358fdb34c193eb67068b93.tar.zst catskill-7a7446a6dfdfcf5e02358fdb34c193eb67068b93.zip | |
Keyword recognition in lexer through CRC32
Signed-off-by: Mel <mel@rnrd.eu>
| -rw-r--r-- | boot/lex.c | 59 |
1 file changed, 50 insertions, 9 deletions
```diff
diff --git a/boot/lex.c b/boot/lex.c
index 040dd9a..db0173e 100644
--- a/boot/lex.c
+++ b/boot/lex.c
@@ -247,7 +247,7 @@ ascii_is_number(ascii c)
 }
 
 bool
-ascii_is_name(ascii c)
+ascii_is_name_or_word(ascii c)
 {
     return ascii_in_range(c, 'A', 'Z') || ascii_in_range(c, 'a', 'z') || c == '_';
 }
@@ -533,8 +533,43 @@ lexer_number_token(struct Lexer* l)
     return token_new(kind, span, cursor, value);
 }
 
+enum Token_Kind
+lexer_word_from_name(struct Lexer* l, struct String word_or_name)
+{
+    uint32 crc = crc32_posix(word_or_name);
+    // CRC32 values can be checked with `echo -ne "word" | cksum`
+    switch (crc) {
+    case 1373415947: // "fun"
+        return TOKEN_WORD_FUN;
+    case 812472514: // "if"
+        return TOKEN_WORD_IF;
+    case 2588761009: // "else"
+        return TOKEN_WORD_ELSE;
+    case 2652874405: // "for"
+        return TOKEN_WORD_FOR;
+    case 1637870694: // "loop"
+        return TOKEN_WORD_LOOP;
+    case 1007193266: // "break"
+        return TOKEN_WORD_BREAK;
+    case 1827824793: // "continue"
+        return TOKEN_WORD_CONTINUE;
+    case 836542293: // "defer"
+        return TOKEN_WORD_DEFER;
+    case 3635023017: // "switch"
+        return TOKEN_WORD_SWITCH;
+    case 2579962013: // "return"
+        return TOKEN_WORD_RETURN;
+    case 1662845996: // "var"
+        return TOKEN_WORD_VAR;
+    case 91700392: // "type"
+        return TOKEN_WORD_TYPE;
+    default:
+        return TOKEN_NONE;
+    }
+}
+
 struct Token
-lexer_name_token(struct Lexer* l)
+lexer_name_or_word_token(struct Lexer* l)
 {
     struct Cursor cursor = l->cursor;
     Pos position = l->position;
@@ -543,19 +578,25 @@ lexer_name_token(struct Lexer* l)
     uint buffer_size = 0;
     for (;;) {
         struct Lexer_Char c = lexer_peek_char(l);
-        bool is_name_char = ascii_is_name(c.character) || ascii_is_number(c.character);
-        if (c.eof || !is_name_char) break;
+        bool is_name_or_word_char =
+            ascii_is_name_or_word(c.character) || ascii_is_number(c.character);
+        if (c.eof || !is_name_or_word_char) break;
 
-        check(buffer_size < MAX_CHAR_BUFFER_SIZE, "name too long to lex");
+        check(buffer_size < MAX_CHAR_BUFFER_SIZE, "name or word too long to lex");
         buffer[buffer_size++] = c.character;
 
         lexer_advance_char(l);
     }
 
-    check(buffer_size != 0, "`lexer_name_token` called on non-name token");
+    check(buffer_size != 0, "`lexer_name_or_word_token` called on non-name/word token");
 
-    union Token_Value value = { .name = string_new(buffer, buffer_size) };
     struct Span span = span_new(position, l->position);
+    struct String name_or_word = string_new(buffer, buffer_size);
+
+    enum Token_Kind word_kind = lexer_word_from_name(l, name_or_word);
+    if (word_kind != TOKEN_NONE) return token_wide(word_kind, span, cursor);
+
+    union Token_Value value = { .name = name_or_word };
     return token_new(TOKEN_NAME, span, cursor, value);
 }
 
@@ -579,8 +620,8 @@ lexer_next(struct Lexer* l)
         return lexer_string_token(l);
     } else if (ascii_is_number(c.character)) {
         return lexer_number_token(l);
-    } else if (ascii_is_name(c.character)) {
-        return lexer_name_token(l);
+    } else if (ascii_is_name_or_word(c.character)) {
+        return lexer_name_or_word_token(l);
     }
 
     Pos position = l->position;
```
