about summary refs log tree commit diff
path: root/boot/lex.c
diff options
context:
space:
mode:
author Mel <mel@rnrd.eu> 2025-03-15 00:37:54 +0100
committer Mel <mel@rnrd.eu> 2025-03-15 00:37:54 +0100
commit 7a7446a6dfdfcf5e02358fdb34c193eb67068b93 (patch)
tree 7807c003f1ba6c0eafc672130ecc5517e9ab0685 /boot/lex.c
parent a13646c5f5b0e4dd3b460f6b11226058ad3c9fe6 (diff)
download catskill-7a7446a6dfdfcf5e02358fdb34c193eb67068b93.tar.zst
catskill-7a7446a6dfdfcf5e02358fdb34c193eb67068b93.zip
Keyword recognition in lexer through CRC32
Signed-off-by: Mel <mel@rnrd.eu>
Diffstat (limited to 'boot/lex.c')
-rw-r--r-- boot/lex.c 59
1 files changed, 50 insertions, 9 deletions
diff --git a/boot/lex.c b/boot/lex.c
index 040dd9a..db0173e 100644
--- a/boot/lex.c
+++ b/boot/lex.c
@@ -247,7 +247,7 @@ ascii_is_number(ascii c)
 }
 
 bool
-ascii_is_name(ascii c)
+ascii_is_name_or_word(ascii c)
 {
     return ascii_in_range(c, 'A', 'Z') || ascii_in_range(c, 'a', 'z') || c == '_';
 }
@@ -533,8 +533,43 @@ lexer_number_token(struct Lexer* l)
     return token_new(kind, span, cursor, value);
 }
 
+enum Token_Kind // keyword token kind for `word_or_name`, or TOKEN_NONE when it is not a keyword
+lexer_word_from_name(struct Lexer* l, struct String word_or_name)
+{
+    uint32 crc = crc32_posix(word_or_name); // the checksum is the only lookup key; `l` is not used in this function
+    // Case labels are the decimal CRC32 of each keyword; they can be checked with `echo -ne "word" | cksum`.
+    switch (crc) { // NOTE(review): no string comparison follows a match, so a name whose CRC32 collides with a keyword would lex as that keyword
+    case 1373415947: // "fun"
+        return TOKEN_WORD_FUN;
+    case 812472514: // "if"
+        return TOKEN_WORD_IF;
+    case 2588761009: // "else"
+        return TOKEN_WORD_ELSE;
+    case 2652874405: // "for"
+        return TOKEN_WORD_FOR;
+    case 1637870694: // "loop"
+        return TOKEN_WORD_LOOP;
+    case 1007193266: // "break"
+        return TOKEN_WORD_BREAK;
+    case 1827824793: // "continue"
+        return TOKEN_WORD_CONTINUE;
+    case 836542293: // "defer"
+        return TOKEN_WORD_DEFER;
+    case 3635023017: // "switch"
+        return TOKEN_WORD_SWITCH;
+    case 2579962013: // "return"
+        return TOKEN_WORD_RETURN;
+    case 1662845996: // "var"
+        return TOKEN_WORD_VAR;
+    case 91700392: // "type"
+        return TOKEN_WORD_TYPE;
+    default: // no keyword checksum matched; `lexer_name_or_word_token` then emits TOKEN_NAME
+        return TOKEN_NONE;
+    }
+}
+
 struct Token
-lexer_name_token(struct Lexer* l)
+lexer_name_or_word_token(struct Lexer* l)
 {
     struct Cursor cursor = l->cursor;
     Pos position = l->position;
@@ -543,19 +578,25 @@ lexer_name_token(struct Lexer* l)
     uint buffer_size = 0;
     for (;;) {
         struct Lexer_Char c = lexer_peek_char(l);
-        bool is_name_char = ascii_is_name(c.character) || ascii_is_number(c.character);
-        if (c.eof || !is_name_char) break;
+        bool is_name_or_word_char =
+            ascii_is_name_or_word(c.character) || ascii_is_number(c.character);
+        if (c.eof || !is_name_or_word_char) break;
 
-        check(buffer_size < MAX_CHAR_BUFFER_SIZE, "name too long to lex");
+        check(buffer_size < MAX_CHAR_BUFFER_SIZE, "name or word too long to lex");
         buffer[buffer_size++] = c.character;
 
         lexer_advance_char(l);
     }
 
-    check(buffer_size != 0, "`lexer_name_token` called on non-name token");
+    check(buffer_size != 0, "`lexer_name_or_word_token` called on non-name/word token");
 
-    union Token_Value value = { .name = string_new(buffer, buffer_size) };
     struct Span span = span_new(position, l->position);
+    struct String name_or_word = string_new(buffer, buffer_size);
+
+    enum Token_Kind word_kind = lexer_word_from_name(l, name_or_word);
+    if (word_kind != TOKEN_NONE) return token_wide(word_kind, span, cursor);
+
+    union Token_Value value = { .name = name_or_word };
     return token_new(TOKEN_NAME, span, cursor, value);
 }
 
@@ -579,8 +620,8 @@ lexer_next(struct Lexer* l)
         return lexer_string_token(l);
     } else if (ascii_is_number(c.character)) {
         return lexer_number_token(l);
-    } else if (ascii_is_name(c.character)) {
-        return lexer_name_token(l);
+    } else if (ascii_is_name_or_word(c.character)) {
+        return lexer_name_or_word_token(l);
     }
 
     Pos position = l->position;