Bootstrapping lexer

Signed-off-by: Mel <mel@rnrd.eu>
author: Mel <mel@rnrd.eu> 2025-03-14 22:48:06 +0100
committer: Mel <mel@rnrd.eu> 2025-03-14 22:48:06 +0100
commit: 0a3bcb4ab7c5884e9ad6bb47b05c9a8231395c93 (patch)
tree: 1a40d808d1a877f32d101c9913727de3090d5e48 /boot/lex.c
parent: 742cc47bfbe67806f80bf4e821476b5efff9a9f6 (diff)
download: catskill-0a3bcb4ab7c5884e9ad6bb47b05c9a8231395c93.tar.zst
catskill-0a3bcb4ab7c5884e9ad6bb47b05c9a8231395c93.zip
1 files changed, 599 insertions, 0 deletions
diff --git a/boot/lex.c b/boot/lex.c
new file mode 100644
index 0000000..040dd9a
--- /dev/null
+++ b/boot/lex.c
@@ -0,0 +1,599 @@
+/*
+ * simple advancing lexer
+ * invoked until completion by parser.
+ */
+
+#define MAX_CHAR_BUFFER_SIZE 256
+
+// byte position within a file.
+typedef uint64 Pos;
+
+// an extent of text within a file,
+// defined by its start and end byte positions.
+struct Span
+{
+    Pos start;
+    Pos end;
+};
+
+struct Span
+span_new(Pos start, Pos end)
+{
+    return (struct Span){ .start = start, .end = end };
+}
+
+struct Span
+span_width(Pos start, uint width)
+{
+    return (struct Span){ .start = start, .end = start + width };
+}
+
+// a cursor position placed within a text file.
+struct Cursor
+{
+    uint line;
+    uint column;
+};
+
+// what kind of token is it?
+enum Token_Kind
+{
+    TOKEN_NONE,
+    TOKEN_SOMETHING,
+    TOKEN_END_OF_FILE,
+    TOKEN_NEWLINE,
+
+    TOKEN_NAME,
+    TOKEN_LITERAL_INTEGER,
+    TOKEN_LITERAL_FLOAT,
+    TOKEN_LITERAL_STRING,
+
+    TOKEN_WORD_FUN,
+    TOKEN_WORD_IF,
+    TOKEN_WORD_ELSE,
+    TOKEN_WORD_FOR,
+    TOKEN_WORD_LOOP,
+    TOKEN_WORD_BREAK,
+    TOKEN_WORD_CONTINUE,
+    TOKEN_WORD_DEFER,
+    TOKEN_WORD_SWITCH,
+    TOKEN_WORD_RETURN,
+    TOKEN_WORD_VAR,
+    TOKEN_WORD_TYPE,
+
+    TOKEN_ROUND_OPEN,
+    TOKEN_ROUND_CLOSE,
+    TOKEN_CURLY_OPEN,
+    TOKEN_CURLY_CLOSE,
+    TOKEN_SQUARE_OPEN,
+    TOKEN_SQUARE_CLOSE,
+
+    TOKEN_COMMA,
+    TOKEN_ASSIGN,
+    TOKEN_AMPERSAND,
+    TOKEN_DOT,
+    TOKEN_PIPE,
+
+    TOKEN_PLUS,
+    TOKEN_MINUS,
+    TOKEN_STAR,
+    TOKEN_SLASH,
+
+    TOKEN_AND,
+    TOKEN_OR,
+    TOKEN_NOT,
+    TOKEN_EQUAL,
+    TOKEN_NOT_EQUAL,
+    TOKEN_LESS,
+    TOKEN_LESS_EQUAL,
+    TOKEN_GREATER,
+    TOKEN_GREATER_EQUAL,
+};
+
+const ascii*
+token_kind_to_string(enum Token_Kind kind)
+{
+    switch (kind) {
+    case TOKEN_NONE:
+        return "NONE";
+    case TOKEN_SOMETHING:
+        return "SOMETHING";
+    case TOKEN_END_OF_FILE:
+        return "END_OF_FILE";
+    case TOKEN_NEWLINE:
+        return "NEWLINE";
+
+    case TOKEN_NAME:
+        return "NAME";
+    case TOKEN_LITERAL_INTEGER:
+        return "LITERAL_INTEGER";
+    case TOKEN_LITERAL_FLOAT:
+        return "LITERAL_FLOAT";
+    case TOKEN_LITERAL_STRING:
+        return "LITERAL_STRING";
+
+    case TOKEN_WORD_FUN:
+        return "WORD_FUN";
+    case TOKEN_WORD_IF:
+        return "WORD_IF";
+    case TOKEN_WORD_ELSE:
+        return "WORD_ELSE";
+    case TOKEN_WORD_FOR:
+        return "WORD_FOR";
+    case TOKEN_WORD_LOOP:
+        return "WORD_LOOP";
+    case TOKEN_WORD_BREAK:
+        return "WORD_BREAK";
+    case TOKEN_WORD_CONTINUE:
+        return "WORD_CONTINUE";
+    case TOKEN_WORD_DEFER:
+        return "WORD_DEFER";
+    case TOKEN_WORD_SWITCH:
+        return "WORD_SWITCH";
+    case TOKEN_WORD_RETURN:
+        return "WORD_RETURN";
+    case TOKEN_WORD_VAR:
+        return "WORD_VAR";
+    case TOKEN_WORD_TYPE:
+        return "WORD_TYPE";
+
+    case TOKEN_ROUND_OPEN:
+        return "ROUND_OPEN";
+    case TOKEN_ROUND_CLOSE:
+        return "ROUND_CLOSE";
+    case TOKEN_CURLY_OPEN:
+        return "CURLY_OPEN";
+    case TOKEN_CURLY_CLOSE:
+        return "CURLY_CLOSE";
+    case TOKEN_SQUARE_OPEN:
+        return "SQUARE_OPEN";
+    case TOKEN_SQUARE_CLOSE:
+        return "SQUARE_CLOSE";
+
+    case TOKEN_COMMA:
+        return "COMMA";
+    case TOKEN_ASSIGN:
+        return "ASSIGN";
+    case TOKEN_AMPERSAND:
+        return "AMPERSAND";
+    case TOKEN_DOT:
+        return "DOT";
+    case TOKEN_PIPE:
+        return "PIPE";
+
+    case TOKEN_PLUS:
+        return "PLUS";
+    case TOKEN_MINUS:
+        return "MINUS";
+    case TOKEN_STAR:
+        return "STAR";
+    case TOKEN_SLASH:
+        return "SLASH";
+
+    case TOKEN_AND:
+        return "AND";
+    case TOKEN_OR:
+        return "OR";
+    case TOKEN_NOT:
+        return "NOT";
+    case TOKEN_EQUAL:
+        return "EQUAL";
+    case TOKEN_NOT_EQUAL:
+        return "NOT_EQUAL";
+    case TOKEN_LESS:
+        return "LESS";
+    case TOKEN_LESS_EQUAL:
+        return "LESS_EQUAL";
+    case TOKEN_GREATER:
+        return "GREATER";
+    case TOKEN_GREATER_EQUAL:
+        return "GREATER_EQUAL";
+    default:
+        return "";
+    }
+}
+
+// further value optionally carried by some tokens.
+union Token_Value
+{
+    ascii unknown_character;
+    int64 literal_integer;
+    float64 literal_float;
+    struct String literal_string;
+    struct String name;
+};
+
+// a lexical token.
+struct Token
+{
+    enum Token_Kind kind;
+    union Token_Value value;
+    struct Span span;
+    struct Cursor location;
+};
+
+struct Token
+token_new(enum Token_Kind kind, struct Span span, struct Cursor location, union Token_Value value)
+{
+    return (struct Token){
+        .kind = kind,
+        .span = span,
+        .value = value,
+    };
+}
+
+struct Token
+token_wide(enum Token_Kind kind, struct Span span, struct Cursor location)
+{
+    return token_new(kind, span, location, (union Token_Value){});
+}
+
+struct Token
+token_simple(enum Token_Kind kind, Pos pos, struct Cursor location)
+{
+    return token_wide(kind, (struct Span){ pos, pos }, location);
+}
+
+bool
+ascii_in_range(ascii c, ascii from, ascii to)
+{
+    return c >= from && c <= to;
+}
+
+bool
+ascii_is_number(ascii c)
+{
+    return ascii_in_range(c, '0', '9');
+}
+
+bool
+ascii_is_name(ascii c)
+{
+    return ascii_in_range(c, 'A', 'Z') || ascii_in_range(c, 'a', 'z') || c == '_';
+}
+
+// the lexical analyzer.
+// reads tokens from a source file,
+// and spits them out to the parser, one by one.
+struct Lexer
+{
+    struct String source;
+    bool eof;
+    Pos position;
+    struct Cursor cursor;
+};
+
+// initialize a lexer with a source file.
+void
+lexer_new(struct Lexer* l, struct String source)
+{
+    l->source = source;
+    l->eof = false;
+    l->position = 0;
+    l->cursor = (struct Cursor){ 1, 1 };
+}
+
+struct Lexer_Char
+{
+    Pos position;
+    ascii character;
+    bool eof;
+};
+
+struct Lexer_Char
+lexer_advance_char(struct Lexer* l)
+{
+    if (l->position >= string_length(l->source))
+        return (struct Lexer_Char){ .position = string_length(l->source), .eof = true };
+
+    ascii character = string_at(l->source, l->position);
+    l->position += 1;
+
+    if (character == '\n') {
+        l->cursor.line += 1;
+        l->cursor.column = 1;
+    } else {
+        l->cursor.column += 1;
+    }
+
+    return (struct Lexer_Char){ .position = l->position - 1, .character = character, .eof = false };
+}
+
+struct Lexer_Char
+lexer_peek_char_at(const struct Lexer* l, uint offset)
+{
+    Pos at = l->position + offset;
+    if (at >= l->source.length)
+        return (struct Lexer_Char){ .position = string_length(l->source), .eof = true };
+    return (struct Lexer_Char){ .position = at, .character = string_at(l->source, at) };
+}
+
+struct Lexer_Char
+lexer_peek_char(const struct Lexer* l)
+{
+    return lexer_peek_char_at(l, 0);
+}
+
+struct Lexer_Char
+lexer_peek_char_further(const struct Lexer* l)
+{
+    return lexer_peek_char_at(l, 1);
+}
+
+struct Lexer_Char_Match
+{
+    bool got_match;
+    struct Lexer_Char match;
+};
+
+struct Lexer_Char_Match
+lexer_match_char(struct Lexer* l, ascii expected)
+{
+    struct Lexer_Char peek = lexer_peek_char(l);
+    if (peek.character == expected) return (struct Lexer_Char_Match){ true, lexer_advance_char(l) };
+
+    return (struct Lexer_Char_Match){};
+}
+
+struct Lexer_Non_Code
+{
+    bool had_newline;
+    Pos newline_position;
+    struct Cursor newline_cursor;
+};
+
+struct Lexer_Non_Code
+lexer_skip_non_code(struct Lexer* l)
+{
+    struct Lexer_Non_Code status = {
+        .had_newline = false,
+        .newline_position = {},
+        .newline_cursor = {},
+    };
+
+    bool in_comment = false;
+
+    for (;;) {
+        struct Lexer_Char lc = lexer_peek_char(l);
+        if (lc.eof) return status;
+        ascii c = lc.character;
+
+        if (c == '\n') {
+            if (!status.had_newline) {
+                status.had_newline = true;
+                status.newline_position = lc.position;
+                status.newline_cursor = l->cursor;
+            }
+
+            in_comment = false;
+        }
+
+        if (c == '#') { in_comment = true; }
+
+        bool is_whitespace = c == ' ' || c == '\t' || c == '\n';
+        if (!is_whitespace && !in_comment) { break; }
+
+        lexer_advance_char(l);
+    }
+
+    return status;
+}
+
+struct Lexer_Symbol_Token
+{
+    enum Token_Kind kind;
+    uint width;
+};
+
+struct Lexer_Symbol_Token
+lexer_symbol_token(struct Lexer* l, struct Lexer_Char current)
+{
+#define RET return (struct Lexer_Symbol_Token)
+    struct Lexer_Char_Match next;
+    switch (current.character) {
+    case '(':
+        RET{ TOKEN_ROUND_OPEN, 1 };
+    case ')':
+        RET{ TOKEN_ROUND_CLOSE, 1 };
+    case '{':
+        RET{ TOKEN_CURLY_OPEN, 1 };
+    case '}':
+        RET{ TOKEN_CURLY_CLOSE, 1 };
+    case '[':
+        RET{ TOKEN_SQUARE_OPEN, 1 };
+    case ']':
+        RET{ TOKEN_SQUARE_CLOSE, 1 };
+
+    case ',':
+        RET{ TOKEN_COMMA, 1 };
+
+    case '=':
+        next = lexer_match_char(l, '=');
+        if (next.got_match) RET{ TOKEN_EQUAL, 2 };
+
+        RET{ TOKEN_ASSIGN, 1 };
+
+    case '&':
+        next = lexer_match_char(l, '&');
+        if (next.got_match) RET{ TOKEN_AND, 2 };
+
+        RET{ TOKEN_AMPERSAND, 1 };
+
+    case '.':
+        RET{ TOKEN_DOT, 1 };
+
+    case '|':
+        next = lexer_match_char(l, '|');
+        if (next.got_match) RET{ TOKEN_OR, 2 };
+
+        RET{ TOKEN_PIPE, 1 };
+
+    // todo: increment, decrement, +=, -=, *=, etc.
+    // all the special assignment operations
+    case '+':
+        RET{ TOKEN_PLUS, 1 };
+    case '-':
+        RET{ TOKEN_MINUS, 1 };
+    case '*':
+        RET{ TOKEN_STAR, 1 };
+    case '/':
+        RET{ TOKEN_SLASH, 1 };
+
+    case '!':
+        next = lexer_match_char(l, '=');
+        if (next.got_match) RET{ TOKEN_NOT_EQUAL, 2 };
+
+        RET{ TOKEN_NOT, 1 };
+
+    case '<':
+        next = lexer_match_char(l, '=');
+        if (next.got_match) RET{ TOKEN_LESS_EQUAL, 2 };
+        RET{ TOKEN_LESS };
+
+    case '>':
+        next = lexer_match_char(l, '=');
+        if (next.got_match) RET{ TOKEN_GREATER_EQUAL, 2 };
+        RET{ TOKEN_GREATER };
+
+    default:
+        RET{ TOKEN_NONE, 0 };
+    }
+#undef RET
+}
+
+struct Token
+lexer_string_token(struct Lexer* l)
+{
+    struct Cursor cursor = l->cursor;
+    Pos position = l->position;
+
+    struct Lexer_Char_Match start_match = lexer_match_char(l, '"');
+    check(start_match.got_match, "`lexer_string_token` called on non-string token");
+
+    ascii buffer[MAX_CHAR_BUFFER_SIZE];
+    uint buffer_size = 0;
+    for (;;) {
+        struct Lexer_Char c = lexer_advance_char(l);
+        // todo: return lexer error for these cases.
+        check(!c.eof, "unclosed string");
+
+        if (c.character == '"') break;
+
+        check(buffer_size < MAX_CHAR_BUFFER_SIZE, "string too long to lex");
+        buffer[buffer_size++] = c.character;
+    }
+
+    struct String string = string_empty();
+    if (buffer_size > 0) string = string_new(buffer, buffer_size);
+
+    union Token_Value value = { .literal_string = string };
+    struct Span span = span_new(position, l->position);
+    return token_new(TOKEN_LITERAL_STRING, span, cursor, value);
+}
+
+struct Token
+lexer_number_token(struct Lexer* l)
+{
+    struct Cursor cursor = l->cursor;
+    Pos position = l->position;
+
+    ascii buffer[MAX_CHAR_BUFFER_SIZE];
+    uint buffer_size = 0;
+    for (;;) {
+        struct Lexer_Char c = lexer_peek_char(l);
+        bool is_number_char = ascii_is_number(c.character) || c.character == '.';
+        if (c.eof || !is_number_char) break;
+
+        check(buffer_size < MAX_CHAR_BUFFER_SIZE, "number too long to lex");
+        buffer[buffer_size++] = c.character;
+
+        lexer_advance_char(l);
+    }
+
+    check(buffer_size < MAX_CHAR_BUFFER_SIZE, "number too long to lex");
+    buffer[buffer_size++] = '\0';
+
+    enum Token_Kind kind;
+    union Token_Value value;
+    struct Span span = span_new(position, l->position);
+
+    ascii* dot_exists = strchr(buffer, '.');
+    if (dot_exists) {
+        float64 literal_float = strtod(buffer, nil);
+
+        kind = TOKEN_LITERAL_FLOAT;
+        value.literal_float = literal_float;
+    } else {
+        int64 literal_integer = strtoll(buffer, nil, 10);
+
+        kind = TOKEN_LITERAL_INTEGER;
+        value.literal_integer = literal_integer;
+    }
+
+    return token_new(kind, span, cursor, value);
+}
+
+struct Token
+lexer_name_token(struct Lexer* l)
+{
+    struct Cursor cursor = l->cursor;
+    Pos position = l->position;
+
+    ascii buffer[MAX_CHAR_BUFFER_SIZE];
+    uint buffer_size = 0;
+    for (;;) {
+        struct Lexer_Char c = lexer_peek_char(l);
+        bool is_name_char = ascii_is_name(c.character) || ascii_is_number(c.character);
+        if (c.eof || !is_name_char) break;
+
+        check(buffer_size < MAX_CHAR_BUFFER_SIZE, "name too long to lex");
+        buffer[buffer_size++] = c.character;
+
+        lexer_advance_char(l);
+    }
+
+    check(buffer_size != 0, "`lexer_name_token` called on non-name token");
+
+    union Token_Value value = { .name = string_new(buffer, buffer_size) };
+    struct Span span = span_new(position, l->position);
+    return token_new(TOKEN_NAME, span, cursor, value);
+}
+
+// return the next token.
+struct Token
+lexer_next(struct Lexer* l)
+{
+    if (l->eof) return token_simple(TOKEN_END_OF_FILE, l->position, l->cursor);
+
+    struct Lexer_Non_Code non_code = lexer_skip_non_code(l);
+    if (non_code.had_newline)
+        return token_simple(TOKEN_NEWLINE, non_code.newline_position, non_code.newline_cursor);
+
+    struct Lexer_Char c = lexer_peek_char(l);
+    if (c.eof) {
+        l->eof = true;
+        return token_simple(TOKEN_END_OF_FILE, l->position, l->cursor);
+    }
+
+    if (c.character == '"') {
+        return lexer_string_token(l);
+    } else if (ascii_is_number(c.character)) {
+        return lexer_number_token(l);
+    } else if (ascii_is_name(c.character)) {
+        return lexer_name_token(l);
+    }
+
+    Pos position = l->position;
+    struct Cursor cursor = l->cursor;
+
+    c = lexer_advance_char(l);
+
+    struct Lexer_Symbol_Token symbol_token = lexer_symbol_token(l, c);
+    if (symbol_token.kind != TOKEN_NONE) {
+        struct Span symbol_span = span_width(position, symbol_token.width);
+        return token_wide(symbol_token.kind, symbol_span, cursor);
+    }
+
+    union Token_Value value = (union Token_Value){ .unknown_character = c.character };
+    return token_new(TOKEN_SOMETHING, span_width(position, 1), cursor, value);
+}
\ No newline at end of file
author	Mel <mel@rnrd.eu>	2025-03-14 22:48:06 +0100
committer	Mel <mel@rnrd.eu>	2025-03-14 22:48:06 +0100
commit	0a3bcb4ab7c5884e9ad6bb47b05c9a8231395c93 (patch)
tree	1a40d808d1a877f32d101c9913727de3090d5e48 /boot/lex.c
parent	742cc47bfbe67806f80bf4e821476b5efff9a9f6 (diff)
download	catskill-0a3bcb4ab7c5884e9ad6bb47b05c9a8231395c93.tar.zst catskill-0a3bcb4ab7c5884e9ad6bb47b05c9a8231395c93.zip