about summary refs log tree commit diff
path: root/boot/parse.c
diff options
context:
space:
mode:
Diffstat (limited to 'boot/parse.c')
-rw-r--r--boot/parse.c470
1 files changed, 470 insertions, 0 deletions
diff --git a/boot/parse.c b/boot/parse.c
new file mode 100644
index 0000000..38cb712
--- /dev/null
+++ b/boot/parse.c
@@ -0,0 +1,470 @@
+/*
+ * very simple handwritten recursive descent
+ * parser for a catskill source file.
+ */
+
+#define PARSER_LOOKAHEAD 2
+
+#define CHECK(parse) \
+    parse;           \
+    if (!parser_error_is_none(error)) return nil;
+
+struct Parser_Error
+{
+    enum Parser_Error_Kind
+    {
+        PARSER_ERROR_NONE,
+        PARSER_ERROR_UNEXPECTED_TOKEN,
+        PARSER_ERROR_UNEXPECTED_EOF,
+        PARSER_ERROR_EXPECTED_STATEMENT_END,
+        PARSER_ERROR_EXPECTED_PRIMARY_EXPRESSION,
+    } kind;
+    // TODO: add span to error
+};
+
+struct Parser_Error
+parser_error(enum Parser_Error_Kind kind)
+{
+    return (struct Parser_Error){ kind };
+}
+
+struct Parser_Error
+parser_error_none()
+{
+    return parser_error(PARSER_ERROR_NONE);
+}
+
+bool
+parser_error_is_none(const struct Parser_Error* error)
+{
+    return error->kind == PARSER_ERROR_NONE;
+}
+
+const ascii*
+parser_error_to_string(const struct Parser_Error* error)
+{
+    switch (error->kind) {
+    case PARSER_ERROR_NONE:
+        return "none";
+    case PARSER_ERROR_UNEXPECTED_TOKEN:
+        return "unexpected token";
+    case PARSER_ERROR_UNEXPECTED_EOF:
+        return "unexpected end of file";
+    case PARSER_ERROR_EXPECTED_STATEMENT_END:
+        return "expected statement end";
+    case PARSER_ERROR_EXPECTED_PRIMARY_EXPRESSION:
+        return "expected primary expression";
+    default:
+        return "unknown error";
+    }
+}
+
+struct Parser
+{
+    struct Lexer* lexer;
+    struct Token lookahead[PARSER_LOOKAHEAD];
+};
+
+void
+parser_new(struct Parser* p, struct Lexer* lexer)
+{
+    p->lexer = lexer;
+    memset(p->lookahead, 0, sizeof(p->lookahead));
+}
+
+bool
+parser_lookahead_pop(struct Parser* p, struct Token* token)
+{
+    struct Token head = p->lookahead[0];
+    if (token_is_empty(&head)) return false;
+
+    for (uint i = 0; i < PARSER_LOOKAHEAD - 1; i++) p->lookahead[i] = p->lookahead[i + 1];
+    p->lookahead[PARSER_LOOKAHEAD - 1] = token_none();
+
+    *token = head;
+    return true;
+}
+
+bool
+parser_lookahead_push(struct Parser* p, struct Token token)
+{
+    for (uint i = 0; i < PARSER_LOOKAHEAD; i++) {
+        if (token_is_empty(&p->lookahead[i])) {
+            p->lookahead[i] = token;
+            return true;
+        }
+    }
+    return false;
+}
+
+// advance the token stream and return the next token.
+struct Token
+parser_next(struct Parser* p)
+{
+    struct Token token;
+    if (!parser_lookahead_pop(p, &token)) token = lexer_next(p->lexer);
+    return token;
+}
+
+// advance the token stream if the token is of the right type.
+// if not, return none-token and set the error.
+struct Token
+parser_need(struct Parser* p, enum Token_Kind kind, struct Parser_Error* error)
+{
+    struct Token token = parser_next(p);
+    if (!token_is(&token, kind)) {
+        *error = parser_error(PARSER_ERROR_UNEXPECTED_TOKEN);
+        return token_none();
+    }
+    return token;
+}
+
+// peek at the token stream at given index, without advancing it.
+// index 0 is the token that would be returned by a next `parser_next` call.
+// peek limit is defined by `PARSER_LOOKAHEAD`.
+struct Token
+parser_peek_at(struct Parser* p, uint index)
+{
+    check(index < 2, "parser peek index out of range");
+
+    while (token_is_empty(&p->lookahead[index])) {
+        struct Token next_token = lexer_next(p->lexer);
+        parser_lookahead_push(p, next_token);
+    }
+
+    return p->lookahead[index];
+}
+
+// peek at the next token in the stream, without advancing it.
+// synonym for `parser_peek_at(p, 0)`.
+struct Token
+parser_peek(struct Parser* p)
+{
+    return parser_peek_at(p, 0);
+}
+
+// peek at the token one beyond the current cursor in the stream, without advancing it.
+// synonym for `parser_peek_at(p, 1)`.
+struct Token
+parser_peek_further(struct Parser* p)
+{
+    return parser_peek_at(p, 1);
+}
+
+// check if the next token is of a given type.
+bool
+parser_probe(struct Parser* p, enum Token_Kind kind)
+{
+    struct Token token = parser_peek(p);
+    return token_is(&token, kind);
+}
+
+struct Expression* parser_expression(struct Parser* p, struct Parser_Error* error);
+
+struct Expression*
+parser_expression_primary_name(struct Parser* p, struct Parser_Error* error)
+{
+    struct Token token = CHECK(parser_need(p, TOKEN_NAME, error));
+    union Expression_Value value = { .name = { token.value.name } };
+    return expression_new(EXPRESSION_NAME, value, token.span, token.location);
+}
+
+struct Expression*
+parser_expression_primary_integer(struct Parser* p, struct Parser_Error* error)
+{
+    struct Token token = CHECK(parser_need(p, TOKEN_LITERAL_INTEGER, error));
+    union Expression_Value value = { .integer_literal = { token.value.literal_integer } };
+    return expression_new(EXPRESSION_INTEGER_LITERAL, value, token.span, token.location);
+}
+
+struct Expression*
+parser_expression_primary_float(struct Parser* p, struct Parser_Error* error)
+{
+    struct Token token = CHECK(parser_need(p, TOKEN_LITERAL_FLOAT, error));
+    union Expression_Value value = { .float_literal = { token.value.literal_float } };
+    return expression_new(EXPRESSION_FLOAT_LITERAL, value, token.span, token.location);
+}
+
+struct Expression*
+parser_expression_primary_string(struct Parser* p, struct Parser_Error* error)
+{
+    struct Token token = CHECK(parser_need(p, TOKEN_LITERAL_STRING, error));
+    union Expression_Value value = { .string_literal = { token.value.literal_string } };
+    return expression_new(EXPRESSION_STRING_LITERAL, value, token.span, token.location);
+}
+
+struct Expression*
+parser_expression_primary_group(struct Parser* p, struct Parser_Error* error)
+{
+    struct Token start_token = CHECK(parser_need(p, TOKEN_ROUND_OPEN, error));
+    struct Expression* expression = CHECK(parser_expression(p, error));
+    struct Token end_token = CHECK(parser_need(p, TOKEN_ROUND_OPEN, error));
+
+    struct Span span = span_merge(start_token.span, end_token.span);
+    union Expression_Value value = { .group = { expression } };
+    return expression_new(EXPRESSION_GROUP, value, span, start_token.location);
+}
+
+struct Expression*
+parser_expression_primary(struct Parser* p, struct Parser_Error* error)
+{
+    struct Token token = parser_peek(p);
+    switch (token.kind) {
+    case TOKEN_NAME:
+        return parser_expression_primary_name(p, error);
+    case TOKEN_LITERAL_INTEGER:
+        return parser_expression_primary_integer(p, error);
+    case TOKEN_LITERAL_FLOAT:
+        return parser_expression_primary_float(p, error);
+    case TOKEN_LITERAL_STRING:
+        return parser_expression_primary_string(p, error);
+    case TOKEN_ROUND_OPEN:
+        return parser_expression_primary_group(p, error);
+    // TODO: boolean literals.
+    default:
+        *error = parser_error(PARSER_ERROR_EXPECTED_PRIMARY_EXPRESSION);
+        return nil;
+    }
+}
+
+struct Expression*
+parser_expression_member(struct Parser* p, struct Parser_Error* error)
+{
+    struct Expression* left = CHECK(parser_expression_primary(p, error));
+
+    // NOTE: see `parser_expression_postfix_call`.
+    while (parser_probe(p, TOKEN_DOT)) {
+        parser_next(p);
+        struct Token name_token = CHECK(parser_need(p, TOKEN_NAME, error));
+        struct String name = name_token.value.name;
+
+        struct Span span = span_merge(left->span, name_token.span);
+        union Expression_Value value = { .member = { left, name } };
+        left = expression_new(EXPRESSION_MEMBER, value, span, name_token.location);
+    }
+
+    return left;
+}
+
+struct Expression*
+parser_expression_postfix_call(
+    struct Parser* p, struct Expression* subject, struct Parser_Error* error)
+{
+    // NOTE: because of the way the parser works, we have to parse all subsequent
+    // call expressions in the same loop.
+    // this is the case with an expression like `meow_function()(123)`,
+    // where the hypothetical `meow_function` returns a function pointer.
+    while (parser_probe(p, TOKEN_ROUND_OPEN)) {
+        parser_next(p);
+
+        struct Expression *arguments_head = nil, *arguments_current = nil;
+        while (!parser_probe(p, TOKEN_ROUND_CLOSE)) {
+            struct Expression* argument = CHECK(parser_expression(p, error));
+            if (!arguments_head)
+                arguments_head = argument;
+            else
+                arguments_current->next = argument;
+            arguments_current = argument;
+            if (parser_probe(p, TOKEN_COMMA)) parser_next(p);
+        }
+
+        struct Token token = CHECK(parser_need(p, TOKEN_ROUND_CLOSE, error));
+        struct Span span = span_merge(subject->span, token.span);
+        union Expression_Value value = { .call = { subject, arguments_head } };
+        subject = expression_new(EXPRESSION_CALL, value, span, token.location);
+    }
+
+    return subject;
+}
+
+struct Expression*
+parser_expression_postfix_subscript(
+    struct Parser* p, struct Expression* subject, struct Parser_Error* error)
+{
+    // NOTE: see `parser_expression_postfix_call`.
+    while (parser_probe(p, TOKEN_SQUARE_OPEN)) {
+        parser_next(p);
+
+        struct Expression* index = CHECK(parser_expression(p, error));
+        struct Token token = CHECK(parser_need(p, TOKEN_SQUARE_CLOSE, error));
+
+        struct Span span = span_merge(subject->span, span_merge(index->span, token.span));
+        union Expression_Value value = { .subscript = { subject, index } };
+        subject = expression_new(EXPRESSION_SUBSCRIPT, value, span, token.location);
+    }
+    return subject;
+}
+
+struct Expression*
+parser_expression_postfix(struct Parser* p, struct Parser_Error* error)
+{
+    struct Expression* expression = CHECK(parser_expression_member(p, error));
+
+    switch (parser_peek(p).kind) {
+    case TOKEN_ROUND_OPEN:
+        return parser_expression_postfix_call(p, expression, error);
+    case TOKEN_SQUARE_OPEN:
+        return parser_expression_postfix_subscript(p, expression, error);
+    default:
+        return expression;
+    }
+}
+
+struct Expression*
+parser_expression_unary_operation(struct Parser* p, struct Parser_Error* error)
+{
+    struct Token token = parser_peek(p);
+    enum Unary_Operation operation = unary_operation_from_token(&token);
+    if (operation) {
+        parser_next(p);
+        struct Expression* operand = CHECK(parser_expression_unary_operation(p, error));
+
+        struct Span span = span_merge(token.span, operand->span);
+        union Expression_Value value = { .unary_operator = { operation, operand } };
+        return expression_new(EXPRESSION_UNARY_OPERATION, value, span, token.location);
+    }
+
+    return parser_expression_postfix(p, error);
+}
+
+// given two expressions and some kind of binary operation between them,
+// merge them into a single binary expression, with attention to operator precedence
+// and associativity.
+struct Expression_Binary_Operator
+parser_merge_into_single_binary_expression(
+    struct Parser* p, struct Expression* left, enum Binary_Operation operation,
+    struct Expression* right)
+{
+    // NOTE: due to the parser structure, the left expression is never a binary operation
+    // so we can worry about fixing up the right side only.
+    check(left->kind != EXPRESSION_BINARY_OPERATION, "left expression is a binary operation");
+    if (right->kind == EXPRESSION_BINARY_OPERATION) {
+        struct Expression_Binary_Operator right_binary = right->value.binary_operator;
+
+        uint right_precedence = binary_operation_precedence(right_binary.operation);
+        uint precedence = binary_operation_precedence(operation);
+
+        bool switch_due_to_precedence = right_precedence < precedence;
+        bool switch_due_to_associativity =
+            precedence == right_precedence
+            && binary_operation_associativity(operation) == BINARY_ASSOCIATIVITY_LEFT;
+
+        // essentially check if an expression like `a ~ (b ~ c)` needs to be
+        // switched into `(a ~ b) ~ c` or not.
+        if (switch_due_to_precedence || switch_due_to_associativity) {
+            // since we only use static memory, we need to switch the operands around
+            // without allocating more expressions, thus we have to reuse the allocation
+            // slots of the previous expressions.
+            struct Expression* operands[3] = {
+                left, right_binary.left_operand, right_binary.right_operand
+            };
+            struct Expression* slots[2] = { right, right_binary.right_operand };
+
+            struct Expression_Binary_Operator new_binary_operator =
+                parser_merge_into_single_binary_expression(p, operands[0], operation, operands[1]);
+            *slots[0] = (struct Expression){
+                .kind = EXPRESSION_BINARY_OPERATION,
+                .value = { .binary_operator = new_binary_operator },
+                .span = span_merge(left->span, right->span),
+                .location = left->location
+            };
+
+            *slots[1] = *operands[2];
+
+            left = slots[0];
+            right = slots[1];
+            operation = right_binary.operation;
+        }
+    }
+
+    return (struct Expression_Binary_Operator){
+        .operation = operation,
+        .left_operand = left,
+        .right_operand = right,
+    };
+}
+
+struct Expression*
+parser_expression_binary_operation(struct Parser* p, struct Parser_Error* error)
+{
+    struct Expression* left = CHECK(parser_expression_unary_operation(p, error));
+
+    struct Token token = parser_peek(p);
+    enum Binary_Operation operation = binary_operation_from_token(&token);
+    if (operation) {
+        parser_next(p);
+        struct Expression* right = CHECK(parser_expression_binary_operation(p, error));
+
+        struct Span span = span_merge(left->span, right->span);
+        struct Expression_Binary_Operator binary_value =
+            parser_merge_into_single_binary_expression(p, left, operation, right);
+        union Expression_Value value = { .binary_operator = binary_value };
+        return expression_new(EXPRESSION_BINARY_OPERATION, value, span, left->location);
+    }
+
+    return left;
+}
+
+struct Expression*
+parser_expression(struct Parser* p, struct Parser_Error* error)
+{
+    return parser_expression_binary_operation(p, error);
+}
+
+void
+parser_end_statement(struct Parser* p, struct Parser_Error* error)
+{
+    struct Token token = parser_peek(p);
+    if (!token_ends_statement(&token)) {
+        *error = parser_error(PARSER_ERROR_EXPECTED_STATEMENT_END);
+        return;
+    }
+    parser_next(p);
+}
+
+struct Statement*
+parser_statement(struct Parser* p, struct Parser_Error* error)
+{
+    // skip empty statements.
+    struct Token token = parser_peek(p);
+    if (token_ends_statement(&token)) {
+        parser_next(p);
+        return nil;
+    }
+
+    // TODO: no statements for now, just go straight to expressions.
+    struct Expression* expression = CHECK(parser_expression(p, error));
+
+    CHECK(parser_end_statement(p, error));
+
+    // expand by one byte to include the statement terminator.
+    struct Span span = span_expand(expression->span, 1);
+    union Statement_Value value = { .expression = expression };
+    return statement_new(STATEMENT_EXPRESSION, value, span, expression->location);
+}
+
+// parse the lexer tokens into a single AST.
+// note: it was either `parser_parse` or this. :)
+struct Tree
+parser_do_your_thing(struct Parser* p, struct Parser_Error* error)
+{
+    struct Statement* head = nil;
+
+    struct Statement* current = nil;
+    while (!p->lexer->eof) {
+        struct Statement* next = parser_statement(p, error);
+        if (!parser_error_is_none(error)) return (struct Tree){ nil };
+
+        if (current) {
+            current->next = next;
+        } else {
+            head = next;
+        }
+        current = next;
+    }
+
+    *error = parser_error_none();
+    return (struct Tree){ head };
+}
+
+#undef CHECK
\ No newline at end of file