diff options
| author | Mel <mel@rnrd.eu> | 2025-05-21 16:19:59 +0200 |
|---|---|---|
| committer | Mel <mel@rnrd.eu> | 2025-05-21 16:19:59 +0200 |
| commit | 1e07cecafc62d18ba05f21101e13b131a10e8c45 (patch) | |
| tree | e036129c57b734292ff15b71a1d193aa0a97c678 /boot/parse.c | |
| parent | 8c8d65f026121b4d75a31bff8a91c3fbf969fca5 (diff) | |
| download | catskill-1e07cecafc62d18ba05f21101e13b131a10e8c45.tar.zst catskill-1e07cecafc62d18ba05f21101e13b131a10e8c45.zip | |
Basic expression parser with operator precedence and associativity
Signed-off-by: Mel <mel@rnrd.eu>
Diffstat (limited to 'boot/parse.c')
| -rw-r--r-- | boot/parse.c | 470 |
1 files changed, 470 insertions, 0 deletions
diff --git a/boot/parse.c b/boot/parse.c new file mode 100644 index 0000000..38cb712 --- /dev/null +++ b/boot/parse.c @@ -0,0 +1,470 @@ +/* + * very simple handwritten recursive descent + * parser for a catskill source file. + */ + +#define PARSER_LOOKAHEAD 2 + +#define CHECK(parse) \ + parse; \ + if (!parser_error_is_none(error)) return nil; + +struct Parser_Error +{ + enum Parser_Error_Kind + { + PARSER_ERROR_NONE, + PARSER_ERROR_UNEXPECTED_TOKEN, + PARSER_ERROR_UNEXPECTED_EOF, + PARSER_ERROR_EXPECTED_STATEMENT_END, + PARSER_ERROR_EXPECTED_PRIMARY_EXPRESSION, + } kind; + // TODO: add span to error +}; + +struct Parser_Error +parser_error(enum Parser_Error_Kind kind) +{ + return (struct Parser_Error){ kind }; +} + +struct Parser_Error +parser_error_none() +{ + return parser_error(PARSER_ERROR_NONE); +} + +bool +parser_error_is_none(const struct Parser_Error* error) +{ + return error->kind == PARSER_ERROR_NONE; +} + +const ascii* +parser_error_to_string(const struct Parser_Error* error) +{ + switch (error->kind) { + case PARSER_ERROR_NONE: + return "none"; + case PARSER_ERROR_UNEXPECTED_TOKEN: + return "unexpected token"; + case PARSER_ERROR_UNEXPECTED_EOF: + return "unexpected end of file"; + case PARSER_ERROR_EXPECTED_STATEMENT_END: + return "expected statement end"; + case PARSER_ERROR_EXPECTED_PRIMARY_EXPRESSION: + return "expected primary expression"; + default: + return "unknown error"; + } +} + +struct Parser +{ + struct Lexer* lexer; + struct Token lookahead[PARSER_LOOKAHEAD]; +}; + +void +parser_new(struct Parser* p, struct Lexer* lexer) +{ + p->lexer = lexer; + memset(p->lookahead, 0, sizeof(p->lookahead)); +} + +bool +parser_lookahead_pop(struct Parser* p, struct Token* token) +{ + struct Token head = p->lookahead[0]; + if (token_is_empty(&head)) return false; + + for (uint i = 0; i < PARSER_LOOKAHEAD - 1; i++) p->lookahead[i] = p->lookahead[i + 1]; + p->lookahead[PARSER_LOOKAHEAD - 1] = token_none(); + + *token = head; + return true; +} + +bool +parser_lookahead_push(struct Parser* p, struct Token token) +{ + for (uint i = 0; i < PARSER_LOOKAHEAD; i++) { + if (token_is_empty(&p->lookahead[i])) { + p->lookahead[i] = token; + return true; + } + } + return false; +} + +// advance the token stream and return the next token. +struct Token +parser_next(struct Parser* p) +{ + struct Token token; + if (!parser_lookahead_pop(p, &token)) token = lexer_next(p->lexer); + return token; +} + +// advance the token stream if the token is of the right type. +// if not, return none-token and set the error. +struct Token +parser_need(struct Parser* p, enum Token_Kind kind, struct Parser_Error* error) +{ + struct Token token = parser_next(p); + if (!token_is(&token, kind)) { + *error = parser_error(PARSER_ERROR_UNEXPECTED_TOKEN); + return token_none(); + } + return token; +} + +// peek at the token stream at given index, without advancing it. +// index 0 is the token that would be returned by a next `parser_next` call. +// peek limit is defined by `PARSER_LOOKAHEAD`. +struct Token +parser_peek_at(struct Parser* p, uint index) +{ + check(index < 2, "parser peek index out of range"); + + while (token_is_empty(&p->lookahead[index])) { + struct Token next_token = lexer_next(p->lexer); + parser_lookahead_push(p, next_token); + } + + return p->lookahead[index]; +} + +// peek at the next token in the stream, without advancing it. +// synonym for `parser_peek_at(p, 0)`. +struct Token +parser_peek(struct Parser* p) +{ + return parser_peek_at(p, 0); +} + +// peek at the token one beyond the current cursor in the stream, without advancing it. +// synonym for `parser_peek_at(p, 1)`. +struct Token +parser_peek_further(struct Parser* p) +{ + return parser_peek_at(p, 1); +} + +// check if the next token is of a given type. +bool +parser_probe(struct Parser* p, enum Token_Kind kind) +{ + struct Token token = parser_peek(p); + return token_is(&token, kind); +} + +struct Expression* parser_expression(struct Parser* p, struct Parser_Error* error); + +struct Expression* +parser_expression_primary_name(struct Parser* p, struct Parser_Error* error) +{ + struct Token token = CHECK(parser_need(p, TOKEN_NAME, error)); + union Expression_Value value = { .name = { token.value.name } }; + return expression_new(EXPRESSION_NAME, value, token.span, token.location); +} + +struct Expression* +parser_expression_primary_integer(struct Parser* p, struct Parser_Error* error) +{ + struct Token token = CHECK(parser_need(p, TOKEN_LITERAL_INTEGER, error)); + union Expression_Value value = { .integer_literal = { token.value.literal_integer } }; + return expression_new(EXPRESSION_INTEGER_LITERAL, value, token.span, token.location); +} + +struct Expression* +parser_expression_primary_float(struct Parser* p, struct Parser_Error* error) +{ + struct Token token = CHECK(parser_need(p, TOKEN_LITERAL_FLOAT, error)); + union Expression_Value value = { .float_literal = { token.value.literal_float } }; + return expression_new(EXPRESSION_FLOAT_LITERAL, value, token.span, token.location); +} + +struct Expression* +parser_expression_primary_string(struct Parser* p, struct Parser_Error* error) +{ + struct Token token = CHECK(parser_need(p, TOKEN_LITERAL_STRING, error)); + union Expression_Value value = { .string_literal = { token.value.literal_string } }; + return expression_new(EXPRESSION_STRING_LITERAL, value, token.span, token.location); +} + +struct Expression* +parser_expression_primary_group(struct Parser* p, struct Parser_Error* error) +{ + struct Token start_token = CHECK(parser_need(p, TOKEN_ROUND_OPEN, error)); + struct Expression* expression = CHECK(parser_expression(p, error)); + struct Token end_token = CHECK(parser_need(p, TOKEN_ROUND_OPEN, error)); + + struct Span span = span_merge(start_token.span, end_token.span); + union Expression_Value value = { .group = { expression } }; + return expression_new(EXPRESSION_GROUP, value, span, start_token.location); +} + +struct Expression* +parser_expression_primary(struct Parser* p, struct Parser_Error* error) +{ + struct Token token = parser_peek(p); + switch (token.kind) { + case TOKEN_NAME: + return parser_expression_primary_name(p, error); + case TOKEN_LITERAL_INTEGER: + return parser_expression_primary_integer(p, error); + case TOKEN_LITERAL_FLOAT: + return parser_expression_primary_float(p, error); + case TOKEN_LITERAL_STRING: + return parser_expression_primary_string(p, error); + case TOKEN_ROUND_OPEN: + return parser_expression_primary_group(p, error); + // TODO: boolean literals. + default: + *error = parser_error(PARSER_ERROR_EXPECTED_PRIMARY_EXPRESSION); + return nil; + } +} + +struct Expression* +parser_expression_member(struct Parser* p, struct Parser_Error* error) +{ + struct Expression* left = CHECK(parser_expression_primary(p, error)); + + // NOTE: see `parser_expression_postfix_call`. + while (parser_probe(p, TOKEN_DOT)) { + parser_next(p); + struct Token name_token = CHECK(parser_need(p, TOKEN_NAME, error)); + struct String name = name_token.value.name; + + struct Span span = span_merge(left->span, name_token.span); + union Expression_Value value = { .member = { left, name } }; + left = expression_new(EXPRESSION_MEMBER, value, span, name_token.location); + } + + return left; +} + +struct Expression* +parser_expression_postfix_call( + struct Parser* p, struct Expression* subject, struct Parser_Error* error) +{ + // NOTE: because of the way the parser works, we have to parse all subsequent + // call expressions in the same loop. + // this is the case with an expression like `meow_function()(123)`, + // where the hypothetical `meow_function` returns a function pointer. + while (parser_probe(p, TOKEN_ROUND_OPEN)) { + parser_next(p); + + struct Expression *arguments_head = nil, *arguments_current = nil; + while (!parser_probe(p, TOKEN_ROUND_CLOSE)) { + struct Expression* argument = CHECK(parser_expression(p, error)); + if (!arguments_head) + arguments_head = argument; + else + arguments_current->next = argument; + arguments_current = argument; + if (parser_probe(p, TOKEN_COMMA)) parser_next(p); + } + + struct Token token = CHECK(parser_need(p, TOKEN_ROUND_CLOSE, error)); + struct Span span = span_merge(subject->span, token.span); + union Expression_Value value = { .call = { subject, arguments_head } }; + subject = expression_new(EXPRESSION_CALL, value, span, token.location); + } + + return subject; +} + +struct Expression* +parser_expression_postfix_subscript( + struct Parser* p, struct Expression* subject, struct Parser_Error* error) +{ + // NOTE: see `parser_expression_postfix_call`. + while (parser_probe(p, TOKEN_SQUARE_OPEN)) { + parser_next(p); + + struct Expression* index = CHECK(parser_expression(p, error)); + struct Token token = CHECK(parser_need(p, TOKEN_SQUARE_CLOSE, error)); + + struct Span span = span_merge(subject->span, span_merge(index->span, token.span)); + union Expression_Value value = { .subscript = { subject, index } }; + subject = expression_new(EXPRESSION_SUBSCRIPT, value, span, token.location); + } + return subject; +} + +struct Expression* +parser_expression_postfix(struct Parser* p, struct Parser_Error* error) +{ + struct Expression* expression = CHECK(parser_expression_member(p, error)); + + switch (parser_peek(p).kind) { + case TOKEN_ROUND_OPEN: + return parser_expression_postfix_call(p, expression, error); + case TOKEN_SQUARE_OPEN: + return parser_expression_postfix_subscript(p, expression, error); + default: + return expression; + } +} + +struct Expression* +parser_expression_unary_operation(struct Parser* p, struct Parser_Error* error) +{ + struct Token token = parser_peek(p); + enum Unary_Operation operation = unary_operation_from_token(&token); + if (operation) { + parser_next(p); + struct Expression* operand = CHECK(parser_expression_unary_operation(p, error)); + + struct Span span = span_merge(token.span, operand->span); + union Expression_Value value = { .unary_operator = { operation, operand } }; + return expression_new(EXPRESSION_UNARY_OPERATION, value, span, token.location); + } + + return parser_expression_postfix(p, error); +} + +// given two expressions and some kind of binary operation between them, +// merge them into a single binary expression, with attention to operator precedence +// and associativity. +struct Expression_Binary_Operator +parser_merge_into_single_binary_expression( + struct Parser* p, struct Expression* left, enum Binary_Operation operation, + struct Expression* right) +{ + // NOTE: due to the parser structure, the left expression is never a binary operation + // so we can worry about fixing up the right side only. + check(left->kind != EXPRESSION_BINARY_OPERATION, "left expression is a binary operation"); + if (right->kind == EXPRESSION_BINARY_OPERATION) { + struct Expression_Binary_Operator right_binary = right->value.binary_operator; + + uint right_precedence = binary_operation_precedence(right_binary.operation); + uint precedence = binary_operation_precedence(operation); + + bool switch_due_to_precedence = right_precedence < precedence; + bool switch_due_to_associativity = + precedence == right_precedence + && binary_operation_associativity(operation) == BINARY_ASSOCIATIVITY_LEFT; + + // essentially check if an expression like `a ~ (b ~ c)` needs to be + // switched into `(a ~ b) ~ c` or not. + if (switch_due_to_precedence || switch_due_to_associativity) { + // since we only use static memory, we need to switch the operands around + // without allocating more expressions, thus we have to reuse the allocation + // slots of the previous expressions. + struct Expression* operands[3] = { + left, right_binary.left_operand, right_binary.right_operand + }; + struct Expression* slots[2] = { right, right_binary.right_operand }; + + struct Expression_Binary_Operator new_binary_operator = + parser_merge_into_single_binary_expression(p, operands[0], operation, operands[1]); + *slots[0] = (struct Expression){ + .kind = EXPRESSION_BINARY_OPERATION, + .value = { .binary_operator = new_binary_operator }, + .span = span_merge(left->span, right->span), + .location = left->location + }; + + *slots[1] = *operands[2]; + + left = slots[0]; + right = slots[1]; + operation = right_binary.operation; + } + } + + return (struct Expression_Binary_Operator){ + .operation = operation, + .left_operand = left, + .right_operand = right, + }; +} + +struct Expression* +parser_expression_binary_operation(struct Parser* p, struct Parser_Error* error) +{ + struct Expression* left = CHECK(parser_expression_unary_operation(p, error)); + + struct Token token = parser_peek(p); + enum Binary_Operation operation = binary_operation_from_token(&token); + if (operation) { + parser_next(p); + struct Expression* right = CHECK(parser_expression_binary_operation(p, error)); + + struct Span span = span_merge(left->span, right->span); + struct Expression_Binary_Operator binary_value = + parser_merge_into_single_binary_expression(p, left, operation, right); + union Expression_Value value = { .binary_operator = binary_value }; + return expression_new(EXPRESSION_BINARY_OPERATION, value, span, left->location); + } + + return left; +} + +struct Expression* +parser_expression(struct Parser* p, struct Parser_Error* error) +{ + return parser_expression_binary_operation(p, error); +} + +void +parser_end_statement(struct Parser* p, struct Parser_Error* error) +{ + struct Token token = parser_peek(p); + if (!token_ends_statement(&token)) { + *error = parser_error(PARSER_ERROR_EXPECTED_STATEMENT_END); + return; + } + parser_next(p); +} + +struct Statement* +parser_statement(struct Parser* p, struct Parser_Error* error) +{ + // skip empty statements. + struct Token token = parser_peek(p); + if (token_ends_statement(&token)) { + parser_next(p); + return nil; + } + + // TODO: no statements for now, just go straight to expressions. + struct Expression* expression = CHECK(parser_expression(p, error)); + + CHECK(parser_end_statement(p, error)); + + // expand by one byte to include the statement terminator. + struct Span span = span_expand(expression->span, 1); + union Statement_Value value = { .expression = expression }; + return statement_new(STATEMENT_EXPRESSION, value, span, expression->location); +} + +// parse the lexer tokens into a single AST. +// note: it was either `parser_parse` or this. :) +struct Tree +parser_do_your_thing(struct Parser* p, struct Parser_Error* error) +{ + struct Statement* head = nil; + + struct Statement* current = nil; + while (!p->lexer->eof) { + struct Statement* next = parser_statement(p, error); + if (!parser_error_is_none(error)) return (struct Tree){ nil }; + + if (current) { + current->next = next; + } else { + head = next; + } + current = next; + } + + *error = parser_error_none(); + return (struct Tree){ head }; +} + +#undef CHECK \ No newline at end of file |
