/* * very simple handwritten recursive descent * parser for a catskill source file. * * Copyright (c) 2025, Mel G. * * SPDX-License-Identifier: MPL-2.0 */ #pragma once #include "catboot.h" #define PARSER_LOOKAHEAD 2 #define CHECK(parse) \ parse; \ if (!parser_error_is_none(error)) return nil; #define CHECK_RETURN(parse, ret) \ parse; \ if (!parser_error_is_none(error)) return (ret){ 0 }; enum Parser_Error_Kind { PARSER_ERROR_NONE, PARSER_ERROR_UNEXPECTED_TOKEN, PARSER_ERROR_UNEXPECTED_EOF, PARSER_ERROR_EXPECTED_STATEMENT_END, PARSER_ERROR_EXPECTED_PRIMARY_EXPRESSION, PARSER_ERROR_EXPECTED_TYPE, PARSER_ERROR_EXPECTED_PRAGMA, PARSER_ERROR_EXPECTED_PRAGMA_ARGUMENT }; const ascii* parser_error_kind_to_string(enum Parser_Error_Kind error_kind) { switch (error_kind) { case PARSER_ERROR_NONE: return "none"; case PARSER_ERROR_UNEXPECTED_TOKEN: return "unexpected token"; case PARSER_ERROR_UNEXPECTED_EOF: return "unexpected end of file"; case PARSER_ERROR_EXPECTED_STATEMENT_END: return "expected statement end"; case PARSER_ERROR_EXPECTED_PRIMARY_EXPRESSION: return "expected primary expression"; case PARSER_ERROR_EXPECTED_TYPE: return "expected type"; case PARSER_ERROR_EXPECTED_PRAGMA: return "expected pragma"; case PARSER_ERROR_EXPECTED_PRAGMA_ARGUMENT: return "expected pragma argument"; default: return "unknown error"; } } struct Parser_Error { // kind is the top-level error type, // with subkind being a more specific error. // i.e. kind: EXPECTED_PRAGMA, subkind: UNEXPECTED_TOKEN(integer) enum Parser_Error_Kind kind, subkind; struct Token cause; }; void parser_error(struct Parser_Error* error, enum Parser_Error_Kind kind, struct Token token) { *error = (struct Parser_Error){ .kind = kind, .subkind = PARSER_ERROR_NONE, .cause = token }; } void parser_error_wrap(struct Parser_Error* error, enum Parser_Error_Kind super_kind) { *error = (struct Parser_Error){ .kind = super_kind, .subkind = error->kind, .cause = error->cause, }; } void parser_error_none(struct Parser_Error* error) { parser_error(error, PARSER_ERROR_NONE, token_none()); } bool parser_error_is_none(const struct Parser_Error* error) { return error->kind == PARSER_ERROR_NONE; } struct Span token_span_to_line_span(struct Span span, struct String source) { Pos line_start = span.start, line_end = span.start; // go backwards from the start of the token's span to find line start. while (line_start > 0 && string_at(source, line_start - 1) != '\n') line_start--; // go forwards from the end of the token's span to find line end. while (line_end < string_length(source) && string_at(source, line_end) != '\n') line_end++; return span_new(line_start, line_end); } // print out nice, human-readable error message // pointing to the location of the error in the source file. // TODO: bring out the infrastructure for displaying errors in the same format // outside of the parser, so that it can be used in other places. void parser_error_display(const struct Parser_Error* error, struct Source_File source_file) { if (parser_error_is_none(error)) return; uint line = error->cause.location.line; uint column = error->cause.location.column; struct Token cause = error->cause; fprintf(stderr, ANSI_WHITE "%s:%lu:%lu:\n", source_file.path.data, line, column); fprintf(stderr, ANSI_BOLD ANSI_RED "error: " ANSI_WHITE); if (error->subkind != PARSER_ERROR_NONE) { fprintf(stderr, "%s: %s", parser_error_kind_to_string(error->kind), parser_error_kind_to_string(error->subkind)); } else { fprintf(stderr, "%s", parser_error_kind_to_string(error->kind)); } // TODO: display tokens nicely fprintf(stderr, ", got '%s' :(\n", token_kind_to_string(cause.kind)); struct Span line_span = token_span_to_line_span(cause.span, source_file.source); ascii* source_line = source_file.source.data + line_span.start; int source_line_length = line_span.end - line_span.start; fprintf(stderr, ANSI_WHITE ANSI_NO_BOLD "%lu| %.*s\n", line, source_line_length, source_line); uint line_number_length = ceil(log10(line + 1)); fprintf(stderr, ANSI_RED "%*s", (int)(column + 1 + line_number_length), " "); for (uint w = 0; w < span_length(cause.span); w++) { fprintf(stderr, "^"); } fprintf(stderr, "\n" ANSI_RESET); } struct Parser { struct Lexer* lexer; struct Token lookahead[PARSER_LOOKAHEAD]; struct Source_File source_file; bool had_errors; }; void parser_new(struct Parser* p, struct Lexer* lexer, struct Source_File source_file) { p->lexer = lexer; memset(p->lookahead, 0, sizeof(p->lookahead)); p->source_file = source_file; p->had_errors = false; } bool parser_reached_end(struct Parser* p) { return p->lexer->eof; } bool parser_lookahead_pop(struct Parser* p, struct Token* token) { struct Token head = p->lookahead[0]; if (token_is_empty(&head)) return false; for (uint i = 0; i < PARSER_LOOKAHEAD - 1; i++) p->lookahead[i] = p->lookahead[i + 1]; p->lookahead[PARSER_LOOKAHEAD - 1] = token_none(); *token = head; return true; } bool parser_lookahead_push(struct Parser* p, struct Token token) { for (uint i = 0; i < PARSER_LOOKAHEAD; i++) { if (token_is_empty(&p->lookahead[i])) { p->lookahead[i] = token; return true; } } return false; } // advance the token stream and return the next token. struct Token parser_next(struct Parser* p) { struct Token token; if (!parser_lookahead_pop(p, &token)) token = lexer_next(p->lexer); return token; } // advance the token stream if the token is of the right type. // if not, return none-token and set the error. struct Token parser_need(struct Parser* p, enum Token_Kind kind, struct Parser_Error* error) { struct Token token = parser_next(p); if (!token_is(&token, kind)) { parser_error(error, PARSER_ERROR_UNEXPECTED_TOKEN, token); return token_none(); } return token; } // peek at the token stream at given index, without advancing it. // index 0 is the token that would be returned by a next `parser_next` call. // peek limit is defined by `PARSER_LOOKAHEAD`. struct Token parser_peek_at(struct Parser* p, uint index) { check(index < 2, "parser peek index out of range"); while (token_is_empty(&p->lookahead[index])) { struct Token next_token = lexer_next(p->lexer); parser_lookahead_push(p, next_token); } return p->lookahead[index]; } // peek at the next token in the stream, without advancing it. // synonym for `parser_peek_at(p, 0)`. struct Token parser_peek(struct Parser* p) { return parser_peek_at(p, 0); } // peek at the token one beyond the current cursor in the stream, without advancing it. // synonym for `parser_peek_at(p, 1)`. struct Token parser_peek_further(struct Parser* p) { return parser_peek_at(p, 1); } // check if the next token is of a given type. bool parser_probe(struct Parser* p, enum Token_Kind kind) { struct Token token = parser_peek(p); return token_is(&token, kind); } // skip all consecutive newlines and other non-interrupting // tokens in the token stream. // necessary for parsing statements and expressions that are // split-able over multiple lines. // TODO: this needs to be used in many more places and // is currently mostly redundant, due to only skipping newlines, // acting mostly as a marking for `the tokens don't have to follow precisely`. void parser_unglue(struct Parser* p) { while (parser_probe(p, TOKEN_NEWLINE)) parser_next(p); } // discard tokens until a "synchronization" point is found. // used to avoid errors cascading from one part of the code to another. void parser_panic(struct Parser* p) { // NOTE: this is still very non-robust and naive and will not work in many cases, // and swallow errors that should be reported. // it also basically destroys the interim tree state, so when it is used, // we should not expect the tree to be in a valid state to continue // compilation after the parse phase. // for a simple bootstrapping compiler, this is okay, // but for the final compiler, this should be replaced with a way more sophisticated system. // an idea i had for future implementations is to start a "pseudo-parse" pass that tries // recognizing familiar valid patterns to return control to the parser once it find a valid // pattern. for (;;) { struct Token token = parser_peek(p); switch (token.kind) { // if we're panicking and see an opening brace, bracket or parenthesis, // we assume it's part of the broken statement and // consume until we find its matching closing brace. case TOKEN_SQUARE_OPEN: case TOKEN_ROUND_OPEN: case TOKEN_CURLY_OPEN: { parser_next(p); // consume '{' '(' or '[' enum Token_Kind open_kind = token.kind; enum Token_Kind close_kind; switch (open_kind) { case TOKEN_SQUARE_OPEN: close_kind = TOKEN_SQUARE_CLOSE; break; case TOKEN_ROUND_OPEN: close_kind = TOKEN_ROUND_CLOSE; break; case TOKEN_CURLY_OPEN: close_kind = TOKEN_CURLY_CLOSE; break; default: unreachable(); } int balance = 1; while (balance > 0 && !parser_reached_end(p)) { struct Token t = parser_next(p); if (t.kind == open_kind) balance++; if (t.kind == close_kind) balance--; } break; } // Check if the next token is a good place to resume. case TOKEN_WORD_FUN: case TOKEN_WORD_LET: case TOKEN_WORD_VAR: case TOKEN_WORD_IF: case TOKEN_WORD_FOR: case TOKEN_WORD_WHILE: case TOKEN_WORD_RETURN: case TOKEN_WORD_TYPE: case TOKEN_WORD_CLASS: case TOKEN_WORD_VARIANT: case TOKEN_PIPE: case TOKEN_END_OF_FILE: // continue normal parsing return; default: // no synchronization point found yet :( break; } token = parser_next(p); if (token.kind == TOKEN_NEWLINE) { return; } } } struct Statement* parser_statement(struct Parser* p, struct Parser_Error* error); struct Expression* parser_expression(struct Parser* p, struct Parser_Error* error); void parser_end_statement(struct Parser* p, struct Parser_Error* error) { struct Token token = parser_peek(p); if (!token_ends_statement(&token)) { parser_error(error, PARSER_ERROR_EXPECTED_STATEMENT_END, token); return; } parser_next(p); } struct Block_Node parser_block_node(struct Parser* p, struct Parser_Error* error) { struct Token start_token = CHECK_RETURN(parser_need(p, TOKEN_CURLY_OPEN, error), struct Block_Node); parser_unglue(p); struct Statement* head = nil; struct Statement* current = nil; while (!parser_probe(p, TOKEN_CURLY_CLOSE)) { struct Statement* statement = CHECK_RETURN(parser_statement(p, error), struct Block_Node); // statement ending token isn't required when the block ends on the same line, // as in e.g.: `if (true) { print("yes") }` if (!parser_probe(p, TOKEN_CURLY_CLOSE)) CHECK_RETURN(parser_end_statement(p, error), struct Block_Node); if (!head) { head = statement; } else { current->next = statement; } current = statement; } struct Token end_token = CHECK_RETURN(parser_need(p, TOKEN_CURLY_CLOSE, error), struct Block_Node); struct Span span = span_merge(start_token.span, end_token.span); return (struct Block_Node){ .statements = head, .span = span, .location = start_token.location, }; } struct Type_Node* parser_node_type(struct Parser* p, struct Parser_Error* error); struct Function_Header_Node parser_function_header_node(struct Parser* p, struct Parser_Error* error) { struct Token open_parameters_token = CHECK_RETURN(parser_need(p, TOKEN_ROUND_OPEN, error), struct Function_Header_Node); struct Function_Header_Node header = { 0 }; while (!parser_probe(p, TOKEN_ROUND_CLOSE)) { // TODO: correctly output parameter spans bool variadic = false; if (parser_probe(p, TOKEN_DOT_DOT_DOT)) { variadic = true; parser_next(p); } struct Token name_token = CHECK_RETURN(parser_need(p, TOKEN_NAME, error), struct Function_Header_Node); struct String name = name_token.value.name; struct Type_Node* type; if (!parser_probe(p, TOKEN_ROUND_CLOSE) && !parser_probe(p, TOKEN_COMMA)) { type = CHECK_RETURN(parser_node_type(p, error), struct Function_Header_Node); } else { type = type_node_none(name_token.span, name_token.location); } type->value_name = name; type->variadic = variadic; if (!header.parameters_type_and_name) header.parameters_type_and_name = type; else header.parameters_type_and_name->next = type; if (parser_probe(p, TOKEN_COMMA)) parser_next(p); } struct Token close_parameters_token = parser_next(p); header.span = span_merge(open_parameters_token.span, close_parameters_token.span); struct Token next = parser_peek(p); struct Token further = parser_peek_further(p); if (token_can_begin_type(&next)) { // NOTE: this is very uncomfortable. to avoid ambiguity between a no-return-type function // body (i.e. `fun () {`) and a function with a return type of inline structure (i.e. `fun // () {x int} {`), we require that inline structs are declared in one line, without a // line-break, and function bodies always have a line-break after the opening brace. this // means that we cannot have a single line function, and we can't have a function with an // inline structure as a return type that is too long to fit on a single line. // TODO: to allow single line functions, introduce a `:` token as a replacement for the // opening brace. `fun (x int): x + 1` if (token_is(&next, TOKEN_CURLY_OPEN) && token_is(&further, TOKEN_NEWLINE)) return header; header.return_type = CHECK_RETURN(parser_node_type(p, error), struct Function_Header_Node); header.span = span_merge(header.span, header.return_type->span); } return header; } struct Bare_Declaration_Node parser_bare_declaration_node(struct Parser* p, struct Parser_Error* error) { struct String_Array names = string_array_new(); struct Span span = { 0 }; struct Cursor location = parser_peek(p).location; for (;;) { struct Token name_token = CHECK_RETURN(parser_need(p, TOKEN_NAME, error), struct Bare_Declaration_Node); span = span_is_empty(span) ? name_token.span : span_merge(span, name_token.span); string_array_add(&names, name_token.value.name); struct Token next = parser_peek(p); if (token_can_begin_type(&next)) break; if (next.kind == TOKEN_COMMA) parser_next(p); } // for now, type is always required. struct Type_Node* type = CHECK_RETURN(parser_node_type(p, error), struct Bare_Declaration_Node); CHECK_RETURN(parser_need(p, TOKEN_ASSIGN, error), struct Bare_Declaration_Node); struct Expression* initializer = CHECK_RETURN(parser_expression(p, error), struct Bare_Declaration_Node); return (struct Bare_Declaration_Node){ .names = names, .initializer = initializer, .type = type, .span = span, .location = location, }; } struct Type_Node* parser_node_type_name(struct Parser* p, struct Parser_Error* error) { struct Token name_token = CHECK(parser_need(p, TOKEN_NAME, error)); return type_node_new( TYPE_NODE_NAME, (union Type_Node_Value){ .name = { name_token.value.name } }, name_token.span, name_token.location); } struct Type_Node* parser_node_type_structure(struct Parser* p, struct Parser_Error* error) { struct Token open_token = CHECK(parser_need(p, TOKEN_CURLY_OPEN, error)); parser_unglue(p); struct Type_Node *head = nil, *current = nil; while (!parser_probe(p, TOKEN_CURLY_CLOSE)) { struct Token field_name_token = CHECK(parser_need(p, TOKEN_NAME, error)); struct Type_Node* field_type = CHECK(parser_node_type(p, error)); field_type->value_name = field_name_token.value.name; if (!head) head = field_type; else current->next = field_type; current = field_type; if (parser_probe(p, TOKEN_COMMA) || parser_probe(p, TOKEN_NEWLINE)) parser_next(p); } parser_unglue(p); struct Token close_token = CHECK(parser_need(p, TOKEN_CURLY_CLOSE, error)); struct Span span = span_merge(open_token.span, close_token.span); return type_node_new( TYPE_NODE_STRUCTURE, (union Type_Node_Value){ .structure = { head } }, span, open_token.location); } struct Type_Node* parser_node_type_variant(struct Parser* p, struct Parser_Error* error) { struct Token variant_token = CHECK(parser_need(p, TOKEN_WORD_VARIANT, error)); CHECK(parser_need(p, TOKEN_CURLY_OPEN, error)); parser_unglue(p); struct Type_Node *head = nil, *current = nil; while (!parser_probe(p, TOKEN_CURLY_CLOSE)) { struct Token variant_name_token = CHECK(parser_need(p, TOKEN_NAME, error)); struct String variant_name = variant_name_token.value.name; struct Type_Node* backing_type = nil; struct Token next = parser_peek(p); bool has_backing_type = !token_is(&next, TOKEN_NEWLINE) && !token_is(&next, TOKEN_COMMA) && !token_is(&next, TOKEN_CURLY_CLOSE); if (has_backing_type) { backing_type = CHECK(parser_node_type(p, error)); } else { backing_type = type_node_none(variant_name_token.span, variant_name_token.location); } backing_type->value_name = variant_name; if (!head) head = backing_type; else current->next = backing_type; current = backing_type; next = parser_peek(p); if (token_is(&next, TOKEN_COMMA) || token_is(&next, TOKEN_NEWLINE)) parser_next(p); } parser_unglue(p); struct Token close_token = CHECK(parser_need(p, TOKEN_CURLY_CLOSE, error)); struct Span span = span_merge(variant_token.span, close_token.span); return type_node_new( TYPE_NODE_VARIANT, (union Type_Node_Value){ .variant = { head } }, span, variant_token.location); } struct Type_Node* parser_node_type_function(struct Parser* p, struct Parser_Error* error) { struct Token fun_token = CHECK(parser_need(p, TOKEN_WORD_FUN, error)); struct Function_Header_Node header = CHECK(parser_function_header_node(p, error)); struct Span span = span_merge(fun_token.span, header.span); return type_node_new( TYPE_NODE_FUNCTION, (union Type_Node_Value){ .function = { header } }, span, fun_token.location); } struct Type_Node* parser_node_type_class(struct Parser* p, struct Parser_Error* error) { struct Token class_token = CHECK(parser_need(p, TOKEN_WORD_CLASS, error)); CHECK(parser_need(p, TOKEN_CURLY_OPEN, error)); parser_unglue(p); struct Type_Node *head = nil, *current = nil; while (!parser_probe(p, TOKEN_CURLY_CLOSE)) { struct Token method_name_token = CHECK(parser_need(p, TOKEN_NAME, error)); struct String method_name = method_name_token.value.name; struct Function_Header_Node header = CHECK(parser_function_header_node(p, error)); // allocate a new type node for the method type, // for the sake of consistency and the built-in type node linked list. struct Type_Node* method_type = type_node_new( TYPE_NODE_FUNCTION, (union Type_Node_Value){ .function = { header } }, header.span, header.parameters_type_and_name->location); method_type->value_name = method_name; if (!head) head = method_type; else current->next = method_type; current = method_type; if (parser_probe(p, TOKEN_COMMA) || parser_probe(p, TOKEN_NEWLINE)) parser_next(p); } parser_unglue(p); struct Token close_token = CHECK(parser_need(p, TOKEN_CURLY_CLOSE, error)); struct Span span = span_merge(class_token.span, close_token.span); return type_node_new( TYPE_NODE_CLASS, (union Type_Node_Value){ .class = { head } }, span, class_token.location); } struct Type_Node* parser_node_type_tuple(struct Parser* p, struct Parser_Error* error) { struct Token open_token = CHECK(parser_need(p, TOKEN_ROUND_OPEN, error)); struct Type_Node *head = nil, *current = nil; while (!parser_probe(p, TOKEN_ROUND_CLOSE)) { struct Type_Node* type = CHECK(parser_node_type(p, error)); if (!head) head = type; else current->next = type; current = type; if (parser_probe(p, TOKEN_COMMA)) parser_next(p); else break; } struct Token close_token = CHECK(parser_need(p, TOKEN_ROUND_CLOSE, error)); struct Span span = span_merge(open_token.span, close_token.span); return type_node_new( TYPE_NODE_TUPLE, (union Type_Node_Value){ .tuple = { head } }, span, open_token.location); } struct Type_Node* parser_node_type_array_or_map(struct Parser* p, struct Parser_Error* error) { struct Token open_token = CHECK(parser_need(p, TOKEN_SQUARE_OPEN, error)); struct Type_Node* element_or_key_type = parser_node_type(p, error); if (!element_or_key_type) { parser_error_wrap(error, PARSER_ERROR_EXPECTED_TYPE); return nil; } enum Type_Node_Type type; union Type_Node_Value value; if (parser_probe(p, TOKEN_ASSIGN)) { // this is a map type, e.g. `[string = int]` parser_next(p); // consume the assignment token struct Type_Node* key_type = element_or_key_type; struct Type_Node* value_type = parser_node_type(p, error); if (!value_type) { parser_error_wrap(error, PARSER_ERROR_EXPECTED_TYPE); return nil; } type = TYPE_NODE_MAP; value.map = (struct Type_Node_Map){ .key_type = key_type, .value_type = value_type, }; } else { // this is an array type, e.g. `[int]` type = TYPE_NODE_ARRAY; value.array = (struct Type_Node_Array){ .element_type = element_or_key_type }; } struct Token close_token = CHECK(parser_need(p, TOKEN_SQUARE_CLOSE, error)); struct Span span = span_merge(open_token.span, close_token.span); return type_node_new(type, value, span, open_token.location); } struct Type_Node* parser_node_type_reference(struct Parser* p, struct Parser_Error* error) { struct Token ampersand_token = CHECK(parser_need(p, TOKEN_AMPERSAND, error)); struct Type_Node* referenced_type = CHECK(parser_node_type(p, error)); if (!referenced_type) { parser_error(error, PARSER_ERROR_EXPECTED_TYPE, ampersand_token); return nil; } struct Span span = span_merge(ampersand_token.span, referenced_type->span); return type_node_new( TYPE_NODE_REFERENCE, (union Type_Node_Value){ .reference = { referenced_type } }, span, ampersand_token.location); } struct Type_Node* parser_node_type_inner(struct Parser* p, struct Parser_Error* error) { struct Token token = parser_peek(p); switch (token.kind) { case TOKEN_NAME: return parser_node_type_name(p, error); case TOKEN_WORD_VARIANT: return parser_node_type_variant(p, error); case TOKEN_WORD_CLASS: return parser_node_type_class(p, error); case TOKEN_WORD_FUN: return parser_node_type_function(p, error); case TOKEN_CURLY_OPEN: return parser_node_type_structure(p, error); case TOKEN_ROUND_OPEN: return parser_node_type_tuple(p, error); case TOKEN_SQUARE_OPEN: return parser_node_type_array_or_map(p, error); case TOKEN_AMPERSAND: return parser_node_type_reference(p, error); default: parser_error(error, PARSER_ERROR_UNEXPECTED_TOKEN, token); return nil; } } struct Type_Node* parser_node_type(struct Parser* p, struct Parser_Error* error) { struct Type_Node* type = CHECK(parser_node_type_inner(p, error)); // check if the type is followed by a `?`, which means it may be nil. if (parser_probe(p, TOKEN_QUESTION)) { parser_next(p); // consume the question mark type = type_node_new( TYPE_NODE_MAYBE, (union Type_Node_Value){ .maybe = { type } }, type->span, type->location); } return type; } struct Pragma_Node* parser_pragma_node(struct Parser* p, struct Parser_Error* error) { // `| c_header "stdio.h"` // `| clone always, printable CHECK(parser_need(p, TOKEN_PIPE, error)); struct Pragma_Node *head = nil, *current = nil; struct Token token = parser_peek(p); while (!token_ends_statement(&token)) { struct Token pragma_token = CHECK(parser_need(p, TOKEN_NAME, error)); enum Pragma_Type pragma_type = pragma_type_from_string(pragma_token.value.name); if (!pragma_type) { parser_error(error, PARSER_ERROR_EXPECTED_PRAGMA, pragma_token); return nil; } struct Span span = pragma_token.span; // parse the arguments until either statement end or comma // arguments can either be numbers, names or strings. uint argument_index = 0; struct Pragma_Argument arguments[PRAGMA_ARGUMENT_MAX] = { 0 }; token = parser_peek(p); while (!token_ends_statement(&token)) { check(argument_index < PRAGMA_ARGUMENT_MAX, "too many pragma arguments"); struct Pragma_Argument* argument = &arguments[argument_index]; union Token_Value* v = &token.value; switch (token.kind) { case TOKEN_LITERAL_INTEGER: argument->type = PRAGMA_ARGUMENT_NUMBER; argument->value.number = v->literal_integer; break; case TOKEN_LITERAL_FLOAT: argument->type = PRAGMA_ARGUMENT_DECIMAL; argument->value.decimal = v->literal_float; break; case TOKEN_LITERAL_STRING: argument->type = PRAGMA_ARGUMENT_NAME_OR_STRING; argument->value.name_or_string = v->literal_string; break; case TOKEN_NAME: argument->type = PRAGMA_ARGUMENT_NAME_OR_STRING; argument->value.name_or_string = v->name; break; default: parser_error(error, PARSER_ERROR_EXPECTED_PRAGMA_ARGUMENT, token); return nil; } argument_index++; span = span_merge(span, token.span); parser_next(p); // comma separates pragmas on a single line. token = parser_peek(p); if (token_is(&token, TOKEN_COMMA)) { parser_next(p); break; } if (token_ends_statement(&token)) { break; } } struct Pragma_Node* pragma = pragma_node_new(pragma_type, span, pragma_token.location); pragma->argument_count = argument_index; memcpy(pragma->arguments, arguments, sizeof(arguments)); if (!head) { head = pragma; } else { current->next = pragma; } current = pragma; } return head; } struct Expression* parser_expression_primary_name(struct Parser* p, struct Parser_Error* error) { struct Token token = CHECK(parser_need(p, TOKEN_NAME, error)); union Expression_Value value = { .name = { token.value.name } }; return expression_new(EXPRESSION_NAME, value, token.span, token.location); } struct Expression* parser_expression_primary_integer(struct Parser* p, struct Parser_Error* error) { struct Token token = CHECK(parser_need(p, TOKEN_LITERAL_INTEGER, error)); union Expression_Value value = { .integer_literal = { token.value.literal_integer } }; return expression_new(EXPRESSION_INTEGER_LITERAL, value, token.span, token.location); } struct Expression* parser_expression_primary_float(struct Parser* p, struct Parser_Error* error) { struct Token token = CHECK(parser_need(p, TOKEN_LITERAL_FLOAT, error)); union Expression_Value value = { .float_literal = { token.value.literal_float } }; return expression_new(EXPRESSION_FLOAT_LITERAL, value, token.span, token.location); } struct Expression* parser_expression_primary_string(struct Parser* p, struct Parser_Error* error) { struct Token token = CHECK(parser_need(p, TOKEN_LITERAL_STRING, error)); union Expression_Value value = { .string_literal = { token.value.literal_string } }; return expression_new(EXPRESSION_STRING_LITERAL, value, token.span, token.location); } struct Expression* parser_expression_primary_boolean(struct Parser* p, struct Parser_Error* error) { struct Token token = parser_next(p); check(token.kind == TOKEN_WORD_TRUE || token.kind == TOKEN_WORD_FALSE, "expected boolean literal"); bool literal = token.kind == TOKEN_WORD_TRUE; union Expression_Value expr_value = { .bool_literal = { literal } }; return expression_new(EXPRESSION_BOOLEAN_LITERAL, expr_value, token.span, token.location); } struct Expression* parser_expression_primary_group(struct Parser* p, struct Parser_Error* error) { struct Token start_token = CHECK(parser_need(p, TOKEN_ROUND_OPEN, error)); struct Expression* expression = CHECK(parser_expression(p, error)); struct Token end_token = CHECK(parser_need(p, TOKEN_ROUND_CLOSE, error)); struct Span span = span_merge(start_token.span, end_token.span); union Expression_Value value = { .group = { expression } }; return expression_new(EXPRESSION_GROUP, value, span, start_token.location); } struct Expression* parser_expression_function(struct Parser* p, struct Parser_Error* error) { struct Token fun_token = CHECK(parser_need(p, TOKEN_WORD_FUN, error)); struct Expression_Function fun = { 0 }; fun.header = CHECK(parser_function_header_node(p, error)); fun.body = CHECK(parser_block_node(p, error)); return expression_new( EXPRESSION_FUNCTION, (union Expression_Value){ .function = fun }, span_merge(fun_token.span, fun.body.span), fun_token.location); } struct Expression* parser_expression_type(struct Parser* p, struct Parser_Error* error) { struct Token start_token = parser_peek(p); struct Type_Node* type = nil; switch (start_token.kind) { case TOKEN_WORD_TYPE: parser_next(p); // skip the `type` keyword. type = CHECK(parser_node_type(p, error)); break; case TOKEN_WORD_VARIANT: type = CHECK(parser_node_type_variant(p, error)); break; case TOKEN_WORD_CLASS: type = CHECK(parser_node_type_class(p, error)); break; default: failure("expected type signifying keyword"); } struct Span span = span_merge(start_token.span, type->span); return expression_new( EXPRESSION_TYPE, (union Expression_Value){ .type = { type } }, span, start_token.location); } struct Expression* parser_expression_primary(struct Parser* p, struct Parser_Error* error) { struct Token token = parser_peek(p); switch (token.kind) { case TOKEN_NAME: return parser_expression_primary_name(p, error); case TOKEN_LITERAL_INTEGER: return parser_expression_primary_integer(p, error); case TOKEN_LITERAL_FLOAT: return parser_expression_primary_float(p, error); case TOKEN_LITERAL_STRING: return parser_expression_primary_string(p, error); case TOKEN_WORD_TRUE: case TOKEN_WORD_FALSE: return parser_expression_primary_boolean(p, error); case TOKEN_ROUND_OPEN: return parser_expression_primary_group(p, error); case TOKEN_WORD_FUN: return parser_expression_function(p, error); // `variant` and `class` here are essentially a shortcut, as they already // imply a type, so we don't have to write `type variant` or `type class`. case TOKEN_WORD_TYPE: case TOKEN_WORD_VARIANT: case TOKEN_WORD_CLASS: return parser_expression_type(p, error); default: parser_error(error, PARSER_ERROR_EXPECTED_PRIMARY_EXPRESSION, token); return nil; } } struct Expression* parser_expression_postfix_member( struct Parser* p, struct Expression* subject, struct Parser_Error* error) { CHECK(parser_need(p, TOKEN_DOT, error)); struct Token name_token = CHECK(parser_need(p, TOKEN_NAME, error)); struct String name = name_token.value.name; struct Span span = span_merge(subject->span, name_token.span); union Expression_Value value = { .member = { subject, name } }; return expression_new(EXPRESSION_MEMBER, value, span, name_token.location); } struct Expression* parser_expression_postfix_call_or_construct( struct Parser* p, struct Expression* subject, struct Parser_Error* error) { // calls and constructs look identical in the parser. // * call: `meow_function(123, "hello")` // * construct: `Meow(num = 123, str = "hello")` CHECK(parser_need(p, TOKEN_ROUND_OPEN, error)); struct String_Array argument_names = string_array_new(); struct Expression *arguments_head = nil, *arguments_current = nil; while (!parser_probe(p, TOKEN_ROUND_CLOSE)) { // check if we have a named argument. struct String name = string_empty(); struct Token name_token = parser_peek(p), next = parser_peek_further(p); if (token_is(&name_token, TOKEN_NAME) && token_is(&next, TOKEN_ASSIGN)) { parser_next(p); parser_next(p); name = name_token.value.name; } struct Expression* argument = CHECK(parser_expression(p, error)); if (!arguments_head) arguments_head = argument; else arguments_current->next = argument; arguments_current = argument; // if we have a named argument, we need to add it to the names array, // otherwise we just add an empty string. string_array_add(&argument_names, name); if (parser_probe(p, TOKEN_COMMA)) parser_next(p); } struct Token token = CHECK(parser_need(p, TOKEN_ROUND_CLOSE, error)); struct Span span = span_merge(subject->span, token.span); union Expression_Value value = { .call_or_construct = { subject, arguments_head, argument_names } }; return expression_new(EXPRESSION_CALL_OR_CONSTRUCT, value, span, token.location); } struct Expression* parser_expression_postfix_subscript( struct Parser* p, struct Expression* subject, struct Parser_Error* error) { CHECK(parser_need(p, TOKEN_SQUARE_OPEN, error)); struct Expression* index = CHECK(parser_expression(p, error)); struct Token token = CHECK(parser_need(p, TOKEN_SQUARE_CLOSE, error)); struct Span span = span_merge(subject->span, span_merge(index->span, token.span)); union Expression_Value value = { .subscript = { subject, index } }; return expression_new(EXPRESSION_SUBSCRIPT, value, span, token.location); } struct Expression* parser_expression_postfix_increment_decrement( struct Parser* p, struct Expression* subject, struct Parser_Error* error) { struct Token token = parser_peek(p); enum Increment_Decrement_Operation operation = increment_decrement_operation_from_token(&token); struct Expression_Increment_Decrement inc_dec = { .prefix = false, .subject = subject, .operation = operation, }; struct Span span = span_merge(subject->span, token.span); union Expression_Value value = { .increment_decrement = inc_dec }; return expression_new(EXPRESSION_INCREMENT_DECREMENT, value, span, token.location); } struct Expression* parser_expression_postfix_try( struct Parser* p, struct Expression* subject, struct Parser_Error* error) { struct Token question_token = parser_next(p); struct Span span = span_merge(subject->span, question_token.span); union Expression_Value value = { .try = { subject } }; return expression_new(EXPRESSION_TRY, value, span, subject->location); } struct Expression* parser_expression_postfix_must( struct Parser* p, struct Expression* subject, struct Parser_Error* error) { struct Token bang_token = CHECK(parser_need(p, TOKEN_BANG, error)); struct Span span = span_merge(subject->span, bang_token.span); union Expression_Value value = { .try = { subject } }; return expression_new(EXPRESSION_MUST, value, span, subject->location); } struct Expression* parser_expression_postfix(struct Parser* p, struct Parser_Error* error) { struct Expression* expression = CHECK(parser_expression_primary(p, error)); // NOTE: we have to parse all subsequent postfix expressions non-recursively. for (;;) { struct Token token = parser_peek(p); switch (token.kind) { case TOKEN_ROUND_OPEN: expression = parser_expression_postfix_call_or_construct(p, expression, error); continue; case TOKEN_SQUARE_OPEN: expression = parser_expression_postfix_subscript(p, expression, error); continue; case TOKEN_DOT: expression = parser_expression_postfix_member(p, expression, error); continue; case TOKEN_QUESTION: expression = parser_expression_postfix_try(p, expression, error); continue; case TOKEN_BANG: expression = parser_expression_postfix_must(p, expression, error); continue; default:; } enum Increment_Decrement_Operation inc_dec_op = increment_decrement_operation_from_token(&token); if (inc_dec_op) { parser_next(p); struct Expression_Increment_Decrement inc_dec = { .prefix = false, .subject = expression, .operation = inc_dec_op, }; struct Span span = span_merge(expression->span, token.span); union Expression_Value value = { .increment_decrement = inc_dec }; expression = expression_new(EXPRESSION_INCREMENT_DECREMENT, value, span, token.location); continue; } return expression; } } struct Expression* parser_expression_unary_operation(struct Parser* p, struct Parser_Error* error) { struct Token token = parser_peek(p); enum Unary_Operation operation = unary_operation_from_token(&token); if (operation) { parser_next(p); struct Expression* operand = CHECK(parser_expression_unary_operation(p, error)); struct Span span = span_merge(token.span, operand->span); union Expression_Value value = { .unary_operator = { operation, operand } }; return expression_new(EXPRESSION_UNARY_OPERATION, value, span, token.location); } enum Increment_Decrement_Operation inc_dec_op = increment_decrement_operation_from_token(&token); if (inc_dec_op) { parser_next(p); struct Expression* subject = CHECK(parser_expression_unary_operation(p, error)); struct Expression_Increment_Decrement inc_dec = { .prefix = true, .subject = subject, .operation = inc_dec_op, }; struct Span span = span_merge(token.span, inc_dec.subject->span); union Expression_Value value = { .increment_decrement = inc_dec }; return expression_new(EXPRESSION_INCREMENT_DECREMENT, value, span, token.location); } return parser_expression_postfix(p, error); } // given two expressions and some kind of binary operation between them, // merge them into a single binary expression, with attention to operator precedence // and associativity. struct Expression_Binary_Operator parser_merge_into_single_binary_expression( struct Parser* p, struct Expression* left, enum Binary_Operation operation, struct Expression* right) { // NOTE: due to the parser structure, the left expression is never a binary operation // so we can worry about fixing up the right side only. check(left->kind != EXPRESSION_BINARY_OPERATION, "left expression is a binary operation"); if (right->kind == EXPRESSION_BINARY_OPERATION) { struct Expression_Binary_Operator right_binary = right->value.binary_operator; uint right_precedence = binary_operation_precedence(right_binary.operation); uint precedence = binary_operation_precedence(operation); bool switch_due_to_precedence = right_precedence < precedence; bool switch_due_to_associativity = precedence == right_precedence && binary_operation_associativity(operation) == BINARY_ASSOCIATIVITY_LEFT; // essentially check if an expression like `a ~ (b ~ c)` needs to be // switched into `(a ~ b) ~ c` or not. if (switch_due_to_precedence || switch_due_to_associativity) { // since we only use static memory, we need to switch the operands around // without allocating more expressions, thus we have to reuse the allocation // slots of the previous expressions. struct Expression* operands[3] = { left, right_binary.left_operand, right_binary.right_operand }; struct Expression* slots[2] = { right, right_binary.right_operand }; struct Expression_Binary_Operator new_binary_operator = parser_merge_into_single_binary_expression(p, operands[0], operation, operands[1]); *slots[0] = (struct Expression){ .kind = EXPRESSION_BINARY_OPERATION, .value = { .binary_operator = new_binary_operator }, .span = span_merge(left->span, right->span), .location = left->location }; *slots[1] = *operands[2]; left = slots[0]; right = slots[1]; operation = right_binary.operation; } } return (struct Expression_Binary_Operator){ .operation = operation, .left_operand = left, .right_operand = right, }; } struct Expression* parser_expression_binary_operation(struct Parser* p, struct Parser_Error* error) { struct Expression* left = CHECK(parser_expression_unary_operation(p, error)); struct Token token = parser_peek(p); enum Binary_Operation operation = binary_operation_from_token(&token); if (operation) { parser_next(p); struct Expression* right = CHECK(parser_expression_binary_operation(p, error)); struct Span span = span_merge(left->span, right->span); struct Expression_Binary_Operator binary_value = parser_merge_into_single_binary_expression(p, left, operation, right); union Expression_Value value = { .binary_operator = binary_value }; return expression_new(EXPRESSION_BINARY_OPERATION, value, span, left->location); } return left; } struct Expression* parser_expression(struct Parser* p, struct Parser_Error* error) { return parser_expression_binary_operation(p, error); } struct Statement* parser_statement_declaration(struct Parser* p, struct Parser_Error* error) { struct Token declaration_token = parser_next(p); enum Statement_Declaration_Kind declaration_kind = statement_declaration_kind_from_token(&declaration_token); check(declaration_kind, "expected valid declaration token"); struct Bare_Declaration_Node inner = CHECK(parser_bare_declaration_node(p, error)); struct Span span = span_merge(declaration_token.span, inner.span); union Statement_Value value = { .declaration = { .kind = declaration_kind, .inner = inner, }, }; return statement_new(STATEMENT_DECLARATION, value, span, declaration_token.location); } struct Statement* parser_statement_conditional(struct Parser* p, struct Parser_Error* error) { struct Statement_Value_Conditional conditional = { 0 }; struct Token if_token = parser_need(p, TOKEN_WORD_IF, error); // primary if condition + block. struct Expression* if_condition = CHECK(parser_expression(p, error)); struct Block_Node then_block = CHECK(parser_block_node(p, error)); conditional.conditions[conditional.condition_count++] = (struct Statement_Conditional_Branch){ .when = if_condition, .then = then_block, }; struct Span span = span_merge(if_token.span, then_block.span); while (parser_probe(p, TOKEN_WORD_ELSE)) { check(conditional.condition_count < STATEMENT_VALUE_CONDITIONAL_MAX, "too many conditional branches"); parser_next(p); struct Statement_Conditional_Branch branch = { 0 }; if (parser_probe(p, TOKEN_WORD_IF)) { // else if condition + block. parser_next(p); struct Expression* else_condition = CHECK(parser_expression(p, error)); struct Block_Node else_block = CHECK(parser_block_node(p, error)); branch = (struct Statement_Conditional_Branch){ .when = else_condition, .then = else_block, }; } else { // else block. struct Block_Node else_block = CHECK(parser_block_node(p, error)); branch = (struct Statement_Conditional_Branch){ .when = nil, .then = else_block, }; } conditional.conditions[conditional.condition_count++] = branch; span = span_merge(span, branch.then.span); } return statement_new( STATEMENT_CONDITIONAL, (union Statement_Value){ .conditional = conditional }, span, if_token.location); } struct Statement* parser_statement_for(struct Parser* p, struct Parser_Error* error) { struct Token for_token = CHECK(parser_need(p, TOKEN_WORD_FOR, error)); // these are the possible for loop variants: // * `for name String = collection {}`, as iteration over container // * `for i u8 = 0, i < 10, i++ {}`, as a c-style semi-semi loop // a declaration without a signifier like `var` or `let`. struct Bare_Declaration_Node declaration = CHECK(parser_bare_declaration_node(p, error)); enum Statement_Loop_Style style = STATEMENT_LOOP_STYLE_FOR_EACH; // c-style semi-semi loop. struct Expression *condition = nil, *iteration = nil; if (parser_probe(p, TOKEN_COMMA)) { parser_next(p); condition = CHECK(parser_expression(p, error)); CHECK(parser_need(p, TOKEN_COMMA, error)); iteration = CHECK(parser_expression(p, error)); style = STATEMENT_LOOP_STYLE_C; } struct Block_Node body = CHECK(parser_block_node(p, error)); struct Span span = span_merge(for_token.span, body.span); union Statement_Value value = { .loop = { .style = style, .declaration = declaration, .condition = condition, .iteration = iteration, .body = body, }, }; return statement_new(STATEMENT_LOOP, value, span, for_token.location); } struct Statement* parser_statement_while(struct Parser* p, struct Parser_Error* error) { struct Token while_token = CHECK(parser_need(p, TOKEN_WORD_WHILE, error)); enum Statement_Loop_Style style = STATEMENT_LOOP_STYLE_ENDLESS; struct Expression* condition = nil; if (!parser_probe(p, TOKEN_CURLY_OPEN)) { condition = CHECK(parser_expression(p, error)); style = STATEMENT_LOOP_STYLE_WHILE; } struct Block_Node body = CHECK(parser_block_node(p, error)); struct Span span = span_merge(while_token.span, body.span); union Statement_Value value = { .loop = { .style = style, .condition = condition, .body = body } }; return statement_new(STATEMENT_LOOP, value, span, while_token.location); } struct Statement* parser_statement_block(struct Parser* p, struct Parser_Error* error) { struct Block_Node block = CHECK(parser_block_node(p, error)); return statement_new( STATEMENT_BLOCK, (union Statement_Value){ .block = { block } }, block.span, block.location); } struct Statement* parser_statement_return(struct Parser* p, struct Parser_Error* error) { struct Token return_token = CHECK(parser_need(p, TOKEN_WORD_RETURN, error)); struct Expression* value = nil; if (!token_ends_statement(&return_token)) { value = CHECK(parser_expression(p, error)); } struct Span span = value ? return_token.span : span_merge(return_token.span, value->span); union Statement_Value statement_value = { .return_value = { value } }; return statement_new(STATEMENT_RETURN, statement_value, span, return_token.location); } struct Statement* parser_statement_break(struct Parser* p, struct Parser_Error* error) { struct Token break_token = CHECK(parser_need(p, TOKEN_WORD_BREAK, error)); struct Span span = break_token.span; return statement_new(STATEMENT_BREAK, (union Statement_Value){ 0 }, span, break_token.location); } struct Statement* parser_statement_continue(struct Parser* p, struct Parser_Error* error) { struct Token continue_token = CHECK(parser_need(p, TOKEN_WORD_CONTINUE, error)); struct Span span = continue_token.span; return statement_new( STATEMENT_CONTINUE, (union Statement_Value){ 0 }, span, continue_token.location); } struct Statement* parser_statement_defer(struct Parser* p, struct Parser_Error* error) { struct Token defer_token = CHECK(parser_need(p, TOKEN_WORD_DEFER, error)); struct Span span = defer_token.span; struct Statement_Value_Defer defer = { 0 }; if (parser_probe(p, TOKEN_CURLY_OPEN)) { struct Block_Node block = CHECK(parser_block_node(p, error)); span = span_merge(span, block.span); defer.block = block; } else { struct Expression* expression = CHECK(parser_expression(p, error)); span = span_merge(span, expression->span); defer.expression = expression; } union Statement_Value value = { .defer = defer }; return statement_new(STATEMENT_DEFER, value, span, defer_token.location); } struct Statement* parser_statement_pragma(struct Parser* p, struct Parser_Error* error) { struct Token pipe_token = parser_peek(p); struct Pragma_Node* pragma_node = parser_pragma_node(p, error); if (!parser_error_is_none(error)) { parser_error_wrap(error, PARSER_ERROR_EXPECTED_PRAGMA); return nil; } struct Span span = {}; FOR_EACH (struct Pragma_Node*, node, pragma_node) { span = span_is_empty(span) ? node->span : span_merge(span, node->span); } union Statement_Value value = { .pragma.inner = pragma_node }; return statement_new(STATEMENT_PRAGMA, value, span, pipe_token.location); } struct Statement* parser_statement(struct Parser* p, struct Parser_Error* error) { struct Token token = parser_peek(p); // skip empty statements. while (token_ends_statement(&token)) { if (parser_reached_end(p)) return nil; parser_next(p); token = parser_peek(p); } if (token_can_begin_declaration(&token)) return parser_statement_declaration(p, error); switch (token.kind) { case TOKEN_WORD_IF: return parser_statement_conditional(p, error); case TOKEN_WORD_FOR: return parser_statement_for(p, error); case TOKEN_WORD_WHILE: return parser_statement_while(p, error); case TOKEN_CURLY_OPEN: return parser_statement_block(p, error); case TOKEN_WORD_RETURN: return parser_statement_return(p, error); case TOKEN_WORD_BREAK: return parser_statement_break(p, error); case TOKEN_WORD_CONTINUE: return parser_statement_continue(p, error); case TOKEN_WORD_DEFER: return parser_statement_defer(p, error); case TOKEN_PIPE: return parser_statement_pragma(p, error); default: break; } struct Expression* expression = CHECK(parser_expression(p, error)); // expand by one byte to include the statement terminator. struct Span span = span_expand(expression->span, 1); union Statement_Value value = { .expression.inner = expression }; return statement_new(STATEMENT_EXPRESSION, value, span, expression->location); } void parser_handle_error(struct Parser* p, struct Parser_Error* error) { if (parser_error_is_none(error)) return; p->had_errors = true; parser_error_display(error, p->source_file); parser_error_none(error); parser_panic(p); } // parse the lexer tokens into a single AST. // note: it was either `parser_parse` or this. :) int parser_do_your_thing(struct Parser* p, struct Tree* tree) { struct Parser_Error error; parser_error_none(&error); struct Statement* head = nil; struct Statement* current = nil; while (!parser_reached_end(p)) { struct Statement* next = parser_statement(p, &error); parser_handle_error(p, &error); if (next) { parser_end_statement(p, &error); if (parser_error_is_none(&error)) { if (current) { current->next = next; } else { head = next; } current = next; } else { parser_handle_error(p, &error); } } else if (parser_reached_end(p)) { // `parser_statement` returns nil on EOF. break; } } parser_error_none(&error); *tree = (struct Tree){ head }; return p->had_errors; } #undef CHECK #undef CHECK_RETURN