/* * very simple handwritten recursive descent * parser for a catskill source file. */ #define PARSER_LOOKAHEAD 2 #define CHECK(parse) \ parse; \ if (!parser_error_is_none(error)) return nil; #define CHECK_RETURN(parse, ret) \ parse; \ if (!parser_error_is_none(error)) return (ret){ 0 }; struct Parser_Error { enum Parser_Error_Kind { PARSER_ERROR_NONE, PARSER_ERROR_UNEXPECTED_TOKEN, PARSER_ERROR_UNEXPECTED_EOF, PARSER_ERROR_EXPECTED_STATEMENT_END, PARSER_ERROR_EXPECTED_PRIMARY_EXPRESSION, PARSER_ERROR_EXPECTED_TYPE, } kind; // TODO: add span to error }; struct Parser_Error parser_error(enum Parser_Error_Kind kind) { return (struct Parser_Error){ kind }; } struct Parser_Error parser_error_none() { return parser_error(PARSER_ERROR_NONE); } bool parser_error_is_none(const struct Parser_Error* error) { return error->kind == PARSER_ERROR_NONE; } const ascii* parser_error_to_string(const struct Parser_Error* error) { switch (error->kind) { case PARSER_ERROR_NONE: return "none"; case PARSER_ERROR_UNEXPECTED_TOKEN: return "unexpected token"; case PARSER_ERROR_UNEXPECTED_EOF: return "unexpected end of file"; case PARSER_ERROR_EXPECTED_STATEMENT_END: return "expected statement end"; case PARSER_ERROR_EXPECTED_PRIMARY_EXPRESSION: return "expected primary expression"; case PARSER_ERROR_EXPECTED_TYPE: return "expected type"; default: return "unknown error"; } } struct Parser { struct Lexer* lexer; struct Token lookahead[PARSER_LOOKAHEAD]; }; void parser_new(struct Parser* p, struct Lexer* lexer) { p->lexer = lexer; memset(p->lookahead, 0, sizeof(p->lookahead)); } bool parser_lookahead_pop(struct Parser* p, struct Token* token) { struct Token head = p->lookahead[0]; if (token_is_empty(&head)) return false; for (uint i = 0; i < PARSER_LOOKAHEAD - 1; i++) p->lookahead[i] = p->lookahead[i + 1]; p->lookahead[PARSER_LOOKAHEAD - 1] = token_none(); *token = head; return true; } bool parser_lookahead_push(struct Parser* p, struct Token token) { for (uint i = 0; i < PARSER_LOOKAHEAD; i++) { if (token_is_empty(&p->lookahead[i])) { p->lookahead[i] = token; return true; } } return false; } // advance the token stream and return the next token. struct Token parser_next(struct Parser* p) { struct Token token; if (!parser_lookahead_pop(p, &token)) token = lexer_next(p->lexer); return token; } // advance the token stream if the token is of the right type. // if not, return none-token and set the error. struct Token parser_need(struct Parser* p, enum Token_Kind kind, struct Parser_Error* error) { struct Token token = parser_next(p); if (!token_is(&token, kind)) { *error = parser_error(PARSER_ERROR_UNEXPECTED_TOKEN); return token_none(); } return token; } // peek at the token stream at given index, without advancing it. // index 0 is the token that would be returned by a next `parser_next` call. // peek limit is defined by `PARSER_LOOKAHEAD`. struct Token parser_peek_at(struct Parser* p, uint index) { check(index < 2, "parser peek index out of range"); while (token_is_empty(&p->lookahead[index])) { struct Token next_token = lexer_next(p->lexer); parser_lookahead_push(p, next_token); } return p->lookahead[index]; } // peek at the next token in the stream, without advancing it. // synonym for `parser_peek_at(p, 0)`. struct Token parser_peek(struct Parser* p) { return parser_peek_at(p, 0); } // peek at the token one beyond the current cursor in the stream, without advancing it. // synonym for `parser_peek_at(p, 1)`. struct Token parser_peek_further(struct Parser* p) { return parser_peek_at(p, 1); } // check if the next token is of a given type. bool parser_probe(struct Parser* p, enum Token_Kind kind) { struct Token token = parser_peek(p); return token_is(&token, kind); } struct Statement* parser_statement(struct Parser* p, struct Parser_Error* error); struct Expression* parser_expression(struct Parser* p, struct Parser_Error* error); void parser_end_statement(struct Parser* p, struct Parser_Error* error) { struct Token token = parser_peek(p); if (!token_ends_statement(&token)) { *error = parser_error(PARSER_ERROR_EXPECTED_STATEMENT_END); return; } parser_next(p); } bool parser_could_be_variable_declaration(struct Parser* p) { // NOTE: these can be a variable declaration: // x uint = 123 // me, them Obj = create() // otherwise without a type, it is instead counted as an assignment: // a = "hi!" struct Token first = parser_peek(p); struct Token second = parser_peek_further(p); bool first_matches = token_is(&first, TOKEN_NAME); bool second_matches = token_is(&second, TOKEN_COMMA) || token_is(&second, TOKEN_NAME); return first_matches && second_matches; } struct Block_Node parser_block_node(struct Parser* p, struct Parser_Error* error) { struct Token start_token = CHECK_RETURN(parser_need(p, TOKEN_CURLY_OPEN, error), struct Block_Node); struct Statement* head = nil; struct Statement* current = nil; while (!parser_probe(p, TOKEN_CURLY_CLOSE)) { struct Statement* statement = CHECK_RETURN(parser_statement(p, error), struct Block_Node); CHECK_RETURN(parser_end_statement(p, error), struct Block_Node); if (!head) { head = statement; } else { current->next = statement; } current = statement; } struct Token end_token = CHECK_RETURN(parser_need(p, TOKEN_CURLY_CLOSE, error), struct Block_Node); struct Span span = span_merge(start_token.span, end_token.span); return (struct Block_Node){ .statements = head, .span = span, .location = start_token.location, }; } struct Type_Node parser_node_type(struct Parser* p, struct Parser_Error* error) { struct Token token = parser_need(p, TOKEN_NAME, error); if (token_is_empty(&token)) { *error = parser_error(PARSER_ERROR_EXPECTED_TYPE); return (struct Type_Node){ 0 }; } struct String type_name = token.value.name; // for now, we only support a single type name. // in the future, we might want to support more complex types. return (struct Type_Node){ .type = TYPE_NAME, .name = type_name, .span = token.span, .location = token.location, }; } struct Expression* parser_expression_primary_name(struct Parser* p, struct Parser_Error* error) { struct Token token = CHECK(parser_need(p, TOKEN_NAME, error)); union Expression_Value value = { .name = { token.value.name } }; return expression_new(EXPRESSION_NAME, value, token.span, token.location); } struct Expression* parser_expression_primary_integer(struct Parser* p, struct Parser_Error* error) { struct Token token = CHECK(parser_need(p, TOKEN_LITERAL_INTEGER, error)); union Expression_Value value = { .integer_literal = { token.value.literal_integer } }; return expression_new(EXPRESSION_INTEGER_LITERAL, value, token.span, token.location); } struct Expression* parser_expression_primary_float(struct Parser* p, struct Parser_Error* error) { struct Token token = CHECK(parser_need(p, TOKEN_LITERAL_FLOAT, error)); union Expression_Value value = { .float_literal = { token.value.literal_float } }; return expression_new(EXPRESSION_FLOAT_LITERAL, value, token.span, token.location); } struct Expression* parser_expression_primary_string(struct Parser* p, struct Parser_Error* error) { struct Token token = CHECK(parser_need(p, TOKEN_LITERAL_STRING, error)); union Expression_Value value = { .string_literal = { token.value.literal_string } }; return expression_new(EXPRESSION_STRING_LITERAL, value, token.span, token.location); } struct Expression* parser_expression_primary_boolean(struct Parser* p, struct Parser_Error* error) { struct Token token = parser_next(p); check(token.kind == TOKEN_WORD_TRUE || token.kind == TOKEN_WORD_FALSE, "expected boolean literal"); bool literal = token.kind == TOKEN_WORD_TRUE; union Expression_Value expr_value = { .bool_literal = { literal } }; return expression_new(EXPRESSION_BOOLEAN_LITERAL, expr_value, token.span, token.location); } struct Expression* parser_expression_primary_group(struct Parser* p, struct Parser_Error* error) { struct Token start_token = CHECK(parser_need(p, TOKEN_ROUND_OPEN, error)); struct Expression* expression = CHECK(parser_expression(p, error)); struct Token end_token = CHECK(parser_need(p, TOKEN_ROUND_CLOSE, error)); struct Span span = span_merge(start_token.span, end_token.span); union Expression_Value value = { .group = { expression } }; return expression_new(EXPRESSION_GROUP, value, span, start_token.location); } struct Expression* parser_expression_function(struct Parser* p, struct Parser_Error* error) { struct Token fun_token = CHECK(parser_need(p, TOKEN_WORD_FUN, error)); CHECK(parser_need(p, TOKEN_ROUND_OPEN, error)); struct Expression_Function fun = { 0 }; while (!parser_probe(p, TOKEN_ROUND_CLOSE)) { struct Token name_token = CHECK(parser_need(p, TOKEN_NAME, error)); struct String name = name_token.value.name; struct Type_Node type = { 0 }; struct Token next = parser_peek(p); if (!token_is(&next, TOKEN_ROUND_CLOSE) && !token_is(&next, TOKEN_COMMA)) type = CHECK(parser_node_type(p, error)); if (parser_probe(p, TOKEN_COMMA)) parser_next(p); check(fun.parameter_count < EXPRESSION_FUNCTION_MAX_PARAMS, "too many function parameters"); fun.parameters[fun.parameter_count++] = (struct Expression_Function_Parameter){ .name = name, .type = type, }; } parser_next(p); if (!parser_probe(p, TOKEN_CURLY_OPEN)) fun.return_type = CHECK(parser_node_type(p, error)); fun.body = CHECK(parser_block_node(p, error)); return expression_new( EXPRESSION_FUNCTION, (union Expression_Value){ .function = fun }, span_merge(fun_token.span, fun.body.span), fun_token.location); } struct Expression* parser_expression_primary(struct Parser* p, struct Parser_Error* error) { struct Token token = parser_peek(p); switch (token.kind) { case TOKEN_NAME: return parser_expression_primary_name(p, error); case TOKEN_LITERAL_INTEGER: return parser_expression_primary_integer(p, error); case TOKEN_LITERAL_FLOAT: return parser_expression_primary_float(p, error); case TOKEN_LITERAL_STRING: return parser_expression_primary_string(p, error); case TOKEN_WORD_TRUE: case TOKEN_WORD_FALSE: return parser_expression_primary_boolean(p, error); case TOKEN_ROUND_OPEN: return parser_expression_primary_group(p, error); case TOKEN_WORD_FUN: return parser_expression_function(p, error); default: *error = parser_error(PARSER_ERROR_EXPECTED_PRIMARY_EXPRESSION); return nil; } } struct Expression* parser_expression_member(struct Parser* p, struct Parser_Error* error) { struct Expression* left = CHECK(parser_expression_primary(p, error)); // NOTE: see `parser_expression_postfix_call`. while (parser_probe(p, TOKEN_DOT)) { parser_next(p); struct Token name_token = CHECK(parser_need(p, TOKEN_NAME, error)); struct String name = name_token.value.name; struct Span span = span_merge(left->span, name_token.span); union Expression_Value value = { .member = { left, name } }; left = expression_new(EXPRESSION_MEMBER, value, span, name_token.location); } return left; } struct Expression* parser_expression_postfix_call( struct Parser* p, struct Expression* subject, struct Parser_Error* error) { // NOTE: because of the way the parser works, we have to parse all subsequent // call expressions in the same loop. // this is the case with an expression like `meow_function()(123)`, // where the hypothetical `meow_function` returns a function pointer. while (parser_probe(p, TOKEN_ROUND_OPEN)) { parser_next(p); struct Expression *arguments_head = nil, *arguments_current = nil; while (!parser_probe(p, TOKEN_ROUND_CLOSE)) { struct Expression* argument = CHECK(parser_expression(p, error)); if (!arguments_head) arguments_head = argument; else arguments_current->next = argument; arguments_current = argument; if (parser_probe(p, TOKEN_COMMA)) parser_next(p); } struct Token token = CHECK(parser_need(p, TOKEN_ROUND_CLOSE, error)); struct Span span = span_merge(subject->span, token.span); union Expression_Value value = { .call = { subject, arguments_head } }; subject = expression_new(EXPRESSION_CALL, value, span, token.location); } return subject; } struct Expression* parser_expression_postfix_subscript( struct Parser* p, struct Expression* subject, struct Parser_Error* error) { // NOTE: see `parser_expression_postfix_call`. while (parser_probe(p, TOKEN_SQUARE_OPEN)) { parser_next(p); struct Expression* index = CHECK(parser_expression(p, error)); struct Token token = CHECK(parser_need(p, TOKEN_SQUARE_CLOSE, error)); struct Span span = span_merge(subject->span, span_merge(index->span, token.span)); union Expression_Value value = { .subscript = { subject, index } }; subject = expression_new(EXPRESSION_SUBSCRIPT, value, span, token.location); } return subject; } struct Expression* parser_expression_postfix(struct Parser* p, struct Parser_Error* error) { struct Expression* expression = CHECK(parser_expression_member(p, error)); struct Token token = parser_peek(p); switch (token.kind) { case TOKEN_ROUND_OPEN: return parser_expression_postfix_call(p, expression, error); case TOKEN_SQUARE_OPEN: return parser_expression_postfix_subscript(p, expression, error); default:; } enum Increment_Decrement_Operation inc_dec_op = increment_decrement_operation_from_token(&token); if (inc_dec_op) { parser_next(p); struct Expression_Increment_Decrement inc_dec = { .prefix = false, .subject = expression, .operation = inc_dec_op, }; struct Span span = span_merge(expression->span, token.span); union Expression_Value value = { .increment_decrement = inc_dec }; return expression_new(EXPRESSION_INCREMENT_DECREMENT, value, span, token.location); } return expression; } struct Expression* parser_expression_unary_operation(struct Parser* p, struct Parser_Error* error) { struct Token token = parser_peek(p); enum Unary_Operation operation = unary_operation_from_token(&token); if (operation) { parser_next(p); struct Expression* operand = CHECK(parser_expression_unary_operation(p, error)); struct Span span = span_merge(token.span, operand->span); union Expression_Value value = { .unary_operator = { operation, operand } }; return expression_new(EXPRESSION_UNARY_OPERATION, value, span, token.location); } enum Increment_Decrement_Operation inc_dec_op = increment_decrement_operation_from_token(&token); if (inc_dec_op) { parser_next(p); struct Expression* subject = CHECK(parser_expression_unary_operation(p, error)); struct Expression_Increment_Decrement inc_dec = { .prefix = true, .subject = subject, .operation = inc_dec_op, }; struct Span span = span_merge(token.span, inc_dec.subject->span); union Expression_Value value = { .increment_decrement = inc_dec }; return expression_new(EXPRESSION_INCREMENT_DECREMENT, value, span, token.location); } return parser_expression_postfix(p, error); } // given two expressions and some kind of binary operation between them, // merge them into a single binary expression, with attention to operator precedence // and associativity. struct Expression_Binary_Operator parser_merge_into_single_binary_expression( struct Parser* p, struct Expression* left, enum Binary_Operation operation, struct Expression* right) { // NOTE: due to the parser structure, the left expression is never a binary operation // so we can worry about fixing up the right side only. check(left->kind != EXPRESSION_BINARY_OPERATION, "left expression is a binary operation"); if (right->kind == EXPRESSION_BINARY_OPERATION) { struct Expression_Binary_Operator right_binary = right->value.binary_operator; uint right_precedence = binary_operation_precedence(right_binary.operation); uint precedence = binary_operation_precedence(operation); bool switch_due_to_precedence = right_precedence < precedence; bool switch_due_to_associativity = precedence == right_precedence && binary_operation_associativity(operation) == BINARY_ASSOCIATIVITY_LEFT; // essentially check if an expression like `a ~ (b ~ c)` needs to be // switched into `(a ~ b) ~ c` or not. if (switch_due_to_precedence || switch_due_to_associativity) { // since we only use static memory, we need to switch the operands around // without allocating more expressions, thus we have to reuse the allocation // slots of the previous expressions. struct Expression* operands[3] = { left, right_binary.left_operand, right_binary.right_operand }; struct Expression* slots[2] = { right, right_binary.right_operand }; struct Expression_Binary_Operator new_binary_operator = parser_merge_into_single_binary_expression(p, operands[0], operation, operands[1]); *slots[0] = (struct Expression){ .kind = EXPRESSION_BINARY_OPERATION, .value = { .binary_operator = new_binary_operator }, .span = span_merge(left->span, right->span), .location = left->location }; *slots[1] = *operands[2]; left = slots[0]; right = slots[1]; operation = right_binary.operation; } } return (struct Expression_Binary_Operator){ .operation = operation, .left_operand = left, .right_operand = right, }; } struct Expression* parser_expression_binary_operation(struct Parser* p, struct Parser_Error* error) { struct Expression* left = CHECK(parser_expression_unary_operation(p, error)); struct Token token = parser_peek(p); enum Binary_Operation operation = binary_operation_from_token(&token); if (operation) { parser_next(p); struct Expression* right = CHECK(parser_expression_binary_operation(p, error)); struct Span span = span_merge(left->span, right->span); struct Expression_Binary_Operator binary_value = parser_merge_into_single_binary_expression(p, left, operation, right); union Expression_Value value = { .binary_operator = binary_value }; return expression_new(EXPRESSION_BINARY_OPERATION, value, span, left->location); } return left; } struct Expression* parser_expression(struct Parser* p, struct Parser_Error* error) { return parser_expression_binary_operation(p, error); } struct Statement* parser_statement_declaration(struct Parser* p, struct Parser_Error* error) { struct String_Array names = string_array_new(); struct Span span = { 0 }; struct Cursor location = parser_peek(p).location; for (;;) { struct Token name_token = parser_need(p, TOKEN_NAME, error); if (!parser_error_is_none(error)) return nil; span = span_is_empty(span) ? name_token.span : span_merge(span, name_token.span); string_array_add(&names, name_token.value.name); struct Token next = parser_peek(p); if (next.kind == TOKEN_NAME) break; if (next.kind == TOKEN_COMMA) parser_next(p); } // for now, type is always required. struct Type_Node type = CHECK(parser_node_type(p, error)); CHECK(parser_need(p, TOKEN_ASSIGN, error)); struct Expression* initializer = CHECK(parser_expression(p, error)); span = span_merge(span, initializer->span); union Statement_Value value = { .declaration = { .names = names, .type = type, .initializer = initializer, }, }; return statement_new(STATEMENT_DECLARATION, value, span, location); } struct Statement* parser_statement_conditional(struct Parser* p, struct Parser_Error* error) { struct Statement_Value_Conditional conditional = { 0 }; struct Token if_token = parser_need(p, TOKEN_WORD_IF, error); // primary if condition + block. struct Expression* if_condition = CHECK(parser_expression(p, error)); struct Block_Node then_block = CHECK(parser_block_node(p, error)); conditional.conditions[conditional.condition_count++] = (struct Statement_Conditional_Branch){ .when = if_condition, .then = then_block, }; struct Span span = span_merge(if_token.span, then_block.span); while (parser_probe(p, TOKEN_WORD_ELSE)) { check(conditional.condition_count < STATEMENT_VALUE_CONDITIONAL_MAX, "too many conditional branches"); parser_next(p); struct Statement_Conditional_Branch branch = { 0 }; if (parser_probe(p, TOKEN_WORD_IF)) { // else if condition + block. parser_next(p); struct Expression* else_condition = CHECK(parser_expression(p, error)); struct Block_Node else_block = CHECK(parser_block_node(p, error)); branch = (struct Statement_Conditional_Branch){ .when = else_condition, .then = else_block, }; } else { // else block. struct Block_Node else_block = CHECK(parser_block_node(p, error)); branch = (struct Statement_Conditional_Branch){ .when = nil, .then = else_block, }; } conditional.conditions[conditional.condition_count++] = branch; span = span_merge(span, branch.then.span); } return statement_new( STATEMENT_CONDITIONAL, (union Statement_Value){ .conditional = conditional }, span, if_token.location); } struct Statement* parser_statement_loop(struct Parser* p, struct Parser_Error* error) { struct Token for_token = CHECK(parser_need(p, TOKEN_WORD_FOR, error)); // these are the possible for loop variants: // * `for String name = collection {}`, as iteration over container // * `for check() {}`, as a simple while-style loop // * `for u8 i = 0, i < 10, i++ {}`, as a c-style semi-semi loop // * `for {}`, as an infinite loop struct Statement* declaration = nil; struct Expression *condition = nil, *iteration = nil; // c-style or iterator-style if (parser_could_be_variable_declaration(p)) { declaration = CHECK(parser_statement_declaration(p, error)); // c-style if (parser_probe(p, TOKEN_COMMA)) { parser_next(p); condition = CHECK(parser_expression(p, error)); CHECK(parser_need(p, TOKEN_COMMA, error)); iteration = CHECK(parser_expression(p, error)); } } // while-style if (!parser_probe(p, TOKEN_CURLY_OPEN)) condition = CHECK(parser_expression(p, error)); struct Block_Node body = CHECK(parser_block_node(p, error)); struct Span span = span_merge(for_token.span, body.span); union Statement_Value value = { .loop = { .declaration = declaration, .condition = condition, .iteration = iteration, .body = body, }, }; return statement_new(STATEMENT_LOOP, value, span, for_token.location); } struct Statement* parser_statement(struct Parser* p, struct Parser_Error* error) { struct Token token = parser_peek(p); // skip empty statements. while (token_ends_statement(&token)) { parser_next(p); token = parser_peek(p); } if (parser_could_be_variable_declaration(p)) return parser_statement_declaration(p, error); if (token_is(&token, TOKEN_WORD_IF)) return parser_statement_conditional(p, error); if (token_is(&token, TOKEN_WORD_FOR)) return parser_statement_loop(p, error); if (token_is(&token, TOKEN_CURLY_OPEN)) { // a block statement. struct Block_Node block = CHECK(parser_block_node(p, error)); return statement_new( STATEMENT_BLOCK, (union Statement_Value){ .block = { block } }, block.span, block.location); } struct Expression* expression = CHECK(parser_expression(p, error)); // expand by one byte to include the statement terminator. struct Span span = span_expand(expression->span, 1); union Statement_Value value = { .expression.inner = expression }; return statement_new(STATEMENT_EXPRESSION, value, span, expression->location); } // parse the lexer tokens into a single AST. // note: it was either `parser_parse` or this. :) struct Tree parser_do_your_thing(struct Parser* p, struct Parser_Error* error) { struct Statement* head = nil; struct Statement* current = nil; while (!p->lexer->eof) { struct Statement* next = CHECK_RETURN(parser_statement(p, error), struct Tree); CHECK_RETURN(parser_end_statement(p, error), struct Tree); if (current) { current->next = next; } else { head = next; } current = next; } *error = parser_error_none(); return (struct Tree){ head }; } #undef CHECK #undef CHECK_RETURN