diff options
Diffstat (limited to 'boot')
| -rw-r--r-- | boot/catboot.c | 18 | ||||
| -rw-r--r-- | boot/lex.c | 48 | ||||
| -rw-r--r-- | boot/parse.c | 470 | ||||
| -rw-r--r-- | boot/tree.c | 149 |
4 files changed, 685 insertions, 0 deletions
diff --git a/boot/catboot.c b/boot/catboot.c index fec47c0..23b649e 100644 --- a/boot/catboot.c +++ b/boot/catboot.c @@ -23,6 +23,7 @@ #include "common.c" #include "lex.c" #include "tree.c" +#include "parse.c" const ascii* read_file(const ascii* path) @@ -57,10 +58,27 @@ main(const int32 argc, const ascii* argv[]) lexer_new(&lexer, source); struct Token token; + printf("tokens: "); do { token = lexer_next(&lexer); printf("%s ", token_kind_to_string(token.kind)); } while (token.kind != TOKEN_END_OF_FILE); + printf("\n"); + + // reset lexer + lexer_new(&lexer, source); + + struct Parser parser; + parser_new(&parser, &lexer); + + struct Parser_Error parser_error = parser_error_none(); + struct Tree tree = parser_do_your_thing(&parser, &parser_error); + if (!parser_error_is_none(&parser_error)) { + printf("parser error: %s\n", parser_error_to_string(&parser_error)); + return EXIT_FAILURE; + } + + tree_print(&tree); return EXIT_SUCCESS; } diff --git a/boot/lex.c b/boot/lex.c index 9a94c55..e8432f6 100644 --- a/boot/lex.c +++ b/boot/lex.c @@ -28,6 +28,30 @@ span_width(Pos start, uint width) return (struct Span){ .start = start, .end = start + width }; } +// create a span which encompasses two spans. +struct Span +span_merge(struct Span a, struct Span b) +{ + return (struct Span){ + .start = a.start < b.start ? a.start : b.start, + .end = a.end > b.end ? a.end : b.end, + }; +} + +// expand a span by a number of bytes. +// negative number expands the span to the left, +// positive number expands the span to the right. +struct Span +span_expand(struct Span span, integer by) +{ + uint start_expand = by < 0 ? -by : 0; + uint end_expand = by > 0 ? by : 0; + return (struct Span){ + .start = span.start - start_expand, + .end = span.end + end_expand, + }; +} + // a cursor position placed within a text file. struct Cursor { @@ -235,6 +259,30 @@ token_simple(enum Token_Kind kind, Pos pos, struct Cursor location) return token_wide(kind, (struct Span){ pos, pos }, location); } +struct Token +token_none() +{ + return token_simple(TOKEN_NONE, 0, (struct Cursor){ 0 }); +} + +bool +token_is(const struct Token* t, enum Token_Kind kind) +{ + return t->kind == kind; +} + +bool +token_is_empty(const struct Token* t) +{ + return token_is(t, TOKEN_NONE); +} + +bool +token_ends_statement(const struct Token* t) +{ + return token_is(t, TOKEN_END_OF_FILE) || token_is(t, TOKEN_NEWLINE); +} + bool ascii_in_range(ascii c, ascii from, ascii to) { diff --git a/boot/parse.c b/boot/parse.c new file mode 100644 index 0000000..38cb712 --- /dev/null +++ b/boot/parse.c @@ -0,0 +1,470 @@ +/* + * very simple handwritten recursive descent + * parser for a catskill source file. + */ + +#define PARSER_LOOKAHEAD 2 + +#define CHECK(parse) \ + parse; \ + if (!parser_error_is_none(error)) return nil; + +struct Parser_Error +{ + enum Parser_Error_Kind + { + PARSER_ERROR_NONE, + PARSER_ERROR_UNEXPECTED_TOKEN, + PARSER_ERROR_UNEXPECTED_EOF, + PARSER_ERROR_EXPECTED_STATEMENT_END, + PARSER_ERROR_EXPECTED_PRIMARY_EXPRESSION, + } kind; + // TODO: add span to error +}; + +struct Parser_Error +parser_error(enum Parser_Error_Kind kind) +{ + return (struct Parser_Error){ kind }; +} + +struct Parser_Error +parser_error_none() +{ + return parser_error(PARSER_ERROR_NONE); +} + +bool +parser_error_is_none(const struct Parser_Error* error) +{ + return error->kind == PARSER_ERROR_NONE; +} + +const ascii* +parser_error_to_string(const struct Parser_Error* error) +{ + switch (error->kind) { + case PARSER_ERROR_NONE: + return "none"; + case PARSER_ERROR_UNEXPECTED_TOKEN: + return "unexpected token"; + case PARSER_ERROR_UNEXPECTED_EOF: + return "unexpected end of file"; + case PARSER_ERROR_EXPECTED_STATEMENT_END: + return "expected statement end"; + case PARSER_ERROR_EXPECTED_PRIMARY_EXPRESSION: + return "expected primary expression"; + default: + return "unknown error"; + } +} + +struct Parser +{ + struct Lexer* lexer; + struct Token lookahead[PARSER_LOOKAHEAD]; +}; + +void +parser_new(struct Parser* p, struct Lexer* lexer) +{ + p->lexer = lexer; + memset(p->lookahead, 0, sizeof(p->lookahead)); +} + +bool +parser_lookahead_pop(struct Parser* p, struct Token* token) +{ + struct Token head = p->lookahead[0]; + if (token_is_empty(&head)) return false; + + for (uint i = 0; i < PARSER_LOOKAHEAD - 1; i++) p->lookahead[i] = p->lookahead[i + 1]; + p->lookahead[PARSER_LOOKAHEAD - 1] = token_none(); + + *token = head; + return true; +} + +bool +parser_lookahead_push(struct Parser* p, struct Token token) +{ + for (uint i = 0; i < PARSER_LOOKAHEAD; i++) { + if (token_is_empty(&p->lookahead[i])) { + p->lookahead[i] = token; + return true; + } + } + return false; +} + +// advance the token stream and return the next token. +struct Token +parser_next(struct Parser* p) +{ + struct Token token; + if (!parser_lookahead_pop(p, &token)) token = lexer_next(p->lexer); + return token; +} + +// advance the token stream if the token is of the right type. +// if not, return none-token and set the error. +struct Token +parser_need(struct Parser* p, enum Token_Kind kind, struct Parser_Error* error) +{ + struct Token token = parser_next(p); + if (!token_is(&token, kind)) { + *error = parser_error(PARSER_ERROR_UNEXPECTED_TOKEN); + return token_none(); + } + return token; +} + +// peek at the token stream at given index, without advancing it. +// index 0 is the token that would be returned by a next `parser_next` call. +// peek limit is defined by `PARSER_LOOKAHEAD`. +struct Token +parser_peek_at(struct Parser* p, uint index) +{ + check(index < 2, "parser peek index out of range"); + + while (token_is_empty(&p->lookahead[index])) { + struct Token next_token = lexer_next(p->lexer); + parser_lookahead_push(p, next_token); + } + + return p->lookahead[index]; +} + +// peek at the next token in the stream, without advancing it. +// synonym for `parser_peek_at(p, 0)`. +struct Token +parser_peek(struct Parser* p) +{ + return parser_peek_at(p, 0); +} + +// peek at the token one beyond the current cursor in the stream, without advancing it. +// synonym for `parser_peek_at(p, 1)`. +struct Token +parser_peek_further(struct Parser* p) +{ + return parser_peek_at(p, 1); +} + +// check if the next token is of a given type. +bool +parser_probe(struct Parser* p, enum Token_Kind kind) +{ + struct Token token = parser_peek(p); + return token_is(&token, kind); +} + +struct Expression* parser_expression(struct Parser* p, struct Parser_Error* error); + +struct Expression* +parser_expression_primary_name(struct Parser* p, struct Parser_Error* error) +{ + struct Token token = CHECK(parser_need(p, TOKEN_NAME, error)); + union Expression_Value value = { .name = { token.value.name } }; + return expression_new(EXPRESSION_NAME, value, token.span, token.location); +} + +struct Expression* +parser_expression_primary_integer(struct Parser* p, struct Parser_Error* error) +{ + struct Token token = CHECK(parser_need(p, TOKEN_LITERAL_INTEGER, error)); + union Expression_Value value = { .integer_literal = { token.value.literal_integer } }; + return expression_new(EXPRESSION_INTEGER_LITERAL, value, token.span, token.location); +} + +struct Expression* +parser_expression_primary_float(struct Parser* p, struct Parser_Error* error) +{ + struct Token token = CHECK(parser_need(p, TOKEN_LITERAL_FLOAT, error)); + union Expression_Value value = { .float_literal = { token.value.literal_float } }; + return expression_new(EXPRESSION_FLOAT_LITERAL, value, token.span, token.location); +} + +struct Expression* +parser_expression_primary_string(struct Parser* p, struct Parser_Error* error) +{ + struct Token token = CHECK(parser_need(p, TOKEN_LITERAL_STRING, error)); + union Expression_Value value = { .string_literal = { token.value.literal_string } }; + return expression_new(EXPRESSION_STRING_LITERAL, value, token.span, token.location); +} + +struct Expression* +parser_expression_primary_group(struct Parser* p, struct Parser_Error* error) +{ + struct Token start_token = CHECK(parser_need(p, TOKEN_ROUND_OPEN, error)); + struct Expression* expression = CHECK(parser_expression(p, error)); + struct Token end_token = CHECK(parser_need(p, TOKEN_ROUND_OPEN, error)); + + struct Span span = span_merge(start_token.span, end_token.span); + union Expression_Value value = { .group = { expression } }; + return expression_new(EXPRESSION_GROUP, value, span, start_token.location); +} + +struct Expression* +parser_expression_primary(struct Parser* p, struct Parser_Error* error) +{ + struct Token token = parser_peek(p); + switch (token.kind) { + case TOKEN_NAME: + return parser_expression_primary_name(p, error); + case TOKEN_LITERAL_INTEGER: + return parser_expression_primary_integer(p, error); + case TOKEN_LITERAL_FLOAT: + return parser_expression_primary_float(p, error); + case TOKEN_LITERAL_STRING: + return parser_expression_primary_string(p, error); + case TOKEN_ROUND_OPEN: + return parser_expression_primary_group(p, error); + // TODO: boolean literals. + default: + *error = parser_error(PARSER_ERROR_EXPECTED_PRIMARY_EXPRESSION); + return nil; + } +} + +struct Expression* +parser_expression_member(struct Parser* p, struct Parser_Error* error) +{ + struct Expression* left = CHECK(parser_expression_primary(p, error)); + + // NOTE: see `parser_expression_postfix_call`. + while (parser_probe(p, TOKEN_DOT)) { + parser_next(p); + struct Token name_token = CHECK(parser_need(p, TOKEN_NAME, error)); + struct String name = name_token.value.name; + + struct Span span = span_merge(left->span, name_token.span); + union Expression_Value value = { .member = { left, name } }; + left = expression_new(EXPRESSION_MEMBER, value, span, name_token.location); + } + + return left; +} + +struct Expression* +parser_expression_postfix_call( + struct Parser* p, struct Expression* subject, struct Parser_Error* error) +{ + // NOTE: because of the way the parser works, we have to parse all subsequent + // call expressions in the same loop. + // this is the case with an expression like `meow_function()(123)`, + // where the hypothetical `meow_function` returns a function pointer. + while (parser_probe(p, TOKEN_ROUND_OPEN)) { + parser_next(p); + + struct Expression *arguments_head = nil, *arguments_current = nil; + while (!parser_probe(p, TOKEN_ROUND_CLOSE)) { + struct Expression* argument = CHECK(parser_expression(p, error)); + if (!arguments_head) + arguments_head = argument; + else + arguments_current->next = argument; + arguments_current = argument; + if (parser_probe(p, TOKEN_COMMA)) parser_next(p); + } + + struct Token token = CHECK(parser_need(p, TOKEN_ROUND_CLOSE, error)); + struct Span span = span_merge(subject->span, token.span); + union Expression_Value value = { .call = { subject, arguments_head } }; + subject = expression_new(EXPRESSION_CALL, value, span, token.location); + } + + return subject; +} + +struct Expression* +parser_expression_postfix_subscript( + struct Parser* p, struct Expression* subject, struct Parser_Error* error) +{ + // NOTE: see `parser_expression_postfix_call`. + while (parser_probe(p, TOKEN_SQUARE_OPEN)) { + parser_next(p); + + struct Expression* index = CHECK(parser_expression(p, error)); + struct Token token = CHECK(parser_need(p, TOKEN_SQUARE_CLOSE, error)); + + struct Span span = span_merge(subject->span, span_merge(index->span, token.span)); + union Expression_Value value = { .subscript = { subject, index } }; + subject = expression_new(EXPRESSION_SUBSCRIPT, value, span, token.location); + } + return subject; +} + +struct Expression* +parser_expression_postfix(struct Parser* p, struct Parser_Error* error) +{ + struct Expression* expression = CHECK(parser_expression_member(p, error)); + + switch (parser_peek(p).kind) { + case TOKEN_ROUND_OPEN: + return parser_expression_postfix_call(p, expression, error); + case TOKEN_SQUARE_OPEN: + return parser_expression_postfix_subscript(p, expression, error); + default: + return expression; + } +} + +struct Expression* +parser_expression_unary_operation(struct Parser* p, struct Parser_Error* error) +{ + struct Token token = parser_peek(p); + enum Unary_Operation operation = unary_operation_from_token(&token); + if (operation) { + parser_next(p); + struct Expression* operand = CHECK(parser_expression_unary_operation(p, error)); + + struct Span span = span_merge(token.span, operand->span); + union Expression_Value value = { .unary_operator = { operation, operand } }; + return expression_new(EXPRESSION_UNARY_OPERATION, value, span, token.location); + } + + return parser_expression_postfix(p, error); +} + +// given two expressions and some kind of binary operation between them, +// merge them into a single binary expression, with attention to operator precedence +// and associativity. +struct Expression_Binary_Operator +parser_merge_into_single_binary_expression( + struct Parser* p, struct Expression* left, enum Binary_Operation operation, + struct Expression* right) +{ + // NOTE: due to the parser structure, the left expression is never a binary operation + // so we can worry about fixing up the right side only. + check(left->kind != EXPRESSION_BINARY_OPERATION, "left expression is a binary operation"); + if (right->kind == EXPRESSION_BINARY_OPERATION) { + struct Expression_Binary_Operator right_binary = right->value.binary_operator; + + uint right_precedence = binary_operation_precedence(right_binary.operation); + uint precedence = binary_operation_precedence(operation); + + bool switch_due_to_precedence = right_precedence < precedence; + bool switch_due_to_associativity = + precedence == right_precedence + && binary_operation_associativity(operation) == BINARY_ASSOCIATIVITY_LEFT; + + // essentially check if an expression like `a ~ (b ~ c)` needs to be + // switched into `(a ~ b) ~ c` or not. + if (switch_due_to_precedence || switch_due_to_associativity) { + // since we only use static memory, we need to switch the operands around + // without allocating more expressions, thus we have to reuse the allocation + // slots of the previous expressions. + struct Expression* operands[3] = { + left, right_binary.left_operand, right_binary.right_operand + }; + struct Expression* slots[2] = { right, right_binary.right_operand }; + + struct Expression_Binary_Operator new_binary_operator = + parser_merge_into_single_binary_expression(p, operands[0], operation, operands[1]); + *slots[0] = (struct Expression){ + .kind = EXPRESSION_BINARY_OPERATION, + .value = { .binary_operator = new_binary_operator }, + .span = span_merge(left->span, right->span), + .location = left->location + }; + + *slots[1] = *operands[2]; + + left = slots[0]; + right = slots[1]; + operation = right_binary.operation; + } + } + + return (struct Expression_Binary_Operator){ + .operation = operation, + .left_operand = left, + .right_operand = right, + }; +} + +struct Expression* +parser_expression_binary_operation(struct Parser* p, struct Parser_Error* error) +{ + struct Expression* left = CHECK(parser_expression_unary_operation(p, error)); + + struct Token token = parser_peek(p); + enum Binary_Operation operation = binary_operation_from_token(&token); + if (operation) { + parser_next(p); + struct Expression* right = CHECK(parser_expression_binary_operation(p, error)); + + struct Span span = span_merge(left->span, right->span); + struct Expression_Binary_Operator binary_value = + parser_merge_into_single_binary_expression(p, left, operation, right); + union Expression_Value value = { .binary_operator = binary_value }; + return expression_new(EXPRESSION_BINARY_OPERATION, value, span, left->location); + } + + return left; +} + +struct Expression* +parser_expression(struct Parser* p, struct Parser_Error* error) +{ + return parser_expression_binary_operation(p, error); +} + +void +parser_end_statement(struct Parser* p, struct Parser_Error* error) +{ + struct Token token = parser_peek(p); + if (!token_ends_statement(&token)) { + *error = parser_error(PARSER_ERROR_EXPECTED_STATEMENT_END); + return; + } + parser_next(p); +} + +struct Statement* +parser_statement(struct Parser* p, struct Parser_Error* error) +{ + // skip empty statements. + struct Token token = parser_peek(p); + if (token_ends_statement(&token)) { + parser_next(p); + return nil; + } + + // TODO: no statements for now, just go straight to expressions. + struct Expression* expression = CHECK(parser_expression(p, error)); + + CHECK(parser_end_statement(p, error)); + + // expand by one byte to include the statement terminator. + struct Span span = span_expand(expression->span, 1); + union Statement_Value value = { .expression = expression }; + return statement_new(STATEMENT_EXPRESSION, value, span, expression->location); +} + +// parse the lexer tokens into a single AST. +// note: it was either `parser_parse` or this. :) +struct Tree +parser_do_your_thing(struct Parser* p, struct Parser_Error* error) +{ + struct Statement* head = nil; + + struct Statement* current = nil; + while (!p->lexer->eof) { + struct Statement* next = parser_statement(p, error); + if (!parser_error_is_none(error)) return (struct Tree){ nil }; + + if (current) { + current->next = next; + } else { + head = next; + } + current = next; + } + + *error = parser_error_none(); + return (struct Tree){ head }; +} + +#undef CHECK \ No newline at end of file diff --git a/boot/tree.c b/boot/tree.c index 339f12c..4fb4b73 100644 --- a/boot/tree.c +++ b/boot/tree.c @@ -4,12 +4,31 @@ enum Unary_Operation { + UNARY_NONE, + UNARY_MINUS, UNARY_NOT, UNARY_BITWISE_NOT, }; +enum Unary_Operation +unary_operation_from_token(const struct Token* token) +{ + switch (token->kind) { + case TOKEN_MINUS: + return UNARY_MINUS; + case TOKEN_NOT: + return UNARY_NOT; + // TODO: tilde token + // case TOKEN_TILDE: + // return UNARY_BITWISE_NOT; + + default: + return UNARY_NONE; + } +} + const ascii* unary_operation_to_string(enum Unary_Operation operation) { @@ -28,6 +47,8 @@ unary_operation_to_string(enum Unary_Operation operation) enum Binary_Operation { + BINARY_NONE, + BINARY_PLUS, BINARY_MINUS, BINARY_MULTIPLY, @@ -50,6 +71,101 @@ enum Binary_Operation BINARY_BITWISE_RIGHT_SHIFT, }; +enum Binary_Operation +binary_operation_from_token(const struct Token* token) +{ + switch (token->kind) { + case TOKEN_PLUS: + return BINARY_PLUS; + case TOKEN_MINUS: + return BINARY_MINUS; + case TOKEN_STAR: + return BINARY_MULTIPLY; + case TOKEN_SLASH: + return BINARY_DIVIDE; + // TODO: percent token + // case TOKEN_PERCENT: + // return BINARY_MODULO; + + case TOKEN_EQUAL: + return BINARY_EQUAL; + case TOKEN_NOT_EQUAL: + return BINARY_NOT_EQUAL; + case TOKEN_GREATER: + return BINARY_GREATER_THAN; + case TOKEN_GREATER_EQUAL: + return BINARY_GREATER_THAN_EQUAL; + case TOKEN_LESS: + return BINARY_LESS_THAN; + case TOKEN_LESS_EQUAL: + return BINARY_LESS_THAN_EQUAL; + case TOKEN_AND: + return BINARY_AND; + case TOKEN_OR: + return BINARY_OR; + + default: + return BINARY_NONE; + } +} + +// return the precedence of the given binary operation. +// lower values are weaker, with 1 being the weakest. +// the values are taken mostly the same as other C-family languages. +uint +binary_operation_precedence(enum Binary_Operation operation) +{ + switch (operation) { + // weakest + case BINARY_OR: + return 1; + case BINARY_AND: + return 2; + case BINARY_BITWISE_OR: + return 3; + case BINARY_BITWISE_XOR: + return 4; + case BINARY_BITWISE_AND: + return 5; + case BINARY_EQUAL: + case BINARY_NOT_EQUAL: + return 6; + case BINARY_GREATER_THAN: + case BINARY_GREATER_THAN_EQUAL: + case BINARY_LESS_THAN: + case BINARY_LESS_THAN_EQUAL: + return 7; + case BINARY_BITWISE_LEFT_SHIFT: + case BINARY_BITWISE_RIGHT_SHIFT: + return 8; + case BINARY_PLUS: + case BINARY_MINUS: + return 9; + case BINARY_MULTIPLY: + case BINARY_DIVIDE: + case BINARY_MODULO: + return 10; + // strongest + + default: + return 0; + } +} + +enum Binary_Operation_Associativity +{ + BINARY_ASSOCIATIVITY_NONE, + BINARY_ASSOCIATIVITY_LEFT, + BINARY_ASSOCIATIVITY_RIGHT, +}; + +enum Binary_Operation_Associativity +binary_operation_associativity(enum Binary_Operation operation) +{ + // all operations are left associative by default. + return BINARY_ASSOCIATIVITY_LEFT; +} + const ascii* binary_operation_to_string(enum Binary_Operation operation) { @@ -208,6 +324,23 @@ struct Expression REGION(struct Expression, expression) +struct Expression* +expression_new( + enum Expression_Kind kind, union Expression_Value value, struct Span span, + struct Cursor location) +{ + check(region_expression_cursor < REGION_SIZE, "out of expression memory"); + struct Expression* expression = ®ion_expression[region_expression_cursor++]; + *expression = (struct Expression){ + .kind = kind, + .value = value, + .span = span, + .location = location, + .next = nil, + }; + return expression; +} + void expression_print(const struct Expression* expression) { @@ -303,6 +436,22 @@ struct Statement REGION(struct Statement, statement) +struct Statement* +statement_new( + enum Statement_Kind kind, union Statement_Value value, struct Span span, struct Cursor location) +{ + check(region_statement_cursor < REGION_SIZE, "out of statement memory"); + struct Statement* statement = ®ion_statement[region_statement_cursor++]; + *statement = (struct Statement){ + .kind = kind, + .value = value, + .span = span, + .location = location, + .next = nil, + }; + return statement; +} + void statement_print(const struct Statement* statement) { |
