/* * simple advancing lexer * invoked until completion by parser. */ #define MAX_CHAR_BUFFER_SIZE 256 // byte position within a file. typedef uint64 Pos; // an extent of text within a file, // defined by its start and end byte positions. struct Span { Pos start; Pos end; }; struct Span span_new(Pos start, Pos end) { return (struct Span){ .start = start, .end = end }; } struct Span span_width(Pos start, uint width) { return (struct Span){ .start = start, .end = start + width }; } // a cursor position placed within a text file. struct Cursor { uint line; uint column; }; // what kind of token is it? enum Token_Kind { TOKEN_NONE, TOKEN_SOMETHING, TOKEN_END_OF_FILE, TOKEN_NEWLINE, TOKEN_NAME, TOKEN_LITERAL_INTEGER, TOKEN_LITERAL_FLOAT, TOKEN_LITERAL_STRING, TOKEN_WORD_FUN, TOKEN_WORD_IF, TOKEN_WORD_ELSE, TOKEN_WORD_FOR, TOKEN_WORD_LOOP, TOKEN_WORD_BREAK, TOKEN_WORD_CONTINUE, TOKEN_WORD_DEFER, TOKEN_WORD_SWITCH, TOKEN_WORD_RETURN, TOKEN_WORD_VAR, TOKEN_WORD_TYPE, TOKEN_ROUND_OPEN, TOKEN_ROUND_CLOSE, TOKEN_CURLY_OPEN, TOKEN_CURLY_CLOSE, TOKEN_SQUARE_OPEN, TOKEN_SQUARE_CLOSE, TOKEN_COMMA, TOKEN_ASSIGN, TOKEN_AMPERSAND, TOKEN_DOT, TOKEN_PIPE, TOKEN_PLUS, TOKEN_MINUS, TOKEN_STAR, TOKEN_SLASH, TOKEN_AND, TOKEN_OR, TOKEN_NOT, TOKEN_EQUAL, TOKEN_NOT_EQUAL, TOKEN_LESS, TOKEN_LESS_EQUAL, TOKEN_GREATER, TOKEN_GREATER_EQUAL, }; const ascii* token_kind_to_string(enum Token_Kind kind) { switch (kind) { case TOKEN_NONE: return "NONE"; case TOKEN_SOMETHING: return "SOMETHING"; case TOKEN_END_OF_FILE: return "END_OF_FILE"; case TOKEN_NEWLINE: return "NEWLINE"; case TOKEN_NAME: return "NAME"; case TOKEN_LITERAL_INTEGER: return "LITERAL_INTEGER"; case TOKEN_LITERAL_FLOAT: return "LITERAL_FLOAT"; case TOKEN_LITERAL_STRING: return "LITERAL_STRING"; case TOKEN_WORD_FUN: return "WORD_FUN"; case TOKEN_WORD_IF: return "WORD_IF"; case TOKEN_WORD_ELSE: return "WORD_ELSE"; case TOKEN_WORD_FOR: return "WORD_FOR"; case TOKEN_WORD_LOOP: return "WORD_LOOP"; case TOKEN_WORD_BREAK: return "WORD_BREAK"; case TOKEN_WORD_CONTINUE: return "WORD_CONTINUE"; case TOKEN_WORD_DEFER: return "WORD_DEFER"; case TOKEN_WORD_SWITCH: return "WORD_SWITCH"; case TOKEN_WORD_RETURN: return "WORD_RETURN"; case TOKEN_WORD_VAR: return "WORD_VAR"; case TOKEN_WORD_TYPE: return "WORD_TYPE"; case TOKEN_ROUND_OPEN: return "ROUND_OPEN"; case TOKEN_ROUND_CLOSE: return "ROUND_CLOSE"; case TOKEN_CURLY_OPEN: return "CURLY_OPEN"; case TOKEN_CURLY_CLOSE: return "CURLY_CLOSE"; case TOKEN_SQUARE_OPEN: return "SQUARE_OPEN"; case TOKEN_SQUARE_CLOSE: return "SQUARE_CLOSE"; case TOKEN_COMMA: return "COMMA"; case TOKEN_ASSIGN: return "ASSIGN"; case TOKEN_AMPERSAND: return "AMPERSAND"; case TOKEN_DOT: return "DOT"; case TOKEN_PIPE: return "PIPE"; case TOKEN_PLUS: return "PLUS"; case TOKEN_MINUS: return "MINUS"; case TOKEN_STAR: return "STAR"; case TOKEN_SLASH: return "SLASH"; case TOKEN_AND: return "AND"; case TOKEN_OR: return "OR"; case TOKEN_NOT: return "NOT"; case TOKEN_EQUAL: return "EQUAL"; case TOKEN_NOT_EQUAL: return "NOT_EQUAL"; case TOKEN_LESS: return "LESS"; case TOKEN_LESS_EQUAL: return "LESS_EQUAL"; case TOKEN_GREATER: return "GREATER"; case TOKEN_GREATER_EQUAL: return "GREATER_EQUAL"; default: return ""; } } // further value optionally carried by some tokens. union Token_Value { ascii unknown_character; int64 literal_integer; float64 literal_float; struct String literal_string; struct String name; }; // a lexical token. struct Token { enum Token_Kind kind; union Token_Value value; struct Span span; struct Cursor location; }; struct Token token_new(enum Token_Kind kind, struct Span span, struct Cursor location, union Token_Value value) { return (struct Token){ .kind = kind, .span = span, .value = value, .location = location, }; } struct Token token_wide(enum Token_Kind kind, struct Span span, struct Cursor location) { return token_new(kind, span, location, (union Token_Value){}); } struct Token token_simple(enum Token_Kind kind, Pos pos, struct Cursor location) { return token_wide(kind, (struct Span){ pos, pos }, location); } bool ascii_in_range(ascii c, ascii from, ascii to) { return c >= from && c <= to; } bool ascii_is_number(ascii c) { return ascii_in_range(c, '0', '9'); } bool ascii_is_name_or_word(ascii c) { return ascii_in_range(c, 'A', 'Z') || ascii_in_range(c, 'a', 'z') || c == '_'; } // the lexical analyzer. // reads tokens from a source file, // and spits them out to the parser, one by one. struct Lexer { struct String source; bool eof; Pos position; struct Cursor cursor; }; // initialize a lexer with a source file. void lexer_new(struct Lexer* l, struct String source) { l->source = source; l->eof = false; l->position = 0; l->cursor = (struct Cursor){ 1, 1 }; } struct Lexer_Char { Pos position; ascii character; bool eof; }; struct Lexer_Char lexer_advance_char(struct Lexer* l) { if (l->position >= string_length(l->source)) return (struct Lexer_Char){ .position = string_length(l->source), .eof = true }; ascii character = string_at(l->source, l->position); l->position += 1; if (character == '\n') { l->cursor.line += 1; l->cursor.column = 1; } else { l->cursor.column += 1; } return (struct Lexer_Char){ .position = l->position - 1, .character = character, .eof = false }; } struct Lexer_Char lexer_peek_char_at(const struct Lexer* l, uint offset) { Pos at = l->position + offset; if (at >= l->source.length) return (struct Lexer_Char){ .position = string_length(l->source), .eof = true }; return (struct Lexer_Char){ .position = at, .character = string_at(l->source, at) }; } struct Lexer_Char lexer_peek_char(const struct Lexer* l) { return lexer_peek_char_at(l, 0); } struct Lexer_Char lexer_peek_char_further(const struct Lexer* l) { return lexer_peek_char_at(l, 1); } struct Lexer_Char_Match { bool got_match; struct Lexer_Char match; }; struct Lexer_Char_Match lexer_match_char(struct Lexer* l, ascii expected) { struct Lexer_Char peek = lexer_peek_char(l); if (peek.character == expected) return (struct Lexer_Char_Match){ true, lexer_advance_char(l) }; return (struct Lexer_Char_Match){}; } struct Lexer_Non_Code { bool had_newline; Pos newline_position; struct Cursor newline_cursor; }; struct Lexer_Non_Code lexer_skip_non_code(struct Lexer* l) { struct Lexer_Non_Code status = { .had_newline = false, .newline_position = {}, .newline_cursor = {}, }; bool in_comment = false; for (;;) { struct Lexer_Char lc = lexer_peek_char(l); if (lc.eof) return status; ascii c = lc.character; if (c == '\n') { if (!status.had_newline) { status.had_newline = true; status.newline_position = lc.position; status.newline_cursor = l->cursor; } in_comment = false; } if (c == '#') { in_comment = true; } bool is_whitespace = c == ' ' || c == '\t' || c == '\n'; if (!is_whitespace && !in_comment) { break; } lexer_advance_char(l); } return status; } struct Lexer_Symbol_Token { enum Token_Kind kind; uint width; }; struct Lexer_Symbol_Token lexer_symbol_token(struct Lexer* l, struct Lexer_Char current) { #define RET return (struct Lexer_Symbol_Token) struct Lexer_Char_Match next; switch (current.character) { case '(': RET{ TOKEN_ROUND_OPEN, 1 }; case ')': RET{ TOKEN_ROUND_CLOSE, 1 }; case '{': RET{ TOKEN_CURLY_OPEN, 1 }; case '}': RET{ TOKEN_CURLY_CLOSE, 1 }; case '[': RET{ TOKEN_SQUARE_OPEN, 1 }; case ']': RET{ TOKEN_SQUARE_CLOSE, 1 }; case ',': RET{ TOKEN_COMMA, 1 }; case '=': next = lexer_match_char(l, '='); if (next.got_match) RET{ TOKEN_EQUAL, 2 }; RET{ TOKEN_ASSIGN, 1 }; case '&': next = lexer_match_char(l, '&'); if (next.got_match) RET{ TOKEN_AND, 2 }; RET{ TOKEN_AMPERSAND, 1 }; case '.': RET{ TOKEN_DOT, 1 }; case '|': next = lexer_match_char(l, '|'); if (next.got_match) RET{ TOKEN_OR, 2 }; RET{ TOKEN_PIPE, 1 }; // todo: increment, decrement, +=, -=, *=, etc. // all the special assignment operations case '+': RET{ TOKEN_PLUS, 1 }; case '-': RET{ TOKEN_MINUS, 1 }; case '*': RET{ TOKEN_STAR, 1 }; case '/': RET{ TOKEN_SLASH, 1 }; case '!': next = lexer_match_char(l, '='); if (next.got_match) RET{ TOKEN_NOT_EQUAL, 2 }; RET{ TOKEN_NOT, 1 }; case '<': next = lexer_match_char(l, '='); if (next.got_match) RET{ TOKEN_LESS_EQUAL, 2 }; RET{ TOKEN_LESS }; case '>': next = lexer_match_char(l, '='); if (next.got_match) RET{ TOKEN_GREATER_EQUAL, 2 }; RET{ TOKEN_GREATER }; default: RET{ TOKEN_NONE, 0 }; } #undef RET } struct Token lexer_string_token(struct Lexer* l) { struct Cursor cursor = l->cursor; Pos position = l->position; struct Lexer_Char_Match start_match = lexer_match_char(l, '"'); check(start_match.got_match, "`lexer_string_token` called on non-string token"); ascii buffer[MAX_CHAR_BUFFER_SIZE]; uint buffer_size = 0; for (;;) { struct Lexer_Char c = lexer_advance_char(l); // todo: return lexer error for these cases. check(!c.eof, "unclosed string"); if (c.character == '"') break; check(buffer_size < MAX_CHAR_BUFFER_SIZE, "string too long to lex"); buffer[buffer_size++] = c.character; } struct String string = string_empty(); if (buffer_size > 0) string = string_new(buffer, buffer_size); union Token_Value value = { .literal_string = string }; struct Span span = span_new(position, l->position); return token_new(TOKEN_LITERAL_STRING, span, cursor, value); } struct Token lexer_number_token(struct Lexer* l) { struct Cursor cursor = l->cursor; Pos position = l->position; ascii buffer[MAX_CHAR_BUFFER_SIZE]; uint buffer_size = 0; for (;;) { struct Lexer_Char c = lexer_peek_char(l); bool is_number_char = ascii_is_number(c.character) || c.character == '.'; if (c.eof || !is_number_char) break; check(buffer_size < MAX_CHAR_BUFFER_SIZE, "number too long to lex"); buffer[buffer_size++] = c.character; lexer_advance_char(l); } check(buffer_size < MAX_CHAR_BUFFER_SIZE, "number too long to lex"); buffer[buffer_size++] = '\0'; enum Token_Kind kind; union Token_Value value; struct Span span = span_new(position, l->position); ascii* dot_exists = strchr(buffer, '.'); if (dot_exists) { float64 literal_float = strtod(buffer, nil); kind = TOKEN_LITERAL_FLOAT; value.literal_float = literal_float; } else { int64 literal_integer = strtoll(buffer, nil, 10); kind = TOKEN_LITERAL_INTEGER; value.literal_integer = literal_integer; } return token_new(kind, span, cursor, value); } enum Token_Kind lexer_word_from_name(struct Lexer* l, struct String word_or_name) { uint32 crc = crc32_posix(word_or_name); // CRC32 values can be checked with `echo -ne "word" | cksum` switch (crc) { case 1373415947: // "fun" return TOKEN_WORD_FUN; case 812472514: // "if" return TOKEN_WORD_IF; case 2588761009: // "else" return TOKEN_WORD_ELSE; case 2652874405: // "for" return TOKEN_WORD_FOR; case 1637870694: // "loop" return TOKEN_WORD_LOOP; case 1007193266: // "break" return TOKEN_WORD_BREAK; case 1827824793: // "continue" return TOKEN_WORD_CONTINUE; case 836542293: // "defer" return TOKEN_WORD_DEFER; case 3635023017: // "switch" return TOKEN_WORD_SWITCH; case 2579962013: // "return" return TOKEN_WORD_RETURN; case 1662845996: // "var" return TOKEN_WORD_VAR; case 91700392: // "type" return TOKEN_WORD_TYPE; default: return TOKEN_NONE; } } struct Token lexer_name_or_word_token(struct Lexer* l) { struct Cursor cursor = l->cursor; Pos position = l->position; ascii buffer[MAX_CHAR_BUFFER_SIZE]; uint buffer_size = 0; for (;;) { struct Lexer_Char c = lexer_peek_char(l); bool is_name_or_word_char = ascii_is_name_or_word(c.character) || ascii_is_number(c.character); if (c.eof || !is_name_or_word_char) break; check(buffer_size < MAX_CHAR_BUFFER_SIZE, "name or word too long to lex"); buffer[buffer_size++] = c.character; lexer_advance_char(l); } check(buffer_size != 0, "`lexer_name_or_word_token` called on non-name/word token"); struct Span span = span_new(position, l->position); struct String name_or_word = string_new(buffer, buffer_size); enum Token_Kind word_kind = lexer_word_from_name(l, name_or_word); if (word_kind != TOKEN_NONE) return token_wide(word_kind, span, cursor); union Token_Value value = { .name = name_or_word }; return token_new(TOKEN_NAME, span, cursor, value); } // return the next token. struct Token lexer_next(struct Lexer* l) { if (l->eof) return token_simple(TOKEN_END_OF_FILE, l->position, l->cursor); struct Lexer_Non_Code non_code = lexer_skip_non_code(l); if (non_code.had_newline) return token_simple(TOKEN_NEWLINE, non_code.newline_position, non_code.newline_cursor); struct Lexer_Char c = lexer_peek_char(l); if (c.eof) { l->eof = true; return token_simple(TOKEN_END_OF_FILE, l->position, l->cursor); } if (c.character == '"') { return lexer_string_token(l); } else if (ascii_is_number(c.character)) { return lexer_number_token(l); } else if (ascii_is_name_or_word(c.character)) { return lexer_name_or_word_token(l); } Pos position = l->position; struct Cursor cursor = l->cursor; c = lexer_advance_char(l); struct Lexer_Symbol_Token symbol_token = lexer_symbol_token(l, c); if (symbol_token.kind != TOKEN_NONE) { struct Span symbol_span = span_width(position, symbol_token.width); return token_wide(symbol_token.kind, symbol_span, cursor); } union Token_Value value = (union Token_Value){ .unknown_character = c.character }; return token_new(TOKEN_SOMETHING, span_width(position, 1), cursor, value); }