Extract lexer and switch token metadata to helpers

2026-05-12 08:42:11 +02:00
parent df7f1f27a1
commit 4cffa1f69a
4 changed files with 336 additions and 330 deletions
--- a/main.c
+++ b/main.c
@@ -8,206 +8,7 @@
 #include <stdarg.h>
 #include "base.c"
 #include "meta_gen.c"
-
-typedef struct Token {
-    Token_Kind kind;
-    char *str;
-    int len;
-
-    char *file;
-    int line, column;
-
-    union {
-        uint64_t u;
-    };
-} Token;
-
-typedef Vec(Token) Token_Array;
-
-typedef struct Lexer {
-    char *at;
-    char *end;
-    char *file;
-    int line;
-    int column;
-} Lexer;
-
-void lex_advance(Lexer *lex) {
-    if (lex->at >= lex->end) {
-        return;
-    }
-
-    if (*lex->at == '\n') {
-        lex->line++;
-        lex->column = 0;
-    } else {
-        lex->column++;
-    }
-    lex->at += 1;
-}
-
-void eat_whitespace(Lexer *lex) {
-    while (lex->at < lex->end) {
-        switch (*lex->at) {
-            case ' ':
-            case '\t':
-            case '\r':
-            case '\n':
-                lex_advance(lex);
-                break;
-            default:
-                return;
-        }
-    }
-}
-
-Lexer make_lexer(char *file, char *src, int len) {
-    Lexer lex = {
-        .at = src,
-        .end = src + len,
-        .file = file,
-        .line = 0,
-        .column = 0,
-    };
-    return lex;
-}
-
-bool lex_peek_is(Lexer *lex, char c) {
-    return lex->at < lex->end && *lex->at == c;
-}
-
-bool lex_match(Lexer *lex, char c) {
-    if (lex_peek_is(lex, c)) {
-        lex_advance(lex);
-        return true;
-    }
-    return false;
-}
-
-Token_Kind lex_repeat_or_assign(Lexer *lex, char repeated_char, Token_Kind single, Token_Kind repeated, Token_Kind assigned) {
-    if (lex_match(lex, repeated_char)) return repeated;
-    if (lex_match(lex, '=')) return assigned;
-    return single;
-}
-
-Token_Kind lex_assign_variant(Lexer *lex, Token_Kind single, Token_Kind assigned) {
-    return lex_match(lex, '=') ? assigned : single;
-}
-
-Token_Kind lex_shift_family(Lexer *lex, char repeated_char, Token_Kind single, Token_Kind single_eq, Token_Kind doubled, Token_Kind doubled_eq) {
-    if (lex_match(lex, repeated_char)) {
-        return lex_match(lex, '=') ? doubled_eq : doubled;
-    }
-    return lex_match(lex, '=') ? single_eq : single;
-}
-
-Token lex_token(Lexer *lex) {
-    eat_whitespace(lex);
-    Token t = {
-        .str = lex->at,
-        .line = lex->line,
-        .column = lex->column,
-        .file = lex->file,
-    };
-
-    if (lex->at >= lex->end) {
-        t.kind = TOK_EOF;
-        t.len = 0;
-        return t;
-    }
-
-    char c = *lex->at;
-
-    if (isdigit(c)) {
-        t.kind = TOK_INT;
-        while (lex->at < lex->end && isdigit(*lex->at)) {
-            lex_advance(lex);
-        }
-
-        // @todo: proper lexing of floats (as well as postfixes)
-        if (lex->at < lex->end && *lex->at == '.') {
-            t.kind = TOK_FLOAT;
-            lex_advance(lex);
-
-            while (lex->at < lex->end && isdigit(*lex->at)) {
-                lex_advance(lex);
-            }
-        }
-
-        if (t.kind == TOK_INT) {
-            t.u = strtoull(t.str, NULL, 10);
-        }
-
-        t.len = (int)(lex->at - t.str);
-        return t;
-    }
-
-    lex_advance(lex);
-
-    switch (c) {
-        case 0: t.kind = TOK_EOF; break;
-        case '(': t.kind = TOK_LPAREN; break;
-        case ')': t.kind = TOK_RPAREN; break;
-        case '[': t.kind = TOK_LBRACKET; break;
-        case ']': t.kind = TOK_RBRACKET; break;
-        case '{': t.kind = TOK_LBRACE; break;
-        case '}': t.kind = TOK_RBRACE; break;
-        case ',': t.kind = TOK_COMMA; break;
-        case '.': t.kind = TOK_DOT; break;
-        case ':': t.kind = TOK_COLON; break;
-        case ';': t.kind = TOK_SEMICOLON; break;
-        case '?': t.kind = TOK_QUESTION; break;
-        case '#': t.kind = TOK_HASH; break;
-        case '+': t.kind = lex_repeat_or_assign(lex, '+', TOK_PLUS, TOK_INC, TOK_PLUS_ASSIGN); break;
-        case '-': {
-            if (lex_match(lex, '-')) t.kind = TOK_DEC;
-            else if (lex_match(lex, '=')) t.kind = TOK_MINUS_ASSIGN;
-            else if (lex_match(lex, '>')) t.kind = TOK_ARROW;
-            else t.kind = TOK_MINUS;
-        } break;
-        case '*': t.kind = lex_assign_variant(lex, TOK_STAR, TOK_MUL_ASSIGN); break;
-        case '/': t.kind = lex_assign_variant(lex, TOK_SLASH, TOK_DIV_ASSIGN); break;
-        case '%': t.kind = lex_assign_variant(lex, TOK_PERCENT, TOK_MOD_ASSIGN); break;
-        case '=': t.kind = lex_assign_variant(lex, TOK_ASSIGN, TOK_EQ); break;
-        case '<': t.kind = lex_shift_family(lex, '<', TOK_LT, TOK_LEQ, TOK_LSHIFT, TOK_LSHIFT_ASSIGN); break;
-        case '>': t.kind = lex_shift_family(lex, '>', TOK_GT, TOK_GEQ, TOK_RSHIFT, TOK_RSHIFT_ASSIGN); break;
-        case '!': t.kind = lex_assign_variant(lex, TOK_NOT, TOK_NEQ); break;
-        case '~': t.kind = TOK_BITNOT; break;
-        case '&': t.kind = lex_repeat_or_assign(lex, '&', TOK_BITAND, TOK_AND, TOK_AND_ASSIGN); break;
-        case '|': t.kind = lex_repeat_or_assign(lex, '|', TOK_BITOR, TOK_OR, TOK_OR_ASSIGN); break;
-        case '^': t.kind = lex_assign_variant(lex, TOK_BITXOR, TOK_XOR_ASSIGN); break;
-        default: {
-            // @todo: lexer perhaps should have a static buffer of size 1024, error message
-            // should be put there and piped to the upper program. The token should be filled
-            // with that message
-            t.kind = TOK_ERROR;
-        }
-    }
-
-    t.len = (int)(lex->at - t.str);
-    return t;
-}
-
-void assert_token(Token t, Token_Kind kind, char *text, int line, int column) {
-    assert(t.kind == kind);
-    assert(t.line == line);
-    assert(t.column == column);
-    assert(t.len == (int)strlen(text));
-    assert(strncmp(t.str, text, t.len) == 0);
-}
-
-Token_Array lex_file(char *file, char *src, int len) {
-    Lexer lex = make_lexer(file, src, len);
-    Token_Array result = {0};
-    for (;;) {
-        Token token = lex_token(&lex);
-        vec_push(&result, token);
-        if (token.kind == TOK_EOF) {
-            break;
-        }
-    }
-    return result;
-}
+#include "lex.c"

 typedef struct Parser {
    Token *at;
@@ -256,7 +57,7 @@ Token *expect_token(Parser *p, Token_Kind kind) {
    if (p->at->kind == kind) {
        return next_token(p);
    }
-    panicf("expected token kind: %s, got instead: %s", token_to_name[p->at->kind], token_to_name[kind]);
+    panicf("expected token kind: %s, got instead: %s", token_to_name(kind), token_to_name(p->at->kind));
 }

 Ast *create_ast(Token *token, Ast_Kind kind) {
@@ -287,7 +88,7 @@ Ast *parse_atom(Parser *p) {
        n = parse_expr(p, 0);
        expect_token(p, TOK_RPAREN);
    } else {
-        panicf("unknown token in %s. %.*s (%s/%d), ", __FUNCTION__, token->len, token->str, token_to_name[token->kind], token->kind);
+        panicf("unknown token in %s. %.*s (%s/%d), ", __FUNCTION__, token->len, token->str, token_to_name(token->kind), token->kind);
    }
    return n;
 }
@@ -315,7 +116,7 @@ Ast *parse_valid_left_binding(Parser *p, Token *tok, Ast *left) {
        case TOK_BITOR: case TOK_BITXOR: case TOK_AND: case TOK_OR: case TOK_LSHIFT: case TOK_RSHIFT: {
            return create_binary_expr(tok, tok->kind, left, parse_expr(p, get_binding_power(tok)));
        } break;
-        default: panicf("unknown token in %s. %.*s (%s/%d), ", __FUNCTION__, tok->len, tok->str, token_to_name[tok->kind], tok->kind);
+        default: panicf("unknown token in %s. %.*s (%s/%d), ", __FUNCTION__, tok->len, tok->str, token_to_name(tok->kind), tok->kind);
    }
    return NULL;
 }
@@ -366,7 +167,7 @@ void print_expr(Ast *n) {
        case AST_INT: printf("%lu", n->u); break;
        case AST_BINARY: {
            print_expr(n->l);
-            printf(" %s ", token_to_op[n->op]);
+            printf(" %s ", token_to_op(n->op));
            print_expr(n->r);
        } break;
        default: panicf("encountered invalid ast kind in %s of kind: %d\n", __FUNCTION__, n->kind);
@@ -415,18 +216,17 @@ void parser_test(void) {
    TEST_EVAL((2+3)*(4+5));
    TEST_EVAL(9>3&1);
    TEST_EVAL(8|1<4);
-
-    // TEST_EVAL(7<=3+4);
-    // TEST_EVAL(8>=2*4);
-    // TEST_EVAL(4==2+2);
-    // TEST_EVAL(5!=2+2);
-    // TEST_EVAL(1&&2);                                                                                                                                            
-    // TEST_EVAL(0||3);                                                                                                                                            
-    // TEST_EVAL(1||0&&0);                                                                                                                                         
-    // TEST_EVAL(8<<2);                                                                                                                                            
-    // TEST_EVAL(32>>3);                                                                                                                                           
-    // TEST_EVAL(1+2<<3);                                                                                                                                          
-    // TEST_EVAL(16>>1+1);   
+    TEST_EVAL(7<=3+4);
+    TEST_EVAL(8>=2*4);
+    TEST_EVAL(4==2+2);
+    TEST_EVAL(5!=2+2);
+    TEST_EVAL(1&&2);
+    TEST_EVAL(0||3);
+    TEST_EVAL(1||0&&0);
+    TEST_EVAL(8<<2);
+    TEST_EVAL(32>>3);
+    TEST_EVAL(1+2<<3);
+    TEST_EVAL(16>>1+1);

 #pragma clang diagnostic pop