smallprojects/lex.c

/*

- [ ] New line splicing, first source preprocessing stage. In order to properly handle '\\' backslash new line, we most likely need to preprocess the source in a initial pass. So at some point we need to introduce a stage that will create a buffer without wrong characters with a line / column mapping data structure.

*/

typedef struct Token {
    Token_Kind kind;
    int len;
    char *str;

    char *file;
    int line, column;

    struct {
        uint8_t preproc : 1;
    };

    union {
        uint64_t u;
        char *intern;
    };
} Token;

typedef Vec(Token) Token_Array;

typedef struct Lexer {
    char *at;
    char *end;
    char *file;
    int line;
    int column;
    uint8_t preproc;
} Lexer;


uint64_t hash_bytes(char *data, size_t len) {
    uint64_t h = 1469598103934665603ull;
    for (size_t i = 0; i < len; i++) {
        h ^= (unsigned char)data[i];
        h *= 1099511628211ull;
    }
    return h;
}

char *global_intern_table[4096];
char intern_arena[4096*6];
int intern_arena_len;

char *lex_alloc_string(int len) {
    char *result = intern_arena + intern_arena_len;
    intern_arena_len += len + 1;
    assert(intern_arena_len < ilen(intern_arena));
    return result;
}

char *make_intern(char *string, int len) {
    uint64_t hash = hash_bytes(string, len);
    int index = hash % ilen(global_intern_table);
    for (int i = 0; i < ilen(global_intern_table); i += 1) {
        if (global_intern_table[index] == NULL) {
            global_intern_table[index] = lex_alloc_string(len + 1);
            memcpy(global_intern_table[index], string, len);
            global_intern_table[index][len] = 0;
            return global_intern_table[index];
        } else if (global_intern_table[index] && (memcmp(global_intern_table[index], string, len) == 0)) {
            return global_intern_table[index];
        }
        index += 1;
        index = index % ilen(global_intern_table);
    }
    assert(!"invalid codepath");
    return NULL;
}

char *lex_first_keyword = NULL;
char *lex_last_keyword = NULL;
#define lex_add_keyword(x) make_intern(x, ilen(x) - 1)

void lex_init_keywords(void) {
    lex_first_keyword = lex_add_keyword("auto");
    lex_add_keyword("break");
    lex_add_keyword("case");
    lex_add_keyword("char");
    lex_add_keyword("const");
    lex_add_keyword("continue");
    lex_add_keyword("default");
    lex_add_keyword("do");
    lex_add_keyword("double");
    lex_add_keyword("else");
    lex_add_keyword("enum");
    lex_add_keyword("extern");
    lex_add_keyword("float");
    lex_add_keyword("for");
    lex_add_keyword("goto");
    lex_add_keyword("if");
    lex_add_keyword("inline");
    lex_add_keyword("int");
    lex_add_keyword("long");
    lex_add_keyword("register");
    lex_add_keyword("restrict");
    lex_add_keyword("return");
    lex_add_keyword("short");
    lex_add_keyword("signed");
    lex_add_keyword("sizeof");
    lex_add_keyword("static");
    lex_add_keyword("struct");
    lex_add_keyword("switch");
    lex_add_keyword("typedef");
    lex_add_keyword("union");
    lex_add_keyword("unsigned");
    lex_add_keyword("void");
    lex_add_keyword("volatile");
    lex_last_keyword = lex_add_keyword("while");
}

bool lex_is_keyword(char *string) {
    bool result = string >= lex_first_keyword && string <= lex_last_keyword;
    return result;
}

void lex_advance(Lexer *lex) {
    if (lex->at >= lex->end) {
        return;
    }

    if (*lex->at == '\n') {
        lex->line++;
        lex->preproc = false;
        lex->column = 0;
    } else {
        lex->column++;
    }

    lex->at += 1;
}

void eat_whitespace(Lexer *lex) {
    while (lex->at < lex->end) {
        switch (*lex->at) {
            case ' ': case '\t': case '\r': case '\n':
                lex_advance(lex);
                break;
            default:
                return;
        }
    }
}

Lexer make_lexer(char *file, char *src, int len) {
    Lexer lex = {
        .at = src,
        .end = src + len,
        .file = file,
        .line = 0,
        .column = 0,
    };
    return lex;
}

bool lex_peek_is(Lexer *lex, char c) {
    return lex->at < lex->end && *lex->at == c;
}

bool lex_match(Lexer *lex, char c) {
    if (lex_peek_is(lex, c)) {
        lex_advance(lex);
        return true;
    }
    return false;
}

Token_Kind lex_repeat_or_assign(Lexer *lex, char repeated_char, Token_Kind single, Token_Kind repeated, Token_Kind assigned) {
    if (lex_match(lex, repeated_char)) return repeated;
    if (lex_match(lex, '=')) return assigned;
    return single;
}

Token_Kind lex_assign_variant(Lexer *lex, Token_Kind single, Token_Kind assigned) {
    return lex_match(lex, '=') ? assigned : single;
}

Token_Kind lex_shift_family(Lexer *lex, char repeated_char, Token_Kind single, Token_Kind single_eq, Token_Kind doubled, Token_Kind doubled_eq) {
    if (lex_match(lex, repeated_char)) {
        return lex_match(lex, '=') ? doubled_eq : doubled;
    }
    return lex_match(lex, '=') ? single_eq : single;
}

Token lex_token(Lexer *lex) {
    eat_whitespace(lex);
    Token t = {
        .str = lex->at,
        .line = lex->line,
        .column = lex->column,
        .file = lex->file,
        .preproc = lex->preproc,
    };

    if (lex->at >= lex->end) {
        t.kind = TOK_EOF;
        t.len = 0;
        return t;
    }

    char c = *lex->at;
    lex_advance(lex);

    switch (c) {
        case 0: t.kind = TOK_EOF; break;
        case '(': t.kind = TOK_LPAREN; break;
        case ')': t.kind = TOK_RPAREN; break;
        case '[': t.kind = TOK_LBRACKET; break;
        case ']': t.kind = TOK_RBRACKET; break;
        case '{': t.kind = TOK_LBRACE; break;
        case '}': t.kind = TOK_RBRACE; break;
        case ',': t.kind = TOK_COMMA; break;
        case '.': t.kind = TOK_DOT; break;
        case ':': t.kind = TOK_COLON; break;
        case ';': t.kind = TOK_SEMICOLON; break;
        case '?': t.kind = TOK_QUESTION; break;
        case '+': t.kind = lex_repeat_or_assign(lex, '+', TOK_PLUS, TOK_INC, TOK_PLUS_ASSIGN); break;
        case '-': {
            if (lex_match(lex, '-')) t.kind = TOK_DEC;
            else if (lex_match(lex, '=')) t.kind = TOK_MINUS_ASSIGN;
            else if (lex_match(lex, '>')) t.kind = TOK_ARROW;
            else t.kind = TOK_MINUS;
        } break;
        case '*': t.kind = lex_assign_variant(lex, TOK_STAR, TOK_MUL_ASSIGN); break;
        case '/': t.kind = lex_assign_variant(lex, TOK_SLASH, TOK_DIV_ASSIGN); break;
        case '%': t.kind = lex_assign_variant(lex, TOK_PERCENT, TOK_MOD_ASSIGN); break;
        case '=': t.kind = lex_assign_variant(lex, TOK_ASSIGN, TOK_EQ); break;
        case '<': t.kind = lex_shift_family(lex, '<', TOK_LT, TOK_LEQ, TOK_LSHIFT, TOK_LSHIFT_ASSIGN); break;
        case '>': t.kind = lex_shift_family(lex, '>', TOK_GT, TOK_GEQ, TOK_RSHIFT, TOK_RSHIFT_ASSIGN); break;
        case '!': t.kind = lex_assign_variant(lex, TOK_NOT, TOK_NEQ); break;
        case '~': t.kind = TOK_BITNOT; break;
        case '&': t.kind = lex_repeat_or_assign(lex, '&', TOK_BITAND, TOK_AND, TOK_AND_ASSIGN); break;
        case '|': t.kind = lex_repeat_or_assign(lex, '|', TOK_BITOR, TOK_OR, TOK_OR_ASSIGN); break;
        case '^': t.kind = lex_assign_variant(lex, TOK_BITXOR, TOK_XOR_ASSIGN); break;

        case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': {
            t.kind = TOK_INT;
            while (lex->at < lex->end && isdigit(*lex->at)) {
                lex_advance(lex);
            }
            t.u = strtoull(t.str, NULL, 10);
        } break;

        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j':
        case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't':
        case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J':
        case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T':
        case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
        case '_': {
            t.kind = TOK_IDENT;
            while (lex->at < lex->end && (isalnum(*lex->at) || *lex->at == '_')) {
                lex_advance(lex);
            }
        } break;

        case '#': {
            t.kind = TOK_HASH;
            lex->preproc = t.preproc = true;
            while (lex->at < lex->end && isalpha(*lex->at)) {
                lex_advance(lex);
            }
        } break;

        default: panicf("unrecognized character: '%c', can't match with any of the token kinds", c);
    }

    t.len = (int)(lex->at - t.str);

    if (t.kind == TOK_IDENT) {
        t.intern = make_intern(t.str, t.len);
        if (lex_is_keyword(t.intern)) {
            t.kind = TOK_KEYWORD;
        }
    }

    return t;
}

Token_Array lex_file(char *file, char *src, int len) {
    Lexer lex = make_lexer(file, src, len);
    Token_Array result = {0};
    for (;;) {
        Token token = lex_token(&lex);
        vec_push(&result, token);
        if (token.kind == TOK_EOF) {
            break;
        }
    }
    return result;
}

void assert_token(Token t, Token_Kind kind, char *text, int line, int column) {
    assert(t.kind == kind);
    assert(t.line == line);
    assert(t.column == column);
    assert(t.len == (int)strlen(text));
    assert(strncmp(t.str, text, t.len) == 0);
}

void lex_test(void) {
    char *src = "12 + 34   * 6\n- 7 % 2 / 1 == 1 != 2 <= 3 >= 4 && 3 || 4 << 1 >> 2";
    Lexer lex = make_lexer("test.c", src, (int)strlen(src));

    assert_token(lex_token(&lex), TOK_INT, "12", 0, 0);
    assert_token(lex_token(&lex), TOK_PLUS, "+", 0, 3);
    assert_token(lex_token(&lex), TOK_INT, "34", 0, 5);
    assert_token(lex_token(&lex), TOK_STAR, "*", 0, 10);
    assert_token(lex_token(&lex), TOK_INT, "6", 0, 12);
    assert_token(lex_token(&lex), TOK_MINUS, "-", 1, 0);
    assert_token(lex_token(&lex), TOK_INT, "7", 1, 2);
    assert_token(lex_token(&lex), TOK_PERCENT, "%", 1, 4);
    assert_token(lex_token(&lex), TOK_INT, "2", 1, 6);
    assert_token(lex_token(&lex), TOK_SLASH, "/", 1, 8);
    assert_token(lex_token(&lex), TOK_INT, "1", 1, 10);
    assert_token(lex_token(&lex), TOK_EQ, "==", 1, 12);
    assert_token(lex_token(&lex), TOK_INT, "1", 1, 15);
    assert_token(lex_token(&lex), TOK_NEQ, "!=", 1, 17);
    assert_token(lex_token(&lex), TOK_INT, "2", 1, 20);
    assert_token(lex_token(&lex), TOK_LEQ, "<=", 1, 22);
    assert_token(lex_token(&lex), TOK_INT, "3", 1, 25);
    assert_token(lex_token(&lex), TOK_GEQ, ">=", 1, 27);
    assert_token(lex_token(&lex), TOK_INT, "4", 1, 30);
    assert_token(lex_token(&lex), TOK_AND, "&&", 1, 32);
    assert_token(lex_token(&lex), TOK_INT, "3", 1, 35);
    assert_token(lex_token(&lex), TOK_OR, "||", 1, 37);
    assert_token(lex_token(&lex), TOK_INT, "4", 1, 40);
    assert_token(lex_token(&lex), TOK_LSHIFT, "<<", 1, 42);
    assert_token(lex_token(&lex), TOK_INT, "1", 1, 45);
    assert_token(lex_token(&lex), TOK_RSHIFT, ">>", 1, 47);
    assert_token(lex_token(&lex), TOK_INT, "2", 1, 50);

    Token eof = lex_token(&lex);
    assert(eof.kind == TOK_EOF);
    assert(eof.len == 0);
    assert(eof.line == 1);
    assert(eof.column == 51);

    Token_Array array = lex_file("test.c", src, (int)strlen(src));
    assert(array.len == 28);

    char *intern_a = make_intern("hello", 5);
    char *intern_b = make_intern("hello", 5);
    char *intern_c = make_intern("world", 5);
    assert(strcmp(intern_a, "hello") == 0);
    assert(strcmp(intern_b, "hello") == 0);
    assert(strcmp(intern_c, "world") == 0);
    assert(intern_a == intern_b);
    assert(intern_a != intern_c);

    char *ident_src = "foo _bar baz123 if for while if_ x9";
    Lexer ident_lex = make_lexer("ident_test.c", ident_src, (int)strlen(ident_src));
    Token foo = lex_token(&ident_lex);
    assert_token(foo, TOK_IDENT, "foo", 0, 0);
    assert(strcmp(foo.intern, "foo") == 0);
    Token bar = lex_token(&ident_lex);
    assert_token(bar, TOK_IDENT, "_bar", 0, 4);
    assert(strcmp(bar.intern, "_bar") == 0);
    Token baz123 = lex_token(&ident_lex);
    assert_token(baz123, TOK_IDENT, "baz123", 0, 9);
    assert(strcmp(baz123.intern, "baz123") == 0);
    Token kw_if = lex_token(&ident_lex);
    assert_token(kw_if, TOK_KEYWORD, "if", 0, 16);
    Token kw_for = lex_token(&ident_lex);
    assert_token(kw_for, TOK_KEYWORD, "for", 0, 19);
    Token kw_while = lex_token(&ident_lex);
    assert_token(kw_while, TOK_KEYWORD, "while", 0, 23);
    Token ident_if_ = lex_token(&ident_lex);
    assert_token(ident_if_, TOK_IDENT, "if_", 0, 29);
    Token ident_x9 = lex_token(&ident_lex);
    assert_token(ident_x9, TOK_IDENT, "x9", 0, 33);

    printf("lexer tests passed\n");
}