WIP top-level parsing and keyword tokens

This commit is contained in:
Krzosa Karol
2026-05-23 15:39:04 +02:00
parent fc04ee5c3e
commit b1b79abfd4
7 changed files with 431 additions and 155 deletions

89
lex.c
View File

@@ -1,9 +1,3 @@
/*
- [ ] Line splicing as the first source preprocessing stage. To handle a backslash ('\\') followed by a new line properly, we most likely need to preprocess the source in an initial pass. So at some point we need to introduce a stage that produces a buffer with the spliced characters removed, together with a line / column mapping data structure back to the original source.
*/
typedef struct Token {
Token_Kind kind;
int len;
@@ -46,7 +40,6 @@ uint64_t hash_bytes(char *data, size_t len) {
/* Table of interned string pointers with 4096 slots.
   NOTE(review): presumably the lookup table probed by make_intern - confirm against its body. */
char *global_intern_table[4096];
/* Backing byte storage for strings; lex_alloc_string hands out space from this array. */
char intern_arena[4096*6];
/* Count of arena bytes already handed out; lex_alloc_string advances it by len + 1 per allocation. */
int intern_arena_len;
char *lex_alloc_string(int len) {
char *result = intern_arena + intern_arena_len;
intern_arena_len += len + 1;
@@ -73,47 +66,6 @@ char *make_intern(char *string, int len) {
return NULL;
}
/* Interned pointers of the first ("auto") and last ("while") keyword registered by
   lex_init_keywords. lex_is_keyword treats any pointer inside [first, last] as a keyword. */
char *lex_first_keyword = NULL;
char *lex_last_keyword = NULL;
/* Interns x with length ilen(x) - 1.
   NOTE(review): ilen is defined elsewhere; presumably it is an array element count that
   includes the NUL terminator, which would require x to be a string literal or char array
   (not a char pointer) - confirm. */
#define lex_add_keyword(x) make_intern(x, ilen(x) - 1)
/* Registers the C keywords listed below through lex_add_keyword and records the pointers
   returned for the first ("auto") and the last ("while") registration.

   The order of the calls matters: lex_is_keyword decides by checking whether a pointer lies
   between lex_first_keyword and lex_last_keyword, so every keyword has to be interned between
   those two calls.
   NOTE(review): this assumes make_intern hands back increasing addresses for strings it has
   not seen before and that none of these strings were interned earlier - confirm against
   make_intern.
   NOTE(review): the results of the calls are not checked here; the visible tail of
   make_intern has a path that returns NULL - verify that it cannot be hit during init. */
void lex_init_keywords(void) {
/* First registration: marks the low end of the keyword pointer range. */
lex_first_keyword = lex_add_keyword("auto");
lex_add_keyword("break");
lex_add_keyword("case");
lex_add_keyword("char");
lex_add_keyword("const");
lex_add_keyword("continue");
lex_add_keyword("default");
lex_add_keyword("do");
lex_add_keyword("double");
lex_add_keyword("else");
lex_add_keyword("enum");
lex_add_keyword("extern");
lex_add_keyword("float");
lex_add_keyword("for");
lex_add_keyword("goto");
lex_add_keyword("if");
lex_add_keyword("inline");
lex_add_keyword("int");
lex_add_keyword("long");
lex_add_keyword("register");
lex_add_keyword("restrict");
lex_add_keyword("return");
lex_add_keyword("short");
lex_add_keyword("signed");
lex_add_keyword("sizeof");
lex_add_keyword("static");
lex_add_keyword("struct");
lex_add_keyword("switch");
lex_add_keyword("typedef");
lex_add_keyword("union");
lex_add_keyword("unsigned");
lex_add_keyword("void");
lex_add_keyword("volatile");
/* Last registration: marks the high end of the keyword pointer range. */
lex_last_keyword = lex_add_keyword("while");
}
bool lex_is_keyword(char *string) {
bool result = string >= lex_first_keyword && string <= lex_last_keyword;
return result;
@@ -275,7 +227,40 @@ Token lex_token(Lexer *lex) {
if (t.kind == TOK_IDENT) {
t.intern = make_intern(t.str, t.len);
if (lex_is_keyword(t.intern)) {
t.kind = TOK_KEYWORD;
if (t.intern == keyword_while) t.kind = TOK_while;
if (t.intern == keyword_break) t.kind = TOK_break;
if (t.intern == keyword_case) t.kind = TOK_case;
if (t.intern == keyword_char) t.kind = TOK_char;
if (t.intern == keyword_const) t.kind = TOK_const;
if (t.intern == keyword_continue) t.kind = TOK_continue;
if (t.intern == keyword_default) t.kind = TOK_default;
if (t.intern == keyword_do) t.kind = TOK_do;
if (t.intern == keyword_double) t.kind = TOK_double;
if (t.intern == keyword_else) t.kind = TOK_else;
if (t.intern == keyword_enum) t.kind = TOK_enum;
if (t.intern == keyword_extern) t.kind = TOK_extern;
if (t.intern == keyword_float) t.kind = TOK_float;
if (t.intern == keyword_for) t.kind = TOK_for;
if (t.intern == keyword_goto) t.kind = TOK_goto;
if (t.intern == keyword_if) t.kind = TOK_if;
if (t.intern == keyword_inline) t.kind = TOK_inline;
if (t.intern == keyword_int) t.kind = TOK_int;
if (t.intern == keyword_long) t.kind = TOK_long;
if (t.intern == keyword_register) t.kind = TOK_register;
if (t.intern == keyword_restrict) t.kind = TOK_restrict;
if (t.intern == keyword_return) t.kind = TOK_return;
if (t.intern == keyword_short) t.kind = TOK_short;
if (t.intern == keyword_signed) t.kind = TOK_signed;
if (t.intern == keyword_sizeof) t.kind = TOK_sizeof;
if (t.intern == keyword_static) t.kind = TOK_static;
if (t.intern == keyword_struct) t.kind = TOK_struct;
if (t.intern == keyword_switch) t.kind = TOK_switch;
if (t.intern == keyword_typedef) t.kind = TOK_typedef;
if (t.intern == keyword_union) t.kind = TOK_union;
if (t.intern == keyword_unsigned) t.kind = TOK_unsigned;
if (t.intern == keyword_void) t.kind = TOK_void;
if (t.intern == keyword_volatile) t.kind = TOK_volatile;
if (t.intern == keyword_auto) t.kind = TOK_auto;
}
}
@@ -365,11 +350,11 @@ void lex_test(void) {
assert_token(baz123, TOK_IDENT, "baz123", 0, 9);
assert(strcmp(baz123.intern, "baz123") == 0);
Token kw_if = lex_token(&ident_lex);
assert_token(kw_if, TOK_KEYWORD, "if", 0, 16);
assert_token(kw_if, TOK_if, "if", 0, 16);
Token kw_for = lex_token(&ident_lex);
assert_token(kw_for, TOK_KEYWORD, "for", 0, 19);
assert_token(kw_for, TOK_for, "for", 0, 19);
Token kw_while = lex_token(&ident_lex);
assert_token(kw_while, TOK_KEYWORD, "while", 0, 23);
assert_token(kw_while, TOK_while, "while", 0, 23);
Token ident_if_ = lex_token(&ident_lex);
assert_token(ident_if_, TOK_IDENT, "if_", 0, 29);
Token ident_x9 = lex_token(&ident_lex);