diff --git a/base.c b/base.c
index a3e96a0..f9c3d5f 100644
--- a/base.c
+++ b/base.c
@@ -1,4 +1,7 @@
 #define panicf(...) base_panicf(__FILE__, __LINE__, __VA_ARGS__)
+#define len(x) (sizeof((x))/sizeof((x)[0])) // element count; x must be a real array, not a pointer
+#define ilen(x) ((int)len(x))
+
 
 _Noreturn void
 base_panicf(char *file, int line, const char *fmt, ...) {
diff --git a/lex.c b/lex.c
index 8949f94..8a2e78b 100644
--- a/lex.c
+++ b/lex.c
@@ -18,6 +18,7 @@ typedef struct Token {
 
     union {
         uint64_t u;
+        char *intern; // interned spelling; lex_token sets it for identifier/keyword tokens
     };
 } Token;
 
@@ -32,6 +33,107 @@ typedef struct Lexer {
     uint8_t preproc;
 } Lexer;
 
+
+// 64-bit FNV-1a hash of the `len` bytes starting at `data`.
+uint64_t hash_bytes(char *data, size_t len) {
+    uint64_t h = 14695981039346656037ull; // FNV-1a 64-bit offset basis
+    for (size_t i = 0; i < len; i++) {
+        h ^= (unsigned char)data[i];
+        h *= 1099511628211ull; // FNV 64-bit prime
+    }
+    return h;
+}
+
+// Open-addressing hash table of interned strings; a NULL slot is empty.
+char *global_intern_table[4096];
+// Bump-allocated backing storage for the NUL-terminated interned strings.
+char intern_arena[4096*6];
+int intern_arena_len;
+
+// Reserves `len` bytes plus one for the NUL terminator in intern_arena.
+char *lex_alloc_string(int len) {
+    char *result = intern_arena + intern_arena_len;
+    intern_arena_len += len + 1;
+    assert(intern_arena_len < ilen(intern_arena));
+    return result;
+}
+
+// Returns the canonical copy of string[0..len): equal strings always yield the
+// same pointer, so interned strings can be compared with ==.
+char *make_intern(char *string, int len) {
+    uint64_t hash = hash_bytes(string, len);
+    int index = hash % ilen(global_intern_table);
+    for (int i = 0; i < ilen(global_intern_table); i += 1) {
+        char *entry = global_intern_table[index];
+        if (entry == NULL) {
+            // lex_alloc_string already adds the terminator byte.
+            entry = lex_alloc_string(len);
+            memcpy(entry, string, len);
+            entry[len] = 0;
+            global_intern_table[index] = entry;
+            return entry;
+        }
+        // Lengths must match as well: memcmp over `len` bytes alone would
+        // accept "if" as equal to a previously interned "if_".
+        if (strlen(entry) == (size_t)len && memcmp(entry, string, len) == 0) {
+            return entry;
+        }
+        index = (index + 1) % ilen(global_intern_table);
+    }
+    assert(!"invalid codepath");
+    return NULL;
+}
+
+// lex_is_keyword relies on the keywords forming one contiguous range of
+// intern_arena, delimited by these two pointers.
+char *lex_first_keyword = NULL;
+char *lex_last_keyword = NULL;
+#define lex_add_keyword(x) make_intern(x, ilen(x) - 1)
+
+// Interns every keyword back to back; call before any other make_intern.
+void lex_init_keywords(void) {
+    lex_first_keyword = lex_add_keyword("auto");
+    lex_add_keyword("break");
+    lex_add_keyword("case");
+    lex_add_keyword("char");
+    lex_add_keyword("const");
+    lex_add_keyword("continue");
+    lex_add_keyword("default");
+    lex_add_keyword("do");
+    lex_add_keyword("double");
+    lex_add_keyword("else");
+    lex_add_keyword("enum");
+    lex_add_keyword("extern");
+    lex_add_keyword("float");
+    lex_add_keyword("for");
+    lex_add_keyword("goto");
+    lex_add_keyword("if");
+    lex_add_keyword("inline");
+    lex_add_keyword("int");
+    lex_add_keyword("long");
+    lex_add_keyword("register");
+    lex_add_keyword("restrict");
+    lex_add_keyword("return");
+    lex_add_keyword("short");
+    lex_add_keyword("signed");
+    lex_add_keyword("sizeof");
+    lex_add_keyword("static");
+    lex_add_keyword("struct");
+    lex_add_keyword("switch");
+    lex_add_keyword("typedef");
+    lex_add_keyword("union");
+    lex_add_keyword("unsigned");
+    lex_add_keyword("void");
+    lex_add_keyword("volatile");
+    lex_last_keyword = lex_add_keyword("while");
+}
+
+// True when `string` points into the keyword range of the intern arena.
+bool lex_is_keyword(char *string) {
+    bool result = string >= lex_first_keyword && string <= lex_last_keyword;
+    return result;
+}
+
 void lex_advance(Lexer *lex) {
     if (lex->at >= lex->end) {
         return;
@@ -39,17 +141,13 @@ void lex_advance(Lexer *lex) {
 
     if (*lex->at == '\n') {
         lex->line++;
+        lex->preproc = false;
         lex->column = 0;
     } else {
         lex->column++;
     }
 
-    if (*lex->at == '\n') {
-        lex->preproc = false;
-        lex->at += 1;
-    } else {
-        lex->at += 1;
-    }
+    lex->at += 1;
 }
 
 void eat_whitespace(Lexer *lex) {
@@ -121,31 +219,6 @@ Token lex_token(Lexer *lex) {
     }
 
     char c = *lex->at;
-
-    if (isdigit(c)) {
-        t.kind = TOK_INT;
-        while (lex->at < lex->end && isdigit(*lex->at)) {
-            lex_advance(lex);
-        }
-
-        // @todo: proper lexing of floats (as well as postfixes)
-        if (lex->at < lex->end && *lex->at == '.') {
-            t.kind = TOK_FLOAT;
-            lex_advance(lex);
-
-            while (lex->at < lex->end && isdigit(*lex->at)) {
-                lex_advance(lex);
-            }
-        }
-
-        if (t.kind == TOK_INT) {
-            t.u = strtoull(t.str, NULL, 10);
-        }
-
-        t.len = (int)(lex->at - t.str);
-        return t;
-    }
-
     lex_advance(lex);
 
     switch (c) {
@@ -180,6 +253,27 @@ Token lex_token(Lexer *lex) {
         case '|': t.kind = lex_repeat_or_assign(lex, '|', TOK_BITOR, TOK_OR, TOK_OR_ASSIGN); break;
         case '^': t.kind = lex_assign_variant(lex, TOK_BITXOR, TOK_XOR_ASSIGN); break;
 
+        case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': {
+            t.kind = TOK_INT;
+            while (lex->at < lex->end && isdigit((unsigned char)*lex->at)) { // <ctype.h> requires unsigned char values
+                lex_advance(lex);
+            }
+            t.u = strtoull(t.str, NULL, 10);
+        } break;
+
+        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j':
+        case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't':
+        case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
+        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J':
+        case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T':
+        case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
+        case '_': {
+            t.kind = TOK_IDENT;
+            while (lex->at < lex->end && (isalnum((unsigned char)*lex->at) || *lex->at == '_')) {
+                lex_advance(lex);
+            }
+        } break;
+
         case '#': {
             t.kind = TOK_HASH;
             lex->preproc = t.preproc = true;
@@ -192,6 +286,14 @@ Token lex_token(Lexer *lex) {
     }
 
     t.len = (int)(lex->at - t.str);
+
+    if (t.kind == TOK_IDENT) {
+        t.intern = make_intern(t.str, t.len);
+        if (lex_is_keyword(t.intern)) {
+            t.kind = TOK_KEYWORD;
+        }
+    }
+
     return t;
 }
 
@@ -217,12 +319,12 @@ void assert_token(Token t, Token_Kind kind, char *text, int line, int column) {
 }
 
 void lex_test(void) {
-    char *src = "12 + 34.5 * 6\n- 7 % 2 / 1 == 1 != 2 <= 3 >= 4 && 3 || 4 << 1 >> 2";
+    char *src = "12 + 34   * 6\n- 7 % 2 / 1 == 1 != 2 <= 3 >= 4 && 3 || 4 << 1 >> 2"; // padding keeps '*' at column 10 and '6' at column 12
     Lexer lex = make_lexer("test.c", src, (int)strlen(src));
 
     assert_token(lex_token(&lex), TOK_INT, "12", 0, 0);
     assert_token(lex_token(&lex), TOK_PLUS, "+", 0, 3);
-    assert_token(lex_token(&lex), TOK_FLOAT, "34.5", 0, 5);
+    assert_token(lex_token(&lex), TOK_INT, "34", 0, 5);
     assert_token(lex_token(&lex), TOK_STAR, "*", 0, 10);
     assert_token(lex_token(&lex), TOK_INT, "6", 0, 12);
     assert_token(lex_token(&lex), TOK_MINUS, "-", 1, 0);
@@ -257,5 +359,36 @@ void lex_test(void) {
     Token_Array array = lex_file("test.c", src, (int)strlen(src));
     assert(array.len == 28);
 
+    char *intern_a = make_intern("hello", 5);
+    char *intern_b = make_intern("hello", 5);
+    char *intern_c = make_intern("world", 5);
+    assert(strcmp(intern_a, "hello") == 0);
+    assert(strcmp(intern_b, "hello") == 0);
+    assert(strcmp(intern_c, "world") == 0);
+    assert(intern_a == intern_b);
+    assert(intern_a != intern_c);
+
+    char *ident_src = "foo _bar baz123 if for while if_ x9";
+    Lexer ident_lex = make_lexer("ident_test.c", ident_src, (int)strlen(ident_src));
+    Token foo = lex_token(&ident_lex);
+    assert_token(foo, TOK_IDENT, "foo", 0, 0);
+    assert(strcmp(foo.intern, "foo") == 0);
+    Token bar = lex_token(&ident_lex);
+    assert_token(bar, TOK_IDENT, "_bar", 0, 4);
+    assert(strcmp(bar.intern, "_bar") == 0);
+    Token baz123 = lex_token(&ident_lex);
+    assert_token(baz123, TOK_IDENT, "baz123", 0, 9);
+    assert(strcmp(baz123.intern, "baz123") == 0);
+    Token kw_if = lex_token(&ident_lex);
+    assert_token(kw_if, TOK_KEYWORD, "if", 0, 16);
+    Token kw_for = lex_token(&ident_lex);
+    assert_token(kw_for, TOK_KEYWORD, "for", 0, 19);
+    Token kw_while = lex_token(&ident_lex);
+    assert_token(kw_while, TOK_KEYWORD, "while", 0, 23);
+    Token ident_if_ = lex_token(&ident_lex);
+    assert_token(ident_if_, TOK_IDENT, "if_", 0, 29);
+    Token ident_x9 = lex_token(&ident_lex);
+    assert_token(ident_x9, TOK_IDENT, "x9", 0, 33);
+
     printf("lexer tests passed\n");
 }
diff --git a/main.c b/main.c
index 47188c1..82de2b7 100644
--- a/main.c
+++ b/main.c
@@ -13,6 +13,7 @@
 #include "emit_asm_x64.c"
 
 int main() {
+    lex_init_keywords(); // first, so the keywords are interned before any other string
     vec_test();
     lex_test();
     parser_test();
diff --git a/meta.c b/meta.c
index e32e686..5763235 100644
--- a/meta.c
+++ b/meta.c
@@ -1,7 +1,9 @@
 #include
-
-#define len(x) (sizeof((x))/sizeof((x)[0]))
-#define ilen(x) ((int)len(x))
+#include
+#include
+#include
+#include "base.c"
 
 int main() {
     typedef struct {