typedef struct Token { Token_Kind kind; int len; char *str; char *file; int line, column; struct { uint8_t preproc : 1; }; union { uint64_t u; char *intern; }; } Token; typedef Vec(Token) Token_Array; typedef struct Lexer { char *at; char *end; char *file; int line; int column; uint8_t preproc; } Lexer; uint64_t hash_bytes(char *data, size_t len) { uint64_t h = 1469598103934665603ull; for (size_t i = 0; i < len; i++) { h ^= (unsigned char)data[i]; h *= 1099511628211ull; } return h; } char *global_intern_table[4096]; char intern_arena[4096*6]; int intern_arena_len; char *lex_alloc_string(int len) { char *result = intern_arena + intern_arena_len; intern_arena_len += len + 1; assert(intern_arena_len < ilen(intern_arena)); return result; } char *make_intern(char *string, int len) { uint64_t hash = hash_bytes(string, len); int index = hash % ilen(global_intern_table); for (int i = 0; i < ilen(global_intern_table); i += 1) { if (global_intern_table[index] == NULL) { global_intern_table[index] = lex_alloc_string(len + 1); memcpy(global_intern_table[index], string, len); global_intern_table[index][len] = 0; return global_intern_table[index]; } else if (global_intern_table[index] && (memcmp(global_intern_table[index], string, len) == 0)) { return global_intern_table[index]; } index += 1; index = index % ilen(global_intern_table); } assert(!"invalid codepath"); return NULL; } bool lex_is_keyword(char *string) { bool result = string >= lex_first_keyword && string <= lex_last_keyword; return result; } void lex_advance(Lexer *lex) { if (lex->at >= lex->end) { return; } if (*lex->at == '\n') { lex->line++; lex->preproc = false; lex->column = 0; } else { lex->column++; } lex->at += 1; } void eat_whitespace(Lexer *lex) { while (lex->at < lex->end) { switch (*lex->at) { case ' ': case '\t': case '\r': case '\n': lex_advance(lex); break; default: return; } } } Lexer make_lexer(char *file, char *src, int len) { Lexer lex = { .at = src, .end = src + len, .file = file, .line = 0, .column = 0, }; return lex; } bool lex_peek_is(Lexer *lex, char c) { return lex->at < lex->end && *lex->at == c; } bool lex_match(Lexer *lex, char c) { if (lex_peek_is(lex, c)) { lex_advance(lex); return true; } return false; } Token_Kind lex_repeat_or_assign(Lexer *lex, char repeated_char, Token_Kind single, Token_Kind repeated, Token_Kind assigned) { if (lex_match(lex, repeated_char)) return repeated; if (lex_match(lex, '=')) return assigned; return single; } Token_Kind lex_assign_variant(Lexer *lex, Token_Kind single, Token_Kind assigned) { return lex_match(lex, '=') ? assigned : single; } Token_Kind lex_shift_family(Lexer *lex, char repeated_char, Token_Kind single, Token_Kind single_eq, Token_Kind doubled, Token_Kind doubled_eq) { if (lex_match(lex, repeated_char)) { return lex_match(lex, '=') ? doubled_eq : doubled; } return lex_match(lex, '=') ? single_eq : single; } Token lex_token(Lexer *lex) { eat_whitespace(lex); Token t = { .str = lex->at, .line = lex->line, .column = lex->column, .file = lex->file, .preproc = lex->preproc, }; if (lex->at >= lex->end) { t.kind = TOK_EOF; t.len = 0; return t; } char c = *lex->at; lex_advance(lex); switch (c) { case 0: t.kind = TOK_EOF; break; case '(': t.kind = TOK_LPAREN; break; case ')': t.kind = TOK_RPAREN; break; case '[': t.kind = TOK_LBRACKET; break; case ']': t.kind = TOK_RBRACKET; break; case '{': t.kind = TOK_LBRACE; break; case '}': t.kind = TOK_RBRACE; break; case ',': t.kind = TOK_COMMA; break; case '.': t.kind = TOK_DOT; break; case ':': t.kind = TOK_COLON; break; case ';': t.kind = TOK_SEMICOLON; break; case '?': t.kind = TOK_QUESTION; break; case '+': t.kind = lex_repeat_or_assign(lex, '+', TOK_PLUS, TOK_INC, TOK_PLUS_ASSIGN); break; case '-': { if (lex_match(lex, '-')) t.kind = TOK_DEC; else if (lex_match(lex, '=')) t.kind = TOK_MINUS_ASSIGN; else if (lex_match(lex, '>')) t.kind = TOK_ARROW; else t.kind = TOK_MINUS; } break; case '*': t.kind = lex_assign_variant(lex, TOK_STAR, TOK_MUL_ASSIGN); break; case '/': t.kind = lex_assign_variant(lex, TOK_SLASH, TOK_DIV_ASSIGN); break; case '%': t.kind = lex_assign_variant(lex, TOK_PERCENT, TOK_MOD_ASSIGN); break; case '=': t.kind = lex_assign_variant(lex, TOK_ASSIGN, TOK_EQ); break; case '<': t.kind = lex_shift_family(lex, '<', TOK_LT, TOK_LEQ, TOK_LSHIFT, TOK_LSHIFT_ASSIGN); break; case '>': t.kind = lex_shift_family(lex, '>', TOK_GT, TOK_GEQ, TOK_RSHIFT, TOK_RSHIFT_ASSIGN); break; case '!': t.kind = lex_assign_variant(lex, TOK_NOT, TOK_NEQ); break; case '~': t.kind = TOK_BITNOT; break; case '&': t.kind = lex_repeat_or_assign(lex, '&', TOK_BITAND, TOK_AND, TOK_AND_ASSIGN); break; case '|': t.kind = lex_repeat_or_assign(lex, '|', TOK_BITOR, TOK_OR, TOK_OR_ASSIGN); break; case '^': t.kind = lex_assign_variant(lex, TOK_BITXOR, TOK_XOR_ASSIGN); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { t.kind = TOK_INT; while (lex->at < lex->end && isdigit(*lex->at)) { lex_advance(lex); } t.u = strtoull(t.str, NULL, 10); } break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '_': { t.kind = TOK_IDENT; while (lex->at < lex->end && (isalnum(*lex->at) || *lex->at == '_')) { lex_advance(lex); } } break; case '#': { t.kind = TOK_HASH; lex->preproc = t.preproc = true; while (lex->at < lex->end && isalpha(*lex->at)) { lex_advance(lex); } } break; default: panicf("unrecognized character: '%c', can't match with any of the token kinds", c); } t.len = (int)(lex->at - t.str); if (t.kind == TOK_IDENT) { t.intern = make_intern(t.str, t.len); if (lex_is_keyword(t.intern)) { if (t.intern == keyword_while) t.kind = TOK_while; if (t.intern == keyword_break) t.kind = TOK_break; if (t.intern == keyword_case) t.kind = TOK_case; if (t.intern == keyword_char) t.kind = TOK_char; if (t.intern == keyword_const) t.kind = TOK_const; if (t.intern == keyword_continue) t.kind = TOK_continue; if (t.intern == keyword_default) t.kind = TOK_default; if (t.intern == keyword_do) t.kind = TOK_do; if (t.intern == keyword_double) t.kind = TOK_double; if (t.intern == keyword_else) t.kind = TOK_else; if (t.intern == keyword_enum) t.kind = TOK_enum; if (t.intern == keyword_extern) t.kind = TOK_extern; if (t.intern == keyword_float) t.kind = TOK_float; if (t.intern == keyword_for) t.kind = TOK_for; if (t.intern == keyword_goto) t.kind = TOK_goto; if (t.intern == keyword_if) t.kind = TOK_if; if (t.intern == keyword_inline) t.kind = TOK_inline; if (t.intern == keyword_int) t.kind = TOK_int; if (t.intern == keyword_long) t.kind = TOK_long; if (t.intern == keyword_register) t.kind = TOK_register; if (t.intern == keyword_restrict) t.kind = TOK_restrict; if (t.intern == keyword_return) t.kind = TOK_return; if (t.intern == keyword_short) t.kind = TOK_short; if (t.intern == keyword_signed) t.kind = TOK_signed; if (t.intern == keyword_sizeof) t.kind = TOK_sizeof; if (t.intern == keyword_static) t.kind = TOK_static; if (t.intern == keyword_struct) t.kind = TOK_struct; if (t.intern == keyword_switch) t.kind = TOK_switch; if (t.intern == keyword_typedef) t.kind = TOK_typedef; if (t.intern == keyword_union) t.kind = TOK_union; if (t.intern == keyword_unsigned) t.kind = TOK_unsigned; if (t.intern == keyword_void) t.kind = TOK_void; if (t.intern == keyword_volatile) t.kind = TOK_volatile; if (t.intern == keyword_auto) t.kind = TOK_auto; } } return t; } Token_Array lex_file(char *file, char *src, int len) { Lexer lex = make_lexer(file, src, len); Token_Array result = {0}; for (;;) { Token token = lex_token(&lex); vec_push(&result, token); if (token.kind == TOK_EOF) { break; } } return result; } void assert_token(Token t, Token_Kind kind, char *text, int line, int column) { assert(t.kind == kind); assert(t.line == line); assert(t.column == column); assert(t.len == (int)strlen(text)); assert(strncmp(t.str, text, t.len) == 0); } void lex_test(void) { char *src = "12 + 34 * 6\n- 7 % 2 / 1 == 1 != 2 <= 3 >= 4 && 3 || 4 << 1 >> 2"; Lexer lex = make_lexer("test.c", src, (int)strlen(src)); assert_token(lex_token(&lex), TOK_INT, "12", 0, 0); assert_token(lex_token(&lex), TOK_PLUS, "+", 0, 3); assert_token(lex_token(&lex), TOK_INT, "34", 0, 5); assert_token(lex_token(&lex), TOK_STAR, "*", 0, 10); assert_token(lex_token(&lex), TOK_INT, "6", 0, 12); assert_token(lex_token(&lex), TOK_MINUS, "-", 1, 0); assert_token(lex_token(&lex), TOK_INT, "7", 1, 2); assert_token(lex_token(&lex), TOK_PERCENT, "%", 1, 4); assert_token(lex_token(&lex), TOK_INT, "2", 1, 6); assert_token(lex_token(&lex), TOK_SLASH, "/", 1, 8); assert_token(lex_token(&lex), TOK_INT, "1", 1, 10); assert_token(lex_token(&lex), TOK_EQ, "==", 1, 12); assert_token(lex_token(&lex), TOK_INT, "1", 1, 15); assert_token(lex_token(&lex), TOK_NEQ, "!=", 1, 17); assert_token(lex_token(&lex), TOK_INT, "2", 1, 20); assert_token(lex_token(&lex), TOK_LEQ, "<=", 1, 22); assert_token(lex_token(&lex), TOK_INT, "3", 1, 25); assert_token(lex_token(&lex), TOK_GEQ, ">=", 1, 27); assert_token(lex_token(&lex), TOK_INT, "4", 1, 30); assert_token(lex_token(&lex), TOK_AND, "&&", 1, 32); assert_token(lex_token(&lex), TOK_INT, "3", 1, 35); assert_token(lex_token(&lex), TOK_OR, "||", 1, 37); assert_token(lex_token(&lex), TOK_INT, "4", 1, 40); assert_token(lex_token(&lex), TOK_LSHIFT, "<<", 1, 42); assert_token(lex_token(&lex), TOK_INT, "1", 1, 45); assert_token(lex_token(&lex), TOK_RSHIFT, ">>", 1, 47); assert_token(lex_token(&lex), TOK_INT, "2", 1, 50); Token eof = lex_token(&lex); assert(eof.kind == TOK_EOF); assert(eof.len == 0); assert(eof.line == 1); assert(eof.column == 51); Token_Array array = lex_file("test.c", src, (int)strlen(src)); assert(array.len == 28); char *intern_a = make_intern("hello", 5); char *intern_b = make_intern("hello", 5); char *intern_c = make_intern("world", 5); assert(strcmp(intern_a, "hello") == 0); assert(strcmp(intern_b, "hello") == 0); assert(strcmp(intern_c, "world") == 0); assert(intern_a == intern_b); assert(intern_a != intern_c); char *ident_src = "foo _bar baz123 if for while if_ x9"; Lexer ident_lex = make_lexer("ident_test.c", ident_src, (int)strlen(ident_src)); Token foo = lex_token(&ident_lex); assert_token(foo, TOK_IDENT, "foo", 0, 0); assert(strcmp(foo.intern, "foo") == 0); Token bar = lex_token(&ident_lex); assert_token(bar, TOK_IDENT, "_bar", 0, 4); assert(strcmp(bar.intern, "_bar") == 0); Token baz123 = lex_token(&ident_lex); assert_token(baz123, TOK_IDENT, "baz123", 0, 9); assert(strcmp(baz123.intern, "baz123") == 0); Token kw_if = lex_token(&ident_lex); assert_token(kw_if, TOK_if, "if", 0, 16); Token kw_for = lex_token(&ident_lex); assert_token(kw_for, TOK_for, "for", 0, 19); Token kw_while = lex_token(&ident_lex); assert_token(kw_while, TOK_while, "while", 0, 23); Token ident_if_ = lex_token(&ident_lex); assert_token(ident_if_, TOK_IDENT, "if_", 0, 29); Token ident_x9 = lex_token(&ident_lex); assert_token(ident_x9, TOK_IDENT, "x9", 0, 33); printf("lexer tests passed\n"); }