typedef struct Token { Token_Kind kind; char *str; int len; char *file; int line, column; union { uint64_t u; }; } Token; typedef Vec(Token) Token_Array; typedef struct Lexer { char *at; char *end; char *file; int line; int column; } Lexer; void lex_advance(Lexer *lex) { if (lex->at >= lex->end) { return; } if (*lex->at == '\n') { lex->line++; lex->column = 0; } else { lex->column++; } lex->at += 1; } void eat_whitespace(Lexer *lex) { while (lex->at < lex->end) { switch (*lex->at) { case ' ': case '\t': case '\r': case '\n': lex_advance(lex); break; default: return; } } } Lexer make_lexer(char *file, char *src, int len) { Lexer lex = { .at = src, .end = src + len, .file = file, .line = 0, .column = 0, }; return lex; } bool lex_peek_is(Lexer *lex, char c) { return lex->at < lex->end && *lex->at == c; } bool lex_match(Lexer *lex, char c) { if (lex_peek_is(lex, c)) { lex_advance(lex); return true; } return false; } Token_Kind lex_repeat_or_assign(Lexer *lex, char repeated_char, Token_Kind single, Token_Kind repeated, Token_Kind assigned) { if (lex_match(lex, repeated_char)) return repeated; if (lex_match(lex, '=')) return assigned; return single; } Token_Kind lex_assign_variant(Lexer *lex, Token_Kind single, Token_Kind assigned) { return lex_match(lex, '=') ? assigned : single; } Token_Kind lex_shift_family(Lexer *lex, char repeated_char, Token_Kind single, Token_Kind single_eq, Token_Kind doubled, Token_Kind doubled_eq) { if (lex_match(lex, repeated_char)) { return lex_match(lex, '=') ? doubled_eq : doubled; } return lex_match(lex, '=') ? single_eq : single; } Token lex_token(Lexer *lex) { eat_whitespace(lex); Token t = { .str = lex->at, .line = lex->line, .column = lex->column, .file = lex->file, }; if (lex->at >= lex->end) { t.kind = TOK_EOF; t.len = 0; return t; } char c = *lex->at; if (isdigit(c)) { t.kind = TOK_INT; while (lex->at < lex->end && isdigit(*lex->at)) { lex_advance(lex); } // @todo: proper lexing of floats (as well as postfixes) if (lex->at < lex->end && *lex->at == '.') { t.kind = TOK_FLOAT; lex_advance(lex); while (lex->at < lex->end && isdigit(*lex->at)) { lex_advance(lex); } } if (t.kind == TOK_INT) { t.u = strtoull(t.str, NULL, 10); } t.len = (int)(lex->at - t.str); return t; } lex_advance(lex); switch (c) { case 0: t.kind = TOK_EOF; break; case '(': t.kind = TOK_LPAREN; break; case ')': t.kind = TOK_RPAREN; break; case '[': t.kind = TOK_LBRACKET; break; case ']': t.kind = TOK_RBRACKET; break; case '{': t.kind = TOK_LBRACE; break; case '}': t.kind = TOK_RBRACE; break; case ',': t.kind = TOK_COMMA; break; case '.': t.kind = TOK_DOT; break; case ':': t.kind = TOK_COLON; break; case ';': t.kind = TOK_SEMICOLON; break; case '?': t.kind = TOK_QUESTION; break; case '#': t.kind = TOK_HASH; break; case '+': t.kind = lex_repeat_or_assign(lex, '+', TOK_PLUS, TOK_INC, TOK_PLUS_ASSIGN); break; case '-': { if (lex_match(lex, '-')) t.kind = TOK_DEC; else if (lex_match(lex, '=')) t.kind = TOK_MINUS_ASSIGN; else if (lex_match(lex, '>')) t.kind = TOK_ARROW; else t.kind = TOK_MINUS; } break; case '*': t.kind = lex_assign_variant(lex, TOK_STAR, TOK_MUL_ASSIGN); break; case '/': t.kind = lex_assign_variant(lex, TOK_SLASH, TOK_DIV_ASSIGN); break; case '%': t.kind = lex_assign_variant(lex, TOK_PERCENT, TOK_MOD_ASSIGN); break; case '=': t.kind = lex_assign_variant(lex, TOK_ASSIGN, TOK_EQ); break; case '<': t.kind = lex_shift_family(lex, '<', TOK_LT, TOK_LEQ, TOK_LSHIFT, TOK_LSHIFT_ASSIGN); break; case '>': t.kind = lex_shift_family(lex, '>', TOK_GT, TOK_GEQ, TOK_RSHIFT, TOK_RSHIFT_ASSIGN); break; case '!': t.kind = lex_assign_variant(lex, TOK_NOT, TOK_NEQ); break; case '~': t.kind = TOK_BITNOT; break; case '&': t.kind = lex_repeat_or_assign(lex, '&', TOK_BITAND, TOK_AND, TOK_AND_ASSIGN); break; case '|': t.kind = lex_repeat_or_assign(lex, '|', TOK_BITOR, TOK_OR, TOK_OR_ASSIGN); break; case '^': t.kind = lex_assign_variant(lex, TOK_BITXOR, TOK_XOR_ASSIGN); break; default: panicf("unrecognized character: '%c', can't match with any of the token kinds", c); } t.len = (int)(lex->at - t.str); return t; } void assert_token(Token t, Token_Kind kind, char *text, int line, int column) { assert(t.kind == kind); assert(t.line == line); assert(t.column == column); assert(t.len == (int)strlen(text)); assert(strncmp(t.str, text, t.len) == 0); } Token_Array lex_file(char *file, char *src, int len) { Lexer lex = make_lexer(file, src, len); Token_Array result = {0}; for (;;) { Token token = lex_token(&lex); vec_push(&result, token); if (token.kind == TOK_EOF) { break; } } return result; }