Files
smallprojects/lex.c
2026-05-23 08:35:43 +02:00

262 lines
8.0 KiB
C

/*
- [ ] Newline splicing, the first source preprocessing stage. In order to properly handle a backslash ('\\') followed by a newline, we most likely need to preprocess the source in an initial pass. So at some point we need to introduce a stage that will create a buffer without the offending characters, together with a line / column mapping data structure.
*/
// A single token produced by lex_token.
// The token does not own its text: `str` points into the buffer being lexed.
typedef struct Token {
// Which kind of token this is (one of the TOK_* values).
Token_Kind kind;
// Number of characters the token spans, starting at `str`.
int len;
// First character of the token inside the lexed buffer (not NUL-terminated
// at `len`; always use `len` to bound reads).
char *str;
// File name copied from the lexer that produced the token.
char *file;
// Zero-based line and column of the token's first character.
int line, column;
struct {
// Copied from the lexer's `preproc` state when the token is started, and
// forced to 1 on the '#' token itself.
uint8_t preproc : 1;
};
union {
// Numeric value; lex_token assigns it only for TOK_INT tokens.
uint64_t u;
};
} Token;
// Array of tokens as returned by lex_file. Vec is defined outside this
// section; the code here relies on it supporting vec_push and a `len` field.
typedef Vec(Token) Token_Array;
// Cursor over a source buffer together with the position bookkeeping needed
// to stamp tokens with file / line / column.
typedef struct Lexer {
// Next character to be consumed.
char *at;
// One past the last character of the buffer (src + len).
char *end;
// File name handed to make_lexer; copied into every token.
char *file;
// Zero-based line of `at`.
int line;
// Zero-based column of `at` within the current line.
int column;
// Set when a '#' is lexed and cleared when a '\n' is consumed.
uint8_t preproc;
} Lexer;
/*
 * Consume a single character and keep the position counters in sync.
 *
 * At end of input this is a no-op. Consuming a '\n' moves to the start of the
 * next line (column 0) and clears the preprocessor flag; any other character
 * just bumps the column.
 */
void lex_advance(Lexer *lex) {
    if (lex->at >= lex->end) {
        return;
    }

    char consumed = *lex->at;
    lex->at += 1;

    if (consumed != '\n') {
        lex->column++;
        return;
    }

    // A newline ends the current line and any directive in progress.
    lex->line++;
    lex->column = 0;
    lex->preproc = false;
}
/*
 * Skip over spaces, tabs, carriage returns and newlines, stopping at the
 * first other character or at end of input.
 */
void eat_whitespace(Lexer *lex) {
    while (lex->at < lex->end) {
        char c = *lex->at;
        bool is_blank = (c == ' ') || (c == '\t') || (c == '\r') || (c == '\n');
        if (!is_blank) {
            return;
        }
        // Go through lex_advance so line / column tracking stays correct.
        lex_advance(lex);
    }
}
Lexer make_lexer(char *file, char *src, int len) {
Lexer lex = {
.at = src,
.end = src + len,
.file = file,
.line = 0,
.column = 0,
};
return lex;
}
/*
 * Report whether the next unconsumed character equals `c`.
 * Always false at end of input; never consumes anything.
 */
bool lex_peek_is(Lexer *lex, char c) {
    if (lex->at >= lex->end) {
        return false;
    }
    return *lex->at == c;
}
/*
 * Consume the next character if it equals `c`.
 * Returns true when a character was consumed, false otherwise.
 */
bool lex_match(Lexer *lex, char c) {
    bool matched = lex_peek_is(lex, c);
    if (matched) {
        lex_advance(lex);
    }
    return matched;
}
/*
 * Classify an operator whose first character has already been consumed and
 * which has a doubled form (e.g. "++") and an assignment form (e.g. "+=").
 *
 * Tries the doubled form first, then the assignment form; the second
 * character is consumed only when one of them matches. Returns `repeated`,
 * `assigned`, or `single` accordingly.
 */
Token_Kind lex_repeat_or_assign(Lexer *lex, char repeated_char, Token_Kind single, Token_Kind repeated, Token_Kind assigned) {
    Token_Kind kind = single;
    if (lex_match(lex, repeated_char)) {
        kind = repeated;
    } else if (lex_match(lex, '=')) {
        kind = assigned;
    }
    return kind;
}
/*
 * Classify an operator whose first character has already been consumed and
 * which may be followed by '=' (e.g. "*" vs "*=").
 * Consumes the '=' when present and returns `assigned`; otherwise `single`.
 */
Token_Kind lex_assign_variant(Lexer *lex, Token_Kind single, Token_Kind assigned) {
    if (lex_match(lex, '=')) {
        return assigned;
    }
    return single;
}
/*
 * Classify the four-way family "<", "<=", "<<", "<<=" (or the '>' analogue)
 * after the first character has already been consumed.
 *
 * Optionally consumes one more `repeated_char`, then optionally one '=', and
 * picks the matching kind out of single / single_eq / doubled / doubled_eq.
 */
Token_Kind lex_shift_family(Lexer *lex, char repeated_char, Token_Kind single, Token_Kind single_eq, Token_Kind doubled, Token_Kind doubled_eq) {
    // Order matters: the repeated character must be tried before the '='.
    bool is_doubled = lex_match(lex, repeated_char);
    bool has_assign = lex_match(lex, '=');

    if (is_doubled) {
        return has_assign ? doubled_eq : doubled;
    }
    return has_assign ? single_eq : single;
}
/*
 * Lex and return the next token, skipping any leading whitespace.
 *
 * The returned token records the lexer's file / line / column at the token's
 * first character and points (`str`, `len`) into the buffer being lexed.
 *
 * - At end of input a zero-length TOK_EOF is returned.
 * - A run of decimal digits yields TOK_INT with the value in `u`; if it is
 *   directly followed by '.', the token becomes TOK_FLOAT (digits after the
 *   dot are consumed, `u` is left untouched).
 * - '#' yields TOK_HASH, swallows the alphabetic characters directly after it
 *   into the same token, and sets the preprocessor flag on both the token and
 *   the lexer.
 * - Any character not handled below is reported through panicf.
 *
 * The integer value is accumulated while scanning instead of calling
 * strtoull on the token start: the buffer is bounded by `lex->end` and is not
 * required to be NUL-terminated, so a library parse could read past the end.
 * The value saturates at UINT64_MAX on overflow, mirroring strtoull.
 */
Token lex_token(Lexer *lex) {
    eat_whitespace(lex);
    Token t = {
        .str = lex->at,
        .line = lex->line,
        .column = lex->column,
        .file = lex->file,
        .preproc = lex->preproc,
    };
    if (lex->at >= lex->end) {
        t.kind = TOK_EOF;
        t.len = 0;
        return t;
    }
    char c = *lex->at;
    // <ctype.h> functions require a value representable as unsigned char;
    // a plain (possibly negative) char is undefined behavior.
    if (isdigit((unsigned char)c)) {
        t.kind = TOK_INT;
        uint64_t value = 0;
        while (lex->at < lex->end && isdigit((unsigned char)*lex->at)) {
            uint64_t digit = (uint64_t)(*lex->at - '0');
            if (value > (UINT64_MAX - digit) / 10) {
                // value * 10 + digit would overflow: clamp like strtoull does.
                value = UINT64_MAX;
            } else {
                value = value * 10 + digit;
            }
            lex_advance(lex);
        }
        // @todo: proper lexing of floats (as well as postfixes)
        if (lex->at < lex->end && *lex->at == '.') {
            t.kind = TOK_FLOAT;
            lex_advance(lex);
            while (lex->at < lex->end && isdigit((unsigned char)*lex->at)) {
                lex_advance(lex);
            }
        }
        if (t.kind == TOK_INT) {
            t.u = value;
        }
        t.len = (int)(lex->at - t.str);
        return t;
    }
    // Everything below starts with a single punctuation character; consume
    // it first so the helpers only have to look at what follows.
    lex_advance(lex);
    switch (c) {
    case 0: t.kind = TOK_EOF; break;
    case '(': t.kind = TOK_LPAREN; break;
    case ')': t.kind = TOK_RPAREN; break;
    case '[': t.kind = TOK_LBRACKET; break;
    case ']': t.kind = TOK_RBRACKET; break;
    case '{': t.kind = TOK_LBRACE; break;
    case '}': t.kind = TOK_RBRACE; break;
    case ',': t.kind = TOK_COMMA; break;
    case '.': t.kind = TOK_DOT; break;
    case ':': t.kind = TOK_COLON; break;
    case ';': t.kind = TOK_SEMICOLON; break;
    case '?': t.kind = TOK_QUESTION; break;
    case '+': t.kind = lex_repeat_or_assign(lex, '+', TOK_PLUS, TOK_INC, TOK_PLUS_ASSIGN); break;
    case '-': {
        if (lex_match(lex, '-')) t.kind = TOK_DEC;
        else if (lex_match(lex, '=')) t.kind = TOK_MINUS_ASSIGN;
        else if (lex_match(lex, '>')) t.kind = TOK_ARROW;
        else t.kind = TOK_MINUS;
    } break;
    case '*': t.kind = lex_assign_variant(lex, TOK_STAR, TOK_MUL_ASSIGN); break;
    case '/': t.kind = lex_assign_variant(lex, TOK_SLASH, TOK_DIV_ASSIGN); break;
    case '%': t.kind = lex_assign_variant(lex, TOK_PERCENT, TOK_MOD_ASSIGN); break;
    case '=': t.kind = lex_assign_variant(lex, TOK_ASSIGN, TOK_EQ); break;
    case '<': t.kind = lex_shift_family(lex, '<', TOK_LT, TOK_LEQ, TOK_LSHIFT, TOK_LSHIFT_ASSIGN); break;
    case '>': t.kind = lex_shift_family(lex, '>', TOK_GT, TOK_GEQ, TOK_RSHIFT, TOK_RSHIFT_ASSIGN); break;
    case '!': t.kind = lex_assign_variant(lex, TOK_NOT, TOK_NEQ); break;
    case '~': t.kind = TOK_BITNOT; break;
    case '&': t.kind = lex_repeat_or_assign(lex, '&', TOK_BITAND, TOK_AND, TOK_AND_ASSIGN); break;
    case '|': t.kind = lex_repeat_or_assign(lex, '|', TOK_BITOR, TOK_OR, TOK_OR_ASSIGN); break;
    case '^': t.kind = lex_assign_variant(lex, TOK_BITXOR, TOK_XOR_ASSIGN); break;
    case '#': {
        t.kind = TOK_HASH;
        // The lexer keeps the flag until the next newline is consumed.
        lex->preproc = t.preproc = true;
        while (lex->at < lex->end && isalpha((unsigned char)*lex->at)) {
            lex_advance(lex);
        }
    } break;
    default: panicf("unrecognized character: '%c', can't match with any of the token kinds", c);
    }
    t.len = (int)(lex->at - t.str);
    return t;
}
/*
 * Lex an entire buffer into an array of tokens.
 *
 * file: name recorded in every token.
 * src:  first character of the buffer.
 * len:  number of characters in the buffer.
 *
 * The returned array always ends with the TOK_EOF token, which is included.
 */
Token_Array lex_file(char *file, char *src, int len) {
    Token_Array tokens = {0};
    Lexer lex = make_lexer(file, src, len);

    bool reached_eof = false;
    while (!reached_eof) {
        Token token = lex_token(&lex);
        vec_push(&tokens, token);
        reached_eof = (token.kind == TOK_EOF);
    }
    return tokens;
}
// Test helper: assert that token `t` has the expected kind, zero-based
// line / column, and exactly the text `text`.
// The length is checked before the contents so that strncmp compares the
// whole of `text` and nothing more.
void assert_token(Token t, Token_Kind kind, char *text, int line, int column) {
assert(t.kind == kind);
assert(t.line == line);
assert(t.column == column);
assert(t.len == (int)strlen(text));
assert(strncmp(t.str, text, t.len) == 0);
}
void lex_test(void) {
char *src = "12 + 34.5 * 6\n- 7 % 2 / 1 == 1 != 2 <= 3 >= 4 && 3 || 4 << 1 >> 2";
Lexer lex = make_lexer("test.c", src, (int)strlen(src));
assert_token(lex_token(&lex), TOK_INT, "12", 0, 0);
assert_token(lex_token(&lex), TOK_PLUS, "+", 0, 3);
assert_token(lex_token(&lex), TOK_FLOAT, "34.5", 0, 5);
assert_token(lex_token(&lex), TOK_STAR, "*", 0, 10);
assert_token(lex_token(&lex), TOK_INT, "6", 0, 12);
assert_token(lex_token(&lex), TOK_MINUS, "-", 1, 0);
assert_token(lex_token(&lex), TOK_INT, "7", 1, 2);
assert_token(lex_token(&lex), TOK_PERCENT, "%", 1, 4);
assert_token(lex_token(&lex), TOK_INT, "2", 1, 6);
assert_token(lex_token(&lex), TOK_SLASH, "/", 1, 8);
assert_token(lex_token(&lex), TOK_INT, "1", 1, 10);
assert_token(lex_token(&lex), TOK_EQ, "==", 1, 12);
assert_token(lex_token(&lex), TOK_INT, "1", 1, 15);
assert_token(lex_token(&lex), TOK_NEQ, "!=", 1, 17);
assert_token(lex_token(&lex), TOK_INT, "2", 1, 20);
assert_token(lex_token(&lex), TOK_LEQ, "<=", 1, 22);
assert_token(lex_token(&lex), TOK_INT, "3", 1, 25);
assert_token(lex_token(&lex), TOK_GEQ, ">=", 1, 27);
assert_token(lex_token(&lex), TOK_INT, "4", 1, 30);
assert_token(lex_token(&lex), TOK_AND, "&&", 1, 32);
assert_token(lex_token(&lex), TOK_INT, "3", 1, 35);
assert_token(lex_token(&lex), TOK_OR, "||", 1, 37);
assert_token(lex_token(&lex), TOK_INT, "4", 1, 40);
assert_token(lex_token(&lex), TOK_LSHIFT, "<<", 1, 42);
assert_token(lex_token(&lex), TOK_INT, "1", 1, 45);
assert_token(lex_token(&lex), TOK_RSHIFT, ">>", 1, 47);
assert_token(lex_token(&lex), TOK_INT, "2", 1, 50);
Token eof = lex_token(&lex);
assert(eof.kind == TOK_EOF);
assert(eof.len == 0);
assert(eof.line == 1);
assert(eof.column == 51);
Token_Array array = lex_file("test.c", src, (int)strlen(src));
assert(array.len == 28);
printf("lexer tests passed\n");
}