Files
lc_lang/src/compiler/lex.c
2024-04-13 15:29:53 +02:00

536 lines
16 KiB
C

// Reports a lexing error attached to the token `pos`.
// `str` is a printf-style format consumed together with the variadic
// arguments by LC_FORMAT, which makes the message available as `s8`.
// The token is turned into an error token and the global error count grows.
LC_FUNCTION void LC_LexingError(LC_Token *pos, const char *str, ...) {
    LC_FORMAT(L->arena, str, s8);
    LC_SendErrorMessage(pos, s8);
    pos->kind = LC_TokenKind_Error;
    L->errors++;
}
// LC_IF(cond, fmt, ...): when `cond` holds, reports a lexing error on the
// token `t` of the expanding function and returns from that function.
// Usable only inside void functions that have an `LC_Token *t` in scope.
#define LC_IF(cond, ...)                \
    do {                                \
        if (!(cond)) break;             \
        LC_LexingError(t, __VA_ARGS__); \
        return;                         \
    } while (0)
// True when `kind` falls inside the enum range
// [LC_TokenKind_Assign, LC_TokenKind_RightShiftAssign].
LC_FUNCTION bool LC_IsAssign(LC_TokenKind kind) {
    return !(kind < LC_TokenKind_Assign || kind > LC_TokenKind_RightShiftAssign);
}
// True for the characters 0-9, a-f and A-F.
LC_FUNCTION bool LC_IsHexDigit(char c) {
    bool is_decimal = c >= '0' && c <= '9';
    bool is_lower_hex = c >= 'a' && c <= 'f';
    bool is_upper_hex = c >= 'A' && c <= 'F';
    return is_decimal || is_lower_hex || is_upper_hex;
}
// True only for the characters '0' and '1'.
LC_FUNCTION bool LC_IsBinDigit(char c) {
    return c == '0' || c == '1';
}
// Maps a digit character to its numeric value: '0'-'9' give 0-9 and
// 'a'-'f' / 'A'-'F' give 10-15. Any other character yields 255.
LC_FUNCTION uint64_t LC_MapCharToNumber(char c) {
    if (c >= '0' && c <= '9') return (uint64_t)(c - '0');
    if (c >= 'a' && c <= 'f') return (uint64_t)(c - 'a') + 10;
    if (c >= 'A' && c <= 'F') return (uint64_t)(c - 'A') + 10;
    return 255;
}
// Translates the character that follows a backslash into the value of the
// escape sequence it denotes. Returns UINT64_MAX for an unknown escape.
LC_FUNCTION uint64_t LC_GetEscapeCode(char c) {
    static const struct {
        char letter;
        unsigned char code;
    } escapes[] = {
        {'a', '\a'},  {'b', '\b'},  {'e', 0x1B},  {'f', '\f'},
        {'n', '\n'},  {'r', '\r'},  {'t', '\t'},  {'v', '\v'},
        {'\\', '\\'}, {'\'', '\''}, {'\"', '\"'}, {'0', '\0'},
    };
    int count = (int)(sizeof(escapes) / sizeof(escapes[0]));
    for (int i = 0; i < count; i += 1) {
        if (escapes[i].letter == c) return escapes[i].code;
    }
    return UINT64_MAX;
}
// Returns the source-level spelling of a character that has to be escaped
// (for example a newline becomes the two characters `\n`). The ESC byte is
// spelled `\x1B`. Characters that need no escaping yield an empty string.
LC_FUNCTION LC_String LC_GetEscapeString(char c) {
    if (c == '\a') return LC_Lit("\\a");
    if (c == '\b') return LC_Lit("\\b");
    if (c == 0x1B) return LC_Lit("\\x1B");
    if (c == '\f') return LC_Lit("\\f");
    if (c == '\n') return LC_Lit("\\n");
    if (c == '\r') return LC_Lit("\\r");
    if (c == '\t') return LC_Lit("\\t");
    if (c == '\v') return LC_Lit("\\v");
    if (c == '\\') return LC_Lit("\\\\");
    if (c == '\'') return LC_Lit("\\'");
    if (c == '\"') return LC_Lit("\\\"");
    if (c == '\0') return LC_Lit("\\0");
    return LC_Lit("");
}
// Moves the cursor one character forward while keeping line/column in sync.
// Does nothing once the terminating zero has been reached.
LC_FUNCTION void LC_LexAdvance(LC_Lex *x) {
    char current = x->at[0];
    if (current == 0) return;
    if (current == '\n') {
        // the character after a newline sits in column 1 of the next line
        x->line += 1;
        x->column = 1;
    } else {
        x->column += 1;
    }
    x->at += 1;
}
// Skips characters for as long as LC_IsWhitespace accepts the current one.
LC_FUNCTION void LC_EatWhitespace(LC_Lex *x) {
    for (;;) {
        if (!LC_IsWhitespace(x->at[0])) break;
        LC_LexAdvance(x);
    }
}
// Consumes the rest of an identifier: underscores and alphanumeric characters.
LC_FUNCTION void LC_EatIdent(LC_Lex *x) {
    for (;;) {
        char c = x->at[0];
        if (c != '_' && !LC_IsAlphanumeric(c)) return;
        LC_LexAdvance(x);
    }
}
// Sets the token length to the distance between the token start (`t->str`)
// and the current cursor position, asserting that it stays below 2e9.
LC_FUNCTION void LC_SetTokenLen(LC_Lex *x, LC_Token *t) {
    char *token_begin = t->str;
    char *token_end = x->at;
    t->len = (int)(token_end - token_begin);
    LC_ASSERT(NULL, t->len < 2000000000);
}
// Advances up to the next occurrence of `c` (or the end of input) and then
// one step further, so that `c` itself is consumed as well.
LC_FUNCTION void LC_EatUntilIncluding(LC_Lex *x, char c) {
    for (;;) {
        char current = x->at[0];
        if (current == 0 || current == c) break;
        LC_LexAdvance(x);
    }
    LC_LexAdvance(x);
}
// @todo: add temporary allocation + copy at end to perm
//
// Converts the `len` digit characters at `string` into a big integer in the
// given `base` (2 to 16). Digits are consumed from the last character to the
// first, each one multiplied by the running power of the base and added to
// the total. Every digit is asserted to be valid for the base.
LC_FUNCTION LC_BigInt LC_LexBigInt(char *string, int len, uint64_t base) {
    LC_ASSERT(NULL, base >= 2 && base <= 16);
    LC_BigInt place_value = LC_Bigint_u64(1);
    LC_BigInt radix = LC_Bigint_u64(base);
    LC_BigInt result = LC_Bigint_u64(0);
    LC_BigInt scratch = {0};
    for (int i = len; i-- > 0;) {
        uint64_t u = LC_MapCharToNumber(string[i]);
        LC_ASSERT(NULL, u < base);

        // term = digit * base^k
        LC_BigInt digit = LC_Bigint_u64(u);
        LC_Bigint_mul(&scratch, &digit, &place_value);
        LC_BigInt term = scratch;

        // result += term
        LC_Bigint_add(&scratch, &result, &term);
        result = scratch;

        // base^k -> base^(k+1)
        LC_Bigint_mul(&scratch, &place_value, &radix);
        place_value = scratch;
    }
    return result;
}
// Lexes a block comment, with support for nesting. On entry the leading '/'
// has already been consumed and the cursor sits on the '*' of "/*".
//
// "/**" starts a doc comment; "/** file" and "/** package" select the file and
// package doc comment kinds. "/**/" is an ordinary empty comment, not the
// start of a doc comment.
//
// On success the token text starts 2 characters after the token start and
// ends right before the closing "*/", and the cursor is placed after "*/".
// Reaching end of input first reports "Unclosed block comment".
LC_FUNCTION void LC_LexNestedComments(LC_Lex *x, LC_Token *t) {
    t->kind = LC_TokenKind_Comment;
    LC_LexAdvance(x); // skip the '*' of the opening "/*"

    // The second '*' must not belong to the terminator: without the at[1]
    // check "/**/" would lose its "*/" and swallow the code that follows.
    if (x->at[0] == '*' && x->at[1] != '/') {
        LC_LexAdvance(x);
        t->kind = LC_TokenKind_DocComment;
        if (x->at[0] == ' ' && x->at[1] == 'f' && x->at[2] == 'i' && x->at[3] == 'l' && x->at[4] == 'e') {
            t->kind = LC_TokenKind_FileDocComment;
        }
        if (x->at[0] == ' ' && x->at[1] == 'p' && x->at[2] == 'a' && x->at[3] == 'c' && x->at[4] == 'k' && x->at[5] == 'a' && x->at[6] == 'g' && x->at[7] == 'e') {
            t->kind = LC_TokenKind_PackageDocComment;
        }
    }

    int depth = 0; // number of nested "/*" that are still open
    for (;;) {
        if (x->at[0] == '*' && x->at[1] == '/') {
            if (depth <= 0) break;
            depth -= 1;
            // Consume the '*' here and the '/' below, so that the '/' of a
            // nested terminator cannot pair up with a following '*' and be
            // mistaken for a new "/*" opener.
            LC_LexAdvance(x);
        } else if (x->at[0] == '/' && x->at[1] == '*') {
            depth += 1;
            LC_LexAdvance(x);
        }
        LC_IF(x->at[0] == 0, "Unclosed block comment");
        LC_LexAdvance(x);
    }

    t->str += 2;
    LC_SetTokenLen(x, t);
    // step over the closing "*/"
    LC_LexAdvance(x);
    LC_LexAdvance(x);
}
// Lexes a string literal whose opening delimiter has already been consumed.
//   kind == LC_TokenKind_RawString: everything up to the closing '`'.
//   kind == LC_TokenKind_String:    everything up to the closing '"'; a
//       backslash escapes the character after it, and a raw newline inside
//       the literal is an error.
// On success the token text covers the contents without the delimiters.
// Reaching end of input before the closing delimiter is reported as an error
// for both kinds.
LC_FUNCTION void LC_LexStringLiteral(LC_Lex *x, LC_Token *t, LC_TokenKind kind) {
    t->kind = kind;
    if (kind == LC_TokenKind_RawString) {
        while (x->at[0] != 0 && x->at[0] != '`') LC_LexAdvance(x);
        // Without this check an unterminated raw string would be accepted and
        // the `len -= 2` below would cut off its last real character.
        LC_IF(x->at[0] == 0, "reached end of file during raw string lexing");
        LC_LexAdvance(x); // closing '`'
    } else if (kind == LC_TokenKind_String) {
        for (;;) {
            LC_IF(x->at[0] == '\n', "got a new line while parsing a '\"' string literal");
            LC_IF(x->at[0] == 0, "reached end of file during string lexing");
            if (x->at[0] == '"') break;
            // A backslash consumes the character it escapes, whatever it is.
            // Handling only \" would make the second backslash of "\\" escape
            // the closing quote. Newline and end of input are left alone so
            // the checks above still report them.
            if (x->at[0] == '\\' && x->at[1] != 0 && x->at[1] != '\n') LC_LexAdvance(x);
            LC_LexAdvance(x);
        }
        LC_LexAdvance(x); // closing '"'
    } else {
        LC_IF(1, "internal compiler error: unhandled case in %s", __FUNCTION__);
    }
    LC_SetTokenLen(x, t);
    // drop both delimiters from the token text
    t->len -= 2;
    t->str += 1;
}
// Lexes a character literal whose opening quote has already been consumed.
// Accepts one UTF-8 encoded character or one backslash escape, followed by
// the closing quote. The value is stored in `t->i` and the token text covers
// the contents without the quotes.
//
// NOTE(review): for a multi-byte character the stored value is the UTF-8
// bytes packed least-significant byte first (same as the previous code on
// little-endian machines), not the decoded code point - confirm that this is
// what consumers expect.
LC_FUNCTION void LC_LexUnicodeLiteral(LC_Lex *x, LC_Token *t) {
    t->kind = LC_TokenKind_Unicode;

    // '' has no character in it. ''' keeps lexing as a quote character, as it
    // did before.
    LC_IF(x->at[0] == '\'' && x->at[1] != '\'', "empty unicode literal");

    LC_UTF32Result decode = LC_ConvertUTF8ToUTF32(x->at, 4);
    LC_IF(decode.error, "invalid utf8 sequence");

    // Pack the bytes with shifts: well defined on every platform, unlike
    // reading a byte buffer through a uint64_t pointer.
    // Assumes decode.advance is at most 4 (the limit passed above).
    uint64_t result = 0;
    for (int i = 0; i < decode.advance; i += 1) {
        result |= (uint64_t)(uint8_t)x->at[0] << (8 * i);
        LC_LexAdvance(x);
    }

    if (result == '\\') {
        LC_ASSERT(NULL, decode.advance == 1);
        result = LC_GetEscapeCode(x->at[0]);
        LC_IF(result == UINT64_MAX, "invalid escape code");
        LC_LexAdvance(x);
    }

    LC_IF(x->at[0] != '\'', "unclosed unicode literal");
    LC_Bigint_init_signed(&t->i, result);
    LC_LexAdvance(x); // closing quote
    LC_SetTokenLen(x, t);
    // drop both quotes from the token text
    t->str += 1;
    t->len -= 2;
}
// Lexes the remainder of a decimal number. The token starts out as an Int and
// becomes a Float when a '.' is encountered; a second '.' is an error.
// The parsed value ends up in `t->i` (Int) or `t->f64` (Float).
LC_FUNCTION void LC_LexIntOrFloat(LC_Lex *x, LC_Token *t) {
    t->kind = LC_TokenKind_Int;
    while (x->at[0] == '.' || LC_IsDigit(x->at[0])) {
        if (x->at[0] == '.') {
            LC_IF(t->kind == LC_TokenKind_Float, "failed to parse a floating point number, invalid format, found multiple '.'");
            t->kind = LC_TokenKind_Float;
        }
        LC_LexAdvance(x);
    }
    LC_SetTokenLen(x, t);

    if (t->kind == LC_TokenKind_Float) {
        t->f64 = LC_ParseFloat(t->str, t->len);
    } else if (t->kind == LC_TokenKind_Int) {
        t->i = LC_LexBigInt(t->str, t->len, 10);
    } else {
        LC_IF(1, "internal compiler error: unhandled case in %s", __FUNCTION__);
    }
}
// One- or two-character operator: yields `tk1` and consumes a character when
// the next character is `c`, otherwise yields `tk0`.
LC_FUNCTION void LC_LexCase2(LC_Lex *x, LC_Token *t, LC_TokenKind tk0, char c, LC_TokenKind tk1) {
    if (x->at[0] != c) {
        t->kind = tk0;
        return;
    }
    LC_LexAdvance(x);
    t->kind = tk1;
}
// Operator with two possible second characters: `c0` selects `tk0`, `c1`
// selects `tk1` (the character is consumed in both cases); anything else
// leaves the single-character kind `tk`.
LC_FUNCTION void LC_LexCase3(LC_Lex *x, LC_Token *t, LC_TokenKind tk, char c0, LC_TokenKind tk0, char c1, LC_TokenKind tk1) {
    char next = x->at[0];
    if (next == c0) {
        t->kind = tk0;
    } else if (next == c1) {
        t->kind = tk1;
    } else {
        t->kind = tk;
        return;
    }
    LC_LexAdvance(x);
}
// Operator of up to three characters: `c0` selects `tk0`; `c1` selects `tk1`,
// or `tk2` when it is additionally followed by `c2`; anything else leaves the
// single-character kind `tk`.
LC_FUNCTION void LC_LexCase4(LC_Lex *x, LC_Token *t, LC_TokenKind tk, char c0, LC_TokenKind tk0, char c1, LC_TokenKind tk1, char c2, LC_TokenKind tk2) {
    char next = x->at[0];
    if (next == c0) {
        LC_LexAdvance(x);
        t->kind = tk0;
        return;
    }
    if (next == c1) {
        LC_LexAdvance(x);
        LC_LexCase2(x, t, tk1, c2, tk2);
        return;
    }
    t->kind = tk;
}
// Lexes the next token from `x` into `t`.
//
// Leading whitespace is skipped, `t` is zeroed and stamped with its start
// position, and the first character selects the kind of token. Plain comments
// (LC_TokenKind_Comment) are never handed to the caller: they are skipped by
// looping, so a long run of consecutive comments does not deepen the call
// stack. Doc comments are returned as tokens.
//
// On a lexing error LC_IF reports it, turns `t` into an error token and
// returns immediately, leaving the token length untouched.
LC_FUNCTION void LC_LexNext(LC_Lex *x, LC_Token *t) {
    for (;;) {
        LC_EatWhitespace(x);
        LC_MemoryZero(t, sizeof(LC_Token));
        t->str = x->at;
        t->line = x->line + 1;
        t->column = x->column;
        t->lex = x;

        // `c` stays on the first character of the token, the cursor moves past it
        char *c = x->at;
        LC_LexAdvance(x);

        switch (c[0]) {
        case 0: t->kind = LC_TokenKind_EOF; break;
        case '(': t->kind = LC_TokenKind_OpenParen; break;
        case ')': t->kind = LC_TokenKind_CloseParen; break;
        case '{': t->kind = LC_TokenKind_OpenBrace; break;
        case '}': t->kind = LC_TokenKind_CloseBrace; break;
        case '[': t->kind = LC_TokenKind_OpenBracket; break;
        case ']': t->kind = LC_TokenKind_CloseBracket; break;
        case ',': t->kind = LC_TokenKind_Comma; break;
        case ':': t->kind = LC_TokenKind_Colon; break;
        case ';': t->kind = LC_TokenKind_Semicolon; break;
        case '~': t->kind = LC_TokenKind_Neg; break;
        case '#': t->kind = LC_TokenKind_Hash; break;
        case '@': t->kind = LC_TokenKind_Note; break;
        case '\'': LC_LexUnicodeLiteral(x, t); break;
        case '"': LC_LexStringLiteral(x, t, LC_TokenKind_String); break;
        case '`': LC_LexStringLiteral(x, t, LC_TokenKind_RawString); break;
        case '=': LC_LexCase2(x, t, LC_TokenKind_Assign, '=', LC_TokenKind_Equals); break;
        case '!': LC_LexCase2(x, t, LC_TokenKind_Not, '=', LC_TokenKind_NotEquals); break;
        case '*': LC_LexCase2(x, t, LC_TokenKind_Mul, '=', LC_TokenKind_MulAssign); break;
        case '%': LC_LexCase2(x, t, LC_TokenKind_Mod, '=', LC_TokenKind_ModAssign); break;
        case '+': LC_LexCase2(x, t, LC_TokenKind_Add, '=', LC_TokenKind_AddAssign); break;
        case '-': LC_LexCase2(x, t, LC_TokenKind_Sub, '=', LC_TokenKind_SubAssign); break;
        case '^': LC_LexCase2(x, t, LC_TokenKind_BitXor, '=', LC_TokenKind_BitXorAssign); break;
        case '&': LC_LexCase3(x, t, LC_TokenKind_BitAnd, '=', LC_TokenKind_BitAndAssign, '&', LC_TokenKind_And); break;
        case '|': LC_LexCase3(x, t, LC_TokenKind_BitOr, '=', LC_TokenKind_BitOrAssign, '|', LC_TokenKind_Or); break;
        case '>': LC_LexCase4(x, t, LC_TokenKind_GreaterThen, '=', LC_TokenKind_GreaterThenEq, '>', LC_TokenKind_RightShift, '=', LC_TokenKind_RightShiftAssign); break;
        case '<': LC_LexCase4(x, t, LC_TokenKind_LesserThen, '=', LC_TokenKind_LesserThenEq, '<', LC_TokenKind_LeftShift, '=', LC_TokenKind_LeftShiftAssign); break;
        case '.': {
            t->kind = LC_TokenKind_Dot;
            if (x->at[0] == '.' && x->at[1] == '.') {
                t->kind = LC_TokenKind_ThreeDots;
                LC_LexAdvance(x);
                LC_LexAdvance(x);
            }
        } break;
        case '0': {
            // 0x... and 0b... literals; a bare prefix without digits is an error
            if (x->at[0] == 'x') {
                t->kind = LC_TokenKind_Int;
                LC_LexAdvance(x);
                while (LC_IsHexDigit(x->at[0])) LC_LexAdvance(x);
                LC_SetTokenLen(x, t);
                LC_IF(t->len < 3, "invalid hex number");
                t->i = LC_LexBigInt(t->str + 2, t->len - 2, 16);
                break;
            }
            if (x->at[0] == 'b') {
                t->kind = LC_TokenKind_Int;
                LC_LexAdvance(x);
                while (LC_IsBinDigit(x->at[0])) LC_LexAdvance(x);
                LC_SetTokenLen(x, t);
                LC_IF(t->len < 3, "invalid binary number");
                t->i = LC_LexBigInt(t->str + 2, t->len - 2, 2);
                break;
            }
        } // @fallthrough
        case '1': case '2': case '3':
        case '4': case '5': case '6':
        case '7': case '8': case '9': {
            LC_LexIntOrFloat(x, t);
        } break;
        case 'A': case 'a': case 'B': case 'b': case 'C': case 'c': case 'D': case 'd':
        case 'E': case 'e': case 'F': case 'f': case 'G': case 'g': case 'H': case 'h':
        case 'I': case 'i': case 'J': case 'j': case 'K': case 'k': case 'L': case 'l':
        case 'M': case 'm': case 'N': case 'n': case 'O': case 'o': case 'P': case 'p':
        case 'Q': case 'q': case 'R': case 'r': case 'S': case 's': case 'T': case 't':
        case 'U': case 'u': case 'V': case 'v': case 'W': case 'w': case 'X': case 'x':
        case 'Y': case 'y': case 'Z': case 'z':
        case '_': {
            t->kind = LC_TokenKind_Ident;
            LC_EatIdent(x);
        } break;
        case '/': {
            t->kind = LC_TokenKind_Div;
            if (x->at[0] == '=') {
                t->kind = LC_TokenKind_DivAssign;
                LC_LexAdvance(x);
            } else if (x->at[0] == '/') {
                // line comment: runs up to, but not including, the newline
                t->kind = LC_TokenKind_Comment;
                LC_LexAdvance(x);
                while (x->at[0] != '\n' && x->at[0] != 0) LC_LexAdvance(x);
                LC_SetTokenLen(x, t);
            } else if (x->at[0] == '*') {
                LC_LexNestedComments(x, t);
            }
        } break;
        default: LC_IF(1, "invalid character");
        }

        // Tokens that did not set their own length span everything consumed.
        // String tokens are exempt because an empty string legitimately has
        // length 0.
        if (t->len == 0 && t->kind != LC_TokenKind_String && t->kind != LC_TokenKind_RawString) LC_SetTokenLen(x, t);

        // plain comments are dropped: go around again and reuse `t`
        if (t->kind != LC_TokenKind_Comment) return;
    }
}
// Creates a lexer for the zero-terminated text `str` (attributed to `file`,
// with `line` as the initial line counter) and lexes all of it. Tokens are
// pushed one after another onto `L->lex_arena`; `x->tokens` points at the
// first one and `x->token_count` counts them, the final EOF token included.
LC_FUNCTION LC_Lex *LC_LexStream(char *file, char *str, int line) {
    LC_Lex *x = LC_PushStruct(L->lex_arena, LC_Lex);
    x->file = LC_ILit(file);
    x->begin = str;
    x->at = str;
    x->line = line;

    LC_Token *t;
    do {
        t = LC_PushStruct(L->lex_arena, LC_Token);
        if (!x->tokens) x->tokens = t;
        x->token_count += 1;
        LC_LexNext(x, t);
    } while (t->kind != LC_TokenKind_EOF);
    return x;
}
// Builds a short source excerpt around `token` for diagnostics: the line
// before it, the line holding it and the line after it, each prefixed with
// "> ". The token itself is wrapped in green terminal color codes, or in
// ">>>>" / "<<<<" markers when colored output is turned off.
LC_FUNCTION LC_String LC_GetTokenLine(LC_Token *token) {
    LC_Lex *x = token->lex;
    LC_String content = LC_MakeFromChar(x->begin);
    LC_StringList lines = LC_Split(L->arena, content, LC_Lit("\n"), 0);

    // excerpt[0] = previous line, excerpt[1] = token line, excerpt[2] = next line
    LC_String excerpt[3] = {LC_MakeEmptyString()};

    int line_no = 1;
    for (LC_StringNode *node = lines.first; node; node = node->next, line_no += 1) {
        LC_String text = node->string;

        if (line_no == token->line - 1) {
            excerpt[0] = LC_Format(L->arena, "> %.*s\n", LC_Expand(text));
            continue;
        }
        if (line_no == token->line + 1) {
            excerpt[2] = LC_Format(L->arena, "> %.*s\n", LC_Expand(text));
            break; // nothing after the following line is of interest
        }
        if (line_no != token->line) continue;

        // cut the line into: text before the token, the token, text after it
        int token_offset = (int)(token->str - text.str);
        LC_String before = LC_GetPrefix(text, token_offset);
        LC_String from_token = LC_Skip(text, token_offset);
        LC_String highlighted = LC_GetPrefix(from_token, token->len);
        LC_String after = LC_Skip(from_token, token->len);

        const char *mark_open = L->use_colored_terminal_output ? "\033[32m" : ">>>>";
        const char *mark_close = L->use_colored_terminal_output ? "\033[0m" : "<<<<";
        excerpt[1] = LC_Format(L->arena, "> %.*s%s%.*s%s%.*s\n", LC_Expand(before), mark_open, LC_Expand(highlighted), mark_close, LC_Expand(after));
    }

    return LC_Format(L->arena, "%.*s%.*s%.*s", LC_Expand(excerpt[0]), LC_Expand(excerpt[1]), LC_Expand(excerpt[2]));
}
// Post-processes every token of `x`:
//   - String tokens have their escape sequences resolved and the result is
//     interned; an unknown escape is reported as a lexing error.
//   - Note, Ident and RawString tokens have their text interned as is.
//   - Ident tokens whose interned pointer lies within the keyword range are
//     turned into Keyword tokens, or AddPtr for the `L->kaddptr` keyword.
LC_FUNCTION void LC_InternTokens(LC_Lex *x) {
    // @todo: add scratch, we can dump the LC_PushArray strings
    for (int token_index = 0; token_index < x->token_count; token_index += 1) {
        LC_Token *t = x->tokens + token_index;

        if (t->kind == LC_TokenKind_String) {
            // the unescaped text is never longer than the source text
            char *unescaped = LC_PushArray(L->arena, char, t->len);
            int unescaped_len = 0;
            for (int src = 0; src < t->len; src += 1) {
                char c0 = t->str[src];
                char c1 = (src + 1 < t->len) ? t->str[src + 1] : 0;
                if (c0 == '\\') {
                    uint64_t code = LC_GetEscapeCode(c1);
                    if (code == UINT64_MAX) {
                        LC_LexingError(t, "invalid escape code in string '%c%c'", c0, c1);
                        break;
                    }
                    c0 = (char)code;
                    src += 1; // the escaped character is consumed as well
                }
                unescaped[unescaped_len++] = c0;
            }
            t->ident = LC_InternStrLen(unescaped, unescaped_len);
            continue;
        }

        if (t->kind != LC_TokenKind_Ident) {
            if (t->kind == LC_TokenKind_Note || t->kind == LC_TokenKind_RawString) {
                t->ident = LC_InternStrLen(t->str, t->len);
            }
            continue;
        }

        // identifiers: intern first, then promote keywords
        t->ident = LC_InternStrLen(t->str, t->len);
        bool is_keyword = t->ident >= L->first_keyword && t->ident <= L->last_keyword;
        if (!is_keyword) continue;
        t->kind = LC_TokenKind_Keyword;
        if (L->kaddptr == t->ident) t->kind = LC_TokenKind_AddPtr;
    }
}
#undef LC_IF