Files
corelang/core_lexing.cpp
2023-04-21 15:19:38 +02:00

789 lines
24 KiB
C++

// Token classification helpers. Assignment kinds and comparison kinds are
// laid out contiguously in the Token_Kind enum, so membership is a range check.
force_inline B32 token_is_assign(Token_Kind token) { return token >= TK_FirstAssign && token <= TK_LastAssign; }
force_inline B32 token_is_assign(Token *token) { return token_is_assign(token->kind); }
force_inline B32 token_is_compare(Token_Kind token) { return token >= TK_FirstCompare && token <= TK_LastCompare; }
force_inline B32 token_is_compare(Token *token) { return token_is_compare(token->kind); }
// Byte currently under the stream cursor.
CORE_Static U8
lexc(Lex_Stream *s) {
    U8 *cursor = s->stream.str + s->iter;
    return *cursor;
}
// Byte at signed offset `i` from the cursor (used for lookahead).
CORE_Static U8
lexci(Lex_Stream *s, S32 i) {
    U8 *cursor = s->stream.str + s->iter;
    return cursor[i];
}
// Pointer to the cursor's current position inside the stream buffer.
CORE_Static U8 *
lexcp(Lex_Stream *s) {
    return &s->stream.str[s->iter];
}
// Only space and carriage return count as plain whitespace here; '\t' and
// '\n' are significant to the indentation-tracking lexer and are handled
// by dedicated switch cases in lex__stream.
CORE_Static B32
lex_is_whitespace(U8 c) {
    switch (c) {
        case ' ':
        case '\r': return true;
        default: return false;
    }
}
// ASCII letter check (either case).
CORE_Static B32
lex_is_alphabetic(U8 c) {
    if (c >= 'A' && c <= 'Z') return true;
    if (c >= 'a' && c <= 'z') return true;
    return false;
}
// ASCII decimal digit check.
CORE_Static B32
lex_is_numeric(U8 c) {
    return ('0' <= c) && (c <= '9');
}
// Hexadecimal digit check: decimal digit or a-f in either case.
CORE_Static B32
lex_is_numeric_base16(U8 c) {
    if (c >= '0' && c <= '9') return true;
    if (c >= 'A' && c <= 'F') return true;
    return c >= 'a' && c <= 'f';
}
// Letter or digit; '_' is handled separately at call sites.
CORE_Static B32
lex_is_alphanumeric(U8 c) {
    return lex_is_alphabetic(c) || lex_is_numeric(c);
}
// Set token->len to the byte distance from the token's start to the current
// cursor. The cursor must not be behind the token start.
CORE_Static void
lex_set_len(Lex_Stream *s, Token *token) {
    assert(lexcp(s) >= token->str);
    token->len = lexcp(s) - token->str;
}
// Intern every keyword and record the interned pointers of the first and
// last one; lex_is_keyword later classifies identifiers by testing whether
// their interned pointer falls inside that range. Presumably the intern
// table allocates strings in increasing address order — TODO confirm.
// NOTE(review): if `keywords` is empty, last_keyword is assigned from a
// zero-initialized Intern_String — confirm callers never pass an empty array.
CORE_Static void
lex_set_keywords(Core_Ctx *lexer, Array<String> keywords) {
    Intern_String keyword = {};
    For(keywords) {
        keyword = intern_string(&lexer->interns, it);
        if (&it == keywords.begin())
            lexer->interns.first_keyword = keyword.str;
    }
    lexer->interns.last_keyword = keyword.str;
}
// An interned string is a keyword iff its pointer lies within the
// [first_keyword, last_keyword] range recorded by lex_set_keywords.
// Pointer comparison is valid only because interning dedupes strings and
// the keywords were interned first — see lex_set_keywords.
CORE_Static B32
lex_is_keyword(Intern_Table *lexer, Intern_String keyword) {
    B32 result = keyword.str >= lexer->first_keyword && keyword.str <= lexer->last_keyword;
    return result;
}
// Mark a token as erroneous and attach the diagnostic message.
CORE_Static void
token_error(Token *t, String error_val) {
    t->kind = TK_Error;
    t->error_val = error_val;
}
// Parse t->str[0..t->len) as an unsigned integer literal in `base` into an
// arbitrary-precision integer stored on the permanent arena. Digits are
// consumed least-significant first while a running place value `m` is
// multiplied by the base. Assumes callers already validated that every byte
// is a legal digit for `base` (10, 16, or 2 in this file).
CORE_Static void
lex_parse_u64(Core_Ctx *lexer, Token *t, S64 base) {
    // BigInt temporaries live on the scratch arena and die at scope exit.
    Scoped_Arena _scope(lexer->scratch);
    Set_BigInt_Arena(lexer->scratch);
    t->kind = TK_Integer;
    BigInt m = bigint_u64(1);        // place value: base^position
    BigInt base_mul = bigint_u64(base);
    BigInt result = bigint_u64(0);
    for (S64 i = t->len - 1; i >= 0; --i) {
        U64 value = t->str[i];
        // Map ASCII digit to its numeric value ('a'/'A' ranges start at 10).
        if (t->str[i] >= 'a') value = value - 'a' + 10;
        else if (t->str[i] >= 'A') value = value - 'A' + 10;
        else value -= '0';
        BigInt val = bigint_u64(value);
        BigInt new_val = bigint_mul(&val, &m);
        result = bigint_add(&result, &new_val);
        m = bigint_mul(&m, &base_mul);
    }
    // Persist the result past the scratch scope.
    t->int_val = bigint_copy(lexer->perm, &result);
}
// Parse the token's text as a double. strtod needs a NUL-terminated string
// and the token text is a non-terminated slice of the stream, so copy it
// into a local buffer first. Clamp to the buffer's real capacity
// (sizeof(buffer) - 1 == 127 content bytes plus the terminator); the old
// clamp of 126 wasted one byte of capacity.
CORE_Static void
lex_parse_f64(Token *t) {
    t->kind = TK_Float;
    char buffer[128];
    S64 len = clamp_top((int)t->len, (int)sizeof(buffer) - 1);
    memory_copy(buffer, t->str, len);
    buffer[len] = 0;
    t->f64_val = strtod(buffer, 0);
}
// Advance the cursor one byte, maintaining the line number and line start
// pointer used for diagnostics. No-op at end of stream.
CORE_Static void
lex_advance(Lex_Stream *s) {
    if (s->iter >= s->stream.len) {
        return;
    }
    else if (lexc(s) == '\n') {
        s->iter++;
        s->line++;
        // After the increment the cursor sits on the first byte of the new
        // line, which is exactly what line_begin should record.
        s->line_begin = lexcp(s);
    }
    else {
        s->iter++;
    }
}
// Scan forward until the closing quote `c`, honoring backslash escapes.
// On success the cursor ends just past the closing quote and the token
// length (including both quotes — the caller trims them) is set; hitting
// the 0 sentinel flags the token as TK_Error instead.
// NOTE(review): a '\\' unconditionally consumes the following byte, so a
// backslash as the very last stream byte skips the sentinel check for one
// iteration — confirm streams are always NUL-terminated with slack.
CORE_Static void
lex_parse_string(Lex_Stream *s, Token *t, U8 c) {
    for (;;) {
        if (lexc(s) == '\\') lex_advance(s);  // skip the escape introducer...
        else if (lexc(s) == c) break;         // unescaped closing quote
        else if (lexc(s) == 0) {
            token_error(t, "Unterminated string, reached end of file"_s);
            break;
        }
        lex_advance(s);  // ...and/or the current (possibly escaped) byte
    }
    if (t->kind != TK_Error) {
        lex_advance(s);     // consume the closing quote
        lex_set_len(s, t);
    }
}
// Consume the remainder of an identifier ([A-Za-z0-9_]*) and set the token
// length; the identifier's first byte was already consumed by the caller.
// `table` is unused here but kept so the signature stays stable for callers.
CORE_Static void
lex_parse_ident(Intern_Table *table, Lex_Stream *s, Token *t) {
    for (;;) {
        U8 c = lexc(s);
        if (!lex_is_alphanumeric(c) && c != '_') break;
        lex_advance(s);
    }
    lex_set_len(s, t);
}
// CASE2: switch case for a single-char operator that may be followed by '='
// to form its compound-assignment variant (e.g. '*' -> TK_Mul / TK_MulAssign).
// Expects `s` (Lex_Stream*) and `t` (Token) in scope; the operator byte
// itself was already consumed.
#define CASE2(op, OpName, Assign) \
    case op: \
        if (lexc(s) == '=') { \
            lex_advance(s); \
            t.kind = Assign; \
        } \
        else { \
            t.kind = OpName; \
        } \
        break
// CASE3: like CASE2 but the operator may also be doubled, yielding a third
// kind (e.g. '+' -> TK_Add / TK_AddAssign / TK_Increment,
// '&' -> TK_BitAnd / TK_AndAssign / TK_And).
#define CASE3(op, OpName, Assign, Incr) \
    case op: \
        if (lexc(s) == '=') { \
            lex_advance(s); \
            t.kind = Assign; \
        } \
        else if (lexc(s) == op) { \
            lex_advance(s); \
            t.kind = Incr; \
        } \
        else { \
            t.kind = OpName; \
        } \
        break
// Construct a zero-length token anchored at `str` with the given source
// location; each token gets a unique debug id from the lexer's counter.
CORE_Static Token
token_make(Core_Ctx *lexer, U8 *str, Intern_String file, int line, U8 *line_begin) {
    Token t = {};
    t.str = str;
    t.file = file;
    t.line = line;
    t.line_begin = line_begin;
    t.di = lexer->token_debug_ids++;
    return t;
}
// Convenience overload: make a token anchored at the current stream cursor.
CORE_Static Token
token_make(Core_Ctx *lexer) {
    return token_make(lexer, lexcp(&lexer->stream), lexer->stream.file, lexer->stream.line, lexer->stream.line_begin);
}
// Top of the indentation stack, or a process-global fallback SAME_SCOPE
// token when the stack is empty (pctx appears to be a global context —
// presumably its same_scope_token has indent 0; TODO confirm).
CORE_Static Token *
lex_last_indent_token(Lex_Stream *s) {
    if (s->indent_stack.len > 0) {
        return *s->indent_stack.back();
    }
    return &pctx->same_scope_token;
}
// True only for the three synthetic indentation tokens.
CORE_Static B32
lex_is_scope(Token *t) {
    switch (t->kind) {
        case OPEN_SCOPE:
        case CLOSE_SCOPE:
        case SAME_SCOPE: return true;
        default: return false;
    }
}
// Append a copy of `token` to ctx->tokens, which is a view over
// ctx->token_arena: push storage from the arena, then patch the view's
// len/cap and re-sync its data pointer to the arena base.
// NOTE(review): this assumes token_arena holds only tokens and its memory
// block stays contiguous across pushes — confirm the arena's growth policy.
CORE_Static void
lex_add_token(Core_Ctx *ctx, Token *token) {
    Token *top = (Token *)arena_push_size(&ctx->token_arena, sizeof(Token));
    *top = *token;
    ctx->tokens.len += 1;
    ctx->tokens.cap += 1;
    ctx->tokens.data = (Token *)ctx->token_arena.memory.data;
}
// Unwind the indentation stack down to t->indent, emitting one CLOSE_SCOPE
// token per level popped. Landing exactly on a recorded level additionally
// emits SAME_SCOPE; dedenting to a depth that no stack entry matches is a
// "Bad indentation" error.
CORE_Static void
lex_unwind_indent_stack(Core_Ctx *ctx, Token *t, Lex_Stream *s) {
    for (S64 i = s->indent_stack.len - 1; i >= 0; i -= 1) {
        auto it = s->indent_stack.data[i];
        assert(lex_is_scope(it));
        if (it->indent == t->indent) {
            // Back at a known level: this line continues that scope.
            t->kind = SAME_SCOPE;
            lex_add_token(ctx, t);
            break;
        }
        else if (it->indent < t->indent) {
            // The new indent sits between two recorded levels.
            token_error(t, "Bad indentation"_s);
            lex_add_token(ctx, t);
            break;
        }
        else {
            // Still deeper than the target: close this scope and continue.
            s->indent_stack.pop();
            t->kind = CLOSE_SCOPE;
            lex_add_token(ctx, t);
        }
    }
}
// Tokenize the whole input in lexer->stream, appending tokens — including
// the synthetic OPEN_SCOPE/SAME_SCOPE/CLOSE_SCOPE indentation tokens — to
// lexer->tokens until end of input. Fixes in this revision: the
// end-of-stream test bounds-checks before reading, '^=' now yields
// TK_XorAssign (the kind existed but was never produced), and the empty
// binary-literal diagnostic no longer says "Hex".
CORE_Static void
lex__stream(Core_Ctx *lexer) {
    Intern_Table *table = &lexer->interns;
    Lex_Stream *s = &lexer->stream;
    B32 beginning = true;
    for (;;) {
        // Check the bound before dereferencing so we never read past the
        // buffer when the stream is not NUL-terminated.
        if (s->iter >= s->stream.len || lexc(s) == 0) {
        end_of_stream:
            Token t = token_make(lexer);
            lex_unwind_indent_stack(lexer, &t, s);
            break;
        }
        // @note: the lexer is going to be a 2 stage process
        // first we tokenize the indentation and then proceed to tokenize
        // the good stuff
        // for blocks of stmts we parse till we cant find another new line
        // of same scope.
        // parse_decl doesn't require preceding new line
        //
        // in that way new lines act as commas in CORE_Static params
        // seeing a comma means that there is a next thing to parse
        // and it's easy to parse stuff using a do while loop
        // @note: first handle indentation
        // mostly we want to merge multiple new lines
        // but for down scopes we want to emit 2 new lines
        // that will ease out parsing, one token to break out
        // from a block parsing, second to allow continuation of surrounding scope
        Token t = token_make(lexer);
        B32 should_emit = beginning;
        for (;;) {
            switch (lexc(s)) {
                case 0: goto end_of_stream; break;
                case '\t':
                case ' ':
                    // Tabs and spaces each bump the indent counter by one.
                    lex_advance(s);
                    t.indent++;
                    break;
                case '\r': lex_advance(s); break;
                case '/': {
                    if (lexci(s, 1) == '/') {
                        // Line comment: swallow to end of line / stream.
                        lex_advance(s);
                        lex_advance(s);
                        t.kind = TK_Comment;
                        for (;;) {
                            if (lexc(s) == '\n' || lexc(s) == 0) break;
                            lex_advance(s);
                        }
                    }
                    else if (lexci(s, 1) == '*') {
                        // Block comment: swallow up to the "*/" terminator.
                        lex_advance(s);
                        lex_advance(s);
                        t.kind = TK_Comment;
                        for (;;) {
                            if (lexc(s) == '*' && lexci(s, 1) == '/') {
                                lex_advance(s);
                                lex_advance(s);
                                break;
                            }
                            else if (lexc(s) == 0) {
                                token_error(&t, "Unterminated block comment"_s);
                                break;
                            }
                            lex_advance(s);
                        }
                    }
                    else goto indent_loop_break; // an actual '/' operator token
                } break;
                // @todo: also need some way to detect indentation so that
                // first of all we can check for consistency and second of
                // all because we would know by how much to indent
                // @todo: after detecting indentation 2 spaces would become 1 indent value
                case ';': {
                    // ';' continues the current scope; ';;' opens a deeper one.
                    Token semi = token_make(lexer);
                    Token *last = lex_last_indent_token(s);
                    semi.indent = last->indent;
                    lex_advance(s);
                    if (lexc(s) == ';') {
                        lex_advance(s);
                        semi.kind = OPEN_SCOPE;
                        semi.indent = last->indent + 2; // @todo: proper detection of indentation
                        lex_add_token(lexer, &semi);
                        s->indent_stack.add(lexer->tokens.back());
                    }
                    else {
                        semi.kind = SAME_SCOPE;
                        lex_add_token(lexer, &semi);
                    }
                } break;
                case '\n': {
                    // A newline restarts indent counting for the next line.
                    lex_advance(s);
                    should_emit = true;
                    t = token_make(lexer);
                } break;
                default: {
                    // First non-whitespace byte of the line: emit the scope
                    // token for its indentation (suppressed inside (), [], {}).
                    if (s->inside_brace_paren) should_emit = false;
                    if (should_emit) {
                        Token *last = lex_last_indent_token(s);
                        if (t.indent > last->indent) {
                            t.kind = OPEN_SCOPE;
                            lex_add_token(lexer, &t);
                            s->indent_stack.add(lexer->tokens.back());
                        }
                        else if (t.indent < last->indent) {
                            lex_unwind_indent_stack(lexer, &t, s);
                        }
                        else {
                            t.kind = SAME_SCOPE;
                            lex_add_token(lexer, &t);
                        }
                    }
                    goto indent_loop_break;
                }
            }
        }
    indent_loop_break:
        beginning = false;
        // @note: handle the indented token
        t = token_make(lexer);
        lex_advance(s);
        switch (*t.str) {
            case 0: goto end_of_stream; break;
            case '@': t.kind = TK_At; break;
            case '(':
                s->inside_brace_paren++;
                t.kind = TK_OpenParen;
                break;
            case ')':
                s->inside_brace_paren--;
                t.kind = TK_CloseParen;
                break;
            case '{':
                s->inside_brace_paren++;
                t.kind = TK_OpenBrace;
                break;
            case '}':
                s->inside_brace_paren--;
                t.kind = TK_CloseBrace;
                break;
            case '[':
                s->inside_brace_paren++;
                t.kind = TK_OpenBracket;
                break;
            case ']':
                s->inside_brace_paren--;
                t.kind = TK_CloseBracket;
                break;
            case ',': t.kind = TK_Comma; break;
            case '~': t.kind = TK_Neg; break;
            case '?': t.kind = TK_Question; break;
            // '^=' previously lexed as TK_BitXor followed by TK_Assign even
            // though TK_XorAssign exists; route it through CASE2 like the
            // other compound-assignable operators.
            CASE2('^', TK_BitXor, TK_XorAssign);
            CASE2('!', TK_Not, TK_NotEquals);
            CASE2('=', TK_Assign, TK_Equals);
            CASE2('*', TK_Mul, TK_MulAssign);
            CASE2('%', TK_Mod, TK_ModAssign);
            CASE3('+', TK_Add, TK_AddAssign, TK_Increment);
            CASE3('&', TK_BitAnd, TK_AndAssign, TK_And);
            CASE3('|', TK_BitOr, TK_OrAssign, TK_Or);
            case '$': {
                // "$name": polymorph token; strip the sigil before interning.
                t.kind = TK_Polymorph;
                lex_parse_ident(table, s, &t);
                t.str += 1;
                t.len -= 1;
                t.intern_val = intern_string(table, t.string);
                if (t.len == 0) token_error(&t, "Polymorph token without content"_s);
            } break;
            case '#': {
                // "#name": pound/macro token; strip the sigil before interning.
                t.kind = TK_Pound;
                lex_parse_ident(table, s, &t);
                t.str += 1;
                t.len -= 1;
                t.intern_val = intern_string(table, t.string);
                if (t.len == 0) token_error(&t, "Macro token without content"_s);
            } break;
            case '.': {
                // '.', '..', or '...'
                if (lexc(s) == '.') {
                    lex_advance(s);
                    if (lexc(s) == '.') {
                        lex_advance(s);
                        t.kind = TK_ThreeDots;
                    }
                    else t.kind = TK_TwoDots;
                }
                else t.kind = TK_Dot;
            } break;
            case '\'': {
                // 'x' unicode literal: decode one UTF-8 codepoint then expect
                // the closing quote.
                assert(s->stream.len >= s->iter);
                UTF32_Result decode = utf8_to_utf32(lexcp(s), s->stream.len - s->iter);
                if (!decode.error) {
                    for (S32 i = 0; i < decode.advance; i++) lex_advance(s);
                    t.unicode = decode.out_str;
                    t.kind = TK_UnicodeLit;
                    if (lexc(s) == '\'') {
                        lex_advance(s);
                    }
                    else {
                        token_error(&t, "Unclosed unicode literal"_s);
                    }
                }
                else {
                    token_error(&t, "Invalid UTF8 sequence in unicode literal"_s);
                }
            } break;
            case '<': {
                // '<', '<=', '<<', '<<='
                if (lexc(s) == '<') {
                    lex_advance(s);
                    if (lexc(s) == '=') {
                        lex_advance(s);
                        t.kind = TK_LeftShiftAssign;
                    }
                    else {
                        t.kind = TK_LeftShift;
                    }
                }
                else if (lexc(s) == '=') {
                    lex_advance(s);
                    t.kind = TK_LesserThenOrEqual;
                }
                else {
                    t.kind = TK_LesserThen;
                }
            } break;
            case '>': {
                // '>', '>=', '>>', '>>='
                if (lexc(s) == '>') {
                    lex_advance(s);
                    if (lexc(s) == '=') {
                        lex_advance(s);
                        t.kind = TK_RightShiftAssign;
                    }
                    else {
                        t.kind = TK_RightShift;
                    }
                }
                else if (lexc(s) == '=') {
                    lex_advance(s);
                    t.kind = TK_GreaterThenOrEqual;
                }
                else {
                    t.kind = TK_GreaterThen;
                }
            } break;
            case ':': {
                // ':', '::', ':='
                if (lexc(s) == ':') {
                    lex_advance(s);
                    t.kind = TK_DoubleColon;
                }
                else if (lexc(s) == '=') {
                    lex_advance(s);
                    t.kind = TK_ColonAssign;
                }
                else {
                    t.kind = TK_Colon;
                }
            } break;
            case '-': {
                // '-', '-=', '--', '->'
                if (lexc(s) == '=') {
                    lex_advance(s);
                    t.kind = TK_SubAssign;
                }
                else if (lexc(s) == '-') {
                    lex_advance(s);
                    t.kind = TK_Decrement;
                }
                else if (lexc(s) == '>') {
                    lex_advance(s);
                    t.kind = TK_Arrow;
                }
                else {
                    t.kind = TK_Sub;
                }
            } break;
            case '"': {
                // String literal; on success strip the surrounding quotes.
                t.kind = TK_StringLit;
                lex_parse_string(s, &t, '"');
                if (t.kind != TK_Error) {
                    t.str += 1;
                    t.len -= 2;
                }
                t.intern_val = intern_string(table, t.string);
            } break;
            case '/': {
                // '/' or '/='; comments were already consumed in the indent stage.
                if (lexc(s) == '=') {
                    t.kind = TK_DivAssign;
                    lex_advance(s);
                }
                else {
                    t.kind = TK_Div;
                }
            } break;
            case '0': {
                if (lexc(s) == 'x') {
                    // Hex literal: consume digits, then strip the "0x" prefix.
                    lex_advance(s);
                    while (lex_is_numeric_base16(lexc(s)))
                        lex_advance(s);
                    lex_set_len(s, &t);
                    t.str += 2;
                    t.len -= 2;
                    if (t.len == 0)
                        token_error(&t, "Hex constant doesn't have value"_s);
                    else
                        lex_parse_u64(lexer, &t, 16);
                    break;
                }
                else if (lexc(s) == 'b') {
                    // Binary literal: consume digits, then strip the "0b" prefix.
                    lex_advance(s);
                    while (lexc(s) == '0' || lexc(s) == '1')
                        lex_advance(s);
                    lex_set_len(s, &t);
                    t.str += 2;
                    t.len -= 2;
                    if (t.len == 0)
                        token_error(&t, "Binary constant doesn't have value"_s);
                    else
                        lex_parse_u64(lexer, &t, 2);
                    break;
                }
            } /* fallthrough: a plain decimal that happens to start with '0' */
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9': {
                // Decimal integer or float (at most one '.').
                B32 found_dot = false;
                for (;;) {
                    if (lex_is_numeric(lexc(s)))
                        ;
                    else if (lexc(s) == '.') {
                        if (found_dot) {
                            token_error(&t, "Multiple '.' in float literal"_s);
                            goto end_of_switch;
                        }
                        found_dot = true;
                    }
                    else break;
                    lex_advance(s);
                }
                lex_set_len(s, &t);
                if (found_dot) lex_parse_f64(&t);
                else lex_parse_u64(lexer, &t, 10);
            } break;
            case 'A':
            case 'a':
            case 'M':
            case 'm':
            case 'B':
            case 'b':
            case 'N':
            case 'n':
            case 'C':
            case 'c':
            case 'O':
            case 'o':
            case 'D':
            case 'd':
            case 'P':
            case 'p':
            case 'E':
            case 'e':
            case 'Q':
            case 'q':
            case 'F':
            case 'f':
            case 'R':
            case 'r':
            case 'G':
            case 'g':
            case 'S':
            case 's':
            case 'H':
            case 'h':
            case 'T':
            case 't':
            case 'I':
            case 'i':
            case 'U':
            case 'u':
            case 'J':
            case 'j':
            case 'V':
            case 'v':
            case 'K':
            case 'k':
            case 'W':
            case 'w':
            case 'L':
            case 'X':
            case 'l':
            case 'x':
            case 'Z':
            case 'z':
            case 'Y':
            case 'y':
            case '_': {
                // Identifier; reclassified as keyword if its interned pointer
                // falls in the keyword range.
                t.kind = TK_Identifier;
                lex_parse_ident(table, s, &t);
                t.intern_val = intern_string(table, t.string);
                if (lex_is_keyword(table, t.intern_val)) {
                    t.kind = TK_Keyword;
                }
            } break;
            default: {
                token_error(&t, "Unknown token"_s);
            }
        }
    end_of_switch:
        // Single-char tokens never called lex_set_len; fix up here.
        if (t.len == 0)
            lex_set_len(s, &t);
        lex_add_token(lexer, &t);
    }
#undef CASE2
#undef CASE3
}
// Point the lexer at a fresh input buffer (`istream`, from source `file`),
// seed the indentation stack with the SAME_SCOPE sentinel, lex the whole
// stream, and accumulate line statistics. The indent stack is allocated on
// the scratch arena and released when this function returns.
CORE_Static void
lex_restream(Core_Ctx *lexer, String istream, String file) {
    lexer->stream = {};
    lexer->stream.stream = istream;
    lexer->stream.line_begin = istream.str;
    lexer->stream.file = lexer->intern(file);
    Scoped_Arena _scope(lexer->scratch);
    lexer->stream.indent_stack.allocator = lexer->scratch;
    lexer->stream.indent_stack.add(&lexer->same_scope_token);
    lex__stream(lexer);
    lexer->lines_lexed += lexer->stream.line;
}
//-----------------------------------------------------------------------------
// Token metadata
//-----------------------------------------------------------------------------
// Human-readable spelling of a token kind, for diagnostics. The middle run
// of cases is produced by the embedded /*# ... */ python meta-script; keep
// that marker intact when editing.
CORE_Static const char *
name(Token_Kind kind) {
    switch (kind) {
        case TK_End:
            return "End of stream";
        /*#
        import meta
        for i in meta.token_kinds:
            if i[1] != "SPECIAL":
                print("case TK_" + i[0] + f": return \"{i[1]}\";")
        */
        case TK_Mul: return "*";
        case TK_Div: return "/";
        case TK_Mod: return "%";
        case TK_LeftShift: return "<<";
        case TK_RightShift: return ">>";
        case TK_Add: return "+";
        case TK_Sub: return "-";
        case TK_Equals: return "==";
        case TK_LesserThenOrEqual: return "<=";
        case TK_GreaterThenOrEqual: return ">=";
        case TK_LesserThen: return "<";
        case TK_GreaterThen: return ">";
        case TK_NotEquals: return "!=";
        case TK_BitAnd: return "&";
        case TK_BitOr: return "|";
        case TK_BitXor: return "^";
        case TK_And: return "&&";
        case TK_Or: return "||";
        case TK_Neg: return "~";
        case TK_Not: return "!";
        case TK_Decrement: return "--";
        case TK_Increment: return "++";
        case TK_PostDecrement: return "--";
        case TK_PostIncrement: return "++";
        case TK_Assign: return "=";
        case TK_ColonAssign: return ":=";
        case TK_DivAssign: return "/=";
        case TK_MulAssign: return "*=";
        case TK_ModAssign: return "%=";
        case TK_SubAssign: return "-=";
        case TK_AddAssign: return "+=";
        case TK_AndAssign: return "&=";
        case TK_OrAssign: return "|=";
        case TK_XorAssign: return "^=";
        case TK_LeftShiftAssign: return "<<=";
        case TK_RightShiftAssign: return ">>=";
        case TK_OpenParen: return "(";
        case TK_CloseParen: return ")";
        case TK_OpenBrace: return "{";
        case TK_CloseBrace: return "}";
        case TK_OpenBracket: return "[";
        case TK_CloseBracket: return "]";
        case TK_Comma: return ",";
        case TK_Pound: return "#";
        case TK_Question: return "?";
        case TK_ThreeDots: return "...";
        case TK_Semicolon: return ";";
        case TK_Dot: return ".";
        case TK_TwoDots: return "..";
        case TK_NewLine: return "[NewLine]";
        case TK_Colon: return ":";
        case TK_DoubleColon: return "::";
        case TK_At: return "@";
        case TK_Arrow: return "->";
        case TK_Polymorph: return "$";
        case TK_ExprSizeof: return "[sizeof]";
        case TK_DocComment: return "[///]";
        case TK_Comment: return "//";
        case TK_Identifier: return "[Ident]";
        case TK_UnicodeLit: return "[Unicode]";
        case TK_StringLit: return "[String]";
        case TK_Error: return "[Error]";
        case TK_Float: return "[Float]";
        case TK_Integer: return "[Int]";
        case TK_Keyword:
            return "[Keyword]";
        /*END*/
        case CLOSE_SCOPE: return "Close_Scope";
        case OPEN_SCOPE: return "Open_Scope";
        case SAME_SCOPE: return "Same_Scope";
        default: invalid_codepath; return "<Undefined>";
    }
}