force_inline B32 token_is_assign(Token_Kind token) { return token >= TK_FirstAssign && token <= TK_LastAssign; } force_inline B32 token_is_assign(Token *token) { return token_is_assign(token->kind); } force_inline B32 token_is_compare(Token_Kind token) { return token >= TK_FirstCompare && token <= TK_LastCompare; } force_inline B32 token_is_compare(Token *token) { return token_is_compare(token->kind); } CORE_Static U8 lexc(Lex_Stream *s) { return s->stream.str[s->iter]; } CORE_Static U8 lexci(Lex_Stream *s, S32 i) { return s->stream.str[s->iter + i]; } CORE_Static U8 * lexcp(Lex_Stream *s) { return s->stream.str + s->iter; } CORE_Static B32 lex_is_whitespace(U8 c) { B32 result = c == ' ' || c == '\r'; return result; } CORE_Static B32 lex_is_alphabetic(U8 c) { B32 result = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); return result; } CORE_Static B32 lex_is_numeric(U8 c) { B32 result = c >= '0' && c <= '9'; return result; } CORE_Static B32 lex_is_numeric_base16(U8 c) { B32 result = (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'); return result; } CORE_Static B32 lex_is_alphanumeric(U8 c) { B32 result = lex_is_numeric(c) || lex_is_alphabetic(c); return result; } CORE_Static void lex_set_len(Lex_Stream *s, Token *token) { assert(lexcp(s) >= token->str); token->len = lexcp(s) - token->str; } CORE_Static void lex_set_keywords(Core_Ctx *lexer, Array keywords) { Intern_String keyword = {}; For(keywords) { keyword = intern_string(&lexer->interns, it); if (&it == keywords.begin()) lexer->interns.first_keyword = keyword.str; } lexer->interns.last_keyword = keyword.str; } CORE_Static B32 lex_is_keyword(Intern_Table *lexer, Intern_String keyword) { B32 result = keyword.str >= lexer->first_keyword && keyword.str <= lexer->last_keyword; return result; } CORE_Static void token_error(Token *t, String error_val) { t->kind = TK_Error; t->error_val = error_val; } CORE_Static void lex_parse_u64(Core_Ctx *lexer, Token *t, S64 base) { Scoped_Arena _scope(lexer->scratch); Set_BigInt_Arena(lexer->scratch); t->kind = TK_Integer; BigInt m = bigint_u64(1); BigInt base_mul = bigint_u64(base); BigInt result = bigint_u64(0); for (S64 i = t->len - 1; i >= 0; --i) { U64 value = t->str[i]; if (t->str[i] >= 'a') value = value - 'a' + 10; else if (t->str[i] >= 'A') value = value - 'A' + 10; else value -= '0'; BigInt val = bigint_u64(value); BigInt new_val = bigint_mul(&val, &m); result = bigint_add(&result, &new_val); m = bigint_mul(&m, &base_mul); } t->int_val = bigint_copy(lexer->perm, &result); } CORE_Static void lex_parse_f64(Token *t) { t->kind = TK_Float; char buffer[128]; S64 len = clamp_top((int)t->len, 126); memory_copy(buffer, t->str, len); buffer[len] = 0; t->f64_val = strtod(buffer, 0); } CORE_Static void lex_advance(Lex_Stream *s) { if (s->iter >= s->stream.len) { return; } else if (lexc(s) == '\n') { s->iter++; s->line++; s->line_begin = lexcp(s); } else { s->iter++; } } CORE_Static void lex_parse_string(Lex_Stream *s, Token *t, U8 c) { for (;;) { if (lexc(s) == '\\') lex_advance(s); else if (lexc(s) == c) break; else if (lexc(s) == 0) { token_error(t, "Unterminated string, reached end of file"_s); break; } lex_advance(s); } if (t->kind != TK_Error) { lex_advance(s); lex_set_len(s, t); } } CORE_Static void lex_parse_ident(Intern_Table *table, Lex_Stream *s, Token *t) { while (lex_is_alphanumeric(lexc(s)) || lexc(s) == '_') lex_advance(s); lex_set_len(s, t); } #define CASE2(op, OpName, Assign) \ case op: \ if (lexc(s) == '=') { \ lex_advance(s); \ t.kind = Assign; \ } \ else { \ t.kind = OpName; \ } \ break #define CASE3(op, OpName, Assign, Incr) \ case op: \ if (lexc(s) == '=') { \ lex_advance(s); \ t.kind = Assign; \ } \ else if (lexc(s) == op) { \ lex_advance(s); \ t.kind = Incr; \ } \ else { \ t.kind = OpName; \ } \ break CORE_Static Token token_make(Core_Ctx *lexer, U8 *str, Intern_String file, int line, U8 *line_begin) { Token t = {}; t.str = str; t.file = file; t.line = line; t.line_begin = line_begin; t.di = lexer->token_debug_ids++; return t; } CORE_Static Token token_make(Core_Ctx *lexer) { return token_make(lexer, lexcp(&lexer->stream), lexer->stream.file, lexer->stream.line, lexer->stream.line_begin); } CORE_Static Token * lex_last_indent_token(Lex_Stream *s) { if (s->indent_stack.len > 0) { return *s->indent_stack.back(); } return &pctx->same_scope_token; } CORE_Static B32 lex_is_scope(Token *t) { B32 result = t->kind == OPEN_SCOPE || t->kind == CLOSE_SCOPE || t->kind == SAME_SCOPE; return result; } CORE_Static void lex_add_token(Core_Ctx *ctx, Token *token) { Token *top = (Token *)arena_push_size(&ctx->token_arena, sizeof(Token)); *top = *token; ctx->tokens.len += 1; ctx->tokens.cap += 1; ctx->tokens.data = (Token *)ctx->token_arena.memory.data; } CORE_Static void lex_unwind_indent_stack(Core_Ctx *ctx, Token *t, Lex_Stream *s) { for (S64 i = s->indent_stack.len - 1; i >= 0; i -= 1) { auto it = s->indent_stack.data[i]; assert(lex_is_scope(it)); if (it->indent == t->indent) { t->kind = SAME_SCOPE; lex_add_token(ctx, t); break; } else if (it->indent < t->indent) { token_error(t, "Bad indentation"_s); lex_add_token(ctx, t); break; } else { s->indent_stack.pop(); t->kind = CLOSE_SCOPE; lex_add_token(ctx, t); } } } CORE_Static void lex__stream(Core_Ctx *lexer) { Intern_Table *table = &lexer->interns; Lex_Stream *s = &lexer->stream; B32 beginning = true; for (;;) { if (lexc(s) == 0 || s->iter >= s->stream.len) { end_of_stream: Token t = token_make(lexer); lex_unwind_indent_stack(lexer, &t, s); break; } // @note: the lexer is going to be a 2 stage process // first we tokenize the indentation and then proceed to tokenize // the good stuff // for blocks of stmts we parse till we cant find another new line // of same scope. // parse_decl doesn't require preceding new line // // in that way new lines act as commas in CORE_Static params // seeing a comma means that there is a next thing to parse // and it's easy to parse stuff using a do while loop // @note: first handle indentation // mostly we want to merge multiple new lines // but for down scopes we want to emit 2 new lines // that will ease out parsing, one token to break out // from a block parsing, second to allow continuation of surrounding scope Token t = token_make(lexer); B32 should_emit = beginning; for (;;) { switch (lexc(s)) { case 0: goto end_of_stream; break; case '\t': case ' ': lex_advance(s); t.indent++; break; case '\r': lex_advance(s); break; case '/': { if (lexci(s, 1) == '/') { lex_advance(s); lex_advance(s); t.kind = TK_Comment; for (;;) { if (lexc(s) == '\n' || lexc(s) == 0) break; lex_advance(s); } } else if (lexci(s, 1) == '*') { lex_advance(s); lex_advance(s); t.kind = TK_Comment; for (;;) { if (lexc(s) == '*' && lexci(s, 1) == '/') { lex_advance(s); lex_advance(s); break; } else if (lexc(s) == 0) { token_error(&t, "Unterminated block comment"_s); break; } lex_advance(s); } } else goto indent_loop_break; } break; // @todo: add [;;] operator which adds new scope // @todo: also need some way to detect indentation so that // first of all we can check for consistency and second of // all because we would know by how much to indent // @todo: after detecting indentation 2 spaces would become 1 indent value case ';': { Token semi = token_make(lexer); Token *last = lex_last_indent_token(s); semi.indent = last->indent; lex_advance(s); if (lexc(s) == ';') { lex_advance(s); semi.kind = OPEN_SCOPE; semi.indent = last->indent + 2; // @todo: proper detection of indentation lex_add_token(lexer, &semi); s->indent_stack.add(lexer->tokens.back()); } else { semi.kind = SAME_SCOPE; lex_add_token(lexer, &semi); } } break; case '\n': { lex_advance(s); should_emit = true; t = token_make(lexer); } break; default: { if (s->inside_brace_paren) should_emit = false; if (should_emit) { Token *last = lex_last_indent_token(s); if (t.indent > last->indent) { t.kind = OPEN_SCOPE; lex_add_token(lexer, &t); s->indent_stack.add(lexer->tokens.back()); } else if (t.indent < last->indent) { lex_unwind_indent_stack(lexer, &t, s); } else { t.kind = SAME_SCOPE; lex_add_token(lexer, &t); } } goto indent_loop_break; } } } indent_loop_break: beginning = false; // @note: handle the indented token t = token_make(lexer); lex_advance(s); switch (*t.str) { case 0: goto end_of_stream; break; case '@': t.kind = TK_At; break; case '(': s->inside_brace_paren++; t.kind = TK_OpenParen; break; case ')': s->inside_brace_paren--; t.kind = TK_CloseParen; break; case '{': s->inside_brace_paren++; t.kind = TK_OpenBrace; break; case '}': s->inside_brace_paren--; t.kind = TK_CloseBrace; break; case '[': s->inside_brace_paren++; t.kind = TK_OpenBracket; break; case ']': s->inside_brace_paren--; t.kind = TK_CloseBracket; break; case ',': t.kind = TK_Comma; break; case '~': t.kind = TK_Neg; break; case '?': t.kind = TK_Question; break; case '^': t.kind = TK_BitXor; break; CASE2('!', TK_Not, TK_NotEquals); CASE2('=', TK_Assign, TK_Equals); CASE2('*', TK_Mul, TK_MulAssign); CASE2('%', TK_Mod, TK_ModAssign); CASE3('+', TK_Add, TK_AddAssign, TK_Increment); CASE3('&', TK_BitAnd, TK_AndAssign, TK_And); CASE3('|', TK_BitOr, TK_OrAssign, TK_Or); case '$': { t.kind = TK_Polymorph; lex_parse_ident(table, s, &t); t.str += 1; t.len -= 1; t.intern_val = intern_string(table, t.string); if (t.len == 0) token_error(&t, "Polymorph token without content"_s); } break; case '#': { t.kind = TK_Pound; lex_parse_ident(table, s, &t); t.str += 1; t.len -= 1; t.intern_val = intern_string(table, t.string); if (t.len == 0) token_error(&t, "Macro token without content"_s); } break; case '.': { if (lexc(s) == '.') { lex_advance(s); if (lexc(s) == '.') { lex_advance(s); t.kind = TK_ThreeDots; } else t.kind = TK_TwoDots; } else t.kind = TK_Dot; } break; case '\'': { assert(s->stream.len >= s->iter); UTF32_Result decode = utf8_to_utf32(lexcp(s), s->stream.len - s->iter); if (!decode.error) { for (S32 i = 0; i < decode.advance; i++) lex_advance(s); t.unicode = decode.out_str; t.kind = TK_UnicodeLit; if (lexc(s) == '\'') { lex_advance(s); } else { token_error(&t, "Unclosed unicode literal"_s); } } else { token_error(&t, "Invalid UTF8 sequence in unicode literal"_s); } } break; case '<': { if (lexc(s) == '<') { lex_advance(s); if (lexc(s) == '=') { lex_advance(s); t.kind = TK_LeftShiftAssign; } else { t.kind = TK_LeftShift; } } else if (lexc(s) == '=') { lex_advance(s); t.kind = TK_LesserThenOrEqual; } else { t.kind = TK_LesserThen; } } break; case '>': { if (lexc(s) == '>') { lex_advance(s); if (lexc(s) == '=') { lex_advance(s); t.kind = TK_RightShiftAssign; } else { t.kind = TK_RightShift; } } else if (lexc(s) == '=') { lex_advance(s); t.kind = TK_GreaterThenOrEqual; } else { t.kind = TK_GreaterThen; } } break; case ':': { if (lexc(s) == ':') { lex_advance(s); t.kind = TK_DoubleColon; } else if (lexc(s) == '=') { lex_advance(s); t.kind = TK_ColonAssign; } else { t.kind = TK_Colon; } } break; case '-': { if (lexc(s) == '=') { lex_advance(s); t.kind = TK_SubAssign; } else if (lexc(s) == '-') { lex_advance(s); t.kind = TK_Decrement; } else if (lexc(s) == '>') { lex_advance(s); t.kind = TK_Arrow; } else { t.kind = TK_Sub; } } break; case '"': { t.kind = TK_StringLit; lex_parse_string(s, &t, '"'); if (t.kind != TK_Error) { t.str += 1; t.len -= 2; } t.intern_val = intern_string(table, t.string); } break; case '/': { if (lexc(s) == '=') { t.kind = TK_DivAssign; lex_advance(s); } else { t.kind = TK_Div; } } break; case '0': { if (lexc(s) == 'x') { lex_advance(s); while (lex_is_numeric_base16(lexc(s))) lex_advance(s); lex_set_len(s, &t); t.str += 2; t.len -= 2; if (t.len == 0) token_error(&t, "Hex constant doesn't have value"_s); else lex_parse_u64(lexer, &t, 16); break; } else if (lexc(s) == 'b') { lex_advance(s); while (lexc(s) == '0' || lexc(s) == '1') lex_advance(s); lex_set_len(s, &t); t.str += 2; t.len -= 2; if (t.len == 0) token_error(&t, "Hex constant doesn't have value"_s); else lex_parse_u64(lexer, &t, 2); break; } } case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { B32 found_dot = false; for (;;) { if (lex_is_numeric(lexc(s))) ; else if (lexc(s) == '.') { if (found_dot) { token_error(&t, "Multiple '.' in float literal"_s); goto end_of_switch; } found_dot = true; } else break; lex_advance(s); } lex_set_len(s, &t); if (found_dot) lex_parse_f64(&t); else lex_parse_u64(lexer, &t, 10); } break; case 'A': case 'a': case 'M': case 'm': case 'B': case 'b': case 'N': case 'n': case 'C': case 'c': case 'O': case 'o': case 'D': case 'd': case 'P': case 'p': case 'E': case 'e': case 'Q': case 'q': case 'F': case 'f': case 'R': case 'r': case 'G': case 'g': case 'S': case 's': case 'H': case 'h': case 'T': case 't': case 'I': case 'i': case 'U': case 'u': case 'J': case 'j': case 'V': case 'v': case 'K': case 'k': case 'W': case 'w': case 'L': case 'X': case 'l': case 'x': case 'Z': case 'z': case 'Y': case 'y': case '_': { t.kind = TK_Identifier; lex_parse_ident(table, s, &t); t.intern_val = intern_string(table, t.string); if (lex_is_keyword(table, t.intern_val)) { t.kind = TK_Keyword; } } break; default: { token_error(&t, "Unknown token"_s); } } end_of_switch: if (t.len == 0) lex_set_len(s, &t); lex_add_token(lexer, &t); } #undef CASE2 #undef CASE3 } CORE_Static void lex_restream(Core_Ctx *lexer, String istream, String file) { lexer->stream = {}; lexer->stream.stream = istream; lexer->stream.line_begin = istream.str; lexer->stream.file = lexer->intern(file); Scoped_Arena _scope(lexer->scratch); lexer->stream.indent_stack.allocator = lexer->scratch; lexer->stream.indent_stack.add(&lexer->same_scope_token); lex__stream(lexer); lexer->lines_lexed += lexer->stream.line; } //----------------------------------------------------------------------------- // Token metadata //----------------------------------------------------------------------------- CORE_Static const char * name(Token_Kind kind) { switch (kind) { case TK_End: return "End of stream"; /*# import meta for i in meta.token_kinds: if i[1] != "SPECIAL": print("case TK_" + i[0] + f": return \"{i[1]}\";") */ case TK_Mul: return "*"; case TK_Div: return "/"; case TK_Mod: return "%"; case TK_LeftShift: return "<<"; case TK_RightShift: return ">>"; case TK_Add: return "+"; case TK_Sub: return "-"; case TK_Equals: return "=="; case TK_LesserThenOrEqual: return "<="; case TK_GreaterThenOrEqual: return ">="; case TK_LesserThen: return "<"; case TK_GreaterThen: return ">"; case TK_NotEquals: return "!="; case TK_BitAnd: return "&"; case TK_BitOr: return "|"; case TK_BitXor: return "^"; case TK_And: return "&&"; case TK_Or: return "||"; case TK_Neg: return "~"; case TK_Not: return "!"; case TK_Decrement: return "--"; case TK_Increment: return "++"; case TK_PostDecrement: return "--"; case TK_PostIncrement: return "++"; case TK_Assign: return "="; case TK_ColonAssign: return ":="; case TK_DivAssign: return "/="; case TK_MulAssign: return "*="; case TK_ModAssign: return "%="; case TK_SubAssign: return "-="; case TK_AddAssign: return "+="; case TK_AndAssign: return "&="; case TK_OrAssign: return "|="; case TK_XorAssign: return "^="; case TK_LeftShiftAssign: return "<<="; case TK_RightShiftAssign: return ">>="; case TK_OpenParen: return "("; case TK_CloseParen: return ")"; case TK_OpenBrace: return "{"; case TK_CloseBrace: return "}"; case TK_OpenBracket: return "["; case TK_CloseBracket: return "]"; case TK_Comma: return ","; case TK_Pound: return "#"; case TK_Question: return "?"; case TK_ThreeDots: return "..."; case TK_Semicolon: return ";"; case TK_Dot: return "."; case TK_TwoDots: return ".."; case TK_NewLine: return "[NewLine]"; case TK_Colon: return ":"; case TK_DoubleColon: return "::"; case TK_At: return "@"; case TK_Arrow: return "->"; case TK_Polymorph: return "$"; case TK_ExprSizeof: return "[SizeOf]"; case TK_DocComment: return "[///]"; case TK_Comment: return "//"; case TK_Identifier: return "[Ident]"; case TK_UnicodeLit: return "[Unicode]"; case TK_StringLit: return "[String]"; case TK_Error: return "[Error]"; case TK_Float: return "[Float]"; case TK_Integer: return "[Int]"; case TK_Keyword: return "[Keyword]"; /*END*/ case CLOSE_SCOPE: return "Close_Scope"; case OPEN_SCOPE: return "Open_Scope"; case SAME_SCOPE: return "Same_Scope"; default: invalid_codepath; return ""; } }