force_inline B32 token_is_assign(Token_Kind token){return token >= TK_FirstAssign && token <= TK_LastAssign;} force_inline B32 token_is_assign(Token *token){return token_is_assign(token->kind);} force_inline B32 token_is_compare(Token_Kind token){return token >= TK_FirstCompare && token <= TK_LastCompare;} force_inline B32 token_is_compare(Token *token){return token_is_compare(token->kind);} CORE_Static U8 lexc(Lex_Stream *s){ return s->stream.str[s->iter]; } CORE_Static U8 lexci(Lex_Stream *s, S32 i){ return s->stream.str[s->iter+i]; } CORE_Static U8 * lexcp(Lex_Stream *s){ return s->stream.str + s->iter; } CORE_Static B32 lex_is_whitespace(U8 c){ B32 result = c == ' ' || c == '\r'; return result; } CORE_Static B32 lex_is_alphabetic(U8 c){ B32 result = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); return result; } CORE_Static B32 lex_is_numeric(U8 c){ B32 result = c >= '0' && c <= '9'; return result; } CORE_Static B32 lex_is_numeric_base16(U8 c){ B32 result = (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'); return result; } CORE_Static B32 lex_is_alphanumeric(U8 c){ B32 result = lex_is_numeric(c) || lex_is_alphabetic(c); return result; } CORE_Static void lex_set_len(Lex_Stream *s, Token *token){ assert(lexcp(s) >= token->str); token->len = lexcp(s) - token->str; } CORE_Static void lex_set_keywords(Core_Ctx *lexer, Array keywords){ Intern_String keyword = {}; For(keywords){ keyword = intern_string(&lexer->interns, it); if(&it == keywords.begin()) lexer->interns.first_keyword = keyword.str; } lexer->interns.last_keyword = keyword.str; } CORE_Static B32 lex_is_keyword(Intern_Table *lexer, Intern_String keyword){ B32 result = keyword.str >= lexer->first_keyword && keyword.str <= lexer->last_keyword; return result; } CORE_Static void token_error(Token *t, String error_val){ t->kind = TK_Error; t->error_val = error_val; } CORE_Static void lex_parse_u64(Core_Ctx *lexer, Token *t, S64 base){ Scratch_Scope _scope(lexer->scratch); 
Set_BigInt_Arena(lexer->scratch); t->kind = TK_Integer; BigInt m = bigint_u64(1); BigInt base_mul = bigint_u64(base); BigInt result = bigint_u64(0); for(S64 i = t->len - 1; i >= 0; --i){ U64 value = t->str[i]; if(t->str[i] >= 'a') value = value - 'a' + 10; else if(t->str[i] >= 'A') value = value - 'A' + 10; else value -= '0'; BigInt val = bigint_u64(value); BigInt new_val = bigint_mul(&val, &m); result = bigint_add(&result, &new_val); m = bigint_mul(&m, &base_mul); } t->int_val = bigint_copy(lexer->perm, &result); } CORE_Static void lex_parse_f64(Token *t){ t->kind = TK_Float; char buffer[128]; S64 len = clamp_top((int)t->len, 126); memory_copy(buffer, t->str, len); buffer[len] = 0; t->f64_val = strtod(buffer, 0); } CORE_Static void lex_advance(Lex_Stream *s){ if(s->iter >= s->stream.len){ return; } else if(lexc(s) == '\n'){ s->iter++; s->line++; s->line_begin = lexcp(s); } else{ s->iter++; } } CORE_Static void lex_parse_string(Lex_Stream *s, Token *t, U8 c){ for(;;){ if(lexc(s) == '\\') lex_advance(s); else if(lexc(s) == c) break; else if(lexc(s) == 0){ token_error(t, "Unterminated string, reached end of file"_s); break; } lex_advance(s); } if(t->kind != TK_Error){ lex_advance(s); lex_set_len(s,t); } } CORE_Static void lex_parse_ident(Intern_Table *table, Lex_Stream *s, Token *t){ while(lex_is_alphanumeric(lexc(s)) || lexc(s) == '_') lex_advance(s); lex_set_len(s,t); } #define CASE2(op, OpName, Assign) \ case op: \ if (lexc(s) == '=') { \ lex_advance(s); \ t.kind = Assign; \ } else { \ t.kind = OpName; \ } \ break #define CASE3(op, OpName, Assign, Incr) \ case op: \ if (lexc(s) == '=') { \ lex_advance(s); \ t.kind = Assign; \ } else if (lexc(s) == op) { \ lex_advance(s); \ t.kind = Incr; \ } else { \ t.kind = OpName; \ } \ break CORE_Static Token token_make(Core_Ctx *lexer, U8 *str, Intern_String file, int line, U8 *line_begin){ Token t = {}; t.str = str; t.file = file; t.line = line; t.line_begin = line_begin; t.di = lexer->token_debug_ids++; return t; } 
// Build a token anchored at the current cursor position of the lexer's stream.
CORE_Static Token
token_make(Core_Ctx *lexer){
    return token_make(lexer, lexcp(&lexer->stream), lexer->stream.file,
                      lexer->stream.line, lexer->stream.line_begin);
}

// Top of the indent stack, or the shared "same scope" sentinel token when the
// stack is empty.
CORE_Static Token *
lex_last_indent_token(Lex_Stream *s){
    if(s->indent_stack.len > 0){
        return *s->indent_stack.last();
    }
    return &pctx->same_scope_token;
}

// True for the three synthetic scope-delimiter token kinds.
CORE_Static B32
lex_is_scope(Token *t){
    return t->kind == OPEN_SCOPE
        || t->kind == CLOSE_SCOPE
        || t->kind == SAME_SCOPE;
}

// Append a copy of `token` to the token array, which lives directly in the
// stage arena.
CORE_Static void
lex_add_token(Core_Ctx *ctx, Token *token){
    Token *top = (Token *)arena_push_size(ctx->stage_arena, sizeof(Token));
    *top = *token;
    ctx->tokens.len += 1;
    // Tokens are stored contiguously in the stage arena; re-sync the base
    // pointer in case the arena's backing memory moved.
    ctx->tokens.data = (Token *)ctx->stage_arena->memory.data;
}

// Pop indent-stack entries deeper than t->indent, emitting CLOSE_SCOPE for
// each. Stops with SAME_SCOPE on an exact indent match, or an error token if
// `t` is indented deeper than any open scope (inconsistent indentation).
CORE_Static void
lex_unwind_indent_stack(Core_Ctx *ctx, Token *t, Lex_Stream *s){
    for(S64 i = s->indent_stack.len - 1; i >= 0; i -= 1){
        auto it = s->indent_stack.data[i];
        assert(lex_is_scope(it));
        if(it->indent == t->indent){
            t->kind = SAME_SCOPE;
            lex_add_token(ctx, t);
            break;
        } else if(it->indent < t->indent){
            token_error(t, "Bad indentation"_s);
            lex_add_token(ctx, t);
            break;
        } else {
            s->indent_stack.pop();
            t->kind = CLOSE_SCOPE;
            lex_add_token(ctx, t);
        }
    }
}

// Tokenize the whole stream in lexer->stream into lexer->tokens.
//
// The lexer is a 2-stage process per token: first the indentation in front of
// a token is scanned (emitting OPEN/CLOSE/SAME_SCOPE tokens so new lines act
// like commas for the parser), then the token itself is lexed. Inside
// parens/braces/brackets indentation tokens are suppressed.
CORE_Static void
lex__stream(Core_Ctx *lexer){
    Intern_Table *table = &lexer->interns;
    Lex_Stream *s = &lexer->stream;
    B32 beginning = true;
    for(;;){
        if(lexc(s) == 0 || s->iter >= s->stream.len){
            end_of_stream:
            Token t = token_make(lexer);
            lex_unwind_indent_stack(lexer, &t, s);
            break;
        }

        // @note: stage 1 — indentation. Multiple new lines are merged; when
        // a scope closes we may emit several CLOSE_SCOPE tokens (one to break
        // out of a block, one to continue the surrounding scope), which eases
        // parsing.
        Token t = token_make(lexer);
        B32 should_emit = beginning;
        for(;;){
            switch(lexc(s)){
                case 0  : goto end_of_stream; break;
                case '\t':
                case ' ': lex_advance(s); t.indent++; break;
                case '\r': lex_advance(s); break;
                case '/': {
                    if(lexci(s,1) == '/'){
                        // Line comment: skip to end of line. The comment token
                        // is scanned but deliberately not added.
                        lex_advance(s); lex_advance(s);
                        t.kind = TK_Comment;
                        for(;;){
                            if(lexc(s) == '\n' || lexc(s) == 0) break;
                            lex_advance(s);
                        }
                    }
                    else if(lexci(s,1) == '*'){
                        // Block comment: skip to closing */.
                        lex_advance(s); lex_advance(s);
                        t.kind = TK_Comment;
                        for(;;){
                            if(lexc(s) == '*' && lexci(s,1) == '/'){
                                lex_advance(s); lex_advance(s);
                                break;
                            }
                            else if(lexc(s) == 0){
                                // NOTE(review): this error token is overwritten
                                // before being emitted (t is re-made below), so
                                // an unterminated block comment is currently
                                // silent — confirm whether that is intended.
                                token_error(&t, "Unterminated block comment"_s);
                                break;
                            }
                            lex_advance(s);
                        }
                    }
                    else goto indent_loop_break; // a real '/' token
                } break;
                // @todo: add [;;] operator which adds new scope
                // @todo: detect indentation width for consistency checking,
                //        so e.g. 2 spaces become 1 indent unit
                case ';' : {
                    Token semi = token_make(lexer);
                    Token *last = lex_last_indent_token(s);
                    semi.indent = last->indent;
                    lex_advance(s);
                    if(lexc(s) == ';'){
                        // ';;' opens an inline scope.
                        lex_advance(s);
                        semi.kind = OPEN_SCOPE;
                        semi.indent = last->indent + 2; // @todo: proper detection of indentation
                        lex_add_token(lexer, &semi);
                        s->indent_stack.add(lexer->tokens.last());
                    }
                    else{
                        semi.kind = SAME_SCOPE;
                        lex_add_token(lexer, &semi);
                    }
                } break;
                case '\n':{
                    // Restart indentation measurement on the new line.
                    lex_advance(s);
                    should_emit = true;
                    t = token_make(lexer);
                } break;
                default:{
                    // Indentation tokens are suppressed inside (), {}, [].
                    if(s->inside_brace_paren) should_emit = false;
                    if(should_emit){
                        Token *last = lex_last_indent_token(s);
                        if(t.indent > last->indent){
                            t.kind = OPEN_SCOPE;
                            lex_add_token(lexer, &t);
                            s->indent_stack.add(lexer->tokens.last());
                        }
                        else if(t.indent < last->indent){
                            lex_unwind_indent_stack(lexer, &t, s);
                        }
                        else {
                            t.kind = SAME_SCOPE;
                            lex_add_token(lexer, &t);
                        }
                    }
                    goto indent_loop_break;
                }
            }
        }
        indent_loop_break:
        beginning = false;

        // @note: stage 2 — the token itself.
        t = token_make(lexer);
        lex_advance(s);
        switch(*t.str){
            case 0  : goto end_of_stream; break;
            case '@': t.kind = TK_At; break;
            case '(': s->inside_brace_paren++; t.kind = TK_OpenParen;    break;
            case ')': s->inside_brace_paren--; t.kind = TK_CloseParen;   break;
            case '{': s->inside_brace_paren++; t.kind = TK_OpenBrace;    break;
            case '}': s->inside_brace_paren--; t.kind = TK_CloseBrace;   break;
            case '[': s->inside_brace_paren++; t.kind = TK_OpenBracket;  break;
            case ']': s->inside_brace_paren--; t.kind = TK_CloseBracket; break;
            case ',': t.kind = TK_Comma;    break;
            case '~': t.kind = TK_Neg;      break;
            case '?': t.kind = TK_Question; break;
            case '^': t.kind = TK_BitXor;   break;
            CASE2('!', TK_Not,    TK_NotEquals);
            CASE2('=', TK_Assign, TK_Equals);
            CASE2('*', TK_Mul,    TK_MulAssign);
            CASE2('%', TK_Mod,    TK_ModAssign);
            CASE3('+', TK_Add,    TK_AddAssign, TK_Increment);
            CASE3('&', TK_BitAnd, TK_AndAssign, TK_And);
            CASE3('|', TK_BitOr,  TK_OrAssign,  TK_Or);
            case '#': {
                // '#ident' macro token; strip the leading '#'.
                t.kind = TK_Pound;
                lex_parse_ident(table, s, &t);
                t.str += 1;
                t.len -= 1;
                t.intern_val = intern_string(table, t.string);
                if(t.len == 0) token_error(&t, "Macro token without content"_s);
            } break;
            case '.': {
                if(lexc(s) == '.'){
                    lex_advance(s);
                    // BUG FIX: this previously tested lexci(s,1), i.e. one
                    // character PAST the candidate third dot, so "..." lexed
                    // as TK_TwoDots + TK_Dot and "...x" misbehaved. The char
                    // under the cursor is the third dot.
                    if(lexc(s) == '.'){
                        lex_advance(s);
                        t.kind = TK_ThreeDots;
                    }
                    else t.kind = TK_TwoDots;
                }
                else t.kind = TK_Dot;
            } break;
            case '\'':{
                // Unicode character literal: decode one UTF-8 codepoint and
                // require a closing quote.
                assert(s->stream.len >= s->iter);
                UTF32_Result decode = utf8_to_utf32(lexcp(s), s->stream.len - s->iter);
                if(!decode.error){
                    for(S32 i = 0; i < decode.advance; i++) lex_advance(s);
                    t.unicode = decode.out_str;
                    t.kind = TK_UnicodeLit;
                    if(lexc(s) == '\''){
                        lex_advance(s);
                    }
                    else{
                        token_error(&t, "Unclosed unicode literal"_s);
                    }
                }
                else{
                    token_error(&t, "Invalid UTF8 sequence in unicode literal"_s);
                }
            } break;
            case '<': {
                // '<<', '<<=', '<=', '<'
                if(lexc(s) == '<'){
                    lex_advance(s);
                    if(lexc(s) == '='){
                        lex_advance(s);
                        t.kind = TK_LeftShiftAssign;
                    } else {
                        t.kind = TK_LeftShift;
                    }
                } else if(lexc(s) == '='){
                    lex_advance(s);
                    t.kind = TK_LesserThenOrEqual;
                } else {
                    t.kind = TK_LesserThen;
                }
            } break;
            case '>': {
                // '>>', '>>=', '>=', '>'
                if(lexc(s) == '>'){
                    lex_advance(s);
                    if(lexc(s) == '='){
                        lex_advance(s);
                        t.kind = TK_RightShiftAssign;
                    } else {
                        t.kind = TK_RightShift;
                    }
                } else if(lexc(s) == '='){
                    lex_advance(s);
                    t.kind = TK_GreaterThenOrEqual;
                } else {
                    t.kind = TK_GreaterThen;
                }
            } break;
            case ':': {
                // '::', ':=', ':'
                if(lexc(s) == ':'){
                    lex_advance(s);
                    t.kind = TK_DoubleColon;
                } else if(lexc(s) == '='){
                    lex_advance(s);
                    t.kind = TK_ColonAssign;
                } else {
                    t.kind = TK_Colon;
                }
            } break;
            case '-':{
                // '-=', '--', '->', '-'
                if(lexc(s) == '='){
                    lex_advance(s);
                    t.kind = TK_SubAssign;
                } else if(lexc(s) == '-'){
                    lex_advance(s);
                    t.kind = TK_Decrement;
                } else if(lexc(s) == '>'){
                    lex_advance(s);
                    t.kind = TK_Arrow;
                } else {
                    t.kind = TK_Sub;
                }
            } break;
            case '"': {
                // String literal; on success strip the surrounding quotes.
                t.kind = TK_StringLit;
                lex_parse_string(s, &t, '"');
                if(t.kind != TK_Error){
                    t.str += 1;
                    t.len -= 2;
                }
                t.intern_val = intern_string(table, t.string);
            } break;
            case '/': {
                // Comments were consumed in the indent stage; here '/' is
                // always division.
                if(lexc(s) == '='){
                    t.kind = TK_DivAssign;
                    lex_advance(s);
                } else {
                    t.kind = TK_Div;
                }
            } break;
            case '0':{
                if(lexc(s) == 'x'){
                    // Hex literal: 0x....
                    lex_advance(s);
                    while(lex_is_numeric_base16(lexc(s))) lex_advance(s);
                    lex_set_len(s, &t);
                    t.str += 2; t.len -= 2; // drop "0x"
                    if(t.len == 0) token_error(&t, "Hex constant doesn't have value"_s);
                    else lex_parse_u64(lexer, &t, 16);
                    break;
                }
                else if(lexc(s) == 'b'){
                    // Binary literal: 0b....
                    lex_advance(s);
                    while(lexc(s) == '0' || lexc(s) == '1') lex_advance(s);
                    lex_set_len(s, &t);
                    t.str += 2; t.len -= 2; // drop "0b"
                    // BUG FIX: message previously said "Hex constant..."
                    // (copy-paste from the hex branch above).
                    if(t.len == 0) token_error(&t, "Binary constant doesn't have value"_s);
                    else lex_parse_u64(lexer, &t, 2);
                    break;
                }
            } /* fallthrough: plain decimal starting with '0' */
            case '1':case '2':case '3':case '4':
            case '5':case '6':case '7':case '8':case '9':{
                // Decimal integer or float literal; at most one '.' allowed.
                B32 found_dot = false;
                for(;;){
                    U8 c = lexc(s);
                    if(lex_is_numeric(c)){
                        // digit — keep scanning
                    }
                    else if(c == '.'){
                        if(found_dot){
                            token_error(&t, "Multiple '.' in float literal"_s);
                            goto end_of_switch;
                        }
                        found_dot = true;
                    }
                    else break;
                    lex_advance(s);
                }
                lex_set_len(s, &t);
                if(found_dot) lex_parse_f64(&t);
                else          lex_parse_u64(lexer, &t, 10);
            } break;
            case 'A':case 'B':case 'C':case 'D':case 'E':case 'F':
            case 'G':case 'H':case 'I':case 'J':case 'K':case 'L':
            case 'M':case 'N':case 'O':case 'P':case 'Q':case 'R':
            case 'S':case 'T':case 'U':case 'V':case 'W':case 'X':
            case 'Y':case 'Z':
            case 'a':case 'b':case 'c':case 'd':case 'e':case 'f':
            case 'g':case 'h':case 'i':case 'j':case 'k':case 'l':
            case 'm':case 'n':case 'o':case 'p':case 'q':case 'r':
            case 's':case 't':case 'u':case 'v':case 'w':case 'x':
            case 'y':case 'z':
            case '_': {
                // Identifier; promoted to keyword if its intern lands in the
                // keyword range set up by lex_set_keywords.
                t.kind = TK_Identifier;
                lex_parse_ident(table, s, &t);
                t.intern_val = intern_string(table, t.string);
                if(lex_is_keyword(table, t.intern_val)){
                    t.kind = TK_Keyword;
                }
            } break;
            default: {
                token_error(&t, "Unknown token"_s);
            }
        }
        end_of_switch:
        // Single-char tokens never called lex_set_len; fix up here.
        if(t.len == 0) lex_set_len(s, &t);
        lex_add_token(lexer, &t);
    }
    #undef CASE2
    #undef CASE3
}

// Reset the lexer's stream to `istream` (from file `file`) and tokenize it.
// The indent stack lives on the scratch arena and is seeded with the shared
// SAME_SCOPE sentinel.
CORE_Static void
lex_restream(Core_Ctx *lexer, String istream, String file){
    lexer->stream = {};
    lexer->stream.stream = istream;
    lexer->stream.line_begin = istream.str;
    lexer->stream.file = lexer->intern(file);
    Scratch_Scope _scope(lexer->scratch);
    lexer->stream.indent_stack.allocator = lexer->scratch;
    lexer->stream.indent_stack.add(&lexer->same_scope_token);
    lex__stream(lexer);
}

//-----------------------------------------------------------------------------
// Token metadata
//-----------------------------------------------------------------------------

// Human-readable display name for a token kind (used in diagnostics).
CORE_Static const char *
name(Token_Kind kind){
    switch(kind){
        case TK_End: return "End of stream";
        /*# import meta
        for i in meta.token_kinds:
            if i[1] != "SPECIAL":
                print("case TK_" + i[0] + f": return \"{i[1]}\";")
        */
        case TK_Mul: return "*";
        case TK_Div: return "/";
        case TK_Mod: return "%";
        case TK_LeftShift: return "<<";
        case TK_RightShift: return ">>";
        case TK_Add: return "+";
        case TK_Sub: return "-";
        case TK_Equals: return "==";
        case TK_LesserThenOrEqual: return "<=";
        case TK_GreaterThenOrEqual: return ">=";
        case TK_LesserThen: return "<";
        case TK_GreaterThen: return ">";
        case TK_NotEquals: return "!=";
        case TK_BitAnd: return "&";
        case TK_BitOr: return "|";
        case TK_BitXor: return "^";
        case TK_And: return "&&";
        case TK_Or: return "||";
        case TK_Neg: return "~";
        case TK_Not: return "!";
        case TK_Decrement: return "--";
        case TK_Increment: return "++";
        case TK_PostDecrement: return "--";
        case TK_PostIncrement: return "++";
        case TK_Assign: return "=";
        case TK_ColonAssign: return ":=";
        case TK_DivAssign: return "/=";
        case TK_MulAssign: return "*=";
        case TK_ModAssign: return "%=";
        case TK_SubAssign: return "-=";
        case TK_AddAssign: return "+=";
        case TK_AndAssign: return "&=";
        case TK_OrAssign: return "|=";
        case TK_XorAssign: return "^=";
        case TK_LeftShiftAssign: return "<<=";
        case TK_RightShiftAssign: return ">>=";
        case TK_OpenParen: return "(";
        case TK_CloseParen: return ")";
        case TK_OpenBrace: return "{";
        case TK_CloseBrace: return "}";
        case TK_OpenBracket: return "[";
        case TK_CloseBracket: return "]";
        case TK_Comma: return ",";
        case TK_Pound: return "#";
        case TK_Question: return "?";
        case TK_ThreeDots: return "...";
        case TK_Semicolon: return ";";
        case TK_Dot: return ".";
        case TK_TwoDots: return "..";
        case TK_NewLine: return "[NewLine]";
        case TK_Colon: return ":";
        case TK_DoubleColon: return "::";
        case TK_At: return "@";
        case TK_Arrow: return "->";
        case TK_ExprSizeof: return "[SizeOf]";
        case TK_DocComment: return "[///]";
        case TK_Comment: return "//";
        case TK_Identifier: return "[Ident]";
        case TK_UnicodeLit: return "[Unicode]";
        case TK_StringLit: return "[String]";
        case TK_Error: return "[Error]";
        case TK_Float: return "[Float]";
        case TK_Integer: return "[Int]";
        case TK_Keyword: return "[Keyword]";
        /*END*/
        case CLOSE_SCOPE: return "Close_Scope";
        case OPEN_SCOPE: return "Open_Scope";
        case SAME_SCOPE: return "Same_Scope";
        default: invalid_codepath; return "";
    }
}