global Token token_end_of_stream = {0}; function Token * token_alloc(Tokens *t){ if(t->cap == 0){ t->cap = 1024; t->tokens = malloc(sizeof(Token)*t->cap); } else if(t->len+1 > t->cap){ t->cap *= 2; t->tokens = realloc(t->tokens, sizeof(Token)*t->cap); } Token *result = t->tokens + t->len++; memory_zero(result, sizeof(*result)); return result; } function void lex_advance(Lex_Stream *s){ if(*s->stream == '\n'){ s->stream++; s->line++; s->line_begin = s->stream; } else if(*s->stream == 0){ // Don't advance, end of stream } else{ s->stream++; } } function B32 lex_is_whitespace(U8 c){ B32 result = c == '\n' || c == '\r' || c == ' ' || c == '\r'; return result; } function B32 lex_is_alphabetic(U8 c){ B32 result = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); return result; } function B32 lex_is_numeric(U8 c){ B32 result = c >= '0' && c <= '9'; return result; } function B32 lex_is_alphanumeric(U8 c){ B32 result = lex_is_numeric(c) || lex_is_alphabetic(c); return result; } function void lex_set_len(Lex_Stream *s, Token *token){ assert(s->stream >= token->str); token->len = s->stream - token->str; } function U8 lexc(Lex_Stream *s){ return *s->stream; } function void token_error(Token *t, String error_val){ t->kind = TK_Error; t->error_val = error_val; } function void lex_parse_string(Lex_Stream *s, Token *t, U8 c){ for(;;){ if(lexc(s) == '\\') lex_advance(s); else if(lexc(s) == c) break; else if(lexc(s) == 0){ token_error(t, lit("Unterminated string, reached end of file")); break; } lex_advance(s); } if(t->kind != TK_Error){ lex_advance(s); lex_set_len(s,t); } } function void lex_token_seed(Lex_Stream *s, Token *t){ t->str = s->stream; t->file = s->filename; t->line = s->line; t->line_begin = s->line_begin; } function U64 parse_u64(U8 *str, S64 len){ U64 result = 0; U64 m = 1; for(S64 i = len - 1; i >= 0; --i){ U64 val = str[i] - '0'; U64 new_val = val * m; assert_msg(result+new_val >= result, "Integer overflow!"); result+=new_val; m *= 10; } return result; } function void lex_base(Lex_Stream *s, Tokens *tokens){ while(*s->stream){ #define CASE2(op, OpName, Assign) \ case op: \ if (lexc(s) == '=') { \ lex_advance(s); \ t->kind = Assign; \ } else { \ t->kind = OpName; \ } \ break #define CASE3(op, OpName, Assign, Incr) \ case op: \ if (lexc(s) == '=') { \ lex_advance(s); \ t->kind = Assign; \ } else if (lexc(s) == op) { \ lex_advance(s); \ t->kind = Incr; \ } else { \ t->kind = OpName; \ } \ break Token *t = token_alloc(tokens); top: while(lex_is_whitespace(*s->stream)) lex_advance(s); lex_token_seed(s, t); lex_advance(s); switch(*t->str) { case 0: break; case '@': t->kind = TK_At; break; case '(': t->kind = TK_OpenParen; break; case ')': t->kind = TK_CloseParen; break; case '{': t->kind = TK_OpenBrace; break; case '}': t->kind = TK_CloseBrace; break; case '[': t->kind = TK_OpenBracket; break; case ']': t->kind = TK_CloseBracket; break; case ',': t->kind = TK_Comma; break; case '~': t->kind = TK_Neg; break; case '?': t->kind = TK_Question; break; case ';': t->kind = TK_Semicolon; break; CASE2('!', TK_Not, TK_NotEquals); CASE2('^', TK_BitXor, TK_XorAssign); CASE2('=', TK_Assign, TK_Equals); CASE2('*', TK_Mul, TK_MulAssign); CASE2('%', TK_Mod, TK_ModAssign); CASE3('+', TK_Add, TK_AddAssign, TK_Increment); CASE3('&', TK_BitAnd, TK_AndAssign, TK_And); CASE3('|', TK_BitOr, TK_OrAssign, TK_Or); #undef CASE2 #undef CASE3 case '#': { t->kind = TK_Pound; // @Todo(Krzosa): Some convenient way to recognize macros } break; case '.': { if(s->stream[0] == '.' && s->stream[1] == '.') { lex_advance(s); lex_advance(s); t->kind = TK_ThreeDots; } else { t->kind = TK_Dot; } } break; case '<': { if (lexc(s) == '<') { lex_advance(s); if (lexc(s) == '=') { lex_advance(s); t->kind = TK_LeftShiftAssign; } else { t->kind = TK_LeftShift; } } else if (lexc(s) == '=') { lex_advance(s); t->kind = TK_LesserThenOrEqual; } else { t->kind = TK_LesserThen; } } break; case '>': { if (lexc(s) == '>') { lex_advance(s); if (lexc(s) == '=') { lex_advance(s); t->kind = TK_RightShiftAssign; } else { t->kind = TK_RightShift; } } else if (lexc(s) == '=') { lex_advance(s); t->kind = TK_GreaterThenOrEqual; } else { t->kind = TK_GreaterThen; } } break; case ':': { if (lexc(s) == ':') { lex_advance(s); t->kind = TK_DoubleColon; } else { t->kind = TK_Colon; } } break; case '-':{ if (lexc(s) == '=') { lex_advance(s); t->kind = TK_SubAssign; } else if (lexc(s) == '-') { lex_advance(s); t->kind = TK_Decrement; } else if (lexc(s) == '>') { lex_advance(s); t->kind = TK_Arrow; } else { t->kind = TK_Sub; } } break; case '\'':{not_implemented;} break; case '"': { t->kind = TK_U8Lit; lex_parse_string(s,t,'"'); if(t->kind != TK_Error){ t->str += 1; t->len -= 2; } } break; case '/': { if(lexc(s) == '='){ t->kind = TK_DivAssign; lex_advance(s); } else if(lexc(s) == '/'){ lex_advance(s); if(lexc(s) == '/'){ lex_advance(s); //t->kind = TK_DocComment; } else { //t->kind = TK_Comment; } for(;;){ if(lexc(s) == '\n' || lexc(s) == 0) break; lex_advance(s); } goto top; //lex_set_len(s,t); } else if(lexc(s) == '*'){ lex_advance(s); //t->kind = TK_Comment; for(;;){ if(s->stream[0] == '*' && s->stream[1] == '/'){ lex_advance(s); lex_advance(s); break; } else if(lexc(s) == 0){ token_error(t, lit("Unterminated block comment")); break; } lex_advance(s); } goto top; //lex_set_len(s,t); } else t->kind = TK_Div; } break; case '0': case '1':case '2':case '3': case '4':case '5':case '6': case '7':case '8':case '9': { t->kind = TK_Int; while(lex_is_numeric(lexc(s))) lex_advance(s); lex_set_len(s, t); t->int_val = parse_u64(t->str, t->len); } break; case 'l':{ if(s->stream[0] == 'i' && s->stream[1] == 't' && s->stream[2] == '(' && s->stream[3] == '"'){ t->kind = TK_StringLit; lex_advance(s);lex_advance(s);lex_advance(s);lex_advance(s); lex_parse_string(s,t,'"'); if(s->stream[0] == ')') { t->str += 5; t->len -= 6; lex_advance(s); } else token_error(t, lit("Unterminated string literal, missing closing parenthesis")); break; } }; case 'A':case 'a':case 'M':case 'm':case 'B': case 'b':case 'N':case 'n':case 'C':case 'c':case 'O': case 'o':case 'D':case 'd':case 'P':case 'p':case 'E': case 'e':case 'Q':case 'q':case 'F':case 'f':case 'R': case 'r':case 'G':case 'g':case 'S':case 's':case 'H': case 'h':case 'T':case 't':case 'I':case 'i':case 'U': case 'u':case 'J':case 'j':case 'V':case 'v':case 'K': case 'k':case 'W':case 'w':case 'L':case 'X': case 'x':case 'Z':case 'z':case 'Y':case 'y':case '_': { t->kind = TK_Identifier; while(lex_is_alphanumeric(lexc(s)) || lexc(s) == '_') lex_advance(s); lex_set_len(s,t); } break; default:{ token_error(t, lit("Unknown token")); } break; } if(t->len==0){ lex_set_len(s,t); } } // Token end of stream Token *t = token_alloc(tokens); *t = token_end_of_stream; tokens->len -= 1; } function Tokens lex_stream(String in_stream, String filename){ Lex_Stream stream = {in_stream.str, in_stream.str, filename, 0}; Tokens tokens = {0}; lex_base(&stream, &tokens); return tokens; } function void parser_lex_stream(Parser *p, String in_stream, String filename){ Lex_Stream stream = {in_stream.str, in_stream.str, filename, 0}; p->tokens.len = 0; p->tokens.iter = 0; lex_base(&stream, &p->tokens); intern_tokens(p); } //----------------------------------------------------------------------------- // //----------------------------------------------------------------------------- function B32 token_compare(Token *t, String str){ B32 result = string_compare(t->string, str); return result; } function B32 token_is_comment(Token *token){ B32 result = token->kind == TK_Comment || token->kind == TK_DocComment; return result; } function Token * token_get(Parser *p){ Token *token = p->tokens.tokens + p->tokens.iter; return token; } function B32 intern_compare(Intern_String a, Intern_String b){ B32 result = a.s.str == b.s.str; return result; } function Token * token_is_keyword(Parser *p, Intern_String keyword){ assert(intern_is_keyword(p, keyword)); Token *t = token_get(p); if(t->kind == TK_Keyword && intern_compare(t->intern_val, keyword)){ return t; } return 0; } function void token_advance(Parser *p){ p->tokens.iter = clamp_top_s64(p->tokens.iter + 1, p->tokens.len); } function Token * token_next(Parser *p){ Token *token = token_get(p); token_advance(p); return token; } function Token * token_match(Parser *p, Token_Kind kind){ Token *token = token_get(p); if(token->kind == kind){ return token_next(p); } return 0; } function Token * token_match_keyword(Parser *p, Intern_String keyword){ assert(intern_is_keyword(p, keyword)); Token *token = token_get(p); if(token->kind == TK_Keyword && intern_compare(keyword, token->intern_val)){ return token_next(p); } return 0; } function Token * token_expect(Parser *p, Token_Kind kind){ Token *token = token_get(p); if(token->kind == kind){ return token_next(p); } parser_push_error(p, token, "Expected token of kind: %s, got instead token of kind: %s", token_kind_string[kind].str, token_kind_string[token->kind].str); return 0; } function B32 token_is(Parser *p, Token_Kind kind){ B32 result = token_get(p)->kind == kind; return result; } function Token * token_is_assignment(Parser *p){ Token *t = token_get(p); if(t->kind >= TK_Assign && t->kind <= TK_RightShiftAssign) return t; return 0; } function Token * token_peek(Parser *p, S64 count){ S64 index = clamp_top_s64(p->tokens.iter + count, p->tokens.len); Token *result = p->tokens.tokens + index; return result; } function Token * token_peek_is(Parser *p, S64 count, Token_Kind kind){ Token *token = token_peek(p, count); if(token->kind == kind) return token; return 0; } function Token * token_peek_is_keyword(Parser *p, S64 count, Intern_String keyword){ Token *token = token_peek(p, count); if(token->kind == TK_Keyword){ if(intern_compare(keyword, token->intern_val)){ return token; } } return 0; } function void lex_test(){ Tokens t; t = lex_stream(lit("3252342510 42524 \"U8Literal\""), lit("test")); //tokens_print(t); assert(t.len == 3); assert(t.tokens[0].int_val == 3252342510); assert(t.tokens[1].int_val == 42524); assert(t.tokens[2].kind == TK_U8Lit); assert(token_compare(t.tokens + 2, lit("U8Literal"))); t = lex_stream(lit("_identifier Thing Thing2 lit(\"String_Test\")"), lit("test")); //tokens_print(t); assert(t.tokens[0].kind == TK_Identifier); assert(t.tokens[1].kind == TK_Identifier); assert(t.tokens[2].kind == TK_Identifier); assert(t.tokens[3].kind == TK_StringLit); assert(token_compare(t.tokens, lit("_identifier"))); assert(token_compare(t.tokens+1, lit("Thing"))); assert(token_compare(t.tokens+2, lit("Thing2"))); assert(token_compare(t.tokens+3, lit("String_Test"))); t = lex_stream(lit("lit(\"String_Test\"{})(324*=+=-/ *% // Comment \n" "Thing /*Thing*/ += -= =- +/%^&*&&|| |>> << <<= >>=/*Error"), lit("test")); assert(t.tokens[0].kind == TK_Error); //tokens_print(t); }