global Intern_String keyword_if; global Intern_String keyword_for; global Intern_String keyword_cast; global Intern_String keyword_else; global Intern_String keyword_sizeof; global Intern_String keyword_typeof; global Intern_String keyword_while; global Intern_String keyword_switch; global Intern_String keyword_case; global Intern_String keyword_struct; global Intern_String keyword_enum; global Intern_String keyword_union; global U8 *first_keyword; global U8 *last_keyword; function void init_default_keywords(Intern_Table *t){ keyword_if = intern_string(t, lit("if")); first_keyword = keyword_if.s.str; keyword_cast = intern_string(t, lit("cast")); keyword_for = intern_string(t, lit("for")); keyword_else = intern_string(t, lit("else")); keyword_sizeof = intern_string(t, lit("sizeof")); keyword_typeof = intern_string(t, lit("typeof")); keyword_while = intern_string(t, lit("while")); keyword_switch = intern_string(t, lit("switch")); keyword_case = intern_string(t, lit("case")); keyword_struct = intern_string(t, lit("struct")); keyword_enum = intern_string(t, lit("enum")); keyword_union = intern_string(t, lit("union")); last_keyword = keyword_union.s.str; } function B32 lex_is_keyword(Intern_String str){ B32 result = str.s.str >= first_keyword && str.s.str <= last_keyword; return result; } typedef enum Token_Kind{ TK_End, TK_Mul, TK_Div, TK_Mod, TK_LeftShift, TK_RightShift, TK_FirstMul = TK_Mul, TK_LastMul = TK_RightShift, TK_Add, TK_Sub, TK_FirstAdd = TK_Add, TK_LastAdd = TK_Sub, TK_Equals, TK_LesserThenOrEqual, TK_GreaterThenOrEqual, TK_LesserThen, TK_GreaterThen, TK_NotEquals, TK_FirstCompare = TK_Equals, TK_LastCompare = TK_NotEquals, TK_BitAnd, TK_BitOr, TK_BitXor, TK_And, TK_Or, TK_FirstLogical = TK_BitAnd, TK_LastLogical = TK_Or, TK_Neg, TK_Not, TK_OpenParen, TK_CloseParen, TK_OpenBrace, TK_CloseBrace, TK_OpenBracket, TK_CloseBracket, TK_Comma, TK_Pound, TK_Question, TK_ThreeDots, TK_Semicolon, TK_Dot, TK_Colon, TK_Assign, TK_DivAssign, TK_MulAssign, TK_ModAssign, TK_SubAssign, TK_AddAssign, TK_AndAssign, TK_OrAssign, TK_XorAssign, TK_LeftShiftAssign, TK_RightShiftAssign, TK_DoubleColon, TK_At, TK_Decrement, TK_Increment, TK_PostDecrement, TK_PostIncrement, TK_Arrow, TK_ExprSizeof, TK_DocComment, TK_Comment, TK_Identifier, TK_StringLit, TK_Character, TK_Error, TK_Float, TK_Int, TK_Keyword, }Token_Kind; typedef struct Token{ Token_Kind kind; union{ String string; struct{U8 *str; S64 len;}; }; union { U64 int_val; F64 float_val; String error_val; Intern_String intern_val; }; String file; S32 line; U8 *line_begin; }Token; #include "token_array.c" typedef struct Lex_Stream{ String stream; S64 iter; U8 *line_begin; String file; S32 line; }Lex_Stream; function U8 lexc(Lex_Stream *s){ return s->stream.str[s->iter]; } function U8 lexci(Lex_Stream *s, S32 i){ return s->stream.str[s->iter+i]; } function U8 * lexcp(Lex_Stream *s){ return s->stream.str + s->iter; } function B32 lex_is_whitespace(U8 c){ B32 result = c == '\n' || c == '\r' || c == ' ' || c == '\r'; return result; } function B32 lex_is_alphabetic(U8 c){ B32 result = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); return result; } function B32 lex_is_numeric(U8 c){ B32 result = c >= '0' && c <= '9'; return result; } function B32 lex_is_alphanumeric(U8 c){ B32 result = lex_is_numeric(c) || lex_is_alphabetic(c); return result; } function void lex_set_len(Lex_Stream *s, Token *token){ assert(lexcp(s) >= token->str); token->len = lexcp(s) - token->str; } function void token_error(Token *t, String error_val){ t->kind = TK_Error; t->error_val = error_val; } function void lex_parse_u64(Token *t){ U64 result = 0; U64 m = 1; for(S64 i = t->len - 1; i >= 0; --i){ U64 val = t->str[i] - '0'; U64 new_val = val * m; if((result + new_val) < result){ token_error(t, lit("Integer overflow")); return; } result+=new_val; m *= 10; } t->int_val = result; } function void lex_advance(Lex_Stream *s){ if(s->iter >= s->stream.len){ return; } else if(lexc(s) == '\n'){ s->iter++; s->line++; s->line_begin = lexcp(s); } else{ s->iter++; } } function Token token_int(U64 val){ Token result = {.kind = TK_Int, .int_val=val}; return result; } function void lex_parse_string(Lex_Stream *s, Token *t, U8 c){ for(;;){ if(lexc(s) == '\\') lex_advance(s); else if(lexc(s) == c) break; else if(lexc(s) == 0){ token_error(t, lit("Unterminated string, reached end of file")); break; } lex_advance(s); } if(t->kind != TK_Error){ lex_advance(s); lex_set_len(s,t); } } #define CASE2(op, OpName, Assign) \ case op: \ if (lexc(s) == '=') { \ lex_advance(s); \ t.kind = Assign; \ } else { \ t.kind = OpName; \ } \ break #define CASE3(op, OpName, Assign, Incr) \ case op: \ if (lexc(s) == '=') { \ lex_advance(s); \ t.kind = Assign; \ } else if (lexc(s) == op) { \ lex_advance(s); \ t.kind = Incr; \ } else { \ t.kind = OpName; \ } \ break function void lex__stream(Token_Array *array, Lex_Stream *s){ while(lexc(s)){ while(lex_is_whitespace(lexc(s))) lex_advance(s); Token t = {0}; t.str = lexcp(s); t.file = s->file; t.line = s->line; t.line_begin = s->line_begin; lex_advance(s); switch(*t.str){ case 0: break; case '@': t.kind = TK_At; break; case '(': t.kind = TK_OpenParen; break; case ')': t.kind = TK_CloseParen; break; case '{': t.kind = TK_OpenBrace; break; case '}': t.kind = TK_CloseBrace; break; case '[': t.kind = TK_OpenBracket; break; case ']': t.kind = TK_CloseBracket; break; case ',': t.kind = TK_Comma; break; case '~': t.kind = TK_Neg; break; case '?': t.kind = TK_Question; break; case ';': t.kind = TK_Semicolon; break; case '#': t.kind = TK_Pound; break; CASE2('!', TK_Not, TK_NotEquals); CASE2('^', TK_BitXor, TK_XorAssign); CASE2('=', TK_Assign, TK_Equals); CASE2('*', TK_Mul, TK_MulAssign); CASE2('%', TK_Mod, TK_ModAssign); CASE3('+', TK_Add, TK_AddAssign, TK_Increment); CASE3('&', TK_BitAnd, TK_AndAssign, TK_And); CASE3('|', TK_BitOr, TK_OrAssign, TK_Or); #undef CASE2 #undef CASE3 case '.': { if(lexc(s) == '.' && lexci(s,1) == '.') { lex_advance(s); lex_advance(s); t.kind = TK_ThreeDots; } else { t.kind = TK_Dot; } } break; case '<': { if (lexc(s) == '<') { lex_advance(s); if (lexc(s) == '=') { lex_advance(s); t.kind = TK_LeftShiftAssign; } else { t.kind = TK_LeftShift; } } else if (lexc(s) == '=') { lex_advance(s); t.kind = TK_LesserThenOrEqual; } else { t.kind = TK_LesserThen; } } break; case '>': { if (lexc(s) == '>') { lex_advance(s); if (lexc(s) == '=') { lex_advance(s); t.kind = TK_RightShiftAssign; } else { t.kind = TK_RightShift; } } else if (lexc(s) == '=') { lex_advance(s); t.kind = TK_GreaterThenOrEqual; } else { t.kind = TK_GreaterThen; } } break; case ':': { if (lexc(s) == ':') { lex_advance(s); t.kind = TK_DoubleColon; } else { t.kind = TK_Colon; } } break; case '-':{ if (lexc(s) == '=') { lex_advance(s); t.kind = TK_SubAssign; } else if (lexc(s) == '-') { lex_advance(s); t.kind = TK_Decrement; } else if (lexc(s) == '>') { lex_advance(s); t.kind = TK_Arrow; } else { t.kind = TK_Sub; } } break; case '\'':{not_implemented;} break; case '"': { t.kind = TK_StringLit; lex_parse_string(s,&t,'"'); if(t.kind != TK_Error){ t.str += 1; t.len -= 2; } t.intern_val = intern_string(&array->interns, t.string); } break; case '/': { if(lexc(s) == '='){ t.kind = TK_DivAssign; lex_advance(s); } else if(lexc(s) == '/'){ lex_advance(s); t.kind = TK_Comment; for(;;){ if(lexc(s) == '\n' || lexc(s) == 0) break; lex_advance(s); } continue; } else if(lexc(s) == '*'){ lex_advance(s); t.kind = TK_Comment; for(;;){ if(lexc(s) == '*' && lexci(s,1) == '/'){ lex_advance(s); lex_advance(s); break; } else if(lexc(s) == 0){ token_error(&t, lit("Unterminated block comment")); goto skip_continue; } lex_advance(s); } continue; skip_continue:; } else { t.kind = TK_Div; } } break; case '0':case '1':case '2':case '3':case '4': case '5':case '6':case '7':case '8':case '9':{ t.kind = TK_Int; while(lex_is_numeric(lexc(s))) lex_advance(s); lex_set_len(s, &t); lex_parse_u64(&t); } break; case 'A':case 'a':case 'M':case 'm':case 'B': case 'b':case 'N':case 'n':case 'C':case 'c':case 'O': case 'o':case 'D':case 'd':case 'P':case 'p':case 'E': case 'e':case 'Q':case 'q':case 'F':case 'f':case 'R': case 'r':case 'G':case 'g':case 'S':case 's':case 'H': case 'h':case 'T':case 't':case 'I':case 'i':case 'U': case 'u':case 'J':case 'j':case 'V':case 'v':case 'K': case 'k':case 'W':case 'w':case 'L':case 'X':case 'l': case 'x':case 'Z':case 'z':case 'Y':case 'y':case '_': { t.kind = TK_Identifier; while(lex_is_alphanumeric(lexc(s)) || lexc(s) == '_') lex_advance(s); lex_set_len(s,&t); t.intern_val = intern_string(&array->interns, t.string); if(lex_is_keyword(t.intern_val)){ t.kind = TK_Keyword; } } break; default: { token_error(&t, lit("Unknown token")); } } if(t.len==0) lex_set_len(s,&t); token_array_push(array, &t); } } function Token_Array lex_stream(Arena *arena, String stream, String file){ Lex_Stream s = {stream, 0, stream.str, file, 0}; Token_Array array = token_array_make(arena); init_default_keywords(&array.interns); lex__stream(&array, &s); return array; } function void lex_test(){ Arena *scratch = arena_begin_scratch(); String test = lit("18446744073709551616{})(@?&+-;....->,:::/**/\"Thing\"//R\n Thingy" "\"Test_Meme\"+=-===42524 4294967295 18446744073709551615" "for if while switch"); Token_Array array = lex_stream(scratch, test, lit("Test1")); Token_Kind kind[] = { TK_Error,TK_OpenBrace,TK_CloseBrace,TK_CloseParen,TK_OpenParen, TK_At,TK_Question,TK_BitAnd,TK_Add,TK_Sub,TK_Semicolon, TK_ThreeDots, TK_Dot, TK_Arrow, TK_Comma, TK_DoubleColon, TK_Colon, TK_StringLit, TK_Identifier, TK_StringLit, TK_AddAssign, TK_SubAssign, TK_Equals, TK_Int, TK_Int, TK_Int, TK_Keyword, TK_Keyword, TK_Keyword, TK_Keyword, TK_End }; String strs[] = { lit("18446744073709551616"),lit("{"),lit("}"),lit(")"),lit("("), lit("@"),lit("?"),lit("&"),lit("+"),lit("-"),lit(";"), lit("..."),lit("."),lit("->"),lit(","),lit("::"),lit(":"), lit("Thing"),lit("Thingy"),lit("Test_Meme"), lit("+="),lit("-="), lit("=="),lit("42524"),lit("4294967295"),lit("18446744073709551615"), lit("for"), lit("if"), lit("while"), lit("switch"), lit(""), }; U64 vals[] = { 42524, 4294967295, 18446744073709551615llu }; int i = 0; int ui = 0; for(Token *t = token_array_iter_begin(&array); t->kind != TK_End; t = token_array_iter_next(&array)){ assert(t->kind == kind[i]); assert(string_compare(t->string, strs[i++])); if(t->kind == TK_Int){ assert(t->int_val == vals[ui++]); } } arena_end_scratch(); } //----------------------------------------------------------------------------- // Token metadata //----------------------------------------------------------------------------- global const char *token_kind_string[] = { [TK_End] = "End of stream", [TK_Mul] = "*", [TK_Div] = "/", [TK_Add] = "+", [TK_Sub] = "-", [TK_Mod] = "%", [TK_BitAnd] = "&", [TK_BitOr] = "|", [TK_BitXor] = "^", [TK_Neg] = "~", [TK_Not] = "!", [TK_OpenParen] = "(", [TK_CloseParen] = " ", [TK_OpenBrace] = "{", [TK_CloseBrace] = "}", [TK_OpenBracket] = "[", [TK_CloseBracket] = "]", [TK_Comma] = ",", [TK_Pound] = "#", [TK_Question] = "?", [TK_ThreeDots] = "...", [TK_Semicolon] = ";", [TK_Dot] = ".", [TK_LesserThen] = "<", [TK_GreaterThen] = ">", [TK_Colon] = ":", [TK_Assign] = "=", [TK_DivAssign] = "/=", [TK_MulAssign] = "*=", [TK_ModAssign] = "%=", [TK_SubAssign] = "-=", [TK_AddAssign] = "+=", [TK_AndAssign] = "&=", [TK_OrAssign] = "|=", [TK_XorAssign] = "^=", [TK_LeftShiftAssign] = "<<=", [TK_RightShiftAssign] = ">>=", [TK_DoubleColon] = "::", [TK_At] = "@", [TK_Decrement] = "--", [TK_Increment] = "++", [TK_PostDecrement] = "--", [TK_PostIncrement] = "++", [TK_LesserThenOrEqual] = "<=", [TK_GreaterThenOrEqual] = ">=", [TK_Equals] = "==", [TK_And] = "&&", [TK_Or] = "||", [TK_NotEquals] = "!=", [TK_LeftShift] = "<<", [TK_RightShift] = ">>", [TK_Arrow] = "->", [TK_ExprSizeof] = "sizeof", [TK_DocComment] = "DocComment", [TK_Comment] = "Comment", [TK_Identifier] = "Identifier", [TK_StringLit] = "StringLit", [TK_Character] = "Character", [TK_Error] = "Error", [TK_Float] = "Float", [TK_Int] = "Int", [TK_Keyword] = "Keyword", };