// Token kinds produced by the lexer. The TK_First*/TK_Last* aliases mark
// inclusive ranges so the parser can classify a token with one range check
// (e.g. kind >= TK_FirstMul && kind <= TK_LastMul).
enum Token_Kind{
    TK_End,
    // Multiplicative / shift operators.
    TK_Mul, TK_Div, TK_Mod, TK_LeftShift, TK_RightShift,
    TK_FirstMul = TK_Mul, TK_LastMul = TK_RightShift,
    // Additive operators.
    TK_Add, TK_Sub,
    TK_FirstAdd = TK_Add, TK_LastAdd = TK_Sub,
    // Comparison operators.
    TK_Equals, TK_LesserThenOrEqual, TK_GreaterThenOrEqual,
    TK_LesserThen, TK_GreaterThen, TK_NotEquals,
    TK_FirstCompare = TK_Equals, TK_LastCompare = TK_NotEquals,
    // Bitwise and logical operators.
    TK_BitAnd, TK_BitOr, TK_And, TK_Or,
    TK_FirstLogical = TK_BitAnd, TK_LastLogical = TK_Or,
    TK_Neg, TK_Not,
    // Punctuation.
    TK_OpenParen, TK_CloseParen, TK_OpenBrace, TK_CloseBrace,
    TK_OpenBracket, TK_CloseBracket,
    TK_Comma, TK_Pound, TK_Question, TK_ThreeDots, TK_Semicolon,
    TK_Dot, TK_NewLine, TK_Colon,
    // Assignment operators.
    TK_Assign, TK_DivAssign, TK_MulAssign, TK_ModAssign, TK_SubAssign,
    TK_AddAssign, TK_AndAssign, TK_OrAssign, TK_XorAssign,
    TK_LeftShiftAssign, TK_RightShiftAssign,
    TK_FirstAssign = TK_Assign, TK_LastAssign = TK_RightShiftAssign,
    TK_DoubleColon, TK_At,
    TK_Decrement, TK_Increment, TK_PostDecrement, TK_PostIncrement,
    TK_Arrow, TK_ExprSizeof,
    // Token classes with payload (comments, literals, errors, ...).
    TK_DocComment, TK_Comment, TK_Identifier, TK_StringLit, TK_Character,
    TK_Error, TK_Float, TK_Integer, TK_Keyword, TK_Pointer, TK_Dereference,
    // These are not produced by lexer
    // but identified by parser
    OPEN_SCOPE,
    CLOSE_SCOPE,
    SAME_SCOPE,
};

// A single lexed token. `string`/(`str`,`len`) alias the source text of the
// token (points into the lexed stream, not a copy). The second union holds
// the kind-specific payload: parsed integer, float, error message, interned
// identifier/string, or indentation count for TK_NewLine.
struct Token{
    Token_Kind kind;
    union{
        String string;
        struct{U8 *str; S64 len;};
    };
    union {
        U64 int_val;
        F64 float_val;
        String error_val;
        Intern_String intern_val;
        S64 indent;
    };
    String file;       // file name this token came from (for diagnostics)
    S32 line;          // 1-based? line counter starts at 0 and increments on '\n' — see lex_advance
    U8 *line_begin;    // pointer to the start of the token's line (for diagnostics)
};

// Cursor over the input text plus per-file lexing state.
struct Lex_Stream{
    String stream;           // whole input buffer
    S64 iter;                // current byte offset into stream
    U8 *line_begin;          // start of the current line
    String file;             // file name for diagnostics
    S32 line;                // current line number
    S32 inside_brace_paren;  // nesting depth of (), {}, []; newlines are skipped while > 0
    S32 last_valid_indent;
};

// Lexer = stream state + output token array + intern table for identifiers,
// strings and keywords.
struct Lexer{
    Lex_Stream stream;
    Array tokens;
    Intern_Table interns;
    S64 token_iter;  // read cursor used by the consumer of `tokens`
};

// Current byte (no bounds check; relies on a 0 terminator / lex_advance's guard).
function U8 lexc(Lex_Stream *s){ return s->stream.str[s->iter]; }
// Lookahead: byte at offset i from the cursor.
function U8 lexci(Lex_Stream *s, S32 i){ return s->stream.str[s->iter+i]; }
// Pointer to the current byte.
function U8 * lexcp(Lex_Stream *s){ return s->stream.str + s->iter; }

// "Whitespace" here is only space and '\r' — '\n' and '\t' are significant
// (newline tokens / indentation).
function B32 lex_is_whitespace(U8 c){ B32 result = c == ' ' || c == '\r'; return result; }
function B32 lex_is_alphabetic(U8 c){ B32 result = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); return result; }
function B32 lex_is_numeric(U8 c){ B32 result = c >= '0' && c <= '9'; return result; }
function B32 lex_is_alphanumeric(U8 c){ B32 result = lex_is_numeric(c) || lex_is_alphabetic(c); return result; }

// Sets token->len from the distance between the cursor and the token start.
function void lex_set_len(Lex_Stream *s, Token *token){
    assert(lexcp(s) >= token->str);
    token->len = lexcp(s) - token->str;
}

// Interns every keyword string and records the [first_keyword, last_keyword]
// pointer range in the intern table. Relies on the intern arena handing out
// monotonically increasing addresses for consecutive interns — keywords must
// be registered before any other identifier is interned.
function void lex_set_keywords(Lexer *lexer, Array keywords){
    Intern_String keyword = {};
    For(keywords){
        keyword = intern_string(&lexer->interns, *it);
        if(it == keywords.begin()) lexer->interns.first_keyword = keyword.str;
    }
    lexer->interns.last_keyword = keyword.str;
}

// A keyword is any interned string whose storage falls inside the pointer
// range recorded by lex_set_keywords. (Parameter is the intern table despite
// the name `lexer`.)
function B32 lex_is_keyword(Intern_Table *lexer, Intern_String keyword){
    B32 result = keyword.str >= lexer->first_keyword && keyword.str <= lexer->last_keyword;
    return result;
}

// Turns a token into an error token carrying a message.
function void token_error(Token *t, String error_val){
    t->kind = TK_Error;
    t->error_val = error_val;
}

// Parses t->str/t->len (decimal digits) into t->int_val, scanning from the
// least significant digit. Flags overflow when the running sum wraps.
// NOTE(review): only the addition is overflow-checked; `val * m` and
// `m *= 10` can wrap silently for inputs well beyond 20 digits — the 2^64
// test case happens to be caught by the addition check. Confirm whether
// longer inputs need a stricter check.
function void lex_parse_u64(Token *t){
    U64 result = 0;
    U64 m = 1;
    for(S64 i = t->len - 1; i >= 0; --i){
        U64 val = t->str[i] - '0';
        U64 new_val = val * m;
        if((result + new_val) < result){
            token_error(t, "Integer overflow"_s);
            return;
        }
        result+=new_val;
        m *= 10;
    }
    t->int_val = result;
}

// Advances the cursor by one byte, maintaining line number and line_begin.
// Safe to call at end of stream (no-op past stream.len).
function void lex_advance(Lex_Stream *s){
    if(s->iter >= s->stream.len){
        return;
    }
    else if(lexc(s) == '\n'){
        s->iter++;
        s->line++;
        s->line_begin = lexcp(s);
    }
    else{
        s->iter++;
    }
}

// Scans until the closing quote `c`, skipping backslash-escaped characters.
// On success the cursor is left after the closing quote and t->len covers
// the literal including both quotes; on EOF the token becomes TK_Error.
function void lex_parse_string(Lex_Stream *s, Token *t, U8 c){
    for(;;){
        if(lexc(s) == '\\') lex_advance(s);
        else if(lexc(s) == c) break;
        else if(lexc(s) == 0){
            token_error(t, "Unterminated string, reached end of file"_s);
            break;
        }
        lex_advance(s);
    }
    if(t->kind != TK_Error){
        lex_advance(s);
        lex_set_len(s,t);
    }
}

// Switch-case helper for two-character operators: `op=` -> Assign, else OpName.
// (No comments inside the macros: line splicing happens before comment
// removal, so a // comment would swallow the trailing backslash.)
#define CASE2(op, OpName, Assign) \
    case op: \
        if (lexc(s) == '=') { \
            lex_advance(s); \
            t.kind = Assign; \
        } else { \
            t.kind = OpName; \
        } \
        break

#define CASE3(op, OpName, Assign, Incr) \
    case op: \
        if (lexc(s) == '=') { \
            lex_advance(s); \
            t.kind = Assign; \
        } else if (lexc(s) == op) { \
            lex_advance(s); \
            t.kind = Incr; \
        } else { \
            t.kind = OpName; \
        } \
        break

// Core tokenizer: consumes the stream and appends tokens to `array`.
// Identifiers and string literals are interned through `table`. Newline
// tokens carry the following line's indentation in t.indent; consecutive
// newline tokens are collapsed and newlines inside (), {}, [] are dropped.
function void lex__stream(Intern_Table *table, Array *array, Lex_Stream *s){
    while(lexc(s)){
        // Skip carriage returns so CRLF behaves like LF.
        while(lexc(s) == '\r') lex_advance(s);
        Token t = {};
        t.str = lexcp(s);
        t.file = s->file;
        t.line = s->line;
        t.line_begin = s->line_begin;
        lex_advance(s);
        // Dispatch on the first byte of the token (already consumed).
        switch(*t.str){
            case 0 : break;
            case '@': t.kind = TK_At; break;
            // Brackets also track nesting depth so newlines inside them can be skipped.
            case '(': s->inside_brace_paren++; t.kind = TK_OpenParen; break;
            case ')': s->inside_brace_paren--; t.kind = TK_CloseParen; break;
            case '{': s->inside_brace_paren++; t.kind = TK_OpenBrace; break;
            case '}': s->inside_brace_paren--; t.kind = TK_CloseBrace; break;
            case '[': s->inside_brace_paren++; t.kind = TK_OpenBracket; break;
            case ']': s->inside_brace_paren--; t.kind = TK_CloseBracket; break;
            case ',': t.kind = TK_Comma; break;
            case '~': t.kind = TK_Neg; break;
            case '?': t.kind = TK_Question; break;
            case '#': t.kind = TK_Pound; break;
            case '^': t.kind = TK_Pointer; break;
            CASE2('!', TK_Not, TK_NotEquals);
            CASE2('=', TK_Assign, TK_Equals);
            CASE2('*', TK_Mul, TK_MulAssign);
            CASE2('%', TK_Mod, TK_ModAssign);
            CASE3('+', TK_Add, TK_AddAssign, TK_Increment);
            CASE3('&', TK_BitAnd, TK_AndAssign, TK_And);
            CASE3('|', TK_BitOr, TK_OrAssign, TK_Or);
#undef CASE2
#undef CASE3
            case ';': {
                t.kind = TK_Semicolon;
            }break;
            // NOTE(review): rewinding the stream base pointer by one makes the
            // space/'\r' byte part of the newline token's text — looks
            // intentional but confirm; it mutates s->stream.str for the rest
            // of the lex. Intentional fallthrough into the '\n' case.
            case '\r': case ' ' : s->stream.str -= 1; /* fallthrough */
            case '\n': {
                t.kind = TK_NewLine;
                if(lexc(s) == '\r') lex_advance(s);
                // Count the indentation of the following line (spaces and tabs
                // both count as 1).
                for(;;){
                    if(lexc(s) == ' ') {
                        t.indent++; // @Todo(Krzosa): Detect indentation method, file an error while methods are mixed
                    }
                    else if(lexc(s) == '\t') t.indent++;
                    else break;
                    lex_advance(s);
                }
            }break;
            case '.': {
                if(lexc(s) == '.' && lexci(s,1) == '.') {
                    lex_advance(s);
                    lex_advance(s);
                    t.kind = TK_ThreeDots;
                } else {
                    t.kind = TK_Dot;
                }
            } break;
            case '<': {
                // <<=, <<, <=, <
                if (lexc(s) == '<') {
                    lex_advance(s);
                    if (lexc(s) == '=') {
                        lex_advance(s);
                        t.kind = TK_LeftShiftAssign;
                    } else {
                        t.kind = TK_LeftShift;
                    }
                } else if (lexc(s) == '=') {
                    lex_advance(s);
                    t.kind = TK_LesserThenOrEqual;
                } else {
                    t.kind = TK_LesserThen;
                }
            } break;
            case '>': {
                // >>=, >>, >=, >
                if (lexc(s) == '>') {
                    lex_advance(s);
                    if (lexc(s) == '=') {
                        lex_advance(s);
                        t.kind = TK_RightShiftAssign;
                    } else {
                        t.kind = TK_RightShift;
                    }
                } else if (lexc(s) == '=') {
                    lex_advance(s);
                    t.kind = TK_GreaterThenOrEqual;
                } else {
                    t.kind = TK_GreaterThen;
                }
            } break;
            case ':': {
                if (lexc(s) == ':') {
                    lex_advance(s);
                    t.kind = TK_DoubleColon;
                } else {
                    t.kind = TK_Colon;
                }
            } break;
            case '-':{
                // -=, --, ->, -
                if (lexc(s) == '=') {
                    lex_advance(s);
                    t.kind = TK_SubAssign;
                } else if (lexc(s) == '-') {
                    lex_advance(s);
                    t.kind = TK_Decrement;
                } else if (lexc(s) == '>') {
                    lex_advance(s);
                    t.kind = TK_Arrow;
                } else {
                    t.kind = TK_Sub;
                }
            } break;
            // Character literals are not supported yet.
            case '\'':{not_implemented;} break;
            case '"': {
                t.kind = TK_StringLit;
                lex_parse_string(s,&t,'"');
                // Strip the surrounding quotes from the token text.
                if(t.kind != TK_Error){
                    t.str += 1;
                    t.len -= 2;
                }
                t.intern_val = intern_string(table, t.string);
            } break;
            case '/': {
                if(lexc(s) == '='){
                    t.kind = TK_DivAssign;
                    lex_advance(s);
                }
                else if(lexc(s) == '/'){
                    // Line comment: skip to end of line and emit nothing.
                    lex_advance(s);
                    t.kind = TK_Comment;
                    for(;;){
                        if(lexc(s) == '\n' || lexc(s) == 0) break;
                        lex_advance(s);
                    }
                    continue;
                }
                else if(lexc(s) == '*'){
                    // Block comment: skip to the matching */ and emit nothing;
                    // an unterminated comment falls through to emit TK_Error.
                    lex_advance(s);
                    t.kind = TK_Comment;
                    for(;;){
                        if(lexc(s) == '*' && lexci(s,1) == '/'){
                            lex_advance(s);
                            lex_advance(s);
                            break;
                        }
                        else if(lexc(s) == 0){
                            token_error(&t, "Unterminated block comment"_s);
                            goto skip_continue;
                        }
                        lex_advance(s);
                    }
                    continue;
                    skip_continue:;
                }
                else {
                    t.kind = TK_Div;
                }
            } break;
            case '0':case '1':case '2':case '3':case '4':
            case '5':case '6':case '7':case '8':case '9':{
                t.kind = TK_Integer;
                while(lex_is_numeric(lexc(s))) lex_advance(s);
                lex_set_len(s, &t);
                lex_parse_u64(&t);
            } break;
            case 'A':case 'a':case 'M':case 'm':case 'B':
            case 'b':case 'N':case 'n':case 'C':case 'c':case 'O':
            case 'o':case 'D':case 'd':case 'P':case 'p':case 'E':
            case 'e':case 'Q':case 'q':case 'F':case 'f':case 'R':
            case 'r':case 'G':case 'g':case 'S':case 's':case 'H':
            case 'h':case 'T':case 't':case 'I':case 'i':case 'U':
            case 'u':case 'J':case 'j':case 'V':case 'v':case 'K':
            case 'k':case 'W':case 'w':case 'L':case 'X':case 'l':
            case 'x':case 'Z':case 'z':case 'Y':case 'y':case '_':
            {
                // Identifier or keyword: keywords are detected by intern-range
                // check (see lex_set_keywords / lex_is_keyword).
                t.kind = TK_Identifier;
                while(lex_is_alphanumeric(lexc(s)) || lexc(s) == '_') lex_advance(s);
                lex_set_len(s,&t);
                t.intern_val = intern_string(table, t.string);
                if(lex_is_keyword(table, t.intern_val)){
                    t.kind = TK_Keyword;
                }
            } break;
            default: {
                token_error(&t, "Unknown token"_s);
            }
        }
        // Single-character tokens never set len explicitly.
        if(t.len==0) lex_set_len(s,&t);
        // Suppress newlines inside brackets and collapse runs of newlines
        // (the newer newline, with the most recent indent, replaces the older).
        B32 skip = 0;
        if(t.kind == TK_NewLine){
            if(s->inside_brace_paren > 0) skip = 1;
            if(array->len > 0 && array->last()->kind == TK_NewLine) array->pop();
        }
        if(!skip){
            array->add(t);
        }
        while(lex_is_whitespace(lexc(s))) lex_advance(s);
        if(s->iter >= s->stream.len) // End of stream
            break;
    }
}

// Allocates the token array and intern table for a lexer.
function void lex_init(Allocator *token_string_arena, Allocator *map_allocator, Lexer *l){
    l->tokens = array_make(token_string_arena, 1024*2);
    l->interns= intern_table_make(token_string_arena, map_allocator, 1024);
}

// Convenience constructor wrapping lex_init.
function Lexer lex_make(Allocator *token_string_arena, Allocator *map_allocator){
    Lexer result = {};
    lex_init(token_string_arena, map_allocator, &result);
    return result;
}

// Resets the lexer onto a new input buffer (keeping the intern table and its
// keywords) and tokenizes it from scratch.
function void lex_restream(Lexer *lexer, String istream, String file){
    lexer->stream = {};
    lexer->stream.stream = istream;
    lexer->stream.line_begin = istream.str;
    lexer->stream.file = file;
    lexer->tokens.clear();
    lexer->token_iter = 0;
    lex__stream(&lexer->interns, &lexer->tokens, &lexer->stream);
}

// One-shot: make a lexer and tokenize `istream` immediately.
function Lexer lex_stream(Allocator *token_string_arena, Allocator *map_allocator, String istream, String file){
    Lexer result = lex_make(token_string_arena, map_allocator);
    lex_restream(&result, istream, file);
    return result;
}

// Self-test: lexes a stress string and checks kinds, texts and integer values.
function void
lex_test(){
    Scratch scratch;
    // Stress input: keyword, comment, 2^64 (overflow -> TK_Error), every
    // punctuation/operator, string literals, compound assignments, integers
    // at and past limits, and keywords. Adjacent literals concatenate.
    String test = "Keyword //R\n 18446744073709551616{})(@?&+-;....->,:::/**/\"Thing\" Thingy"
        "\"Test_Meme\"+=-===42524 4294967295 18446744073709551615"
        "for if while switch :="_s;
    Array keywords = {scratch};
    keywords.add("Keyword"_s);
    keywords.add("for"_s);
    keywords.add("if"_s);
    keywords.add("while"_s);
    keywords.add("switch"_s);
    Lexer lexer = lex_make(scratch, scratch);
    lex_set_keywords(&lexer, keywords);
    lex_restream(&lexer, test, "Test1"_s);
    Array arr = lexer.tokens;
    // Expected token kinds, in lexing order (parallel to strs[]).
    Token_Kind kind[] = {
        TK_Keyword, TK_NewLine,
        TK_Error,TK_OpenBrace,TK_CloseBrace,TK_CloseParen,TK_OpenParen,
        TK_At,TK_Question,TK_BitAnd,TK_Add,TK_Sub,TK_Semicolon,
        TK_ThreeDots, TK_Dot, TK_Arrow, TK_Comma, TK_DoubleColon, TK_Colon,
        TK_StringLit, TK_Identifier, TK_StringLit, TK_AddAssign, TK_SubAssign,
        TK_Equals, TK_Integer, TK_Integer, TK_Integer,
        TK_Keyword, TK_Keyword, TK_Keyword, TK_Keyword, TK_Colon, TK_Assign,
        TK_End
    };
    // Expected token source texts (note: the newline token's text includes
    // the following indentation).
    String strs[] = {
        "Keyword"_s, "\n "_s,
        "18446744073709551616"_s,"{"_s,"}"_s,")"_s,"("_s,
        "@"_s,"?"_s,"&"_s,"+"_s,"-"_s,";"_s,
        "..."_s,"."_s,"->"_s,","_s,"::"_s,":"_s,
        "Thing"_s,"Thingy"_s,"Test_Meme"_s,
        "+="_s,"-="_s,
        "=="_s,"42524"_s,"4294967295"_s,"18446744073709551615"_s,
        "for"_s, "if"_s, "while"_s, "switch"_s, ":"_s, "="_s,
        ""_s,
    };
    // Expected int_val payloads for the TK_Integer tokens, in order.
    U64 vals[] = { 42524, 4294967295, 18446744073709551615llu };
    int ui = 0;
    For(arr){
        assert(it->kind == kind[i]);
        assert(string_compare(it->string, strs[i]));
        if(it->kind == TK_Integer){
            assert(it->int_val == vals[ui++]);
        }
    }
}
//-----------------------------------------------------------------------------
// Token metadata
//-----------------------------------------------------------------------------
// Maps a token kind to its printable spelling (operators) or a descriptive
// name (token classes).
function String token_kind_string(Token_Kind kind){
    switch(kind){
        case TK_End: return "End of stream"_s;
        case TK_Mul: return "*"_s;
        case TK_Div: return "/"_s;
        case TK_Add: return "+"_s;
        case TK_Sub: return "-"_s;
        case TK_Mod: return "%"_s;
        case TK_BitAnd: return "&"_s;
        case TK_BitOr: return "|"_s;
        case TK_Pointer: return
"^"_s; case TK_Neg: return "~"_s; case TK_Not: return "!"_s; case TK_OpenParen: return "("_s; case TK_CloseParen: return " "_s; case TK_OpenBrace: return "{"_s; case TK_CloseBrace: return "}"_s; case TK_OpenBracket: return "["_s; case TK_CloseBracket: return "]"_s; case TK_Comma: return ","_s; case TK_Pound: return "#"_s; case TK_Question: return "?"_s; case TK_ThreeDots: return "..."_s; case TK_Semicolon: return ";"_s; case TK_Dot: return "."_s; case TK_LesserThen: return "<"_s; case TK_GreaterThen: return ">"_s; case TK_Colon: return ":"_s; case TK_Assign: return "="_s; case TK_DivAssign: return "/="_s; case TK_MulAssign: return "*="_s; case TK_ModAssign: return "%="_s; case TK_SubAssign: return "-="_s; case TK_AddAssign: return "+="_s; case TK_AndAssign: return "&="_s; case TK_OrAssign: return "|="_s; case TK_XorAssign: return "^="_s; case TK_LeftShiftAssign: return "<<="_s; case TK_RightShiftAssign: return ">>="_s; case TK_DoubleColon: return "::"_s; case TK_At: return "@"_s; case TK_Decrement: return "--"_s; case TK_Increment: return "++"_s; case TK_PostDecrement: return "--"_s; case TK_PostIncrement: return "++"_s; case TK_LesserThenOrEqual: return "<="_s; case TK_GreaterThenOrEqual: return ">="_s; case TK_Equals: return "=="_s; case TK_And: return "&&"_s; case TK_Or: return "||"_s; case TK_NotEquals: return "!="_s; case TK_LeftShift: return "<<"_s; case TK_RightShift: return ">>"_s; case TK_Arrow: return "->"_s; case TK_NewLine: return "New_Line"_s; case TK_ExprSizeof: return "sizeof"_s; case TK_DocComment: return "Doc_Comment"_s; case TK_Comment: return "Comment"_s; case TK_Identifier: return "Identifier"_s; case TK_StringLit: return "String_Lit"_s; case TK_Character: return "Character"_s; case TK_Error: return "Error"_s; case TK_Float: return "Float"_s; case TK_Integer: return "Int"_s; case TK_Keyword: return "Keyword"_s; case CLOSE_SCOPE: return "Close_Scope"_s; case OPEN_SCOPE: return "Open_Scope"_s; case SAME_SCOPE: return "Same_Scope"_s; default: 
invalid_codepath; return ""_s; } }