enum Token_Kind{
    TK_End,

    TK_Mul,
    TK_Div,
    TK_Mod,
    TK_LeftShift,
    TK_RightShift,
    TK_FirstMul = TK_Mul,
    TK_LastMul  = TK_RightShift,

    TK_Add,
    TK_Sub,
    TK_FirstAdd = TK_Add,
    TK_LastAdd  = TK_Sub,

    TK_Equals,
    TK_LesserThenOrEqual,
    TK_GreaterThenOrEqual,
    TK_LesserThen,
    TK_GreaterThen,
    TK_NotEquals,
    TK_FirstCompare = TK_Equals,
    TK_LastCompare  = TK_NotEquals,

    TK_BitAnd,
    TK_BitOr,
    TK_BitXor,
    TK_And,
    TK_Or,
    TK_FirstLogical = TK_BitAnd,
    TK_LastLogical  = TK_Or,

    TK_Neg,
    TK_Not,
    TK_OpenParen,
    TK_CloseParen,
    TK_OpenBrace,
    TK_CloseBrace,
    TK_OpenBracket,
    TK_CloseBracket,
    TK_Comma,
    TK_Pound,
    TK_Question,
    TK_ThreeDots,
    TK_Semicolon,
    TK_Dot,
    TK_NewLine,
    TK_Colon,

    TK_Assign,
    TK_ColonAssign,
    TK_DivAssign,
    TK_MulAssign,
    TK_ModAssign,
    TK_SubAssign,
    TK_AddAssign,
    TK_AndAssign,
    TK_OrAssign,
    TK_XorAssign,
    TK_LeftShiftAssign,
    TK_RightShiftAssign,
    TK_FirstAssign = TK_Assign,
    TK_LastAssign  = TK_RightShiftAssign,

    TK_DoubleColon,
    TK_At,
    TK_Decrement,
    TK_Increment,
    TK_PostDecrement,
    TK_PostIncrement,
    TK_Arrow,
    TK_ExprSizeof,
    TK_DocComment,
    TK_Comment,
    TK_Identifier,
    TK_StringLit,
    TK_Character,
    TK_Error,
    TK_Float,
    TK_Integer,
    TK_Keyword,

    TK_Pointer     = TK_Mul,
    TK_Dereference = TK_BitAnd,

    // Scope markers: produced by the indentation pass rather than the
    // character-level token switch, and matched up by the parser.
    OPEN_SCOPE = 128,
    CLOSE_SCOPE,
    SAME_SCOPE,
};

struct Token{
    Token_Kind kind;
    union{
        String string;
        struct{U8 *str; S64 len;};
    };
    union{
        U64 int_val;
        F64 float_val;
        String error_val;
        Intern_String intern_val;
        S64 indent;
    };
    String file;
    S32 line;
    U8 *line_begin;
};

struct Lex_Stream{
    String stream;
    S64 iter;
    U8 *line_begin;
    String file;
    S32 line;
    S32 inside_brace_paren;
    Array indent_stack;
};

struct Lexer{
    Lex_Stream stream;
    Array tokens;
    Intern_Table interns;
    S64 token_iter;
};

force_inline B32 token_is_assign(Token_Kind kind){return kind >= TK_FirstAssign && kind <= TK_LastAssign;}
force_inline B32 token_is_assign(Token *token){return token_is_assign(token->kind);}

function U8 lexc(Lex_Stream *s){
    return s->stream.str[s->iter];
}

function U8 lexci(Lex_Stream *s, S32 i){
    return s->stream.str[s->iter + i];
}

function U8 *lexcp(Lex_Stream *s){
    return s->stream.str + s->iter;
}

function B32 lex_is_whitespace(U8 c){
    B32 result = c == ' ' || c == '\r';
    return result;
}

function B32 lex_is_alphabetic(U8 c){
    B32 result = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    return result;
}

function B32 lex_is_numeric(U8 c){
    B32 result = c >= '0' && c <= '9';
    return result;
}

function B32 lex_is_alphanumeric(U8 c){
    B32 result = lex_is_numeric(c) || lex_is_alphabetic(c);
    return result;
}

function void lex_set_len(Lex_Stream *s, Token *token){
    assert(lexcp(s) >= token->str);
    token->len = lexcp(s) - token->str;
}

function void lex_set_keywords(Lexer *lexer, Array keywords){
    Intern_String keyword = {};
    For(keywords){
        keyword = intern_string(&lexer->interns, it);
        if(&it == keywords.begin()) lexer->interns.first_keyword = keyword.str;
    }
    lexer->interns.last_keyword = keyword.str;
}

function B32 lex_is_keyword(Intern_Table *table, Intern_String keyword){
    B32 result = keyword.str >= table->first_keyword && keyword.str <= table->last_keyword;
    return result;
}

function void token_error(Token *t, String error_val){
    t->kind = TK_Error;
    t->error_val = error_val;
}

function void lex_parse_u64(Token *t){
    U64 result = 0;
    U64 m = 1;
    for(S64 i = t->len - 1; i >= 0; --i){
        U64 val = t->str[i] - '0';
        U64 new_val = val * m;
        // Guard both the multiply and the add against wrapping.
        if(val != 0 && new_val / val != m){
            token_error(t, "Integer overflow"_s);
            return;
        }
        if((result + new_val) < result){
            token_error(t, "Integer overflow"_s);
            return;
        }
        result += new_val;
        m *= 10;
    }
    t->int_val = result;
}
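// A minimal sanity sketch for lex_parse_u64 (illustrative only, not part of
// the lexer proper): the digits are walked right to left, so "905" accumulates
// 5*1 + 0*10 + 9*100 = 905, with the place value m growing by 10 each step.
function void lex_parse_u64_example(void){
    Token t = {};
    t.str = (U8 *)"905";
    t.len = 3;
    lex_parse_u64(&t);
    assert(t.kind != TK_Error && t.int_val == 905);
}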
function void lex_advance(Lex_Stream *s){
    if(s->iter >= s->stream.len){
        return;
    }
    else if(lexc(s) == '\n'){
        s->iter++;
        s->line++;
        s->line_begin = lexcp(s);
    }
    else{
        s->iter++;
    }
}

function void lex_parse_string(Lex_Stream *s, Token *t, U8 c){
    for(;;){
        if(lexc(s) == '\\') lex_advance(s); // skip the escaped character
        else if(lexc(s) == c) break;        // closing quote
        else if(lexc(s) == 0){
            token_error(t, "Unterminated string, reached end of file"_s);
            break;
        }
        lex_advance(s);
    }
    if(t->kind != TK_Error){
        lex_advance(s);                     // consume the closing quote
        lex_set_len(s, t);
    }
}

#define CASE2(op, OpName, Assign) \
    case op:                      \
        if(lexc(s) == '='){       \
            lex_advance(s);       \
            t.kind = Assign;      \
        } else{                   \
            t.kind = OpName;      \
        }                         \
        break

#define CASE3(op, OpName, Assign, Incr) \
    case op:                            \
        if(lexc(s) == '='){             \
            lex_advance(s);             \
            t.kind = Assign;            \
        } else if(lexc(s) == op){       \
            lex_advance(s);             \
            t.kind = Incr;              \
        } else{                         \
            t.kind = OpName;            \
        }                               \
        break

function Token token_make(U8 *str, String file, int line, U8 *line_begin){
    Token t = {};
    t.str = str;
    t.file = file;
    t.line = line;
    t.line_begin = line_begin;
    return t;
}

global Token token_null = {SAME_SCOPE};

function Token *lex_last_indent_token(Lex_Stream *s){
    if(s->indent_stack.len > 0){
        return *s->indent_stack.last();
    }
    return &token_null;
}

function B32 lex_is_scope(Token *t){
    B32 result = t->kind == OPEN_SCOPE || t->kind == CLOSE_SCOPE || t->kind == SAME_SCOPE;
    return result;
}

function void lex_unwind_indent_stack(Token *t, Lex_Stream *s, Array *array){
    for(S64 i = s->indent_stack.len - 1; i >= 0; i -= 1){
        auto it = s->indent_stack.data[i];
        assert(lex_is_scope(it));
        if(it->indent == t->indent){
            t->kind = SAME_SCOPE;
            array->add(*t);
            break;
        }
        else if(it->indent < t->indent){
            token_error(t, "Bad indentation"_s);
            array->add(*t);
            break;
        }
        else{
            s->indent_stack.pop();
            t->kind = CLOSE_SCOPE;
            array->add(*t);
        }
    }
}
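// Example of an unwind: with an indent stack of [0, 4, 8], a token at
// indent 0 pops twice, emitting CLOSE_SCOPE for levels 8 and 4, then matches
// level 0 and emits SAME_SCOPE. A token at indent 2 pops levels 8 and 4 the
// same way, finds no matching level, and is flagged "Bad indentation".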
function void lex__stream(Intern_Table *table, Array *array, Lex_Stream *s){
    B32 beginning = true;
    for(;;){
        if(lexc(s) == 0 || s->iter >= s->stream.len){
            Token t = token_make(lexcp(s), s->file, s->line, s->line_begin);
            lex_unwind_indent_stack(&t, s, array);
            break;
        }

        // @note: the lexer is a 2-stage process: first we tokenize the
        // indentation, then proceed to tokenize the good stuff.
        // For blocks of stmts we parse until we can't find another new line
        // of the same scope.
        // parse_decl doesn't require a preceding new line.
        //
        // In that way new lines act like commas in function params:
        // seeing a comma means there is a next thing to parse,
        // and it's easy to parse stuff using a do-while loop.

        // @note: first handle indentation.
        // Mostly we want to merge multiple new lines,
        // but for down scopes we want to emit 2 tokens.
        // That eases parsing: one token to break out
        // of a block, a second to allow continuation of the surrounding scope.
        Token t = token_make(lexcp(s), s->file, s->line, s->line_begin);
        B32 should_emit = beginning;
        for(;;){
            switch(lexc(s)){
                case '\t':
                case ' ':
                    lex_advance(s);
                    t.indent++;
                    break;
                case '\r':
                    lex_advance(s);
                    break;
                case '/':{
                    if(lexci(s,1) == '/'){
                        lex_advance(s);
                        lex_advance(s);
                        t.kind = TK_Comment;
                        for(;;){
                            if(lexc(s) == '\n' || lexc(s) == 0) break;
                            lex_advance(s);
                        }
                    }
                    else if(lexci(s,1) == '*'){
                        lex_advance(s);
                        lex_advance(s);
                        t.kind = TK_Comment;
                        for(;;){
                            if(lexc(s) == '*' && lexci(s,1) == '/'){
                                lex_advance(s);
                                lex_advance(s);
                                break;
                            }
                            else if(lexc(s) == 0){
                                token_error(&t, "Unterminated block comment"_s);
                                break;
                            }
                            lex_advance(s);
                        }
                    }
                    else goto indent_loop_break;
                } break;
                // @todo: add [;;] operator which adds a new scope
                // @todo: also need some way to detect indentation so that
                // first of all we can check for consistency, and second of
                // all because we would know by how much to indent
                // @todo: after detecting indentation, 2 spaces would become 1 indent value
                case ';':{
                    Token semi = token_make(lexcp(s), s->file, s->line, s->line_begin);
                    Token *last = lex_last_indent_token(s);
                    semi.kind = SAME_SCOPE;
                    semi.indent = last->indent;
                    lex_advance(s);
                    array->add(semi);
                } break;
                case '\n':{
                    lex_advance(s);
                    should_emit = true;
                    t = token_make(lexcp(s), s->file, s->line, s->line_begin);
                } break;
                // @todo: add open and close brace handling as OPEN_SCOPE/CLOSE_SCOPE.
                // When it comes to compound statements it's going to check for scopes
                // and then it's going to specialize and look for the brace string.
                // case '{':{
                //     s->inside_brace_paren++; t.kind = TK_OpenBrace;
                // } break;
                // case '}':{
                //     s->inside_brace_paren--;
                //     t.kind = CLOSE_SCOPE;
                // } break;
                default:{
                    if(s->inside_brace_paren) should_emit = false;
                    if(should_emit){
                        Token *last = lex_last_indent_token(s);
                        if(t.indent > last->indent){
                            t.kind = OPEN_SCOPE;
                            array->add(t);
                            s->indent_stack.add(array->last());
                        }
                        else if(t.indent < last->indent){
                            lex_unwind_indent_stack(&t, s, array);
                        }
                        else{
                            t.kind = SAME_SCOPE;
                            array->add(t);
                        }
                    }
                    goto indent_loop_break;
                }
            }
        }
        indent_loop_break:
        beginning = false;

        // @note: handle the indented token
        t = token_make(lexcp(s), s->file, s->line, s->line_begin);
        lex_advance(s);
        switch(*t.str){
            case 0:   break;
            case '@': t.kind = TK_At;       break;
            case '(': s->inside_brace_paren++; t.kind = TK_OpenParen;    break;
            case ')': s->inside_brace_paren--; t.kind = TK_CloseParen;   break;
            case '{': s->inside_brace_paren++; t.kind = TK_OpenBrace;    break;
            case '}': s->inside_brace_paren--; t.kind = TK_CloseBrace;   break;
            case '[': s->inside_brace_paren++; t.kind = TK_OpenBracket;  break;
            case ']': s->inside_brace_paren--; t.kind = TK_CloseBracket; break;
            case ',': t.kind = TK_Comma;    break;
            case '~': t.kind = TK_Neg;      break;
            case '?': t.kind = TK_Question; break;
            case '#': t.kind = TK_Pound;    break;
            case '^': t.kind = TK_BitXor;   break;
            CASE2('!', TK_Not,    TK_NotEquals);
            CASE2('=', TK_Assign, TK_Equals);
            CASE2('*', TK_Mul,    TK_MulAssign);
            CASE2('%', TK_Mod,    TK_ModAssign);
            CASE3('+', TK_Add,    TK_AddAssign, TK_Increment);
            CASE3('&', TK_BitAnd, TK_AndAssign, TK_And);
            CASE3('|', TK_BitOr,  TK_OrAssign,  TK_Or);
            case '.':{
                if(lexc(s) == '.' && lexci(s,1) == '.'){
                    lex_advance(s);
                    lex_advance(s);
                    t.kind = TK_ThreeDots;
                } else{
                    t.kind = TK_Dot;
                }
            } break;
            case '<':{
                if(lexc(s) == '<'){
                    lex_advance(s);
                    if(lexc(s) == '='){
                        lex_advance(s);
                        t.kind = TK_LeftShiftAssign;
                    } else{
                        t.kind = TK_LeftShift;
                    }
                }
                else if(lexc(s) == '='){
                    lex_advance(s);
                    t.kind = TK_LesserThenOrEqual;
                }
                else{
                    t.kind = TK_LesserThen;
                }
            } break;
            case '>':{
                if(lexc(s) == '>'){
                    lex_advance(s);
                    if(lexc(s) == '='){
                        lex_advance(s);
                        t.kind = TK_RightShiftAssign;
                    } else{
                        t.kind = TK_RightShift;
                    }
                }
                else if(lexc(s) == '='){
                    lex_advance(s);
                    t.kind = TK_GreaterThenOrEqual;
                }
                else{
                    t.kind = TK_GreaterThen;
                }
            } break;
            case ':':{
                if(lexc(s) == ':'){
                    lex_advance(s);
                    t.kind = TK_DoubleColon;
                }
                else if(lexc(s) == '='){
                    lex_advance(s);
                    t.kind = TK_ColonAssign;
                }
                else{
                    t.kind = TK_Colon;
                }
            } break;
            case '-':{
                if(lexc(s) == '='){
                    lex_advance(s);
                    t.kind = TK_SubAssign;
                }
                else if(lexc(s) == '-'){
                    lex_advance(s);
                    t.kind = TK_Decrement;
                }
                else if(lexc(s) == '>'){
                    lex_advance(s);
                    t.kind = TK_Arrow;
                }
                else{
                    t.kind = TK_Sub;
                }
            } break;
            case '\'':{not_implemented;} break;
            case '"':{
                t.kind = TK_StringLit;
                lex_parse_string(s, &t, '"');
                if(t.kind != TK_Error){
                    t.str += 1; // drop the opening quote
                    t.len -= 2; // and the closing one
                    // Intern only on success: error_val and intern_val share
                    // a union, so interning an errored token would clobber
                    // the error message.
                    t.intern_val = intern_string(table, t.string);
                }
            } break;
            case '/':{
                if(lexc(s) == '='){
                    t.kind = TK_DivAssign;
                    lex_advance(s);
                } else{
                    t.kind = TK_Div;
                }
            } break;
            case '0':case '1':case '2':case '3':case '4':
            case '5':case '6':case '7':case '8':case '9':{
                t.kind = TK_Integer;
                while(lex_is_numeric(lexc(s))) lex_advance(s);
                lex_set_len(s, &t);
                lex_parse_u64(&t);
            } break;
            case 'A':case 'a':case 'M':case 'm':case 'B':
            case 'b':case 'N':case 'n':case 'C':case 'c':case 'O':
            case 'o':case 'D':case 'd':case 'P':case 'p':case 'E':
            case 'e':case 'Q':case 'q':case 'F':case 'f':case 'R':
            case 'r':case 'G':case 'g':case 'S':case 's':case 'H':
            case 'h':case 'T':case 't':case 'I':case 'i':case 'U':
            case 'u':case 'J':case 'j':case 'V':case 'v':case 'K':
            case 'k':case 'W':case 'w':case 'L':case 'X':case 'l':
            case 'x':case 'Z':case 'z':case 'Y':case 'y':case '_':{
                t.kind = TK_Identifier;
                while(lex_is_alphanumeric(lexc(s)) || lexc(s) == '_') lex_advance(s);
                lex_set_len(s, &t);
                t.intern_val = intern_string(table, t.string);
                if(lex_is_keyword(table, t.intern_val)){
                    t.kind = TK_Keyword;
                }
            } break;
            default:{
                token_error(&t, "Unknown token"_s);
            }
        }
        if(t.len == 0) lex_set_len(s, &t);
        array->add(t);
    }
#undef CASE2
#undef CASE3
}
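// Illustrative sketch of the scope tokens the indentation pass emits,
// assuming "if" was registered via lex_set_keywords and the source indents
// by 4 spaces:
//
//     if x:      ->  SAME_SCOPE, TK_Keyword, TK_Identifier, TK_Colon
//         f()    ->  OPEN_SCOPE, TK_Identifier, TK_OpenParen, TK_CloseParen
//     g()        ->  CLOSE_SCOPE, SAME_SCOPE, TK_Identifier, TK_OpenParen, TK_CloseParen
//
// The dedent before g() emits two tokens, matching the @note above: one to
// break out of the block, a second to continue the surrounding scope.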
function void lex_init(Allocator *token_string_arena, Allocator *map_allocator, Lexer *l){
    l->tokens  = array_make(token_string_arena, 1024*2);
    l->interns = intern_table_make(token_string_arena, map_allocator, 1024);
}

function Lexer lex_make(Allocator *token_string_arena, Allocator *map_allocator){
    Lexer result = {};
    lex_init(token_string_arena, map_allocator, &result);
    return result;
}

function void lex_restream(Lexer *lexer, String istream, String file){
    lexer->stream = {};
    lexer->stream.stream = istream;
    lexer->stream.line_begin = istream.str;
    lexer->stream.file = file;
    lexer->tokens.clear();
    lexer->token_iter = 0;

    Scratch scratch;
    lexer->stream.indent_stack.allocator = scratch;
    lexer->stream.indent_stack.add(&token_null);
    lex__stream(&lexer->interns, &lexer->tokens, &lexer->stream);
}

function Lexer lex_stream(Allocator *token_string_arena, Allocator *map_allocator, String istream, String file){
    Lexer result = lex_make(token_string_arena, map_allocator);
    lex_restream(&result, istream, file);
    return result;
}
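// A minimal usage sketch, assuming the host program supplies the two
// allocators (lex_usage_sketch itself is hypothetical, not part of the API):
function void lex_usage_sketch(Allocator *token_string_arena, Allocator *map_allocator){
    // "x := 1 + 2" should tokenize as SAME_SCOPE, TK_Identifier,
    // TK_ColonAssign, TK_Integer, TK_Add, TK_Integer, plus the scope
    // tokens emitted when the stream ends.
    Lexer lexer = lex_stream(token_string_arena, map_allocator, "x := 1 + 2"_s, "sketch"_s);
    For(lexer.tokens){
        assert(it.kind != TK_Error);
    }
}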
//-----------------------------------------------------------------------------
// Token metadata
//-----------------------------------------------------------------------------
function String token_kind_string(Token_Kind kind){
    switch(kind){
        case TK_End:                 return "End of stream"_s;
        case TK_Mul:                 return "*"_s;
        case TK_Div:                 return "/"_s;
        case TK_Add:                 return "+"_s;
        case TK_Sub:                 return "-"_s;
        case TK_Mod:                 return "%"_s;
        case TK_BitAnd:              return "&"_s;
        case TK_BitOr:               return "|"_s;
        case TK_BitXor:              return "^"_s;
        case TK_Neg:                 return "~"_s;
        case TK_Not:                 return "!"_s;
        case TK_OpenParen:           return "("_s;
        case TK_CloseParen:          return ")"_s;
        case TK_OpenBrace:           return "{"_s;
        case TK_CloseBrace:          return "}"_s;
        case TK_OpenBracket:         return "["_s;
        case TK_CloseBracket:        return "]"_s;
        case TK_ColonAssign:         return ":="_s;
        case TK_Comma:               return ","_s;
        case TK_Pound:               return "#"_s;
        case TK_Question:            return "?"_s;
        case TK_ThreeDots:           return "..."_s;
        case TK_Semicolon:           return ";"_s;
        case TK_Dot:                 return "."_s;
        case TK_LesserThen:          return "<"_s;
        case TK_GreaterThen:         return ">"_s;
        case TK_Colon:               return ":"_s;
        case TK_Assign:              return "="_s;
        case TK_DivAssign:           return "/="_s;
        case TK_MulAssign:           return "*="_s;
        case TK_ModAssign:           return "%="_s;
        case TK_SubAssign:           return "-="_s;
        case TK_AddAssign:           return "+="_s;
        case TK_AndAssign:           return "&="_s;
        case TK_OrAssign:            return "|="_s;
        case TK_XorAssign:           return "^="_s;
        case TK_LeftShiftAssign:     return "<<="_s;
        case TK_RightShiftAssign:    return ">>="_s;
        case TK_DoubleColon:         return "::"_s;
        case TK_At:                  return "@"_s;
        case TK_Decrement:           return "--"_s;
        case TK_Increment:           return "++"_s;
        case TK_PostDecrement:       return "--"_s;
        case TK_PostIncrement:       return "++"_s;
        case TK_LesserThenOrEqual:   return "<="_s;
        case TK_GreaterThenOrEqual:  return ">="_s;
        case TK_Equals:              return "=="_s;
        case TK_And:                 return "&&"_s;
        case TK_Or:                  return "||"_s;
        case TK_NotEquals:           return "!="_s;
        case TK_LeftShift:           return "<<"_s;
        case TK_RightShift:          return ">>"_s;
        case TK_Arrow:               return "->"_s;
        case TK_NewLine:             return "New_Line"_s;
        case TK_ExprSizeof:          return "sizeof"_s;
        case TK_DocComment:          return "Doc_Comment"_s;
        case TK_Comment:             return "Comment"_s;
        case TK_Identifier:          return "Identifier"_s;
        case TK_StringLit:           return "String_Lit"_s;
        case TK_Character:           return "Character"_s;
        case TK_Error:               return "Error"_s;
        case TK_Float:               return "Float"_s;
        case TK_Integer:             return "Int"_s;
        case TK_Keyword:             return "Keyword"_s;
        case CLOSE_SCOPE:            return "Close_Scope"_s;
        case OPEN_SCOPE:             return "Open_Scope"_s;
        case SAME_SCOPE:             return "Same_Scope"_s;
        default: invalid_codepath;   return ""_s;
    }
}