diff --git a/lex.c b/lex.c deleted file mode 100644 index 9c8b873..0000000 --- a/lex.c +++ /dev/null @@ -1,642 +0,0 @@ -global Intern_String keyword_if; -global Intern_String keyword_for; -global Intern_String keyword_cast; -global Intern_String keyword_else; -global Intern_String keyword_defer; -global Intern_String keyword_do; -global Intern_String keyword_size_type; -global Intern_String keyword_size_expr; -global Intern_String keyword_const; -global Intern_String keyword_typedef; -global Intern_String keyword_return; -global Intern_String keyword_typeof; -global Intern_String keyword_while; -global Intern_String keyword_switch; -global Intern_String keyword_case; -global Intern_String keyword_struct; -global Intern_String keyword_enum; -global Intern_String keyword_union; -global U8 *first_keyword; -global U8 *last_keyword; - -global Intern_String intern_char; -global Intern_String intern_void; -global Intern_String intern_int; - -function void -init_default_keywords(Intern_Table *t){ - keyword_if = intern_string(t, lit("if")); - first_keyword = keyword_if.s.str; - - keyword_cast = intern_string(t, lit("cast")); - keyword_for = intern_string(t, lit("for")); - keyword_else = intern_string(t, lit("else")); - keyword_defer = intern_string(t, lit("defer")); - keyword_do = intern_string(t, lit("do")); - keyword_size_type = intern_string(t, lit("size_type")); - keyword_size_expr = intern_string(t, lit("size_expr")); - keyword_typeof = intern_string(t, lit("typeof")); - keyword_const = intern_string(t, lit("const")); - keyword_while = intern_string(t, lit("while")); - keyword_return = intern_string(t, lit("return")); - keyword_switch = intern_string(t, lit("switch")); - keyword_typedef = intern_string(t, lit("typedef")); - keyword_case = intern_string(t, lit("case")); - keyword_struct = intern_string(t, lit("struct")); - keyword_enum = intern_string(t, lit("enum")); - - keyword_union = intern_string(t, lit("union")); - last_keyword = keyword_union.s.str; - - intern_char = intern_string(t, lit("char")); - intern_void = intern_string(t, lit("void")); - intern_int = intern_string(t, lit("int")); -} - -function B32 -lex_is_keyword(Intern_String str){ - B32 result = str.s.str >= first_keyword && str.s.str <= last_keyword; - return result; -} - -typedef enum Token_Kind{ - TK_End, - - TK_Mul, - TK_Div, - TK_Mod, - TK_LeftShift, - TK_RightShift, - TK_FirstMul = TK_Mul, - TK_LastMul = TK_RightShift, - - TK_Add, - TK_Sub, - TK_FirstAdd = TK_Add, - TK_LastAdd = TK_Sub, - - TK_Equals, - TK_LesserThenOrEqual, - TK_GreaterThenOrEqual, - TK_LesserThen, - TK_GreaterThen, - TK_NotEquals, - TK_FirstCompare = TK_Equals, - TK_LastCompare = TK_NotEquals, - - TK_BitAnd, - TK_BitOr, - TK_Pointer, - TK_And, - TK_Or, - TK_FirstLogical = TK_BitAnd, - TK_LastLogical = TK_Or, - - TK_Neg, - TK_Not, - TK_OpenParen, - TK_CloseParen, - TK_OpenBrace, - TK_CloseBrace, - TK_OpenBracket, - TK_CloseBracket, - TK_Comma, - TK_Pound, - TK_Question, - TK_ThreeDots, - TK_Semicolon, - TK_Dot, - - TK_Colon, - - TK_Assign, - TK_ColonAssign, - TK_DivAssign, - TK_MulAssign, - TK_ModAssign, - TK_SubAssign, - TK_AddAssign, - TK_AndAssign, - TK_OrAssign, - TK_XorAssign, - TK_LeftShiftAssign, - TK_RightShiftAssign, - TK_FirstAssign = TK_Assign, - TK_LastAssign = TK_RightShiftAssign, - - TK_DoubleColon, - TK_At, - TK_Decrement, - TK_Increment, - TK_PostDecrement, - TK_PostIncrement, - - TK_Arrow, - TK_ExprSizeof, - TK_DocComment, - TK_Comment, - TK_Identifier, - TK_StringLit, - TK_Character, - TK_Error, - TK_Float, - TK_Integer, - TK_Keyword, -}Token_Kind; - -typedef struct Token{ - Token_Kind kind; - union{ - String string; - struct{U8 *str; S64 len;}; - }; - - union { - U64 int_val; - F64 float_val; - String error_val; - Intern_String intern_val; - }; - - String file; - S32 line; - U8 *line_begin; -}Token; -#include "token_array.c" - -typedef struct Lex_Stream{ - String stream; - S64 iter; - - U8 *line_begin; - String file; - S32 line; -}Lex_Stream; - - -function U8 -lexc(Lex_Stream *s){ - return s->stream.str[s->iter]; -} - -function U8 -lexci(Lex_Stream *s, S32 i){ - return s->stream.str[s->iter+i]; -} - -function U8 * -lexcp(Lex_Stream *s){ - return s->stream.str + s->iter; -} - -function B32 -lex_is_whitespace(U8 c){ - B32 result = c == '\n' || c == '\r' || c == ' ' || c == '\r'; - return result; -} - -function B32 -lex_is_alphabetic(U8 c){ - B32 result = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); - return result; -} - -function B32 -lex_is_numeric(U8 c){ - B32 result = c >= '0' && c <= '9'; - return result; -} - -function B32 -lex_is_alphanumeric(U8 c){ - B32 result = lex_is_numeric(c) || lex_is_alphabetic(c); - return result; -} - -function void -lex_set_len(Lex_Stream *s, Token *token){ - assert(lexcp(s) >= token->str); - token->len = lexcp(s) - token->str; -} - -function void -token_error(Token *t, String error_val){ - t->kind = TK_Error; - t->error_val = error_val; -} - -function void -lex_parse_u64(Token *t){ - U64 result = 0; - U64 m = 1; - for(S64 i = t->len - 1; i >= 0; --i){ - U64 val = t->str[i] - '0'; - U64 new_val = val * m; - if((result + new_val) < result){ - token_error(t, lit("Integer overflow")); - return; - } - result+=new_val; - m *= 10; - } - t->int_val = result; -} - -function void -lex_advance(Lex_Stream *s){ - if(s->iter >= s->stream.len){ - return; - } - else if(lexc(s) == '\n'){ - s->iter++; - s->line++; - s->line_begin = lexcp(s); - } - else{ - s->iter++; - } -} - -function void -lex_parse_string(Lex_Stream *s, Token *t, U8 c){ - for(;;){ - if(lexc(s) == '\\') lex_advance(s); - else if(lexc(s) == c) break; - else if(lexc(s) == 0){ - token_error(t, lit("Unterminated string, reached end of file")); - break; - } - lex_advance(s); - } - if(t->kind != TK_Error){ - lex_advance(s); - lex_set_len(s,t); - } -} - -#define CASE2(op, OpName, Assign) \ -case op: \ -if (lexc(s) == '=') { \ -lex_advance(s); \ -t.kind = Assign; \ -} else { \ -t.kind = OpName; \ -} \ -break -#define CASE3(op, OpName, Assign, Incr) \ -case op: \ -if (lexc(s) == '=') { \ -lex_advance(s); \ -t.kind = Assign; \ -} else if (lexc(s) == op) { \ -lex_advance(s); \ -t.kind = Incr; \ -} else { \ -t.kind = OpName; \ -} \ -break - -function void -lex__stream(Token_Array *array, Lex_Stream *s){ - while(lexc(s)){ - while(lex_is_whitespace(lexc(s))) - lex_advance(s); - - Token t = {0}; - t.str = lexcp(s); - t.file = s->file; - t.line = s->line; - t.line_begin = s->line_begin; - lex_advance(s); - - switch(*t.str){ - case 0: break; - case '@': t.kind = TK_At; break; - case '(': t.kind = TK_OpenParen; break; - case ')': t.kind = TK_CloseParen; break; - case '{': t.kind = TK_OpenBrace; break; - case '}': t.kind = TK_CloseBrace; break; - case '[': t.kind = TK_OpenBracket; break; - case ']': t.kind = TK_CloseBracket; break; - case ',': t.kind = TK_Comma; break; - case '~': t.kind = TK_Neg; break; - case '?': t.kind = TK_Question; break; - case ';': t.kind = TK_Semicolon; break; - case '#': t.kind = TK_Pound; break; - CASE2('!', TK_Not, TK_NotEquals); - CASE2('^', TK_Pointer, TK_XorAssign); - CASE2('=', TK_Assign, TK_Equals); - CASE2('*', TK_Mul, TK_MulAssign); - CASE2('%', TK_Mod, TK_ModAssign); - CASE3('+', TK_Add, TK_AddAssign, TK_Increment); - CASE3('&', TK_BitAnd, TK_AndAssign, TK_And); - CASE3('|', TK_BitOr, TK_OrAssign, TK_Or); -#undef CASE2 -#undef CASE3 - case '.': { - if(lexc(s) == '.' && lexci(s,1) == '.') { - lex_advance(s); lex_advance(s); - t.kind = TK_ThreeDots; - } - else { - t.kind = TK_Dot; - } - } break; - - - case '<': { - if (lexc(s) == '<') { - lex_advance(s); - if (lexc(s) == '=') { - lex_advance(s); - t.kind = TK_LeftShiftAssign; - } - else { - t.kind = TK_LeftShift; - } - } - else if (lexc(s) == '=') { - lex_advance(s); - t.kind = TK_LesserThenOrEqual; - } - else { - t.kind = TK_LesserThen; - } - } break; - - case '>': { - if (lexc(s) == '>') { - lex_advance(s); - if (lexc(s) == '=') { - lex_advance(s); - t.kind = TK_RightShiftAssign; - } - else { - t.kind = TK_RightShift; - } - } - else if (lexc(s) == '=') { - lex_advance(s); - t.kind = TK_GreaterThenOrEqual; - } - else { - t.kind = TK_GreaterThen; - } - } break; - - case ':': { - if (lexc(s) == ':') { - lex_advance(s); - t.kind = TK_DoubleColon; - } - else if(lexc(s) == '='){ - lex_advance(s); - t.kind = TK_ColonAssign; - } - else { - t.kind = TK_Colon; - } - } break; - - case '-':{ - if (lexc(s) == '=') { - lex_advance(s); - t.kind = TK_SubAssign; - } - else if (lexc(s) == '-') { - lex_advance(s); - t.kind = TK_Decrement; - } - else if (lexc(s) == '>') { - lex_advance(s); - t.kind = TK_Arrow; - } - else { - t.kind = TK_Sub; - } - } break; - - - case '\'':{not_implemented;} break; - case '"': { - t.kind = TK_StringLit; - lex_parse_string(s,&t,'"'); - if(t.kind != TK_Error){ - t.str += 1; - t.len -= 2; - } - t.intern_val = intern_string(&array->interns, t.string); - } break; - - case '/': { - if(lexc(s) == '='){ - t.kind = TK_DivAssign; - lex_advance(s); - } - else if(lexc(s) == '/'){ - lex_advance(s); - t.kind = TK_Comment; - for(;;){ - if(lexc(s) == '\n' || lexc(s) == 0) break; - lex_advance(s); - } - continue; - } - else if(lexc(s) == '*'){ - lex_advance(s); - t.kind = TK_Comment; - for(;;){ - if(lexc(s) == '*' && lexci(s,1) == '/'){ - lex_advance(s); - lex_advance(s); - break; - } - else if(lexc(s) == 0){ - token_error(&t, lit("Unterminated block comment")); - goto skip_continue; - } - lex_advance(s); - } - continue; - skip_continue:; - } - else { - t.kind = TK_Div; - } - } break; - - case '0':case '1':case '2':case '3':case '4': - case '5':case '6':case '7':case '8':case '9':{ - t.kind = TK_Integer; - while(lex_is_numeric(lexc(s))) - lex_advance(s); - lex_set_len(s, &t); - lex_parse_u64(&t); - } break; - - case 'A':case 'a':case 'M':case 'm':case 'B': - case 'b':case 'N':case 'n':case 'C':case 'c':case 'O': - case 'o':case 'D':case 'd':case 'P':case 'p':case 'E': - case 'e':case 'Q':case 'q':case 'F':case 'f':case 'R': - case 'r':case 'G':case 'g':case 'S':case 's':case 'H': - case 'h':case 'T':case 't':case 'I':case 'i':case 'U': - case 'u':case 'J':case 'j':case 'V':case 'v':case 'K': - case 'k':case 'W':case 'w':case 'L':case 'X':case 'l': - case 'x':case 'Z':case 'z':case 'Y':case 'y':case '_': { - t.kind = TK_Identifier; - while(lex_is_alphanumeric(lexc(s)) || lexc(s) == '_') - lex_advance(s); - lex_set_len(s,&t); - t.intern_val = intern_string(&array->interns, t.string); - if(lex_is_keyword(t.intern_val)){ - t.kind = TK_Keyword; - } - } break; - - default: { - token_error(&t, lit("Unknown token")); - } - } - - if(t.len==0) - lex_set_len(s,&t); - - token_array_push(array, &t); - } -} - -function void -lex_add_stream(Token_Array *array, String stream, String file){ - Lex_Stream s = {stream, 0, stream.str, file, 0}; - lex__stream(array, &s); -} - -function Token_Array -lex_make_token_array(Arena *arena){ - Token_Array array = token_array_make(arena); - init_default_keywords(&array.interns); - return array; -} - -function Token_Array -lex_stream(Arena *arena, String stream, String file){ - Token_Array array = lex_make_token_array(arena); - lex_add_stream(&array, stream, file); - return array; -} - -function void -lex_restream(Token_Array *array, String stream, String file){ - token_array_reset(array); - lex_add_stream(array, stream, file); -} - -function void -lex_test(){ - Arena *scratch = arena_begin_scratch(); - String test = lit("18446744073709551616{})(@?&+-;....->,:::/**/\"Thing\"//R\n Thingy" - "\"Test_Meme\"+=-===42524 4294967295 18446744073709551615" - "for if while switch :="); - Token_Array array = lex_stream(scratch, test, lit("Test1")); - - Token_Kind kind[] = { - TK_Error,TK_OpenBrace,TK_CloseBrace,TK_CloseParen,TK_OpenParen, - TK_At,TK_Question,TK_BitAnd,TK_Add,TK_Sub,TK_Semicolon, - TK_ThreeDots, TK_Dot, TK_Arrow, TK_Comma, TK_DoubleColon, TK_Colon, - TK_StringLit, TK_Identifier, TK_StringLit, TK_AddAssign, TK_SubAssign, - TK_Equals, TK_Integer, TK_Integer, TK_Integer, TK_Keyword, TK_Keyword, - TK_Keyword, TK_Keyword, TK_ColonAssign, TK_End - }; - String strs[] = { - lit("18446744073709551616"),lit("{"),lit("}"),lit(")"),lit("("), - lit("@"),lit("?"),lit("&"),lit("+"),lit("-"),lit(";"), - lit("..."),lit("."),lit("->"),lit(","),lit("::"),lit(":"), - lit("Thing"),lit("Thingy"),lit("Test_Meme"), lit("+="),lit("-="), - lit("=="),lit("42524"),lit("4294967295"),lit("18446744073709551615"), - lit("for"), lit("if"), lit("while"), lit("switch"), lit(":="), lit(""), - }; - U64 vals[] = { - 42524, 4294967295, 18446744073709551615llu - }; - - int i = 0; - int ui = 0; - for(Token *t = token_array_iter_begin(&array); t->kind != TK_End; t = token_array_iter_next(&array)){ - assert(t->kind == kind[i]); - assert(string_compare(t->string, strs[i++])); - if(t->kind == TK_Integer){ - assert(t->int_val == vals[ui++]); - } - } - arena_end_scratch(); - -} - -//----------------------------------------------------------------------------- -// Token metadata -//----------------------------------------------------------------------------- -global const char *token_kind_string[] = { - [TK_End] = "End of stream", - [TK_Mul] = "*", - [TK_Div] = "/", - [TK_Add] = "+", - [TK_Sub] = "-", - [TK_Mod] = "%", - [TK_BitAnd] = "&", - [TK_BitOr] = "|", - [TK_Pointer] = "^", - [TK_Neg] = "~", - [TK_Not] = "!", - [TK_OpenParen] = "(", - [TK_CloseParen] = " ", - [TK_OpenBrace] = "{", - [TK_CloseBrace] = "}", - [TK_OpenBracket] = "[", - [TK_CloseBracket] = "]", - [TK_Comma] = ",", - [TK_Pound] = "#", - [TK_Question] = "?", - [TK_ThreeDots] = "...", - [TK_Semicolon] = ";", - [TK_Dot] = ".", - [TK_LesserThen] = "<", - [TK_GreaterThen] = ">", - [TK_Colon] = ":", - [TK_Assign] = "=", - [TK_ColonAssign] = ":=", - [TK_DivAssign] = "/=", - [TK_MulAssign] = "*=", - [TK_ModAssign] = "%=", - [TK_SubAssign] = "-=", - [TK_AddAssign] = "+=", - [TK_AndAssign] = "&=", - [TK_OrAssign] = "|=", - [TK_XorAssign] = "^=", - [TK_LeftShiftAssign] = "<<=", - [TK_RightShiftAssign] = ">>=", - [TK_DoubleColon] = "::", - [TK_At] = "@", - [TK_Decrement] = "--", - [TK_Increment] = "++", - [TK_PostDecrement] = "--", - [TK_PostIncrement] = "++", - [TK_LesserThenOrEqual] = "<=", - [TK_GreaterThenOrEqual] = ">=", - [TK_Equals] = "==", - [TK_And] = "&&", - [TK_Or] = "||", - [TK_NotEquals] = "!=", - [TK_LeftShift] = "<<", - [TK_RightShift] = ">>", - [TK_Arrow] = "->", - [TK_ExprSizeof] = "sizeof", - [TK_DocComment] = "DocComment", - [TK_Comment] = "Comment", - [TK_Identifier] = "Identifier", - [TK_StringLit] = "StringLit", - [TK_Character] = "Character", - [TK_Error] = "Error", - [TK_Float] = "Float", - [TK_Integer] = "int", - [TK_Keyword] = "Keyword", -};