643 lines
16 KiB
C
643 lines
16 KiB
C
global Intern_String keyword_if;
|
|
global Intern_String keyword_for;
|
|
global Intern_String keyword_cast;
|
|
global Intern_String keyword_else;
|
|
global Intern_String keyword_defer;
|
|
global Intern_String keyword_do;
|
|
global Intern_String keyword_size_type;
|
|
global Intern_String keyword_size_expr;
|
|
global Intern_String keyword_const;
|
|
global Intern_String keyword_typedef;
|
|
global Intern_String keyword_return;
|
|
global Intern_String keyword_typeof;
|
|
global Intern_String keyword_while;
|
|
global Intern_String keyword_switch;
|
|
global Intern_String keyword_case;
|
|
global Intern_String keyword_struct;
|
|
global Intern_String keyword_enum;
|
|
global Intern_String keyword_union;
|
|
global U8 *first_keyword;
|
|
global U8 *last_keyword;
|
|
|
|
global Intern_String intern_char;
|
|
global Intern_String intern_void;
|
|
global Intern_String intern_int;
|
|
|
|
function void
|
|
init_default_keywords(Intern_Table *t){
|
|
keyword_if = intern_string(t, lit("if"));
|
|
first_keyword = keyword_if.s.str;
|
|
|
|
keyword_cast = intern_string(t, lit("cast"));
|
|
keyword_for = intern_string(t, lit("for"));
|
|
keyword_else = intern_string(t, lit("else"));
|
|
keyword_defer = intern_string(t, lit("defer"));
|
|
keyword_do = intern_string(t, lit("do"));
|
|
keyword_size_type = intern_string(t, lit("size_type"));
|
|
keyword_size_expr = intern_string(t, lit("size_expr"));
|
|
keyword_typeof = intern_string(t, lit("typeof"));
|
|
keyword_const = intern_string(t, lit("const"));
|
|
keyword_while = intern_string(t, lit("while"));
|
|
keyword_return = intern_string(t, lit("return"));
|
|
keyword_switch = intern_string(t, lit("switch"));
|
|
keyword_typedef = intern_string(t, lit("typedef"));
|
|
keyword_case = intern_string(t, lit("case"));
|
|
keyword_struct = intern_string(t, lit("struct"));
|
|
keyword_enum = intern_string(t, lit("enum"));
|
|
|
|
keyword_union = intern_string(t, lit("union"));
|
|
last_keyword = keyword_union.s.str;
|
|
|
|
intern_char = intern_string(t, lit("char"));
|
|
intern_void = intern_string(t, lit("void"));
|
|
intern_int = intern_string(t, lit("int"));
|
|
}
|
|
|
|
function B32
|
|
lex_is_keyword(Intern_String str){
|
|
B32 result = str.s.str >= first_keyword && str.s.str <= last_keyword;
|
|
return result;
|
|
}
|
|
|
|
// Kinds of tokens produced by the lexer.
//
// Enumerator order matters: the TK_First*/TK_Last* aliases delimit contiguous
// groups, and TK_End must stay first so that a zero-initialized Token has
// kind TK_End.
typedef enum Token_Kind{
  TK_End,

  // Multiplicative operators
  TK_Mul,
  TK_Div,
  TK_Mod,
  TK_LeftShift,
  TK_RightShift,
  TK_FirstMul = TK_Mul,
  TK_LastMul = TK_RightShift,

  // Additive operators
  TK_Add,
  TK_Sub,
  TK_FirstAdd = TK_Add,
  TK_LastAdd = TK_Sub,

  // Comparison operators
  TK_Equals,
  TK_LesserThenOrEqual,
  TK_GreaterThenOrEqual,
  TK_LesserThen,
  TK_GreaterThen,
  TK_NotEquals,
  TK_FirstCompare = TK_Equals,
  TK_LastCompare = TK_NotEquals,

  // Bitwise / logical operators ('^' is lexed as TK_Pointer)
  TK_BitAnd,
  TK_BitOr,
  TK_Pointer,
  TK_And,
  TK_Or,
  TK_FirstLogical = TK_BitAnd,
  TK_LastLogical = TK_Or,

  // Unary operators and punctuation
  TK_Neg,
  TK_Not,
  TK_OpenParen,
  TK_CloseParen,
  TK_OpenBrace,
  TK_CloseBrace,
  TK_OpenBracket,
  TK_CloseBracket,
  TK_Comma,
  TK_Pound,
  TK_Question,
  TK_ThreeDots,
  TK_Semicolon,
  TK_Dot,

  TK_Colon,

  // Assignment operators
  TK_Assign,
  TK_ColonAssign,
  TK_DivAssign,
  TK_MulAssign,
  TK_ModAssign,
  TK_SubAssign,
  TK_AddAssign,
  TK_AndAssign,
  TK_OrAssign,
  TK_XorAssign,
  TK_LeftShiftAssign,
  TK_RightShiftAssign,
  TK_FirstAssign = TK_Assign,
  TK_LastAssign = TK_RightShiftAssign,

  TK_DoubleColon,
  TK_At,
  TK_Decrement,
  TK_Increment,
  TK_PostDecrement,
  TK_PostIncrement,

  // Everything else: arrow, sizeof, comments, names, literals, errors
  TK_Arrow,
  TK_ExprSizeof,
  TK_DocComment,
  TK_Comment,
  TK_Identifier,
  TK_StringLit,
  TK_Character,
  TK_Error,
  TK_Float,
  TK_Integer,
  TK_Keyword,
}Token_Kind;
|
|
|
|
typedef struct Token{
|
|
Token_Kind kind;
|
|
union{
|
|
String string;
|
|
struct{U8 *str; S64 len;};
|
|
};
|
|
|
|
union {
|
|
U64 int_val;
|
|
F64 float_val;
|
|
String error_val;
|
|
Intern_String intern_val;
|
|
};
|
|
|
|
String file;
|
|
S32 line;
|
|
U8 *line_begin;
|
|
}Token;
|
|
#include "token_array.c"
|
|
|
|
typedef struct Lex_Stream{
|
|
String stream;
|
|
S64 iter;
|
|
|
|
U8 *line_begin;
|
|
String file;
|
|
S32 line;
|
|
}Lex_Stream;
|
|
|
|
|
|
function U8
|
|
lexc(Lex_Stream *s){
|
|
return s->stream.str[s->iter];
|
|
}
|
|
|
|
function U8
|
|
lexci(Lex_Stream *s, S32 i){
|
|
return s->stream.str[s->iter+i];
|
|
}
|
|
|
|
// Returns a pointer to the character at the stream's current position.
function U8 *
lexcp(Lex_Stream *s){
  return &s->stream.str[s->iter];
}
|
|
|
|
function B32
|
|
lex_is_whitespace(U8 c){
|
|
B32 result = c == '\n' || c == '\r' || c == ' ' || c == '\r';
|
|
return result;
|
|
}
|
|
|
|
function B32
|
|
lex_is_alphabetic(U8 c){
|
|
B32 result = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
|
|
return result;
|
|
}
|
|
|
|
function B32
|
|
lex_is_numeric(U8 c){
|
|
B32 result = c >= '0' && c <= '9';
|
|
return result;
|
|
}
|
|
|
|
function B32
|
|
lex_is_alphanumeric(U8 c){
|
|
B32 result = lex_is_numeric(c) || lex_is_alphabetic(c);
|
|
return result;
|
|
}
|
|
|
|
function void
|
|
lex_set_len(Lex_Stream *s, Token *token){
|
|
assert(lexcp(s) >= token->str);
|
|
token->len = lexcp(s) - token->str;
|
|
}
|
|
|
|
function void
|
|
token_error(Token *t, String error_val){
|
|
t->kind = TK_Error;
|
|
t->error_val = error_val;
|
|
}
|
|
|
|
function void
|
|
lex_parse_u64(Token *t){
|
|
U64 result = 0;
|
|
U64 m = 1;
|
|
for(S64 i = t->len - 1; i >= 0; --i){
|
|
U64 val = t->str[i] - '0';
|
|
U64 new_val = val * m;
|
|
if((result + new_val) < result){
|
|
token_error(t, lit("Integer overflow"));
|
|
return;
|
|
}
|
|
result+=new_val;
|
|
m *= 10;
|
|
}
|
|
t->int_val = result;
|
|
}
|
|
|
|
function void
|
|
lex_advance(Lex_Stream *s){
|
|
if(s->iter >= s->stream.len){
|
|
return;
|
|
}
|
|
else if(lexc(s) == '\n'){
|
|
s->iter++;
|
|
s->line++;
|
|
s->line_begin = lexcp(s);
|
|
}
|
|
else{
|
|
s->iter++;
|
|
}
|
|
}
|
|
|
|
function void
|
|
lex_parse_string(Lex_Stream *s, Token *t, U8 c){
|
|
for(;;){
|
|
if(lexc(s) == '\\') lex_advance(s);
|
|
else if(lexc(s) == c) break;
|
|
else if(lexc(s) == 0){
|
|
token_error(t, lit("Unterminated string, reached end of file"));
|
|
break;
|
|
}
|
|
lex_advance(s);
|
|
}
|
|
if(t->kind != TK_Error){
|
|
lex_advance(s);
|
|
lex_set_len(s,t);
|
|
}
|
|
}
|
|
|
|
#define CASE2(op, OpName, Assign) \
|
|
case op: \
|
|
if (lexc(s) == '=') { \
|
|
lex_advance(s); \
|
|
t.kind = Assign; \
|
|
} else { \
|
|
t.kind = OpName; \
|
|
} \
|
|
break
|
|
#define CASE3(op, OpName, Assign, Incr) \
|
|
case op: \
|
|
if (lexc(s) == '=') { \
|
|
lex_advance(s); \
|
|
t.kind = Assign; \
|
|
} else if (lexc(s) == op) { \
|
|
lex_advance(s); \
|
|
t.kind = Incr; \
|
|
} else { \
|
|
t.kind = OpName; \
|
|
} \
|
|
break
|
|
|
|
function void
|
|
lex__stream(Token_Array *array, Lex_Stream *s){
|
|
while(lexc(s)){
|
|
while(lex_is_whitespace(lexc(s)))
|
|
lex_advance(s);
|
|
|
|
Token t = {0};
|
|
t.str = lexcp(s);
|
|
t.file = s->file;
|
|
t.line = s->line;
|
|
t.line_begin = s->line_begin;
|
|
lex_advance(s);
|
|
|
|
switch(*t.str){
|
|
case 0: break;
|
|
case '@': t.kind = TK_At; break;
|
|
case '(': t.kind = TK_OpenParen; break;
|
|
case ')': t.kind = TK_CloseParen; break;
|
|
case '{': t.kind = TK_OpenBrace; break;
|
|
case '}': t.kind = TK_CloseBrace; break;
|
|
case '[': t.kind = TK_OpenBracket; break;
|
|
case ']': t.kind = TK_CloseBracket; break;
|
|
case ',': t.kind = TK_Comma; break;
|
|
case '~': t.kind = TK_Neg; break;
|
|
case '?': t.kind = TK_Question; break;
|
|
case ';': t.kind = TK_Semicolon; break;
|
|
case '#': t.kind = TK_Pound; break;
|
|
CASE2('!', TK_Not, TK_NotEquals);
|
|
CASE2('^', TK_Pointer, TK_XorAssign);
|
|
CASE2('=', TK_Assign, TK_Equals);
|
|
CASE2('*', TK_Mul, TK_MulAssign);
|
|
CASE2('%', TK_Mod, TK_ModAssign);
|
|
CASE3('+', TK_Add, TK_AddAssign, TK_Increment);
|
|
CASE3('&', TK_BitAnd, TK_AndAssign, TK_And);
|
|
CASE3('|', TK_BitOr, TK_OrAssign, TK_Or);
|
|
#undef CASE2
|
|
#undef CASE3
|
|
case '.': {
|
|
if(lexc(s) == '.' && lexci(s,1) == '.') {
|
|
lex_advance(s); lex_advance(s);
|
|
t.kind = TK_ThreeDots;
|
|
}
|
|
else {
|
|
t.kind = TK_Dot;
|
|
}
|
|
} break;
|
|
|
|
|
|
case '<': {
|
|
if (lexc(s) == '<') {
|
|
lex_advance(s);
|
|
if (lexc(s) == '=') {
|
|
lex_advance(s);
|
|
t.kind = TK_LeftShiftAssign;
|
|
}
|
|
else {
|
|
t.kind = TK_LeftShift;
|
|
}
|
|
}
|
|
else if (lexc(s) == '=') {
|
|
lex_advance(s);
|
|
t.kind = TK_LesserThenOrEqual;
|
|
}
|
|
else {
|
|
t.kind = TK_LesserThen;
|
|
}
|
|
} break;
|
|
|
|
case '>': {
|
|
if (lexc(s) == '>') {
|
|
lex_advance(s);
|
|
if (lexc(s) == '=') {
|
|
lex_advance(s);
|
|
t.kind = TK_RightShiftAssign;
|
|
}
|
|
else {
|
|
t.kind = TK_RightShift;
|
|
}
|
|
}
|
|
else if (lexc(s) == '=') {
|
|
lex_advance(s);
|
|
t.kind = TK_GreaterThenOrEqual;
|
|
}
|
|
else {
|
|
t.kind = TK_GreaterThen;
|
|
}
|
|
} break;
|
|
|
|
case ':': {
|
|
if (lexc(s) == ':') {
|
|
lex_advance(s);
|
|
t.kind = TK_DoubleColon;
|
|
}
|
|
else if(lexc(s) == '='){
|
|
lex_advance(s);
|
|
t.kind = TK_ColonAssign;
|
|
}
|
|
else {
|
|
t.kind = TK_Colon;
|
|
}
|
|
} break;
|
|
|
|
case '-':{
|
|
if (lexc(s) == '=') {
|
|
lex_advance(s);
|
|
t.kind = TK_SubAssign;
|
|
}
|
|
else if (lexc(s) == '-') {
|
|
lex_advance(s);
|
|
t.kind = TK_Decrement;
|
|
}
|
|
else if (lexc(s) == '>') {
|
|
lex_advance(s);
|
|
t.kind = TK_Arrow;
|
|
}
|
|
else {
|
|
t.kind = TK_Sub;
|
|
}
|
|
} break;
|
|
|
|
|
|
case '\'':{not_implemented;} break;
|
|
case '"': {
|
|
t.kind = TK_StringLit;
|
|
lex_parse_string(s,&t,'"');
|
|
if(t.kind != TK_Error){
|
|
t.str += 1;
|
|
t.len -= 2;
|
|
}
|
|
t.intern_val = intern_string(&array->interns, t.string);
|
|
} break;
|
|
|
|
case '/': {
|
|
if(lexc(s) == '='){
|
|
t.kind = TK_DivAssign;
|
|
lex_advance(s);
|
|
}
|
|
else if(lexc(s) == '/'){
|
|
lex_advance(s);
|
|
t.kind = TK_Comment;
|
|
for(;;){
|
|
if(lexc(s) == '\n' || lexc(s) == 0) break;
|
|
lex_advance(s);
|
|
}
|
|
continue;
|
|
}
|
|
else if(lexc(s) == '*'){
|
|
lex_advance(s);
|
|
t.kind = TK_Comment;
|
|
for(;;){
|
|
if(lexc(s) == '*' && lexci(s,1) == '/'){
|
|
lex_advance(s);
|
|
lex_advance(s);
|
|
break;
|
|
}
|
|
else if(lexc(s) == 0){
|
|
token_error(&t, lit("Unterminated block comment"));
|
|
goto skip_continue;
|
|
}
|
|
lex_advance(s);
|
|
}
|
|
continue;
|
|
skip_continue:;
|
|
}
|
|
else {
|
|
t.kind = TK_Div;
|
|
}
|
|
} break;
|
|
|
|
case '0':case '1':case '2':case '3':case '4':
|
|
case '5':case '6':case '7':case '8':case '9':{
|
|
t.kind = TK_Integer;
|
|
while(lex_is_numeric(lexc(s)))
|
|
lex_advance(s);
|
|
lex_set_len(s, &t);
|
|
lex_parse_u64(&t);
|
|
} break;
|
|
|
|
case 'A':case 'a':case 'M':case 'm':case 'B':
|
|
case 'b':case 'N':case 'n':case 'C':case 'c':case 'O':
|
|
case 'o':case 'D':case 'd':case 'P':case 'p':case 'E':
|
|
case 'e':case 'Q':case 'q':case 'F':case 'f':case 'R':
|
|
case 'r':case 'G':case 'g':case 'S':case 's':case 'H':
|
|
case 'h':case 'T':case 't':case 'I':case 'i':case 'U':
|
|
case 'u':case 'J':case 'j':case 'V':case 'v':case 'K':
|
|
case 'k':case 'W':case 'w':case 'L':case 'X':case 'l':
|
|
case 'x':case 'Z':case 'z':case 'Y':case 'y':case '_': {
|
|
t.kind = TK_Identifier;
|
|
while(lex_is_alphanumeric(lexc(s)) || lexc(s) == '_')
|
|
lex_advance(s);
|
|
lex_set_len(s,&t);
|
|
t.intern_val = intern_string(&array->interns, t.string);
|
|
if(lex_is_keyword(t.intern_val)){
|
|
t.kind = TK_Keyword;
|
|
}
|
|
} break;
|
|
|
|
default: {
|
|
token_error(&t, lit("Unknown token"));
|
|
}
|
|
}
|
|
|
|
if(t.len==0)
|
|
lex_set_len(s,&t);
|
|
|
|
token_array_push(array, &t);
|
|
}
|
|
}
|
|
|
|
function void
|
|
lex_add_stream(Token_Array *array, String stream, String file){
|
|
Lex_Stream s = {stream, 0, stream.str, file, 0};
|
|
lex__stream(array, &s);
|
|
}
|
|
|
|
function Token_Array
|
|
lex_make_token_array(Arena *arena){
|
|
Token_Array array = token_array_make(arena);
|
|
init_default_keywords(&array.interns);
|
|
return array;
|
|
}
|
|
|
|
function Token_Array
|
|
lex_stream(Arena *arena, String stream, String file){
|
|
Token_Array array = lex_make_token_array(arena);
|
|
lex_add_stream(&array, stream, file);
|
|
return array;
|
|
}
|
|
|
|
function void
|
|
lex_restream(Token_Array *array, String stream, String file){
|
|
token_array_reset(array);
|
|
lex_add_stream(array, stream, file);
|
|
}
|
|
|
|
function void
|
|
lex_test(){
|
|
Arena *scratch = arena_begin_scratch();
|
|
String test = lit("18446744073709551616{})(@?&+-;....->,:::/**/\"Thing\"//R\n Thingy"
|
|
"\"Test_Meme\"+=-===42524 4294967295 18446744073709551615"
|
|
"for if while switch :=");
|
|
Token_Array array = lex_stream(scratch, test, lit("Test1"));
|
|
|
|
Token_Kind kind[] = {
|
|
TK_Error,TK_OpenBrace,TK_CloseBrace,TK_CloseParen,TK_OpenParen,
|
|
TK_At,TK_Question,TK_BitAnd,TK_Add,TK_Sub,TK_Semicolon,
|
|
TK_ThreeDots, TK_Dot, TK_Arrow, TK_Comma, TK_DoubleColon, TK_Colon,
|
|
TK_StringLit, TK_Identifier, TK_StringLit, TK_AddAssign, TK_SubAssign,
|
|
TK_Equals, TK_Integer, TK_Integer, TK_Integer, TK_Keyword, TK_Keyword,
|
|
TK_Keyword, TK_Keyword, TK_ColonAssign, TK_End
|
|
};
|
|
String strs[] = {
|
|
lit("18446744073709551616"),lit("{"),lit("}"),lit(")"),lit("("),
|
|
lit("@"),lit("?"),lit("&"),lit("+"),lit("-"),lit(";"),
|
|
lit("..."),lit("."),lit("->"),lit(","),lit("::"),lit(":"),
|
|
lit("Thing"),lit("Thingy"),lit("Test_Meme"), lit("+="),lit("-="),
|
|
lit("=="),lit("42524"),lit("4294967295"),lit("18446744073709551615"),
|
|
lit("for"), lit("if"), lit("while"), lit("switch"), lit(":="), lit(""),
|
|
};
|
|
U64 vals[] = {
|
|
42524, 4294967295, 18446744073709551615llu
|
|
};
|
|
|
|
int i = 0;
|
|
int ui = 0;
|
|
for(Token *t = token_array_iter_begin(&array); t->kind != TK_End; t = token_array_iter_next(&array)){
|
|
assert(t->kind == kind[i]);
|
|
assert(string_compare(t->string, strs[i++]));
|
|
if(t->kind == TK_Integer){
|
|
assert(t->int_val == vals[ui++]);
|
|
}
|
|
}
|
|
arena_end_scratch();
|
|
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Token metadata
|
|
//-----------------------------------------------------------------------------
|
|
global const char *token_kind_string[] = {
|
|
[TK_End] = "End of stream",
|
|
[TK_Mul] = "*",
|
|
[TK_Div] = "/",
|
|
[TK_Add] = "+",
|
|
[TK_Sub] = "-",
|
|
[TK_Mod] = "%",
|
|
[TK_BitAnd] = "&",
|
|
[TK_BitOr] = "|",
|
|
[TK_Pointer] = "^",
|
|
[TK_Neg] = "~",
|
|
[TK_Not] = "!",
|
|
[TK_OpenParen] = "(",
|
|
[TK_CloseParen] = " ",
|
|
[TK_OpenBrace] = "{",
|
|
[TK_CloseBrace] = "}",
|
|
[TK_OpenBracket] = "[",
|
|
[TK_CloseBracket] = "]",
|
|
[TK_Comma] = ",",
|
|
[TK_Pound] = "#",
|
|
[TK_Question] = "?",
|
|
[TK_ThreeDots] = "...",
|
|
[TK_Semicolon] = ";",
|
|
[TK_Dot] = ".",
|
|
[TK_LesserThen] = "<",
|
|
[TK_GreaterThen] = ">",
|
|
[TK_Colon] = ":",
|
|
[TK_Assign] = "=",
|
|
[TK_ColonAssign] = ":=",
|
|
[TK_DivAssign] = "/=",
|
|
[TK_MulAssign] = "*=",
|
|
[TK_ModAssign] = "%=",
|
|
[TK_SubAssign] = "-=",
|
|
[TK_AddAssign] = "+=",
|
|
[TK_AndAssign] = "&=",
|
|
[TK_OrAssign] = "|=",
|
|
[TK_XorAssign] = "^=",
|
|
[TK_LeftShiftAssign] = "<<=",
|
|
[TK_RightShiftAssign] = ">>=",
|
|
[TK_DoubleColon] = "::",
|
|
[TK_At] = "@",
|
|
[TK_Decrement] = "--",
|
|
[TK_Increment] = "++",
|
|
[TK_PostDecrement] = "--",
|
|
[TK_PostIncrement] = "++",
|
|
[TK_LesserThenOrEqual] = "<=",
|
|
[TK_GreaterThenOrEqual] = ">=",
|
|
[TK_Equals] = "==",
|
|
[TK_And] = "&&",
|
|
[TK_Or] = "||",
|
|
[TK_NotEquals] = "!=",
|
|
[TK_LeftShift] = "<<",
|
|
[TK_RightShift] = ">>",
|
|
[TK_Arrow] = "->",
|
|
[TK_ExprSizeof] = "sizeof",
|
|
[TK_DocComment] = "DocComment",
|
|
[TK_Comment] = "Comment",
|
|
[TK_Identifier] = "Identifier",
|
|
[TK_StringLit] = "StringLit",
|
|
[TK_Character] = "Character",
|
|
[TK_Error] = "Error",
|
|
[TK_Float] = "Float",
|
|
[TK_Integer] = "Int",
|
|
[TK_Keyword] = "Keyword",
|
|
};
|