Files
corelang/new_lex.c
2022-05-06 10:13:16 +02:00

517 lines
13 KiB
C

// Interned strings for the language keywords. All of them are assigned by
// init_default_keywords.
global Intern_String keyword_if;
global Intern_String keyword_for;
global Intern_String keyword_else;
global Intern_String keyword_sizeof;
global Intern_String keyword_typeof;
global Intern_String keyword_while;
global Intern_String keyword_switch;
global Intern_String keyword_case;
global Intern_String keyword_struct;
global Intern_String keyword_enum;
global Intern_String keyword_union;
// Character-data pointers of the first (keyword_if) and last (keyword_union)
// keyword interned by init_default_keywords. lex_is_keyword treats any
// interned string whose pointer lies in [first_keyword, last_keyword] as a
// keyword.
global U8 *first_keyword;
global U8 *last_keyword;
// Interns every keyword into table `t` and records the pointer range that
// lex_is_keyword tests against.
//
// The order of the calls matters: first_keyword is taken from the first
// keyword interned and last_keyword from the last one, and lex_is_keyword
// is a plain pointer-range comparison between the two.
// NOTE(review): this presumably relies on Intern_Table placing consecutively
// interned strings at ascending addresses, with no other strings interned in
// between - confirm against the intern table implementation.
function void
init_default_keywords(Intern_Table *t){
keyword_if = intern_string(t, lit("if"));
// Lower bound of the keyword pointer range.
first_keyword = keyword_if.s.str;
keyword_for = intern_string(t, lit("for"));
keyword_else = intern_string(t, lit("else"));
keyword_sizeof = intern_string(t, lit("sizeof"));
keyword_typeof = intern_string(t, lit("typeof"));
keyword_while = intern_string(t, lit("while"));
keyword_switch = intern_string(t, lit("switch"));
keyword_case = intern_string(t, lit("case"));
keyword_struct = intern_string(t, lit("struct"));
keyword_enum = intern_string(t, lit("enum"));
keyword_union = intern_string(t, lit("union"));
// Upper bound of the keyword pointer range; new keywords must be interned
// before this line.
last_keyword = keyword_union.s.str;
}
// Returns 1 when the character data of `str` lies inside the pointer range
// [first_keyword, last_keyword], 0 otherwise. The comparison is on addresses,
// not on string contents.
function B32
lex_is_keyword(Intern_String str){
  if(str.s.str < first_keyword){
    return 0;
  }
  if(str.s.str > last_keyword){
    return 0;
  }
  return 1;
}
// Kinds of tokens. The operator spellings noted below are the ones
// lex__stream maps to each kind.
typedef enum Token_Kind{
// Zero value; a zero-initialized Token has this kind, and lex__stream leaves
// it in place for a token whose first byte is 0.
TK_End,
// Single-character operators: * / + - % & | ^ ~ !
TK_Mul,
TK_Div,
TK_Add,
TK_Sub,
TK_Mod,
TK_BitAnd,
TK_BitOr,
TK_BitXor,
TK_Neg,
TK_Not,
// Punctuation: ( ) { } [ ] , # ? ... ; . < > :
TK_OpenParen,
TK_CloseParen,
TK_OpenBrace,
TK_CloseBrace,
TK_OpenBracket,
TK_CloseBracket,
TK_Comma,
TK_Pound,
TK_Question,
TK_ThreeDots,
TK_Semicolon,
TK_Dot,
TK_LesserThen,
TK_GreaterThen,
TK_Colon,
// Assignment forms: = /= *= %= -= += &= |= ^= <<= >>=
TK_Assign,
TK_DivAssign,
TK_MulAssign,
TK_ModAssign,
TK_SubAssign,
TK_AddAssign,
TK_AndAssign,
TK_OrAssign,
TK_XorAssign,
TK_LeftShiftAssign,
TK_RightShiftAssign,
// :: and @
TK_DoubleColon,
TK_At,
// -- and ++
TK_Decrement,
TK_Increment,
// Not assigned by the lexing code visible in this file.
TK_PostDecrement,
TK_PostIncrement,
// Comparison, logical, shift and arrow operators: <= >= == && || != << >> ->
TK_LesserThenOrEqual,
TK_GreaterThenOrEqual,
TK_Equals,
TK_And,
TK_Or,
TK_NotEquals,
TK_LeftShift,
TK_RightShift,
TK_Arrow,
// Not assigned by the lexing code visible in this file.
TK_ExprSizeof,
TK_DocComment,
// Assigned while scanning // and /* */ comments, but lex__stream skips those
// tokens instead of pushing them.
TK_Comment,
TK_Identifier,
TK_StringLit,
// Character literals are not lexed yet (the '\'' case is not_implemented).
TK_Character,
// Set by token_error; the message is stored in Token.error_val.
TK_Error,
// Not assigned by the lexing code visible in this file.
TK_Float,
// Decimal integer literal; the value is stored in Token.int_val.
TK_Int,
// Identifier whose interned string is one of the default keywords.
TK_Keyword,
}Token_Kind;
// A single lexed token.
typedef struct Token{
Token_Kind kind;
// Source text of the token, accessible either as a String or as separate
// str/len fields. For string literals lex__stream shrinks this range so it
// excludes the surrounding quotes.
union{
String string;
struct{U8 *str; S64 len;};
};
// Payload; which member is meaningful depends on `kind`. All members share
// storage, so writing one overwrites the others.
union {
// Parsed value of a TK_Int token (set by lex_parse_u64).
U64 int_val;
// Not written by the lexing code visible in this file.
F64 float_val;
// Error message of a TK_Error token (set by token_error).
String error_val;
// Interned text; lex__stream sets it for identifiers, keywords and string
// literals.
Intern_String intern_val;
};
// Copied from the Lex_Stream at the position where the token starts.
String file;
S32 line;
U8 *line_begin;
}Token;
#include "token_array.c"
// Cursor over the text being lexed.
typedef struct Lex_Stream{
// The input text.
String stream;
// Index into `stream` of the current byte; lex_advance never moves it past
// stream.len.
S64 iter;
// Pointer to the byte right after the most recently consumed '\n'
// (lex_stream initializes it to the start of the input).
U8 *line_begin;
// Copied into every token's `file` field.
String file;
// Number of '\n' bytes consumed so far (lex_stream starts it at 0).
S32 line;
}Lex_Stream;
// Returns the byte under the cursor without consuming it, or 0 once the
// cursor has reached the end of the stream.
//
// The lexer uses a 0 byte as its end-of-input marker. Answering 0 at
// iter >= stream.len (instead of reading str[iter]) keeps that contract for
// input that is not NUL-terminated: lex_advance refuses to move past
// stream.len, so an unchecked read of a non-zero byte there would both read
// out of bounds and keep the lexing loop spinning forever.
function U8
lexc(Lex_Stream *s){
  if(s->iter >= s->stream.len){
    return 0;
  }
  return s->stream.str[s->iter];
}
// Returns the byte `i` positions away from the cursor without moving the
// cursor, or 0 when that position lies outside the stream.
//
// The bounds check means lookahead near the end of input reports the lexer's
// end marker (0) rather than reading past the buffer when the input is not
// NUL-terminated.
function U8
lexci(Lex_Stream *s, S32 i){
  if(s->iter + i < 0 || s->iter + i >= s->stream.len){
    return 0;
  }
  return s->stream.str[s->iter + i];
}
// Returns a pointer to the byte under the cursor.
function U8 *
lexcp(Lex_Stream *s){
  return &s->stream.str[s->iter];
}
// Returns non-zero when `c` is a byte the lexer skips between tokens:
// space, tab, line feed or carriage return.
//
// Tab is included because a tab is otherwise not handled by any case of the
// token switch and would be reported as an "Unknown token" error.
function B32
lex_is_whitespace(U8 c){
  B32 result = c == ' ' || c == '\t' || c == '\n' || c == '\r';
  return result;
}
// Returns 1 for ASCII letters ('a'..'z' and 'A'..'Z'), 0 for anything else.
function B32
lex_is_alphabetic(U8 c){
  if(c >= 'A' && c <= 'Z'){
    return 1;
  }
  if(c >= 'a' && c <= 'z'){
    return 1;
  }
  return 0;
}
// Returns 1 for ASCII decimal digits ('0'..'9'), 0 for anything else.
function B32
lex_is_numeric(U8 c){
  if(c < '0'){
    return 0;
  }
  if(c > '9'){
    return 0;
  }
  return 1;
}
// Returns 1 when `c` is an ASCII digit or an ASCII letter, 0 otherwise.
function B32
lex_is_alphanumeric(U8 c){
  if(lex_is_numeric(c)){
    return 1;
  }
  if(lex_is_alphabetic(c)){
    return 1;
  }
  return 0;
}
// Sets token->len to the distance from the token's first byte to the
// stream cursor, i.e. the token ends right before the current byte.
// The cursor must not be behind token->str.
function void
lex_set_len(Lex_Stream *s, Token *token){
  assert(token->str <= lexcp(s));
  token->len = lexcp(s) - token->str;
}
// Turns `t` into an error token carrying the message `error_val`.
// error_val shares storage with the other value members of Token, so any
// previously stored int/float/intern value is overwritten.
function void
token_error(Token *t, String error_val){
  t->error_val = error_val;
  t->kind = TK_Error;
}
// Parses the token text (t->str, t->len) as an unsigned decimal number and
// stores it in t->int_val. The text is expected to consist of ASCII digits
// only; the caller guarantees this.
//
// If the value does not fit in a U64 the token is turned into a TK_Error
// token with the message "Integer overflow" and int_val is not written.
//
// Digits are accumulated most-significant first, and the range is checked
// before each step: result*10 + digit fits exactly when
// result <= (max - digit) / 10. Checking before the multiply catches literals
// of any length, where a check on the addition alone would miss a wrapped
// multiplication.
function void
lex_parse_u64(Token *t){
  U64 max = ~(U64)0;
  U64 result = 0;
  for(S64 i = 0; i < t->len; ++i){
    U64 digit = t->str[i] - '0';
    if(result > (max - digit) / 10){
      token_error(t, lit("Integer overflow"));
      return;
    }
    result = result * 10 + digit;
  }
  t->int_val = result;
}
// Consumes one byte of the stream. Does nothing once the cursor has reached
// stream.len. Consuming a '\n' also bumps the line counter and makes
// line_begin point at the byte following the newline.
function void
lex_advance(Lex_Stream *s){
  if(s->iter >= s->stream.len){
    return;
  }

  int consumed_newline = (lexc(s) == '\n');
  s->iter++;
  if(consumed_newline){
    s->line++;
    s->line_begin = lexcp(s);
  }
}
// Builds a TK_Int token holding `val`; every other field is zero.
function Token
token_int(U64 val){
  Token result = {0};
  result.kind = TK_Int;
  result.int_val = val;
  return result;
}
// Scans the rest of a quoted literal. The cursor is expected to be just past
// the opening quote; `c` is the closing quote byte to look for.
//
// A backslash makes the scanner skip the byte that follows it, so an escaped
// quote does not end the literal. On success the closing quote is consumed
// and t's length is set to cover everything from t->str up to and including
// that quote. If a 0 byte is reached first, `t` becomes a TK_Error token and
// its length is left untouched.
function void
lex_parse_string(Lex_Stream *s, Token *t, U8 c){
  for(;;){
    U8 current = lexc(s);
    if(current == '\\'){
      // Step over the backslash; the advance at the bottom of the loop then
      // steps over the escaped byte.
      lex_advance(s);
    }
    else if(current == c){
      break;
    }
    else if(current == 0){
      token_error(t, lit("Unterminated string, reached end of file"));
      break;
    }
    lex_advance(s);
  }

  if(t->kind == TK_Error){
    return;
  }
  lex_advance(s);
  lex_set_len(s,t);
}
// CASE2(op, OpName, Assign): switch case for an operator that has two forms,
// `op` alone (kind OpName) and `op` followed by '=' (kind Assign).
// Expands inside a switch where `s` is the Lex_Stream pointer, already
// positioned after `op`, and `t` is the Token being built. The expansion
// ends in `break` without a semicolon, so the use site supplies the ';'.
#define CASE2(op, OpName, Assign) \
case op: \
if (lexc(s) == '=') { \
lex_advance(s); \
t.kind = Assign; \
} else { \
t.kind = OpName; \
} \
break
// CASE3(op, OpName, Assign, Incr): like CASE2 but with a third form, `op`
// repeated twice (kind Incr), e.g. '+' / '+=' / '++'. The '=' form is tested
// first. `op` is expanded twice, so it should be a plain character constant.
// Expands inside a switch where `s` is the Lex_Stream pointer, already
// positioned after `op`, and `t` is the Token being built. The expansion
// ends in `break` without a semicolon, so the use site supplies the ';'.
#define CASE3(op, OpName, Assign, Incr) \
case op: \
if (lexc(s) == '=') { \
lex_advance(s); \
t.kind = Assign; \
} else if (lexc(s) == op) { \
lex_advance(s); \
t.kind = Incr; \
} else { \
t.kind = OpName; \
} \
break
// Lexes everything remaining in `s` and pushes the resulting tokens onto
// `array`. Identifiers and string literals are interned into array->interns.
//
// Per token: leading whitespace is skipped, the token start and source
// location (file, line, line_begin) are recorded, the first byte is consumed
// and the switch below decides the kind, consuming further bytes as needed.
// Cases that do not set the length themselves get it from the final cursor
// position.
//
// - Line and block comments are skipped and produce no token.
// - Unterminated string literals and block comments, integer literals that
//   overflow, and unrecognized bytes produce TK_Error tokens whose message
//   is in error_val.
// - If a 0 byte is reached after skipping whitespace, a TK_End token is
//   pushed before the loop ends.
// - Character literals ('\'') are not implemented.
//
// The CASE2/CASE3 helper macros are #undef'd in the middle of the switch,
// after their last use.
function void
lex__stream(Token_Array *array, Lex_Stream *s){
  while(lexc(s)){
    while(lex_is_whitespace(lexc(s)))
      lex_advance(s);

    Token t = {0};
    t.str = lexcp(s);
    t.file = s->file;
    t.line = s->line;
    t.line_begin = s->line_begin;

    // Read the first byte with lexc before advancing, so the dispatch goes
    // through the same accessor as every other read of the stream.
    int first_byte = lexc(s);
    lex_advance(s);
    switch(first_byte){
      case 0: break;
      case '@': t.kind = TK_At; break;
      case '(': t.kind = TK_OpenParen; break;
      case ')': t.kind = TK_CloseParen; break;
      case '{': t.kind = TK_OpenBrace; break;
      case '}': t.kind = TK_CloseBrace; break;
      case '[': t.kind = TK_OpenBracket; break;
      case ']': t.kind = TK_CloseBracket; break;
      case ',': t.kind = TK_Comma; break;
      case '~': t.kind = TK_Neg; break;
      case '?': t.kind = TK_Question; break;
      case ';': t.kind = TK_Semicolon; break;
      case '#': t.kind = TK_Pound; break;
      CASE2('!', TK_Not, TK_NotEquals);
      CASE2('^', TK_BitXor, TK_XorAssign);
      CASE2('=', TK_Assign, TK_Equals);
      CASE2('*', TK_Mul, TK_MulAssign);
      CASE2('%', TK_Mod, TK_ModAssign);
      CASE3('+', TK_Add, TK_AddAssign, TK_Increment);
      CASE3('&', TK_BitAnd, TK_AndAssign, TK_And);
      CASE3('|', TK_BitOr, TK_OrAssign, TK_Or);
#undef CASE2
#undef CASE3
      case '.': {
        // Only exactly three dots form TK_ThreeDots; ".." lexes as two dots.
        if(lexc(s) == '.' && lexci(s,1) == '.') {
          lex_advance(s); lex_advance(s);
          t.kind = TK_ThreeDots;
        }
        else {
          t.kind = TK_Dot;
        }
      } break;
      case '<': {
        if (lexc(s) == '<') {
          lex_advance(s);
          if (lexc(s) == '=') {
            lex_advance(s);
            t.kind = TK_LeftShiftAssign;
          }
          else {
            t.kind = TK_LeftShift;
          }
        }
        else if (lexc(s) == '=') {
          lex_advance(s);
          t.kind = TK_LesserThenOrEqual;
        }
        else {
          t.kind = TK_LesserThen;
        }
      } break;
      case '>': {
        if (lexc(s) == '>') {
          lex_advance(s);
          if (lexc(s) == '=') {
            lex_advance(s);
            t.kind = TK_RightShiftAssign;
          }
          else {
            t.kind = TK_RightShift;
          }
        }
        else if (lexc(s) == '=') {
          lex_advance(s);
          t.kind = TK_GreaterThenOrEqual;
        }
        else {
          t.kind = TK_GreaterThen;
        }
      } break;
      case ':': {
        if (lexc(s) == ':') {
          lex_advance(s);
          t.kind = TK_DoubleColon;
        }
        else {
          t.kind = TK_Colon;
        }
      } break;
      case '-':{
        if (lexc(s) == '=') {
          lex_advance(s);
          t.kind = TK_SubAssign;
        }
        else if (lexc(s) == '-') {
          lex_advance(s);
          t.kind = TK_Decrement;
        }
        else if (lexc(s) == '>') {
          lex_advance(s);
          t.kind = TK_Arrow;
        }
        else {
          t.kind = TK_Sub;
        }
      } break;
      case '\'':{not_implemented;} break;
      case '"': {
        t.kind = TK_StringLit;
        lex_parse_string(s,&t,'"');
        if(t.kind != TK_Error){
          // Drop the surrounding quotes from the token text.
          t.str += 1;
          t.len -= 2;
          // Intern only on success: intern_val shares storage with
          // error_val, so writing it for an error token would destroy the
          // error message.
          t.intern_val = intern_string(&array->interns, t.string);
        }
      } break;
      case '/': {
        if(lexc(s) == '='){
          t.kind = TK_DivAssign;
          lex_advance(s);
        }
        else if(lexc(s) == '/'){
          // Line comment: runs up to (not including) the newline or the end
          // of input. The token is dropped, not pushed.
          lex_advance(s);
          t.kind = TK_Comment;
          for(;;){
            if(lexc(s) == '\n' || lexc(s) == 0) break;
            lex_advance(s);
          }
          continue;
        }
        else if(lexc(s) == '*'){
          // Block comment: dropped when terminated; an unterminated one
          // falls through to the push below as an error token.
          lex_advance(s);
          t.kind = TK_Comment;
          for(;;){
            if(lexc(s) == '*' && lexci(s,1) == '/'){
              lex_advance(s);
              lex_advance(s);
              break;
            }
            else if(lexc(s) == 0){
              token_error(&t, lit("Unterminated block comment"));
              goto skip_continue;
            }
            lex_advance(s);
          }
          continue;
          skip_continue:;
        }
        else {
          t.kind = TK_Div;
        }
      } break;
      case '0':case '1':case '2':case '3':case '4':
      case '5':case '6':case '7':case '8':case '9':{
        t.kind = TK_Int;
        while(lex_is_numeric(lexc(s)))
          lex_advance(s);
        // The length must be known before parsing; lex_parse_u64 may turn
        // the token into an error token on overflow.
        lex_set_len(s, &t);
        lex_parse_u64(&t);
      } break;
      case 'A':case 'a':case 'M':case 'm':case 'B':
      case 'b':case 'N':case 'n':case 'C':case 'c':case 'O':
      case 'o':case 'D':case 'd':case 'P':case 'p':case 'E':
      case 'e':case 'Q':case 'q':case 'F':case 'f':case 'R':
      case 'r':case 'G':case 'g':case 'S':case 's':case 'H':
      case 'h':case 'T':case 't':case 'I':case 'i':case 'U':
      case 'u':case 'J':case 'j':case 'V':case 'v':case 'K':
      case 'k':case 'W':case 'w':case 'L':case 'X':case 'l':
      case 'x':case 'Z':case 'z':case 'Y':case 'y':case '_': {
        t.kind = TK_Identifier;
        while(lex_is_alphanumeric(lexc(s)) || lexc(s) == '_')
          lex_advance(s);
        lex_set_len(s,&t);
        t.intern_val = intern_string(&array->interns, t.string);
        if(lex_is_keyword(t.intern_val)){
          t.kind = TK_Keyword;
        }
      } break;
      default: {
        token_error(&t, lit("Unknown token"));
      }
    }

    // Cases that did not set a length get everything consumed so far.
    if(t.len==0)
      lex_set_len(s,&t);
    token_array_push(array, &t);
  }
}
// Lexes `stream` and returns the resulting token array, created with
// token_array_make(arena). `file` is stored in every token. The default
// keywords are interned into the new array's intern table before lexing, and
// line counting starts at 0.
function Token_Array
lex_stream(Arena *arena, String stream, String file){
  Token_Array array = token_array_make(arena);
  init_default_keywords(&array.interns);

  Lex_Stream s = {
    .stream = stream,
    .iter = 0,
    .line_begin = stream.str,
    .file = file,
    .line = 0,
  };
  lex__stream(&array, &s);
  return array;
}
// Self-test for the lexer: lexes a fixed input and checks every token's
// kind and text, plus the parsed value of each integer token.
//
// The index increments are kept outside the assert arguments so the walk
// stays correct even if assert is compiled out. The indices are
// bounds-checked against the expectation tables, and the final counts are
// verified so that missing or extra tokens are detected.
function void
lex_test(){
  Arena *scratch = arena_begin_scratch();
  String test = lit("18446744073709551616{})(@?&+-;....->,:::/**/\"Thing\"//R\n Thingy"
                    "\"Test_Meme\"+=-===42524 4294967295 18446744073709551615"
                    "for if while switch");
  Token_Array array = lex_stream(scratch, test, lit("Test1"));

  // Expected kinds, in order. The first literal is 2^64 and must overflow.
  Token_Kind kind[] = {
    TK_Error,TK_OpenBrace,TK_CloseBrace,TK_CloseParen,TK_OpenParen,
    TK_At,TK_Question,TK_BitAnd,TK_Add,TK_Sub,TK_Semicolon,
    TK_ThreeDots, TK_Dot, TK_Arrow, TK_Comma, TK_DoubleColon, TK_Colon,
    TK_StringLit, TK_Identifier, TK_StringLit, TK_AddAssign, TK_SubAssign,
    TK_Equals, TK_Int, TK_Int, TK_Int, TK_Keyword, TK_Keyword,
    TK_Keyword, TK_Keyword
  };
  // Expected token text, parallel to `kind`.
  String strs[] = {
    lit("18446744073709551616"),lit("{"),lit("}"),lit(")"),lit("("),
    lit("@"),lit("?"),lit("&"),lit("+"),lit("-"),lit(";"),
    lit("..."),lit("."),lit("->"),lit(","),lit("::"),lit(":"),
    lit("Thing"),lit("Thingy"),lit("Test_Meme"), lit("+="),lit("-="),
    lit("=="),lit("42524"),lit("4294967295"),lit("18446744073709551615"),
    lit("for"), lit("if"), lit("while"), lit("switch"),
  };
  // Expected values of the TK_Int tokens, in order of appearance.
  U64 vals[] = {
    42524, 4294967295, 18446744073709551615llu
  };

  int token_count = (int)(sizeof(kind)/sizeof(kind[0]));
  int val_count = (int)(sizeof(vals)/sizeof(vals[0]));
  assert(sizeof(strs)/sizeof(strs[0]) == sizeof(kind)/sizeof(kind[0]));

  int i = 0;
  int ui = 0;
  for(Token *t = token_array_iter_begin(&array); t; t = token_array_iter_next(&array)){
    assert(i < token_count);
    assert(t->kind == kind[i]);
    assert(string_compare(t->string, strs[i]));
    if(t->kind == TK_Int){
      assert(ui < val_count);
      assert(t->int_val == vals[ui]);
      ui++;
    }
    i++;
  }
  // Every expected token and integer value must have been seen.
  assert(i == token_count);
  assert(ui == val_count);
  arena_end_scratch();
}