Files
corelang/new_lex.cpp
2022-05-25 14:44:30 +02:00

666 lines
17 KiB
C++

// Token kinds produced by the lexer.
// ORDER MATTERS: the TK_First*/TK_Last* aliases define inclusive
// category ranges over the enumerators declared between them (used for
// precedence / classification checks), so new kinds must be inserted
// with those ranges in mind. TK_End is 0, which is also the kind a
// zero-initialized Token carries.
enum Token_Kind{
    TK_End,
    // Multiplicative-precedence binary operators.
    TK_Mul,
    TK_Div,
    TK_Mod,
    TK_LeftShift,
    TK_RightShift,
    TK_FirstMul = TK_Mul,
    TK_LastMul = TK_RightShift,
    // Additive-precedence binary operators.
    TK_Add,
    TK_Sub,
    TK_FirstAdd = TK_Add,
    TK_LastAdd = TK_Sub,
    // Comparison operators.
    TK_Equals,
    TK_LesserThenOrEqual,
    TK_GreaterThenOrEqual,
    TK_LesserThen,
    TK_GreaterThen,
    TK_NotEquals,
    TK_FirstCompare = TK_Equals,
    TK_LastCompare = TK_NotEquals,
    // Bitwise and logical operators.
    TK_BitAnd,
    TK_BitOr,
    TK_And,
    TK_Or,
    TK_FirstLogical = TK_BitAnd,
    TK_LastLogical = TK_Or,
    // Unary operators and punctuation.
    TK_Neg,
    TK_Not,
    TK_OpenParen,
    TK_CloseParen,
    TK_OpenBrace,
    TK_CloseBrace,
    TK_OpenBracket,
    TK_CloseBracket,
    TK_Comma,
    TK_Pound,
    TK_Question,
    TK_ThreeDots,
    TK_Semicolon,
    TK_Dot,
    TK_NewLine,   // carries indentation count of the following line in Token.indent
    TK_Colon,
    // Assignment operators.
    TK_Assign,
    TK_DivAssign,
    TK_MulAssign,
    TK_ModAssign,
    TK_SubAssign,
    TK_AddAssign,
    TK_AndAssign,
    TK_OrAssign,
    TK_XorAssign,
    TK_LeftShiftAssign,
    TK_RightShiftAssign,
    TK_FirstAssign = TK_Assign,
    TK_LastAssign = TK_RightShiftAssign,
    TK_DoubleColon,
    TK_At,
    TK_Decrement,
    TK_Increment,
    TK_PostDecrement,   // not produced by the lexer switch; presumably parser-assigned — TODO confirm
    TK_PostIncrement,   // not produced by the lexer switch; presumably parser-assigned — TODO confirm
    TK_Arrow,
    TK_ExprSizeof,
    TK_DocComment,
    TK_Comment,
    // Value-carrying kinds (see the unions in struct Token).
    TK_Identifier,
    TK_StringLit,
    TK_Character,
    TK_Error,
    TK_Float,     // not produced by the visible number path — TODO confirm where floats are lexed
    TK_Integer,
    TK_Keyword,
    TK_Pointer,
    TK_Dereference,
    // These are not produced by lexer
    // but identified by parser
    OPEN_SCOPE,
    CLOSE_SCOPE,
    SAME_SCOPE,
};
// A single lexed token. `kind` selects which union member (if any) is
// meaningful; the token text is always a slice into the source stream.
struct Token{
    Token_Kind kind;
    union{
        String string;              // token text as (ptr,len) slice of the stream
        struct{U8 *str; S64 len;};  // same slice, exploded — assumes String has {U8*,S64} layout, TODO confirm
    };
    union {
        U64 int_val;                // TK_Integer: parsed value (see lex_parse_u64)
        F64 float_val;              // TK_Float
        String error_val;           // TK_Error: human-readable message
        Intern_String intern_val;   // TK_Identifier / TK_Keyword / TK_StringLit
        S64 indent;                 // TK_NewLine: count of ' '/'\t' after the newline
    };
    // Diagnostics: where this token came from.
    String file;
    S32 line;          // line counter copied from Lex_Stream.line at token start
    U8 *line_begin;    // first byte of the line the token starts on
};
// Cursor state over one input buffer being tokenized.
struct Lex_Stream{
    String stream;            // whole input; assumed NUL-terminated past len — TODO confirm
    S64 iter;                 // byte offset of the cursor into stream
    U8 *line_begin;           // first byte of the current line
    String file;              // file name, copied into every token for diagnostics
    S32 line;                 // current line counter (starts at 0 via zero-init)
    S32 inside_brace_paren;   // nesting depth of ()/{}/[]; newlines are skipped while > 0
    S32 last_valid_indent;    // not referenced in this file — presumably parser state, TODO confirm
};
// Lexer bundle: stream cursor, produced tokens, and the intern table
// shared between identifiers, string literals and keywords.
struct Lexer{
    Lex_Stream stream;
    Array<Token> tokens;     // output of lex__stream
    Intern_Table interns;    // also stores the keyword range (see lex_set_keywords)
    S64 token_iter;          // parser-side read cursor into tokens
};
// Peek at the byte under the cursor without consuming it.
function U8
lexc(Lex_Stream *s){
    U8 *cursor = s->stream.str + s->iter;
    return *cursor;
}
// Peek at the byte `i` positions ahead of the cursor (no bounds check).
function U8
lexci(Lex_Stream *s, S32 i){
    S64 at = s->iter + i;
    return s->stream.str[at];
}
// Address of the byte under the cursor.
function U8 *
lexcp(Lex_Stream *s){
    U8 *result = &s->stream.str[s->iter];
    return result;
}
// Horizontal whitespace only. '\n' is a significant token and '\t'
// counts as indentation, so neither is included here.
function B32
lex_is_whitespace(U8 c){
    switch(c){
        case ' ':
        case '\r': return 1;
        default:   return 0;
    }
}
// ASCII letter test (A-Z, a-z).
function B32
lex_is_alphabetic(U8 c){
    if(c >= 'A' && c <= 'Z') return 1;
    if(c >= 'a' && c <= 'z') return 1;
    return 0;
}
// ASCII decimal digit test (0-9).
function B32
lex_is_numeric(U8 c){
    B32 in_range = ('0' <= c) && (c <= '9');
    return in_range;
}
// ASCII letter-or-digit test.
function B32
lex_is_alphanumeric(U8 c){
    B32 digit = lex_is_numeric(c);
    B32 alpha = lex_is_alphabetic(c);
    return digit || alpha;
}
// Finalize the token's length as the distance from its first byte to
// the current cursor position.
function void
lex_set_len(Lex_Stream *s, Token *token){
    U8 *end = lexcp(s);
    assert(end >= token->str);
    token->len = end - token->str;
}
// Interns every keyword string and records the inclusive
// [first_keyword, last_keyword] intern range so lex_is_keyword() can
// classify an interned identifier with two pointer compares. Relies on
// the intern table handing out monotonically increasing storage for
// consecutive interns — TODO confirm against intern_string.
// Fix: an empty keyword list previously zeroed last_keyword while
// leaving first_keyword stale, corrupting the range; now an empty list
// leaves both endpoints untouched.
function void
lex_set_keywords(Lexer *lexer, Array<String> keywords){
    Intern_String keyword = {};
    B32 seen_any = 0;
    For(keywords){
        keyword = intern_string(&lexer->interns, *it);
        if(!seen_any){
            lexer->interns.first_keyword = keyword.str;
            seen_any = 1;
        }
    }
    if(seen_any)
        lexer->interns.last_keyword = keyword.str;
}
// A string is a keyword iff its interned pointer falls inside the
// [first_keyword, last_keyword] range recorded by lex_set_keywords.
// (Parameter is the intern table despite the `lexer` name.)
function B32
lex_is_keyword(Intern_Table *lexer, Intern_String keyword){
    if(keyword.str < lexer->first_keyword) return 0;
    if(keyword.str > lexer->last_keyword)  return 0;
    return 1;
}
// Demote a token to TK_Error, storing a human-readable message in the
// union's error_val slot (clobbers any value previously stored there).
function void
token_error(Token *t, String error_val){
    t->error_val = error_val;
    t->kind = TK_Error;
}
// Parses the decimal digits of `t` (t->str/t->len, guaranteed by the
// caller to be '0'..'9' only) into t->int_val. On U64 overflow the
// token is turned into a TK_Error instead and int_val is left unset.
// Fix: the previous right-to-left accumulation only checked the final
// addition for wraparound; the `val * m` and `m *= 10` products could
// overflow undetected (e.g. "20000000000000000000" parsed to a
// silently wrong value). Left-to-right accumulation bounds-checks
// every step: result*10 + digit <= U64_MAX  <=>
// result <= (U64_MAX - digit) / 10.
function void
lex_parse_u64(Token *t){
    U64 result = 0;
    for(S64 i = 0; i < t->len; ++i){
        U64 digit = (U64)(t->str[i] - '0');
        if(result > (0xffffffffffffffffllu - digit) / 10){
            token_error(t, "Integer overflow"_s);
            return;
        }
        result = result * 10 + digit;
    }
    t->int_val = result;
}
// Advance the cursor one byte, maintaining line bookkeeping: stepping
// over '\n' bumps the line counter and re-anchors line_begin to the
// first byte after the newline. Does nothing at end of stream.
function void
lex_advance(Lex_Stream *s){
    if(s->iter >= s->stream.len)
        return;
    B32 was_newline = lexc(s) == '\n';
    s->iter += 1;
    if(was_newline){
        s->line += 1;
        s->line_begin = lexcp(s);
    }
}
// Scans forward to the closing quote `c`, honoring backslash escapes
// (a '\\' consumes the following byte unconditionally, so an escaped
// quote does not terminate). On success the cursor ends past the
// closing quote and the token length is set; hitting a NUL first marks
// the token TK_Error and leaves the length unset.
function void
lex_parse_string(Lex_Stream *s, Token *t, U8 c){
    for(;;){
        U8 cur = lexc(s);
        if(cur == '\\'){
            lex_advance(s);          // step onto the escaped byte
        }
        else if(cur == c){
            break;                   // unescaped terminator found
        }
        else if(cur == 0){
            token_error(t, "Unterminated string, reached end of file"_s);
            break;
        }
        lex_advance(s);
    }
    if(t->kind != TK_Error){
        lex_advance(s);              // consume the closing quote
        lex_set_len(s, t);
    }
}
// CASE2: switch arm for a one-char operator `op` that may be followed
// by '=' to form its compound-assignment twin (e.g. '*' vs "*=").
// Reads the byte AFTER `op` because the caller has already advanced
// past `op` itself. #undef'd inside lex__stream after the last use.
#define CASE2(op, OpName, Assign) \
case op: \
if (lexc(s) == '=') { \
lex_advance(s); \
t.kind = Assign; \
} else { \
t.kind = OpName; \
} \
break
// CASE3: like CASE2 but additionally recognizes a doubled `op`
// (e.g. "++", "&&", "||") as a third kind. '=' is checked first, so
// "+=" wins over "++" when both would match.
#define CASE3(op, OpName, Assign, Incr) \
case op: \
if (lexc(s) == '=') { \
lex_advance(s); \
t.kind = Assign; \
} else if (lexc(s) == op) { \
lex_advance(s); \
t.kind = Incr; \
} else { \
t.kind = OpName; \
} \
break
// Core tokenizer: appends one Token per loop iteration to `array`.
// The outer while stops on a NUL byte, so the stream is assumed to be
// NUL-terminated past stream.len — TODO confirm String guarantees a
// terminator.
function void
lex__stream(Intern_Table *table, Array<Token> *array, Lex_Stream *s){
    while(lexc(s)){
        Token t = {};
        t.str = lexcp(s);            // token text starts at the cursor
        t.file = s->file;
        t.line = s->line;
        t.line_begin = s->line_begin;
        lex_advance(s);              // consume the first byte, dispatch on it
        switch(*t.str){
        case 0 : break;              // defensive; t.kind stays TK_End (== 0)
        case '@': t.kind = TK_At; break;
        // Bracket nesting counter: newline tokens are dropped while we
        // are inside any ()/{}/[] pair (see `skip` at the bottom).
        case '(': s->inside_brace_paren++; t.kind = TK_OpenParen; break;
        case ')': s->inside_brace_paren--; t.kind = TK_CloseParen; break;
        case '{': s->inside_brace_paren++; t.kind = TK_OpenBrace; break;
        case '}': s->inside_brace_paren--; t.kind = TK_CloseBrace; break;
        case '[': s->inside_brace_paren++; t.kind = TK_OpenBracket; break;
        case ']': s->inside_brace_paren--; t.kind = TK_CloseBracket; break;
        case ',': t.kind = TK_Comma; break;
        case '~': t.kind = TK_Neg; break;
        case '?': t.kind = TK_Question; break;
        case '#': t.kind = TK_Pound; break;
        case '^': t.kind = TK_Pointer; break;
        // CASE2: `op` or `op=`. CASE3: additionally doubled `op`.
        CASE2('!', TK_Not, TK_NotEquals);
        CASE2('=', TK_Assign, TK_Equals);
        CASE2('*', TK_Mul, TK_MulAssign);
        CASE2('%', TK_Mod, TK_ModAssign);
        CASE3('+', TK_Add, TK_AddAssign, TK_Increment);
        CASE3('&', TK_BitAnd, TK_AndAssign, TK_And);
        CASE3('|', TK_BitOr, TK_OrAssign, TK_Or);
#undef CASE2
#undef CASE3
        case ';': {
            t.kind = TK_Semicolon;
        }break;
        // A bare ' ' can only be the very first byte of the stream,
        // since trailing whitespace is consumed at the bottom of this
        // loop. Rewinding the stream base one byte makes this space
        // re-readable as indentation by the fall-through '\n' handling
        // below. NOTE(review): this permanently shifts stream.str
        // relative to stream.len — confirm the last stream byte is
        // still reachable afterwards.
        case ' ' : s->stream.str -= 1;   // deliberate fall-through into '\n'
        case '\n': {
            t.kind = TK_NewLine;
            if(lexc(s) == '\r')          // tolerate '\n\r' sequences
                lex_advance(s);
            // Count the next line's leading spaces/tabs into t.indent.
            for(;;){
                if(lexc(s) == ' ') {
                    t.indent++;
                    // @Todo(Krzosa): Detect indentation method, file an error while methods are mixed
                }
                else if(lexc(s) == '\t') t.indent++;
                else break;
                lex_advance(s);
            }
        }break;
        case '.': {
            // "..." is one token; ".." lexes as two consecutive dots.
            if(lexc(s) == '.' && lexci(s,1) == '.') {
                lex_advance(s); lex_advance(s);
                t.kind = TK_ThreeDots;
            }
            else {
                t.kind = TK_Dot;
            }
        } break;
        case '<': {
            // Longest match first: "<<=", "<<", "<=", "<".
            if (lexc(s) == '<') {
                lex_advance(s);
                if (lexc(s) == '=') {
                    lex_advance(s);
                    t.kind = TK_LeftShiftAssign;
                }
                else {
                    t.kind = TK_LeftShift;
                }
            }
            else if (lexc(s) == '=') {
                lex_advance(s);
                t.kind = TK_LesserThenOrEqual;
            }
            else {
                t.kind = TK_LesserThen;
            }
        } break;
        case '>': {
            // Longest match first: ">>=", ">>", ">=", ">".
            if (lexc(s) == '>') {
                lex_advance(s);
                if (lexc(s) == '=') {
                    lex_advance(s);
                    t.kind = TK_RightShiftAssign;
                }
                else {
                    t.kind = TK_RightShift;
                }
            }
            else if (lexc(s) == '=') {
                lex_advance(s);
                t.kind = TK_GreaterThenOrEqual;
            }
            else {
                t.kind = TK_GreaterThen;
            }
        } break;
        case ':': {
            // "::" or ":".
            if (lexc(s) == ':') {
                lex_advance(s);
                t.kind = TK_DoubleColon;
            }
            else {
                t.kind = TK_Colon;
            }
        } break;
        case '-':{
            // "-=", "--", "->", or "-".
            if (lexc(s) == '=') {
                lex_advance(s);
                t.kind = TK_SubAssign;
            }
            else if (lexc(s) == '-') {
                lex_advance(s);
                t.kind = TK_Decrement;
            }
            else if (lexc(s) == '>') {
                lex_advance(s);
                t.kind = TK_Arrow;
            }
            else {
                t.kind = TK_Sub;
            }
        } break;
        // Character literals (TK_Character) are not lexed yet.
        case '\'':{not_implemented;} break;
        case '"': {
            t.kind = TK_StringLit;
            lex_parse_string(s,&t,'"');
            if(t.kind != TK_Error){
                t.str += 1;   // strip the opening quote...
                t.len -= 2;   // ...and the closing quote from the text
            }
            // NOTE(review): intern_val and error_val share a union, so
            // on TK_Error this overwrites the error message written by
            // lex_parse_string — confirm whether the error path should
            // skip interning.
            t.intern_val = intern_string(table, t.string);
        } break;
        case '/': {
            // "/=", "//" line comment, "/* */" block comment, or "/".
            if(lexc(s) == '='){
                t.kind = TK_DivAssign;
                lex_advance(s);
            }
            else if(lexc(s) == '/'){
                lex_advance(s);
                t.kind = TK_Comment;
                for(;;){
                    if(lexc(s) == '\n' || lexc(s) == 0) break;
                    lex_advance(s);
                }
                continue;   // comments produce no token
            }
            else if(lexc(s) == '*'){
                lex_advance(s);
                t.kind = TK_Comment;
                for(;;){
                    if(lexc(s) == '*' && lexci(s,1) == '/'){
                        lex_advance(s);
                        lex_advance(s);
                        break;
                    }
                    else if(lexc(s) == 0){
                        token_error(&t, "Unterminated block comment"_s);
                        // Fall through to the emit path below so the
                        // error token is kept, unlike a normal comment.
                        goto skip_continue;
                    }
                    lex_advance(s);
                }
                continue;   // terminated comments produce no token
                skip_continue:;
            }
            else {
                t.kind = TK_Div;
            }
        } break;
        case '0':case '1':case '2':case '3':case '4':
        case '5':case '6':case '7':case '8':case '9':{
            // Decimal integers only; no float, hex or separators here.
            t.kind = TK_Integer;
            while(lex_is_numeric(lexc(s)))
                lex_advance(s);
            lex_set_len(s, &t);
            lex_parse_u64(&t);   // may demote to TK_Error on overflow
        } break;
        case 'A':case 'a':case 'M':case 'm':case 'B':
        case 'b':case 'N':case 'n':case 'C':case 'c':case 'O':
        case 'o':case 'D':case 'd':case 'P':case 'p':case 'E':
        case 'e':case 'Q':case 'q':case 'F':case 'f':case 'R':
        case 'r':case 'G':case 'g':case 'S':case 's':case 'H':
        case 'h':case 'T':case 't':case 'I':case 'i':case 'U':
        case 'u':case 'J':case 'j':case 'V':case 'v':case 'K':
        case 'k':case 'W':case 'w':case 'L':case 'X':case 'l':
        case 'x':case 'Z':case 'z':case 'Y':case 'y':case '_': {
            // Identifier: [A-Za-z_][A-Za-z0-9_]*. Interned, then
            // reclassified as a keyword if its intern falls in the
            // keyword range (see lex_set_keywords).
            t.kind = TK_Identifier;
            while(lex_is_alphanumeric(lexc(s)) || lexc(s) == '_')
                lex_advance(s);
            lex_set_len(s,&t);
            t.intern_val = intern_string(table, t.string);
            if(lex_is_keyword(table, t.intern_val)){
                t.kind = TK_Keyword;
            }
        } break;
        default: {
            token_error(&t, "Unknown token"_s);
        }
        }
        // Cases that didn't set a length (operators, punctuation) get
        // it from the cursor delta now.
        if(t.len==0)
            lex_set_len(s,&t);
        B32 skip = 0;
        if(t.kind == TK_NewLine){
            // Newlines are insignificant inside brackets; runs of
            // newline tokens collapse so only the latest one (whose
            // indent value matters) survives.
            if(s->inside_brace_paren > 0) skip = 1;
            if(array->len > 0 && array->last()->kind == TK_NewLine) array->pop();
        }
        if(!skip){
            array->add(t);
        }
        // Eat horizontal whitespace between tokens.
        while(lex_is_whitespace(lexc(s)))
            lex_advance(s);
        if(s->iter >= s->stream.len) // End of stream
            break;
    }
}
// Allocate the backing storage of a Lexer: token array and intern
// table memory from `token_string_arena`, the intern table's map
// structure from `map_allocator`.
function void
lex_init(Allocator *token_string_arena, Allocator *map_allocator, Lexer *l){
    l->tokens = array_make<Token>(token_string_arena, 2048);
    l->interns = intern_table_make(token_string_arena, map_allocator, 1024);
}
// Convenience constructor: zero-initialize a Lexer and run lex_init.
function Lexer
lex_make(Allocator *token_string_arena, Allocator *map_allocator){
    Lexer lexer = {};
    lex_init(token_string_arena, map_allocator, &lexer);
    return lexer;
}
// Point an existing lexer at a fresh input stream and tokenize it in
// full. Keeps the intern table, so keywords registered via
// lex_set_keywords remain valid across calls; tokens and the read
// cursor are reset.
function void
lex_restream(Lexer *lexer, String istream, String file){
    Lex_Stream fresh = {};
    fresh.stream = istream;
    fresh.line_begin = istream.str;
    fresh.file = file;
    lexer->stream = fresh;
    lexer->token_iter = 0;
    lexer->tokens.clear();
    lex__stream(&lexer->interns, &lexer->tokens, &lexer->stream);
}
// One-shot helper: build a lexer and immediately tokenize `istream`.
function Lexer
lex_stream(Allocator *token_string_arena, Allocator *map_allocator, String istream, String file){
    Lexer lexer = lex_make(token_string_arena, map_allocator);
    lex_restream(&lexer, istream, file);
    return lexer;
}
// Smoke test: lexes a fixed input and checks every produced token's
// kind and text against the expectation tables below, plus the parsed
// values of the TK_Integer tokens. The three parallel tables (kind[],
// strs[], vals[]) are index-matched to the produced token order.
function void
lex_test(){
    Scratch scratch;
    String test = "Keyword //R\n 18446744073709551616{})(@?&+-;....->,:::/**/\"Thing\" Thingy"
    "\"Test_Meme\"+=-===42524 4294967295 18446744073709551615"
    "for if while switch :="_s;
    Array<String> keywords = {scratch};
    keywords.add("Keyword"_s);
    keywords.add("for"_s);
    keywords.add("if"_s);
    keywords.add("while"_s);
    keywords.add("switch"_s);
    Lexer lexer = lex_make(scratch, scratch);
    lex_set_keywords(&lexer, keywords);
    lex_restream(&lexer, test, "Test1"_s);
    Array<Token> arr = lexer.tokens;
    // Expected kinds. Note TK_Error for 2^64 (integer overflow) and
    // that "//R" and "/**/" produce no tokens at all.
    Token_Kind kind[] = {
        TK_Keyword, TK_NewLine, TK_Error,TK_OpenBrace,TK_CloseBrace,TK_CloseParen,TK_OpenParen,
        TK_At,TK_Question,TK_BitAnd,TK_Add,TK_Sub,TK_Semicolon,
        TK_ThreeDots, TK_Dot, TK_Arrow, TK_Comma, TK_DoubleColon, TK_Colon,
        TK_StringLit, TK_Identifier, TK_StringLit, TK_AddAssign, TK_SubAssign,
        TK_Equals, TK_Integer, TK_Integer, TK_Integer,
        TK_Keyword, TK_Keyword, TK_Keyword, TK_Keyword,
        TK_Colon, TK_Assign, TK_End
    };
    // Expected token texts (string literals have their quotes stripped).
    String strs[] = {
        "Keyword"_s, "\n "_s, "18446744073709551616"_s,"{"_s,"}"_s,")"_s,"("_s,
        "@"_s,"?"_s,"&"_s,"+"_s,"-"_s,";"_s,
        "..."_s,"."_s,"->"_s,","_s,"::"_s,":"_s,
        "Thing"_s,"Thingy"_s,"Test_Meme"_s, "+="_s,"-="_s,
        "=="_s,"42524"_s,"4294967295"_s,"18446744073709551615"_s,
        "for"_s, "if"_s, "while"_s, "switch"_s, ":"_s, "="_s, ""_s,
    };
    // Expected int_val for the TK_Integer tokens only, in order.
    U64 vals[] = {
        42524, 4294967295, 18446744073709551615llu
    };
    int ui = 0;   // cursor into vals[], advanced only on TK_Integer
    // `i` is supplied by the For(...) macro — presumably the element
    // index; TODO confirm.
    For(arr){
        assert(it->kind == kind[i]);
        assert(string_compare(it->string, strs[i]));
        if(it->kind == TK_Integer){
            assert(it->int_val == vals[ui++]);
        }
    }
}
//-----------------------------------------------------------------------------
// Token metadata
//-----------------------------------------------------------------------------
// Human-readable spelling of a token kind, for diagnostics and error
// messages. Operator and punctuation kinds return their source
// spelling; value-carrying kinds return a descriptive name. Kinds with
// no entry (e.g. TK_Dereference) hit invalid_codepath.
// Fix: TK_CloseParen previously returned " " instead of ")".
function String
token_kind_string(Token_Kind kind){
    switch(kind){
        case TK_End: return "End of stream"_s;
        case TK_Mul: return "*"_s;
        case TK_Div: return "/"_s;
        case TK_Add: return "+"_s;
        case TK_Sub: return "-"_s;
        case TK_Mod: return "%"_s;
        case TK_BitAnd: return "&"_s;
        case TK_BitOr: return "|"_s;
        case TK_Pointer: return "^"_s;
        case TK_Neg: return "~"_s;
        case TK_Not: return "!"_s;
        case TK_OpenParen: return "("_s;
        case TK_CloseParen: return ")"_s;
        case TK_OpenBrace: return "{"_s;
        case TK_CloseBrace: return "}"_s;
        case TK_OpenBracket: return "["_s;
        case TK_CloseBracket: return "]"_s;
        case TK_Comma: return ","_s;
        case TK_Pound: return "#"_s;
        case TK_Question: return "?"_s;
        case TK_ThreeDots: return "..."_s;
        case TK_Semicolon: return ";"_s;
        case TK_Dot: return "."_s;
        case TK_LesserThen: return "<"_s;
        case TK_GreaterThen: return ">"_s;
        case TK_Colon: return ":"_s;
        case TK_Assign: return "="_s;
        case TK_DivAssign: return "/="_s;
        case TK_MulAssign: return "*="_s;
        case TK_ModAssign: return "%="_s;
        case TK_SubAssign: return "-="_s;
        case TK_AddAssign: return "+="_s;
        case TK_AndAssign: return "&="_s;
        case TK_OrAssign: return "|="_s;
        case TK_XorAssign: return "^="_s;
        case TK_LeftShiftAssign: return "<<="_s;
        case TK_RightShiftAssign: return ">>="_s;
        case TK_DoubleColon: return "::"_s;
        case TK_At: return "@"_s;
        case TK_Decrement: return "--"_s;
        case TK_Increment: return "++"_s;
        case TK_PostDecrement: return "--"_s;
        case TK_PostIncrement: return "++"_s;
        case TK_LesserThenOrEqual: return "<="_s;
        case TK_GreaterThenOrEqual: return ">="_s;
        case TK_Equals: return "=="_s;
        case TK_And: return "&&"_s;
        case TK_Or: return "||"_s;
        case TK_NotEquals: return "!="_s;
        case TK_LeftShift: return "<<"_s;
        case TK_RightShift: return ">>"_s;
        case TK_Arrow: return "->"_s;
        case TK_NewLine: return "New_Line"_s;
        case TK_ExprSizeof: return "sizeof"_s;
        case TK_DocComment: return "Doc_Comment"_s;
        case TK_Comment: return "Comment"_s;
        case TK_Identifier: return "Identifier"_s;
        case TK_StringLit: return "String_Lit"_s;
        case TK_Character: return "Character"_s;
        case TK_Error: return "Error"_s;
        case TK_Float: return "Float"_s;
        case TK_Integer: return "Int"_s;
        case TK_Keyword: return "Keyword"_s;
        case CLOSE_SCOPE: return "Close_Scope"_s;
        case OPEN_SCOPE: return "Open_Scope"_s;
        case SAME_SCOPE: return "Same_Scope"_s;
        default: invalid_codepath; return "<Undefined>"_s;
    }
}