Working on the lexer, handling indentation and scopes

This commit is contained in:
Krzosa Karol
2022-05-26 14:24:11 +02:00
parent f4c05923c9
commit d9a80afa9e
2 changed files with 127 additions and 42 deletions

View File

@@ -1,3 +1,19 @@
/*
/*begin of file*/ thing // indent 2 == error
add_10 :: (size: int): int // scope 0
add_20 :: (new_size: int): int // up scope 2
return 20 // up scope 2
// down scope
// down scope
// scope 0
thing
*/
add_10 :: (size: int): int add_10 :: (size: int): int
add_20 :: (new_size: int): int add_20 :: (new_size: int): int

View File

@@ -121,6 +121,7 @@ struct Lex_Stream{
S32 line; S32 line;
S32 inside_brace_paren; S32 inside_brace_paren;
S32 last_valid_indent; S32 last_valid_indent;
Array<Token *> indent_stack;
}; };
struct Lexer{ struct Lexer{
@@ -269,18 +270,112 @@ lex_parse_string(Lex_Stream *s, Token *t, U8 c){
} \ } \
break break
// Creates a token that starts at `str` and records where it was found:
// the source `file`, the `line` number and the pointer to the beginning of
// that line. Every other member keeps whatever `= {}` initialization gives it.
function Token
token_make(U8 *str, String file, int line, U8 *line_begin){
  Token result = {};
  result.line_begin = line_begin;
  result.line       = line;
  result.file       = file;
  result.str        = str;
  return result;
}
// Fallback token: lex_last_indent_token() returns its address when the
// indent stack is empty. Brace-initialized with SAME_SCOPE (presumably the
// token's kind - confirm against the Token member order).
global Token token_null = {SAME_SCOPE};
// Returns the token stored last on the stream's indent stack, or a pointer
// to the global `token_null` when the stack holds no entries.
function Token *
lex_last_indent_token(Lex_Stream *s){
  // Empty stack: hand back the shared fallback token instead.
  if(s->indent_stack.len <= 0){
    return &token_null;
  }
  return *s->indent_stack.last();
}
// Reports whether `t` is one of the scope tokens:
// OPEN_SCOPE, CLOSE_SCOPE or SAME_SCOPE.
function B32
token_is_scope(Token *t){
  if(t->kind == OPEN_SCOPE)  return 1;
  if(t->kind == CLOSE_SCOPE) return 1;
  if(t->kind == SAME_SCOPE)  return 1;
  return 0;
}
function void function void
lex__stream(Intern_Table *table, Array<Token> *array, Lex_Stream *s){ lex__stream(Intern_Table *table, Array<Token> *array, Lex_Stream *s){
B32 beginning = true;
while(lexc(s)){ while(lexc(s)){
while(lexc(s) == '\r') lex_advance(s); if(s->iter >= s->stream.len) // End of stream
break;
Token t = {}; // @note: for now the lexer is going to be a 2 stage process
t.str = lexcp(s); // first we tokenize the indentation and then proceed to tokenize
t.file = s->file; // the good stuff
t.line = s->line;
t.line_begin = s->line_begin; // for blocks of stmts we parse till we can't find another new line
// of same scope.
// parse_decl doesn't require preceding new line
//
// in that way new lines act as commas in function params
// seeing a comma means that there is a next thing to parse
// and it's easy to parse stuff using a do while loop
// @note: first handle indentation
// mostly we want to merge multiple new lines
// but for down scopes we want to emit 2 new lines
// that will ease out parsing, one token to break out
// from a block parsing, second to allow continuation of surrounding scope
Token t = token_make(lexcp(s), s->file, s->line, s->line_begin);
B32 should_emit = beginning;
for(;;){
switch(lexc(s)){
case '\t': case ' ': lex_advance(s); t.indent++; break;
case '\r': lex_advance(s); break;
case '\n':{
lex_advance(s);
should_emit = true;
t = token_make(lexcp(s), s->file, s->line, s->line_begin);
} break;
default:{
if(s->inside_brace_paren) should_emit = false;
if(should_emit){
Token *last = lex_last_indent_token(s);
if(t.indent > last->indent){
t.kind = OPEN_SCOPE;
array->add(t);
s->indent_stack.add(array->last());
}
else if(t.indent < last->indent){
For_Reverse(s->indent_stack){
assert(token_is_scope(*it));
if(it[0]->indent == t.indent){
t.kind = SAME_SCOPE;
array->add(t);
break;
}
else if(it[0]->indent < t.indent){
token_error(&t, "Bad indentation"_s);
array->add(t);
break;
}
else{
s->indent_stack.pop();
t.kind = CLOSE_SCOPE;
array->add(t);
}
}
}
else {
t.kind = SAME_SCOPE;
array->add(t); // else SAME_SCOPE
}
}
goto indent_loop_break;
}
}
} indent_loop_break:
beginning = false;
t = token_make(lexcp(s), s->file, s->line, s->line_begin);
lex_advance(s); lex_advance(s);
// @note: handle the indented token
switch(*t.str){ switch(*t.str){
case 0 : break; case 0 : break;
case '@': t.kind = TK_At; break; case '@': t.kind = TK_At; break;
@@ -302,29 +397,10 @@ lex__stream(Intern_Table *table, Array<Token> *array, Lex_Stream *s){
CASE3('+', TK_Add, TK_AddAssign, TK_Increment); CASE3('+', TK_Add, TK_AddAssign, TK_Increment);
CASE3('&', TK_BitAnd, TK_AndAssign, TK_And); CASE3('&', TK_BitAnd, TK_AndAssign, TK_And);
CASE3('|', TK_BitOr, TK_OrAssign, TK_Or); CASE3('|', TK_BitOr, TK_OrAssign, TK_Or);
#undef CASE2
#undef CASE3
case ';': { case ';': {
t.kind = TK_Semicolon; t.kind = TK_Semicolon;
}break; }break;
case '\r': case ' ' : s->stream.str -= 1;
case '\n': {
t.kind = TK_NewLine;
if(lexc(s) == '\r')
lex_advance(s);
for(;;){
if(lexc(s) == ' ') {
t.indent++;
// @Todo(Krzosa): Detect indentation method, file an error while methods are mixed
}
else if(lexc(s) == '\t') t.indent++;
else break;
lex_advance(s);
}
}break;
case '.': { case '.': {
if(lexc(s) == '.' && lexci(s,1) == '.') { if(lexc(s) == '.' && lexci(s,1) == '.') {
lex_advance(s); lex_advance(s); lex_advance(s); lex_advance(s);
@@ -489,21 +565,10 @@ lex__stream(Intern_Table *table, Array<Token> *array, Lex_Stream *s){
if(t.len==0) if(t.len==0)
lex_set_len(s,&t); lex_set_len(s,&t);
B32 skip = 0;
if(t.kind == TK_NewLine){
if(s->inside_brace_paren > 0) skip = 1;
if(array->len > 0 && array->last()->kind == TK_NewLine) array->pop();
}
if(!skip){
array->add(t); array->add(t);
} }
#undef CASE2
while(lex_is_whitespace(lexc(s))) #undef CASE3
lex_advance(s);
if(s->iter >= s->stream.len) // End of stream
break;
}
} }
function void function void
@@ -529,6 +594,9 @@ lex_restream(Lexer *lexer, String istream, String file){
lexer->tokens.clear(); lexer->tokens.clear();
lexer->token_iter = 0; lexer->token_iter = 0;
Scratch scratch;
lexer->stream.indent_stack.allocator = scratch;
lexer->stream.indent_stack.add(&token_null);
lex__stream(&lexer->interns, &lexer->tokens, &lexer->stream); lex__stream(&lexer->interns, &lexer->tokens, &lexer->stream);
} }
@@ -542,7 +610,7 @@ lex_stream(Allocator *token_string_arena, Allocator *map_allocator, String istre
function void function void
lex_test(){ lex_test(){
Scratch scratch; Scratch scratch;
String test = "Keyword //R\n 18446744073709551616{})(@?&+-;....->,:::/**/\"Thing\" Thingy" String test = "Keyword //R\n 18446744073709551616\n {}\n)(@?&+-;....->,:::/**/\"Thing\" Thingy"
"\"Test_Meme\"+=-===42524 4294967295 18446744073709551615" "\"Test_Meme\"+=-===42524 4294967295 18446744073709551615"
"for if while switch :="_s; "for if while switch :="_s;
@@ -559,7 +627,8 @@ lex_test(){
Array<Token> arr = lexer.tokens; Array<Token> arr = lexer.tokens;
Token_Kind kind[] = { Token_Kind kind[] = {
TK_Keyword, TK_NewLine, TK_Error,TK_OpenBrace,TK_CloseBrace,TK_CloseParen,TK_OpenParen, SAME_SCOPE,
TK_Keyword, OPEN_SCOPE, TK_Error, OPEN_SCOPE, TK_OpenBrace,TK_CloseBrace,CLOSE_SCOPE, CLOSE_SCOPE, SAME_SCOPE, TK_CloseParen,TK_OpenParen,
TK_At,TK_Question,TK_BitAnd,TK_Add,TK_Sub,TK_Semicolon, TK_At,TK_Question,TK_BitAnd,TK_Add,TK_Sub,TK_Semicolon,
TK_ThreeDots, TK_Dot, TK_Arrow, TK_Comma, TK_DoubleColon, TK_Colon, TK_ThreeDots, TK_Dot, TK_Arrow, TK_Comma, TK_DoubleColon, TK_Colon,
TK_StringLit, TK_Identifier, TK_StringLit, TK_AddAssign, TK_SubAssign, TK_StringLit, TK_Identifier, TK_StringLit, TK_AddAssign, TK_SubAssign,
@@ -568,7 +637,7 @@ lex_test(){
TK_Colon, TK_Assign, TK_End TK_Colon, TK_Assign, TK_End
}; };
String strs[] = { String strs[] = {
"Keyword"_s, "\n "_s, "18446744073709551616"_s,"{"_s,"}"_s,")"_s,"("_s, ""_s, "Keyword"_s, ""_s, "18446744073709551616"_s, ""_s, "{"_s,"}"_s, ""_s, ""_s, ""_s, ")"_s, "("_s,
"@"_s,"?"_s,"&"_s,"+"_s,"-"_s,";"_s, "@"_s,"?"_s,"&"_s,"+"_s,"-"_s,";"_s,
"..."_s,"."_s,"->"_s,","_s,"::"_s,":"_s, "..."_s,"."_s,"->"_s,","_s,"::"_s,":"_s,
"Thing"_s,"Thingy"_s,"Test_Meme"_s, "+="_s,"-="_s, "Thing"_s,"Thingy"_s,"Test_Meme"_s, "+="_s,"-="_s,