Working on the lexer, handling indentation and scopes
This commit is contained in:
153
new_lex.cpp
153
new_lex.cpp
@@ -121,6 +121,7 @@ struct Lex_Stream{
|
||||
S32 line;
|
||||
S32 inside_brace_paren;
|
||||
S32 last_valid_indent;
|
||||
Array<Token *> indent_stack;
|
||||
};
|
||||
|
||||
struct Lexer{
|
||||
@@ -269,18 +270,112 @@ lex_parse_string(Lex_Stream *s, Token *t, U8 c){
|
||||
} \
|
||||
break
|
||||
|
||||
function Token
token_make(U8 *str, String file, int line, U8 *line_begin){
    // Build a zero-initialized token stamped with its source location
    // (pointer into the stream, originating file, line number, and the
    // start of that line for column/indent reporting).
    Token result = {};
    result.str        = str;
    result.file       = file;
    result.line       = line;
    result.line_begin = line_begin;
    return result;
}
|
||||
|
||||
// Sentinel token (kind SAME_SCOPE, indent 0) returned when the indent
// stack is empty; also seeded as the stack's bottom entry in lex_restream.
global Token token_null = {SAME_SCOPE};
|
||||
|
||||
function Token *
lex_last_indent_token(Lex_Stream *s){
    // Peek at the top of the indent stack; an empty stack yields the
    // shared null token (SAME_SCOPE, indent 0) so callers never see null.
    if(s->indent_stack.len == 0) return &token_null;
    return *s->indent_stack.last();
}
|
||||
|
||||
function B32
token_is_scope(Token *t){
    // True only for the three synthetic indentation tokens the lexer emits.
    B32 result = false;
    switch(t->kind){
        case OPEN_SCOPE:
        case CLOSE_SCOPE:
        case SAME_SCOPE: result = true; break;
        default: break;
    }
    return result;
}
|
||||
|
||||
function void
|
||||
lex__stream(Intern_Table *table, Array<Token> *array, Lex_Stream *s){
|
||||
B32 beginning = true;
|
||||
while(lexc(s)){
|
||||
while(lexc(s) == '\r') lex_advance(s);
|
||||
if(s->iter >= s->stream.len) // End of stream
|
||||
break;
|
||||
|
||||
Token t = {};
|
||||
t.str = lexcp(s);
|
||||
t.file = s->file;
|
||||
t.line = s->line;
|
||||
t.line_begin = s->line_begin;
|
||||
// @note: for now the lexer is going to be a 2 stage process
|
||||
// first we tokenize the indentation and then proceed to tokenize
|
||||
// the good stuff
|
||||
|
||||
// for blocks of stmts we parse till we cant find another new line
|
||||
// of same scope.
|
||||
// parse_decl doesn't require preceding new line
|
||||
//
|
||||
// in that way new lines act as commas in function params
|
||||
// seeing a comma means that there is a next thing to parse
|
||||
// and it's easy to parse stuff using a do while loop
|
||||
|
||||
// @note: first handle indentation
|
||||
// mostly we want to merge multiple new lines
|
||||
// but for down scopes we want to emit 2 new lines
|
||||
// that will ease out parsing, one token to break out
|
||||
// from a block parsing, second to allow continuation of surrounding scope
|
||||
Token t = token_make(lexcp(s), s->file, s->line, s->line_begin);
|
||||
B32 should_emit = beginning;
|
||||
for(;;){
|
||||
switch(lexc(s)){
|
||||
case '\t': case ' ': lex_advance(s); t.indent++; break;
|
||||
case '\r': lex_advance(s); break;
|
||||
case '\n':{
|
||||
lex_advance(s);
|
||||
should_emit = true;
|
||||
t = token_make(lexcp(s), s->file, s->line, s->line_begin);
|
||||
} break;
|
||||
default:{
|
||||
if(s->inside_brace_paren) should_emit = false;
|
||||
if(should_emit){
|
||||
Token *last = lex_last_indent_token(s);
|
||||
if(t.indent > last->indent){
|
||||
t.kind = OPEN_SCOPE;
|
||||
array->add(t);
|
||||
s->indent_stack.add(array->last());
|
||||
}
|
||||
else if(t.indent < last->indent){
|
||||
For_Reverse(s->indent_stack){
|
||||
assert(token_is_scope(*it));
|
||||
if(it[0]->indent == t.indent){
|
||||
t.kind = SAME_SCOPE;
|
||||
array->add(t);
|
||||
break;
|
||||
}
|
||||
else if(it[0]->indent < t.indent){
|
||||
token_error(&t, "Bad indentation"_s);
|
||||
array->add(t);
|
||||
break;
|
||||
}
|
||||
else{
|
||||
s->indent_stack.pop();
|
||||
t.kind = CLOSE_SCOPE;
|
||||
array->add(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
t.kind = SAME_SCOPE;
|
||||
array->add(t); // else SAME_SCOPE
|
||||
}
|
||||
}
|
||||
|
||||
goto indent_loop_break;
|
||||
}
|
||||
}
|
||||
} indent_loop_break:
|
||||
beginning = false;
|
||||
|
||||
t = token_make(lexcp(s), s->file, s->line, s->line_begin);
|
||||
lex_advance(s);
|
||||
|
||||
// @note: handle the indented token
|
||||
switch(*t.str){
|
||||
case 0 : break;
|
||||
case '@': t.kind = TK_At; break;
|
||||
@@ -302,29 +397,10 @@ lex__stream(Intern_Table *table, Array<Token> *array, Lex_Stream *s){
|
||||
CASE3('+', TK_Add, TK_AddAssign, TK_Increment);
|
||||
CASE3('&', TK_BitAnd, TK_AndAssign, TK_And);
|
||||
CASE3('|', TK_BitOr, TK_OrAssign, TK_Or);
|
||||
#undef CASE2
|
||||
#undef CASE3
|
||||
case ';': {
|
||||
t.kind = TK_Semicolon;
|
||||
}break;
|
||||
|
||||
case '\r': case ' ' : s->stream.str -= 1;
|
||||
case '\n': {
|
||||
t.kind = TK_NewLine;
|
||||
if(lexc(s) == '\r')
|
||||
lex_advance(s);
|
||||
|
||||
for(;;){
|
||||
if(lexc(s) == ' ') {
|
||||
t.indent++;
|
||||
// @Todo(Krzosa): Detect indentation method, file an error while methods are mixed
|
||||
}
|
||||
else if(lexc(s) == '\t') t.indent++;
|
||||
else break;
|
||||
lex_advance(s);
|
||||
}
|
||||
|
||||
}break;
|
||||
case '.': {
|
||||
if(lexc(s) == '.' && lexci(s,1) == '.') {
|
||||
lex_advance(s); lex_advance(s);
|
||||
@@ -489,21 +565,10 @@ lex__stream(Intern_Table *table, Array<Token> *array, Lex_Stream *s){
|
||||
if(t.len==0)
|
||||
lex_set_len(s,&t);
|
||||
|
||||
B32 skip = 0;
|
||||
if(t.kind == TK_NewLine){
|
||||
if(s->inside_brace_paren > 0) skip = 1;
|
||||
if(array->len > 0 && array->last()->kind == TK_NewLine) array->pop();
|
||||
}
|
||||
if(!skip){
|
||||
array->add(t);
|
||||
}
|
||||
|
||||
while(lex_is_whitespace(lexc(s)))
|
||||
lex_advance(s);
|
||||
|
||||
if(s->iter >= s->stream.len) // End of stream
|
||||
break;
|
||||
array->add(t);
|
||||
}
|
||||
#undef CASE2
|
||||
#undef CASE3
|
||||
}
|
||||
|
||||
function void
|
||||
@@ -529,6 +594,9 @@ lex_restream(Lexer *lexer, String istream, String file){
|
||||
|
||||
lexer->tokens.clear();
|
||||
lexer->token_iter = 0;
|
||||
Scratch scratch;
|
||||
lexer->stream.indent_stack.allocator = scratch;
|
||||
lexer->stream.indent_stack.add(&token_null);
|
||||
lex__stream(&lexer->interns, &lexer->tokens, &lexer->stream);
|
||||
}
|
||||
|
||||
@@ -542,7 +610,7 @@ lex_stream(Allocator *token_string_arena, Allocator *map_allocator, String istre
|
||||
function void
|
||||
lex_test(){
|
||||
Scratch scratch;
|
||||
String test = "Keyword //R\n 18446744073709551616{})(@?&+-;....->,:::/**/\"Thing\" Thingy"
|
||||
String test = "Keyword //R\n 18446744073709551616\n {}\n)(@?&+-;....->,:::/**/\"Thing\" Thingy"
|
||||
"\"Test_Meme\"+=-===42524 4294967295 18446744073709551615"
|
||||
"for if while switch :="_s;
|
||||
|
||||
@@ -559,7 +627,8 @@ lex_test(){
|
||||
Array<Token> arr = lexer.tokens;
|
||||
|
||||
Token_Kind kind[] = {
|
||||
TK_Keyword, TK_NewLine, TK_Error,TK_OpenBrace,TK_CloseBrace,TK_CloseParen,TK_OpenParen,
|
||||
SAME_SCOPE,
|
||||
TK_Keyword, OPEN_SCOPE, TK_Error, OPEN_SCOPE, TK_OpenBrace,TK_CloseBrace,CLOSE_SCOPE, CLOSE_SCOPE, SAME_SCOPE, TK_CloseParen,TK_OpenParen,
|
||||
TK_At,TK_Question,TK_BitAnd,TK_Add,TK_Sub,TK_Semicolon,
|
||||
TK_ThreeDots, TK_Dot, TK_Arrow, TK_Comma, TK_DoubleColon, TK_Colon,
|
||||
TK_StringLit, TK_Identifier, TK_StringLit, TK_AddAssign, TK_SubAssign,
|
||||
@@ -568,7 +637,7 @@ lex_test(){
|
||||
TK_Colon, TK_Assign, TK_End
|
||||
};
|
||||
String strs[] = {
|
||||
"Keyword"_s, "\n "_s, "18446744073709551616"_s,"{"_s,"}"_s,")"_s,"("_s,
|
||||
""_s, "Keyword"_s, ""_s, "18446744073709551616"_s, ""_s, "{"_s,"}"_s, ""_s, ""_s, ""_s, ")"_s, "("_s,
|
||||
"@"_s,"?"_s,"&"_s,"+"_s,"-"_s,";"_s,
|
||||
"..."_s,"."_s,"->"_s,","_s,"::"_s,":"_s,
|
||||
"Thing"_s,"Thingy"_s,"Test_Meme"_s, "+="_s,"-="_s,
|
||||
|
||||
Reference in New Issue
Block a user