Working on the lexer, handling indentation and scopes
This commit is contained in:
153
new_lex.cpp
153
new_lex.cpp
@@ -121,6 +121,7 @@ struct Lex_Stream{
|
||||
S32 line;
|
||||
S32 inside_brace_paren;
|
||||
S32 last_valid_indent;
|
||||
Array<Token *> indent_stack;
|
||||
};
|
||||
|
||||
struct Lexer{
|
||||
@@ -269,18 +270,112 @@ lex_parse_string(Lex_Stream *s, Token *t, U8 c){
|
||||
} \
|
||||
break
|
||||
|
||||
function Token
token_make(U8 *str, String file, int line, U8 *line_begin){
    // Build a zero-initialized token stamped with its source location
    // (pointer into the stream, originating file, line number, and the
    // start of that line for column/indent reporting).
    Token result = {};
    result.str        = str;
    result.file       = file;
    result.line       = line;
    result.line_begin = line_begin;
    return result;
}
|
||||
|
||||
// Sentinel token (kind SAME_SCOPE, indent 0) returned when the indent
// stack is empty; also seeded as the stack's bottom entry in lex_restream.
global Token token_null = {SAME_SCOPE};
|
||||
|
||||
function Token *
lex_last_indent_token(Lex_Stream *s){
    // Peek at the top of the indent stack; an empty stack yields the
    // shared null token (SAME_SCOPE, indent 0) so callers never see null.
    if(s->indent_stack.len == 0) return &token_null;
    return *s->indent_stack.last();
}
|
||||
|
||||
function B32
token_is_scope(Token *t){
    // True only for the three synthetic indentation tokens the lexer emits.
    B32 result = false;
    switch(t->kind){
        case OPEN_SCOPE:
        case CLOSE_SCOPE:
        case SAME_SCOPE: result = true; break;
        default: break;
    }
    return result;
}
|
||||
|
||||
function void
|
||||
lex__stream(Intern_Table *table, Array<Token> *array, Lex_Stream *s){
|
||||
B32 beginning = true;
|
||||
while(lexc(s)){
|
||||
while(lexc(s) == '\r') lex_advance(s);
|
||||
if(s->iter >= s->stream.len) // End of stream
|
||||
break;
|
||||
|
||||
Token t = {};
|
||||
t.str = lexcp(s);
|
||||
t.file = s->file;
|
||||
t.line = s->line;
|
||||
t.line_begin = s->line_begin;
|
||||
// @note: for now the lexer is going to be a 2 stage process
|
||||
// first we tokenize the indentation and then proceed to tokenize
|
||||
// the good stuff
|
||||
|
||||
// for blocks of stmts we parse till we cant find another new line
|
||||
// of same scope.
|
||||
// parse_decl doesn't require preceding new line
|
||||
//
|
||||
// in that way new lines act as commas in function params
|
||||
// seeing a comma means that there is a next thing to parse
|
||||
// and it's easy to parse stuff using a do while loop
|
||||
|
||||
// @note: first handle indentation
|
||||
// mostly we want to merge multiple new lines
|
||||
// but for down scopes we want to emit 2 new lines
|
||||
// that will ease out parsing, one token to break out
|
||||
// from a block parsing, second to allow continuation of surrounding scope
|
||||
Token t = token_make(lexcp(s), s->file, s->line, s->line_begin);
|
||||
B32 should_emit = beginning;
|
||||
for(;;){
|
||||
switch(lexc(s)){
|
||||
case '\t': case ' ': lex_advance(s); t.indent++; break;
|
||||
case '\r': lex_advance(s); break;
|
||||
case '\n':{
|
||||
lex_advance(s);
|
||||
should_emit = true;
|
||||
t = token_make(lexcp(s), s->file, s->line, s->line_begin);
|
||||
} break;
|
||||
default:{
|
||||
if(s->inside_brace_paren) should_emit = false;
|
||||
if(should_emit){
|
||||
Token *last = lex_last_indent_token(s);
|
||||
if(t.indent > last->indent){
|
||||
t.kind = OPEN_SCOPE;
|
||||
array->add(t);
|
||||
s->indent_stack.add(array->last());
|
||||
}
|
||||
else if(t.indent < last->indent){
|
||||
For_Reverse(s->indent_stack){
|
||||
assert(token_is_scope(*it));
|
||||
if(it[0]->indent == t.indent){
|
||||
t.kind = SAME_SCOPE;
|
||||
array->add(t);
|
||||
break;
|
||||
}
|
||||
else if(it[0]->indent < t.indent){
|
||||
token_error(&t, "Bad indentation"_s);
|
||||
array->add(t);
|
||||
break;
|
||||
}
|
||||
else{
|
||||
s->indent_stack.pop();
|
||||
t.kind = CLOSE_SCOPE;
|
||||
array->add(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
t.kind = SAME_SCOPE;
|
||||
array->add(t); // else SAME_SCOPE
|
||||
}
|
||||
}
|
||||
|
||||
goto indent_loop_break;
|
||||
}
|
||||
}
|
||||
} indent_loop_break:
|
||||
beginning = false;
|
||||
|
||||
t = token_make(lexcp(s), s->file, s->line, s->line_begin);
|
||||
lex_advance(s);
|
||||
|
||||
// @note: handle the indented token
|
||||
switch(*t.str){
|
||||
case 0 : break;
|
||||
case '@': t.kind = TK_At; break;
|
||||
@@ -302,29 +397,10 @@ lex__stream(Intern_Table *table, Array<Token> *array, Lex_Stream *s){
|
||||
CASE3('+', TK_Add, TK_AddAssign, TK_Increment);
|
||||
CASE3('&', TK_BitAnd, TK_AndAssign, TK_And);
|
||||
CASE3('|', TK_BitOr, TK_OrAssign, TK_Or);
|
||||
#undef CASE2
|
||||
#undef CASE3
|
||||
case ';': {
|
||||
t.kind = TK_Semicolon;
|
||||
}break;
|
||||
|
||||
case '\r': case ' ' : s->stream.str -= 1;
|
||||
case '\n': {
|
||||
t.kind = TK_NewLine;
|
||||
if(lexc(s) == '\r')
|
||||
lex_advance(s);
|
||||
|
||||
for(;;){
|
||||
if(lexc(s) == ' ') {
|
||||
t.indent++;
|
||||
// @Todo(Krzosa): Detect indentation method, file an error while methods are mixed
|
||||
}
|
||||
else if(lexc(s) == '\t') t.indent++;
|
||||
else break;
|
||||
lex_advance(s);
|
||||
}
|
||||
|
||||
}break;
|
||||
case '.': {
|
||||
if(lexc(s) == '.' && lexci(s,1) == '.') {
|
||||
lex_advance(s); lex_advance(s);
|
||||
@@ -489,21 +565,10 @@ lex__stream(Intern_Table *table, Array<Token> *array, Lex_Stream *s){
|
||||
if(t.len==0)
|
||||
lex_set_len(s,&t);
|
||||
|
||||
B32 skip = 0;
|
||||
if(t.kind == TK_NewLine){
|
||||
if(s->inside_brace_paren > 0) skip = 1;
|
||||
if(array->len > 0 && array->last()->kind == TK_NewLine) array->pop();
|
||||
}
|
||||
if(!skip){
|
||||
array->add(t);
|
||||
}
|
||||
|
||||
while(lex_is_whitespace(lexc(s)))
|
||||
lex_advance(s);
|
||||
|
||||
if(s->iter >= s->stream.len) // End of stream
|
||||
break;
|
||||
array->add(t);
|
||||
}
|
||||
#undef CASE2
|
||||
#undef CASE3
|
||||
}
|
||||
|
||||
function void
|
||||
@@ -529,6 +594,9 @@ lex_restream(Lexer *lexer, String istream, String file){
|
||||
|
||||
lexer->tokens.clear();
|
||||
lexer->token_iter = 0;
|
||||
Scratch scratch;
|
||||
lexer->stream.indent_stack.allocator = scratch;
|
||||
lexer->stream.indent_stack.add(&token_null);
|
||||
lex__stream(&lexer->interns, &lexer->tokens, &lexer->stream);
|
||||
}
|
||||
|
||||
@@ -542,7 +610,7 @@ lex_stream(Allocator *token_string_arena, Allocator *map_allocator, String istre
|
||||
function void
|
||||
lex_test(){
|
||||
Scratch scratch;
|
||||
String test = "Keyword //R\n 18446744073709551616{})(@?&+-;....->,:::/**/\"Thing\" Thingy"
|
||||
String test = "Keyword //R\n 18446744073709551616\n {}\n)(@?&+-;....->,:::/**/\"Thing\" Thingy"
|
||||
"\"Test_Meme\"+=-===42524 4294967295 18446744073709551615"
|
||||
"for if while switch :="_s;
|
||||
|
||||
@@ -559,7 +627,8 @@ lex_test(){
|
||||
Array<Token> arr = lexer.tokens;
|
||||
|
||||
Token_Kind kind[] = {
|
||||
TK_Keyword, TK_NewLine, TK_Error,TK_OpenBrace,TK_CloseBrace,TK_CloseParen,TK_OpenParen,
|
||||
SAME_SCOPE,
|
||||
TK_Keyword, OPEN_SCOPE, TK_Error, OPEN_SCOPE, TK_OpenBrace,TK_CloseBrace,CLOSE_SCOPE, CLOSE_SCOPE, SAME_SCOPE, TK_CloseParen,TK_OpenParen,
|
||||
TK_At,TK_Question,TK_BitAnd,TK_Add,TK_Sub,TK_Semicolon,
|
||||
TK_ThreeDots, TK_Dot, TK_Arrow, TK_Comma, TK_DoubleColon, TK_Colon,
|
||||
TK_StringLit, TK_Identifier, TK_StringLit, TK_AddAssign, TK_SubAssign,
|
||||
@@ -568,7 +637,7 @@ lex_test(){
|
||||
TK_Colon, TK_Assign, TK_End
|
||||
};
|
||||
String strs[] = {
|
||||
"Keyword"_s, "\n "_s, "18446744073709551616"_s,"{"_s,"}"_s,")"_s,"("_s,
|
||||
""_s, "Keyword"_s, ""_s, "18446744073709551616"_s, ""_s, "{"_s,"}"_s, ""_s, ""_s, ""_s, ")"_s, "("_s,
|
||||
"@"_s,"?"_s,"&"_s,"+"_s,"-"_s,";"_s,
|
||||
"..."_s,"."_s,"->"_s,","_s,"::"_s,":"_s,
|
||||
"Thing"_s,"Thingy"_s,"Test_Meme"_s, "+="_s,"-="_s,
|
||||
|
||||
Reference in New Issue
Block a user