Working on the lexer, handling indentation and scopes

2022-05-26 14:24:11 +02:00
parent f4c05923c9
commit d9a80afa9e
2 changed files with 127 additions and 42 deletions
--- a/lambdas.kl
+++ b/lambdas.kl
@@ -1,3 +1,19 @@
 /*
 /*begin of file*/  thing // indent 2 == error
 add_10 :: (size: int): int // scope 0
  add_20 :: (new_size: int): int // up scope 2
    return 20 // up scope 2
  // down scope
 // down scope
 // scope 0
 thing
 */
 add_10 :: (size: int): int
  add_20 :: (new_size: int): int
--- a/new_lex.cpp
+++ b/new_lex.cpp
@@ -121,6 +121,7 @@ struct Lex_Stream{
  S32    line;
  S32    inside_brace_paren;
  S32    last_valid_indent;
  Array<Token *> indent_stack;
 };
 struct Lexer{
@@ -269,18 +270,112 @@ lex_parse_string(Lex_Stream *s, Token *t, U8 c){
    }                                                                                    \
    break
 // Builds a Token that starts out value-initialized ({}) and then has its
 // str, file, line and line_begin fields set from the matching arguments.
 // Every other field of the returned token keeps its {} default.
 function Token
 token_make(U8 *str, String file, int line, U8 *line_begin){
  Token result = {};
  result.line_begin = line_begin;
  result.line       = line;
  result.file       = file;
  result.str        = str;
  return result;
 }
 global Token token_null = {SAME_SCOPE};
 // Returns the most recently pushed entry of s->indent_stack.
 // When the stack holds no entries, a pointer to the global token_null
 // is returned instead, so callers always receive a non-stack-empty result.
 function Token *
 lex_last_indent_token(Lex_Stream *s){
  B32 stack_is_empty = s->indent_stack.len <= 0;
  if(stack_is_empty){
    return &token_null;
  }
  Token *top = *s->indent_stack.last();
  return top;
 }
 // Reports whether t is one of the three scope-marker kinds:
 // OPEN_SCOPE, CLOSE_SCOPE or SAME_SCOPE. Any other kind yields false.
 function B32
 token_is_scope(Token *t){
  switch(t->kind){
    case OPEN_SCOPE:
    case CLOSE_SCOPE:
    case SAME_SCOPE:
      return true;
    default:
      return false;
  }
 }
 function void
 lex__stream(Intern_Table *table, Array<Token> *array, Lex_Stream *s){
  B32 beginning = true;
  while(lexc(s)){
-    while(lexc(s) == '\r') lex_advance(s);
+    if(s->iter >= s->stream.len) // End of stream
      break;
-    Token t = {};
+    // @note: for now the lexer is going to be a 2 stage process
-    t.str = lexcp(s);
+    // first we tokenize the indentation and then proceed to tokenize
-    t.file = s->file;
+    // the good stuff
-    t.line = s->line;
+
-    t.line_begin = s->line_begin;
+    // for blocks of stmts we parse till we cant find another new line
    // of same scope.
    // parse_decl doesn't require preceding new line
    //
    // in that way new lines act as commas in function params
    // seeing a comma means that there is a next thing to parse
    // and it's easy to parse stuff using a do while loop
    // @note: first handle indentation
    // mostly we want to merge multiple new lines
    // but for down scopes we want to emit 2 new lines
    // that will ease out parsing, one token to break out
    // from a block parsing, second to allow continuation of surrounding scope
    Token t = token_make(lexcp(s), s->file, s->line, s->line_begin);
    B32 should_emit = beginning;
    for(;;){
      switch(lexc(s)){
        case '\t': case ' ': lex_advance(s); t.indent++;                        break;
        case '\r':           lex_advance(s);                                    break;
        case '\n':{
          lex_advance(s);
          should_emit = true;
          t = token_make(lexcp(s), s->file, s->line, s->line_begin);
        } break;
        default:{
          if(s->inside_brace_paren) should_emit = false;
          if(should_emit){
            Token *last = lex_last_indent_token(s);
            if(t.indent > last->indent){
              t.kind = OPEN_SCOPE;
              array->add(t);
              s->indent_stack.add(array->last());
            }
            else if(t.indent < last->indent){
              For_Reverse(s->indent_stack){
                assert(token_is_scope(*it));
                if(it[0]->indent == t.indent){
                  t.kind = SAME_SCOPE;
                  array->add(t);
                  break;
                }
                else if(it[0]->indent < t.indent){
                  token_error(&t, "Bad indentation"_s);
                  array->add(t);
                  break;
                }
                else{
                  s->indent_stack.pop();
                  t.kind = CLOSE_SCOPE;
                  array->add(t);
                }
              }
            }
            else {
              t.kind = SAME_SCOPE;
              array->add(t); // else SAME_SCOPE
            }
          }
          goto indent_loop_break;
        }
      }
    } indent_loop_break:
    beginning = false;
    t = token_make(lexcp(s), s->file, s->line, s->line_begin);
    lex_advance(s);
    // @note: handle the indented token
    switch(*t.str){
      case  0 : break;
      case '@': t.kind = TK_At; break;
@@ -302,29 +397,10 @@ lex__stream(Intern_Table *table, Array<Token> *array, Lex_Stream *s){
      CASE3('+', TK_Add, TK_AddAssign, TK_Increment);
      CASE3('&', TK_BitAnd, TK_AndAssign, TK_And);
      CASE3('|', TK_BitOr, TK_OrAssign, TK_Or);
 #undef CASE2
 #undef CASE3
      case ';': {
        t.kind = TK_Semicolon;
      }break;
      case '\r': case ' ' : s->stream.str -= 1;
      case '\n': {
        t.kind = TK_NewLine;
        if(lexc(s) == '\r')
          lex_advance(s);
        for(;;){
          if(lexc(s) == ' ') {
            t.indent++;
            // @Todo(Krzosa): Detect indentation method, file an error while methods are mixed
          }
          else if(lexc(s) == '\t') t.indent++;
          else break;
          lex_advance(s);
        }
      }break;
      case '.': {
        if(lexc(s) == '.' && lexci(s,1) == '.') {
          lex_advance(s); lex_advance(s);
@@ -489,21 +565,10 @@ lex__stream(Intern_Table *table, Array<Token> *array, Lex_Stream *s){
    if(t.len==0)
      lex_set_len(s,&t);
    B32 skip = 0;
    if(t.kind == TK_NewLine){
      if(s->inside_brace_paren > 0) skip = 1;
      if(array->len > 0 && array->last()->kind == TK_NewLine) array->pop();
    }
    if(!skip){
    array->add(t);
  }
-
+#undef CASE2
-    while(lex_is_whitespace(lexc(s)))
+#undef CASE3
      lex_advance(s);
    if(s->iter >= s->stream.len) // End of stream
      break;
  }
 }
 function void
@@ -529,6 +594,9 @@ lex_restream(Lexer *lexer, String istream, String file){
  lexer->tokens.clear();
  lexer->token_iter = 0;
  Scratch scratch;
  lexer->stream.indent_stack.allocator = scratch;
  lexer->stream.indent_stack.add(&token_null);
  lex__stream(&lexer->interns, &lexer->tokens, &lexer->stream);
 }
@@ -542,7 +610,7 @@ lex_stream(Allocator *token_string_arena, Allocator *map_allocator, String istre
 function void
 lex_test(){
  Scratch scratch;
-  String test = "Keyword //R\n 18446744073709551616{})(@?&+-;....->,:::/**/\"Thing\" Thingy"
+  String test = "Keyword //R\n 18446744073709551616\n  {}\n)(@?&+-;....->,:::/**/\"Thing\" Thingy"
    "\"Test_Meme\"+=-===42524 4294967295 18446744073709551615"
    "for if while switch :="_s;
@@ -559,7 +627,8 @@ lex_test(){
  Array<Token> arr = lexer.tokens;
  Token_Kind kind[] = {
-    TK_Keyword, TK_NewLine, TK_Error,TK_OpenBrace,TK_CloseBrace,TK_CloseParen,TK_OpenParen,
+    SAME_SCOPE,
    TK_Keyword, OPEN_SCOPE, TK_Error, OPEN_SCOPE, TK_OpenBrace,TK_CloseBrace,CLOSE_SCOPE, CLOSE_SCOPE, SAME_SCOPE, TK_CloseParen,TK_OpenParen,
    TK_At,TK_Question,TK_BitAnd,TK_Add,TK_Sub,TK_Semicolon,
    TK_ThreeDots, TK_Dot, TK_Arrow, TK_Comma, TK_DoubleColon, TK_Colon,
    TK_StringLit, TK_Identifier, TK_StringLit, TK_AddAssign, TK_SubAssign,
@@ -568,7 +637,7 @@ lex_test(){
    TK_Colon, TK_Assign, TK_End
  };
  String strs[] = {
-    "Keyword"_s, "\n "_s, "18446744073709551616"_s,"{"_s,"}"_s,")"_s,"("_s,
+    ""_s, "Keyword"_s, ""_s, "18446744073709551616"_s, ""_s, "{"_s,"}"_s, ""_s, ""_s, ""_s, ")"_s, "("_s,
    "@"_s,"?"_s,"&"_s,"+"_s,"-"_s,";"_s,
    "..."_s,"."_s,"->"_s,","_s,"::"_s,":"_s,
    "Thing"_s,"Thingy"_s,"Test_Meme"_s, "+="_s,"-="_s,