diff --git a/lex.c b/lex.c
index 53d85de..5b1f85c 100644
--- a/lex.c
+++ b/lex.c
@@ -1,11 +1,21 @@
+/*
+
+- [ ] Line splicing as the first source preprocessing stage. To properly handle a backslash-newline ('\\' followed by a line break), we most likely need to preprocess the source in an initial pass. At some point we need to introduce a stage that creates a buffer without the unwanted characters, together with a line / column mapping data structure.
+
+*/
+
 typedef struct Token {
     Token_Kind kind;
-    char *str;
     int len;
+    char *str;
 
     char *file;
     int line, column;
 
+    struct {
+        uint8_t preproc : 1; // copied from Lexer.preproc when the token is created; also set on the '#' token itself
+    };
+
     union {
         uint64_t u;
     };
@@ -19,6 +29,7 @@ typedef struct Lexer {
     char *file;
     int line;
     int column;
+    uint8_t preproc; // set when '#' is lexed, cleared by lex_advance() on an unspliced '\n'
 } Lexer;
 
 void lex_advance(Lexer *lex) {
@@ -32,16 +43,38 @@ void lex_advance(Lexer *lex) {
     } else {
         lex->column++;
     }
-    lex->at += 1;
+
+    // Line splicing: a backslash directly followed by LF or CRLF is consumed
+    // together with the line break, and preproc is left unchanged.
+    if (*lex->at == '\\') {
+        lex->at += 1;
+        if ((lex->at < lex->end) && *lex->at == '\n') {
+            lex->at += 1;
+            lex->line += 1; lex->column = 0;
+        } else if ((lex->at < lex->end) && *lex->at == '\r') {
+            lex->at += 1;
+            if ((lex->at < lex->end) && *lex->at == '\n') {
+                lex->at += 1;
+                lex->line += 1; lex->column = 0;
+            } else {
+                panicf("after \\r missing \\n");
+            }
+        } else {
+            panicf("stray '\\' without follow up new line");
+        }
+
+    } else if (*lex->at == '\n') {
+        lex->preproc = false; // a plain newline clears the preproc flag
+        lex->at += 1;
+    } else {
+        lex->at += 1;
+    }
 }
 
 void eat_whitespace(Lexer *lex) {
     while (lex->at < lex->end) {
         switch (*lex->at) {
-            case ' ':
-            case '\t':
-            case '\r':
-            case '\n':
+            case ' ': case '\t': case '\r': case '\n':
                 lex_advance(lex);
                 break;
             default:
@@ -97,6 +130,7 @@ Token lex_token(Lexer *lex) {
         .line = lex->line,
         .column = lex->column,
         .file = lex->file,
+        .preproc = lex->preproc,
     };
 
     if (lex->at >= lex->end) {
@@ -146,7 +180,6 @@ Token lex_token(Lexer *lex) {
         case ':': t.kind = TOK_COLON; break;
         case ';': t.kind = TOK_SEMICOLON; break;
         case '?': t.kind = TOK_QUESTION; break;
-        case '#': t.kind = TOK_HASH; break;
         case '+': t.kind = lex_repeat_or_assign(lex, '+', TOK_PLUS, TOK_INC, TOK_PLUS_ASSIGN); break;
         case '-': {
             if (lex_match(lex, '-')) t.kind = TOK_DEC;
@@ -165,6 +198,17 @@ Token lex_token(Lexer *lex) {
         case '&': t.kind = lex_repeat_or_assign(lex, '&', TOK_BITAND, TOK_AND, TOK_AND_ASSIGN); break;
         case '|': t.kind = lex_repeat_or_assign(lex, '|', TOK_BITOR, TOK_OR, TOK_OR_ASSIGN); break;
         case '^': t.kind = lex_assign_variant(lex, TOK_BITXOR, TOK_XOR_ASSIGN); break;
+
+        case '#': {
+            t.kind = TOK_HASH;
+            lex->preproc = t.preproc = true;
+            // Advance past the alphabetic run after '#'. The cast is required:
+            // passing a negative char value to isalpha() is undefined behavior.
+            while (lex->at < lex->end && isalpha((unsigned char)*lex->at)) {
+                lex_advance(lex);
+            }
+        } break;
+
         default:
             panicf("unrecognized character: '%c', can't match with any of the token kinds", c);
     }
@@ -172,14 +216,6 @@ Token lex_token(Lexer *lex) {
     return t;
 }
 
-void assert_token(Token t, Token_Kind kind, char *text, int line, int column) {
-    assert(t.kind == kind);
-    assert(t.line == line);
-    assert(t.column == column);
-    assert(t.len == (int)strlen(text));
-    assert(strncmp(t.str, text, t.len) == 0);
-}
-
 Token_Array lex_file(char *file, char *src, int len) {
     Lexer lex = make_lexer(file, src, len);
     Token_Array result = {0};
@@ -193,6 +229,14 @@ Token_Array lex_file(char *file, char *src, int len) {
     return result;
 }
 
+// Test helper: asserts a token's kind, line, column, and text.
+void assert_token(Token t, Token_Kind kind, char *text, int line, int column) {
+    assert(t.kind == kind);
+    assert(t.line == line);
+    assert(t.column == column);
+    assert(t.len == (int)strlen(text));
+    assert(strncmp(t.str, text, t.len) == 0);
+}
 
 void lex_test(void) {
     char *src = "12 + 34.5 * 6\n- 7 % 2 / 1 == 1 != 2 <= 3 >= 4 && 3 || 4 << 1 >> 2";
diff --git a/main.c b/main.c
index 96de294..6b56a74 100644
--- a/main.c
+++ b/main.c
@@ -25,5 +25,4 @@ int main(int argc, char **argv) {
         lex_test();
         parser_test();
     }
-
 }
\ No newline at end of file