typedef enum lex_kind_t lex_kind_t; enum lex_kind_t { #define LEX_KIND_XLIST\ X(eof, "end of file", "---")\ X(integer, "integer", "---")\ X(real, "real", "---")\ X(ident, "identifier", "---")\ X(string, "string", "---")\ X(comment, "comment", "---")\ X(open_brace, "'{' open brace", "{")\ X(close_brace, "'}' close brace", "}")\ X(open_paren, "'(' open parenthesis", "(")\ X(close_paren, "')' close parenthesis", ")")\ X(open_bracket, "'[' open bracket", "[")\ X(close_bracket, "']' close bracket", "]")\ X(plus, "'+' plus", "+")\ X(minus, "'-' minus", "-")\ X(divide, "'/' division sign", "/")\ X(multiply, "'*' multiplication sign", "*")\ X(modulo, "'%' modulo", "%")\ X(or, "'||' logical or", "||")\ X(and, "'&&' logical and", "&&")\ X(negation, "'!' logical negation", "!")\ X(bit_negation, "'~' bit negation", "~")\ X(bit_left_shift, "'<<' bit left shift", "<<")\ X(bit_right_shift, "'>>' bit right shift", ">>")\ X(bit_or, "'|' bit or", "|")\ X(bit_and, "'&' bit and", "&")\ X(bit_xor, "'^' bit xor", "^")\ X(decrement, "'--' decrement", "--")\ X(increment, "'++' increment", "++")\ X(post_decrement, "'--' post decrement", "--")\ X(post_increment, "'++' post increment", "++")\ X(assign, "'=' assignment", "=")\ X(divide_assign, "'/=' divide assignment", "/=")\ X(multiply_assign, "'*=' multiply assignment", "*=")\ X(plus_assign, "'+=' plus assignment", "+=")\ X(minus_assign, "'-=' minus assignment", "-=")\ X(modulo_assign, "'%=' modulo assignment", "%=")\ X(bit_and_assign, "&=", "&=")\ X(bit_or_assign, "'|=' bit or assignment", "|=")\ X(bit_xor_assign, "'^=' bit xor assignment", "^=")\ X(bit_left_shift_assign, "'<<=' bit left shift assignment", "<<=")\ X(bit_right_shift_assign, "'>>=' bit right shift assignment", ">>=")\ X(equals, "'==' equals sign", "==")\ X(not_equals, "'!=' not equals sign", "!=")\ X(lesser, "'<' lesser then", "<")\ X(greater, "'>' greater then", ">")\ X(lesser_or_equal, "'<=' lesser then or equal", "<=")\ X(greater_or_equal, "'>=' greater then or equal", ">=")\ X(comma, "',' comma", ",")\ X(dot, "'.' dot", ".")\ X(three_dots, "'...' three dots", "...")\ X(semicolon, "';' semicolon", ";")\ X(colon, "':' colon", ":")\ X(arrow, "'->' arrow", "->")\ X(question, "'?' question mark", "?")\ #define X(KIND, STR, SIMPLE) lex_kind_##KIND, LEX_KIND_XLIST #undef X lex_kind_count, }; typedef enum lex_suffix_t lex_suffix_t; enum lex_suffix_t { #define LEX_SUFFIX_XLIST X(none) X(f) X(d) X(u) X(ul) X(ull) X(l) X(ll) #define X(KIND) lex_suffix_##KIND, LEX_SUFFIX_XLIST #undef X lex_suffix_count, }; typedef struct lex_t lex_t; struct lex_t { lex_kind_t kind; lex_suffix_t suffix; union { struct {char *str; i64 len;}; s8_t string; }; i32 line; i32 column; char *file_name; union { u64 integer; f64 real; char *error; }; }; typedef struct lexer_t lexer_t; struct lexer_t { char *at; char *file_name; i32 line; i32 column; }; typedef struct lex_array_t lex_array_t; struct lex_array_t { lex_t *data; i32 len; }; void lex_panicf(lex_t *token, const char *str, ...) { ma_temp_t scratch = ma_begin_scratch(); S8_FMT(scratch.arena, str, str8); panicf("%s(%d:%d): error: %S", token->file_name, token->line, token->column, str8); ma_end_scratch(scratch); } lexer_t lex_make(char *begin, char *file_name) { lexer_t result = {.at = begin, .file_name = file_name}; return result; } void lex_advance(lexer_t *lex) { if (lex->at[0] == 0) return; if (lex->at[0] == '\n') { lex->column = 0; lex->line += 1; } lex->column += 1; lex->at += 1; } b32 lex_match(lexer_t *lex, char c) { if (lex->at[0] == c) { lex_advance(lex); return true; } return false; } void lex_eat_whitespace(lexer_t *lex) { while (char_is_whitespace(lex->at[0])) lex_advance(lex); } u64 lex_map_char_to_int(char c) { switch (c) { case '0': return 0; break; case '1': return 1; break; case '2': return 2; break; case '3': return 3; break; case '4': return 4; break; case '5': return 5; break; case '6': return 6; break; case '7': return 7; break; case '8': return 8; break; case '9': return 9; break; case 'a': case 'A': return 10; break; case 'b': case 'B': return 11; break; case 'c': case 'C': return 12; break; case 'd': case 'D': return 13; break; case 'e': case 'E': return 14; break; case 'f': case 'F': return 15; break; default: return 255; } } u64 lex_deserial_u64(char *string, i64 len, u64 base) { assert(base >= 2 && base <= 16); u64 acc = 0; for (i64 i = 0; i < len; i++) { u64 num = lex_map_char_to_int(string[i]); if (num >= base) { panicf("invalid number"); break; } acc *= base; acc += num; } return acc; } void lex_eat_number(lexer_t *lex, lex_t *token) { token->kind = lex_kind_integer; for (;;) { if (char_is_digit(lex->at[0])) { lex_advance(lex); continue; } if (lex_match(lex, '.')) { if (token->kind == lex_kind_real) { lex_panicf(token, "multiple '.' periods in floating point number literal"); } token->kind = lex_kind_real; continue; } break; } if (lex_match(lex, 'f')) { token->kind = lex_kind_real; token->suffix = lex_suffix_f; } else if (lex_match(lex, 'd')) { token->kind = lex_kind_real; token->suffix = lex_suffix_d; } else if (token->kind == lex_kind_integer && ((lex->at[0] == 'u' && lex->at[1] == 'l' && lex->at[2] == 'l') || (lex->at[0] == 'U' && lex->at[1] == 'L' && lex->at[2] == 'L'))) { token->suffix = lex_suffix_ull; lex_advance(lex); lex_advance(lex); lex_advance(lex); } else if (token->kind == lex_kind_integer && ((lex->at[0] == 'u' && lex->at[1] == 'l') || (lex->at[0] == 'U' && lex->at[1] == 'L'))) { token->suffix = lex_suffix_ul; lex_advance(lex); lex_advance(lex); } else if (token->kind == lex_kind_integer && (lex->at[0] == 'l' || lex->at[0] == 'L')) { token->suffix = lex_suffix_l; lex_advance(lex); } else if (token->kind == lex_kind_integer && ((lex->at[0] == 'l' && lex->at[1] == 'l') || (lex->at[0] == 'L' && lex->at[1] == 'L'))) { token->suffix = lex_suffix_ll; lex_advance(lex); lex_advance(lex); } } void lex_eat_until(lexer_t *lex, char c) { while (lex->at[0] != c && lex->at[0] != 0) lex_advance(lex); } void lex_eat_string(lexer_t *lex, lex_t *token) { token->kind = lex_kind_string; for (;;) { if (lex_match(lex, token->str[0])) { break; } if (lex->at[0] == 0) { lex_panicf(token, "unclosed string"); } lex_advance(lex); } } #define LEX_CASE3(C1, K1, C2, K2, C3, K3)\ case C1: {\ token->kind = K1;\ if (lex_match(lex, C2)) {\ lex_advance(lex);\ token->kind = K2;\ } else if (lex_match(lex, C3)) {\ lex_advance(lex);\ token->kind = K3;\ }\ } break void lex_token_ex(lexer_t *lex, lex_t *token) { lex_eat_whitespace(lex); *token = (lex_t){.str = lex->at, .file_name = lex->file_name, .line = lex->line, .column = lex->column}; lex_advance(lex); switch (token->str[0]) { case '\0': token->kind = lex_kind_eof; break; case '{': token->kind = lex_kind_open_brace; break; case '}': token->kind = lex_kind_close_brace; break; case '(': token->kind = lex_kind_open_paren; break; case ')': token->kind = lex_kind_close_paren; break; case '[': token->kind = lex_kind_open_bracket; break; case ']': token->kind = lex_kind_close_bracket; break; case '~': token->kind = lex_kind_bit_negation; break; case ';': token->kind = lex_kind_semicolon; break; case ':': token->kind = lex_kind_colon; break; case ',': token->kind = lex_kind_comma; break; case '"': lex_eat_string(lex, token); break; case '`': lex_eat_string(lex, token); break; case '\'': lex_eat_string(lex, token); break; case '.': { token->kind = lex_kind_dot; if (lex->at[0] == '.' && lex->at[1] == '.') { lex_advance(lex); lex_advance(lex); token->kind = lex_kind_three_dots; } } break; case '/': { token->kind = lex_kind_divide; if (lex_match(lex, '/')) { token->kind = lex_kind_comment; lex_eat_until(lex, '\n'); } else if (lex_match(lex, '*')) { token->kind = lex_kind_comment; for (;;) { if (lex->at[0] == '*' && lex->at[1] == '/') { break; } if (lex->at[0] == 0) { lex_panicf(token, "Unclosed block comment"); return; } lex_advance(lex); } lex_advance(lex); lex_advance(lex); } else if (lex_match(lex, '=')) { token->kind = lex_kind_divide_assign; lex_advance(lex); } } break; LEX_CASE3('^', lex_kind_bit_xor, '=', lex_kind_bit_xor_assign, /*ignored option*/'=', lex_kind_bit_xor_assign); LEX_CASE3('=', lex_kind_assign, '=', lex_kind_equals, /*ignored option*/'=', lex_kind_equals); LEX_CASE3('!', lex_kind_negation, '=', lex_kind_not_equals, /*ignored option*/'=', lex_kind_not_equals); LEX_CASE3('%', lex_kind_modulo, '=', lex_kind_modulo_assign, /*ignored option*/'=', lex_kind_modulo_assign); LEX_CASE3('*', lex_kind_multiply, '=', lex_kind_multiply_assign, /*ignored option*/'=', lex_kind_multiply_assign); LEX_CASE3('+', lex_kind_plus, '+', lex_kind_increment, '=', lex_kind_plus_assign); LEX_CASE3('-', lex_kind_minus, '-', lex_kind_decrement, '=', lex_kind_minus_assign); LEX_CASE3('&', lex_kind_bit_and, '&', lex_kind_and, '=', lex_kind_bit_and_assign); LEX_CASE3('|', lex_kind_bit_or, '|', lex_kind_or, '=', lex_kind_bit_or_assign); case '>': { token->kind = lex_kind_greater; if (lex_match(lex, '=')) { token->kind = lex_kind_greater_or_equal; } else if (lex_match(lex, '>')) { token->kind = lex_kind_bit_right_shift; if (lex_match(lex, '=')) { token->kind = lex_kind_bit_right_shift_assign; } } } break; case '<': { token->kind = lex_kind_lesser; if (lex_match(lex, '=')) { token->kind = lex_kind_lesser_or_equal; } else if (lex_match(lex, '<')) { token->kind = lex_kind_bit_left_shift; if (lex_match(lex, '=')) { token->kind = lex_kind_bit_left_shift_assign; } } } break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '0': { lex_eat_number(lex, token); } break; case 'A': case 'a': case 'B': case 'b': case 'C': case 'c': case 'D': case 'd': case 'E': case 'e': case 'F': case 'f': case 'G': case 'g': case 'H': case 'h': case 'I': case 'i': case 'J': case 'j': case 'K': case 'k': case 'L': case 'l': case 'M': case 'm': case 'N': case 'n': case 'O': case 'o': case 'P': case 'p': case 'Q': case 'q': case 'R': case 'r': case 'S': case 's': case 'T': case 't': case 'U': case 'u': case 'V': case 'v': case 'W': case 'w': case 'X': case 'x': case 'Y': case 'y': case 'Z': case 'z': case '_': { token->kind = lex_kind_ident; while (char_is_alphanumeric(lex->at[0]) || lex->at[0] == '_') lex_advance(lex); } break; default: { lex_panicf(token, "found invalid character in the token stream (%d)", token->str[0]); } } token->len = (i32)(lex->at - token->str); if (token->kind == lex_kind_integer) { token->integer = lex_deserial_u64(token->str, token->len, 10); } else if (token->kind == lex_kind_real) { token->real = s8_deserial_f64(token->string); } else if (token->kind == lex_kind_string) { token->str += 1; token->len -= 2; } } lex_t lex_token(lexer_t *lex) { lex_t result = {0}; lex_token_ex(lex, &result); return result; } // @todo: use s8_t instead lex_array_t lex_tokens(ma_arena_t *arena, char *file_name, char *stream) { usize align = arena->align; arena->align = 0; lex_array_t token_array = {0}; lexer_t l = lex_make(stream, file_name); for (;;) { lex_t *token = ma_push_type(arena, lex_t); if (token_array.data == NULL) { token_array.data = token; } token_array.len += 1; do { lex_token_ex(&l, token); } while (token->kind == lex_kind_comment); if (token->kind == lex_kind_eof) break; } arena->align = align; return token_array; } s8_t global_lex_kind_simple_strings[] = { #define X(KIND, STR, SIMPLE) s8_const_lit(SIMPLE), LEX_KIND_XLIST #undef X }; s8_t s8_serial_simple_lex_kind_t(lex_kind_t kind) { assert(kind >= 0 && kind < lex_kind_count); return global_lex_kind_simple_strings[kind]; } s8_t global_lex_kind_strings[] = { #define X(KIND, STR, SIMPLE) s8_const_lit(STR), LEX_KIND_XLIST #undef X }; s8_t s8_serial_lex_kind_t(lex_kind_t kind) { assert(kind >= 0 && kind < lex_kind_count); return global_lex_kind_strings[kind]; } type_member_t members__lex_kind_t[] = { #define X(KIND, STR, SIMPLE) {.name = s8_const_lit("lex_kind_" #KIND), .value = lex_kind_##KIND}, LEX_KIND_XLIST #undef X }; DEFINE_ENUM(lex_kind_t); type_member_t members__lex_suffix_t[] = { #define X(KIND) {.name = s8_const_lit("lex_suffix_" #KIND), .value = lex_suffix_##KIND}, LEX_SUFFIX_XLIST #undef X }; DEFINE_ENUM(lex_suffix_t); // // typedef struct parser_t parser_t; struct parser_t { ma_arena_t *arena; lex_t *at; }; #define parser_make(ARENA, TOKEN) &(parser_t){.arena = ARENA, .at = TOKEN} lex_t *parser_next(parser_t *par) { lex_t *result = par->at; if (result->kind != lex_kind_eof) par->at += 1; return result; } lex_t *parser_match(parser_t *par, lex_kind_t kind) { if (par->at->kind == kind) { return parser_next(par); } else { return NULL; } } lex_t *parser_matchi(parser_t *par, s8_t str) { if (par->at->kind == lex_kind_ident && s8_equal(par->at->string, str)) { return parser_next(par); } else { return NULL; } } lex_t *parser_expect(parser_t *par, lex_kind_t kind) { lex_t *token = parser_match(par, kind); if (!token) lex_panicf(par->at, "expected token kind: %S, got instead: %S", s8_serial_lex_kind_t(kind), s8_serial_lex_kind_t(par->at->kind)); return token; } void parser_eat_until(parser_t *par, lex_kind_t kind) { while (par->at->kind != kind && par->at->kind != lex_kind_eof) { parser_next(par); } } void parser_eat_including(parser_t *par, lex_kind_t kind) { parser_eat_until(par, kind); parser_next(par); }