// Lexer and recursive-descent parser front end for a C-like token stream.
typedef enum lex_kind_t lex_kind_t;
|
|
enum lex_kind_t {
|
|
#define LEX_KIND_XLIST\
|
|
X(eof, "end of file", "---")\
|
|
X(integer, "integer", "---")\
|
|
X(real, "real", "---")\
|
|
X(ident, "identifier", "---")\
|
|
X(string, "string", "---")\
|
|
X(comment, "comment", "---")\
|
|
X(open_brace, "'{' open brace", "{")\
|
|
X(close_brace, "'}' close brace", "}")\
|
|
X(open_paren, "'(' open parenthesis", "(")\
|
|
X(close_paren, "')' close parenthesis", ")")\
|
|
X(open_bracket, "'[' open bracket", "[")\
|
|
X(close_bracket, "']' close bracket", "]")\
|
|
X(plus, "'+' plus", "+")\
|
|
X(minus, "'-' minus", "-")\
|
|
X(divide, "'/' division sign", "/")\
|
|
X(multiply, "'*' multiplication sign", "*")\
|
|
X(modulo, "'%' modulo", "%")\
|
|
X(or, "'||' logical or", "||")\
|
|
X(and, "'&&' logical and", "&&")\
|
|
X(negation, "'!' logical negation", "!")\
|
|
X(bit_negation, "'~' bit negation", "~")\
|
|
X(bit_left_shift, "'<<' bit left shift", "<<")\
|
|
X(bit_right_shift, "'>>' bit right shift", ">>")\
|
|
X(bit_or, "'|' bit or", "|")\
|
|
X(bit_and, "'&' bit and", "&")\
|
|
X(bit_xor, "'^' bit xor", "^")\
|
|
X(decrement, "'--' decrement", "--")\
|
|
X(increment, "'++' increment", "++")\
|
|
X(post_decrement, "'--' post decrement", "--")\
|
|
X(post_increment, "'++' post increment", "++")\
|
|
X(assign, "'=' assignment", "=")\
|
|
X(divide_assign, "'/=' divide assignment", "/=")\
|
|
X(multiply_assign, "'*=' multiply assignment", "*=")\
|
|
X(plus_assign, "'+=' plus assignment", "+=")\
|
|
X(minus_assign, "'-=' minus assignment", "-=")\
|
|
X(modulo_assign, "'%=' modulo assignment", "%=")\
|
|
X(bit_and_assign, "&=", "&=")\
|
|
X(bit_or_assign, "'|=' bit or assignment", "|=")\
|
|
X(bit_xor_assign, "'^=' bit xor assignment", "^=")\
|
|
X(bit_left_shift_assign, "'<<=' bit left shift assignment", "<<=")\
|
|
X(bit_right_shift_assign, "'>>=' bit right shift assignment", ">>=")\
|
|
X(equals, "'==' equals sign", "==")\
|
|
X(not_equals, "'!=' not equals sign", "!=")\
|
|
X(lesser, "'<' lesser then", "<")\
|
|
X(greater, "'>' greater then", ">")\
|
|
X(lesser_or_equal, "'<=' lesser then or equal", "<=")\
|
|
X(greater_or_equal, "'>=' greater then or equal", ">=")\
|
|
X(comma, "',' comma", ",")\
|
|
X(dot, "'.' dot", ".")\
|
|
X(three_dots, "'...' three dots", "...")\
|
|
X(semicolon, "';' semicolon", ";")\
|
|
X(colon, "':' colon", ":")\
|
|
X(arrow, "'->' arrow", "->")\
|
|
X(question, "'?' question mark", "?")\
|
|
|
|
|
|
#define X(KIND, STR, SIMPLE) lex_kind_##KIND,
|
|
LEX_KIND_XLIST
|
|
#undef X
|
|
|
|
lex_kind_count,
|
|
};
|
|
|
|
// Numeric-literal suffixes (C-style): f/d for floating point, u/l
// combinations for integers. X-list order defines the enum values and the
// members__lex_suffix_t reflection table below.
typedef enum lex_suffix_t lex_suffix_t;
enum lex_suffix_t {
#define LEX_SUFFIX_XLIST X(none) X(f) X(d) X(u) X(ul) X(ull) X(l) X(ll)
#define X(KIND) lex_suffix_##KIND,
    LEX_SUFFIX_XLIST
#undef X

    lex_suffix_count, // number of suffix kinds
};
// A single token. The text fields are views into the source buffer (NOT
// NUL-terminated); for string tokens the surrounding quotes are stripped by
// lex_token_ex.
typedef struct lex_t lex_t;
struct lex_t {
    lex_kind_t kind;
    lex_suffix_t suffix; // numeric-literal suffix; lex_suffix_none otherwise

    // Token text. The anonymous struct aliases `string` through the union —
    // NOTE(review): assumes s8_t is layout-compatible with {char *; i64} —
    // confirm against s8_t's definition.
    union {
        struct {char *str; i64 len;};
        s8_t string;
    };

    // Source location for diagnostics (see lex_panicf).
    i32 line;
    i32 column;
    char *file_name;

    // Decoded literal payload; which member is valid depends on `kind`.
    union {
        u64 integer; // lex_kind_integer
        f64 real;    // lex_kind_real
        char *error;
    };
};
// Lexer state: a cursor into a NUL-terminated source buffer plus the
// location bookkeeping maintained by lex_advance.
typedef struct lexer_t lexer_t;
struct lexer_t {
    char *at;        // current read position; *at == 0 means end of input
    char *file_name; // copied into every emitted token for diagnostics
    i32 line;        // incremented on '\n' by lex_advance
    i32 column;      // reset on '\n' by lex_advance
};
// A contiguous array of tokens, produced by lex_tokens. The final element is
// always a lex_kind_eof token.
typedef struct lex_array_t lex_array_t;
struct lex_array_t {
    lex_t *data; // arena-backed, contiguous (see the align trick in lex_tokens)
    i32 len;     // number of tokens, including the trailing eof
};
// Aborts with a printf-style message prefixed by the token's source location
// ("file(line:column): error: ...").
// NOTE(review): S8_FMT is a project macro — presumably it formats `str` with
// the trailing varargs into a local `str8` on the scratch arena; confirm
// against its definition. If panicf does not return, ma_end_scratch below is
// unreachable (harmless, but worth confirming the intent).
void lex_panicf(lex_t *token, const char *str, ...) {
    ma_temp_t scratch = ma_begin_scratch();
    S8_FMT(scratch.arena, str, str8);
    panicf("%s(%d:%d): error: %S", token->file_name, token->line, token->column, str8);
    ma_end_scratch(scratch);
}
lexer_t lex_make(char *begin, char *file_name) {
|
|
lexer_t result = {.at = begin, .file_name = file_name};
|
|
return result;
|
|
}
|
|
|
|
// Consumes exactly one character, maintaining the line/column counters.
// Does nothing at end of input (NUL), so over-advancing is safe.
void lex_advance(lexer_t *lex) {
    char current = lex->at[0];
    if (current == 0) return;
    if (current == '\n') {
        lex->line += 1;
        lex->column = 0;
    }
    lex->column += 1;
    lex->at += 1;
}
// Consumes the next character and returns true iff it equals `c`.
// On mismatch the cursor is left untouched.
b32 lex_match(lexer_t *lex, char c) {
    if (lex->at[0] != c) return false;
    lex_advance(lex);
    return true;
}
// Skips past any run of whitespace characters.
void lex_eat_whitespace(lexer_t *lex) {
    for (;;) {
        if (!char_is_whitespace(lex->at[0])) break;
        lex_advance(lex);
    }
}
u64 lex_map_char_to_int(char c) {
|
|
switch (c) {
|
|
case '0': return 0; break;
|
|
case '1': return 1; break;
|
|
case '2': return 2; break;
|
|
case '3': return 3; break;
|
|
case '4': return 4; break;
|
|
case '5': return 5; break;
|
|
case '6': return 6; break;
|
|
case '7': return 7; break;
|
|
case '8': return 8; break;
|
|
case '9': return 9; break;
|
|
case 'a':
|
|
case 'A': return 10; break;
|
|
case 'b':
|
|
case 'B': return 11; break;
|
|
case 'c':
|
|
case 'C': return 12; break;
|
|
case 'd':
|
|
case 'D': return 13; break;
|
|
case 'e':
|
|
case 'E': return 14; break;
|
|
case 'f':
|
|
case 'F': return 15; break;
|
|
default: return 255;
|
|
}
|
|
}
|
|
|
|
// Parses `len` characters of `string` as an unsigned integer in `base`
// (2..16). Panics on a character that is not a valid digit for the base.
// NOTE(review): no overflow detection — values wider than 64 bits wrap.
u64 lex_deserial_u64(char *string, i64 len, u64 base) {
    assert(base >= 2 && base <= 16);
    u64 value = 0;
    for (i64 index = 0; index < len; index++) {
        u64 digit = lex_map_char_to_int(string[index]);
        if (digit >= base) {
            panicf("invalid number");
            break;
        }
        value = value * base + digit;
    }
    return value;
}
// Scans a numeric literal: a run of digits with optional '.' (which turns the
// token into a real; a second '.' panics), followed by an optional suffix.
// 'f'/'d' force a real; u/l combinations annotate integer literals.
// Note: the token's first digit was already consumed by lex_token_ex.
void lex_eat_number(lexer_t *lex, lex_t *token) {
    token->kind = lex_kind_integer;
    for (;;) {
        if (char_is_digit(lex->at[0])) {
            lex_advance(lex);
            continue;
        }
        if (lex_match(lex, '.')) {
            if (token->kind == lex_kind_real) {
                lex_panicf(token, "multiple '.' periods in floating point number literal");
            }
            token->kind = lex_kind_real;
            continue;
        }
        break;
    }

    // Suffix handling: longer suffixes must be tested before their prefixes.
    if (lex_match(lex, 'f')) {
        token->kind = lex_kind_real;
        token->suffix = lex_suffix_f;
    } else if (lex_match(lex, 'd')) {
        token->kind = lex_kind_real;
        token->suffix = lex_suffix_d;
    } else if (token->kind == lex_kind_integer && ((lex->at[0] == 'u' && lex->at[1] == 'l' && lex->at[2] == 'l') || (lex->at[0] == 'U' && lex->at[1] == 'L' && lex->at[2] == 'L'))) {
        token->suffix = lex_suffix_ull;
        lex_advance(lex); lex_advance(lex); lex_advance(lex);
    } else if (token->kind == lex_kind_integer && ((lex->at[0] == 'u' && lex->at[1] == 'l') || (lex->at[0] == 'U' && lex->at[1] == 'L'))) {
        token->suffix = lex_suffix_ul;
        lex_advance(lex); lex_advance(lex);
    } else if (token->kind == lex_kind_integer && (lex->at[0] == 'u' || lex->at[0] == 'U')) {
        // @fix: a bare 'u' suffix was previously never recognized even though
        // lex_suffix_u exists.
        token->suffix = lex_suffix_u;
        lex_advance(lex);
    } else if (token->kind == lex_kind_integer && ((lex->at[0] == 'l' && lex->at[1] == 'l') || (lex->at[0] == 'L' && lex->at[1] == 'L'))) {
        // @fix: 'll' must be tested before 'l'; previously the single-'l'
        // branch matched first, making lex_suffix_ll unreachable.
        token->suffix = lex_suffix_ll;
        lex_advance(lex); lex_advance(lex);
    } else if (token->kind == lex_kind_integer && (lex->at[0] == 'l' || lex->at[0] == 'L')) {
        token->suffix = lex_suffix_l;
        lex_advance(lex);
    }
}
// Advances until the next occurrence of `c` (left unconsumed) or end of input.
void lex_eat_until(lexer_t *lex, char c) {
    for (;;) {
        char current = lex->at[0];
        if (current == c || current == 0) break;
        lex_advance(lex);
    }
}
// Consumes a quoted string. token->str[0] holds the opening quote character
// (already consumed by the caller); scanning runs to the matching close
// quote. Panics if the input ends first.
// NOTE(review): no escape-sequence handling — a quote always terminates.
void lex_eat_string(lexer_t *lex, lex_t *token) {
    token->kind = lex_kind_string;
    char quote = token->str[0];
    while (!lex_match(lex, quote)) {
        if (lex->at[0] == 0) {
            lex_panicf(token, "unclosed string");
        }
        lex_advance(lex);
    }
}
// Switch case for a one-character token C1 that may extend to the
// two-character tokens C1 C2 or C1 C3 (e.g. '+', "++", "+=").
// @fix: removed the stray lex_advance after each lex_match — lex_match
// already consumes the matched character (see lex_match), so the extra
// advance swallowed one character beyond the token, corrupting every
// two-character operator. The hand-written '<' / '>' cases in lex_token_ex
// show the correct pattern.
#define LEX_CASE3(C1, K1, C2, K2, C3, K3)\
case C1: {\
    token->kind = K1;\
    if (lex_match(lex, C2)) {\
        token->kind = K2;\
    } else if (lex_match(lex, C3)) {\
        token->kind = K3;\
    }\
} break
// Scans one token from `lex` into `*token`.
//
// Records the token's source text (str/len point into the source buffer) and
// location, dispatches on the first character, then post-processes literals:
// integers are decoded into token->integer, reals into token->real, and
// string tokens have their surrounding quotes stripped from str/len.
void lex_token_ex(lexer_t *lex, lex_t *token) {
    lex_eat_whitespace(lex);
    *token = (lex_t){.str = lex->at, .file_name = lex->file_name, .line = lex->line, .column = lex->column};
    lex_advance(lex);

    switch (token->str[0]) {
        case '\0': token->kind = lex_kind_eof; break;
        case '{': token->kind = lex_kind_open_brace; break;
        case '}': token->kind = lex_kind_close_brace; break;
        case '(': token->kind = lex_kind_open_paren; break;
        case ')': token->kind = lex_kind_close_paren; break;
        case '[': token->kind = lex_kind_open_bracket; break;
        case ']': token->kind = lex_kind_close_bracket; break;
        case '~': token->kind = lex_kind_bit_negation; break;
        case ';': token->kind = lex_kind_semicolon; break;
        case ':': token->kind = lex_kind_colon; break;
        case ',': token->kind = lex_kind_comma; break;
        // @fix: '?' is declared in the enum/string tables but previously fell
        // through to the invalid-character panic.
        case '?': token->kind = lex_kind_question; break;
        case '"': lex_eat_string(lex, token); break;
        case '`': lex_eat_string(lex, token); break;
        case '\'': lex_eat_string(lex, token); break;

        case '.': {
            token->kind = lex_kind_dot;
            // "..." needs two more dots after the one already consumed.
            if (lex->at[0] == '.' && lex->at[1] == '.') {
                lex_advance(lex);
                lex_advance(lex);
                token->kind = lex_kind_three_dots;
            }
        } break;

        case '/': {
            token->kind = lex_kind_divide;
            if (lex_match(lex, '/')) {
                // Line comment: runs to end of line; newline not consumed.
                token->kind = lex_kind_comment;
                lex_eat_until(lex, '\n');
            } else if (lex_match(lex, '*')) {
                // Block comment: scan to the closing "*/"; panic on EOF.
                token->kind = lex_kind_comment;
                for (;;) {
                    if (lex->at[0] == '*' && lex->at[1] == '/') {
                        break;
                    }
                    if (lex->at[0] == 0) {
                        lex_panicf(token, "Unclosed block comment");
                        return;
                    }
                    lex_advance(lex);
                }
                lex_advance(lex); // '*'
                lex_advance(lex); // '/'
            } else if (lex_match(lex, '=')) {
                // @fix: removed a stray lex_advance here — lex_match already
                // consumed the '=', so the extra advance swallowed the next
                // character after "/=".
                token->kind = lex_kind_divide_assign;
            }
        } break;

        LEX_CASE3('^', lex_kind_bit_xor, '=', lex_kind_bit_xor_assign, /*ignored option*/'=', lex_kind_bit_xor_assign);
        LEX_CASE3('=', lex_kind_assign, '=', lex_kind_equals, /*ignored option*/'=', lex_kind_equals);
        LEX_CASE3('!', lex_kind_negation, '=', lex_kind_not_equals, /*ignored option*/'=', lex_kind_not_equals);
        LEX_CASE3('%', lex_kind_modulo, '=', lex_kind_modulo_assign, /*ignored option*/'=', lex_kind_modulo_assign);
        LEX_CASE3('*', lex_kind_multiply, '=', lex_kind_multiply_assign, /*ignored option*/'=', lex_kind_multiply_assign);
        LEX_CASE3('+', lex_kind_plus, '+', lex_kind_increment, '=', lex_kind_plus_assign);
        LEX_CASE3('&', lex_kind_bit_and, '&', lex_kind_and, '=', lex_kind_bit_and_assign);
        LEX_CASE3('|', lex_kind_bit_or, '|', lex_kind_or, '=', lex_kind_bit_or_assign);

        case '-': {
            // Hand-written (instead of LEX_CASE3) because '-' has three
            // possible continuations: "--", "-=", and "->".
            // @fix: lex_kind_arrow was previously never produced.
            token->kind = lex_kind_minus;
            if (lex_match(lex, '-')) {
                token->kind = lex_kind_decrement;
            } else if (lex_match(lex, '=')) {
                token->kind = lex_kind_minus_assign;
            } else if (lex_match(lex, '>')) {
                token->kind = lex_kind_arrow;
            }
        } break;

        case '>': {
            token->kind = lex_kind_greater;
            if (lex_match(lex, '=')) {
                token->kind = lex_kind_greater_or_equal;
            } else if (lex_match(lex, '>')) {
                token->kind = lex_kind_bit_right_shift;
                if (lex_match(lex, '=')) {
                    token->kind = lex_kind_bit_right_shift_assign;
                }
            }
        } break;

        case '<': {
            token->kind = lex_kind_lesser;
            if (lex_match(lex, '=')) {
                token->kind = lex_kind_lesser_or_equal;
            } else if (lex_match(lex, '<')) {
                token->kind = lex_kind_bit_left_shift;
                if (lex_match(lex, '=')) {
                    token->kind = lex_kind_bit_left_shift_assign;
                }
            }
        } break;

        case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '0': {
            lex_eat_number(lex, token);
        } break;

        case 'A': case 'a': case 'B': case 'b': case 'C': case 'c':
        case 'D': case 'd': case 'E': case 'e': case 'F': case 'f':
        case 'G': case 'g': case 'H': case 'h': case 'I': case 'i':
        case 'J': case 'j': case 'K': case 'k': case 'L': case 'l':
        case 'M': case 'm': case 'N': case 'n': case 'O': case 'o':
        case 'P': case 'p': case 'Q': case 'q': case 'R': case 'r':
        case 'S': case 's': case 'T': case 't': case 'U': case 'u':
        case 'V': case 'v': case 'W': case 'w': case 'X': case 'x':
        case 'Y': case 'y': case 'Z': case 'z': case '_': {
            token->kind = lex_kind_ident;
            while (char_is_alphanumeric(lex->at[0]) || lex->at[0] == '_') lex_advance(lex);
        } break;

        default: {
            lex_panicf(token, "found invalid character in the token stream (%d)", token->str[0]);
        }
    }

    token->len = (i32)(lex->at - token->str);

    if (token->kind == lex_kind_integer) {
        token->integer = lex_deserial_u64(token->str, token->len, 10);
    } else if (token->kind == lex_kind_real) {
        // NOTE(review): token->string includes any 'f'/'d' suffix character;
        // assumes s8_deserial_f64 tolerates trailing non-digits — confirm.
        token->real = s8_deserial_f64(token->string);
    } else if (token->kind == lex_kind_string) {
        // Strip the surrounding quotes from the stored view.
        token->str += 1;
        token->len -= 2;
    }
}
// Convenience wrapper around lex_token_ex returning the token by value.
lex_t lex_token(lexer_t *lex) {
    lex_t token = {0};
    lex_token_ex(lex, &token);
    return token;
}
// @todo: use s8_t instead
// Tokenizes `stream` into a contiguous array on `arena`. Comment tokens are
// discarded (re-lexed into the same slot); the array always ends with an
// eof token.
lex_array_t lex_tokens(ma_arena_t *arena, char *file_name, char *stream) {
    // Temporarily disable arena alignment so consecutive ma_push_type calls
    // produce a contiguous lex_t array. NOTE(review): assumes nothing else
    // pushes onto this arena during the loop — confirm.
    usize align = arena->align;
    arena->align = 0;

    lex_array_t token_array = {0};
    lexer_t l = lex_make(stream, file_name);
    for (;;) {
        lex_t *token = ma_push_type(arena, lex_t);
        // First pushed token is the start of the array.
        if (token_array.data == NULL) {
            token_array.data = token;
        }
        token_array.len += 1;

        // Skip comments by overwriting the same slot until a real token.
        do {
            lex_token_ex(&l, token);
        } while (token->kind == lex_kind_comment);
        if (token->kind == lex_kind_eof) break;
    }

    arena->align = align; // restore caller's alignment
    return token_array;
}
// "Simple" spellings ("+" for plus, "---" for non-punctuation kinds), indexed
// by lex_kind_t. Generated from the same X-list as the enum, so order matches.
s8_t global_lex_kind_simple_strings[] = {
#define X(KIND, STR, SIMPLE) s8_const_lit(SIMPLE),
    LEX_KIND_XLIST
#undef X
};
// Returns the simple spelling of `kind` (e.g. "+"). Asserts a valid kind.
s8_t s8_serial_simple_lex_kind_t(lex_kind_t kind) {
    assert(kind >= 0 && kind < lex_kind_count);
    s8_t result = global_lex_kind_simple_strings[kind];
    return result;
}
// Descriptive names used in diagnostics (see parser_expect), indexed by
// lex_kind_t. Generated from the same X-list as the enum, so order matches.
s8_t global_lex_kind_strings[] = {
#define X(KIND, STR, SIMPLE) s8_const_lit(STR),
    LEX_KIND_XLIST
#undef X
};
// Returns the descriptive name of `kind` (e.g. "'+' plus"). Asserts a valid kind.
s8_t s8_serial_lex_kind_t(lex_kind_t kind) {
    assert(kind >= 0 && kind < lex_kind_count);
    s8_t result = global_lex_kind_strings[kind];
    return result;
}
|
// Reflection table: name/value pairs for every lex_kind_t constant, consumed
// by the project's DEFINE_ENUM machinery.
type_member_t members__lex_kind_t[] = {
#define X(KIND, STR, SIMPLE) {.name = s8_const_lit("lex_kind_" #KIND), .value = lex_kind_##KIND},
    LEX_KIND_XLIST
#undef X
};
DEFINE_ENUM(lex_kind_t);
// Reflection table: name/value pairs for every lex_suffix_t constant,
// consumed by the project's DEFINE_ENUM machinery.
type_member_t members__lex_suffix_t[] = {
#define X(KIND) {.name = s8_const_lit("lex_suffix_" #KIND), .value = lex_suffix_##KIND},
    LEX_SUFFIX_XLIST
#undef X
};
DEFINE_ENUM(lex_suffix_t);
//
// Parser: a thin cursor over the contiguous token array built by lex_tokens.
//
typedef struct parser_t parser_t;
struct parser_t {
    ma_arena_t *arena; // arena for parser-side allocations
    lex_t *at;         // current token; advanced by parser_next, pinned at eof
};

// Builds a parser from a compound literal. NOTE(review): the returned
// pointer refers to a temporary whose lifetime is the enclosing block —
// do not store it beyond the current scope.
#define parser_make(ARENA, TOKEN) &(parser_t){.arena = ARENA, .at = TOKEN}
// Returns the current token and advances, except at eof where the cursor
// stays pinned (so repeated calls keep returning the eof token).
lex_t *parser_next(parser_t *par) {
    lex_t *token = par->at;
    if (token->kind != lex_kind_eof) {
        par->at += 1;
    }
    return token;
}
// Consumes and returns the current token iff its kind matches; NULL otherwise.
lex_t *parser_match(parser_t *par, lex_kind_t kind) {
    return (par->at->kind == kind) ? parser_next(par) : NULL;
}
// Consumes and returns the current token iff it is an identifier whose text
// equals `str`; NULL otherwise.
lex_t *parser_matchi(parser_t *par, s8_t str) {
    if (par->at->kind != lex_kind_ident) return NULL;
    if (!s8_equal(par->at->string, str)) return NULL;
    return parser_next(par);
}
// Like parser_match, but panics with a diagnostic when the kind differs.
lex_t *parser_expect(parser_t *par, lex_kind_t kind) {
    lex_t *token = parser_match(par, kind);
    if (token == NULL) {
        lex_panicf(par->at, "expected token kind: %S, got instead: %S", s8_serial_lex_kind_t(kind), s8_serial_lex_kind_t(par->at->kind));
    }
    return token;
}
// Skips tokens until one of kind `kind` (left unconsumed) or eof is reached.
void parser_eat_until(parser_t *par, lex_kind_t kind) {
    for (;;) {
        lex_kind_t current = par->at->kind;
        if (current == kind || current == lex_kind_eof) break;
        parser_next(par);
    }
}
// Skips tokens up to and including the next token of kind `kind`.
// At eof the trailing parser_next is a no-op (the cursor pins — see
// parser_next), so this is safe even when `kind` is never found.
void parser_eat_including(parser_t *par, lex_kind_t kind) {
    parser_eat_until(par, kind);
    parser_next(par);
}