// Implementation of the clexer C tokenizer: character classification helpers,
// the main tokenizer (CL_Next), include-path resolution and token/message
// stringification. Types such as CL_Lexer, CL_Token, CL_Message and helpers
// such as CL_StringsAreEqual / CL_StringLength come from "clexer.h".
#include "clexer.h"
#include <stdarg.h>

// Every hook below can be overridden by defining the macro before this file
// is compiled; the defaults map onto the C standard library.
#ifndef CL_PRIVATE_FUNCTION
    #if defined(__GNUC__) || defined(__clang__)
        #define CL_PRIVATE_FUNCTION __attribute__((unused)) static
    #else
        #define CL_PRIVATE_FUNCTION static
    #endif
#endif

#ifndef CL_Allocate
    #include <stdlib.h>
    #define CL_Allocate(allocator, size) malloc(size)
#endif

#ifndef CL_STRING_TO_DOUBLE
    #include <stdlib.h>
    #define CL_STRING_TO_DOUBLE(str, len) strtod(str, 0)
#endif

#ifndef CL_ASSERT
    #include <assert.h>
    #define CL_ASSERT(x) assert(x)
#endif

#ifndef CL_VSNPRINTF
    #include <stdio.h>
    #define CL_VSNPRINTF vsnprintf
#endif

#ifndef CL_SNPRINTF
    #include <stdio.h>
    #define CL_SNPRINTF snprintf
#endif

#ifndef CL__MemoryCopy
    #include <string.h>
    #define CL__MemoryCopy(dst, src, s) memcpy(dst, src, s)
#endif

#ifndef CL_MemoryZero
    #include <string.h>
    #define CL_MemoryZero(p, size) memset(p, 0, size)
#endif

#ifndef CL_FileExists
    #define CL_FileExists CL__FileExists
    #include <stdio.h>
// Default file-existence probe: a file "exists" if it can be opened for reading.
CL_PRIVATE_FUNCTION bool CL_FileExists(char *name) {
    bool result = false;
    FILE *f = fopen(name, "rb");
    if (f) {
        result = true;
        fclose(f);
    }
    return result;
}
#endif

CL_PRIVATE_FUNCTION void CL_ReportError(CL_Lexer *T, CL_Token *token, const char *string, ...);

// Allocates size + 1 bytes from the allocator and returns a null terminated
// copy of the first `size` bytes of `p`.
CL_PRIVATE_FUNCTION char *CL_PushStringCopy(CL_Allocator arena, char *p, int size) {
    char *copy_buffer = (char *)CL_Allocate(arena, size + 1);
    CL__MemoryCopy(copy_buffer, p, size);
    copy_buffer[size] = 0;
    return copy_buffer;
}

// Moves the stream forward by one byte and updates line / column.
// Never advances past the terminating zero.
// NOTE(review): column is only incremented for ' ' and '\t', not for other
// characters - confirm whether that is intended.
CL_INLINE void CL_Advance(CL_Lexer *T) {
    if (*T->stream == '\n') {
        T->line += 1;
        T->column = 0;
    }
    else if (*T->stream == ' ') {
        T->column += 1;
    }
    else if (*T->stream == '\t') {
        T->column += 1;
    }
    else if (*T->stream == 0) {
        return;
    }
    T->stream += 1;
}

CL_INLINE bool CL_IsAlphabetic(char c) {
    bool result = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    return result;
}

CL_INLINE bool CL_IsNumeric(char c) {
    bool result = (c >= '0' && c <= '9');
    return result;
}

CL_INLINE bool CL_IsHexNumeric(char c) {
    bool result = (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
    return result;
}

CL_INLINE bool CL_IsWhitespace(char c) {
    bool result = c == ' ' || c == '\n' || c == '\r' || c == '\t';
    return result;
}

CL_INLINE bool CL_IsAlphanumeric(char c) {
    bool result = CL_IsAlphabetic(c) || CL_IsNumeric(c);
    return result;
}

// Sets token->len to the distance between token->str and the current stream position.
CL_API_FUNCTION void CL_SetTokenLength(CL_Lexer *T, CL_Token *token) {
    intptr_t diff = T->stream - token->str;
    CL_ASSERT(diff < 2147483647);
    token->len = (int)diff;
}

// Maps a hexadecimal digit character to its value, returns 255 for anything else.
CL_PRIVATE_FUNCTION uint64_t CL_CharMapToNumber(char c) {
    switch (c) {
    case '0': return 0;
    case '1': return 1;
    case '2': return 2;
    case '3': return 3;
    case '4': return 4;
    case '5': return 5;
    case '6': return 6;
    case '7': return 7;
    case '8': return 8;
    case '9': return 9;
    case 'a':
    case 'A': return 10;
    case 'b':
    case 'B': return 11;
    case 'c':
    case 'C': return 12;
    case 'd':
    case 'D': return 13;
    case 'e':
    case 'E': return 14;
    case 'f':
    case 'F': return 15;
    default: return 255;
    }
}

// Accumulates `len` digits of `string` in the given base (2..16).
// A digit that does not fit the base is reported as an error on the token and
// stops the parse; the value accumulated so far is returned.
// Overflow of the 64 bit accumulator is not detected.
CL_PRIVATE_FUNCTION uint64_t CL_ParseInteger(CL_Lexer *T, CL_Token *token, char *string, uint64_t len, uint64_t base) {
    CL_ASSERT(base >= 2 && base <= 16);
    uint64_t acc = 0;
    for (uint64_t i = 0; i < len; i++) {
        uint64_t num = CL_CharMapToNumber(string[i]);
        if (num >= base) {
            CL_ReportError(T, token, "Internal compiler error! Failed to parse a number");
            break;
        }
        acc *= base;
        acc += num;
    }
    return acc;
}

typedef struct CL_UTF32Result {
    uint32_t out_str; // decoded code point
    int advance;      // number of input bytes consumed, 0 on error
    int error;        // 0 on success, otherwise non zero
} CL_UTF32Result;

// Decodes a single UTF-8 sequence starting at `c`. `max_advance` is the number
// of bytes that may be read; continuation bytes are only inspected when
// `max_advance` says they are available.
CL_PRIVATE_FUNCTION CL_UTF32Result CL_UTF8ToUTF32(char *c, int max_advance) {
    CL_UTF32Result result = {0};

    if ((c[0] & 0x80) == 0) { // Top bit unset: single byte sequence
        if (max_advance >= 1) {
            result.out_str = c[0];
            result.advance = 1;
        }
        else result.error = 1;
    }

    else if ((c[0] & 0xe0) == 0xc0) {
        // One continuation byte required
        if (max_advance >= 2 && (c[1] & 0xc0) == 0x80) {
            result.out_str = (uint32_t)(c[0] & 0x1f) << 6u | (c[1] & 0x3f);
            result.advance = 2;
        }
        else result.error = 2;
    }

    else if ((c[0] & 0xf0) == 0xe0) {
        // Two continuation bytes required
        if (max_advance >= 3 && (c[1] & 0xc0) == 0x80 && (c[2] & 0xc0) == 0x80) {
            result.out_str = (uint32_t)(c[0] & 0xf) << 12u | (uint32_t)(c[1] & 0x3f) << 6u | (c[2] & 0x3f);
            result.advance = 3;
        }
        else result.error = 3;
    }

    else if ((c[0] & 0xf8) == 0xf0) {
        // Three continuation bytes required
        if (max_advance >= 4 && (c[1] & 0xc0) == 0x80 && (c[2] & 0xc0) == 0x80 && (c[3] & 0xc0) == 0x80) {
            result.out_str = (uint32_t)(c[0] & 0xf) << 18u | (uint32_t)(c[1] & 0x3f) << 12u | (uint32_t)(c[2] & 0x3f) << 6u | (uint32_t)(c[3] & 0x3f);
            result.advance = 4;
        }
        else result.error = 4;
    }
    else result.error = 4;

    return result;
}

// Lexes a character literal. Expects the stream to point right after the
// opening quote. token->str / token->len cover the text between the quotes and
// token->u64 receives the value.
// Hex ('\x..') and unicode ('\u..') escapes are not implemented and assert.
// @todo I think I should look at this again
CL_PRIVATE_FUNCTION void CL_ParseCharLiteral(CL_Lexer *T, CL_Token *token) {
    token->kind = CL_CHARLIT;
    token->str = T->stream;
    while (*T->stream != '\'') {
        if (*T->stream == '\\') {
            CL_Advance(T); // skip the backslash so an escaped quote doesn't end the literal
        }
        if (*T->stream == 0) {
            CL_ReportError(T, token, "Unclosed character literal!");
            return;
        }
        CL_Advance(T);
    }
    CL_SetTokenLength(T, token);

    if (token->str[0] == '\\') {
        switch (token->str[1]) {
        case '\\': token->u64 = '\\'; break;
        case '\'': token->u64 = '\''; break;
        case '"': token->u64 = '"'; break;
        case 't': token->u64 = '\t'; break;
        case 'v': token->u64 = '\v'; break;
        case 'f': token->u64 = '\f'; break;
        case 'n': token->u64 = '\n'; break;
        case 'r': token->u64 = '\r'; break;
        case 'a': token->u64 = '\a'; break;
        case 'b': token->u64 = '\b'; break;
        case '0': token->u64 = '\0'; break;
        case 'x':
        case 'X': CL_ASSERT(!"Not implemented"); break; // Hex constant
        case 'u': CL_ASSERT(!"Not implemented"); break; // Unicode constant
        default: {
            CL_ReportError(T, token, "Unknown escape code");
        }
        }
    }
    else {
        if (token->len > 4) {
            CL_ReportError(T, token, "This character literal has invalid format, it's too big");
            goto skip_utf_encode;
        }

        // Every decoded character is shifted left by 8 bits for each byte of
        // the literal that still follows it, so 'ab' becomes ('a' << 8) | 'b'.
        token->u64 = 0;
        int i = 0;
        for (; i < token->len;) {
            // Only the bytes that are still inside of the literal may be read.
            CL_UTF32Result result = CL_UTF8ToUTF32(token->str + i, (int)token->len - i);
            if (result.error) {
                CL_ReportError(T, token, "This character literal couldnt be parsed as utf8");
                break;
            }
            i += result.advance;
            token->u64 |= result.out_str << (8 * (token->len - i));
        }
        if (i != token->len) {
            CL_ReportError(T, token, "Character literal decode error");
        }
    }

skip_utf_encode:
    CL_Advance(T); // closing quote
}

// It combines strings, verifies the escape sequences but doesn't do any allocations
// so the final string actually needs additional transformation pass. A pass
// that will combine the string snippets, replace escape sequences with actual values etc.
//
// @warning: @not_sure: we are not setting token->string_literal
//
// "String 1" "String 2" - those strings snippets are combined
// @todo: look at this again
// @todo: make a manual correct version that user can execute if he needs to
//
// AND_CL_STRING_TERMINATE_ON_NEW_LINE is not defined in this file; presumably
// it expands either to nothing or to an additional loop condition - confirm
// against clexer.h.
CL_PRIVATE_FUNCTION void CL_CheckString(CL_Lexer *T, CL_Token *token) {
    token->kind = CL_STRINGLIT;
combine_next_string_literal:
    while (*T->stream != '"' && *T->stream != 0 AND_CL_STRING_TERMINATE_ON_NEW_LINE) {
        if (*T->stream == '\\') {
            CL_Advance(T);
            switch (*T->stream) {
            case 'a':
            case 'b':
            case 'e':
            case 'f':
            case 'n':
            case 'r':
            case 't':
            case 'v':
            case '\\':
            case '\'':
            case '?':
            case '"':
            case 'x':
            case 'X': // Hex constant
            case 'u': // Unicode constant
            case 'U':
                break;
            case '0': // octal numbers or null
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
                break;
            default: {
                CL_ReportError(T, token, "Invalid escape sequence");
                return;
            }
            }
        }
        CL_Advance(T);
    }
    CL_Advance(T);

    // Try to seek if there is a consecutive string.
    // If there is such string we try to combine it.
    {
        char *seek_for_next_string = T->stream;
        while (CL_IsWhitespace(*seek_for_next_string)) {
            seek_for_next_string += 1;
        }

        if (*seek_for_next_string == '"') {
            seek_for_next_string += 1;
            // Walk with CL_Advance so that line / column stay in sync.
            while (T->stream != seek_for_next_string) CL_Advance(T);
            goto combine_next_string_literal;
        }
    }
    CL_SetTokenLength(T, token);
}

// If the identifier held by the token spells a C keyword, token->kind is
// changed to the matching CL_KEYWORD_* kind, otherwise the token is untouched.
// The first two characters are used to narrow down the candidates.
CL_PRIVATE_FUNCTION void CL_IsIdentifierKeyword(CL_Token *token) {
    if (token->len == 1) return;
    char *c = token->str;
    switch (c[0]) {
    case 'v': {
        switch (c[1]) {
        case 'o': {
            if (CL_StringsAreEqual(token->str, token->len, "void", 4)) {
                token->kind = CL_KEYWORD_VOID;
            }
            else if (CL_StringsAreEqual(token->str, token->len, "volatile", 8)) {
                token->kind = CL_KEYWORD_VOLATILE;
            }
        } break;
        }
    } break;
    case 'i': {
        switch (c[1]) {
        case 'n': {
            if (CL_StringsAreEqual(token->str, token->len, "int", 3)) {
                token->kind = CL_KEYWORD_INT;
            }
            else if (CL_StringsAreEqual(token->str, token->len, "inline", 6)) {
                token->kind = CL_KEYWORD_INLINE;
            }
        } break;
        case 'f': {
            if (CL_StringsAreEqual(token->str, token->len, "if", 2)) {
                token->kind = CL_KEYWORD_IF;
            }
        } break;
        }
    } break;
    case 'c': {
        switch (c[1]) {
        case 'h': {
            if (CL_StringsAreEqual(token->str, token->len, "char", 4)) {
                token->kind = CL_KEYWORD_CHAR;
            }
        } break;
        case 'o': {
            if (CL_StringsAreEqual(token->str, token->len, "const", 5)) {
                token->kind = CL_KEYWORD_CONST;
            }
            else if (CL_StringsAreEqual(token->str, token->len, "continue", 8)) {
                token->kind = CL_KEYWORD_CONTINUE;
            }
        } break;
        case 'a': {
            if (CL_StringsAreEqual(token->str, token->len, "case", 4)) {
                token->kind = CL_KEYWORD_CASE;
            }
        } break;
        }
    } break;
    case 'u': {
        switch (c[1]) {
        case 'n': {
            if (CL_StringsAreEqual(token->str, token->len, "unsigned", 8)) {
                token->kind = CL_KEYWORD_UNSIGNED;
            }
            else if (CL_StringsAreEqual(token->str, token->len, "union", 5)) {
                token->kind = CL_KEYWORD_UNION;
            }
        } break;
        }
    } break;
    case 's': {
        switch (c[1]) {
        case 'i': {
            if (CL_StringsAreEqual(token->str, token->len, "signed", 6)) {
                token->kind = CL_KEYWORD_SIGNED;
            }
            else if (CL_StringsAreEqual(token->str, token->len, "sizeof", 6)) {
                token->kind = CL_KEYWORD_SIZEOF;
            }
        } break;
        case 'h': {
            if (CL_StringsAreEqual(token->str, token->len, "short", 5)) {
                token->kind = CL_KEYWORD_SHORT;
            }
        } break;
        case 't': {
            if (CL_StringsAreEqual(token->str, token->len, "static", 6)) {
                token->kind = CL_KEYWORD_STATIC;
            }
            else if (CL_StringsAreEqual(token->str, token->len, "struct", 6)) {
                token->kind = CL_KEYWORD_STRUCT;
            }
        } break;
        case 'w': {
            if (CL_StringsAreEqual(token->str, token->len, "switch", 6)) {
                token->kind = CL_KEYWORD_SWITCH;
            }
        } break;
        }
    } break;
    case 'l': {
        switch (c[1]) {
        case 'o': {
            if (CL_StringsAreEqual(token->str, token->len, "long", 4)) {
                token->kind = CL_KEYWORD_LONG;
            }
        } break;
        }
    } break;
    case 'd': {
        switch (c[1]) {
        case 'o': {
            if (CL_StringsAreEqual(token->str, token->len, "double", 6)) {
                token->kind = CL_KEYWORD_DOUBLE;
            }
            else if (CL_StringsAreEqual(token->str, token->len, "do", 2)) {
                token->kind = CL_KEYWORD_DO;
            }
        } break;
        case 'e': {
            if (CL_StringsAreEqual(token->str, token->len, "default", 7)) {
                token->kind = CL_KEYWORD_DEFAULT;
            }
        } break;
        }
    } break;
    case 'f': {
        switch (c[1]) {
        case 'l': {
            if (CL_StringsAreEqual(token->str, token->len, "float", 5)) {
                token->kind = CL_KEYWORD_FLOAT;
            }
        } break;
        case 'o': {
            if (CL_StringsAreEqual(token->str, token->len, "for", 3)) {
                token->kind = CL_KEYWORD_FOR;
            }
        } break;
        }
    } break;
    case '_': {
        switch (c[1]) {
        case 'B': {
            if (CL_StringsAreEqual(token->str, token->len, "_Bool", 5)) {
                token->kind = CL_KEYWORD__BOOL;
            }
        } break;
        case 'C': {
            if (CL_StringsAreEqual(token->str, token->len, "_Complex", 8)) {
                token->kind = CL_KEYWORD__COMPLEX;
            }
        } break;
        case 'I': {
            if (CL_StringsAreEqual(token->str, token->len, "_Imaginary", 10)) {
                token->kind = CL_KEYWORD__IMAGINARY;
            }
        } break;
        case 'T': {
            if (CL_StringsAreEqual(token->str, token->len, "_Thread_local", 13)) {
                token->kind = CL_KEYWORD__THREAD_LOCAL;
            }
        } break;
        case 'A': {
            if (CL_StringsAreEqual(token->str, token->len, "_Atomic", 7)) {
                token->kind = CL_KEYWORD__ATOMIC;
            }
            else if (CL_StringsAreEqual(token->str, token->len, "_Alignas", 8)) {
                token->kind = CL_KEYWORD__ALIGNAS;
            }
            else if (CL_StringsAreEqual(token->str, token->len, "_Alignof", 8)) {
                token->kind = CL_KEYWORD__ALIGNOF;
            }
        } break;
        case 'N': {
            if (CL_StringsAreEqual(token->str, token->len, "_Noreturn", 9)) {
                token->kind = CL_KEYWORD__NORETURN;
            }
        } break;
        case 'S': {
            if (CL_StringsAreEqual(token->str, token->len, "_Static_assert", 14)) {
                token->kind = CL_KEYWORD__STATIC_ASSERT;
            }
        } break;
        case 'G': {
            if (CL_StringsAreEqual(token->str, token->len, "_Generic", 8)) {
                token->kind = CL_KEYWORD__GENERIC;
            }
        } break;
        }
    } break;
    case 'a': {
        switch (c[1]) {
        case 'u': {
            if (CL_StringsAreEqual(token->str, token->len, "auto", 4)) {
                token->kind = CL_KEYWORD_AUTO;
            }
        } break;
        }
    } break;
    case 'e': {
        switch (c[1]) {
        case 'x': {
            if (CL_StringsAreEqual(token->str, token->len, "extern", 6)) {
                token->kind = CL_KEYWORD_EXTERN;
            }
        } break;
        case 'n': {
            if (CL_StringsAreEqual(token->str, token->len, "enum", 4)) {
                token->kind = CL_KEYWORD_ENUM;
            }
        } break;
        case 'l': {
            if (CL_StringsAreEqual(token->str, token->len, "else", 4)) {
                token->kind = CL_KEYWORD_ELSE;
            }
        } break;
        }
    } break;
    case 'r': {
        switch (c[1]) {
        case 'e': {
            if (CL_StringsAreEqual(token->str, token->len, "register", 8)) {
                token->kind = CL_KEYWORD_REGISTER;
            }
            else if (CL_StringsAreEqual(token->str, token->len, "restrict", 8)) {
                token->kind = CL_KEYWORD_RESTRICT;
            }
            else if (CL_StringsAreEqual(token->str, token->len, "return", 6)) {
                token->kind = CL_KEYWORD_RETURN;
            }
        } break;
        }
    } break;
    case 't': {
        switch (c[1]) {
        case 'y': {
            if (CL_StringsAreEqual(token->str, token->len, "typedef", 7)) {
                token->kind = CL_KEYWORD_TYPEDEF;
            }
        } break;
        }
    } break;
    case 'b': {
        switch (c[1]) {
        case 'r': {
            if (CL_StringsAreEqual(token->str, token->len, "break", 5)) {
                token->kind = CL_KEYWORD_BREAK;
            }
        } break;
        }
    } break;
    case 'w': {
        switch (c[1]) {
        case 'h': {
            if (CL_StringsAreEqual(token->str, token->len, "while", 5)) {
                token->kind = CL_KEYWORD_WHILE;
            }
        } break;
        }
    } break;
    case 'g': {
        switch (c[1]) {
        case 'o': {
            if (CL_StringsAreEqual(token->str, token->len, "goto", 4)) {
                token->kind = CL_KEYWORD_GOTO;
            }
        } break;
        }
    } break;
    }
}

// Lexes the filename part of an #include directive. On success token->str /
// token->len cover the name without the delimiters and token->string_literal
// holds a null terminated copy of it. <...> includes set is_system_include.
// Reaching a newline or the end of input before the closing delimiter is
// reported as an error and stops the scan; the offending character is left
// unconsumed and string_literal is not set.
CL_PRIVATE_FUNCTION void CL_LexMacroInclude(CL_Lexer *T, CL_Token *token) {
    token->kind = CL_PREPROC_INCLUDE;
    while (*T->stream == ' ') CL_Advance(T);
    char end = 0;
    if (*T->stream == '"') {
        end = '"';
    }
    else if (*T->stream == '<') {
        end = '>';
        token->is_system_include = true;
    }
    else {
        CL_ReportError(T, token, "Invalid include directive, file not specified");
        return;
    }
    CL_Advance(T);

    token->str = T->stream;
    while (*T->stream != end) {
        if (*T->stream == 0) {
            // CL_Advance never moves past the terminating zero, so we have to
            // bail out here or this loop would never end.
            CL_ReportError(T, token, "Invalid include directive, reached end of file while reading filename");
            CL_SetTokenLength(T, token);
            return;
        }
        if (*T->stream == '\n') {
            CL_ReportError(T, token, "Invalid include directive filename, got newline character while reading filename");
            CL_SetTokenLength(T, token);
            return;
        }
        CL_Advance(T);
    }
    CL_SetTokenLength(T, token);
    CL_Advance(T);

    // @not_sure: this is because we want null terminated input into path resolution stuff
    token->string_literal = CL_PushStringCopy(T->arena, token->str, token->len);
}

// Lexes the word that follows a '#'. Leading spaces and tabs are skipped,
// token->str / token->len are set to the run of alphabetic characters.
// Returns true and sets token->kind when the word is a known preprocessor
// directive, returns false (kind untouched) for anything else.
CL_PRIVATE_FUNCTION bool CL_LexMacro(CL_Lexer *T, CL_Token *token) {
    while (*T->stream == ' ' || T->stream[0] == '\t') CL_Advance(T);
    token->str = T->stream;
    while (CL_IsAlphabetic(*T->stream)) CL_Advance(T);
    CL_SetTokenLength(T, token);

    switch (*token->str) {
    case 'd':
        if (CL_StringsAreEqual(token->str, token->len, "define", 6)) {
            token->kind = CL_PREPROC_DEFINE;
        }
        else {
            return false;
        }
        break;

    case 'i':
        if (CL_StringsAreEqual(token->str, token->len, "ifdef", 5)) {
            token->kind = CL_PREPROC_IFDEF;
        }
        else if (CL_StringsAreEqual(token->str, token->len, "ifndef", 6)) {
            token->kind = CL_PREPROC_IFNDEF;
        }
        else if (CL_StringsAreEqual(token->str, token->len, "include", 7)) {
            token->kind = CL_PREPROC_INCLUDE;
            CL_LexMacroInclude(T, token);
        }
        else if (CL_StringsAreEqual(token->str, token->len, "if", 2)) {
            token->kind = CL_PREPROC_IF;
        }
        else {
            return false;
        }
        break;

    case 'e':
        if (CL_StringsAreEqual(token->str, token->len, "endif", 5)) {
            token->kind = CL_PREPROC_ENDIF;
        }
        else if (CL_StringsAreEqual(token->str, token->len, "error", 5)) {
            token->kind = CL_PREPROC_ERROR;
        }
        else if (CL_StringsAreEqual(token->str, token->len, "else", 4)) {
            token->kind = CL_PREPROC_ELSE;
        }
        else if (CL_StringsAreEqual(token->str, token->len, "elif", 4)) {
            token->kind = CL_PREPROC_ELIF;
        }
        else {
            return false;
        }
        break;

    case 'p':
        if (CL_StringsAreEqual(token->str, token->len, "pragma", 6)) {
            token->kind = CL_PREPROC_PRAGMA;
        }
        else {
            return false;
        }
        break;

    case 'u':
        if (CL_StringsAreEqual(token->str, token->len, "undef", 5)) {
            token->kind = CL_PREPROC_UNDEF;
        }
        else {
            return false;
        }
        break;

    default: return false;
    }
    return true;
}

static uint32_t CL_TokenID; // @todo: make it stable, thread local?

// Zeroes the token, records the current position in it, assigns a fresh id
// and consumes the first character of the token.
// Skipped space here is for case #define Memes (a), this is not a function like macro because of space
CL_PRIVATE_FUNCTION void CL_PrepareToken(CL_Lexer *T, CL_Token *token, bool skipped_space) {
    CL_MemoryZero(token, sizeof(*token));
    token->str = T->stream;
    token->line = T->line;
    token->column = T->column;
    token->file = T->file;
    token->id = ++CL_TokenID;
    if (skipped_space) token->is_there_whitespace_before_token = true;
    CL_Advance(T);
}

// Classifies the token that starts at token->str. CL_PrepareToken already
// consumed the first character, so T->stream points at the second one.
// A zero byte leaves the zeroed kind untouched.
CL_PRIVATE_FUNCTION void CL_DefaultTokenize(CL_Lexer *T, CL_Token *token) {
    char *c = token->str;
    switch (*c) {
    case 0: break;
    case '(': token->kind = CL_OPENPAREN; break;
    case ')': token->kind = CL_CLOSEPAREN; break;
    case '{': token->kind = CL_OPENBRACE; break;
    case '}': token->kind = CL_CLOSEBRACE; break;
    case '[': token->kind = CL_OPENBRACKET; break;
    case ']': token->kind = CL_CLOSEBRACKET; break;
    case ',': token->kind = CL_COMMA; break;
    case '~': token->kind = CL_NEG; break;
    case '?': token->kind = CL_QUESTION; break;
    case ';': token->kind = CL_SEMICOLON; break;
    case ':': token->kind = CL_COLON; break;
    case '.': {
        token->kind = CL_DOT;
        if (T->stream[0] == '.' && T->stream[1] == '.') {
            CL_Advance(T);
            CL_Advance(T);
            token->kind = CL_THREEDOTS;
        }
    } break;
    case '/': {
        token->kind = CL_DIV;
        if (*T->stream == '/') {
            // Line comment, the token text includes the leading "//"
            token->kind = CL_COMMENT;
            CL_Advance(T);
            while (*T->stream != '\n' && *T->stream != 0) {
                CL_Advance(T);
            }
            CL_SetTokenLength(T, token);
        }
        else if (*T->stream == '*') {
            // Block comment, the token text excludes the delimiters
            token->kind = CL_COMMENT;
            CL_Advance(T);
            for (;;) {
                if (T->stream[0] == '*' && T->stream[1] == '/') {
                    break;
                }
                if (T->stream[0] == 0) {
                    CL_ReportError(T, token, "Unclosed block comment");
                    goto error_end_path;
                }
                CL_Advance(T);
            }
            token->str += 2;
            CL_SetTokenLength(T, token);
            CL_Advance(T);
            CL_Advance(T);
        }
        else if (*T->stream == '=') {
            token->kind = CL_DIVASSIGN;
            CL_Advance(T);
        }
    } break;
    case '#': {
        if (*T->stream == '#') {
            token->kind = CL_MACRO_CONCAT;
            CL_Advance(T);
        }
        else {
            bool is_macro_directive = CL_LexMacro(T, token);
            if (is_macro_directive) {
                T->inside_of_macro = true;
            }
            else {
                if (!T->inside_of_macro) {
                    CL_ReportError(T, token, "Invalid preprocessor directive");
                    goto error_end_path;
                }

                // Stringify operator. CL_LexMacro already pointed token->str
                // at the operand and consumed its alphabetic prefix, here we
                // consume the rest of the identifier.
                token->kind = CL_PREPROC_STRINGIFY;
                while (*T->stream == '_' || CL_IsAlphanumeric(*T->stream)) CL_Advance(T);
                CL_SetTokenLength(T, token);
            }
        }
    } break;
    case '>': {
        if (*T->stream == '=') {
            token->kind = CL_GREATERTHEN_OR_EQUAL;
            CL_Advance(T);
        }
        else if (*T->stream == '>') {
            CL_Advance(T);
            if (*T->stream == '=') {
                CL_Advance(T);
                token->kind = CL_RIGHTSHIFTASSIGN;
            }
            else {
                token->kind = CL_RIGHTSHIFT;
            }
        }
        else {
            token->kind = CL_GREATERTHEN;
        }
    } break;
    case '<': {
        token->kind = CL_LESSERTHEN;
        if (*T->stream == '=') {
            token->kind = CL_LESSERTHEN_OR_EQUAL;
            CL_Advance(T);
        }
        else if (*T->stream == '<') {
            CL_Advance(T);
            if (*T->stream == '=') {
                CL_Advance(T);
                token->kind = CL_LEFTSHIFTASSIGN;
            }
            else {
                token->kind = CL_LEFTSHIFT;
            }
        }
    } break;
    case '&': {
        if (*T->stream == '=') {
            token->kind = CL_ANDASSIGN;
            CL_Advance(T);
        }
        else if (*T->stream == '&') {
            token->kind = CL_AND;
            CL_Advance(T);
        }
        else {
            token->kind = CL_BITAND;
        }
    } break;
    case '-': {
        // NOTE(review): "->" is not combined into a single token here.
        if (*T->stream == '-') {
            token->kind = CL_DECREMENT;
            CL_Advance(T);
        }
        else if (*T->stream == '=') {
            token->kind = CL_SUBASSIGN;
            CL_Advance(T);
        }
        else {
            token->kind = CL_SUB;
        }
    } break;
    case '+': {
        if (*T->stream == '+') {
            token->kind = CL_INCREMENT;
            CL_Advance(T);
        }
        else if (*T->stream == '=') {
            token->kind = CL_ADDASSIGN;
            CL_Advance(T);
        }
        else {
            token->kind = CL_ADD;
        }
    } break;
    case '|': {
        if (*T->stream == '|') {
            token->kind = CL_OR;
            CL_Advance(T);
        }
        else if (*T->stream == '=') {
            token->kind = CL_ORASSIGN;
            CL_Advance(T);
        }
        else {
            token->kind = CL_BITOR;
        }
    } break;
    case '=': {
        if (*T->stream != '=') {
            token->kind = CL_ASSIGN;
        }
        else {
            CL_Advance(T);
            token->kind = CL_EQUALS;
        }
    } break;
    case '!': {
        if (*T->stream != '=') {
            token->kind = CL_NOT;
        }
        else {
            CL_Advance(T);
            token->kind = CL_NOTEQUALS;
        }
    } break;
    case '*': {
        token->kind = CL_MUL;
        if (*T->stream == '=') {
            CL_Advance(T);
            token->kind = CL_MULASSIGN;
        }
    } break;
    case '%': {
        token->kind = CL_MOD;
        if (*T->stream == '=') {
            token->kind = CL_MODASSIGN;
            CL_Advance(T);
        }
    } break;
    case '^': {
        token->kind = CL_BITXOR;
        if (*T->stream == '=') {
            CL_Advance(T);
            token->kind = CL_XORASSIGN;
        }
    } break;
    case '"': {
        CL_CheckString(T, token);
    } break;
    case '\'': {
        CL_ParseCharLiteral(T, token);
    } break;
    case 'U': { // @todo Unicode32
        if (*T->stream == '"') {
            token->fix = CL_PREFIX_U32;
            CL_Advance(T);
            CL_CheckString(T, token);
        }
        else if (*T->stream == '\'') {
            token->fix = CL_PREFIX_U32;
            CL_Advance(T);
            CL_ParseCharLiteral(T, token);
        }
        else goto parse_regular_char;
    } break;
    case 'u': { // Unicode16
        if (*T->stream == '8') { // Unicode8
            if (T->stream[1] == '"') { // U8 STRING
                token->fix = CL_PREFIX_U8;
                CL_Advance(T);
                CL_Advance(T);
                CL_CheckString(T, token);
            }
            else if (T->stream[1] == '\'') { // U8 CHAR
                token->fix = CL_PREFIX_U8;
                CL_Advance(T);
                CL_Advance(T);
                CL_ParseCharLiteral(T, token);
            }
            else goto parse_regular_char;
        }
        else if (*T->stream == '"') { // U16 STRING
            token->fix = CL_PREFIX_U16;
            CL_Advance(T);
            CL_CheckString(T, token);
        }
        else if (*T->stream == '\'') { // U16 CHAR
            token->fix = CL_PREFIX_U16;
            CL_Advance(T);
            CL_ParseCharLiteral(T, token);
        }
        else goto parse_regular_char;
    } break; // without this break a finished literal would be lexed again by case 'L'
    case 'L': { // Widechar
        if (*T->stream == '"') {
            token->fix = CL_PREFIX_L;
            CL_Advance(T);
            CL_CheckString(T, token); // @todo UTF16
        }
        else if (*T->stream == '\'') {
            token->fix = CL_PREFIX_L;
            CL_Advance(T);
            CL_ParseCharLiteral(T, token);
        }
        else goto parse_regular_char;
    } break;
    case 'A':
    case 'a':
    case 'B':
    case 'b':
    case 'C':
    case 'c':
    case 'D':
    case 'd':
    case 'E':
    case 'e':
    case 'F':
    case 'f':
    case 'G':
    case 'g':
    case 'H':
    case 'h':
    case 'I':
    case 'i':
    case 'J':
    case 'j':
    case 'K':
    case 'k':
    /*case 'L':*/ case 'l':
    case 'M':
    case 'm':
    case 'N':
    case 'n':
    case 'O':
    case 'o':
    case 'P':
    case 'p':
    case 'Q':
    case 'q':
    case 'R':
    case 'r':
    case 'S':
    case 's':
    case 'T':
    case 't':
    // case 'U': case 'u':
    case 'V':
    case 'v':
    case 'W':
    case 'w':
    case 'X':
    case 'x':
    case 'Y':
    case 'y':
    case 'Z':
    case 'z':
    case '_':
    parse_regular_char : {
        token->kind = CL_IDENTIFIER;
        while (*T->stream == '_' || CL_IsAlphanumeric(*T->stream)) {
            CL_Advance(T);
        }
        CL_SetTokenLength(T, token);
        CL_IsIdentifierKeyword(token);
    } break;
    case '0': {
        if (*T->stream == 'x' || *T->stream == 'X') {
            token->kind = CL_INT;
            token->is_hex = true;
            CL_Advance(T);
            while (CL_IsHexNumeric(*T->stream)) {
                CL_Advance(T);
            }
            uint64_t len = T->stream - token->str;
            if (len <= 2) {
                // "0x" without any digit is malformed input, not an internal invariant
                CL_ReportError(T, token, "Invalid hexadecimal literal, expected digits after 0x");
                break;
            }
            token->u64 = CL_ParseInteger(T, token, token->str + 2, len - 2, 16);
            break;
        }
    }
    // fallthrough: a leading '0' that is not a hex prefix is a regular number
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9': {
        token->kind = CL_INT;
        for (;;) {
            if (*T->stream == '.') {
                if (token->kind == CL_FLOAT) {
                    CL_ReportError(T, token, "Failed to parse a floating point number, invalid format, found multiple '.'");
                }

                if (token->kind == CL_INT) {
                    token->kind = CL_FLOAT;
                }
            }
            else if (CL_IsNumeric(*T->stream) == false) {
                break;
            }
            CL_Advance(T);
        }

        if (token->kind == CL_INT) {
            uint64_t len = T->stream - token->str;
            CL_ASSERT(len > 0);
            token->u64 = CL_ParseInteger(T, token, token->str, len, 10);
        }
        else if (token->kind == CL_FLOAT) {
            token->f64 = CL_STRING_TO_DOUBLE(token->str, token->len);
        }
        else {
            CL_ASSERT(token->kind == CL_ERROR);
        }

        // Optional suffix: f, l, ll, lu, llu, u, ul, ull (any letter case)
        if (*T->stream == 'f' || *T->stream == 'F') {
            CL_Advance(T);
            token->fix = CL_SUFFIX_F;
        }
        else if (*T->stream == 'l' || *T->stream == 'L') {
            CL_Advance(T);
            token->fix = CL_SUFFIX_L;
            if (*T->stream == 'l' || *T->stream == 'L') {
                CL_Advance(T);
                token->fix = CL_SUFFIX_LL;
                if (*T->stream == 'u' || *T->stream == 'U') {
                    CL_Advance(T);
                    token->fix = CL_SUFFIX_ULL;
                }
            }
            else if (*T->stream == 'u' || *T->stream == 'U') {
                CL_Advance(T);
                token->fix = CL_SUFFIX_UL;
            }
        }
        else if (*T->stream == 'u' || *T->stream == 'U') {
            CL_Advance(T);
            token->fix = CL_SUFFIX_U;
            if (*T->stream == 'l' || *T->stream == 'L') {
                CL_Advance(T);
                token->fix = CL_SUFFIX_UL;
                if (*T->stream == 'l' || *T->stream == 'L') {
                    CL_Advance(T);
                    token->fix = CL_SUFFIX_ULL;
                }
            }
        }
    } break;

    default: {
        CL_ReportError(T, token, "Unhandled character, skipping ...");
    } break;
    }

error_end_path:;
}

// Skips whitespace and backslash-newline line continuations. A plain newline
// ends the current preprocessor directive (inside_of_macro is cleared), an
// escaped one doesn't. Returns true if anything was skipped.
CL_PRIVATE_FUNCTION bool CL_EatWhitespace(CL_Lexer *T) {
    bool skipped = false;
    for (;;) {
        if (CL_IsWhitespace(*T->stream)) {
            if (*T->stream == '\n') T->inside_of_macro = false;
            CL_Advance(T);
            skipped = true;
        }
        else if (T->stream[0] == '\\' && T->stream[1] == '\n') {
            CL_Advance(T);
            CL_Advance(T);
            skipped = true;
        }
        else if (T->stream[0] == '\\' && T->stream[1] == '\r' && T->stream[2] == '\n') {
            CL_Advance(T);
            CL_Advance(T);
            CL_Advance(T);
            skipped = true;
        }
        else {
            break;
        }
    }
    return skipped;
}

// Fills in the length of tokens that didn't set it themselves and marks
// tokens that were lexed while inside of a preprocessor directive.
CL_PRIVATE_FUNCTION void CL_TryToFinalizeToken(CL_Lexer *T, CL_Token *token) {
    if (!token->len) {
        CL_SetTokenLength(T, token);
    }
    if (T->inside_of_macro) {
        token->is_inside_macro = true;
    }
}

// Lexes tokens until one passes the select_* / skip_* filters configured on
// the lexer or the end of input is reached. Filtered out tokens are dropped.
CL_PRIVATE_FUNCTION void CL_InitNextToken(CL_Lexer *T, CL_Token *token) {
    for (;;) {
        bool skipped = CL_EatWhitespace(T);
        CL_PrepareToken(T, token, skipped);
        CL_DefaultTokenize(T, token);

        if (token->kind == CL_EOF) {
            break;
        }

        if (T->select_includes) {
            if (token->kind != CL_PREPROC_INCLUDE) continue;
        }

        if (T->select_macros) {
            if (!token->is_inside_macro) continue;
        }

        if (T->select_comments) {
            if (token->kind != CL_COMMENT) continue;
        }

        if (T->skip_comments) {
            if (token->kind == CL_COMMENT) continue;
        }

        if (T->skip_macros) {
            if (token->is_inside_macro) continue;
        }

        break;
    }
    CL_TryToFinalizeToken(T, token);
}

// Returns the next token of the stream by value.
CL_API_FUNCTION CL_Token CL_Next(CL_Lexer *T) {
    CL_Token result;
    CL_MemoryZero(&result, sizeof(CL_Token));
    CL_InitNextToken(T, &result);
    return result;
}

// Creates a lexer over a null terminated `stream`. `filename` is stored on
// every token, `arena` is handed to CL_Allocate. Comments are skipped by default.
CL_API_FUNCTION CL_Lexer CL_Begin(CL_Allocator arena, char *stream, char *filename) {
    CL_Lexer lexer = {0};
    lexer.stream = lexer.stream_begin = stream;
    lexer.file = filename;
    lexer.arena = arena;
    lexer.skip_comments = true;
    return lexer;
}

//
// Include path resolution
//

// Returns a copy of `str` cut right before its last '/', or "./" when `str`
// contains no '/'.
CL_PRIVATE_FUNCTION char *CL_ChopLastSlash(CL_Allocator arena, char *str) {
    int i = 0;
    int slash_pos = -1;
    while (str[i]) {
        if (str[i] == '/') {
            slash_pos = i;
        }
        i += 1;
    }

    char *result = str;
    if (slash_pos != -1) {
        result = CL_PushStringCopy(arena, str, slash_pos);
    }
    else {
        result = (char *)"./";
    }
    return result;
}

// Returns a newly allocated "a/b". The separator is only inserted when `a`
// is not empty and doesn't already end with '/'.
CL_PRIVATE_FUNCTION char *CL_JoinPath(CL_Allocator arena, char *a, char *b) {
    int alen = CL_StringLength(a);
    int blen = CL_StringLength(b);
    int additional_len = 0;

    if (alen && a[alen - 1] != '/') additional_len = 1;
    char *result = (char *)CL_Allocate(arena, sizeof(char) * (alen + blen + 1 + additional_len));
    CL__MemoryCopy(result, a, alen);
    if (additional_len) result[alen++] = '/';
    CL__MemoryCopy(result + alen, b, blen);
    result[alen + blen] = 0;
    return result;
}

CL_PRIVATE_FUNCTION bool CL_IsAbsolutePath(char *path) {
#if _WIN32
    bool result = CL_IsAlphabetic(path[0]) && path[1] == ':' && path[2] == '/';
#else
    bool result = path[0] == '/';
#endif
    return result;
}

// Returns a pointer to the last '/' of `p`, or `p` itself when there is none.
// NOTE(review): the result points at the slash, not past it - confirm that
// this is what the prefix check in CL_ResolveFilepath expects.
CL_PRIVATE_FUNCTION char *CL_SkipToLastSlash(char *p) {
    int last_slash = 0;
    for (int i = 0; p[i]; i += 1) {
        if (p[i] == '/') last_slash = i;
    }
    return p + last_slash;
}

// Resolves an include filename to a path for which CL_FileExists is true.
// Returns 0 when the file cannot be found, or when its name starts with
// search_paths->file_begin_to_ignore.
// Lookup order: absolute path as is, then for system includes the system
// include paths, otherwise the directory of `parent_file` followed by the
// regular include paths. `search_paths` may be 0.
CL_API_FUNCTION char *CL_ResolveFilepath(CL_Allocator arena, CL_SearchPaths *search_paths, char *filename, char *parent_file, bool is_system_include) {
    CL_SearchPaths null_search_paths = {0};
    if (search_paths == 0) search_paths = &null_search_paths;

    if (search_paths->file_begin_to_ignore) {
        char *name = CL_SkipToLastSlash(filename);
        int namelen = CL_StringLength(name);
        char *ignore = search_paths->file_begin_to_ignore;
        int ignorelen = CL_StringLength(ignore);
        // Clamp so that only the beginning of the name is compared
        if (namelen > ignorelen) {
            namelen = ignorelen;
        }
        if (CL_StringsAreEqual(name, namelen, search_paths->file_begin_to_ignore, ignorelen)) {
            return 0;
        }
    }

    if (CL_IsAbsolutePath(filename) && CL_FileExists(filename)) {
        return filename;
    }

    if (is_system_include) {
        for (int path_i = 0; path_i < search_paths->system_include_path_count; path_i += 1) {
            char *path_it = search_paths->system_include_path[path_i];
            char *file = CL_JoinPath(arena, path_it, filename);
            if (CL_FileExists(file)) {
                return file;
            }
        }
    }
    else {
        if (parent_file) {
            char *parent_dir = CL_ChopLastSlash(arena, parent_file);
            char *file = CL_JoinPath(arena, parent_dir, filename);
            if (CL_FileExists(file)) {
                return file;
            }
        }

        for (int path_i = 0; path_i < search_paths->include_path_count; path_i += 1) {
            char *path_it = search_paths->include_path[path_i];
            char *file = CL_JoinPath(arena, path_it, filename);
            if (CL_FileExists(file)) {
                return file;
            }
        }
    }
    return 0;
}

//
// Stringification
//

// Names of the token->fix values.
// NOTE(review): presumably has to stay in the same order as the fix
// enumeration in clexer.h - confirm.
const char *CL_FixString[] = {
    "SUFFIX_INVALID",
    "SUFFIX_U",
    "SUFFIX_UL",
    "SUFFIX_ULL",
    "SUFFIX_L",
    "SUFFIX_LL",
    "SUFFIX_F",
    "SUFFIX_FL",
    "PREFIX_U8",
    "PREFIX_U16",
    "PREFIX_U32",
    "PREFIX_L",
};

// Names of the token kinds, CL_Stringify indexes this table with token->kind.
const char *CL_KindString[] = {
    "EOF",
    "*",
    "/",
    "%",
    "<<",
    ">>",
    "+",
    "-",
    "==",
    "<",
    ">",
    "<=",
    ">=",
    "!=",
    "&",
    "|",
    "^",
    "&&",
    "||",
    "~",
    "!",
    "--",
    "++",
    "--",
    "++",
    "=",
    "/=",
    "*=",
    "%=",
    "-=",
    "+=",
    "&=",
    "|=",
    "^=",
    "<<=",
    ">>=",
    "(",
    ")",
    "{",
    "}",
    "[",
    "]",
    ",",
    "##",
    "#Stringify",
    "?",
    "...",
    ";",
    ".",
    ":",
    "TAG",
    "->",
    "SIZEOF",
    "DOCCOMMENT",
    "COMMENT",
    "IDENTIFIER",
    "STRING_LITERAL",
    "CHARACTER_LITERAL",
    "ERROR TOKEN",
    "FLOAT",
    "INT",
    "PREPROC_NULL",
    "PREPROC_DEFINE",
    "PREPROC_IFDEF",
    "PREPROC_IFNDEF",
    "PREPROC_INCLUDE",
    "PREPROC_ENDIF",
    "PREPROC_IF",
    "PREPROC_PRAGMA",
    "PREPROC_ERROR",
    "PREPROC_ELSE",
    "PREPROC_ELIF",
    "PREPROC_UNDEF",
    "KEYWORD_VOID",
    "KEYWORD_INT",
    "KEYWORD_CHAR",
    "KEYWORD_UNSIGNED",
    "KEYWORD_SIGNED",
    "KEYWORD_LONG",
    "KEYWORD_SHORT",
    "KEYWORD_DOUBLE",
    "KEYWORD_FLOAT",
    "KEYWORD__BOOL",
    "KEYWORD__COMPLEX",
    "KEYWORD__IMAGINARY",
    "KEYWORD_STATIC",
    "KEYWORD_AUTO",
    "KEYWORD_CONST",
    "KEYWORD_EXTERN",
    "KEYWORD_INLINE",
    "KEYWORD_REGISTER",
    "KEYWORD_RESTRICT",
    "KEYWORD_VOLATILE",
    "KEYWORD__THREAD_LOCAL",
    "KEYWORD__ATOMIC",
    "KEYWORD__NORETURN",
    "KEYWORD_STRUCT",
    "KEYWORD_UNION",
    "KEYWORD_ENUM",
    "KEYWORD_TYPEDEF",
    "KEYWORD_DEFAULT",
    "KEYWORD_BREAK",
    "KEYWORD_RETURN",
    "KEYWORD_SWITCH",
    "KEYWORD_IF",
    "KEYWORD_ELSE",
    "KEYWORD_FOR",
    "KEYWORD_WHILE",
    "KEYWORD_CASE",
    "KEYWORD_CONTINUE",
    "KEYWORD_DO",
    "KEYWORD_GOTO",
    "KEYWORD_SIZEOF",
    "KEYWORD__ALIGNAS",
    "KEYWORD__ALIGNOF",
    "KEYWORD__STATIC_ASSERT",
    "KEYWORD__GENERIC",
};

// Writes "file:line message" into buff (at most buff_size bytes).
CL_API_FUNCTION void CL_StringifyMessage(char *buff, int buff_size, CL_Message *msg) {
    CL_SNPRINTF(buff, buff_size, "%s:%d %15s", msg->token.file, msg->token.line, msg->string);
}

// Writes "file:line kind text" into buff (at most buff_size bytes).
// Kinds outside of the table are printed as UNKNOWN.
CL_API_FUNCTION void CL_Stringify(char *buff, int buff_size, CL_Token *token) {
    const char *token_kind = "UNKNOWN";
    if (token->kind < CL_COUNT) token_kind = CL_KindString[token->kind];
    CL_SNPRINTF(buff, buff_size, "%s:%d %15s %15.*s", token->file, token->line, token_kind, token->len, token->str);
}

// Appends node n to the singly linked queue described by first (f) / last (l).
#define CL_SLL_QUEUE_ADD_MOD(f, l, n, next) \
    do {                                    \
        (n)->next = 0;                      \
        if ((f) == 0) {                     \
            (f) = (l) = (n);                \
        }                                   \
        else {                              \
            (l) = (l)->next = (n);          \
        }                                   \
    } while (0)
#define CL_SLL_QUEUE_ADD(f, l, n) CL_SLL_QUEUE_ADD_MOD(f, l, n, next)

// Formats the variadic arguments of the enclosing function into a freshly
// allocated, null terminated string declared as `char *result`.
// Declares locals (args1, args2, len), so it can be used once per scope.
#define CL__FORMAT(arena, string, result)                    \
    va_list args1, args2;                                    \
    va_start(args1, string);                                 \
    va_copy(args2, args1);                                   \
    int len = CL_VSNPRINTF(0, 0, string, args2);             \
    va_end(args2);                                           \
    char *result = (char *)CL_Allocate((arena), len + 1);    \
    CL_VSNPRINTF(result, len + 1, string, args1);            \
    va_end(args1)

// Formats an error message, appends it to the message queue of the lexer and
// turns the token into a CL_ERROR token that points at the message. The
// message keeps a copy of the token as it was before the change.
CL_PRIVATE_FUNCTION void CL_ReportError(CL_Lexer *T, CL_Token *token, const char *string, ...) {
    CL__FORMAT(T->arena, string, message_string);

    CL_Message *result = (CL_Message *)CL_Allocate(T->arena, sizeof(CL_Message));
    CL_MemoryZero(result, sizeof(CL_Message));
    CL_SLL_QUEUE_ADD(T->first_message, T->last_message, result);

    result->string = message_string; // the formatted text, not the raw format string
    result->token = *token;
    token->kind = CL_ERROR;
    token->error = result;
    T->errors += 1;
}