From 2754ff7ed65babbf1e4e51cb64f01c9c5c00fd3e Mon Sep 17 00:00:00 2001 From: Krzosa Karol Date: Sat, 13 Jan 2024 20:25:28 +0100 Subject: [PATCH] Big rework of clexer --- build_tool/build_lib.cpp | 4 +- build_tool/cache.c | 19 +- standalone_libraries/clexer.c | 1063 +++++++++++---------------------- standalone_libraries/clexer.h | 264 ++------ tests/1test_string.cpp | 44 +- 5 files changed, 437 insertions(+), 957 deletions(-) diff --git a/build_tool/build_lib.cpp b/build_tool/build_lib.cpp index 9699c32..b72d9da 100644 --- a/build_tool/build_lib.cpp +++ b/build_tool/build_lib.cpp @@ -35,8 +35,8 @@ DEBUG = -fsanitize=address #include "../core_library/core.c" -#define CL_Arena MA_Arena -#define CL_PushSize MA_PushSizeNonZeroed +#define CL_Allocator MA_Arena * +#define CL_Allocate(a, s) MA_PushSizeNonZeroed(a, s) #define CL_ASSERT IO_Assert #define CL_VSNPRINTF stbsp_vsnprintf #define CL_SNPRINTF stbsp_snprintf diff --git a/build_tool/cache.c b/build_tool/cache.c index d6788f9..90efaf7 100644 --- a/build_tool/cache.c +++ b/build_tool/cache.c @@ -16,13 +16,11 @@ struct SRC_Cache { double SRC_Time; SRC_Cache *SRC_InMemoryCache; SRC_Cache *SRC_FromFileCache; -CL_ArenaTuple SRC_ArenaTuple; S8_String SRC_CacheFilename; CL_SearchPaths SRC_SearchPaths = {}; // @todo; void SRC_InitCache(MA_Arena *arena, S8_String cachefilename) { SRC_CacheFilename = cachefilename; - CL_InitDefaultTuple(&SRC_ArenaTuple); SRC_InMemoryCache = MA_PushStruct(arena, SRC_Cache); SRC_InMemoryCache->entry_cap = SRC_CACHE_ENTRY_COUNT; @@ -70,16 +68,19 @@ SRC_CacheEntry *SRC_HashFile(S8_String file, char *parent_file) { SRC_CacheEntry *entry = SRC_FindCache(SRC_InMemoryCache, filepath_hash); if (entry) return entry; - CL_LexResult *first_lex = CL_LexFile(&SRC_ArenaTuple, resolved_file); - IO_Assert(first_lex); - uint64_t file_hash = HashBytes(first_lex->stream_begin, first_lex->stream - first_lex->stream_begin); + S8_String filecontent = OS_ReadFile(Perm, S8_MakeFromChar(resolved_file)); + IO_Assert(filecontent.str); + + uint64_t file_hash = HashBytes(filecontent.str, filecontent.len); uint64_t includes_hash = 13; - CL_LexList list = CL_MakeLexList(first_lex); - for (CL_IncludeIter iter = CL_IterateIncludes(&list); iter.filename; CL_GetNextInclude(&iter)) { - if (iter.is_system_include) continue; + CL_Lexer lexer = CL_Begin(Perm, filecontent.str, resolved_file); + lexer.select_includes = true; - S8_String file_it = S8_MakeFromChar(iter.filename); + for (CL_Token token = CL_Next(&lexer); token.kind != CL_EOF; token = CL_Next(&lexer)) { + if (token.is_system_include) continue; + + S8_String file_it = S8_Make(token.str, token.len); SRC_CacheEntry *cache = SRC_HashFile(file_it, resolved_file); if (!cache) { IO_Printf("Missing cache for: %.*s\n", S8_Expand(file_it)); diff --git a/standalone_libraries/clexer.c b/standalone_libraries/clexer.c index 8e9a83d..bcbef78 100644 --- a/standalone_libraries/clexer.c +++ b/standalone_libraries/clexer.c @@ -1,6 +1,19 @@ #include "clexer.h" #include +#ifndef CL_PRIVATE_FUNCTION + #if defined(__GNUC__) || defined(__clang__) + #define CL_PRIVATE_FUNCTION __attribute__((unused)) static + #else + #define CL_PRIVATE_FUNCTION static + #endif +#endif + +#ifndef CL_Allocate + #include + #define CL_Allocate(allocator, size) malloc(size) +#endif + #ifndef CL_STRING_TO_DOUBLE #include #define CL_STRING_TO_DOUBLE(str, len) strtod(str, 0) @@ -26,36 +39,15 @@ #define CL__MemoryCopy(dst, src, s) memcpy(dst, src, s) #endif -#ifndef CL__MemoryZero +#ifndef CL_MemoryZero #include - #define CL__MemoryZero(p, size) memset(p, 0, size) -#endif - -#ifndef CL_ReadFile - #define CL_ReadFile CL__ReadFile - #include -CL_PRIVATE_FUNCTION char *CL_ReadFile(CL_Arena *arena, char *name) { - char *result = 0; - FILE *f = fopen(name, "rb"); - if (f) { - fseek(f, 0, SEEK_END); - int len = ftell(f); - fseek(f, 0, SEEK_SET); - - result = (char *)CL_PushSize(arena, len + 1); - fread(result, len, 1, f); - fclose(f); - result[len] = 0; - } - - return result; -} + #define CL_MemoryZero(p, size) memset(p, 0, size) #endif #ifndef CL_FileExists #define CL_FileExists CL__FileExists #include -CL_API_FUNCTION bool CL_FileExists(char *name) { +CL_PRIVATE_FUNCTION bool CL_FileExists(char *name) { bool result = false; FILE *f = fopen(name, "rb"); if (f) { @@ -66,284 +58,9 @@ CL_API_FUNCTION bool CL_FileExists(char *name) { } #endif -#ifndef CL__HASH_BYTES - #define CL__HASH_BYTES CL__HashBytes -// FNV HASH (1a?) -static uint64_t CL__HashBytes(void *p, int bytes) { - uint8_t *p8 = (uint8_t *)p; - uint64_t hash = (uint64_t)14695981039346656037ULL; - for (int i = 0; i < bytes; i++) { - hash = hash ^ (uint64_t)(p8[i]); - hash = hash * (uint64_t)1099511628211ULL; - } - return hash; -} -#endif +CL_PRIVATE_FUNCTION void CL_ReportError(CL_Lexer *T, CL_Token *token, const char *string, ...); -#ifndef CL_CUSTOM_ARENA_TYPE -CL_PRIVATE_FUNCTION void *CL_PushSize(CL_Arena *arena, int size) { - if (arena->len + size > arena->cap) { - CL_ASSERT(!"CLEX: Not enough memory"); - } - void *result = arena->buff + arena->len; - arena->len += size; - return result; -} -#endif - -#ifdef __cplusplus - #define CL_ZeroStruct() \ - {} -#else - #define CL_ZeroStruct() \ - { 0 } -#endif - -#define CL_PushArray(arena, T, size) (T *)CL__PushSizeZeroed(arena, sizeof(T) * (size)) -#define CL_PushStruct(arena, T) CL_PushArray(arena, T, 1) -CL_PRIVATE_FUNCTION void *CL__PushSizeZeroed(CL_Arena *arena, int size) { - void *result = CL_PushSize(arena, size); - CL__MemoryZero(result, size); - return result; -} - -const char *CL_FixString[] = { - "", - "SUFFIX_U", - "SUFFIX_UL", - "SUFFIX_ULL", - "SUFFIX_L", - "SUFFIX_LL", - "SUFFIX_F", - "SUFFIX_FL", - "PREFIX_U8", - "PREFIX_U16", - "PREFIX_U32", - "PREFIX_L", -}; - -const char *CL_KindString[] = { - "EOF", - "*", - "/", - "%", - "<<", - ">>", - "+", - "-", - "==", - "<", - ">", - "<=", - ">=", - "!=", - "&", - "|", - "^", - "&&", - "||", - "~", - "!", - "--", - "++", - "--", - "++", - "=", - "/=", - "*=", - "%=", - "-=", - "+=", - "&=", - "|=", - "^=", - "<<=", - ">>=", - "(", - ")", - "{", - "}", - "[", - "]", - ",", - "##", - "#Stringify", - "?", - "...", - ";", - ".", - ":", - "TAG", - "->", - "SIZEOF", - "DOCCOMMENT", - "COMMENT", - "IDENTIFIER", - "STRING_LITERAL", - "CHARACTER_LITERAL", - "ERROR TOKEN", - "FLOAT", - "INT", - "PREPROC_NULL", - "PREPROC_DEFINE", - "PREPROC_IFDEF", - "PREPROC_IFNDEF", - "PREPROC_INCLUDE", - "PREPROC_ENDIF", - "PREPROC_IF", - "PREPROC_PRAGMA", - "PREPROC_ERROR", - "PREPROC_ELSE", - "PREPROC_ELIF", - "PREPROC_UNDEF", - "KEYWORD_VOID", - "KEYWORD_INT", - "KEYWORD_CHAR", - "KEYWORD_UNSIGNED", - "KEYWORD_SIGNED", - "KEYWORD_LONG", - "KEYWORD_SHORT", - "KEYWORD_DOUBLE", - "KEYWORD_FLOAT", - "KEYWORD__BOOL", - "KEYWORD__COMPLEX", - "KEYWORD__IMAGINARY", - "KEYWORD_STATIC", - "KEYWORD_AUTO", - "KEYWORD_CONST", - "KEYWORD_EXTERN", - "KEYWORD_INLINE", - "KEYWORD_REGISTER", - "KEYWORD_RESTRICT", - "KEYWORD_VOLATILE", - "KEYWORD__THREAD_LOCAL", - "KEYWORD__ATOMIC", - "KEYWORD__NORETURN", - "KEYWORD_STRUCT", - "KEYWORD_UNION", - "KEYWORD_ENUM", - "KEYWORD_TYPEDEF", - "KEYWORD_DEFAULT", - "KEYWORD_BREAK", - "KEYWORD_RETURN", - "KEYWORD_SWITCH", - "KEYWORD_IF", - "KEYWORD_ELSE", - "KEYWORD_FOR", - "KEYWORD_WHILE", - "KEYWORD_CASE", - "KEYWORD_CONTINUE", - "KEYWORD_DO", - "KEYWORD_GOTO", - "KEYWORD_SIZEOF", - "KEYWORD__ALIGNAS", - "KEYWORD__ALIGNOF", - "KEYWORD__STATIC_ASSERT", - "KEYWORD__GENERIC", -}; - -const char *CL_MessageKindString[] = { - "ERROR", - "WARNING", - "TRACE", -}; -/*END*/ - -#define CL_DLL_QUEUE_ADD_MOD(f, l, node, next, prev) \ - do { \ - if ((f) == 0) { \ - (f) = (l) = (node); \ - (node)->prev = 0; \ - (node)->next = 0; \ - } \ - else { \ - (l)->next = (node); \ - (node)->prev = (l); \ - (node)->next = 0; \ - (l) = (node); \ - } \ - } while (0) -#define CL_DLL_QUEUE_ADD(f, l, node) CL_DLL_QUEUE_ADD_MOD(f, l, node, next, prev) - -#define CL_SLL_QUEUE_ADD_MOD(f, l, n, next) \ - do { \ - (n)->next = 0; \ - if ((f) == 0) { \ - (f) = (l) = (n); \ - } \ - else { \ - (l) = (l)->next = (n); \ - } \ - } while (0) -#define CL_SLL_QUEUE_ADD(f, l, n) CL_SLL_QUEUE_ADD_MOD(f, l, n, next) - -#define CL__FORMAT(arena, string, result) \ - va_list args1, args2; \ - va_start(args1, string); \ - va_copy(args2, args1); \ - int len = CL_VSNPRINTF(0, 0, string, args2); \ - va_end(args2); \ - char *result = (char *)CL_PushSize((arena), len + 1); \ - CL_VSNPRINTF(result, len + 1, string, args1); \ - va_end(args1) - -CL_API_FUNCTION void CL_ReportError(CL_LexResult *T, CL_Token *token, const char *string, ...) { - CL__FORMAT(T->arena->other, string, message_string); - CL_Message *result = CL_PushStruct(T->arena->other, CL_Message); - result->kind = CLM_ERROR; - result->string = (char *)string; - CL_SLL_QUEUE_ADD(T->first_message, T->last_message, result); - result->token = *token; - T->errors += 1; - token->kind = CL_ERROR; - token->error = result; -#if TEST_DEBUG - printf("%s:%d %s\n", token->file, token->line, string); - __debugbreak(); -#endif -} - -CL_PRIVATE_FUNCTION char *CL_PushStringCopy(CL_Arena *arena, char *p, int size) { - char *copy_buffer = (char *)CL_PushSize(arena, size + 1); - CL__MemoryCopy(copy_buffer, p, size); - copy_buffer[size] = 0; - return copy_buffer; -} - -CL_PRIVATE_FUNCTION CL_Token *CL_CopyToken(CL_Arena *arena, CL_Token *token) { - CL_Token *copy_buffer = (CL_Token *)CL_PushSize(arena, sizeof(CL_Token)); - CL__MemoryCopy(copy_buffer, token, sizeof(CL_Token)); - return copy_buffer; -} - -CL_API_FUNCTION void CL_StringifyMessage(char *buff, int buff_size, CL_Message *msg) { - const char *kind = CL_MessageKindString[msg->kind]; - CL_SNPRINTF(buff, buff_size, "%s:%d %15s %15s", msg->token.file, msg->token.line, kind, msg->string); -} - -CL_API_FUNCTION void CL_Stringify(char *buff, int buff_size, CL_Token *token) { - const char *token_kind = "UNKNOWN"; - if (token->kind < CL_COUNT) token_kind = CL_KindString[token->kind]; - CL_SNPRINTF(buff, buff_size, "%s:%d %15s %15.*s", token->file, token->line, token_kind, token->len, token->str); -} - -CL_API_FUNCTION void CL_PrintMessages(CL_LexResult *lex_result) { - char buff[1024]; - for (CL_Message *it = lex_result->first_message; it; it = it->next) { - CL_StringifyMessage(buff, sizeof(buff), it); - printf("%s\n", buff); - } -} - -CL_API_FUNCTION void CL_PrintTokens(CL_Tokens tokens) { - char buff[1024]; - for (int i = 0; i < tokens.count; i += 1) { - CL_Stringify(buff, sizeof(buff), &tokens.data[i]); - printf("%s\n", buff); - } -} - -CL_INLINE void CL_Advance(CL_LexResult *T) { +CL_INLINE void CL_Advance(CL_Lexer *T) { if (*T->stream == '\n') { T->line += 1; T->column = 0; @@ -385,51 +102,12 @@ CL_INLINE bool CL_IsAlphanumeric(char c) { return result; } -CL_API_FUNCTION bool CL_EatWhitespace(CL_LexResult *T) { - bool skipped = false; - for (;;) { - if (CL_IsWhitespace(*T->stream)) { - if (*T->stream == '\n') T->inside_of_macro = false; - CL_Advance(T); - skipped = true; - } - else if (T->stream[0] == '\\' && T->stream[1] == '\n') { - CL_Advance(T); - CL_Advance(T); - skipped = true; - } - else if (T->stream[0] == '\\' && T->stream[1] == '\r' && T->stream[2] == '\n') { - CL_Advance(T); - CL_Advance(T); - CL_Advance(T); - skipped = true; - } - else { - break; - } - } - return skipped; -} - -CL_API_FUNCTION void CL_SetTokenLength(CL_LexResult *T, CL_Token *token) { +CL_API_FUNCTION void CL_SetTokenLength(CL_Lexer *T, CL_Token *token) { intptr_t diff = T->stream - token->str; CL_ASSERT(diff < 2147483647); token->len = (int)diff; } -CL_API_FUNCTION void CL_TryToFinalizeToken(CL_LexResult *T, CL_Token *token) { - for (; T->attached_comment_index < T->comments.count; T->attached_comment_index += 1) { - CL_Token *it = T->comments.data + T->attached_comment_index; - it->comment_is_attached_to_token = token; - } - if (!token->len) { - CL_SetTokenLength(T, token); - } - if (T->inside_of_macro) { - token->flags |= CL_INSIDE_OF_MACRO; - } -} - CL_PRIVATE_FUNCTION uint64_t CL_CharMapToNumber(char c) { switch (c) { case '0': return 0; break; @@ -458,7 +136,7 @@ CL_PRIVATE_FUNCTION uint64_t CL_CharMapToNumber(char c) { } } -CL_PRIVATE_FUNCTION uint64_t CL_ParseInteger(CL_LexResult *T, CL_Token *token, char *string, uint64_t len, uint64_t base) { +CL_PRIVATE_FUNCTION uint64_t CL_ParseInteger(CL_Lexer *T, CL_Token *token, char *string, uint64_t len, uint64_t base) { CL_ASSERT(base >= 2 && base <= 16); uint64_t acc = 0; for (uint64_t i = 0; i < len; i++) { @@ -480,7 +158,7 @@ typedef struct CL_UTF32Result { } CL_UTF32Result; CL_PRIVATE_FUNCTION CL_UTF32Result CL_UTF8ToUTF32(char *c, int max_advance) { - CL_UTF32Result result = CL_ZeroStruct(); + CL_UTF32Result result = {0}; if ((c[0] & 0x80) == 0) { // Check if leftmost zero of first byte is unset if (max_advance >= 1) { @@ -528,7 +206,7 @@ CL_PRIVATE_FUNCTION CL_UTF32Result CL_UTF8ToUTF32(char *c, int max_advance) { } // @todo I think I should look at this again -CL_API_FUNCTION void CL_ParseCharLiteral(CL_LexResult *T, CL_Token *token) { +CL_API_FUNCTION void CL_ParseCharLiteral(CL_Lexer *T, CL_Token *token) { token->kind = CL_CHARLIT; token->str = T->stream; while (*T->stream != '\'') { @@ -555,13 +233,10 @@ CL_API_FUNCTION void CL_ParseCharLiteral(CL_LexResult *T, CL_Token *token) { case 'r': token->u64 = '\r'; break; case 'a': token->u64 = '\a'; break; case 'b': token->u64 = '\b'; break; - case '0': - token->u64 = '\0'; - break; - // Octal constant + case '0': token->u64 = '\0'; break; case 'x': - case 'X': CL_ASSERT(0); break; // Hex constant - case 'u': CL_ASSERT(0); break; // Unicode constant + case 'X': CL_ASSERT(!"Not implemented"); break; // Hex constant + case 'u': CL_ASSERT(!"Not implemented"); break; // Unicode constant default: { CL_ReportError(T, token, "Unknown escape code"); } @@ -595,87 +270,74 @@ skip_utf_encode: CL_Advance(T); } -CL_PRIVATE_FUNCTION void CL_BufferWrite(char *buffer, int buffer_size, int *buffer_iter, char write) { - if (*buffer_iter < buffer_size) { - buffer[*buffer_iter] = write; - *buffer_iter += 1; - } -} - -// @todo I think I should look at this again -// Idea: Maybe try to figure out size first and then write the string -CL_API_FUNCTION void CL_ParseString(CL_LexResult *T, CL_Token *token) { - // @todo String builder here, we dont really want 4096 character limit - int buffer_iter = 0; - int buffer_size = 4096; - char buffer[4096]; - +// It combines strings, verifies the escape sequences but doesn't do any allocations +// so the final string actually needs additional transformation pass. A pass +// that will combine the string snippets, replace escape sequences with actual values etc. +// +// "String 1" "String 2" - those strings snippets are combined +CL_API_FUNCTION void CL_CheckString(CL_Lexer *T, CL_Token *token) { token->kind = CL_STRINGLIT; - // First we try to parse the string normally, we write contents to scratch memory. - // Afterwards we try to seek if there are more consecutive strings. As the speak - // says, those are one string, so we combine them nicely. Then after we have written - // everything to the scratch buffer. We make a proper tight copy on the pernament - // allocator. combine_next_string_literal: while (*T->stream != '"' && *T->stream != 0 AND_CL_STRING_TERMINATE_ON_NEW_LINE) { if (*T->stream == '\\') { CL_Advance(T); switch (*T->stream) { - case '\\': CL_BufferWrite(buffer, buffer_size, &buffer_iter, '\\'); break; - case '\'': CL_BufferWrite(buffer, buffer_size, &buffer_iter, '\''); break; - case '"': CL_BufferWrite(buffer, buffer_size, &buffer_iter, '"'); break; - case 't': CL_BufferWrite(buffer, buffer_size, &buffer_iter, '\t'); break; - case 'f': CL_BufferWrite(buffer, buffer_size, &buffer_iter, '\f'); break; - case 'n': CL_BufferWrite(buffer, buffer_size, &buffer_iter, '\n'); break; - case 'v': CL_BufferWrite(buffer, buffer_size, &buffer_iter, '\v'); break; - case 'r': CL_BufferWrite(buffer, buffer_size, &buffer_iter, '\r'); break; - case 'a': CL_BufferWrite(buffer, buffer_size, &buffer_iter, '\a'); break; - case 'b': CL_BufferWrite(buffer, buffer_size, &buffer_iter, '\b'); break; - case '0': - CL_BufferWrite(buffer, buffer_size, &buffer_iter, '\0'); - break; - - // Octal constant + case 'a': + case 'b': + case 'e': + case 'f': + case 'n': + case 'r': + case 't': + case 'v': + case '\\': + case '\'': + case '?': + case '"': case 'x': - case 'X': CL_ASSERT(0); break; // Hex constant - case 'u': CL_ASSERT(0); break; // Unicode constant + case 'X': // Hex constant + case 'u': // Unicode constant + case 'U': + break; + case '0': // octal numbers or null + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + break; + default: { + CL_ReportError(T, token, "Invalid escape sequence"); + return; + } } } - else { - CL_BufferWrite(buffer, buffer_size, &buffer_iter, *T->stream); - } - CL_Advance(T); } CL_Advance(T); // Try to seek if there is a consecutive string. // If there is such string we try to combine it. - char *seek_for_next_string = T->stream; - while (CL_IsWhitespace(*seek_for_next_string)) { - seek_for_next_string += 1; - } + { + char *seek_for_next_string = T->stream; + while (CL_IsWhitespace(*seek_for_next_string)) { + seek_for_next_string += 1; + } - if (*seek_for_next_string == '"') { - seek_for_next_string += 1; - while (T->stream != seek_for_next_string) CL_Advance(T); - goto combine_next_string_literal; + if (*seek_for_next_string == '"') { + seek_for_next_string += 1; + while (T->stream != seek_for_next_string) CL_Advance(T); + goto combine_next_string_literal; + } } - - int len = buffer_iter + 1; - if (len > buffer_size) { - len = buffer_size; - CL_ReportError(T, token, "Truncated string! Reached 4096 character limit for string literal."); - } - - token->string_literal = CL_PushStringCopy(T->arena->other, buffer, len); + CL_SetTokenLength(T, token); } -CL_API_FUNCTION void CL_IsIdentifierKeyword(CL_LexResult *ctx, CL_Token *token) { +CL_API_FUNCTION void CL_IsIdentifierKeyword(CL_Token *token) { if (token->len == 1) return; char *c = token->str; - /*import meta -meta.gen_lex_keywords()*/ switch (c[0]) { case 'v': { switch (c[1]) { @@ -940,10 +602,9 @@ meta.gen_lex_keywords()*/ } } break; } - /*END*/ } -CL_API_FUNCTION void CL_LexMacroInclude(CL_LexResult *T, CL_Token *token) { +CL_API_FUNCTION void CL_LexMacroInclude(CL_Lexer *T, CL_Token *token) { token->kind = CL_PREPROC_INCLUDE; while (*T->stream == ' ') CL_Advance(T); char end = 0; @@ -952,7 +613,7 @@ CL_API_FUNCTION void CL_LexMacroInclude(CL_LexResult *T, CL_Token *token) { } else if (*T->stream == '<') { end = '>'; - token->flags |= CL_SYSTEM_INCLUDE; + token->is_system_include = true; } else { CL_ReportError(T, token, "Invalid include directive, file not specified"); @@ -972,22 +633,14 @@ CL_API_FUNCTION void CL_LexMacroInclude(CL_LexResult *T, CL_Token *token) { } CL_SetTokenLength(T, token); CL_Advance(T); - - token->str = CL_PushStringCopy(T->arena->other, token->str, token->len); - - CL_Token *include_list_item = CL_CopyToken(T->arena->include, token); - T->includes.count += 1; - if (T->includes.data == 0) T->includes.data = include_list_item; } -CL_API_FUNCTION bool CL_LexMacro(CL_LexResult *T, CL_Token *token) { +CL_API_FUNCTION bool CL_LexMacro(CL_Lexer *T, CL_Token *token) { while (*T->stream == ' ' || T->stream[0] == '\t') CL_Advance(T); token->str = T->stream; while (CL_IsAlphabetic(*T->stream)) CL_Advance(T); CL_SetTokenLength(T, token); - /*import meta - meta.gen_lex_preproc_keywords() Need to add END*/ switch (*token->str) { case 'd': if (CL_StringsAreEqual(token->str, token->len, "define", 6)) { @@ -1042,34 +695,20 @@ CL_API_FUNCTION bool CL_LexMacro(CL_LexResult *T, CL_Token *token) { return true; } -CL_API_FUNCTION void CL_InitLexResult(CL_LexResult *T, CL_ArenaTuple *arena, char *filename, char *filecontent) { - CL__MemoryZero(T, sizeof(CL_LexResult)); - T->arena = arena; - T->stream = filecontent; - T->stream_begin = filecontent; - T->file = filename; -} - -CL_API_FUNCTION CL_LexResult *CL_CreateLexingResult(CL_ArenaTuple *arena, char *filename, char *filecontent) { - CL_LexResult *T = CL_PushStruct(arena->other, CL_LexResult); - CL_InitLexResult(T, arena, filename, filecontent); - return T; -} - // Skipped space here is for case #define Memes (a), this is not a function like macro because of space static uint32_t CL_TokenID; // @todo: make it stable, thread local? -CL_API_FUNCTION void CL_PrepareToken(CL_LexResult *T, CL_Token *token, bool skipped_space) { - CL__MemoryZero(token, sizeof(*token)); +CL_API_FUNCTION void CL_PrepareToken(CL_Lexer *T, CL_Token *token, bool skipped_space) { + CL_MemoryZero(token, sizeof(*token)); token->str = T->stream; token->line = T->line; token->column = T->column; token->file = T->file; token->id = ++CL_TokenID; - if (skipped_space) token->flags |= CL_WHITESPACE_BEFORE_TOKEN; + if (skipped_space) token->is_there_whitespace_before_token = true; CL_Advance(T); } -CL_API_FUNCTION void CL_DefaultTokenize(CL_LexResult *T, CL_Token *token) { +CL_API_FUNCTION void CL_DefaultTokenize(CL_Lexer *T, CL_Token *token) { char *c = token->str; switch (*c) { case 0: break; @@ -1083,6 +722,7 @@ CL_API_FUNCTION void CL_DefaultTokenize(CL_LexResult *T, CL_Token *token) { case '~': token->kind = CL_NEG; break; case '?': token->kind = CL_QUESTION; break; case ';': token->kind = CL_SEMICOLON; break; + case ':': token->kind = CL_COLON; break; case '.': { token->kind = CL_DOT; if (T->stream[0] == '.' && T->stream[1] == '.') { @@ -1091,9 +731,6 @@ CL_API_FUNCTION void CL_DefaultTokenize(CL_LexResult *T, CL_Token *token) { token->kind = CL_THREEDOTS; } } break; - case ':': { - token->kind = CL_COLON; - } break; case '/': { token->kind = CL_DIV; if (*T->stream == '/') { @@ -1104,10 +741,6 @@ CL_API_FUNCTION void CL_DefaultTokenize(CL_LexResult *T, CL_Token *token) { CL_Advance(T); } CL_SetTokenLength(T, token); - - CL_Token *comment_token = CL_CopyToken(T->arena->comment, token); - if (T->comments.data == 0) T->comments.data = comment_token; - T->comments.count += 1; } else if (*T->stream == '*') { token->kind = CL_COMMENT; @@ -1126,10 +759,6 @@ CL_API_FUNCTION void CL_DefaultTokenize(CL_LexResult *T, CL_Token *token) { CL_SetTokenLength(T, token); CL_Advance(T); CL_Advance(T); - - CL_Token *comment_token = CL_CopyToken(T->arena->comment, token); - if (T->comments.data == 0) T->comments.data = comment_token; - T->comments.count += 1; } else if (*T->stream == '=') { token->kind = CL_DIVASSIGN; @@ -1288,7 +917,7 @@ CL_API_FUNCTION void CL_DefaultTokenize(CL_LexResult *T, CL_Token *token) { } } break; case '"': { - CL_ParseString(T, token); + CL_CheckString(T, token); } break; case '\'': { CL_ParseCharLiteral(T, token); @@ -1297,7 +926,7 @@ CL_API_FUNCTION void CL_DefaultTokenize(CL_LexResult *T, CL_Token *token) { if (*T->stream == '"') { token->fix = CL_PREFIX_U32; CL_Advance(T); - CL_ParseString(T, token); + CL_CheckString(T, token); } else if (*T->stream == '\'') { token->fix = CL_PREFIX_U32; @@ -1312,7 +941,7 @@ CL_API_FUNCTION void CL_DefaultTokenize(CL_LexResult *T, CL_Token *token) { token->fix = CL_PREFIX_U8; CL_Advance(T); CL_Advance(T); - CL_ParseString(T, token); + CL_CheckString(T, token); } else if (T->stream[1] == '\'') { // U8 CHAR token->fix = CL_PREFIX_U8; @@ -1325,7 +954,7 @@ CL_API_FUNCTION void CL_DefaultTokenize(CL_LexResult *T, CL_Token *token) { else if (*T->stream == '"') { // U16 STRING token->fix = CL_PREFIX_U16; CL_Advance(T); - CL_ParseString(T, token); + CL_CheckString(T, token); } else if (*T->stream == '\'') { // U16 CHAR CL_Advance(T); @@ -1337,7 +966,7 @@ CL_API_FUNCTION void CL_DefaultTokenize(CL_LexResult *T, CL_Token *token) { if (*T->stream == '"') { token->fix = CL_PREFIX_L; CL_Advance(T); - CL_ParseString(T, token); // @todo UTF16 + CL_CheckString(T, token); // @todo UTF16 } else if (*T->stream == '\'') { token->fix = CL_PREFIX_L; @@ -1403,12 +1032,12 @@ CL_API_FUNCTION void CL_DefaultTokenize(CL_LexResult *T, CL_Token *token) { CL_Advance(T); } CL_SetTokenLength(T, token); - CL_IsIdentifierKeyword(T, token); + CL_IsIdentifierKeyword(token); } break; case '0': { if (*T->stream == 'x' || *T->stream == 'X') { token->kind = CL_INT; - token->flags |= CL_HEX; + token->is_hex = true; CL_Advance(T); while (CL_IsHexNumeric(*T->stream)) { CL_Advance(T); @@ -1497,244 +1126,112 @@ CL_API_FUNCTION void CL_DefaultTokenize(CL_LexResult *T, CL_Token *token) { } break; default: { - CL_Message *result = CL_PushStruct(T->arena->other, CL_Message); - result->kind = CLM_WARNING; - result->string = (char *)"Unhandled character, skipping ..."; - CL_SLL_QUEUE_ADD(T->first_message, T->last_message, result); - result->token = *token; - token->kind = CL_COMMENT; + CL_ReportError(T, token, "Unhandled character, skipping ..."); } break; } error_end_path:; } -CL_API_FUNCTION bool CL_IsComment(CL_Kind kind) { - bool result = kind == CL_COMMENT && kind != CL_EOF; - return result; +CL_API_FUNCTION bool CL_EatWhitespace(CL_Lexer *T) { + bool skipped = false; + for (;;) { + if (CL_IsWhitespace(*T->stream)) { + if (*T->stream == '\n') T->inside_of_macro = false; + CL_Advance(T); + skipped = true; + } + else if (T->stream[0] == '\\' && T->stream[1] == '\n') { + CL_Advance(T); + CL_Advance(T); + skipped = true; + } + else if (T->stream[0] == '\\' && T->stream[1] == '\r' && T->stream[2] == '\n') { + CL_Advance(T); + CL_Advance(T); + CL_Advance(T); + skipped = true; + } + else { + break; + } + } + return skipped; } -CL_API_FUNCTION void CL_InitNextToken(CL_LexResult *T, CL_Token *token) { +CL_API_FUNCTION void CL_TryToFinalizeToken(CL_Lexer *T, CL_Token *token) { + if (!token->len) { + CL_SetTokenLength(T, token); + } + if (T->inside_of_macro) { + token->is_inside_macro = true; + } +} + +CL_API_FUNCTION void CL_InitNextToken(CL_Lexer *T, CL_Token *token) { // Skip comments, comments get allocated on perm and gathered on the Tokenizer. // First non comment token gets those comments attached. - do { + for (;;) { bool skipped = CL_EatWhitespace(T); CL_PrepareToken(T, token, skipped); CL_DefaultTokenize(T, token); - } while (CL_IsComment(token->kind)); + + if (token->kind == CL_EOF) { + break; + } + + if (T->select_includes) { + if (token->kind != CL_PREPROC_INCLUDE) continue; + } + + if (T->select_macros) { + if (!token->is_inside_macro) continue; + } + + if (T->select_comments) { + if (token->kind != CL_COMMENT) continue; + } + + if (T->skip_comments) { + if (token->kind == CL_COMMENT) continue; + } + + if (T->skip_macros) { + if (token->is_inside_macro) continue; + } + + break; + } CL_TryToFinalizeToken(T, token); } -CL_API_FUNCTION void CL_AddToken(CL_LexResult *T, CL_Token *token) { - if (!T->tokens.data) T->tokens.data = token; - T->tokens.count += 1; -} - -CL_API_FUNCTION void CL_AddTokenEx(CL_Arena *arena, CL_Tokens *tokens, CL_Token *token_to_add) { - if (token_to_add->kind != CL_EOF) { - CL_Token *token = CL_PushStruct(arena, CL_Token); - *token = *token_to_add; - if (!tokens->data) tokens->data = token; - tokens->count += 1; - } -} - -CL_API_FUNCTION void CL_AddTokenList(CL_Arena *arena, CL_Tokens *main, CL_Tokens *tokens_to_add) { - for (int i = 0; i < tokens_to_add->count; i += 1) { - CL_Token *it = tokens_to_add->data + i; - CL_AddTokenEx(arena, main, it); - } -} - -CL_API_FUNCTION CL_Token *CL_AddNextToken(CL_LexResult *T) { - CL_Token *token = CL_PushStruct(T->arena->token, CL_Token); - CL_InitNextToken(T, token); - CL_AddToken(T, token); - return token; -} - -CL_API_FUNCTION void CL_LexStringEx(CL_LexResult *result) { - CL_Token *token; - do { - token = CL_AddNextToken(result); - } while (token->kind != CL_EOF); -} - -CL_API_FUNCTION CL_LexResult *CL_LexString(CL_ArenaTuple *arena, char *filename, char *string) { - CL_LexResult *result = CL_CreateLexingResult(arena, filename, string); - CL_LexStringEx(result); +CL_API_FUNCTION CL_Token CL_Next(CL_Lexer *T) { + CL_Token result = {0}; + CL_InitNextToken(T, &result); return result; } -CL_API_FUNCTION CL_LexResult *CL_LexFile(CL_ArenaTuple *arena, char *filename) { - char *file = CL_ReadFile(arena->other, filename); - CL_LexResult *result = 0; - if (file) { - result = CL_LexString(arena, filename, file); - } - return result; +CL_API_FUNCTION CL_Lexer CL_Begin(CL_Allocator arena, char *stream, char *filename) { + CL_Lexer lexer = {0}; + lexer.stream = lexer.stream_begin = stream; + lexer.file = filename; + lexer.arena = arena; + lexer.skip_comments = true; + return lexer; } -CL_API_FUNCTION void CL_AddLexResult(CL_LexList *list, CL_LexResult *result) { - if (result == 0) return; - CL_SLL_QUEUE_ADD_MOD(list->first_result, list->last_result, result, next_result); - list->count += 1; +// +// +// + +CL_PRIVATE_FUNCTION char *CL_PushStringCopy(CL_Allocator arena, char *p, int size) { + char *copy_buffer = (char *)CL_Allocate(arena, size + 1); + CL__MemoryCopy(copy_buffer, p, size); + copy_buffer[size] = 0; + return copy_buffer; } -CL_API_FUNCTION CL_LexList CL_MakeLexList(CL_LexResult *l) { - CL_LexList result = CL_ZeroStruct(); - CL_AddLexResult(&result, l); - return result; -} - -CL_PRIVATE_FUNCTION void CL__SetIncludeToken(CL_IncludeIter *iter, CL_Token *token) { - if (token) { - iter->include_token = token; - iter->filename = token->str; - iter->is_system_include = token->flags & CL_SYSTEM_INCLUDE; - } - else { - iter->include_token = 0; - iter->filename = 0; - iter->is_system_include = 0; - } -} - -CL_API_FUNCTION void CL_GetNextInclude(CL_IncludeIter *iter) { - if (iter->inited_with_filename) { - iter->parent = iter->lex_list->first_result; - iter->inited_with_filename = false; - } - - for (; iter->parent;) { - iter->include_index += 1; - if (iter->include_index >= iter->parent->includes.count) { - iter->parent = iter->parent->next_result; - CL__SetIncludeToken(iter, 0); - iter->include_index = -1; - continue; - } - - CL_Token *it = iter->parent->includes.data + iter->include_index; - CL__SetIncludeToken(iter, it); - - if (iter->resolve) { - char *filename = CL_ResolveFilepath(iter->arena, &iter->search_paths, iter->filename, iter->parent->file, iter->is_system_include); - if (CL_IsValidFile(iter->lex_list, filename)) { - iter->filename = filename; - } - else { - CL__SetIncludeToken(iter, 0); - continue; - } - } - - return; - } -} - -CL_API_FUNCTION CL_IncludeIter CL_IterateFileAndResolvedIncludes(CL_ArenaTuple *arena, char *filename, CL_SearchPaths search_paths) { - CL_IncludeIter result; - CL__MemoryZero(&result, sizeof(CL_IncludeIter)); - result.lex_list = CL_PushStruct(arena->other, CL_LexList); - if (CL_FileExists(filename)) { - result.inited_with_filename = true; - result.filename = filename; - } - result.include_index = -1; - result.resolve = true; - result.search_paths = search_paths; - result.arena = arena->other; - return result; -} - -CL_API_FUNCTION CL_IncludeIter CL_IterateIncludes(CL_LexList *list) { - CL_IncludeIter result; - CL__MemoryZero(&result, sizeof(CL_IncludeIter)); - result.lex_list = list; - result.parent = list->first_result; - result.include_index = -1; - CL_GetNextInclude(&result); - return result; -} - -CL_API_FUNCTION CL_IncludeIter CL_IterateResolvedIncludes(CL_Arena *arena, CL_LexList *list, CL_SearchPaths search_paths) { - CL_IncludeIter result; - CL__MemoryZero(&result, sizeof(CL_IncludeIter)); - result.lex_list = list; - result.parent = list->first_result; - result.include_index = -1; - result.resolve = true; - result.search_paths = search_paths; - result.arena = arena; - CL_GetNextInclude(&result); - return result; -} - -#define CL_IS_POW2(x) (((x) & ((x)-1)) == 0) -#define CL_WRAP_AROUND_POWER_OF_2(x, pow2) (((x) & ((pow2)-1llu))) - -CL_API_FUNCTION void CL_InitInternTable(CL_Arena *arena, CL_InternTable *table, int size) { - CL_ASSERT(CL_IS_POW2(size)); - table->arena = arena; - table->entries = CL_PushArray(arena, CL_InternEntry, size); - table->entry_count = size; - table->occupied_entry_count = 0; -} - -CL_API_FUNCTION CL_InternTable *CL_CreateInternTable(CL_Arena *arena, int size) { - CL_InternTable *result = CL_PushStruct(arena, CL_InternTable); - CL_InitInternTable(arena, result, size); - return result; -} - -CL_API_FUNCTION CL_Intern *CL_InsertIntern(CL_InternTable *table, char *string, int len) { - CL_ASSERT(table->arena); - uint64_t hash = CL__HASH_BYTES(string, len); - if (hash == 0) hash += 1; - - uint64_t index = CL_WRAP_AROUND_POWER_OF_2(hash, table->entry_count); - CL_InternEntry *it = table->entries + index; - for (;;) { - if (it->hash == 0) { - it->string = CL_PushStringCopy(table->arena, string, len); - it->len = len; - it->hash = hash; - table->occupied_entry_count += 1; - return it->string; - } - else if (CL_StringsAreEqual(string, len, it->string, it->len)) { - return it->string; - } - - if (!it->next) { - it->next = CL_PushStruct(table->arena, CL_InternEntry); - } - it = it->next; - } -} - -CL_API_FUNCTION void CL_InternResult(CL_InternTable *table, CL_LexResult *result) { - for (int i = 0; i < result->tokens.count; i += 1) { - CL_Token *it = result->tokens.data + i; - if (it->kind == CL_IDENTIFIER) { - it->intern = CL_InsertIntern(table, it->str, it->len); - } - } -} - -CL_API_FUNCTION void CL_InternListEx(CL_InternTable *table, CL_LexList *list) { - for (CL_LexResult *it = list->first_result; it; it = it->next_result) { - CL_InternResult(table, it); - } -} - -CL_API_FUNCTION void CL_InternList(CL_Arena *arena, CL_LexList *list) { - list->intern_table = CL_CreateInternTable(arena, 4096); - CL_InternListEx(list->intern_table, list); -} - -CL_PRIVATE_FUNCTION char *CL_ChopLastSlash(CL_Arena *arena, char *str) { +CL_PRIVATE_FUNCTION char *CL_ChopLastSlash(CL_Allocator arena, char *str) { int i = 0; int slash_pos = -1; while (str[i]) { @@ -1754,13 +1251,13 @@ CL_PRIVATE_FUNCTION char *CL_ChopLastSlash(CL_Arena *arena, char *str) { return result; } -CL_PRIVATE_FUNCTION char *CL_JoinPath(CL_Arena *arena, char *a, char *b) { +CL_PRIVATE_FUNCTION char *CL_JoinPath(CL_Allocator arena, char *a, char *b) { int alen = CL_StringLength(a); int blen = CL_StringLength(b); int additional_len = 0; if (alen && a[alen - 1] != '/') additional_len = 1; - char *result = CL_PushArray(arena, char, alen + blen + 1 + additional_len); + char *result = (char *)CL_Allocate(arena, sizeof(char) * (alen + blen + 1 + additional_len)); CL__MemoryCopy(result, a, alen); if (additional_len) result[alen++] = '/'; CL__MemoryCopy(result + alen, b, blen); @@ -1777,7 +1274,7 @@ CL_PRIVATE_FUNCTION bool CL_IsAbsolutePath(char *path) { return result; } -char *CL_SkipToLastSlash(char *p) { +CL_PRIVATE_FUNCTION char *CL_SkipToLastSlash(char *p) { int last_slash = 0; for (int i = 0; p[i]; i += 1) { if (p[i] == '/') last_slash = i; @@ -1785,8 +1282,8 @@ char *CL_SkipToLastSlash(char *p) { return p + last_slash; } -CL_API_FUNCTION char *CL_ResolveFilepath(CL_Arena *arena, CL_SearchPaths *search_paths, char *filename, char *parent_file, bool is_system_include) { - CL_SearchPaths null_search_paths = CL_ZeroStruct(); +CL_API_FUNCTION char *CL_ResolveFilepath(CL_Allocator arena, CL_SearchPaths *search_paths, char *filename, char *parent_file, bool is_system_include) { + CL_SearchPaths null_search_paths = {0}; if (search_paths == 0) search_paths = &null_search_paths; if (search_paths->file_begin_to_ignore) { @@ -1835,44 +1332,186 @@ CL_API_FUNCTION char *CL_ResolveFilepath(CL_Arena *arena, CL_SearchPaths *search return 0; } -CL_API_FUNCTION bool CL_IsValidFile(CL_LexList *list, char *filename) { - if (filename == 0) return false; - int filename_len = CL_StringLength(filename); - if (filename_len == 0) return false; +// +// +// - for (CL_LexResult *it = list->first_result; it; it = it->next_result) { - int file_len = CL_StringLength(it->file); - if (CL_StringsAreEqual(filename, filename_len, it->file, file_len)) { - return false; - } - } - return true; +const char *CL_FixString[] = { + "", + "SUFFIX_U", + "SUFFIX_UL", + "SUFFIX_ULL", + "SUFFIX_L", + "SUFFIX_LL", + "SUFFIX_F", + "SUFFIX_FL", + "PREFIX_U8", + "PREFIX_U16", + "PREFIX_U32", + "PREFIX_L", +}; + +const char *CL_KindString[] = { + "EOF", + "*", + "/", + "%", + "<<", + ">>", + "+", + "-", + "==", + "<", + ">", + "<=", + ">=", + "!=", + "&", + "|", + "^", + "&&", + "||", + "~", + "!", + "--", + "++", + "--", + "++", + "=", + "/=", + "*=", + "%=", + "-=", + "+=", + "&=", + "|=", + "^=", + "<<=", + ">>=", + "(", + ")", + "{", + "}", + "[", + "]", + ",", + "##", + "#Stringify", + "?", + "...", + ";", + ".", + ":", + "TAG", + "->", + "SIZEOF", + "DOCCOMMENT", + "COMMENT", + "IDENTIFIER", + "STRING_LITERAL", + "CHARACTER_LITERAL", + "ERROR TOKEN", + "FLOAT", + "INT", + "PREPROC_NULL", + "PREPROC_DEFINE", + "PREPROC_IFDEF", + "PREPROC_IFNDEF", + "PREPROC_INCLUDE", + "PREPROC_ENDIF", + "PREPROC_IF", + "PREPROC_PRAGMA", + "PREPROC_ERROR", + "PREPROC_ELSE", + "PREPROC_ELIF", + "PREPROC_UNDEF", + "KEYWORD_VOID", + "KEYWORD_INT", + "KEYWORD_CHAR", + "KEYWORD_UNSIGNED", + "KEYWORD_SIGNED", + "KEYWORD_LONG", + "KEYWORD_SHORT", + "KEYWORD_DOUBLE", + "KEYWORD_FLOAT", + "KEYWORD__BOOL", + "KEYWORD__COMPLEX", + "KEYWORD__IMAGINARY", + "KEYWORD_STATIC", + "KEYWORD_AUTO", + "KEYWORD_CONST", + "KEYWORD_EXTERN", + "KEYWORD_INLINE", + "KEYWORD_REGISTER", + "KEYWORD_RESTRICT", + "KEYWORD_VOLATILE", + "KEYWORD__THREAD_LOCAL", + "KEYWORD__ATOMIC", + "KEYWORD__NORETURN", + "KEYWORD_STRUCT", + "KEYWORD_UNION", + "KEYWORD_ENUM", + "KEYWORD_TYPEDEF", + "KEYWORD_DEFAULT", + "KEYWORD_BREAK", + "KEYWORD_RETURN", + "KEYWORD_SWITCH", + "KEYWORD_IF", + "KEYWORD_ELSE", + "KEYWORD_FOR", + "KEYWORD_WHILE", + "KEYWORD_CASE", + "KEYWORD_CONTINUE", + "KEYWORD_DO", + "KEYWORD_GOTO", + "KEYWORD_SIZEOF", + "KEYWORD__ALIGNAS", + "KEYWORD__ALIGNOF", + "KEYWORD__STATIC_ASSERT", + "KEYWORD__GENERIC", +}; + +CL_API_FUNCTION void CL_StringifyMessage(char *buff, int buff_size, CL_Message *msg) { + CL_SNPRINTF(buff, buff_size, "%s:%d %15s", msg->token.file, msg->token.line, msg->string); } -CL_API_FUNCTION CL_LexResult *CL_GetFile(CL_LexList *list, char *name) { - for (CL_LexResult *it = list->first_result; it; it = it->next_result) { - if (CL_StringsAreEqual(it->file, CL_StringLength(it->file), name, CL_StringLength(name))) { - return it; - } - } - return 0; +CL_API_FUNCTION void CL_Stringify(char *buff, int buff_size, CL_Token *token) { + const char *token_kind = "UNKNOWN"; + if (token->kind < CL_COUNT) token_kind = CL_KindString[token->kind]; + CL_SNPRINTF(buff, buff_size, "%s:%d %15s %15.*s", token->file, token->line, token_kind, token->len, token->str); } -CL_API_FUNCTION void CL_InitDefaultTuple(CL_ArenaTuple *tuple) { - CL__MemoryZero(tuple, sizeof(CL_ArenaTuple)); - tuple->comment = &tuple->default_comment; - tuple->token = &tuple->default_token; - tuple->include = &tuple->default_include; - tuple->other = &tuple->default_other; -} +#define CL_SLL_QUEUE_ADD_MOD(f, l, n, next) \ + do { \ + (n)->next = 0; \ + if ((f) == 0) { \ + (f) = (l) = (n); \ + } \ + else { \ + (l) = (l)->next = (n); \ + } \ + } while (0) +#define CL_SLL_QUEUE_ADD(f, l, n) CL_SLL_QUEUE_ADD_MOD(f, l, n, next) -CL_API_FUNCTION CL_LexList CL_LexRecursive(CL_ArenaTuple *arena, char *filename, CL_SearchPaths paths) { - CL_LexResult *first_file = CL_LexFile(arena, filename); - CL_LexList result = CL_MakeLexList(first_file); - result.search_paths = paths; - for (CL_IncludeIter iter = CL_IterateResolvedIncludes(arena->other, &result, paths); iter.filename; CL_GetNextInclude(&iter)) { - CL_LexResult *file = CL_LexFile(arena, iter.filename); - CL_AddLexResult(&result, file); - } - return result; +#define CL__FORMAT(arena, string, result) \ + va_list args1, args2; \ + va_start(args1, string); \ + va_copy(args2, args1); \ + int len = CL_VSNPRINTF(0, 0, string, args2); \ + va_end(args2); \ + char *result = (char *)CL_Allocate((arena), len + 1); \ + CL_VSNPRINTF(result, len + 1, string, args1); \ + va_end(args1) + +CL_PRIVATE_FUNCTION void CL_ReportError(CL_Lexer *T, CL_Token *token, const char *string, ...) { + CL__FORMAT(T->arena, string, message_string); + CL_Message *result = (CL_Message *)CL_Allocate(T->arena, sizeof(CL_Message)); + CL_MemoryZero(result, sizeof(CL_Message)); + CL_SLL_QUEUE_ADD(T->first_message, T->last_message, result); + + result->string = (char *)string; + result->token = *token; + token->kind = CL_ERROR; + token->error = result; + T->errors += 1; } diff --git a/standalone_libraries/clexer.h b/standalone_libraries/clexer.h index 86bd945..a7b061f 100644 --- a/standalone_libraries/clexer.h +++ b/standalone_libraries/clexer.h @@ -3,14 +3,6 @@ #include #include -#ifndef CL_PRIVATE_FUNCTION - #if defined(__GNUC__) || defined(__clang__) - #define CL_PRIVATE_FUNCTION __attribute__((unused)) static - #else - #define CL_PRIVATE_FUNCTION static - #endif -#endif - #ifndef CL_API_FUNCTION #ifdef __cplusplus #define CL_API_FUNCTION extern "C" @@ -31,18 +23,9 @@ #endif #endif -#ifndef CL_Arena - #define CL_Arena CL__Arena -typedef struct CL__Arena { - char *buff; - int len, cap; -} CL_Arena; -CL_PRIVATE_FUNCTION void *CL_PushSize(CL_Arena *arena, int size); -#else - #define CL_CUSTOM_ARENA_TYPE - #ifndef CL_PushSize - #error If you use a custom Arena type, you need to implement CL_PushSize macro - #endif +#ifndef CL_Allocator +struct MA_Arena; + #define CL_Allocator MA_Arena * #endif #ifndef AND_CL_STRING_TERMINATE_ON_NEW_LINE @@ -185,118 +168,44 @@ typedef enum CL_Fix { CL_PREFIX_L, } CL_Fix; -typedef uint16_t CL_Flag; -enum { - CL_NONE, - CL_HEX = 1, - CL_DIGRAPH = 2, - CL_INSIDE_OF_MACRO = 4, - CL_SYSTEM_INCLUDE = 8, - CL_WHITESPACE_BEFORE_TOKEN = 16, -}; - -typedef struct CL_Hideset CL_Hideset; -struct CL_Hideset { - CL_Hideset *next; - char *name; -}; - -typedef struct CL_Token CL_Token; // 64 bytes +typedef struct CL_Token CL_Token; struct CL_Token { - // 16 bytes :( we want debug info etc. CL_Kind kind; - CL_Flag flags; CL_Fix fix; - // 8bytes + bool is_hex : 1; + bool is_inside_macro : 1; + bool is_system_include : 1; + bool is_there_whitespace_before_token : 1; + uint32_t id; int len; - char *str; // 8bytes + char *str; - // We dont store line_begin like I would normally cause the user could + // Not storing line_begin like I would normally cause the user could // override the line and file information using directives. // On error need to do search if I want nice error context. - int line, column; // 8bytes - char *file; // 8bytes - CL_Hideset *hideset; // 8bytes + int line, column; + char *file; - union { // 8bytes + union { double f64; uint64_t u64; char *intern; char *string_literal; struct CL_Message *error; - CL_Token *comment_is_attached_to_token; }; }; -typedef enum CL_MessageKind { - CLM_ERROR, - CLM_WARNING, - CLM_TRACE, -} CL_MessageKind; - typedef struct CL_Message CL_Message; struct CL_Message { CL_Message *next; - CL_MessageKind kind; char *string; CL_Token token; }; -typedef struct CL_Tokens CL_Tokens; -struct CL_Tokens { - CL_Token *data; - int count; -}; - -typedef char CL_Intern; -typedef struct CL_InternEntry CL_InternEntry; -struct CL_InternEntry { - CL_InternEntry *next; - char *string; - int len; - uint64_t hash; -}; - -typedef struct CL_InternTable CL_InternTable; -struct CL_InternTable { - CL_InternEntry *entries; - int entry_count; - int occupied_entry_count; - CL_Arena *arena; -}; - -typedef struct CL_ArenaTuple CL_ArenaTuple; -struct CL_ArenaTuple { - - // @todo: Add TokenList and TokenNode, get rid of 1 arena ? - CL_Arena *token; - CL_Arena *other; - union { - CL_Arena *include; - CL_Arena *macro_token; - }; - union { - CL_Arena *comment; - CL_Arena *scratch2; - }; - - CL_Arena default_comment; - CL_Arena default_token; - CL_Arena default_include; - CL_Arena default_other; -}; - -typedef struct CL_LexResult CL_LexResult; -struct CL_LexResult { - CL_LexResult *next_result; - - CL_Tokens tokens; - CL_Tokens includes; - CL_Tokens comments; - int attached_comment_index; - +typedef struct CL_Lexer CL_Lexer; +struct CL_Lexer { CL_Message *first_message; CL_Message *last_message; int errors; @@ -308,7 +217,14 @@ struct CL_LexResult { char *file; bool inside_of_macro; - CL_ArenaTuple *arena; + // filters + bool skip_comments : 1; + bool skip_macros : 1; + bool select_includes : 1; + bool select_comments : 1; + bool select_macros : 1; + + CL_Allocator arena; }; typedef struct CL_SearchPaths CL_SearchPaths; @@ -322,88 +238,24 @@ struct CL_SearchPaths { char *file_begin_to_ignore; }; -typedef struct CL_LexList CL_LexList; -struct CL_LexList { - int count; - CL_LexResult *first_result; - CL_LexResult *last_result; - CL_InternTable *intern_table; - CL_SearchPaths search_paths; -}; +CL_API_FUNCTION CL_Token CL_Next(CL_Lexer *T); +CL_API_FUNCTION CL_Lexer CL_Begin(CL_Allocator arena, char *stream, char *filename); +CL_API_FUNCTION char *CL_ResolveFilepath(CL_Allocator arena, CL_SearchPaths *search_paths, char *filename, char *parent_file, bool is_system_include); -typedef struct CL_IncludeIter CL_IncludeIter; -struct CL_IncludeIter { - char *filename; - bool is_system_include; - bool inited_with_filename; - - CL_Token *include_token; - - int include_index; - CL_LexResult *parent; - CL_LexList *lex_list; - - CL_Arena *arena; - CL_SearchPaths search_paths; - bool resolve; -}; - -// -// Main API -// -CL_API_FUNCTION void CL_InitDefaultTuple(CL_ArenaTuple *tuple); -CL_API_FUNCTION CL_LexResult *CL_LexString(CL_ArenaTuple *arena, char *filename, char *string); -CL_API_FUNCTION CL_LexResult *CL_LexFile(CL_ArenaTuple *arena, char *filename); -CL_API_FUNCTION CL_LexList CL_LexRecursive(CL_ArenaTuple *arena, char *filename, CL_SearchPaths paths); - -// -// Intern table -// -CL_API_FUNCTION void CL_InitInternTable(CL_Arena *arena, CL_InternTable *table, int size); -CL_API_FUNCTION CL_InternTable *CL_CreateInternTable(CL_Arena *arena, int size); -CL_API_FUNCTION CL_Intern *CL_InsertIntern(CL_InternTable *table, char *string, int len); -CL_API_FUNCTION void CL_InternResult(CL_InternTable *table, CL_LexResult *result); - -// -// Include iteration and path resolution -// -CL_API_FUNCTION CL_IncludeIter CL_IterateIncludes(CL_LexList *list); -CL_API_FUNCTION CL_IncludeIter CL_IterateResolvedIncludes(CL_Arena *arena, CL_LexList *list, CL_SearchPaths search_paths); -CL_API_FUNCTION char *CL_ResolveFilepath(CL_Arena *arena, CL_SearchPaths *search_paths, char *filename, char *parent_file, bool is_system_include); -CL_API_FUNCTION bool CL_IsValidFile(CL_LexList *list, char *filename); -CL_API_FUNCTION void CL_GetNextInclude(CL_IncludeIter *iter); - -// Token serialization CL_API_FUNCTION void CL_StringifyMessage(char *buff, int buff_size, CL_Message *msg); -CL_API_FUNCTION void CL_PrintMessages(CL_LexResult *lex_result); CL_API_FUNCTION void CL_Stringify(char *buff, int buff_size, CL_Token *token); -CL_API_FUNCTION void CL_PrintTokens(CL_Tokens tokens); -// -// Extended API for "manual" lexing with extended help -// -CL_API_FUNCTION void CL_ReportError(CL_LexResult *T, CL_Token *token, const char *string, ...); -CL_API_FUNCTION bool CL_EatWhitespace(CL_LexResult *T); -CL_API_FUNCTION void CL_SetTokenLength(CL_LexResult *T, CL_Token *token); -CL_API_FUNCTION void CL_TryToFinalizeToken(CL_LexResult *T, CL_Token *token); -CL_API_FUNCTION void CL_ParseCharLiteral(CL_LexResult *T, CL_Token *token); -CL_API_FUNCTION void CL_ParseString(CL_LexResult *T, CL_Token *token); -CL_API_FUNCTION void CL_IsIdentifierKeyword(CL_LexResult *ctx, CL_Token *token); -CL_API_FUNCTION void CL_LexMacroInclude(CL_LexResult *T, CL_Token *token); -CL_API_FUNCTION bool CL_LexMacro(CL_LexResult *T, CL_Token *token); -CL_API_FUNCTION CL_LexResult *CL_CreateLexingResult(CL_ArenaTuple *arena, char *filename, char *filecontent); -CL_API_FUNCTION void CL_PrepareToken(CL_LexResult *T, CL_Token *token, bool skipped_whitespace); -CL_API_FUNCTION void CL_DefaultTokenize(CL_LexResult *T, CL_Token *token); -CL_API_FUNCTION bool CL_IsComment(CL_Kind kind); -CL_API_FUNCTION void CL_InitNextToken(CL_LexResult *T, CL_Token *token); -CL_API_FUNCTION CL_Hideset *CL_CreateHideset(CL_Arena *arena, char *name); -CL_API_FUNCTION CL_Token *CL_AddNextToken(CL_LexResult *T); -CL_API_FUNCTION void CL_AddToken(CL_LexResult *T, CL_Token *token); -CL_API_FUNCTION CL_LexList CL_MakeLexList(CL_LexResult *l); -CL_API_FUNCTION CL_IncludeIter CL_IterateFileAndResolvedIncludes(CL_ArenaTuple *arena, char *filename, CL_SearchPaths search_paths); +CL_API_FUNCTION void CL_SetTokenLength(CL_Lexer *T, CL_Token *token); +CL_API_FUNCTION void CL_ParseCharLiteral(CL_Lexer *T, CL_Token *token); +CL_API_FUNCTION void CL_ParseString(CL_Lexer *T, CL_Token *token); +CL_API_FUNCTION void CL_IsIdentifierKeyword(CL_Token *token); +CL_API_FUNCTION void CL_LexMacroInclude(CL_Lexer *T, CL_Token *token); +CL_API_FUNCTION bool CL_LexMacro(CL_Lexer *T, CL_Token *token); +CL_API_FUNCTION void CL_PrepareToken(CL_Lexer *T, CL_Token *token, bool skipped_space); +CL_API_FUNCTION void CL_DefaultTokenize(CL_Lexer *T, CL_Token *token); +CL_API_FUNCTION bool CL_EatWhitespace(CL_Lexer *T); +CL_API_FUNCTION void CL_TryToFinalizeToken(CL_Lexer *T, CL_Token *token); +CL_API_FUNCTION void CL_InitNextToken(CL_Lexer *T, CL_Token *token); -// -// Token iteration and utilities -// CL_INLINE int CL_StringLength(char *string) { int len = 0; while (*string++ != 0) len++; @@ -440,16 +292,12 @@ CL_INLINE bool CL_IsKeywordTypeOrSpec(CL_Kind op) { } CL_INLINE bool CL_IsMacro(CL_Kind kind) { - /*print(f"bool result = kind >= CL_PREPROC_{meta.preproc_keywords[0].upper()} && kind <= CL_PREPROC_{meta.preproc_keywords[-1].upper()};")*/ bool result = kind >= CL_PREPROC_DEFINE && kind <= CL_PREPROC_UNDEF; - /*END*/ return result; } CL_INLINE bool CL_IsKeyword(CL_Kind kind) { - /*#print(f"bool result = kind >= CL_KEYWORD_{meta.keywords[0].upper()} && kind <= CL_KEYWORD_{meta.keywords[-1].upper()};")*/ bool result = kind >= CL_KEYWORD_VOID && kind <= CL_KEYWORD__GENERIC; - /*END*/ return result; } @@ -457,39 +305,3 @@ CL_INLINE bool CL_IsKeywordOrIdent(CL_Kind kind) { bool result = CL_IsKeyword(kind) || kind == CL_IDENTIFIER; return result; } - -CL_Token CL_NullToken; -CL_INLINE CL_Token *CL_Next(CL_Tokens *tokens) { - if (tokens->count > 0) { - CL_Token *result = tokens->data; - tokens->data += 1; - tokens->count -= 1; - return result; - } - return &CL_NullToken; -} - -CL_INLINE CL_Token *CL_Get(CL_Tokens *tokens) { - if (tokens->count > 0) { - return tokens->data; - } - return &CL_NullToken; -} - -CL_INLINE CL_Token *CL_Match(CL_Tokens *tokens, CL_Kind kind) { - CL_Token *result = CL_Get(tokens); - if (result->kind == kind) { - CL_Token *next = CL_Next(tokens); - return next; - } - return 0; -} - -CL_INLINE CL_Token *CL_MatchIdentifier(CL_Tokens *tokens, char *str) { - CL_Token *result = CL_Get(tokens); - if (CL_IsIdentifier(result, str)) { - CL_Token *next = CL_Next(tokens); - return next; - } - return 0; -} diff --git a/tests/1test_string.cpp b/tests/1test_string.cpp index 459f3a4..da6a77f 100644 --- a/tests/1test_string.cpp +++ b/tests/1test_string.cpp @@ -1,14 +1,42 @@ #include "../core_library/core.c" -int main() { - S8_String s = "mrówka"; +#define CL_Arena MA_Arena +#define CL_PushSize MA_PushSizeNonZeroed +#define CL_ASSERT IO_Assert +#define CL_VSNPRINTF stbsp_vsnprintf +#define CL_SNPRINTF stbsp_snprintf +#include "../standalone_libraries/clexer.c" - bool found_two_byte = false; - For(s) { - if (it.utf8_codepoint_byte_size == 2) { - found_two_byte = true; - IO_Assert(it.i == 2); +int main() { + // Unicode iteration over codepoints + { + S8_String s = "mrówka"; + + bool found_two_byte = false; + For(s) { + if (it.utf8_codepoint_byte_size == 2) { + found_two_byte = true; + IO_Assert(it.i == 2); + } + } + IO_Assert(found_two_byte); + } + + { + MA_Scratch scratch; + + S8_String filename = "../standalone_libraries/clexer.c"; + S8_String file = OS_ReadFile(scratch, filename); + CL_Lexer lexer = CL_Begin(scratch, file.str, filename.str); + + char buff[1024]; + for (;;) { + CL_Token token = CL_Next(&lexer); + if (token.kind == CL_EOF) break; + if (token.kind != CL_PREPROC_INCLUDE) continue; + + CL_Stringify(buff, sizeof(buff), &token); + IO_Printf("%s\n", buff); } } - IO_Assert(found_two_byte); } \ No newline at end of file