global U32 question_mark32 = '?'; global U16 question_mark16 = 0x003f; global U8 question_mark8 = '?'; struct String32 { U32 *str; S64 len; }; struct UTF32_Result { U32 out_str; S64 advance; B32 error; }; CORE_Static UTF32_Result utf8_to_utf32(U8 *c, S64 max_advance) { UTF32_Result result = {}; if ((c[0] & 0b10000000) == 0) { // Check if leftmost zero of first byte is unset if (max_advance >= 1) { result.out_str = c[0]; result.advance = 1; } else result.error = 1; } else if ((c[0] & 0b11100000) == 0b11000000) { if ((c[1] & 0b11000000) == 0b10000000) { // Continuation byte required if (max_advance >= 2) { result.out_str = (U32)(c[0] & 0b00011111) << 6u | (c[1] & 0b00111111); result.advance = 2; } else result.error = 2; } else result.error = 2; } else if ((c[0] & 0b11110000) == 0b11100000) { if ((c[1] & 0b11000000) == 0b10000000 && (c[2] & 0b11000000) == 0b10000000) { // Two continuation bytes required if (max_advance >= 3) { result.out_str = (U32)(c[0] & 0b00001111) << 12u | (U32)(c[1] & 0b00111111) << 6u | (c[2] & 0b00111111); result.advance = 3; } else result.error = 3; } else result.error = 3; } else if ((c[0] & 0b11111000) == 0b11110000) { if ((c[1] & 0b11000000) == 0b10000000 && (c[2] & 0b11000000) == 0b10000000 && (c[3] & 0b11000000) == 0b10000000) { // Three continuation bytes required if (max_advance >= 4) { result.out_str = (U32)(c[0] & 0b00001111) << 18u | (U32)(c[1] & 0b00111111) << 12u | (U32)(c[2] & 0b00111111) << 6u | (U32)(c[3] & 0b00111111); result.advance = 4; } else result.error = 4; } else result.error = 4; } else result.error = 4; return result; } struct String16 { U16 *str; S64 len; }; struct UTF16_Result { U16 out_str[2]; S32 len; B32 error; }; CORE_Static UTF16_Result utf32_to_utf16(U32 codepoint) { UTF16_Result result = {}; if (codepoint < 0x10000) { result.out_str[0] = (U16)codepoint; result.out_str[1] = 0; result.len = 1; } else if (codepoint <= 0x10FFFF) { U32 code = (codepoint - 0x10000); result.out_str[0] = (U16)(0xD800 | (code >> 10)); result.out_str[1] = (U16)(0xDC00 | (code & 0x3FF)); result.len = 2; } else { result.error = 1; } return result; } struct UTF8_Result { U8 out_str[4]; S32 len; B32 error; }; CORE_Static UTF8_Result utf32_to_utf8(U32 codepoint) { UTF8_Result result = {}; if (codepoint <= 0x7F) { result.len = 1; result.out_str[0] = (U8)codepoint; } else if (codepoint <= 0x7FF) { result.len = 2; result.out_str[0] = 0b11000000 | (0b00011111 & (codepoint >> 6)); result.out_str[1] = 0b10000000 | (0b00111111 & codepoint); } else if (codepoint <= 0xFFFF) { // 16 bit word result.len = 3; result.out_str[0] = 0b11100000 | (0b00001111 & (codepoint >> 12)); // 4 bits result.out_str[1] = 0b10000000 | (0b00111111 & (codepoint >> 6)); // 6 bits result.out_str[2] = 0b10000000 | (0b00111111 & codepoint); // 6 bits } else if (codepoint <= 0x10FFFF) { // 21 bit word result.len = 4; result.out_str[0] = 0b11110000 | (0b00000111 & (codepoint >> 18)); // 3 bits result.out_str[1] = 0b10000000 | (0b00111111 & (codepoint >> 12)); // 6 bits result.out_str[2] = 0b10000000 | (0b00111111 & (codepoint >> 6)); // 6 bits result.out_str[3] = 0b10000000 | (0b00111111 & codepoint); // 6 bits } else { result.error = true; } return result; } CORE_Static UTF32_Result utf16_to_utf32(U16 *c, S32 max_advance) { UTF32_Result result = {}; if (max_advance >= 1) { result.advance = 1; result.out_str = c[0]; if (c[0] >= 0xD800 && c[0] <= 0xDBFF && c[1] >= 0xDC00 && c[1] <= 0xDFFF) { if (max_advance >= 2) { result.out_str = 0x10000; result.out_str += (U32)(c[0] & 0x03FF) << 10u | (c[1] & 0x03FF); result.advance = 2; } else result.error = 2; } } else result.error = 1; return result; } #define unicode_error(question_mark) \ { \ result.str[result.len++] = question_mark; \ break; \ } CORE_Static String32 string16_to_string32(Allocator *allocator, String16 string) { String32 result = {allocate_array(allocator, U32, string.len + 1)}; for (S64 i = 0; i < string.len;) { UTF32_Result decode = utf16_to_utf32(string.str + i, (S32)(string.len - i)); if (!decode.error) { i += decode.advance; result.str[result.len++] = decode.out_str; } else unicode_error(question_mark32); } result.str[result.len] = 0; return result; } CORE_Static String32 string8_to_string32(Allocator *allocator, String string) { String32 result = {allocate_array(allocator, U32, string.len + 1)}; for (S64 i = 0; i < string.len;) { UTF32_Result decode = utf8_to_utf32(string.str + i, string.len - i); if (!decode.error) { i += decode.advance; result.str[result.len++] = decode.out_str; } else unicode_error(question_mark32); } result.str[result.len] = 0; return result; } CORE_Static String16 string8_to_string16(Allocator *allocator, String in) { String16 result = {allocate_array(allocator, U16, (in.len * 2) + 1)}; // @Note(Krzosa): Should be more then enough space for (S64 i = 0; i < in.len;) { UTF32_Result decode = utf8_to_utf32(in.str + i, in.len - i); if (!decode.error) { i += decode.advance; UTF16_Result encode = utf32_to_utf16(decode.out_str); if (!encode.error) { for (S32 j = 0; j < encode.len; j++) { result.str[result.len++] = encode.out_str[j]; } } else unicode_error(question_mark16); } else unicode_error(question_mark16); } result.str[result.len] = 0; return result; } CORE_Static String string16_to_string8(Allocator *allocator, String16 in) { String result = {allocate_array(allocator, U8, in.len * 4 + 1)}; for (S64 i = 0; i < in.len;) { UTF32_Result decode = utf16_to_utf32(in.str + i, (S32)(in.len - i)); if (!decode.error) { i += decode.advance; UTF8_Result encode = utf32_to_utf8(decode.out_str); if (!encode.error) { for (S32 j = 0; j < encode.len; j++) result.str[result.len++] = encode.out_str[j]; } else unicode_error(question_mark8); } else unicode_error(question_mark8); } result.str[result.len] = 0; return result; } CORE_Static B32 string_compare(String16 a, String16 b) { if (a.len != b.len) return false; for (S64 i = 0; i < a.len; i++) { if (a.str[i] != b.str[i]) return false; } return true; } CORE_Static B32 string_compare(String32 a, String32 b) { if (a.len != b.len) return false; for (S64 i = 0; i < a.len; i++) { if (a.str[i] != b.str[i]) return false; } return true; } CORE_Static S64 widechar_len(wchar_t *string) { S64 len = 0; while (*string++ != 0) len++; return len; } CORE_Static String16 string16_from_widechar(wchar_t *string) { String16 result; result.str = (U16 *)string; result.len = widechar_len(string); return result; } CORE_Static String string16_copy(Allocator *a, String string) { U8 *copy = allocate_array(a, U8, string.len + 1); memory_copy(copy, string.str, string.len); copy[string.len] = 0; return String{copy, string.len}; }