global U32 question_mark32 = '?'; global U16 question_mark16 = 0x003f; global U8 question_mark8 = '?'; struct String32{ U32 *str; S64 len; }; struct UTF32_Result{ U32 out_str; S64 advance; B32 error; }; CORE_Static UTF32_Result utf8_to_utf32(U8 *c, S64 max_advance) { UTF32_Result result = {}; if ((c[0] & 0b10000000) == 0) { // Check if leftmost zero of first byte is unset if(max_advance >= 1){ result.out_str = c[0]; result.advance = 1; } else result.error = 1; } else if ((c[0] & 0b11100000) == 0b11000000) { if ((c[1] & 0b11000000) == 0b10000000) { // Continuation byte required if(max_advance >= 2){ result.out_str = (U32)(c[0] & 0b00011111) << 6u | (c[1] & 0b00111111); result.advance = 2; } else result.error = 2; } else result.error = 2; } else if ((c[0] & 0b11110000) == 0b11100000) { if ((c[1] & 0b11000000) == 0b10000000 && (c[2] & 0b11000000) == 0b10000000) { // Two continuation bytes required if(max_advance >= 3){ result.out_str = (U32)(c[0] & 0b00001111) << 12u | (U32)(c[1] & 0b00111111) << 6u | (c[2] & 0b00111111); result.advance = 3; } else result.error = 3; } else result.error = 3; } else if ((c[0] & 0b11111000) == 0b11110000) { if ((c[1] & 0b11000000) == 0b10000000 && (c[2] & 0b11000000) == 0b10000000 && (c[3] & 0b11000000) == 0b10000000) { // Three continuation bytes required if(max_advance >= 4){ result.out_str = (U32)(c[0] & 0b00001111) << 18u | (U32)(c[1] & 0b00111111) << 12u | (U32)(c[2] & 0b00111111) << 6u | (U32)(c[3] & 0b00111111); result.advance = 4; } else result.error = 4; } else result.error = 4; } else result.error = 4; return result; } struct String16{ U16 *str; S64 len; }; struct UTF16_Result{ U16 out_str[2]; S32 len; B32 error; }; CORE_Static UTF16_Result utf32_to_utf16(U32 codepoint){ UTF16_Result result = {}; if (codepoint < 0x10000) { result.out_str[0] = (U16)codepoint; result.out_str[1] = 0; result.len = 1; } else if (codepoint <= 0x10FFFF) { U32 code = (codepoint - 0x10000); result.out_str[0] = (U16)(0xD800 | (code >> 10)); result.out_str[1] = (U16)(0xDC00 | (code & 0x3FF)); result.len = 2; } else{ result.error = 1; } return result; } struct UTF8_Result{ U8 out_str[4]; S32 len; B32 error; }; CORE_Static UTF8_Result utf32_to_utf8(U32 codepoint) { UTF8_Result result = {}; if (codepoint <= 0x7F) { result.len = 1; result.out_str[0] = (U8)codepoint; } else if (codepoint <= 0x7FF) { result.len= 2; result.out_str[0] = 0b11000000 | (0b00011111 & (codepoint >> 6)); result.out_str[1] = 0b10000000 | (0b00111111 & codepoint); } else if (codepoint <= 0xFFFF) { // 16 bit word result.len= 3; result.out_str[0] = 0b11100000 | (0b00001111 & (codepoint >> 12)); // 4 bits result.out_str[1] = 0b10000000 | (0b00111111 & (codepoint >> 6)); // 6 bits result.out_str[2] = 0b10000000 | (0b00111111 & codepoint); // 6 bits } else if (codepoint <= 0x10FFFF) { // 21 bit word result.len= 4; result.out_str[0] = 0b11110000 | (0b00000111 & (codepoint >> 18)); // 3 bits result.out_str[1] = 0b10000000 | (0b00111111 & (codepoint >> 12)); // 6 bits result.out_str[2] = 0b10000000 | (0b00111111 & (codepoint >> 6)); // 6 bits result.out_str[3] = 0b10000000 | (0b00111111 & codepoint); // 6 bits } else{ result.error = true; } return result; } CORE_Static UTF32_Result utf16_to_utf32(U16 *c, S32 max_advance) { UTF32_Result result = {}; if(max_advance >= 1){ result.advance = 1; result.out_str = c[0]; if (c[0] >= 0xD800 && c[0] <= 0xDBFF && c[1] >= 0xDC00 && c[1] <= 0xDFFF) { if(max_advance >= 2){ result.out_str = 0x10000; result.out_str += (U32)(c[0] & 0x03FF) << 10u | (c[1] & 0x03FF); result.advance = 2; } else result.error = 2; } } else result.error = 1; return result; } #define unicode_error(question_mark) \ { \ result.str[result.len++] = question_mark; \ break; \ } CORE_Static String32 string16_to_string32(Allocator *allocator, String16 string){ String32 result = {allocate_array(allocator, U32, string.len+1)}; for(S64 i = 0; i < string.len;){ UTF32_Result decode = utf16_to_utf32(string.str + i, (S32)(string.len - i)); if(!decode.error){ i += decode.advance; result.str[result.len++] = decode.out_str; } else unicode_error(question_mark32); } result.str[result.len] = 0; return result; } CORE_Static String32 string8_to_string32(Allocator *allocator, String string){ String32 result = {allocate_array(allocator, U32, string.len+1)}; for(S64 i = 0; i < string.len;){ UTF32_Result decode = utf8_to_utf32(string.str + i, string.len - i); if(!decode.error){ i += decode.advance; result.str[result.len++] = decode.out_str; } else unicode_error(question_mark32); } result.str[result.len] = 0; return result; } CORE_Static String16 string8_to_string16(Allocator *allocator, String in){ String16 result = {allocate_array(allocator, U16, (in.len*2)+1)}; // @Note(Krzosa): Should be more then enough space for(S64 i = 0; i < in.len;){ UTF32_Result decode = utf8_to_utf32(in.str + i, in.len - i); if(!decode.error){ i += decode.advance; UTF16_Result encode = utf32_to_utf16(decode.out_str); if(!encode.error){ for(S32 j = 0; j < encode.len; j++){ result.str[result.len++] = encode.out_str[j]; } } else unicode_error(question_mark16); } else unicode_error(question_mark16); } result.str[result.len] = 0; return result; } CORE_Static String string16_to_string8(Allocator *allocator, String16 in){ String result = {allocate_array(allocator, U8, in.len*4+1)}; for(S64 i = 0; i < in.len;){ UTF32_Result decode = utf16_to_utf32(in.str + i, (S32)(in.len - i)); if(!decode.error){ i += decode.advance; UTF8_Result encode = utf32_to_utf8(decode.out_str); if(!encode.error){ for(S32 j = 0; j < encode.len; j++) result.str[result.len++] = encode.out_str[j]; } else unicode_error(question_mark8); } else unicode_error(question_mark8); } result.str[result.len] = 0; return result; } CORE_Static B32 string_compare(String16 a, String16 b){ if(a.len != b.len) return false; for(S64 i = 0; i < a.len; i++){ if(a.str[i] != b.str[i]) return false; } return true; } CORE_Static B32 string_compare(String32 a, String32 b){ if(a.len != b.len) return false; for(S64 i = 0; i < a.len; i++){ if(a.str[i] != b.str[i]) return false; } return true; } CORE_Static S64 widechar_len(wchar_t *string){ S64 len = 0; while(*string++!=0)len++; return len; } CORE_Static String16 string16_from_widechar(wchar_t *string){ String16 result; result.str = (U16 *)string; result.len = widechar_len(string); return result; } CORE_Static String string16_copy(Allocator *a, String string){ U8 *copy = allocate_array(a, U8, string.len+1); memory_copy(copy, string.str, string.len); copy[string.len] = 0; return String{copy, string.len}; }