275 lines
7.4 KiB
C++
275 lines
7.4 KiB
C++
// Replacement characters written to the output in place of input that fails
// to convert, one per code-unit width. All three hold '?' (U+003F).
global U32 question_mark32 = '?';
global U16 question_mark16 = '?';
global U8  question_mark8  = '?';
|
|
|
|
struct String32{
|
|
U32 *str;
|
|
S64 len;
|
|
};
|
|
|
|
struct UTF32_Result{
|
|
U32 out_str;
|
|
S64 advance;
|
|
B32 error;
|
|
};
|
|
|
|
// Decodes a single codepoint from the UTF-8 bytes starting at c.
//
//   c           - first byte of the sequence to decode
//   max_advance - number of bytes that may be read starting at c
//
// On success: error == 0, out_str is the codepoint and advance is the number
// of bytes consumed (1 to 4).
// On failure: error is non-zero and out_str/advance stay 0. The error value is
// the sequence length announced by the lead byte (2, 3 or 4) when the sequence
// is truncated or has a bad continuation byte, 4 when the lead byte is not a
// valid UTF-8 lead byte, and 1 when there is no input at all.
//
// No byte at index >= max_advance is ever read.
//
// NOTE: only the lead/continuation bit patterns are validated. Overlong forms,
// surrogate codepoints and values above 0x10FFFF are not rejected here.
CORE_Static UTF32_Result
utf8_to_utf32(U8 *c, S64 max_advance) {
  UTF32_Result result = {};

  // Nothing may be read, not even the lead byte.
  if (max_advance < 1) {
    result.error = 1;
    return result;
  }

  U8 lead = c[0];

  if ((lead & 0b10000000) == 0) { // 0xxxxxxx: single byte
    result.out_str = lead;
    result.advance = 1;
  }

  else if ((lead & 0b11100000) == 0b11000000) { // 110xxxxx: one continuation byte required
    // Length is checked first so c[1] is only touched when it is in bounds.
    if (max_advance >= 2 &&
        (c[1] & 0b11000000) == 0b10000000) {
      result.out_str = (U32)(lead & 0b00011111) << 6u | (c[1] & 0b00111111);
      result.advance = 2;
    }
    else {
      result.error = 2;
    }
  }

  else if ((lead & 0b11110000) == 0b11100000) { // 1110xxxx: two continuation bytes required
    if (max_advance >= 3 &&
        (c[1] & 0b11000000) == 0b10000000 &&
        (c[2] & 0b11000000) == 0b10000000) {
      result.out_str = (U32)(lead & 0b00001111) << 12u | (U32)(c[1] & 0b00111111) << 6u | (c[2] & 0b00111111);
      result.advance = 3;
    }
    else {
      result.error = 3;
    }
  }

  else if ((lead & 0b11111000) == 0b11110000) { // 11110xxx: three continuation bytes required
    if (max_advance >= 4 &&
        (c[1] & 0b11000000) == 0b10000000 &&
        (c[2] & 0b11000000) == 0b10000000 &&
        (c[3] & 0b11000000) == 0b10000000) {
      // Bit 3 of the lead byte is known to be 0 here, so a 3-bit mask is enough.
      result.out_str = (U32)(lead & 0b00000111) << 18u | (U32)(c[1] & 0b00111111) << 12u | (U32)(c[2] & 0b00111111) << 6u | (U32)(c[3] & 0b00111111);
      result.advance = 4;
    }
    else {
      result.error = 4;
    }
  }

  else {
    // Stray continuation byte (10xxxxxx) or invalid lead byte (11111xxx).
    result.error = 4;
  }

  return result;
}
|
|
|
|
struct String16{
|
|
U16 *str;
|
|
S64 len;
|
|
};
|
|
|
|
struct UTF16_Result{
|
|
U16 out_str[2];
|
|
S32 len;
|
|
B32 error;
|
|
};
|
|
|
|
// Encodes one codepoint as UTF-16.
//
// Codepoints below 0x10000 become a single code unit, codepoints up to
// 0x10FFFF become a high/low surrogate pair, anything larger sets error.
// Values inside the surrogate range (0xD800-0xDFFF) are passed through as a
// single unit without validation.
CORE_Static UTF16_Result
utf32_to_utf16(U32 codepoint){
  UTF16_Result result = {};

  // Outside the Unicode range: nothing to encode.
  if (codepoint > 0x10FFFF) {
    result.error = 1;
    return result;
  }

  // Basic Multilingual Plane: the codepoint is its own code unit.
  if (codepoint < 0x10000) {
    result.len = 1;
    result.out_str[0] = (U16)codepoint;
    result.out_str[1] = 0;
    return result;
  }

  // Supplementary planes: split the 20-bit offset across two surrogates.
  U32 offset    = codepoint - 0x10000;
  U32 high_bits = offset >> 10;
  U32 low_bits  = offset & 0x3FF;
  result.len = 2;
  result.out_str[0] = (U16)(0xD800 | high_bits);
  result.out_str[1] = (U16)(0xDC00 | low_bits);
  return result;
}
|
|
|
|
struct UTF8_Result{
|
|
U8 out_str[4];
|
|
S32 len;
|
|
B32 error;
|
|
};
|
|
|
|
// Encodes one codepoint as UTF-8.
//
// Produces 1 to 4 bytes depending on the magnitude of the codepoint; values
// above 0x10FFFF set error and produce no bytes.
CORE_Static UTF8_Result
utf32_to_utf8(U32 codepoint) {
  UTF8_Result result = {};

  if (codepoint > 0x10FFFF) {
    result.error = true;
    return result;
  }

  // Continuation bytes (10xxxxxx) carry six payload bits each; cont0 holds the
  // lowest six bits, cont1 the next six, cont2 the six after that.
  U8 cont0 = (U8)(0x80 | ( codepoint        & 0x3F));
  U8 cont1 = (U8)(0x80 | ((codepoint >> 6)  & 0x3F));
  U8 cont2 = (U8)(0x80 | ((codepoint >> 12) & 0x3F));

  if (codepoint <= 0x7F) {
    // 7 bits: the byte is the codepoint.
    result.len = 1;
    result.out_str[0] = (U8)codepoint;
  }
  else if (codepoint <= 0x7FF) {
    // 11 bits: 110xxxxx 10xxxxxx
    result.len = 2;
    result.out_str[0] = (U8)(0xC0 | ((codepoint >> 6) & 0x1F));
    result.out_str[1] = cont0;
  }
  else if (codepoint <= 0xFFFF) {
    // 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
    result.len = 3;
    result.out_str[0] = (U8)(0xE0 | ((codepoint >> 12) & 0x0F));
    result.out_str[1] = cont1;
    result.out_str[2] = cont0;
  }
  else {
    // 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    result.len = 4;
    result.out_str[0] = (U8)(0xF0 | ((codepoint >> 18) & 0x07));
    result.out_str[1] = cont2;
    result.out_str[2] = cont1;
    result.out_str[3] = cont0;
  }

  return result;
}
|
|
|
|
// Decodes a single codepoint from the UTF-16 code units starting at c.
//
//   c           - first code unit to decode
//   max_advance - number of code units that may be read starting at c
//
// A high surrogate (0xD800-0xDBFF) immediately followed by a low surrogate
// (0xDC00-0xDFFF) is combined into one supplementary-plane codepoint and
// advance is 2. Every other unit, including an unpaired surrogate, is returned
// unchanged with advance 1. When max_advance < 1 nothing is read and error is
// set to 1.
//
// c[1] is only examined when max_advance >= 2, so a high surrogate that is the
// last unit of the input is treated as unpaired instead of reading past the
// end of the buffer.
CORE_Static UTF32_Result
utf16_to_utf32(U16 *c, S32 max_advance) {
  UTF32_Result result = {};

  if (max_advance < 1) {
    result.error = 1;
    return result;
  }

  // Default: the unit stands for itself.
  result.out_str = c[0];
  result.advance = 1;

  // The bounds check comes before the c[1] read.
  if (c[0] >= 0xD800 && c[0] <= 0xDBFF &&
      max_advance >= 2 &&
      c[1] >= 0xDC00 && c[1] <= 0xDFFF) {
    // Each surrogate contributes 10 bits of the offset from 0x10000.
    result.out_str = 0x10000 + ((U32)(c[0] & 0x03FF) << 10u | (c[1] & 0x03FF));
    result.advance = 2;
  }

  return result;
}
|
|
|
|
// Error path shared by the string converters below: appends the given
// replacement character to the caller's local `result` string and leaves the
// caller's conversion loop, so the output ends at the first bad sequence.
//
// This is deliberately a bare brace block and not do { } while (0): the
// `break` has to apply to the loop at the call site. It must therefore only
// be used directly inside that loop, with a local named `result` in scope.
#define unicode_error(question_mark)            \
  {                                             \
    result.str[result.len++] = (question_mark); \
    break;                                      \
  }
|
|
|
|
// Converts a UTF-16 string to a newly allocated UTF-32 string.
//
// The output buffer holds string.len + 1 elements, enough for the worst case
// of one codepoint per input unit plus the terminating 0 (which is not counted
// in len). If a unit fails to decode, '?' is appended and conversion stops.
CORE_Static String32
string16_to_string32(Allocator *allocator, String16 string){
  String32 result = {allocate_array(allocator, U32, string.len+1)};

  for(S64 at = 0; at < string.len;){
    UTF32_Result decoded = utf16_to_utf32(string.str + at, string.len - at);
    if(decoded.error) unicode_error(question_mark32);

    result.str[result.len++] = decoded.out_str;
    at += decoded.advance;
  }

  result.str[result.len] = 0;
  return result;
}
|
|
|
|
// Converts a UTF-8 string to a newly allocated UTF-32 string.
//
// The output buffer holds string.len + 1 elements, enough for the worst case
// of one codepoint per input byte plus the terminating 0 (which is not counted
// in len). If a sequence fails to decode, '?' is appended and conversion stops.
CORE_Static String32
string8_to_string32(Allocator *allocator, String string){
  String32 result = {allocate_array(allocator, U32, string.len+1)};

  for(S64 at = 0; at < string.len;){
    UTF32_Result decoded = utf8_to_utf32(string.str + at, string.len - at);
    if(decoded.error) unicode_error(question_mark32);

    result.str[result.len++] = decoded.out_str;
    at += decoded.advance;
  }

  result.str[result.len] = 0;
  return result;
}
|
|
|
|
// Converts a UTF-8 string to a newly allocated UTF-16 string.
//
// The result is 0-terminated; the terminator is not counted in len. If a
// sequence fails to decode, or the decoded codepoint cannot be encoded as
// UTF-16, '?' is appended and conversion stops.
CORE_Static String16
string8_to_string16(Allocator *allocator, String in){
  // @Note(Krzosa): Should be more than enough space
  String16 result = {allocate_array(allocator, U16, (in.len*2)+1)};

  for(S64 at = 0; at < in.len;){
    UTF32_Result decoded = utf8_to_utf32(in.str + at, in.len - at);
    if(decoded.error) unicode_error(question_mark16);
    at += decoded.advance;

    UTF16_Result encoded = utf32_to_utf16(decoded.out_str);
    if(encoded.error) unicode_error(question_mark16);

    for(S32 unit = 0; unit < encoded.len; unit++){
      result.str[result.len++] = encoded.out_str[unit];
    }
  }

  result.str[result.len] = 0;
  return result;
}
|
|
|
|
// Converts a UTF-16 string to a newly allocated UTF-8 string.
//
// The output buffer holds in.len*4 + 1 bytes: up to four bytes per input unit
// plus the terminating 0 (which is not counted in len). If a unit fails to
// decode, or the decoded codepoint cannot be encoded as UTF-8, '?' is appended
// and conversion stops.
CORE_Static String
string16_to_string8(Allocator *allocator, String16 in){
  String result = {allocate_array(allocator, U8, in.len*4+1)};

  for(S64 at = 0; at < in.len;){
    UTF32_Result decoded = utf16_to_utf32(in.str + at, in.len - at);
    if(decoded.error) unicode_error(question_mark8);
    at += decoded.advance;

    UTF8_Result encoded = utf32_to_utf8(decoded.out_str);
    if(encoded.error) unicode_error(question_mark8);

    for(S32 byte = 0; byte < encoded.len; byte++){
      result.str[result.len++] = encoded.out_str[byte];
    }
  }

  result.str[result.len] = 0;
  return result;
}
|
|
|
|
// Returns true when both UTF-16 strings have the same length and identical
// code units; the comparison is exact, unit by unit.
CORE_Static B32
string_compare(String16 a, String16 b){
  B32 equal = (a.len == b.len);
  // Stops at the first mismatch; never runs when the lengths differ.
  for(S64 at = 0; equal && at < a.len; at++){
    equal = (a.str[at] == b.str[at]);
  }
  return equal;
}
|
|
|
|
// Returns true when both UTF-32 strings have the same length and identical
// code units; the comparison is exact, unit by unit.
CORE_Static B32
string_compare(String32 a, String32 b){
  B32 equal = (a.len == b.len);
  // Stops at the first mismatch; never runs when the lengths differ.
  for(S64 at = 0; equal && at < a.len; at++){
    equal = (a.str[at] == b.str[at]);
  }
  return equal;
}
|
|
|
|
// Returns the number of wchar_t elements before the terminating 0.
// string must point to a 0-terminated array.
CORE_Static S64
widechar_len(wchar_t *string){
  wchar_t *cursor = string;
  while(*cursor != 0) cursor++;
  return (S64)(cursor - string);
}
|
|
|
|
// Wraps a 0-terminated wide string as a String16 without copying: the result
// points at the caller's buffer and len excludes the terminator.
//
// NOTE(review): the pointer is reinterpreted as U16*, which is only correct
// where wchar_t is 16 bits wide - confirm for every target platform.
CORE_Static String16
string16_from_widechar(wchar_t *string){
  String16 result = {(U16 *)string, widechar_len(string)};
  return result;
}
|
|
|
|
// Returns a 0-terminated copy of string allocated from a. The copy has the
// same len as the input; the terminator is not counted.
//
// NOTE(review): despite the name, this takes and returns the 8-bit String
// type and copies string.len bytes - confirm whether String16 was intended.
CORE_Static String
string16_copy(Allocator *a, String string){
  String result = {};
  result.str = allocate_array(a, U8, string.len+1);
  result.len = string.len;

  memory_copy(result.str, string.str, string.len);
  result.str[string.len] = 0;
  return result;
}
|