Files
corelang/base_unicode.cpp
2023-01-01 12:40:58 +01:00

275 lines
7.4 KiB
C++

// Replacement characters, one per code-unit width. The string conversion
// routines in this file write one of these into their output when a
// decode or encode step reports an error. All three hold the value 0x3F ('?').
global U32 question_mark32 = '?';
global U16 question_mark16 = 0x003f;
global U8 question_mark8 = '?';
// A counted string of 32-bit units.
// The conversion routines in this file fill `str` with one decoded code
// point per element and store a 0 terminator at str[len]; the terminator
// is not included in `len`.
struct String32{
  U32 *str; // Pointer to the first unit.
  S64  len; // Number of units, not bytes.
};
struct UTF32_Result{
U32 out_str;
S64 advance;
B32 error;
};
// Decodes one code point from the UTF-8 sequence starting at `c`.
//
// c           - pointer to the lead byte of the sequence.
// max_advance - number of bytes that may be read starting at `c`.
//
// On success `out_str` holds the code point, `advance` the sequence length
// (1..4) and `error` is 0. On failure `advance` stays 0 and `error` is
// nonzero: the expected sequence length when the sequence is truncated or a
// continuation byte is malformed, 4 when the lead byte is not a valid lead,
// and 1 when `max_advance` is less than 1.
//
// No byte at or beyond c[max_advance] is ever read; the bound is checked
// before the continuation bytes are inspected.
//
// NOTE: only the bit patterns are validated. Overlong encodings, encoded
// surrogates and 4-byte sequences above 0x10FFFF are decoded as-is.
CORE_Static UTF32_Result
utf8_to_utf32(U8 *c, S64 max_advance) {
  UTF32_Result result = {};

  // Nothing may be read at all, not even the lead byte.
  if (max_advance < 1) {
    result.error = 1;
    return result;
  }

  // Classify the lead byte: sequence length and the payload bits it carries.
  U8  lead      = c[0];
  int length    = 0;
  U32 codepoint = 0;
  if ((lead & 0b10000000) == 0) {                 // 0xxxxxxx
    length    = 1;
    codepoint = lead;
  }
  else if ((lead & 0b11100000) == 0b11000000) {   // 110xxxxx
    length    = 2;
    codepoint = lead & 0b00011111;
  }
  else if ((lead & 0b11110000) == 0b11100000) {   // 1110xxxx
    length    = 3;
    codepoint = lead & 0b00001111;
  }
  else if ((lead & 0b11111000) == 0b11110000) {   // 11110xxx
    length    = 4;
    codepoint = lead & 0b00000111;
  }
  else {
    // Stray continuation byte or an invalid lead (0xF8..0xFF).
    result.error = 4;
    return result;
  }

  // Bounds check BEFORE touching any continuation byte.
  if (max_advance < length) {
    result.error = length;
    return result;
  }

  // Each continuation byte must look like 10xxxxxx and contributes 6 bits.
  for (int i = 1; i < length; i++) {
    if ((c[i] & 0b11000000) != 0b10000000) {
      result.error = length;
      return result;
    }
    codepoint = (codepoint << 6u) | (U32)(c[i] & 0b00111111);
  }

  result.out_str = codepoint;
  result.advance = length;
  return result;
}
// A counted string of 16-bit units.
// The conversion routines in this file store a 0 terminator at str[len];
// the terminator is not included in `len`.
struct String16{
  U16 *str; // Pointer to the first unit.
  S64  len; // Number of 16-bit units, not bytes.
};
struct UTF16_Result{
U16 out_str[2];
S32 len;
B32 error;
};
// Encodes one code point as UTF-16.
//
// Values below 0x10000 are stored as a single unit (out_str[1] is set to 0),
// values up to 0x10FFFF become a high/low surrogate pair, and anything
// larger sets `error` to 1 with `len` left at 0.
//
// NOTE: values in the surrogate range 0xD800..0xDFFF are not rejected; they
// are emitted unchanged as a single unit.
CORE_Static UTF16_Result
utf32_to_utf16(U32 codepoint){
  UTF16_Result result = {};

  // Beyond the last Unicode code point: not encodable.
  if (codepoint > 0x10FFFF) {
    result.error = 1;
    return result;
  }

  // Supplementary planes: split the 20-bit offset into two 10-bit halves.
  if (codepoint >= 0x10000) {
    U32 offset = codepoint - 0x10000;
    result.out_str[0] = (U16)(0xD800 | (offset >> 10));
    result.out_str[1] = (U16)(0xDC00 | (offset & 0x3FF));
    result.len = 2;
    return result;
  }

  // Basic Multilingual Plane: the value is its own encoding.
  result.out_str[0] = (U16)codepoint;
  result.out_str[1] = 0;
  result.len = 1;
  return result;
}
struct UTF8_Result{
U8 out_str[4];
S32 len;
B32 error;
};
// Encodes one code point as UTF-8.
//
// Produces 1 to 4 bytes depending on the magnitude of `codepoint`. Values
// above 0x10FFFF set `error` and leave `len` at 0.
//
// NOTE: values in the surrogate range 0xD800..0xDFFF are not rejected; they
// are encoded as ordinary 3-byte sequences.
CORE_Static UTF8_Result
utf32_to_utf8(U32 codepoint) {
  UTF8_Result result = {};

  if (codepoint > 0x10FFFF) {
    result.error = true;
    return result;
  }

  if (codepoint <= 0x7F) {
    // 7 bits: 0xxxxxxx
    result.out_str[0] = (U8)codepoint;
    result.len = 1;
  }
  else if (codepoint <= 0x7FF) {
    // 11 bits: 110xxxxx 10xxxxxx
    result.out_str[0] = (U8)(0xC0 | (0x1F & (codepoint >> 6)));
    result.out_str[1] = (U8)(0x80 | (0x3F & codepoint));
    result.len = 2;
  }
  else if (codepoint <= 0xFFFF) {
    // 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
    result.out_str[0] = (U8)(0xE0 | (0x0F & (codepoint >> 12)));
    result.out_str[1] = (U8)(0x80 | (0x3F & (codepoint >> 6)));
    result.out_str[2] = (U8)(0x80 | (0x3F & codepoint));
    result.len = 3;
  }
  else {
    // 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    result.out_str[0] = (U8)(0xF0 | (0x07 & (codepoint >> 18)));
    result.out_str[1] = (U8)(0x80 | (0x3F & (codepoint >> 12)));
    result.out_str[2] = (U8)(0x80 | (0x3F & (codepoint >> 6)));
    result.out_str[3] = (U8)(0x80 | (0x3F & codepoint));
    result.len = 4;
  }
  return result;
}
// Decodes one code point from the UTF-16 sequence starting at `c`.
//
// c           - pointer to the first 16-bit unit.
// max_advance - number of units that may be read starting at `c`.
//
// Outcomes:
//   - max_advance < 1: `error` is 1, nothing is read.
//   - high surrogate followed by a low surrogate: the pair is combined,
//     `advance` is 2.
//   - high surrogate with fewer than 2 readable units: `error` is 2, with
//     `out_str` set to c[0] and `advance` to 1. c[1] is NOT read in this
//     case, so the function never touches memory past c[max_advance - 1].
//   - anything else (including an unpaired surrogate with a readable
//     non-matching neighbour): c[0] is passed through, `advance` is 1.
//
// NOTE(review): `max_advance` is S32 while the callers in this file pass an
// S64 difference; confirm that inputs never exceed the S32 range.
CORE_Static UTF32_Result
utf16_to_utf32(U16 *c, S32 max_advance) {
  UTF32_Result result = {};

  if (max_advance < 1) {
    result.error = 1;
    return result;
  }

  // Default outcome: a single unit that is its own code point.
  U16 first = c[0];
  result.out_str = first;
  result.advance = 1;

  if (first >= 0xD800 && first <= 0xDBFF) {
    // A high surrogate needs a second unit; check the bound before reading it.
    if (max_advance < 2) {
      result.error = 2;
      return result;
    }
    U16 second = c[1];
    if (second >= 0xDC00 && second <= 0xDFFF) {
      // Ten payload bits from each half, offset by the start of plane 1.
      result.out_str  = 0x10000;
      result.out_str += (U32)(first & 0x03FF) << 10u | (second & 0x03FF);
      result.advance  = 2;
    }
  }
  return result;
}
// Error path shared by the string conversion loops in this file: appends
// `question_mark` to `result` and leaves the enclosing loop.
//
// Requirements at the expansion site:
//   - a variable named `result` with `str` and `len` members is in scope;
//   - the macro is expanded directly inside the loop that should stop,
//     because the `break` must bind to that loop.
// The body is a plain brace block on purpose. Wrapping it in
// do { ... } while(0) would make the `break` exit the wrapper instead of the
// caller's loop.
#define unicode_error(question_mark) \
{ \
result.str[result.len++] = question_mark; \
break; \
}
// Converts a UTF-16 string into a newly allocated, 0-terminated UTF-32 string.
//
// The output buffer holds string.len + 1 units: each decoded code point
// consumes at least one input unit, plus one slot for the terminator.
// On the first decode error a '?' is appended and conversion stops; the
// remainder of the input is not converted.
CORE_Static String32
string16_to_string32(Allocator *allocator, String16 string){
  String32 result = {allocate_array(allocator, U32, string.len+1)};

  S64 cursor = 0;
  while(cursor < string.len){
    UTF32_Result decoded = utf16_to_utf32(string.str + cursor, string.len - cursor);
    if(decoded.error){
      result.str[result.len++] = question_mark32;
      break;
    }
    result.str[result.len++] = decoded.out_str;
    cursor += decoded.advance;
  }

  result.str[result.len] = 0;
  return result;
}
// Converts a UTF-8 string into a newly allocated, 0-terminated UTF-32 string.
//
// The output buffer holds string.len + 1 units: each decoded code point
// consumes at least one input byte, plus one slot for the terminator.
// On the first decode error a '?' is appended and conversion stops; the
// remainder of the input is not converted.
CORE_Static String32
string8_to_string32(Allocator *allocator, String string){
  String32 result = {allocate_array(allocator, U32, string.len+1)};

  S64 cursor = 0;
  while(cursor < string.len){
    UTF32_Result decoded = utf8_to_utf32(string.str + cursor, string.len - cursor);
    if(decoded.error){
      result.str[result.len++] = question_mark32;
      break;
    }
    result.str[result.len++] = decoded.out_str;
    cursor += decoded.advance;
  }

  result.str[result.len] = 0;
  return result;
}
// Converts a UTF-8 string into a newly allocated, 0-terminated UTF-16 string.
//
// Each input sequence is decoded to a code point and re-encoded as one or
// two 16-bit units. If either step fails, a '?' is appended and conversion
// stops; the remainder of the input is not converted.
CORE_Static String16
string8_to_string16(Allocator *allocator, String in){
  // @Note(Krzosa): Should be more than enough space
  String16 result = {allocate_array(allocator, U16, (in.len*2)+1)};

  S64 cursor = 0;
  while(cursor < in.len){
    UTF32_Result decoded = utf8_to_utf32(in.str + cursor, in.len - cursor);
    if(decoded.error){
      result.str[result.len++] = question_mark16;
      break;
    }
    cursor += decoded.advance;

    UTF16_Result encoded = utf32_to_utf16(decoded.out_str);
    if(encoded.error){
      result.str[result.len++] = question_mark16;
      break;
    }
    for(S32 unit = 0; unit < encoded.len; unit++){
      result.str[result.len++] = encoded.out_str[unit];
    }
  }

  result.str[result.len] = 0;
  return result;
}
// Converts a UTF-16 string into a newly allocated, 0-terminated UTF-8 string.
//
// The output buffer reserves 4 bytes per input unit plus the terminator.
// Each input unit or surrogate pair is decoded to a code point and
// re-encoded as 1 to 4 bytes. If either step fails, a '?' is appended and
// conversion stops; the remainder of the input is not converted.
CORE_Static String
string16_to_string8(Allocator *allocator, String16 in){
  String result = {allocate_array(allocator, U8, in.len*4+1)};

  S64 cursor = 0;
  while(cursor < in.len){
    UTF32_Result decoded = utf16_to_utf32(in.str + cursor, in.len - cursor);
    if(decoded.error){
      result.str[result.len++] = question_mark8;
      break;
    }
    cursor += decoded.advance;

    UTF8_Result encoded = utf32_to_utf8(decoded.out_str);
    if(encoded.error){
      result.str[result.len++] = question_mark8;
      break;
    }
    for(S32 byte = 0; byte < encoded.len; byte++){
      result.str[result.len++] = encoded.out_str[byte];
    }
  }

  result.str[result.len] = 0;
  return result;
}
// Returns true when both UTF-16 strings have the same length and identical
// units at every index; the comparison is exact, unit by unit.
CORE_Static B32
string_compare(String16 a, String16 b){
  B32 equal = (a.len == b.len);
  // Stops at the first mismatch; skipped entirely when the lengths differ.
  for(S64 index = 0; equal && index < a.len; index++){
    equal = (a.str[index] == b.str[index]);
  }
  return equal;
}
// Returns true when both UTF-32 strings have the same length and identical
// units at every index; the comparison is exact, unit by unit.
CORE_Static B32
string_compare(String32 a, String32 b){
  B32 equal = (a.len == b.len);
  // Stops at the first mismatch; skipped entirely when the lengths differ.
  for(S64 index = 0; equal && index < a.len; index++){
    equal = (a.str[index] == b.str[index]);
  }
  return equal;
}
// Returns the number of wchar_t elements before the 0 terminator.
// `string` must point to a 0-terminated sequence.
CORE_Static S64
widechar_len(wchar_t *string){
  wchar_t *end = string;
  while(*end != 0){
    end++;
  }
  return (S64)(end - string);
}
// Wraps a 0-terminated wide string as a String16 without copying: the
// result points at the caller's memory and its length excludes the
// terminator.
//
// NOTE(review): the pointer is reinterpreted as U16*, which only describes
// the data correctly where wchar_t is 16 bits wide - confirm for every
// target platform.
CORE_Static String16
string16_from_widechar(wchar_t *string){
  String16 result = {(U16 *)string, widechar_len(string)};
  return result;
}
// Returns a copy of `string` in a fresh buffer of string.len + 1 bytes
// obtained from allocator `a`, with a 0 byte stored after the copied data.
//
// NOTE(review): despite the "string16" in the name, this takes and returns
// the 8-bit String type and copies U8 bytes - confirm whether a String16
// variant was intended.
CORE_Static String
string16_copy(Allocator *a, String string){
  U8 *buffer = allocate_array(a, U8, string.len+1);

  memory_copy(buffer, string.str, string.len);
  buffer[string.len] = 0; // Terminator; not counted in the returned length.

  return String{buffer, string.len};
}