corelang/base_unicode.cpp

// Replacement characters ('?', 0x3f) in each code unit width. The string
// conversion routines in this file append one of these to their output when
// a decode or encode step reports an error.
global U32 question_mark32 = '?';
global U16 question_mark16 = 0x003f;
global U8 question_mark8   = '?';

// A counted sequence of 32-bit units (pointer + length).
struct String32{
  U32 *str; // First element of the sequence
  S64 len;  // Number of elements; the converters in this file do not count the 0 terminator they append
};

// Result of decoding a single code point (returned by utf8_to_utf32 and
// utf16_to_utf32).
struct UTF32_Result{
  U32 out_str; // Decoded code point
  S64 advance; // Number of input code units the decoder consumed
  B32 error;   // 0 on success; the decoders store a nonzero code on failure
};

// Decodes one code point from the UTF-8 bytes starting at `c`.
//
// c:           first byte of the sequence to decode.
// max_advance: number of bytes that may be read from `c`.
//
// On success `out_str` holds the code point and `advance` the number of bytes
// consumed (1..4). On failure `advance` stays 0 and `error` is set to the
// length of the sequence that was expected (1..4); an invalid lead byte
// reports 4. When `max_advance` is below 1 nothing is read and `error` is 1.
//
// Every byte is bounds-checked against `max_advance` BEFORE it is read, so a
// sequence truncated by the end of the buffer never touches memory past it.
//
// Only the bit patterns of the lead/continuation bytes are validated; overlong
// forms, surrogate code points and values above 0x10FFFF are not rejected.
CORE_Static UTF32_Result
utf8_to_utf32(U8 *c, S64 max_advance) {
  UTF32_Result result = {};

  // Nothing readable: do not even look at c[0].
  if (max_advance < 1) {
    result.error = 1;
    return result;
  }

  if ((c[0] & 0b10000000) == 0) { // 0xxxxxxx: single byte (ASCII)
    result.out_str = c[0];
    result.advance = 1;
  }

  else if ((c[0] & 0b11100000) == 0b11000000) { // 110xxxxx: one continuation byte required
    if (max_advance >= 2 &&
        (c[1] & 0b11000000) == 0b10000000) {
      result.out_str = ((U32)(c[0] & 0b00011111) << 6u) | (U32)(c[1] & 0b00111111);
      result.advance = 2;
    }
    else result.error = 2;
  }

  else if ((c[0] & 0b11110000) == 0b11100000) { // 1110xxxx: two continuation bytes required
    if (max_advance >= 3 &&
        (c[1] & 0b11000000) == 0b10000000 &&
        (c[2] & 0b11000000) == 0b10000000) {
      result.out_str = ((U32)(c[0] & 0b00001111) << 12u) |
                       ((U32)(c[1] & 0b00111111) << 6u)  |
                        (U32)(c[2] & 0b00111111);
      result.advance = 3;
    }
    else result.error = 3;
  }

  else if ((c[0] & 0b11111000) == 0b11110000) { // 11110xxx: three continuation bytes required
    if (max_advance >= 4 &&
        (c[1] & 0b11000000) == 0b10000000 &&
        (c[2] & 0b11000000) == 0b10000000 &&
        (c[3] & 0b11000000) == 0b10000000) {
      // The lead byte carries 3 payload bits; bit 3 is already known to be 0.
      result.out_str = ((U32)(c[0] & 0b00000111) << 18u) |
                       ((U32)(c[1] & 0b00111111) << 12u) |
                       ((U32)(c[2] & 0b00111111) << 6u)  |
                        (U32)(c[3] & 0b00111111);
      result.advance = 4;
    }
    else result.error = 4;
  }
  else result.error = 4; // Stray continuation byte or invalid lead byte

  return result;
}

// A counted sequence of 16-bit code units (pointer + length).
struct String16{
  U16 *str; // First code unit of the sequence
  S64  len; // Number of code units; the converters in this file do not count the 0 terminator they append
};

// Result of encoding a single code point as UTF-16 (returned by utf32_to_utf16).
struct UTF16_Result{
  U16 out_str[2]; // Encoded code units; only the first `len` are meaningful
  S32 len;        // Number of code units written (1 or 2), 0 on error
  B32 error;      // Nonzero when the code point could not be encoded
};

// Encodes one code point as UTF-16.
//
// Code points below 0x10000 are stored verbatim in a single unit (this
// includes values in the surrogate range, which are not rejected). Code points
// up to 0x10FFFF become a high/low surrogate pair. Anything larger sets
// `error` and leaves `len` at 0.
CORE_Static UTF16_Result
utf32_to_utf16(U32 codepoint){
  UTF16_Result result = {};

  if (codepoint > 0x10FFFF) {
    result.error = 1;
    return result;
  }

  if (codepoint >= 0x10000) {
    // Split the 20-bit offset into two 10-bit halves.
    U32 offset    = codepoint - 0x10000;
    U32 high_bits = offset >> 10;
    U32 low_bits  = offset & 0x3FF;
    result.out_str[0] = (U16)(0xD800 | high_bits);
    result.out_str[1] = (U16)(0xDC00 | low_bits);
    result.len = 2;
    return result;
  }

  result.out_str[0] = (U16)codepoint;
  result.out_str[1] = 0;
  result.len = 1;
  return result;
}

// Result of encoding a single code point as UTF-8 (returned by utf32_to_utf8).
struct UTF8_Result{
  U8 out_str[4]; // Encoded bytes; only the first `len` are meaningful
  S32 len;       // Number of bytes written (1..4), 0 on error
  B32 error;     // Nonzero when the code point could not be encoded
};

// Encodes one code point as UTF-8.
//
// Produces 1 byte for values up to 0x7F, 2 bytes up to 0x7FF, 3 bytes up to
// 0xFFFF and 4 bytes up to 0x10FFFF. Larger values set `error` and leave
// `len` at 0. Unused bytes of `out_str` stay zero.
CORE_Static UTF8_Result
utf32_to_utf8(U32 codepoint) {
  UTF8_Result result = {};

  if (codepoint > 0x10FFFF) {
    result.error = true;
    return result;
  }

  // Continuation bytes (10xxxxxx) carrying successive 6-bit groups, lowest first.
  U8 tail0 = (U8)(0x80 | (codepoint & 0x3F));
  U8 tail1 = (U8)(0x80 | ((codepoint >> 6) & 0x3F));
  U8 tail2 = (U8)(0x80 | ((codepoint >> 12) & 0x3F));

  if (codepoint < 0x80) {
    result.out_str[0] = (U8)codepoint;
    result.len = 1;
  }
  else if (codepoint < 0x800) {
    result.out_str[0] = (U8)(0xC0 | ((codepoint >> 6) & 0x1F)); // 110xxxxx, 5 bits
    result.out_str[1] = tail0;
    result.len = 2;
  }
  else if (codepoint < 0x10000) {
    result.out_str[0] = (U8)(0xE0 | ((codepoint >> 12) & 0x0F)); // 1110xxxx, 4 bits
    result.out_str[1] = tail1;
    result.out_str[2] = tail0;
    result.len = 3;
  }
  else {
    result.out_str[0] = (U8)(0xF0 | ((codepoint >> 18) & 0x07)); // 11110xxx, 3 bits
    result.out_str[1] = tail2;
    result.out_str[2] = tail1;
    result.out_str[3] = tail0;
    result.len = 4;
  }

  return result;
}

// Decodes one code point from the UTF-16 code units starting at `c`.
//
// c:           first code unit to decode.
// max_advance: number of code units that may be read from `c`.
//
// A high surrogate (0xD800..0xDBFF) followed by a low surrogate
// (0xDC00..0xDFFF) is combined into one code point and consumes 2 units.
// Any other unit - including an unpaired surrogate - is returned unchanged and
// consumes 1 unit. When `max_advance` is below 1 nothing is read and `error`
// is 1.
//
// The second unit is only inspected when `max_advance` says it exists, so a
// high surrogate in the very last unit never causes a read past the buffer;
// it is passed through like any other unpaired surrogate.
CORE_Static UTF32_Result
utf16_to_utf32(U16 *c, S32 max_advance) {
  UTF32_Result result = {};
  if(max_advance < 1){
    result.error = 1;
    return result;
  }

  result.advance = 1;
  result.out_str = c[0];

  B32 is_high_surrogate = (c[0] >= 0xD800 && c[0] <= 0xDBFF);
  // The bounds check must come before c[1] is touched.
  if(is_high_surrogate && max_advance >= 2 && c[1] >= 0xDC00 && c[1] <= 0xDFFF){
    result.out_str = 0x10000 + (((U32)(c[0] & 0x03FF) << 10u) | (U32)(c[1] & 0x03FF));
    result.advance = 2;
  }

  return result;
}

// Error path shared by the string conversion loops below: appends the given
// replacement character to the local variable `result` and then executes
// `break`. It expands to a bare `{ ... }` block rather than `do{}while(0)`, so
// the `break` leaves the loop that encloses the macro invocation - i.e. the
// conversion stops at the first error. Requires a `result` with `str`/`len`
// in scope and must be used inside a loop.
#define unicode_error(question_mark)                                                     \
  {                                                                                      \
    result.str[result.len++] = question_mark;                                            \
    break;                                                                               \
  }

// Converts a UTF-16 string into a newly allocated, 0-terminated String32.
//
// The output buffer holds string.len+1 elements, obtained from `allocator`.
// If a code unit cannot be decoded, one question_mark32 is appended and the
// conversion stops there. `len` of the result excludes the terminator.
CORE_Static String32
string16_to_string32(Allocator *allocator, String16 string){
  String32 result = {allocate_array(allocator, U32, string.len+1)};

  S64 cursor = 0;
  while(cursor < string.len){
    UTF32_Result decoded = utf16_to_utf32(string.str + cursor, string.len - cursor);
    if(decoded.error){
      // Emit one replacement character and give up on the rest of the input.
      result.str[result.len++] = question_mark32;
      break;
    }
    result.str[result.len++] = decoded.out_str;
    cursor += decoded.advance;
  }

  result.str[result.len] = 0;
  return result;
}

// Converts a UTF-8 string into a newly allocated, 0-terminated String32.
//
// The output buffer holds string.len+1 elements, obtained from `allocator`.
// If a byte sequence cannot be decoded, one question_mark32 is appended and
// the conversion stops there. `len` of the result excludes the terminator.
CORE_Static String32
string8_to_string32(Allocator *allocator, String string){
  String32 result = {allocate_array(allocator, U32, string.len+1)};

  S64 cursor = 0;
  while(cursor < string.len){
    UTF32_Result decoded = utf8_to_utf32(string.str + cursor, string.len - cursor);
    if(decoded.error){
      // Emit one replacement character and give up on the rest of the input.
      result.str[result.len++] = question_mark32;
      break;
    }
    result.str[result.len++] = decoded.out_str;
    cursor += decoded.advance;
  }

  result.str[result.len] = 0;
  return result;
}

// Converts a UTF-8 string into a newly allocated, 0-terminated String16.
//
// The output buffer holds in.len*2+1 code units, obtained from `allocator`.
// If a byte sequence cannot be decoded, or the decoded code point cannot be
// encoded as UTF-16, one question_mark16 is appended and the conversion stops
// there. `len` of the result excludes the terminator.
CORE_Static String16
string8_to_string16(Allocator *allocator, String in){
  String16 result = {allocate_array(allocator, U16, (in.len*2)+1)}; // @Note(Krzosa): Should be more than enough space

  S64 cursor = 0;
  while(cursor < in.len){
    UTF32_Result decoded = utf8_to_utf32(in.str + cursor, in.len - cursor);
    if(decoded.error){
      result.str[result.len++] = question_mark16;
      break;
    }
    cursor += decoded.advance;

    UTF16_Result encoded = utf32_to_utf16(decoded.out_str);
    if(encoded.error){
      result.str[result.len++] = question_mark16;
      break;
    }

    // Copy the one or two code units produced for this code point.
    for(S32 unit = 0; unit < encoded.len; unit++){
      result.str[result.len++] = encoded.out_str[unit];
    }
  }

  result.str[result.len] = 0;
  return result;
}

// Converts a UTF-16 string into a newly allocated, 0-terminated UTF-8 String.
//
// The output buffer holds in.len*4+1 bytes, obtained from `allocator`.
// If a code unit cannot be decoded, or the decoded code point cannot be
// encoded as UTF-8, one question_mark8 is appended and the conversion stops
// there. `len` of the result excludes the terminator.
CORE_Static String
string16_to_string8(Allocator *allocator, String16 in){
  String result = {allocate_array(allocator, U8, in.len*4+1)};

  S64 cursor = 0;
  while(cursor < in.len){
    UTF32_Result decoded = utf16_to_utf32(in.str + cursor, in.len - cursor);
    if(decoded.error){
      result.str[result.len++] = question_mark8;
      break;
    }
    cursor += decoded.advance;

    UTF8_Result encoded = utf32_to_utf8(decoded.out_str);
    if(encoded.error){
      result.str[result.len++] = question_mark8;
      break;
    }

    // Copy the one to four bytes produced for this code point.
    for(S32 byte_index = 0; byte_index < encoded.len; byte_index++){
      result.str[result.len++] = encoded.out_str[byte_index];
    }
  }

  result.str[result.len] = 0;
  return result;
}

// Returns true when both UTF-16 strings have the same length and identical
// code units, false otherwise. The comparison is unit-by-unit.
CORE_Static B32
string_compare(String16 a, String16 b){
  B32 equal = (a.len == b.len);
  // Stops at the first mismatch; skipped entirely when the lengths differ.
  for(S64 index = 0; equal && index < a.len; index++){
    equal = (a.str[index] == b.str[index]);
  }
  return equal;
}

// Returns true when both String32 values have the same length and identical
// elements, false otherwise. The comparison is element-by-element.
CORE_Static B32
string_compare(String32 a, String32 b){
  B32 equal = (a.len == b.len);
  // Stops at the first mismatch; skipped entirely when the lengths differ.
  for(S64 index = 0; equal && index < a.len; index++){
    equal = (a.str[index] == b.str[index]);
  }
  return equal;
}

// Counts the wchar_t elements that precede the first 0 element of `string`.
// The terminator itself is not counted.
CORE_Static S64
widechar_len(wchar_t *string){
  S64 count = 0;
  for(wchar_t *cursor = string; *cursor != 0; cursor++){
    count++;
  }
  return count;
}

// Wraps a 0-terminated wchar_t string in a String16 without copying: the
// result points at the caller's memory and its length comes from widechar_len.
// NOTE(review): the pointer is reinterpreted as U16*, which only matches the
// counted length when wchar_t is 16 bits wide - confirm for every target
// platform.
CORE_Static String16
string16_from_widechar(wchar_t *string){
  String16 result = {(U16 *)string, widechar_len(string)};
  return result;
}

// Duplicates `string` into a buffer of string.len+1 bytes obtained from `a`
// and appends a 0 terminator. The returned String points at the new buffer and
// has the same length as the input.
// Despite its name, this function takes and returns the 8-bit String type.
CORE_Static String
string16_copy(Allocator *a, String string){
  U8 *buffer = allocate_array(a, U8, string.len+1);
  memory_copy(buffer, string.str, string.len);
  buffer[string.len] = 0; // terminator sits just past the copied bytes
  return String{buffer, string.len};
}