Files
lib_compiler/examples/text_editor/unicode.lc
2024-04-13 15:29:53 +02:00

95 lines
2.9 KiB
Plaintext

// Result of decoding one UTF-8 sequence (returned by UTF8ToUTF32).
// NOTE(review): fields that a given path does not assign are presumably left
// at zero by the default initialization of the local result - confirm.
UTF32_Result :: struct {
out_str: u32; // decoded code point; only assigned when decoding succeeds
advance: int; // number of input bytes the sequence occupies (1-4); only assigned on success
error: int; // set to a value in 1..4 when decoding fails
}
// Result of encoding one code point as UTF-8 (returned by UTF32ToUTF8).
// NOTE(review): fields that a given path does not assign are presumably left
// at zero by the default initialization of the local result - confirm.
UTF8_Result :: struct {
out_str: [4]u8; // encoded bytes; only the first `len` entries are written
len: int; // number of bytes written to out_str (1-4); only assigned on success
error: int; // set to 1 when the code point is above 0x10FFFF
}
// Reports whether `c` has the bit pattern 10xxxxxx, i.e. whether its two
// most significant bits are exactly `10` (a UTF-8 continuation byte).
IsUTF8ContinuationByte :: proc(c: u8): bool {
    // 0xC0 isolates the top two bits; 0x80 is the `10` marker.
    return (c & 0xC0) == 0x80;
}
// Decodes the single UTF-8 sequence that starts at `c` into a code point.
//
// c:           pointer to the lead byte of the sequence.
// max_advance: number of bytes that may be read starting at `c`.
//
// On success `out_str` holds the code point and `advance` the sequence
// length (1-4). On failure `error` is set and the other fields are not
// assigned:
//   1 - no bytes available (max_advance < 1)
//   2 - 2-byte sequence truncated or missing its continuation byte
//   3 - 3-byte sequence truncated or missing a continuation byte
//   4 - 4-byte sequence truncated or missing a continuation byte, or the
//       lead byte is not a valid lead (a continuation byte or 0xF8..0xFF)
//
// Every byte beyond c[0] is read only after `max_advance` has been checked,
// so the procedure never touches memory past the bytes the caller provided.
// Overlong encodings and surrogate code points are not rejected.
UTF8ToUTF32 :: proc(c: *uchar, max_advance: int): UTF32_Result {
    result: UTF32_Result;

    // Even the lead byte must not be read from an empty input.
    if max_advance < 1 {
        result.error = 1;
        return result;
    }

    if (c[0] & 0x80) == 0 { // 0xxxxxxx: single byte (ASCII)
        result.out_str = :u32(c[0]);
        result.advance = 1;
    }
    else if (c[0] & 0xe0) == 0xc0 { // 110xxxxx: one continuation byte required
        if max_advance < 2 {
            result.error = 2; // truncated: c[1] is out of range
        }
        else if (c[1] & 0xc0) == 0x80 {
            result.out_str = (:u32(c[0] & 0x1f) << 6) | :u32(c[1] & 0x3f);
            result.advance = 2;
        }
        else result.error = 2;
    }
    else if (c[0] & 0xf0) == 0xe0 { // 1110xxxx: two continuation bytes required
        if max_advance < 3 {
            result.error = 3; // truncated: c[1]/c[2] may be out of range
        }
        else if ((c[1] & 0xc0) == 0x80) && ((c[2] & 0xc0) == 0x80) {
            result.out_str = (:u32(c[0] & 0xf) << 12) | (:u32(c[1] & 0x3f) << 6) | :u32(c[2] & 0x3f);
            result.advance = 3;
        }
        else result.error = 3;
    }
    else if (c[0] & 0xf8) == 0xf0 { // 11110xxx: three continuation bytes required
        if max_advance < 4 {
            result.error = 4; // truncated: c[1]..c[3] may be out of range
        }
        else if ((c[1] & 0xc0) == 0x80) && ((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80) {
            // The lead byte carries 3 payload bits; bit 3 is already known to
            // be zero from the 0xf8 check above, so 0x7 is the exact mask.
            result.out_str = (:u32(c[0] & 0x7) << 18) | (:u32(c[1] & 0x3f) << 12) | (:u32(c[2] & 0x3f) << 6) | :u32(c[3] & 0x3f);
            result.advance = 4;
        }
        else result.error = 4;
    }
    else result.error = 4; // stray continuation byte or 0xF8..0xFF lead

    return result;
}
// Encodes `codepoint` as a UTF-8 byte sequence.
//
// On success `len` is the number of bytes stored in `out_str` (1-4).
// Code points above 0x10FFFF set `error` to 1 and write no bytes.
// Values in the surrogate range 0xD800..0xDFFF are not treated specially;
// they are encoded as three bytes like any other 16-bit value.
UTF32ToUTF8 :: proc(codepoint: u32): UTF8_Result {
    encoded: UTF8_Result;

    // Out of the Unicode range: report and bail out.
    if codepoint > 0x10FFFF {
        encoded.error = 1;
        return encoded;
    }

    // 7 bits: the byte is the code point itself.
    if codepoint <= 0x7F {
        encoded.out_str[0] = :u8(codepoint);
        encoded.len = 1;
        return encoded;
    }

    // 11 bits: 110xxxxx 10xxxxxx
    if codepoint <= 0x7FF {
        encoded.out_str[0] = :u8(0xc0 | ((codepoint >> 6) & 0x1f));
        encoded.out_str[1] = :u8(0x80 | (codepoint & 0x3f));
        encoded.len = 2;
        return encoded;
    }

    // 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
    if codepoint <= 0xFFFF {
        encoded.out_str[0] = :u8(0xe0 | ((codepoint >> 12) & 0xf));
        encoded.out_str[1] = :u8(0x80 | ((codepoint >> 6) & 0x3f));
        encoded.out_str[2] = :u8(0x80 | (codepoint & 0x3f));
        encoded.len = 3;
        return encoded;
    }

    // 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    encoded.out_str[0] = :u8(0xf0 | ((codepoint >> 18) & 0x7));
    encoded.out_str[1] = :u8(0x80 | ((codepoint >> 12) & 0x3f));
    encoded.out_str[2] = :u8(0x80 | ((codepoint >> 6) & 0x3f));
    encoded.out_str[3] = :u8(0x80 | (codepoint & 0x3f));
    encoded.len = 4;
    return encoded;
}