95 lines
2.9 KiB
Plaintext
95 lines
2.9 KiB
Plaintext
// Result of decoding a single UTF-8 sequence; returned by UTF8ToUTF32.
UTF32_Result :: struct {
|
|
// Decoded code point. Only assigned when decoding succeeds.
out_str: u32;
|
|
// Number of input bytes consumed (1 to 4). Only assigned when decoding succeeds.
advance: int;
|
|
// Assigned a value from 1 to 4 when decoding fails; never assigned on success.
// NOTE(review): callers presumably treat 0 as "no error" — this assumes the
// struct is zero-initialized on declaration; confirm.
error: int;
|
|
}
|
|
|
|
// Result of encoding a single code point as UTF-8; returned by UTF32ToUTF8.
UTF8_Result :: struct {
|
|
// Encoded bytes. Only the first `len` entries are written by UTF32ToUTF8.
out_str: [4]u8;
|
|
// Number of bytes written to out_str (1 to 4). Only assigned on success.
len: int;
|
|
// Assigned 1 when the code point is above 0x10FFFF; never assigned on success.
// NOTE(review): callers presumably treat 0 as "no error" — this assumes the
// struct is zero-initialized on declaration; confirm.
error: int;
|
|
}
|
|
|
|
// Reports whether `c` has the UTF-8 continuation-byte form 10xxxxxx,
// i.e. its two most significant bits are exactly `10`.
IsUTF8ContinuationByte :: proc(c: u8): bool {
    // 0xc0 selects the top two bits; 0x80 is the `10` pattern.
    return (c & 0xc0) == 0x80;
}
|
|
|
|
// Decodes one UTF-8 encoded code point starting at `c`.
//
// Parameters:
//   c           - pointer to the lead byte of the sequence. c[0] is always read,
//                 so at least one byte must be readable.
//   max_advance - number of bytes available at `c`. Continuation bytes
//                 (c[1]..c[3]) are only read when max_advance says they exist.
//
// Returns a UTF32_Result:
//   success - out_str holds the code point, advance the number of bytes
//             consumed (1 to 4); error is not assigned.
//   failure - error is assigned the sequence length implied by the lead byte
//             (1, 2, 3 or 4), or 4 when the lead byte is not a valid UTF-8 lead;
//             out_str and advance are not assigned.
//
// NOTE: this decoder is lenient. Overlong encodings, surrogate code points and
// values above 0x10FFFF are not rejected.
UTF8ToUTF32 :: proc(c: *uchar, max_advance: int): UTF32_Result {
    result: UTF32_Result;

    if (c[0] & 0x80) == 0 { // 0xxxxxxx: single-byte (ASCII) sequence
        if max_advance >= 1 {
            result.out_str = :u32(c[0]);
            result.advance = 1;
        }
        else result.error = 1;
    }

    else if (c[0] & 0xe0) == 0xc0 { // 110xxxxx: one continuation byte required
        // Check the available length BEFORE touching c[1] so a truncated
        // sequence at the end of the buffer is never read out of bounds.
        if max_advance >= 2 {
            if (c[1] & 0xc0) == 0x80 {
                result.out_str = (:u32(c[0] & 0x1f) << 6) | :u32(c[1] & 0x3f);
                result.advance = 2;
            }
            else result.error = 2;
        }
        else result.error = 2;
    }

    else if (c[0] & 0xf0) == 0xe0 { // 1110xxxx: two continuation bytes required
        // Length check first; c[1] and c[2] are only read when they exist.
        if max_advance >= 3 {
            if ((c[1] & 0xc0) == 0x80) && ((c[2] & 0xc0) == 0x80) {
                result.out_str = (:u32(c[0] & 0xf) << 12) | (:u32(c[1] & 0x3f) << 6) | :u32(c[2] & 0x3f);
                result.advance = 3;
            }
            else result.error = 3;
        }
        else result.error = 3;
    }

    else if (c[0] & 0xf8) == 0xf0 { // 11110xxx: three continuation bytes required
        // Length check first; c[1]..c[3] are only read when they exist.
        if max_advance >= 4 {
            if ((c[1] & 0xc0) == 0x80) && ((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80) {
                // The lead byte carries 3 payload bits. Bit 3 is already known
                // to be zero from the branch condition, so masking with 0x7 is
                // equivalent to the wider 0xf mask.
                result.out_str = (:u32(c[0] & 0x7) << 18) | (:u32(c[1] & 0x3f) << 12) | (:u32(c[2] & 0x3f) << 6) | :u32(c[3] & 0x3f);
                result.advance = 4;
            }
            else result.error = 4;
        }
        else result.error = 4;
    }

    // Stray continuation byte (10xxxxxx) or invalid lead byte (11111xxx).
    else result.error = 4;

    return result;
}
|
|
|
|
// Encodes a single code point as UTF-8.
//
// Parameters:
//   codepoint - the value to encode.
//
// Returns a UTF8_Result:
//   success - len is the number of bytes produced (1 to 4) and
//             out_str[0 .. len-1] holds the encoded bytes; error is not assigned.
//   failure - error is assigned 1 when codepoint is above 0x10FFFF;
//             len and out_str are not assigned.
//
// NOTE: surrogate values (0xD800..0xDFFF) are encoded like any other
// three-byte code point; they are not rejected.
UTF32ToUTF8 :: proc(codepoint: u32): UTF8_Result {
    result: UTF8_Result;

    if codepoint > 0x10FFFF {
        // Beyond the encodable range.
        result.error = 1;
    }
    else if codepoint > 0xFFFF {
        // Up to 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        result.out_str[0] = :u8(0xf0 | ((codepoint >> 18) & 0x7));
        result.out_str[1] = :u8(0x80 | ((codepoint >> 12) & 0x3f));
        result.out_str[2] = :u8(0x80 | ((codepoint >> 6) & 0x3f));
        result.out_str[3] = :u8(0x80 | (codepoint & 0x3f));
        result.len = 4;
    }
    else if codepoint > 0x7FF {
        // Up to 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
        result.out_str[0] = :u8(0xe0 | ((codepoint >> 12) & 0xf));
        result.out_str[1] = :u8(0x80 | ((codepoint >> 6) & 0x3f));
        result.out_str[2] = :u8(0x80 | (codepoint & 0x3f));
        result.len = 3;
    }
    else if codepoint > 0x7F {
        // Up to 11 bits: 110xxxxx 10xxxxxx
        result.out_str[0] = :u8(0xc0 | ((codepoint >> 6) & 0x1f));
        result.out_str[1] = :u8(0x80 | (codepoint & 0x3f));
        result.len = 2;
    }
    else {
        // 7 bits: the code point is stored as-is in a single byte.
        result.out_str[0] = :u8(codepoint);
        result.len = 1;
    }

    return result;
}
|