This commit is contained in:
IonutParau 2025-06-29 16:13:26 +02:00
commit 3c770b1e5c

View File

@ -108,92 +108,6 @@ unsigned int nn_unicode_codepointAt(const char *s, size_t byteOffset) {
// Decodes the UTF-8 sequence that begins at s[byteOffset] and returns its
// codepoint. Every rejected input yields 0x3f ('?'): a stray continuation
// byte, a lead byte above 0xF4, a missing or invalid continuation byte, an
// overlong encoding, a UTF-16 surrogate (U+D800..U+DFFF) or a value above
// U+10FFFF.
// Each continuation byte is read only after the byte before it was found to
// be non-NUL, so the decoder does not read past a NUL terminator
// (presumably s is NUL-terminated -- confirm with callers).
// NOTE: `point` is not used by the live code; it is only referenced by the
// commented-out implementation at the end of this function.
unsigned int point = 0;
const unsigned char *b = (const unsigned char *)s + byteOffset;
const unsigned char *text = b;
int codepoint = 0x3f; // Codepoint (defaults to '?')
int octet = (unsigned char)(text[0]); // The first UTF8 octet
if (octet <= 0x7f)
{
// Only one octet (ASCII range x00-7F)
codepoint = text[0];
}
else if ((octet & 0xe0) == 0xc0)
{
// Two octets
// [0]xC2-DF [1]UTF8-tail(x80-BF)
unsigned char octet1 = text[1];
if ((octet1 == '\0') || ((octet1 >> 6) != 2)) { return codepoint; } // Unexpected sequence
// The lower bound rejects the overlong lead bytes 0xC0 and 0xC1; the upper
// bound always holds inside this branch.
if ((octet >= 0xc2) && (octet <= 0xdf))
{
codepoint = ((octet & 0x1f) << 6) | (octet1 & 0x3f);
}
}
else if ((octet & 0xf0) == 0xe0)
{
// Three octets
unsigned char octet1 = text[1];
unsigned char octet2 = '\0';
if ((octet1 == '\0') || ((octet1 >> 6) != 2)) { return codepoint; } // Unexpected sequence
// text[2] is only read once text[1] is known to be a continuation byte.
octet2 = text[2];
if ((octet2 == '\0') || ((octet2 >> 6) != 2)) { return codepoint; } // Unexpected sequence
// [0]xE0 [1]xA0-BF [2]UTF8-tail(x80-BF)
// [0]xE1-EC [1]UTF8-tail [2]UTF8-tail(x80-BF)
// [0]xED [1]x80-9F [2]UTF8-tail(x80-BF)
// [0]xEE-EF [1]UTF8-tail [2]UTF8-tail(x80-BF)
// 0xE0 followed by a byte below 0xA0 is an overlong encoding; 0xED followed
// by a byte above 0x9F would decode to a surrogate (U+D800..U+DFFF).
if (((octet == 0xe0) && !((octet1 >= 0xa0) && (octet1 <= 0xbf))) ||
((octet == 0xed) && !((octet1 >= 0x80) && (octet1 <= 0x9f)))) { return codepoint; }
// Always true inside this branch, since (octet & 0xf0) == 0xe0.
if ((octet >= 0xe0) && (octet <= 0xef))
{
codepoint = ((octet & 0xf) << 12) | ((octet1 & 0x3f) << 6) | (octet2 & 0x3f);
}
}
else if ((octet & 0xf8) == 0xf0)
{
// Four octets
// Lead bytes 0xF5..0xF7 would encode values above U+10FFFF.
if (octet > 0xf4) return codepoint;
unsigned char octet1 = text[1];
unsigned char octet2 = '\0';
unsigned char octet3 = '\0';
if ((octet1 == '\0') || ((octet1 >> 6) != 2)) { return codepoint; } // Unexpected sequence
octet2 = text[2];
if ((octet2 == '\0') || ((octet2 >> 6) != 2)) { return codepoint; } // Unexpected sequence
octet3 = text[3];
if ((octet3 == '\0') || ((octet3 >> 6) != 2)) { return codepoint; } // Unexpected sequence
// [0]xF0 [1]x90-BF [2]UTF8-tail [3]UTF8-tail
// [0]xF1-F3 [1]UTF8-tail [2]UTF8-tail [3]UTF8-tail
// [0]xF4 [1]x80-8F [2]UTF8-tail [3]UTF8-tail
// 0xF0 followed by a byte below 0x90 is an overlong encoding; 0xF4 followed
// by a byte above 0x8F would exceed U+10FFFF.
if (((octet == 0xf0) && !((octet1 >= 0x90) && (octet1 <= 0xbf))) ||
((octet == 0xf4) && !((octet1 >= 0x80) && (octet1 <= 0x8f)))) { return codepoint; } // Unexpected sequence
// Always true inside this branch, since (octet & 0xf8) == 0xf0.
if (octet >= 0xf0)
{
codepoint = ((octet & 0x7) << 18) | ((octet1 & 0x3f) << 12) | ((octet2 & 0x3f) << 6) | (octet3 & 0x3f);
}
}
// Final range guard. Lead bytes that match none of the branches above
// (0x80..0xBF and 0xF8..0xFF) reach this point with the default '?'.
if (codepoint > 0x10ffff) codepoint = 0x3f; // Codepoints after U+10ffff are invalid
// The signed int is converted to the unsigned return type; it is never
// negative here.
return codepoint;
// Alternative implementation, commented out. Only part of it is visible here.
/*
const unsigned char subpartMask = 0b111111;
// look into nn_unicode_codepointToChar as well.
if(b[0] <= 0x7F) {
@ -212,32 +126,9 @@ unsigned int nn_unicode_codepointAt(const char *s, size_t byteOffset) {
point += ((unsigned int)(b[3] & subpartMask));
}
return point;
*/
}
// Returns how many bytes the UTF-8 encoding of `codepoint` occupies (1 to 4).
// Values above U+10FFFF have no UTF-8 encoding and are reported as 1.
size_t nn_unicode_codepointSize(unsigned int codepoint) {
    // Ranges are tested from the top down so each check needs one bound only.
    if (codepoint > 0x10ffff) {
        return 1; // outside the Unicode range
    }
    if (codepoint >= 0x10000) {
        return 4;
    }
    if (codepoint >= 0x800) {
        return 3;
    }
    if (codepoint >= 0x80) {
        return 2;
    }
    return 1; // ASCII
    // Alternative implementation, commented out. Only part of it is visible here.
/*
if (codepoint <= 0x007f) {
return 1;
} else if (codepoint <= 0x07ff) {
@ -249,50 +140,14 @@ size_t nn_unicode_codepointSize(unsigned int codepoint) {
}
return 1;
*/
}
// Encodes `codepoint` as UTF-8 and writes the number of bytes produced to *len.
// The result lives in a static buffer that is zeroed and refilled on every
// call, so a pointer from an earlier call sees the newer contents. The bytes
// after the encoded sequence are zero.
// Codepoints above U+10FFFF produce an empty string and *len = 0.
const char *nn_unicode_codepointToChar(unsigned int codepoint, size_t *len) {
    // Lead-byte marker bits, indexed by sequence length (index 0 is unused).
    static const unsigned char leadPrefix[5] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };
    static char utf8[6] = { 0 };
    size_t count = 0; // stays 0 when the codepoint is out of range
    unsigned int rest = codepoint;

    memset(utf8, 0, sizeof utf8);

    if (codepoint < 0x80) {
        count = 1;
    } else if (codepoint < 0x800) {
        count = 2;
    } else if (codepoint < 0x10000) {
        count = 3;
    } else if (codepoint < 0x110000) {
        count = 4;
    }

    // Continuation bytes carry six payload bits each, filled from the tail.
    for (size_t i = count; i > 1; i--) {
        utf8[i - 1] = (char)((rest & 0x3f) | 0x80);
        rest >>= 6;
    }
    // The bits that remain fit in the lead byte next to its length marker.
    if (count > 0) {
        utf8[0] = (char)(rest | leadPrefix[count]);
    }

    *len = count;
    return utf8;
    // Alternative implementation, commented out. Only part of it is visible here.
/*
size_t codepointSize = nn_unicode_codepointSize(codepoint);
*len = codepointSize;
static char buffer[4];
memset(buffer, 0, 4); // Clear static array
if (codepointSize == 1) {
buffer[0] = (char)codepoint;
@ -311,7 +166,6 @@ const char *nn_unicode_codepointToChar(unsigned int codepoint, size_t *len) {
}
return buffer;
*/
}
size_t nn_unicode_charWidth(unsigned int codepoint);