From 643173660a893f536cdb583be80ab943dfaf55d8 Mon Sep 17 00:00:00 2001 From: Blendi-Goose <87442375+Blendi-Goose@users.noreply.github.com> Date: Sun, 29 Jun 2025 15:26:30 +0200 Subject: [PATCH] basically revert atoms bullshit he can actually go fuck off --- src/unicode.c | 148 +------------------------------------------------- 1 file changed, 1 insertion(+), 147 deletions(-) diff --git a/src/unicode.c b/src/unicode.c index 6cf64c7..f9ca12d 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -108,92 +108,6 @@ unsigned int nn_unicode_codepointAt(const char *s, size_t byteOffset) { unsigned int point = 0; const unsigned char *b = (const unsigned char *)s + byteOffset; - const unsigned char *text = b; - - int codepoint = 0x3f; // Codepoint (defaults to '?') - int octet = (unsigned char)(text[0]); // The first UTF8 octet - - if (octet <= 0x7f) - { - // Only one octet (ASCII range x00-7F) - codepoint = text[0]; - } - else if ((octet & 0xe0) == 0xc0) - { - // Two octets - - // [0]xC2-DF [1]UTF8-tail(x80-BF) - unsigned char octet1 = text[1]; - - if ((octet1 == '\0') || ((octet1 >> 6) != 2)) { return codepoint; } // Unexpected sequence - - if ((octet >= 0xc2) && (octet <= 0xdf)) - { - codepoint = ((octet & 0x1f) << 6) | (octet1 & 0x3f); - } - } - else if ((octet & 0xf0) == 0xe0) - { - // Three octets - unsigned char octet1 = text[1]; - unsigned char octet2 = '\0'; - - if ((octet1 == '\0') || ((octet1 >> 6) != 2)) { return codepoint; } // Unexpected sequence - - octet2 = text[2]; - - if ((octet2 == '\0') || ((octet2 >> 6) != 2)) { return codepoint; } // Unexpected sequence - - // [0]xE0 [1]xA0-BF [2]UTF8-tail(x80-BF) - // [0]xE1-EC [1]UTF8-tail [2]UTF8-tail(x80-BF) - // [0]xED [1]x80-9F [2]UTF8-tail(x80-BF) - // [0]xEE-EF [1]UTF8-tail [2]UTF8-tail(x80-BF) - - if (((octet == 0xe0) && !((octet1 >= 0xa0) && (octet1 <= 0xbf))) || - ((octet == 0xed) && !((octet1 >= 0x80) && (octet1 <= 0x9f)))) { return codepoint; } - - if ((octet >= 0xe0) && (octet <= 0xef)) - { - codepoint = ((octet & 0xf) << 12) | ((octet1 & 0x3f) << 6) | (octet2 & 0x3f); - } - } - else if ((octet & 0xf8) == 0xf0) - { - // Four octets - if (octet > 0xf4) return codepoint; - - unsigned char octet1 = text[1]; - unsigned char octet2 = '\0'; - unsigned char octet3 = '\0'; - - if ((octet1 == '\0') || ((octet1 >> 6) != 2)) { return codepoint; } // Unexpected sequence - - octet2 = text[2]; - - if ((octet2 == '\0') || ((octet2 >> 6) != 2)) { return codepoint; } // Unexpected sequence - - octet3 = text[3]; - - if ((octet3 == '\0') || ((octet3 >> 6) != 2)) { return codepoint; } // Unexpected sequence - - // [0]xF0 [1]x90-BF [2]UTF8-tail [3]UTF8-tail - // [0]xF1-F3 [1]UTF8-tail [2]UTF8-tail [3]UTF8-tail - // [0]xF4 [1]x80-8F [2]UTF8-tail [3]UTF8-tail - - if (((octet == 0xf0) && !((octet1 >= 0x90) && (octet1 <= 0xbf))) || - ((octet == 0xf4) && !((octet1 >= 0x80) && (octet1 <= 0x8f)))) { return codepoint; } // Unexpected sequence - - if (octet >= 0xf0) - { - codepoint = ((octet & 0x7) << 18) | ((octet1 & 0x3f) << 12) | ((octet2 & 0x3f) << 6) | (octet3 & 0x3f); - } - } - - if (codepoint > 0x10ffff) codepoint = 0x3f; // Codepoints after U+10ffff are invalid - - return codepoint; - -/* const unsigned char subpartMask = 0b111111; // look into nn_unicode_codepointToChar as well. if(b[0] <= 0x7F) { @@ -212,32 +126,9 @@ unsigned int nn_unicode_codepointAt(const char *s, size_t byteOffset) { point += ((unsigned int)(b[3] & subpartMask)); } return point; -*/ } size_t nn_unicode_codepointSize(unsigned int codepoint) { - int size = 1; - - if (codepoint <= 0x7f) - { - size = 1; - } - else if (codepoint <= 0x7ff) - { - size = 2; - } - else if (codepoint <= 0xffff) - { - size = 3; - } - else if (codepoint <= 0x10ffff) - { - size = 4; - } - - return size; - -/* if (codepoint <= 0x007f) { return 1; } else if (codepoint <= 0x07ff) { @@ -249,50 +140,14 @@ size_t nn_unicode_codepointSize(unsigned int codepoint) { } return 1; -*/ } const char *nn_unicode_codepointToChar(unsigned int codepoint, size_t *len) { - - static char utf8[6] = { 0 }; - memset(utf8, 0, 6); // Clear static array - int size = 0; // Byte size of codepoint - - if (codepoint <= 0x7f) - { - utf8[0] = (char)codepoint; - size = 1; - } - else if (codepoint <= 0x7ff) - { - utf8[0] = (char)(((codepoint >> 6) & 0x1f) | 0xc0); - utf8[1] = (char)((codepoint & 0x3f) | 0x80); - size = 2; - } - else if (codepoint <= 0xffff) - { - utf8[0] = (char)(((codepoint >> 12) & 0x0f) | 0xe0); - utf8[1] = (char)(((codepoint >> 6) & 0x3f) | 0x80); - utf8[2] = (char)((codepoint & 0x3f) | 0x80); - size = 3; - } - else if (codepoint <= 0x10ffff) - { - utf8[0] = (char)(((codepoint >> 18) & 0x07) | 0xf0); - utf8[1] = (char)(((codepoint >> 12) & 0x3f) | 0x80); - utf8[2] = (char)(((codepoint >> 6) & 0x3f) | 0x80); - utf8[3] = (char)((codepoint & 0x3f) | 0x80); - size = 4; - } - - *len = size; - - return utf8; -/* size_t codepointSize = nn_unicode_codepointSize(codepoint); *len = codepointSize; static char buffer[4]; + memset(buffer, 0, 4); // Clear static array if (codepointSize == 1) { buffer[0] = (char)codepoint; @@ -311,7 +166,6 @@ const char *nn_unicode_codepointToChar(unsigned int codepoint, size_t *len) { } return buffer; -*/ } size_t nn_unicode_charWidth(unsigned int codepoint);