From 64a6b84b30a3cccbbd5216e545a99e7fe8b62b15 Mon Sep 17 00:00:00 2001
From: IonutParau <ionut.alex.parau@gmail.com>
Date: Sun, 13 Jul 2025 11:58:25 +0200
Subject: [PATCH] unicode now allows invalid unicode

---
 TODO.md                 |  3 +--
 src/components/eeprom.c | 11 ++++++++++-
 src/components/gpu.c    | 24 +++++++---------------
 src/neonucleus.h        | 10 ++++++++++
 src/sandbox.lua         |  9 ++++++++-
 src/testLuaArch.c       | 37 +++++++++++-----------------------
 src/unicode.c           | 44 ++++++++++++++++++++++++++++++++++++++++-
 7 files changed, 91 insertions(+), 47 deletions(-)

diff --git a/TODO.md b/TODO.md
index b2cc707..661f4e9 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,10 +1,8 @@
 # Parity with Vanilla OC (only the stuff that makes sense for an emulator)
 
-- make the `unicode` library in testLuaArch support invalid UTF-8 (WHY IS IT OK WITH THAT)
 - in-memory version of `filesystem`
 - complete the GPU implementation (screen buffers and missing methods)
 - complete the screen implementation (bunch of missing methods)
-- support invalid UTF-8 for GPU set and fill, which should pretend the byte value is the codepoint.
 - `hologram` component
 - `computer` component
 - `modem` component
@@ -20,6 +18,7 @@
 
 - Rework filesystem component to pre-process paths to ensure proper sandboxing and not allow arbitrary remote file access
 - Do a huge audit at some point
+- `nn_unicode_charWidth` appears to be bugged, look into that.
 
 # The extra components
 
diff --git a/src/components/eeprom.c b/src/components/eeprom.c
index c325ebf..35f959b 100644
--- a/src/components/eeprom.c
+++ b/src/components/eeprom.c
@@ -242,7 +242,16 @@ void nn_eeprom_getChecksum(nn_eeprom *eeprom, void *_, nn_component *component,
     nn_data_crc32(buf, dataLen + codeLen, hash);
     nn_dealloc(alloc, buf, dataCap + codeCap);
 
-    nn_return_string(computer, hash, sizeof(hash));
+    char encoded[8];
+
+    const char *hex = "0123456789abcdef";
+    for(int i = 0; i < 4; i++) {
+        unsigned char b = hash[i];
+        encoded[i*2] = hex[b >> 4];
+        encoded[i*2+1] = hex[b & 0xF];
+    }
+
+    nn_return_string(computer, encoded, sizeof(encoded));
     
     nn_eeprom_readCost(component, dataLen + codeLen);
 }
diff --git a/src/components/gpu.c b/src/components/gpu.c
index 97e33c8..31036d2 100644
--- a/src/components/gpu.c
+++ b/src/components/gpu.c
@@ -131,24 +131,17 @@ void nni_gpu_set(nni_gpu *gpu, void *_, nn_component *component, nn_computer *co
         return;
     }
 
-    int current = 0;
-    int len = 0;
+    nn_size_t current = 0;
     while(s[current] != 0) {
-        if(nn_unicode_isValidCodepoint(s + current)) {
-            int codepoint = nn_unicode_codepointAt(s, current);
-            nn_setPixel(gpu->currentScreen, x, y, nni_gpu_makePixel(gpu, s + current));
-            current += nn_unicode_codepointSize(codepoint);
-        } else {
-            unsigned int codepoint = (unsigned char)s[current];
-            nn_setPixel(gpu->currentScreen, x, y, nni_gpu_makePixel(gpu, s + current));
-            current++;
-        }
+        unsigned int codepoint = nn_unicode_nextCodepointPermissive(s, &current);
+        char buf[NN_MAXIMUM_UNICODE_BUFFER];
+        nn_unicode_codepointToChar(buf, codepoint, NULL);
+        nn_setPixel(gpu->currentScreen, x, y, nni_gpu_makePixel(gpu, buf));
         if(isVertical) {
             y++;
         } else {
             x++;
         }
-        len++;
     }
 
     nn_simulateBufferedIndirect(component, 1, gpu->ctrl.screenSetsPerTick);
@@ -297,12 +290,9 @@ void nni_gpu_fill(nni_gpu *gpu, void *_, nn_component *component, nn_computer *c
         nn_setCError(computer, "bad argument #5 (character expected)");
         return;
     }
-    if(!nn_unicode_validate(s)) {
-        nn_setCError(computer, "invalid utf-8");
-        return;
-    }
 
-    int codepoint = nn_unicode_codepointAt(s, 0);
+    nn_size_t startIdx = 0;
+    int codepoint = nn_unicode_nextCodepointPermissive(s, &startIdx);
 
     // prevent DoS
     if(x < 0) x = 0;
diff --git a/src/neonucleus.h b/src/neonucleus.h
index 96e65c5..1488428 100644
--- a/src/neonucleus.h
+++ b/src/neonucleus.h
@@ -319,6 +319,16 @@ unsigned int nn_unicode_lowerCodepoint(unsigned int codepoint);
 // returned string must be nn_deallocStr()'d
 char *nn_unicode_lower(nn_Alloc *alloc, const char *s);
 
+// permissive means it allows invalid UTF-8, in which case each byte is treated as a codepoint
+
+// it will return the codepoint starting at byte *index, and will also advance *index to the first byte after it
+// since it is permissive, it supports invalid UTF-8
+unsigned int nn_unicode_nextCodepointPermissive(const char *s, nn_size_t *index);
+nn_size_t nn_unicode_lenPermissive(const char *s);
+nn_size_t nn_unicode_wlenPermissive(const char *s);
+// if not found, it will return -1. This is why it is an nn_intptr_t
+nn_intptr_t nn_unicode_indexPermissive(const char *s, nn_size_t codepointIndex);
+
 // Data card stuff
 
 // Hashing
diff --git a/src/sandbox.lua b/src/sandbox.lua
index e1c9b73..b359cfc 100644
--- a/src/sandbox.lua
+++ b/src/sandbox.lua
@@ -400,9 +400,16 @@ sandbox = {
 
     utf8 = copy(utf8),
     unicode = copy(unicode, {
-        isWide = function(s) return unicode.wlen(s) > unicode.len(s) end,
+        isWide = function(s)
+            local c = unicode.sub(s, 1, 1)
+            return unicode.wlen(c) > unicode.len(c)
+        end,
         upper = string.upper,
         lower = string.lower,
+		wtrunc = function (str,space) -- keeps the first (space - 1) UTF-8 characters of str, or all of str if it is shorter
+			space = space - 1
+			return str:sub(1,(space >= utf8.len(str)) and (#str) or (utf8.offset(str,space+1)-1)) -- NOTE(review): utf8.len returns nil for invalid UTF-8, so this comparison would raise on such input; confirm whether wtrunc should be permissive too
+		end,
     }),
     checkArg = checkArg,
     component = libcomponent,
diff --git a/src/testLuaArch.c b/src/testLuaArch.c
index b59d53a..6cfe680 100644
--- a/src/testLuaArch.c
+++ b/src/testLuaArch.c
@@ -469,10 +469,7 @@ int testLuaArch_unicode_sub(lua_State *L) {
     const char *s = luaL_checkstring(L, 1);
     nn_Alloc *alloc = testLuaArch_getAlloc(L);
     int start = luaL_checkinteger(L, 2);
-    if(!nn_unicode_validate(s)) {
-        luaL_error(L, "invalid utf-8");
-    }
-    int len = nn_unicode_len(s);
+    int len = nn_unicode_lenPermissive(s);
     if(len < 0) {
         luaL_error(L, "length overflow");
     }
@@ -503,22 +500,9 @@ int testLuaArch_unicode_sub(lua_State *L) {
         return 1;
     }
 
-    // there is a way to do it without an allocation
-    // however, I'm lazy
-    size_t pointLen;
-    unsigned int *points = nn_unicode_codepoints(alloc, s, &pointLen);
-    if(points == NULL) {
-        luaL_error(L, "out of memory");
-    }
-
-    char *sub = nn_unicode_char(alloc, points + start - 1, stop - start + 1);
-    if(sub == NULL) {
-        nn_dealloc(alloc, points, sizeof(unsigned int) * pointLen);
-        luaL_error(L, "out of memory");
-    }
-    const char *res = testLuaArch_pushstring(L, sub);
-    nn_deallocStr(alloc, sub);
-    nn_dealloc(alloc, points, sizeof(unsigned int) * pointLen);
+    nn_size_t startByte = nn_unicode_indexPermissive(s, start - 1);
+    nn_size_t termByte = nn_unicode_indexPermissive(s, stop);
+    const char *res = testLuaArch_pushlstring(L, s + startByte, termByte - startByte);
     if (!res) {
         luaL_error(L, "out of memory");
     }
@@ -555,10 +539,13 @@ int testLuaArch_unicode_char(lua_State *L) {
 
 int testLuaArch_unicode_len(lua_State *L) {
     const char *s = luaL_checkstring(L, 1);
-    if(!nn_unicode_validate(s)) {
-        luaL_error(L, "invalid utf-8");
-    }
-    lua_pushinteger(L, nn_unicode_len(s));
+    lua_pushinteger(L, nn_unicode_lenPermissive(s));
+    return 1;
+}
+
+int testLuaArch_unicode_wlen(lua_State *L) { // Lua binding registered as unicode.wlen; pushes one integer
+    const char *s = luaL_checkstring(L, 1);
+    lua_pushinteger(L, nn_unicode_lenPermissive(s)); // NOTE(review): this is the codepoint count, not the display width; nn_unicode_wlenPermissive is not called here, presumably because nn_unicode_charWidth is suspected to be buggy (see TODO.md) - confirm
     return 1;
 }
 
@@ -653,7 +640,7 @@ void testLuaArch_loadEnv(lua_State *L) {
     lua_setfield(L, unicode, "sub");
     lua_pushcfunction(L, testLuaArch_unicode_len);
     lua_setfield(L, unicode, "len");
-    lua_pushcfunction(L, testLuaArch_unicode_len);
+    lua_pushcfunction(L, testLuaArch_unicode_wlen);
     lua_setfield(L, unicode, "wlen");
     lua_pushcfunction(L, testLuaArch_unicode_char);
     lua_setfield(L, unicode, "char");
diff --git a/src/unicode.c b/src/unicode.c
index dd53708..e6f19ad 100644
--- a/src/unicode.c
+++ b/src/unicode.c
@@ -329,7 +329,7 @@ nn_size_t nn_unicode_codepointSize(unsigned int codepoint) {
 
 void nn_unicode_codepointToChar(char *buffer, unsigned int codepoint, nn_size_t *len) {
     nn_size_t codepointSize = nn_unicode_codepointSize(codepoint);
-    *len = codepointSize;
+    if(len != NULL) *len = codepointSize;
 
     nn_memset(buffer, 0, 4); // Clear static array
 
@@ -391,3 +391,45 @@ unsigned int nn_unicode_upperCodepoint(unsigned int codepoint);
 char *nn_unicode_upper(nn_Alloc *alloc, const char *s);
 unsigned int nn_unicode_lowerCodepoint(unsigned int codepoint);
 char *nn_unicode_lower(nn_Alloc *alloc, const char *s);
+
+unsigned int nn_unicode_nextCodepointPermissive(const char *s, nn_size_t *index) {
+    // decode the codepoint at byte *index, then advance *index by its encoded size (1 for a raw byte)
+    const nn_size_t at = *index;
+    unsigned int cp = (unsigned char)s[at]; // fallback for invalid UTF-8: the raw byte value is the codepoint
+    nn_size_t step = 1;
+    if(nn_unicode_isValidCodepoint(s + at)) {
+        cp = nn_unicode_codepointAt(s, at); // TODO: handle edge-case where suboptimal encoding is used
+        step = nn_unicode_codepointSize(cp);
+    }
+    *index = at + step;
+    return cp;
+}
+
+nn_size_t nn_unicode_lenPermissive(const char *b) {
+    // number of permissive decode steps needed to reach the NUL terminator of b
+    nn_size_t count = 0;
+    nn_size_t pos = 0;
+    for(; b[pos] != 0; count++) {
+        nn_unicode_nextCodepointPermissive(b, &pos);
+    }
+    return count;
+}
+
+nn_size_t nn_unicode_wlenPermissive(const char *s) {
+    // sum of nn_unicode_charWidth over every permissively decoded codepoint up to the NUL terminator
+    nn_size_t total = 0;
+    nn_size_t pos = 0;
+    while(s[pos] != 0) {
+        total += nn_unicode_charWidth(nn_unicode_nextCodepointPermissive(s, &pos));
+    }
+    return total;
+}
+
+nn_intptr_t nn_unicode_indexPermissive(const char *s, nn_size_t codepointIndex) {
+    nn_size_t bytes = 0; // byte offset of the codepoint currently being looked at (0-based codepointIndex counts down to it)
+    for(;; codepointIndex--) {
+        if(codepointIndex == 0) return bytes; // reached the requested codepoint; an index equal to the length yields the terminator's offset
+        if(s[bytes] == 0) return -1; // string ended first: report "not found" instead of reading past the NUL
+        nn_unicode_nextCodepointPermissive(s, &bytes);
+    }
+}