unicode now allows invalid UTF-8

IonutParau 2025-07-13 11:58:25 +02:00
parent 2621554165
commit 64a6b84b30
7 changed files with 91 additions and 47 deletions
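The heart of the change is a "permissive" decode rule: valid UTF-8 decodes as usual, and any byte that does not begin a valid sequence is treated as its own codepoint, so a stray 0xFF becomes U+00FF instead of an error. A minimal standalone sketch of that rule (the names here are illustrative, not the library's):

```c
#include <stdio.h>
#include <stddef.h>

/* Decode valid UTF-8 normally; when the bytes at *i do not form a valid
 * sequence, emit that single byte as the codepoint and advance by one.
 * This sketch does not reject overlong encodings or surrogates, the same
 * edge case the commit's own TODO comment flags. */
static unsigned int next_cp_permissive(const char *s, size_t *i) {
    const unsigned char *u = (const unsigned char *)s + *i;
    unsigned int cp;
    size_t n;
    if (u[0] < 0x80) {
        cp = u[0]; n = 1;
    } else if ((u[0] & 0xE0) == 0xC0 && (u[1] & 0xC0) == 0x80) {
        cp = ((u[0] & 0x1Fu) << 6) | (u[1] & 0x3F); n = 2;
    } else if ((u[0] & 0xF0) == 0xE0 && (u[1] & 0xC0) == 0x80 && (u[2] & 0xC0) == 0x80) {
        cp = ((u[0] & 0x0Fu) << 12) | ((u[1] & 0x3Fu) << 6) | (u[2] & 0x3F); n = 3;
    } else if ((u[0] & 0xF8) == 0xF0 && (u[1] & 0xC0) == 0x80 && (u[2] & 0xC0) == 0x80 && (u[3] & 0xC0) == 0x80) {
        cp = ((u[0] & 0x07u) << 18) | ((u[1] & 0x3Fu) << 12) | ((u[2] & 0x3Fu) << 6) | (u[3] & 0x3F); n = 4;
    } else {
        cp = u[0]; n = 1; /* invalid: the byte value is the codepoint */
    }
    *i += n;
    return cp;
}

int main(void) {
    const char s[] = "A\xC3\xA9\xFF" "z"; /* 'A', U+00E9, stray 0xFF, 'z' */
    size_t i = 0;
    while (s[i])
        printf("U+%04X\n", next_cp_permissive(s, &i)); /* U+0041, U+00E9, U+00FF, U+007A */
    return 0;
}
```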

View File

@ -1,10 +1,8 @@
 # Parity with Vanilla OC (only the stuff that makes sense for an emulator)
-- make the `unicode` library in testLuaArch support invalid UTF-8 (WHY IS IT OK WITH THAT)
 - in-memory version of `filesystem`
 - complete the GPU implementation (screen buffers and missing methods)
 - complete the screen implementation (bunch of missing methods)
-- support invalid UTF-8 for GPU set and fill, which should pretend the byte value is the codepoint.
 - `hologram` component
 - `computer` component
 - `modem` component
@ -20,6 +18,7 @@
 - Rework filesystem component to pre-process paths to ensure proper sandboxing and not allow arbitrary remote file access
 - Do a huge audit at some point
+- `nn_unicode_charWidth` appears to be bugged, look into that.
 # The extra components

View File

@ -242,7 +242,16 @@ void nn_eeprom_getChecksum(nn_eeprom *eeprom, void *_, nn_component *component,
     nn_data_crc32(buf, dataLen + codeLen, hash);
     nn_dealloc(alloc, buf, dataCap + codeCap);
-    nn_return_string(computer, hash, sizeof(hash));
+    // Hex-encode the 4-byte CRC32 so the returned string is always valid UTF-8.
+    char encoded[8];
+    const char *hex = "0123456789abcdef";
+    for(int i = 0; i < 4; i++) {
+        unsigned char b = hash[i];
+        encoded[i*2] = hex[b >> 4];
+        encoded[i*2+1] = hex[b & 0xF];
+    }
+    nn_return_string(computer, encoded, sizeof(encoded));
     nn_eeprom_readCost(component, dataLen + codeLen);
 }
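The checksum change rides along with the unicode work because a raw CRC32 is four arbitrary bytes, which may not form valid UTF-8 once returned as a string. A standalone version of the same nibble-to-hex loop:

```c
#include <stdio.h>

/* Turn a 4-byte CRC32 into 8 lowercase hex characters, one nibble at a time. */
static void crc32_to_hex(const unsigned char hash[4], char out[9]) {
    const char *hex = "0123456789abcdef";
    for (int i = 0; i < 4; i++) {
        out[i * 2]     = hex[hash[i] >> 4];
        out[i * 2 + 1] = hex[hash[i] & 0xF];
    }
    out[8] = '\0'; /* the component above passes an explicit length instead */
}

int main(void) {
    const unsigned char hash[4] = {0x04, 0xA7, 0x2C, 0xEC};
    char out[9];
    crc32_to_hex(hash, out);
    printf("%s\n", out); /* 04a72cec */
    return 0;
}
```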

View File

@ -131,24 +131,17 @@ void nni_gpu_set(nni_gpu *gpu, void *_, nn_component *component, nn_computer *co
         return;
     }
-    int current = 0;
-    int len = 0;
+    nn_size_t current = 0;
     while(s[current] != 0) {
-        if(nn_unicode_isValidCodepoint(s + current)) {
-            int codepoint = nn_unicode_codepointAt(s, current);
-            nn_setPixel(gpu->currentScreen, x, y, nni_gpu_makePixel(gpu, s + current));
-            current += nn_unicode_codepointSize(codepoint);
-        } else {
-            unsigned int codepoint = (unsigned char)s[current];
-            nn_setPixel(gpu->currentScreen, x, y, nni_gpu_makePixel(gpu, s + current));
-            current++;
-        }
+        // Decode permissively, then re-encode so makePixel always receives valid UTF-8.
+        unsigned int codepoint = nn_unicode_nextCodepointPermissive(s, &current);
+        char buf[NN_MAXIMUM_UNICODE_BUFFER];
+        nn_unicode_codepointToChar(buf, codepoint, NULL);
+        nn_setPixel(gpu->currentScreen, x, y, nni_gpu_makePixel(gpu, buf));
         if(isVertical) {
             y++;
         } else {
             x++;
         }
-        len++;
     }
     nn_simulateBufferedIndirect(component, 1, gpu->ctrl.screenSetsPerTick);
@ -297,12 +290,9 @@ void nni_gpu_fill(nni_gpu *gpu, void *_, nn_component *component, nn_computer *c
         nn_setCError(computer, "bad argument #5 (character expected)");
         return;
     }
-    if(!nn_unicode_validate(s)) {
-        nn_setCError(computer, "invalid utf-8");
-        return;
-    }
-    int codepoint = nn_unicode_codepointAt(s, 0);
+    nn_size_t startIdx = 0;
+    int codepoint = nn_unicode_nextCodepointPermissive(s, &startIdx);
     // prevent DoS
     if(x < 0) x = 0;
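The `// prevent DoS` clamp that begins here (the rest of the hunk is cut off above) bounds the fill loop against hostile coordinates. A hedged sketch of the general clamping pattern, not the component's actual code:

```c
/* Clip a fill rectangle to the screen before looping, so huge or
 * negative arguments cannot force excessive iteration. Illustrative only. */
static void clamp_rect(int *x, int *y, int *w, int *h, int scrW, int scrH) {
    if (*x < 0) { *w += *x; *x = 0; }   /* move the left edge into view */
    if (*y < 0) { *h += *y; *y = 0; }   /* move the top edge into view */
    if (*x + *w > scrW) *w = scrW - *x; /* clip the right edge */
    if (*y + *h > scrH) *h = scrH - *y; /* clip the bottom edge */
    if (*w < 0) *w = 0;
    if (*h < 0) *h = 0;
}
```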

View File

@ -319,6 +319,16 @@ unsigned int nn_unicode_lowerCodepoint(unsigned int codepoint);
 // returned string must be nn_deallocStr()'d
 char *nn_unicode_lower(nn_Alloc *alloc, const char *s);
+
+// Permissive means invalid UTF-8 is accepted: each byte of an invalid sequence is treated as its own codepoint.
+// Returns the codepoint starting at byte *index and advances *index to the byte after that codepoint.
+unsigned int nn_unicode_nextCodepointPermissive(const char *s, nn_size_t *index);
+nn_size_t nn_unicode_lenPermissive(const char *s);
+nn_size_t nn_unicode_wlenPermissive(const char *s);
+// Returns the byte offset of the given codepoint index, or -1 if it is out of range (hence nn_intptr_t).
+nn_intptr_t nn_unicode_indexPermissive(const char *s, nn_size_t codepointIndex);
+
 // Data card stuff
 // Hashing
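Assuming these declarations and linkage against the library (the header name below is hypothetical), usage looks like:

```c
#include <stdio.h>
#include "neonucleus.h" /* hypothetical header name for the declarations above */

int main(void) {
    const char *s = "a\xFF" "b"; /* 0xFF is not valid UTF-8 */
    printf("len = %u\n", (unsigned)nn_unicode_lenPermissive(s)); /* 3 codepoints */
    nn_size_t i = 0;
    while (s[i]) {
        /* the stray byte decodes as its own codepoint: U+0061, U+00FF, U+0062 */
        printf("U+%04X\n", nn_unicode_nextCodepointPermissive(s, &i));
    }
    /* byte offset of the third codepoint ('b'), or -1 if out of range */
    printf("index = %ld\n", (long)nn_unicode_indexPermissive(s, 2));
    return 0;
}
```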

View File

@ -400,9 +400,16 @@ sandbox = {
     utf8 = copy(utf8),
     unicode = copy(unicode, {
-        isWide = function(s) return unicode.wlen(s) > unicode.len(s) end,
+        isWide = function(s)
+            -- only the first character decides wideness
+            local c = unicode.sub(s, 1, 1)
+            return unicode.wlen(c) > unicode.len(c)
+        end,
         upper = string.upper,
         lower = string.lower,
+        wtrunc = function(str, space)
+            space = space - 1
+            return str:sub(1, (space >= utf8.len(str)) and #str or (utf8.offset(str, space + 1) - 1))
+        end,
     }),
     checkArg = checkArg,
     component = libcomponent,
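For comparison, a width-aware truncation in the spirit of OC's `unicode.wtrunc` can be built on the permissive C API. This is a hedged sketch that compiles against the library (hypothetical header name), not the sandbox's implementation above, which approximates with character counts via `utf8.len`/`utf8.offset`:

```c
#include "neonucleus.h" /* hypothetical header name */

/* Byte length of the longest prefix of s whose display width stays
 * strictly below `space`, mirroring wtrunc's contract. */
static nn_size_t wtrunc_bytes(const char *s, nn_size_t space) {
    nn_size_t i = 0;
    nn_size_t width = 0;
    while (s[i]) {
        nn_size_t next = i;
        unsigned int cp = nn_unicode_nextCodepointPermissive(s, &next);
        width += nn_unicode_charWidth(cp);
        if (width >= space) break; /* this codepoint would reach the limit */
        i = next;
    }
    return i;
}
```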

View File

@ -469,10 +469,7 @@ int testLuaArch_unicode_sub(lua_State *L) {
     const char *s = luaL_checkstring(L, 1);
     nn_Alloc *alloc = testLuaArch_getAlloc(L);
     int start = luaL_checkinteger(L, 2);
-    if(!nn_unicode_validate(s)) {
-        luaL_error(L, "invalid utf-8");
-    }
-    int len = nn_unicode_len(s);
+    int len = nn_unicode_lenPermissive(s);
     if(len < 0) {
         luaL_error(L, "length overflow");
     }
@ -503,22 +500,9 @@ int testLuaArch_unicode_sub(lua_State *L) {
         return 1;
     }
-    // there is a way to do it without an allocation
-    // however, I'm lazy
-    size_t pointLen;
-    unsigned int *points = nn_unicode_codepoints(alloc, s, &pointLen);
-    if(points == NULL) {
-        luaL_error(L, "out of memory");
-    }
-    char *sub = nn_unicode_char(alloc, points + start - 1, stop - start + 1);
-    if(sub == NULL) {
-        nn_dealloc(alloc, points, sizeof(unsigned int) * pointLen);
-        luaL_error(L, "out of memory");
-    }
-    const char *res = testLuaArch_pushstring(L, sub);
-    nn_deallocStr(alloc, sub);
-    nn_dealloc(alloc, points, sizeof(unsigned int) * pointLen);
+    // Allocation-free: map codepoint indices to byte offsets and push the byte range directly.
+    nn_size_t startByte = nn_unicode_indexPermissive(s, start - 1);
+    nn_size_t termByte = nn_unicode_indexPermissive(s, stop);
+    const char *res = testLuaArch_pushlstring(L, s + startByte, termByte - startByte);
     if (!res) {
         luaL_error(L, "out of memory");
     }
@ -555,10 +539,13 @@ int testLuaArch_unicode_char(lua_State *L) {
 int testLuaArch_unicode_len(lua_State *L) {
     const char *s = luaL_checkstring(L, 1);
-    if(!nn_unicode_validate(s)) {
-        luaL_error(L, "invalid utf-8");
-    }
-    lua_pushinteger(L, nn_unicode_len(s));
+    lua_pushinteger(L, nn_unicode_lenPermissive(s));
     return 1;
 }
+
+int testLuaArch_unicode_wlen(lua_State *L) {
+    const char *s = luaL_checkstring(L, 1);
+    lua_pushinteger(L, nn_unicode_wlenPermissive(s));
+    return 1;
+}
@ -653,7 +640,7 @@ void testLuaArch_loadEnv(lua_State *L) {
     lua_setfield(L, unicode, "sub");
     lua_pushcfunction(L, testLuaArch_unicode_len);
     lua_setfield(L, unicode, "len");
-    lua_pushcfunction(L, testLuaArch_unicode_len);
+    lua_pushcfunction(L, testLuaArch_unicode_wlen);
     lua_setfield(L, unicode, "wlen");
     lua_pushcfunction(L, testLuaArch_unicode_char);
     lua_setfield(L, unicode, "char");
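The rewritten `unicode.sub` above slices by byte offsets instead of materializing a codepoint array. The same idea in isolation (hypothetical header name; 0-based start and exclusive end, matching the code above):

```c
#include <stdio.h>
#include "neonucleus.h" /* hypothetical header name */

int main(void) {
    const char *s = "ab\xC3\xA9" "cd"; /* a b é c d: 5 codepoints, 6 bytes */
    /* unicode.sub(s, 3, 4) == "éc": bytes [index(start-1), index(stop)) */
    nn_size_t startByte = nn_unicode_indexPermissive(s, 2);
    nn_size_t termByte = nn_unicode_indexPermissive(s, 4);
    printf("%.*s\n", (int)(termByte - startByte), s + startByte); /* éc */
    return 0;
}
```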

View File

@ -329,7 +329,7 @@ nn_size_t nn_unicode_codepointSize(unsigned int codepoint) {
 void nn_unicode_codepointToChar(char *buffer, unsigned int codepoint, nn_size_t *len) {
     nn_size_t codepointSize = nn_unicode_codepointSize(codepoint);
-    *len = codepointSize;
+    if(len != NULL) *len = codepointSize; // callers may pass NULL when they only need the bytes
     nn_memset(buffer, 0, 4); // clear the output buffer
@ -391,3 +391,45 @@ unsigned int nn_unicode_upperCodepoint(unsigned int codepoint);
 char *nn_unicode_upper(nn_Alloc *alloc, const char *s);
 unsigned int nn_unicode_lowerCodepoint(unsigned int codepoint);
 char *nn_unicode_lower(nn_Alloc *alloc, const char *s);
+
+unsigned int nn_unicode_nextCodepointPermissive(const char *s, nn_size_t *index) {
+    nn_size_t i = *index;
+    if(nn_unicode_isValidCodepoint(s + i)) {
+        // TODO: handle the edge case where a suboptimal (overlong) encoding is used
+        unsigned int p = nn_unicode_codepointAt(s, i);
+        *index = i + nn_unicode_codepointSize(p);
+        return p;
+    }
+    // Invalid sequence: the byte value itself is the codepoint.
+    unsigned int p = (unsigned char)s[i];
+    *index = i + 1;
+    return p;
+}
+
+nn_size_t nn_unicode_lenPermissive(const char *b) {
+    nn_size_t len = 0;
+    nn_size_t cur = 0;
+    while(b[cur]) {
+        nn_unicode_nextCodepointPermissive(b, &cur);
+        len++;
+    }
+    return len;
+}
+
+nn_size_t nn_unicode_wlenPermissive(const char *s) {
+    nn_size_t wlen = 0;
+    nn_size_t cur = 0;
+    while(s[cur]) {
+        unsigned int codepoint = nn_unicode_nextCodepointPermissive(s, &cur);
+        wlen += nn_unicode_charWidth(codepoint);
+    }
+    return wlen;
+}
+
+nn_intptr_t nn_unicode_indexPermissive(const char *s, nn_size_t codepointIndex) {
+    nn_size_t bytes = 0;
+    while(true) {
+        if(codepointIndex == 0) return bytes;
+        if(s[bytes] == 0) return -1; // ran off the end: index out of range
+        nn_unicode_nextCodepointPermissive(s, &bytes);
+        codepointIndex--;
+    }
+}
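With the end-of-string check added above, `nn_unicode_indexPermissive` honors its documented -1 contract instead of reading past the terminator. A quick check of the expected offsets (hypothetical header name):

```c
#include <stdio.h>
#include "neonucleus.h" /* hypothetical header name */

int main(void) {
    const char *s = "h\xC3\xA9llo"; /* "héllo": 5 codepoints, 6 bytes */
    for (nn_size_t i = 0; i <= 6; i++) {
        /* expected: 0, 1, 3, 4, 5, then 6 (the total byte length), then -1 */
        printf("index %u -> %ld\n", (unsigned)i, (long)nn_unicode_indexPermissive(s, i));
    }
    return 0;
}
```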