unicode now allows invalid unicode

2026-02-15 04:03:49 +01:00 · 2025-07-13 11:58:25 +02:00 · 2025-07-13 11:58:25 +02:00 · 64a6b84b30
commit 64a6b84b30
parent 2621554165
7 changed files with 91 additions and 47 deletions
--- a/TODO.md
+++ b/TODO.md
@ -1,10 +1,8 @@
 # Parity with Vanilla OC (only the stuff that makes sense for an emulator)
 - make the `unicode` library in testLuaArch support invalid UTF-8 (WHY IS IT OK WITH THAT)
 - in-memory version of `filesystem`
 - complete the GPU implementation (screen buffers and missing methods)
 - complete the screen implementation (bunch of missing methods)
 - support invalid UTF-8 for GPU set and fill, which should pretend the byte value is the codepoint.
 - `hologram` component
 - `computer` component
 - `modem` component
@ -20,6 +18,7 @@
 - Rework filesystem component to pre-process paths to ensure proper sandboxing and not allow arbitrary remote file access
 - Do a huge audit at some point
 - `nn_unicode_charWidth` appears to be bugged, look into that.
 # The extra components
--- a/src/components/eeprom.c
+++ b/src/components/eeprom.c
@ -242,7 +242,16 @@ void nn_eeprom_getChecksum(nn_eeprom *eeprom, void *_, nn_component *component,
    nn_data_crc32(buf, dataLen + codeLen, hash);
    nn_dealloc(alloc, buf, dataCap + codeCap);
-    nn_return_string(computer, hash, sizeof(hash));
+    char encoded[8];
    const char *hex = "0123456789abcdef";
    for(int i = 0; i < 4; i++) {
        unsigned char b = hash[i];
        encoded[i*2] = hex[b >> 4];
        encoded[i*2+1] = hex[b & 0xF];
    }
    nn_return_string(computer, encoded, sizeof(encoded));
    nn_eeprom_readCost(component, dataLen + codeLen);
 }
--- a/src/components/gpu.c
+++ b/src/components/gpu.c
@ -131,24 +131,17 @@ void nni_gpu_set(nni_gpu *gpu, void *_, nn_component *component, nn_computer *co
        return;
    }
-    int current = 0;
+    nn_size_t current = 0;
    int len = 0;
    while(s[current] != 0) {
-        if(nn_unicode_isValidCodepoint(s + current)) {
+        unsigned int codepoint = nn_unicode_nextCodepointPermissive(s, &current);
-            int codepoint = nn_unicode_codepointAt(s, current);
+        char buf[NN_MAXIMUM_UNICODE_BUFFER];
-            nn_setPixel(gpu->currentScreen, x, y, nni_gpu_makePixel(gpu, s + current));
+        nn_unicode_codepointToChar(buf, codepoint, NULL);
-            current += nn_unicode_codepointSize(codepoint);
+        nn_setPixel(gpu->currentScreen, x, y, nni_gpu_makePixel(gpu, buf));
        } else {
            unsigned int codepoint = (unsigned char)s[current];
            nn_setPixel(gpu->currentScreen, x, y, nni_gpu_makePixel(gpu, s + current));
            current++;
        }
        if(isVertical) {
            y++;
        } else {
            x++;
        }
        len++;
    }
    nn_simulateBufferedIndirect(component, 1, gpu->ctrl.screenSetsPerTick);
@ -297,12 +290,9 @@ void nni_gpu_fill(nni_gpu *gpu, void *_, nn_component *component, nn_computer *c
        nn_setCError(computer, "bad argument #5 (character expected)");
        return;
    }
    if(!nn_unicode_validate(s)) {
        nn_setCError(computer, "invalid utf-8");
        return;
    }
-    int codepoint = nn_unicode_codepointAt(s, 0);
+    nn_size_t startIdx = 0;
    int codepoint = nn_unicode_nextCodepointPermissive(s, &startIdx);
    // prevent DoS
    if(x < 0) x = 0;
--- a/src/neonucleus.h
+++ b/src/neonucleus.h
@ -319,6 +319,16 @@ unsigned int nn_unicode_lowerCodepoint(unsigned int codepoint);
 // returned string must be nn_deallocStr()'d
 char *nn_unicode_lower(nn_Alloc *alloc, const char *s);
 // "permissive" means it allows invalid UTF-8, in which case each invalid byte is treated as its own codepoint
 // it will return the codepoint starting at byte *index, and will also advance *index to the byte immediately after it
 // since it is permissive, it supports invalid UTF-8
 unsigned int nn_unicode_nextCodepointPermissive(const char *s, nn_size_t *index);
 nn_size_t nn_unicode_lenPermissive(const char *s);
 nn_size_t nn_unicode_wlenPermissive(const char *s);
 // if not found, it will return -1. This is why it is an nn_intptr_t
 nn_intptr_t nn_unicode_indexPermissive(const char *s, nn_size_t codepointIndex);
 // Data card stuff
 // Hashing
--- a/src/sandbox.lua
+++ b/src/sandbox.lua
@ -400,9 +400,16 @@ sandbox = {
    utf8 = copy(utf8),
    unicode = copy(unicode, {
-        isWide = function(s) return unicode.wlen(s) > unicode.len(s) end,
+        isWide = function(s)
            local c = unicode.sub(s, 1, 1)
            return unicode.wlen(c) > unicode.len(c)
        end,
        upper = string.upper,
        lower = string.lower,
 		wtrunc = function (str,space)
 			space = space - 1
 			return str:sub(1,(space >= utf8.len(str)) and (#str) or (utf8.offset(str,space+1)-1))
 		end,
    }),
    checkArg = checkArg,
    component = libcomponent,
--- a/src/testLuaArch.c
+++ b/src/testLuaArch.c
@ -469,10 +469,7 @@ int testLuaArch_unicode_sub(lua_State *L) {
    const char *s = luaL_checkstring(L, 1);
    nn_Alloc *alloc = testLuaArch_getAlloc(L);
    int start = luaL_checkinteger(L, 2);
-    if(!nn_unicode_validate(s)) {
+    int len = nn_unicode_lenPermissive(s);
        luaL_error(L, "invalid utf-8");
    }
    int len = nn_unicode_len(s);
    if(len < 0) {
        luaL_error(L, "length overflow");
    }
@ -503,22 +500,9 @@ int testLuaArch_unicode_sub(lua_State *L) {
        return 1;
    }
-    // there is a way to do it without an allocation
+    nn_size_t startByte = nn_unicode_indexPermissive(s, start - 1);
-    // however, I'm lazy
+    nn_size_t termByte = nn_unicode_indexPermissive(s, stop);
-    size_t pointLen;
+    const char *res = testLuaArch_pushlstring(L, s + startByte, termByte - startByte);
    unsigned int *points = nn_unicode_codepoints(alloc, s, &pointLen);
    if(points == NULL) {
        luaL_error(L, "out of memory");
    }
    char *sub = nn_unicode_char(alloc, points + start - 1, stop - start + 1);
    if(sub == NULL) {
        nn_dealloc(alloc, points, sizeof(unsigned int) * pointLen);
        luaL_error(L, "out of memory");
    }
    const char *res = testLuaArch_pushstring(L, sub);
    nn_deallocStr(alloc, sub);
    nn_dealloc(alloc, points, sizeof(unsigned int) * pointLen);
    if (!res) {
        luaL_error(L, "out of memory");
    }
@ -555,10 +539,13 @@ int testLuaArch_unicode_char(lua_State *L) {
 int testLuaArch_unicode_len(lua_State *L) {
    const char *s = luaL_checkstring(L, 1);
-    if(!nn_unicode_validate(s)) {
+    lua_pushinteger(L, nn_unicode_lenPermissive(s));
-        luaL_error(L, "invalid utf-8");
+    return 1;
-    }
+}
-    lua_pushinteger(L, nn_unicode_len(s));
+
 // Lua binding for unicode.wlen(s).
 // Pushes the display width of s as computed by nn_unicode_wlenPermissive(),
 // so strings containing invalid UTF-8 are accepted instead of raising an error.
 // luaL_checkstring raises a Lua error if argument 1 is not a string.
 // Returns 1 (the number of Lua results pushed).
 int testLuaArch_unicode_wlen(lua_State *L) {
    const char *s = luaL_checkstring(L, 1);
    // Use the width helper here: the plain codepoint count is what unicode.len
    // reports, and returning it from wlen makes wlen(s) == len(s) for every s.
    lua_pushinteger(L, nn_unicode_wlenPermissive(s));
    return 1;
 }
@ -653,7 +640,7 @@ void testLuaArch_loadEnv(lua_State *L) {
    lua_setfield(L, unicode, "sub");
    lua_pushcfunction(L, testLuaArch_unicode_len);
    lua_setfield(L, unicode, "len");
-    lua_pushcfunction(L, testLuaArch_unicode_len);
+    lua_pushcfunction(L, testLuaArch_unicode_wlen);
    lua_setfield(L, unicode, "wlen");
    lua_pushcfunction(L, testLuaArch_unicode_char);
    lua_setfield(L, unicode, "char");
--- a/src/unicode.c
+++ b/src/unicode.c
@ -329,7 +329,7 @@ nn_size_t nn_unicode_codepointSize(unsigned int codepoint) {
 void nn_unicode_codepointToChar(char *buffer, unsigned int codepoint, nn_size_t *len) {
    nn_size_t codepointSize = nn_unicode_codepointSize(codepoint);
-    *len = codepointSize;
+    if(len != NULL) *len = codepointSize;
    nn_memset(buffer, 0, 4); // Clear static array
@ -391,3 +391,45 @@ unsigned int nn_unicode_upperCodepoint(unsigned int codepoint);
 char *nn_unicode_upper(nn_Alloc *alloc, const char *s);
 unsigned int nn_unicode_lowerCodepoint(unsigned int codepoint);
 char *nn_unicode_lower(nn_Alloc *alloc, const char *s);
 // Number of bytes announced by a UTF-8 lead byte (1 to 4), or 0 when the byte
 // is not a lead byte (a continuation byte, or 0xF8 and above).
 static nn_size_t nni_unicode_leadByteLength(unsigned char lead) {
    if((lead & 0x80) == 0x00) return 1;
    if((lead & 0xE0) == 0xC0) return 2;
    if((lead & 0xF0) == 0xE0) return 3;
    if((lead & 0xF8) == 0xF0) return 4;
    return 0;
 }

 // Decodes one codepoint from s starting at byte offset *index and advances
 // *index past the bytes that were consumed.
 //
 // If nn_unicode_isValidCodepoint() accepts the bytes at s + *index, the value
 // from nn_unicode_codepointAt() is returned. Otherwise the single byte at
 // *index is returned as the codepoint (0..255) and *index advances by one.
 //
 // s     - string to read from; the caller must ensure *index is in bounds
 // index - in/out byte offset into s
 unsigned int nn_unicode_nextCodepointPermissive(const char *s, nn_size_t *index) {
    nn_size_t i = *index;
    if(nn_unicode_isValidCodepoint(s + i)) {
        unsigned int p = nn_unicode_codepointAt(s, i);
        // Advance by the length the lead byte announces rather than by the
        // canonical size of the decoded value. The two agree for canonical
        // encodings, but if a suboptimal (overlong) encoding is accepted as
        // valid, the canonical size is too small and the leftover
        // continuation bytes would be re-read as separate codepoints.
        nn_size_t size = nni_unicode_leadByteLength((unsigned char)s[i]);
        if(size == 0) {
            // NOTE(review): not expected for a sequence reported as valid;
            // keep the previous behaviour as a fallback.
            size = nn_unicode_codepointSize(p);
        }
        *index = i + size;
        return p;
    }
    // Invalid sequence: the raw byte value stands in for the codepoint.
    unsigned int p = (unsigned char)s[i];
    *index = i + 1;
    return p;
 }
 // Counts the codepoints in the NUL-terminated string b, stepping through it
 // with nn_unicode_nextCodepointPermissive() so invalid UTF-8 is tolerated.
 // Returns 0 for an empty string.
 nn_size_t nn_unicode_lenPermissive(const char *b) {
    nn_size_t count = 0;
    // pos is advanced by the decoder; one iteration corresponds to one codepoint
    for(nn_size_t pos = 0; b[pos] != 0; count++) {
        nn_unicode_nextCodepointPermissive(b, &pos);
    }
    return count;
 }
 // Sums nn_unicode_charWidth() over every codepoint of the NUL-terminated
 // string s. Codepoints are obtained with nn_unicode_nextCodepointPermissive(),
 // so invalid UTF-8 is tolerated. Returns 0 for an empty string.
 nn_size_t nn_unicode_wlenPermissive(const char *s) {
    nn_size_t total = 0;
    nn_size_t pos = 0;
    for(;;) {
        if(s[pos] == 0) break;
        // the decoder moves pos forward past the codepoint it returns
        unsigned int cp = nn_unicode_nextCodepointPermissive(s, &pos);
        total += nn_unicode_charWidth(cp);
    }
    return total;
 }
 // Converts a 0-based codepoint index into a byte offset within the
 // NUL-terminated string s, stepping with nn_unicode_nextCodepointPermissive()
 // so invalid UTF-8 is tolerated.
 //
 // Returns the byte offset at which codepoint number codepointIndex starts.
 // An index equal to the number of codepoints yields the offset of the
 // terminator (one past the last codepoint), which is useful as an exclusive
 // end bound. Returns -1 if codepointIndex is larger than that.
 nn_intptr_t nn_unicode_indexPermissive(const char *s, nn_size_t codepointIndex) {
    nn_size_t bytes = 0;
    while(codepointIndex > 0) {
        // Reaching the terminator with codepoints still left to skip means the
        // index is out of range; stop here instead of reading past the string.
        if(s[bytes] == 0) return -1;
        nn_unicode_nextCodepointPermissive(s, &bytes);
        codepointIndex--;
    }
    return (nn_intptr_t)bytes;
 }