From 64a6b84b30a3cccbbd5216e545a99e7fe8b62b15 Mon Sep 17 00:00:00 2001 From: IonutParau Date: Sun, 13 Jul 2025 11:58:25 +0200 Subject: [PATCH] unicode now allows invalid unicode --- TODO.md | 3 +-- src/components/eeprom.c | 11 ++++++++++- src/components/gpu.c | 24 +++++++--------------- src/neonucleus.h | 10 ++++++++++ src/sandbox.lua | 9 ++++++++- src/testLuaArch.c | 37 +++++++++++----------------------- src/unicode.c | 44 ++++++++++++++++++++++++++++++++++++++++- 7 files changed, 91 insertions(+), 47 deletions(-) diff --git a/TODO.md b/TODO.md index b2cc707..661f4e9 100644 --- a/TODO.md +++ b/TODO.md @@ -1,10 +1,8 @@ # Parity with Vanilla OC (only the stuff that makes sense for an emulator) -- make the `unicode` library in testLuaArch support invalid UTF-8 (WHY IS IT OK WITH THAT) - in-memory version of `filesystem` - complete the GPU implementation (screen buffers and missing methods) - complete the screen implementation (bunch of missing methods) -- support invalid UTF-8 for GPU set and fill, which should pretend the byte value is the codepoint. - `hologram` component - `computer` component - `modem` component @@ -20,6 +18,7 @@ - Rework filesystem component to pre-process paths to ensure proper sandboxing and not allow arbitrary remote file access - Do a huge audit at some point +- `nn_unicode_charWidth` appears to be bugged, look into that. 
# The extra components diff --git a/src/components/eeprom.c b/src/components/eeprom.c index c325ebf..35f959b 100644 --- a/src/components/eeprom.c +++ b/src/components/eeprom.c @@ -242,7 +242,16 @@ void nn_eeprom_getChecksum(nn_eeprom *eeprom, void *_, nn_component *component, nn_data_crc32(buf, dataLen + codeLen, hash); nn_dealloc(alloc, buf, dataCap + codeCap); - nn_return_string(computer, hash, sizeof(hash)); + char encoded[8]; /* 4 checksum bytes -> 8 lowercase hex digits, not NUL-terminated */ + + const char *hex = "0123456789abcdef"; + for(int i = 0; i < 4; i++) { + unsigned char b = hash[i]; + encoded[i*2] = hex[b >> 4]; + encoded[i*2+1] = hex[b & 0xF]; + } + + nn_return_string(computer, encoded, sizeof(encoded)); nn_eeprom_readCost(component, dataLen + codeLen); } diff --git a/src/components/gpu.c b/src/components/gpu.c index 97e33c8..31036d2 100644 --- a/src/components/gpu.c +++ b/src/components/gpu.c @@ -131,24 +131,17 @@ void nni_gpu_set(nni_gpu *gpu, void *_, nn_component *component, nn_computer *co return; } - int current = 0; - int len = 0; + nn_size_t current = 0; while(s[current] != 0) { - if(nn_unicode_isValidCodepoint(s + current)) { - int codepoint = nn_unicode_codepointAt(s, current); - nn_setPixel(gpu->currentScreen, x, y, nni_gpu_makePixel(gpu, s + current)); - current += nn_unicode_codepointSize(codepoint); - } else { - unsigned int codepoint = (unsigned char)s[current]; - nn_setPixel(gpu->currentScreen, x, y, nni_gpu_makePixel(gpu, s + current)); - current++; - } + unsigned int codepoint = nn_unicode_nextCodepointPermissive(s, &current); /* decodes one codepoint (an invalid byte counts as its own codepoint) and advances current past it */ + char buf[NN_MAXIMUM_UNICODE_BUFFER]; + nn_unicode_codepointToChar(buf, codepoint, NULL); + nn_setPixel(gpu->currentScreen, x, y, nni_gpu_makePixel(gpu, buf)); if(isVertical) { y++; } else { x++; } - len++; } nn_simulateBufferedIndirect(component, 1, gpu->ctrl.screenSetsPerTick); @@ -297,12 +290,9 @@ void nni_gpu_fill(nni_gpu *gpu, void *_, nn_component *component, nn_computer *c nn_setCError(computer, "bad argument #5 (character expected)"); return; } - if(!nn_unicode_validate(s)) 
{ - nn_setCError(computer, "invalid utf-8"); - return; - } - int codepoint = nn_unicode_codepointAt(s, 0); + nn_size_t startIdx = 0; + int codepoint = nn_unicode_nextCodepointPermissive(s, &startIdx); // prevent DoS if(x < 0) x = 0; diff --git a/src/neonucleus.h b/src/neonucleus.h index 96e65c5..1488428 100644 --- a/src/neonucleus.h +++ b/src/neonucleus.h @@ -319,6 +319,16 @@ unsigned int nn_unicode_lowerCodepoint(unsigned int codepoint); // returned string must be nn_deallocStr()'d char *nn_unicode_lower(nn_Alloc *alloc, const char *s); +// permissive means invalid UTF-8 is accepted: a byte that does not start a valid sequence is treated as a codepoint of its own + +// it will return the codepoint starting at byte *index, and will also advance *index to the byte right after that codepoint +// since it is permissive, it supports invalid UTF-8 +unsigned int nn_unicode_nextCodepointPermissive(const char *s, nn_size_t *index); +nn_size_t nn_unicode_lenPermissive(const char *s); +nn_size_t nn_unicode_wlenPermissive(const char *s); +// if not found, it will return -1. 
This is why it is an nn_intptr_t +nn_intptr_t nn_unicode_indexPermissive(const char *s, nn_size_t codepointIndex); + // Data card stuff // Hashing diff --git a/src/sandbox.lua b/src/sandbox.lua index e1c9b73..b359cfc 100644 --- a/src/sandbox.lua +++ b/src/sandbox.lua @@ -400,9 +400,16 @@ sandbox = { utf8 = copy(utf8), unicode = copy(unicode, { - isWide = function(s) return unicode.wlen(s) > unicode.len(s) end, + isWide = function(s) --[[ only the first character of s is examined ]] + local c = unicode.sub(s, 1, 1) + return unicode.wlen(c) > unicode.len(c) + end, upper = string.upper, lower = string.lower, + wtrunc = function (str,space) --[[ keeps the first space-1 codepoints of str; NOTE(review): utf8.len yields nil for invalid UTF-8, so this presumably errors on such input - confirm ]] + space = space - 1 + return str:sub(1,(space >= utf8.len(str)) and (#str) or (utf8.offset(str,space+1)-1)) + end, }), checkArg = checkArg, component = libcomponent, diff --git a/src/testLuaArch.c b/src/testLuaArch.c index b59d53a..6cfe680 100644 --- a/src/testLuaArch.c +++ b/src/testLuaArch.c @@ -469,10 +469,7 @@ int testLuaArch_unicode_sub(lua_State *L) { const char *s = luaL_checkstring(L, 1); nn_Alloc *alloc = testLuaArch_getAlloc(L); int start = luaL_checkinteger(L, 2); - if(!nn_unicode_validate(s)) { - luaL_error(L, "invalid utf-8"); - } - int len = nn_unicode_len(s); + int len = nn_unicode_lenPermissive(s); if(len < 0) { luaL_error(L, "length overflow"); } @@ -503,22 +500,9 @@ int testLuaArch_unicode_sub(lua_State *L) { return 1; } - // there is a way to do it without an allocation - // however, I'm lazy - size_t pointLen; - unsigned int *points = nn_unicode_codepoints(alloc, s, &pointLen); - if(points == NULL) { - luaL_error(L, "out of memory"); - } - - char *sub = nn_unicode_char(alloc, points + start - 1, stop - start + 1); - if(sub == NULL) { - nn_dealloc(alloc, points, sizeof(unsigned int) * pointLen); - luaL_error(L, "out of memory"); - } - const char *res = testLuaArch_pushstring(L, sub); - nn_deallocStr(alloc, sub); - nn_dealloc(alloc, points, sizeof(unsigned int) * pointLen); + nn_size_t startByte = nn_unicode_indexPermissive(s, start - 1); /* start is 1-based: skip start - 1 codepoints to reach its first byte */ + nn_size_t termByte = 
nn_unicode_indexPermissive(s, stop); + const char *res = testLuaArch_pushlstring(L, s + startByte, termByte - startByte); if (!res) { luaL_error(L, "out of memory"); } @@ -555,10 +539,13 @@ int testLuaArch_unicode_char(lua_State *L) { int testLuaArch_unicode_len(lua_State *L) { const char *s = luaL_checkstring(L, 1); - if(!nn_unicode_validate(s)) { - luaL_error(L, "invalid utf-8"); - } - lua_pushinteger(L, nn_unicode_len(s)); + lua_pushinteger(L, nn_unicode_lenPermissive(s)); + return 1; +} + +int testLuaArch_unicode_wlen(lua_State *L) { + const char *s = luaL_checkstring(L, 1); + lua_pushinteger(L, nn_unicode_wlenPermissive(s)); /* wlen is the summed display width, not the codepoint count that unicode.len returns */ return 1; } @@ -653,7 +640,7 @@ void testLuaArch_loadEnv(lua_State *L) { lua_setfield(L, unicode, "sub"); lua_pushcfunction(L, testLuaArch_unicode_len); lua_setfield(L, unicode, "len"); - lua_pushcfunction(L, testLuaArch_unicode_len); + lua_pushcfunction(L, testLuaArch_unicode_wlen); lua_setfield(L, unicode, "wlen"); lua_pushcfunction(L, testLuaArch_unicode_char); lua_setfield(L, unicode, "char"); diff --git a/src/unicode.c b/src/unicode.c index dd53708..e6f19ad 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -329,7 +329,7 @@ nn_size_t nn_unicode_codepointSize(unsigned int codepoint) { void nn_unicode_codepointToChar(char *buffer, unsigned int codepoint, nn_size_t *len) { nn_size_t codepointSize = nn_unicode_codepointSize(codepoint); - *len = codepointSize; + if(len != NULL) *len = codepointSize; nn_memset(buffer, 0, 4); // Clear static array @@ -391,3 +391,45 @@ unsigned int nn_unicode_upperCodepoint(unsigned int codepoint); char *nn_unicode_upper(nn_Alloc *alloc, const char *s); unsigned int nn_unicode_lowerCodepoint(unsigned int codepoint); char *nn_unicode_lower(nn_Alloc *alloc, const char *s); + +unsigned int nn_unicode_nextCodepointPermissive(const char *s, nn_size_t *index) { + nn_size_t i = *index; + if(nn_unicode_isValidCodepoint(s + i)) { + // TODO: handle edge-case where suboptimal encoding is used + unsigned int p = 
nn_unicode_codepointAt(s, i); + *index = i + nn_unicode_codepointSize(p); + return p; + } + unsigned int p = (unsigned char)s[i]; + *index = i + 1; + return p; +} + +nn_size_t nn_unicode_lenPermissive(const char *b) { /* number of codepoints in b; every invalid byte counts as one codepoint */ + nn_size_t len = 0; + nn_size_t cur = 0; + while(b[cur]) { + nn_unicode_nextCodepointPermissive(b, &cur); + len++; + } + return len; +} + +nn_size_t nn_unicode_wlenPermissive(const char *s) { /* sum of nn_unicode_charWidth over the codepoints of s */ + nn_size_t wlen = 0; + nn_size_t cur = 0; + while (s[cur]) { + unsigned int codepoint = nn_unicode_nextCodepointPermissive(s, &cur); + wlen += nn_unicode_charWidth(codepoint); + } + return wlen; +} + +nn_intptr_t nn_unicode_indexPermissive(const char *s, nn_size_t codepointIndex) { /* byte offset of the codepoint at 0-based codepointIndex, or -1 if s is too short */ + nn_size_t bytes = 0; /* byte offset reached so far */ + for(; codepointIndex > 0; codepointIndex--) { + if(s[bytes] == 0) return -1; /* fewer codepoints than requested: report not found instead of decoding past the terminator */ + nn_unicode_nextCodepointPermissive(s, &bytes); + } + return (nn_intptr_t)bytes; /* codepointIndex equal to the codepoint count yields the offset of the terminator */ +}