unicode now allows invalid UTF-8

IonutParau 2025-07-13 11:58:25 +02:00
parent 2621554165
commit 64a6b84b30
7 changed files with 91 additions and 47 deletions
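The heart of the change is a "permissive" decode rule: valid UTF-8 decodes as usual, and any byte that does not begin a valid sequence is treated as its own codepoint, so a stray 0xFF becomes U+00FF instead of an error. A minimal standalone sketch of that rule (the names here are illustrative, not the library's):

```c
#include <stdio.h>
#include <stddef.h>

/* Decode valid UTF-8 normally; when the bytes at *i do not form a valid
 * sequence, emit that single byte as the codepoint and advance by one.
 * This sketch does not reject overlong encodings or surrogates, the same
 * edge case the commit's own TODO comment flags. */
static unsigned int next_cp_permissive(const char *s, size_t *i) {
    const unsigned char *u = (const unsigned char *)s + *i;
    unsigned int cp;
    size_t n;
    if (u[0] < 0x80) {
        cp = u[0]; n = 1;
    } else if ((u[0] & 0xE0) == 0xC0 && (u[1] & 0xC0) == 0x80) {
        cp = ((u[0] & 0x1Fu) << 6) | (u[1] & 0x3F); n = 2;
    } else if ((u[0] & 0xF0) == 0xE0 && (u[1] & 0xC0) == 0x80 && (u[2] & 0xC0) == 0x80) {
        cp = ((u[0] & 0x0Fu) << 12) | ((u[1] & 0x3Fu) << 6) | (u[2] & 0x3F); n = 3;
    } else if ((u[0] & 0xF8) == 0xF0 && (u[1] & 0xC0) == 0x80 && (u[2] & 0xC0) == 0x80 && (u[3] & 0xC0) == 0x80) {
        cp = ((u[0] & 0x07u) << 18) | ((u[1] & 0x3Fu) << 12) | ((u[2] & 0x3Fu) << 6) | (u[3] & 0x3F); n = 4;
    } else {
        cp = u[0]; n = 1; /* invalid: the byte value is the codepoint */
    }
    *i += n;
    return cp;
}

int main(void) {
    const char s[] = "A\xC3\xA9\xFF" "z"; /* 'A', U+00E9, stray 0xFF, 'z' */
    size_t i = 0;
    while (s[i])
        printf("U+%04X\n", next_cp_permissive(s, &i)); /* U+0041, U+00E9, U+00FF, U+007A */
    return 0;
}
```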

View File

@ -1,10 +1,8 @@
 # Parity with Vanilla OC (only the stuff that makes sense for an emulator)
-- make the `unicode` library in testLuaArch support invalid UTF-8 (WHY IS IT OK WITH THAT)
 - in-memory version of `filesystem`
 - complete the GPU implementation (screen buffers and missing methods)
 - complete the screen implementation (bunch of missing methods)
-- support invalid UTF-8 for GPU set and fill, which should pretend the byte value is the codepoint.
 - `hologram` component
 - `computer` component
 - `modem` component
@ -20,6 +18,7 @@
 - Rework filesystem component to pre-process paths to ensure proper sandboxing and not allow arbitrary remote file access
 - Do a huge audit at some point
+- `nn_unicode_charWidth` appears to be bugged, look into that.
 # The extra components

View File

@ -242,7 +242,16 @@ void nn_eeprom_getChecksum(nn_eeprom *eeprom, void *_, nn_component *component,
     nn_data_crc32(buf, dataLen + codeLen, hash);
     nn_dealloc(alloc, buf, dataCap + codeCap);
-    nn_return_string(computer, hash, sizeof(hash));
+    // Hex-encode the 4-byte CRC32 so the returned string is always valid UTF-8.
+    char encoded[8];
+    const char *hex = "0123456789abcdef";
+    for(int i = 0; i < 4; i++) {
+        unsigned char b = hash[i];
+        encoded[i*2] = hex[b >> 4];
+        encoded[i*2+1] = hex[b & 0xF];
+    }
+    nn_return_string(computer, encoded, sizeof(encoded));
     nn_eeprom_readCost(component, dataLen + codeLen);
 }
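The checksum change rides along with the unicode work because a raw CRC32 is four arbitrary bytes, which may not form valid UTF-8 once returned as a string. A standalone version of the same nibble-to-hex loop:

```c
#include <stdio.h>

/* Turn a 4-byte CRC32 into 8 lowercase hex characters, one nibble at a time. */
static void crc32_to_hex(const unsigned char hash[4], char out[9]) {
    const char *hex = "0123456789abcdef";
    for (int i = 0; i < 4; i++) {
        out[i * 2]     = hex[hash[i] >> 4];
        out[i * 2 + 1] = hex[hash[i] & 0xF];
    }
    out[8] = '\0'; /* the component above passes an explicit length instead */
}

int main(void) {
    const unsigned char hash[4] = {0x04, 0xA7, 0x2C, 0xEC};
    char out[9];
    crc32_to_hex(hash, out);
    printf("%s\n", out); /* 04a72cec */
    return 0;
}
```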

View File

@ -131,24 +131,17 @@ void nni_gpu_set(nni_gpu *gpu, void *_, nn_component *component, nn_computer *co
         return;
     }
-    int current = 0;
-    int len = 0;
+    nn_size_t current = 0;
     while(s[current] != 0) {
-        if(nn_unicode_isValidCodepoint(s + current)) {
-            int codepoint = nn_unicode_codepointAt(s, current);
-            nn_setPixel(gpu->currentScreen, x, y, nni_gpu_makePixel(gpu, s + current));
-            current += nn_unicode_codepointSize(codepoint);
-        } else {
-            unsigned int codepoint = (unsigned char)s[current];
-            nn_setPixel(gpu->currentScreen, x, y, nni_gpu_makePixel(gpu, s + current));
-            current++;
-        }
+        // Decode permissively, then re-encode so makePixel always receives valid UTF-8.
+        unsigned int codepoint = nn_unicode_nextCodepointPermissive(s, &current);
+        char buf[NN_MAXIMUM_UNICODE_BUFFER];
+        nn_unicode_codepointToChar(buf, codepoint, NULL);
+        nn_setPixel(gpu->currentScreen, x, y, nni_gpu_makePixel(gpu, buf));
         if(isVertical) {
             y++;
         } else {
             x++;
         }
-        len++;
     }
     nn_simulateBufferedIndirect(component, 1, gpu->ctrl.screenSetsPerTick);
@ -297,12 +290,9 @@ void nni_gpu_fill(nni_gpu *gpu, void *_, nn_component *component, nn_computer *c
         nn_setCError(computer, "bad argument #5 (character expected)");
         return;
     }
-    if(!nn_unicode_validate(s)) {
-        nn_setCError(computer, "invalid utf-8");
-        return;
-    }
-    int codepoint = nn_unicode_codepointAt(s, 0);
+    nn_size_t startIdx = 0;
+    int codepoint = nn_unicode_nextCodepointPermissive(s, &startIdx);
     // prevent DoS
     if(x < 0) x = 0;
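The `// prevent DoS` clamp that begins here (the rest of the hunk is cut off above) bounds the fill loop against hostile coordinates. A hedged sketch of the general clamping pattern, not the component's actual code:

```c
/* Clip a fill rectangle to the screen before looping, so huge or
 * negative arguments cannot force excessive iteration. Illustrative only. */
static void clamp_rect(int *x, int *y, int *w, int *h, int scrW, int scrH) {
    if (*x < 0) { *w += *x; *x = 0; }   /* move the left edge into view */
    if (*y < 0) { *h += *y; *y = 0; }   /* move the top edge into view */
    if (*x + *w > scrW) *w = scrW - *x; /* clip the right edge */
    if (*y + *h > scrH) *h = scrH - *y; /* clip the bottom edge */
    if (*w < 0) *w = 0;
    if (*h < 0) *h = 0;
}
```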

View File

@ -319,6 +319,16 @@ unsigned int nn_unicode_lowerCodepoint(unsigned int codepoint);
 // returned string must be nn_deallocStr()'d
 char *nn_unicode_lower(nn_Alloc *alloc, const char *s);
+
+// Permissive means invalid UTF-8 is accepted: each byte of an invalid sequence is treated as its own codepoint.
+// Returns the codepoint starting at byte *index and advances *index to the byte after that codepoint.
+unsigned int nn_unicode_nextCodepointPermissive(const char *s, nn_size_t *index);
+nn_size_t nn_unicode_lenPermissive(const char *s);
+nn_size_t nn_unicode_wlenPermissive(const char *s);
+// Returns the byte offset of the given codepoint index, or -1 if it is out of range (hence nn_intptr_t).
+nn_intptr_t nn_unicode_indexPermissive(const char *s, nn_size_t codepointIndex);
+
 // Data card stuff
 // Hashing
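Assuming these declarations and linkage against the library (the header name below is hypothetical), usage looks like:

```c
#include <stdio.h>
#include "neonucleus.h" /* hypothetical header name for the declarations above */

int main(void) {
    const char *s = "a\xFF" "b"; /* 0xFF is not valid UTF-8 */
    printf("len = %u\n", (unsigned)nn_unicode_lenPermissive(s)); /* 3 codepoints */
    nn_size_t i = 0;
    while (s[i]) {
        /* the stray byte decodes as its own codepoint: U+0061, U+00FF, U+0062 */
        printf("U+%04X\n", nn_unicode_nextCodepointPermissive(s, &i));
    }
    /* byte offset of the third codepoint ('b'), or -1 if out of range */
    printf("index = %ld\n", (long)nn_unicode_indexPermissive(s, 2));
    return 0;
}
```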

View File

@ -400,9 +400,16 @@ sandbox = {
     utf8 = copy(utf8),
     unicode = copy(unicode, {
-        isWide = function(s) return unicode.wlen(s) > unicode.len(s) end,
+        isWide = function(s)
+            -- only the first character decides wideness
+            local c = unicode.sub(s, 1, 1)
+            return unicode.wlen(c) > unicode.len(c)
+        end,
         upper = string.upper,
         lower = string.lower,
+        wtrunc = function(str, space)
+            space = space - 1
+            return str:sub(1, (space >= utf8.len(str)) and #str or (utf8.offset(str, space + 1) - 1))
+        end,
     }),
     checkArg = checkArg,
     component = libcomponent,
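For comparison, a width-aware truncation in the spirit of OC's `unicode.wtrunc` can be built on the permissive C API. This is a hedged sketch that compiles against the library (hypothetical header name), not the sandbox's implementation above, which approximates with character counts via `utf8.len`/`utf8.offset`:

```c
#include "neonucleus.h" /* hypothetical header name */

/* Byte length of the longest prefix of s whose display width stays
 * strictly below `space`, mirroring wtrunc's contract. */
static nn_size_t wtrunc_bytes(const char *s, nn_size_t space) {
    nn_size_t i = 0;
    nn_size_t width = 0;
    while (s[i]) {
        nn_size_t next = i;
        unsigned int cp = nn_unicode_nextCodepointPermissive(s, &next);
        width += nn_unicode_charWidth(cp);
        if (width >= space) break; /* this codepoint would reach the limit */
        i = next;
    }
    return i;
}
```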

View File

@ -469,10 +469,7 @@ int testLuaArch_unicode_sub(lua_State *L) {
     const char *s = luaL_checkstring(L, 1);
     nn_Alloc *alloc = testLuaArch_getAlloc(L);
     int start = luaL_checkinteger(L, 2);
-    if(!nn_unicode_validate(s)) {
-        luaL_error(L, "invalid utf-8");
-    }
-    int len = nn_unicode_len(s);
+    int len = nn_unicode_lenPermissive(s);
     if(len < 0) {
         luaL_error(L, "length overflow");
     }
@ -503,22 +500,9 @@ int testLuaArch_unicode_sub(lua_State *L) {
         return 1;
     }
-    // there is a way to do it without an allocation
-    // however, I'm lazy
-    size_t pointLen;
-    unsigned int *points = nn_unicode_codepoints(alloc, s, &pointLen);
-    if(points == NULL) {
-        luaL_error(L, "out of memory");
-    }
-    char *sub = nn_unicode_char(alloc, points + start - 1, stop - start + 1);
-    if(sub == NULL) {
-        nn_dealloc(alloc, points, sizeof(unsigned int) * pointLen);
-        luaL_error(L, "out of memory");
-    }
-    const char *res = testLuaArch_pushstring(L, sub);
-    nn_deallocStr(alloc, sub);
-    nn_dealloc(alloc, points, sizeof(unsigned int) * pointLen);
+    // Allocation-free: map codepoint indices to byte offsets and push the byte range directly.
+    nn_size_t startByte = nn_unicode_indexPermissive(s, start - 1);
+    nn_size_t termByte = nn_unicode_indexPermissive(s, stop);
+    const char *res = testLuaArch_pushlstring(L, s + startByte, termByte - startByte);
     if (!res) {
         luaL_error(L, "out of memory");
     }
@ -555,10 +539,13 @@ int testLuaArch_unicode_char(lua_State *L) {
 int testLuaArch_unicode_len(lua_State *L) {
     const char *s = luaL_checkstring(L, 1);
-    if(!nn_unicode_validate(s)) {
-        luaL_error(L, "invalid utf-8");
-    }
-    lua_pushinteger(L, nn_unicode_len(s));
+    lua_pushinteger(L, nn_unicode_lenPermissive(s));
     return 1;
 }
+
+int testLuaArch_unicode_wlen(lua_State *L) {
+    const char *s = luaL_checkstring(L, 1);
+    lua_pushinteger(L, nn_unicode_wlenPermissive(s));
+    return 1;
+}
@ -653,7 +640,7 @@ void testLuaArch_loadEnv(lua_State *L) {
     lua_setfield(L, unicode, "sub");
     lua_pushcfunction(L, testLuaArch_unicode_len);
     lua_setfield(L, unicode, "len");
-    lua_pushcfunction(L, testLuaArch_unicode_len);
+    lua_pushcfunction(L, testLuaArch_unicode_wlen);
     lua_setfield(L, unicode, "wlen");
     lua_pushcfunction(L, testLuaArch_unicode_char);
     lua_setfield(L, unicode, "char");
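The rewritten `unicode.sub` above slices by byte offsets instead of materializing a codepoint array. The same idea in isolation (hypothetical header name; 0-based start and exclusive end, matching the code above):

```c
#include <stdio.h>
#include "neonucleus.h" /* hypothetical header name */

int main(void) {
    const char *s = "ab\xC3\xA9" "cd"; /* a b é c d: 5 codepoints, 6 bytes */
    /* unicode.sub(s, 3, 4) == "éc": bytes [index(start-1), index(stop)) */
    nn_size_t startByte = nn_unicode_indexPermissive(s, 2);
    nn_size_t termByte = nn_unicode_indexPermissive(s, 4);
    printf("%.*s\n", (int)(termByte - startByte), s + startByte); /* éc */
    return 0;
}
```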

View File

@ -329,7 +329,7 @@ nn_size_t nn_unicode_codepointSize(unsigned int codepoint) {
 void nn_unicode_codepointToChar(char *buffer, unsigned int codepoint, nn_size_t *len) {
     nn_size_t codepointSize = nn_unicode_codepointSize(codepoint);
-    *len = codepointSize;
+    if(len != NULL) *len = codepointSize; // callers may pass NULL when they only need the bytes
     nn_memset(buffer, 0, 4); // clear the output buffer
@ -391,3 +391,45 @@ unsigned int nn_unicode_upperCodepoint(unsigned int codepoint);
 char *nn_unicode_upper(nn_Alloc *alloc, const char *s);
 unsigned int nn_unicode_lowerCodepoint(unsigned int codepoint);
 char *nn_unicode_lower(nn_Alloc *alloc, const char *s);
+
+unsigned int nn_unicode_nextCodepointPermissive(const char *s, nn_size_t *index) {
+    nn_size_t i = *index;
+    if(nn_unicode_isValidCodepoint(s + i)) {
+        // TODO: handle the edge case where a suboptimal (overlong) encoding is used
+        unsigned int p = nn_unicode_codepointAt(s, i);
+        *index = i + nn_unicode_codepointSize(p);
+        return p;
+    }
+    // Invalid sequence: the byte value itself is the codepoint.
+    unsigned int p = (unsigned char)s[i];
+    *index = i + 1;
+    return p;
+}
+
+nn_size_t nn_unicode_lenPermissive(const char *b) {
+    nn_size_t len = 0;
+    nn_size_t cur = 0;
+    while(b[cur]) {
+        nn_unicode_nextCodepointPermissive(b, &cur);
+        len++;
+    }
+    return len;
+}
+
+nn_size_t nn_unicode_wlenPermissive(const char *s) {
+    nn_size_t wlen = 0;
+    nn_size_t cur = 0;
+    while(s[cur]) {
+        unsigned int codepoint = nn_unicode_nextCodepointPermissive(s, &cur);
+        wlen += nn_unicode_charWidth(codepoint);
+    }
+    return wlen;
+}
+
+nn_intptr_t nn_unicode_indexPermissive(const char *s, nn_size_t codepointIndex) {
+    nn_size_t bytes = 0;
+    while(true) {
+        if(codepointIndex == 0) return bytes;
+        if(s[bytes] == 0) return -1; // ran off the end: index out of range
+        nn_unicode_nextCodepointPermissive(s, &bytes);
+        codepointIndex--;
+    }
+}
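With the end-of-string check added above, `nn_unicode_indexPermissive` honors its documented -1 contract instead of reading past the terminator. A quick check of the expected offsets (hypothetical header name):

```c
#include <stdio.h>
#include "neonucleus.h" /* hypothetical header name */

int main(void) {
    const char *s = "h\xC3\xA9llo"; /* "héllo": 5 codepoints, 6 bytes */
    for (nn_size_t i = 0; i <= 6; i++) {
        /* expected: 0, 1, 3, 4, 5, then 6 (the total byte length), then -1 */
        printf("index %u -> %ld\n", (unsigned)i, (long)nn_unicode_indexPermissive(s, i));
    }
    return 0;
}
```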