mirror of
https://github.com/NeoFlock/neonucleus.git
synced 2025-09-24 09:03:32 +02:00
178 lines
5.3 KiB
C
178 lines
5.3 KiB
C
#include "neonucleus.h"
|
|
#include <assert.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
|
|
// Returns true when `byte` has the UTF-8 continuation-byte form 10xxxxxx.
static bool nn_unicode_is_continuation(unsigned char byte) {
    return (byte & 0xC0) == 0x80;
}

// Checks whether the NUL-terminated string `b` is well-formed UTF-8.
//
// Returns true only if every sequence in `b`:
//   - starts with a valid lead byte (1 to 4 byte forms),
//   - is followed by the right number of continuation bytes,
//   - is not an overlong encoding (uses the shortest possible form),
//   - does not encode a UTF-16 surrogate (U+D800..U+DFFF),
//   - does not encode a value above U+10FFFF.
// The empty string is valid.
bool nn_unicode_validate(const char *b) {
    const unsigned char *s = (const unsigned char *)b;

    while (*s) {
        size_t trailing;       // number of continuation bytes expected
        unsigned int point;    // code point accumulated so far
        unsigned int smallest; // smallest code point allowed for this length

        if (s[0] <= 0x7F) {
            s++;
            continue;
        } else if ((s[0] >> 5) == 0x06) {
            trailing = 1;
            point = s[0] & 0x1F;
            smallest = 0x80;
        } else if ((s[0] >> 4) == 0x0E) {
            trailing = 2;
            point = s[0] & 0x0F;
            smallest = 0x800;
        } else if ((s[0] >> 3) == 0x1E) {
            trailing = 3;
            point = s[0] & 0x07;
            smallest = 0x10000;
        } else {
            // Stray continuation byte or a lead byte of an unsupported form.
            return false;
        }

        for (size_t i = 1; i <= trailing; i++) {
            // The terminating NUL is not a continuation byte, so a truncated
            // sequence is rejected here before anything past it is read.
            if (!nn_unicode_is_continuation(s[i])) {
                return false;
            }
            point = (point << 6) | (s[i] & 0x3F);
        }

        if (point < smallest) {
            return false; // overlong encoding
        }
        if (point >= 0xD800 && point <= 0xDFFF) {
            return false; // surrogate half
        }
        if (point > 0x10FFFF) {
            return false; // outside the Unicode code space
        }

        s += trailing + 1;
    }

    return true;
}
|
|
|
|
// A general-purpose Unicode library that assumes strings are UTF-8 encoded.
// It powers the Lua architecture's re-implementation of the Unicode API.
// It can also be used on its own for general Unicode handling.
|
|
|
|
// Encodes `codepointCount` code points from `codepoints` into one freshly
// allocated, NUL-terminated UTF-8 string.
//
// Returns the buffer obtained from nn_malloc, or NULL if that allocation
// fails.
char *nn_unicode_char(unsigned int *codepoints, size_t codepointCount) {
    // First pass: total the encoded size so a single allocation suffices.
    size_t total = 0;
    size_t idx = 0;
    while (idx < codepointCount) {
        total += nn_unicode_codepointSize(codepoints[idx]);
        idx++;
    }

    char *out = nn_malloc(total + 1);
    if (out == NULL) {
        return NULL;
    }

    // Second pass: encode each code point and append its bytes.
    size_t written = 0;
    for (idx = 0; idx < codepointCount; idx++) {
        size_t encodedLen = 0;
        const char *encoded = nn_unicode_codepointToChar(codepoints[idx], &encodedLen);
        memcpy(out + written, encoded, encodedLen);
        written += encodedLen;
    }
    out[written] = '\0';

    // Both passes must agree on the byte count.
    assert(written == total);

    return out;
}
|
|
|
|
// Decodes the NUL-terminated UTF-8 string `s` into an array of code points.
//
// The array holds exactly nn_unicode_len(s) entries and is obtained from
// nn_malloc; NULL is returned if that allocation fails.
// NOTE(review): presumably the caller releases the array with the matching
// nn_ deallocator - confirm against the header.
unsigned int *nn_unicode_codepoints(const char *s) {
    size_t l = nn_unicode_len(s);
    unsigned int *buf = nn_malloc(sizeof(unsigned int) * l);
    if (buf == NULL) {
        return NULL;
    }

    size_t cur = 0;
    size_t bufidx = 0;
    // nn_unicode_len sizes the buffer from lead bytes, while this loop
    // advances by the size of the decoded value. On malformed input (for
    // example an overlong sequence) the two can disagree, so the write index
    // is bounded by `l` to keep every store inside the allocation.
    while (bufidx < l && s[cur] != 0) {
        unsigned int point = nn_unicode_codepointAt(s, cur);
        buf[bufidx++] = point;

        // Advance one byte at a time so the cursor never steps over the
        // terminator when a sequence is cut short.
        size_t step = nn_unicode_codepointSize(point);
        while (step > 0 && s[cur] != 0) {
            cur++;
            step--;
        }
    }

    // Defensive: never hand back uninitialized entries.
    while (bufidx < l) {
        buf[bufidx++] = 0;
    }

    return buf;
}
|
|
|
|
// Counts the UTF-8 sequences in the NUL-terminated string `b`.
//
// The width of each sequence is taken from its lead byte alone; the bytes
// that follow are skipped without being checked. A byte that is not a valid
// lead byte (a stray continuation byte, or 0xF8..0xFF) counts as one unit of
// width 1, so the scan always makes progress. A sequence cut short by the
// terminator counts as one unit and ends the scan at the terminator.
size_t nn_unicode_len(const char *b) {
    const unsigned char *s = (const unsigned char *)b;
    size_t count = 0;

    while (*s) {
        size_t width = 1; // ASCII, or any byte that is not a valid lead byte
        if ((s[0] >> 5) == 0x06) {
            width = 2;
        } else if ((s[0] >> 4) == 0x0E) {
            width = 3;
        } else if ((s[0] >> 3) == 0x1E) {
            width = 4;
        }
        count++;

        // Step byte by byte so a truncated sequence cannot carry the cursor
        // past the terminating NUL.
        while (width > 0 && *s) {
            s++;
            width--;
        }
    }

    return count;
}
|
|
|
|
// Decodes the UTF-8 sequence that starts `byteOffset` bytes into `s` and
// returns its code point.
//
// The sequence length comes from the lead byte. Continuation bytes are not
// checked for the 10xxxxxx form; only their low six bits are used.
// Returns 0 when the byte at `byteOffset` is not a valid lead byte, or when
// the terminating NUL is reached before the sequence is complete (nothing
// past the terminator is read).
// See nn_unicode_codepointToChar for the matching encoder.
unsigned int nn_unicode_codepointAt(const char *s, size_t byteOffset) {
    const unsigned char *b = (const unsigned char *)s + byteOffset;
    const unsigned char subpartMask = 0x3F; // payload bits of a continuation byte

    unsigned int point;
    size_t trailing; // number of continuation bytes that follow the lead byte

    if (b[0] <= 0x7F) {
        return b[0];
    } else if ((b[0] >> 5) == 0x06) {
        point = b[0] & 0x1F;
        trailing = 1;
    } else if ((b[0] >> 4) == 0x0E) {
        point = b[0] & 0x0F;
        trailing = 2;
    } else if ((b[0] >> 3) == 0x1E) {
        point = b[0] & 0x07;
        trailing = 3;
    } else {
        return 0;
    }

    for (size_t i = 1; i <= trailing; i++) {
        if (b[i] == 0) {
            // Truncated sequence: stop at the terminator.
            return 0;
        }
        point = (point << 6) | (unsigned int)(b[i] & subpartMask);
    }

    return point;
}
|
|
|
|
// Returns how many bytes the UTF-8 encoding of `codepoint` occupies (1 to 4).
// Values above 0x10FFFF are reported as 1 byte.
size_t nn_unicode_codepointSize(unsigned int codepoint) {
    if (codepoint > 0x10FFFF) {
        return 1;
    }
    if (codepoint >= 0x10000) {
        return 4;
    }
    if (codepoint >= 0x0800) {
        return 3;
    }
    if (codepoint >= 0x0080) {
        return 2;
    }
    return 1;
}
|
|
|
|
// Encodes `codepoint` as UTF-8 into a function-local static buffer.
//
// `len`, when not NULL, receives the number of encoded bytes (1 to 4, as
// reported by nn_unicode_codepointSize).
// Returns a pointer to the static buffer. The bytes are always followed by
// a NUL, so the result is also usable as a C string. The buffer is
// overwritten by the next call, so copy the bytes out before calling again.
// See nn_unicode_codepointAt for the matching decoder.
const char *nn_unicode_codepointToChar(unsigned int codepoint, size_t *len) {
    // One byte larger than the longest encoding so a 4-byte sequence is
    // still NUL-terminated.
    static char buffer[5];

    size_t codepointSize = nn_unicode_codepointSize(codepoint);
    if (len != NULL) {
        *len = codepointSize;
    }

    memset(buffer, 0, sizeof buffer); // clear leftovers from the previous call

    if (codepointSize == 1) {
        buffer[0] = (char)codepoint;
    } else if (codepointSize == 2) {
        buffer[0] = (char)(0xC0 + ((codepoint >> 6) & 0x1F));
        buffer[1] = (char)(0x80 + (codepoint & 0x3F));
    } else if (codepointSize == 3) {
        buffer[0] = (char)(0xE0 + ((codepoint >> 12) & 0x0F));
        buffer[1] = (char)(0x80 + ((codepoint >> 6) & 0x3F));
        buffer[2] = (char)(0x80 + (codepoint & 0x3F));
    } else if (codepointSize == 4) {
        buffer[0] = (char)(0xF0 + ((codepoint >> 18) & 0x07));
        buffer[1] = (char)(0x80 + ((codepoint >> 12) & 0x3F));
        buffer[2] = (char)(0x80 + ((codepoint >> 6) & 0x3F));
        buffer[3] = (char)(0x80 + (codepoint & 0x3F));
    }

    return buffer;
}
|
|
|
|
// Forward declaration only; no definition appears in this part of the file.
// Presumably reports the display width of `codepoint` - TODO confirm.
size_t nn_unicode_charWidth(unsigned int codepoint);
|
|
|
|
// Forward declaration only; no definition appears in this part of the file.
// Presumably reports the total display width of the string `s` - TODO confirm.
size_t nn_unicode_wlen(const char *s);
|
|
|
|
// NOT IMPLEMENTED YET
|
|
// Forward declaration only; no definition appears in this part of the file.
// Presumably converts `s` to upper case in place (it takes a mutable
// buffer) - TODO confirm.
void nn_unicode_upper(char *s);
|
|
// Forward declaration only; no definition appears in this part of the file.
// Presumably converts `s` to lower case in place (it takes a mutable
// buffer) - TODO confirm.
void nn_unicode_lower(char *s);
|