#include "neonucleus.h"
// NOTE(review): the system header names were lost in this copy of the file.
// The headers below are the ones the code in this file visibly needs
// (assert, bool, size_t, memcpy/memset) - confirm against the original list.
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

// Returns true when `byte` has the UTF-8 continuation-byte form 10xxxxxx.
// A NUL byte is never a continuation, which is what lets the validator below
// stop safely at the terminator of a truncated sequence.
static bool nn_unicode_is_continuation(unsigned char byte) {
    return (byte >> 6) == 0b10;
}

// Checks that the NUL-terminated string `b` is well-formed UTF-8.
//
// Rejected:
//   - stray continuation bytes and lead bytes 0xF8..0xFF,
//   - sequences cut short by a non-continuation byte or by the terminator,
//   - overlong encodings (C0/C1 leads, E0 80..9F, F0 80..8F),
//   - code points above U+10FFFF (F4 90.. and F5..F7 leads).
//
// Overlong and out-of-range sequences matter beyond pedantry: the decoder in
// this file advances by nn_unicode_codepointSize() of the decoded value, which
// is only equal to the encoded length for shortest-form, in-range input.
//
// NOTE(review): surrogate code points (ED A0..BF xx) are still accepted, as
// before. RFC 3629 forbids them, but nn_unicode_codepointToChar() in this file
// will encode them, so rejecting them here would make the library refuse its
// own output. Tighten this if callers do not need that round trip.
//
// Returns true if every sequence in `b` passes, false otherwise.
// The empty string is valid.
bool nn_unicode_validate(const char *b) {
    const unsigned char *s = (const unsigned char *)b;

    while (*s) {
        if (s[0] <= 0x7F) {
            // Plain ASCII.
            s++;
        } else if ((s[0] >> 5) == 0b110) {
            // 0xC0 and 0xC1 can only start overlong forms of U+0000..U+007F.
            if (s[0] < 0xC2) {
                return false;
            }
            if (!nn_unicode_is_continuation(s[1])) {
                return false;
            }
            s += 2;
        } else if ((s[0] >> 4) == 0b1110) {
            // Each byte is checked before the next one is read, so a
            // terminator inside the sequence ends the scan without over-read.
            if (!nn_unicode_is_continuation(s[1])) {
                return false;
            }
            // E0 80..9F would encode a value below U+0800 (overlong).
            if (s[0] == 0xE0 && s[1] < 0xA0) {
                return false;
            }
            if (!nn_unicode_is_continuation(s[2])) {
                return false;
            }
            s += 3;
        } else if ((s[0] >> 3) == 0b11110) {
            // F5..F7 always encode values above U+10FFFF.
            if (s[0] > 0xF4) {
                return false;
            }
            if (!nn_unicode_is_continuation(s[1])) {
                return false;
            }
            // F0 80..8F would encode a value below U+10000 (overlong).
            if (s[0] == 0xF0 && s[1] < 0x90) {
                return false;
            }
            // F4 90..BF would encode a value above U+10FFFF.
            if (s[0] == 0xF4 && s[1] > 0x8F) {
                return false;
            }
            if (!nn_unicode_is_continuation(s[2])) {
                return false;
            }
            if (!nn_unicode_is_continuation(s[3])) {
                return false;
            }
            s += 4;
        } else {
            // Stray continuation byte or a 0xF8..0xFF lead byte.
            return false;
        }
    }

    return true;
}

// A general Unicode library, which assumes UTF-8 encoded, NUL-terminated strings.
// It is used to power the Lua architecture's Unicode API re-implementation.
// It can also just be used to deal with Unicode text directly.
char *nn_unicode_char(unsigned int *codepoints, size_t codepointCount) { size_t len = 0; for (size_t i = 0; i < codepointCount; i++) { unsigned int codepoint = codepoints[i]; len += nn_unicode_codepointSize(codepoint); } char *buf = nn_malloc(len+1); if (buf == NULL) return buf; size_t j = 0; for (size_t i = 0; i < codepointCount; i++) { int codepoint = codepoints[i]; size_t codepointLen = 0; const char *c = nn_unicode_codepointToChar(codepoint, &codepointLen); memcpy(buf + j, c, codepointLen); j += codepointLen; } buf[j] = '\0'; assert(j == len); // better safe than sorry return buf; } unsigned int *nn_unicode_codepoints(const char *s) { size_t l = nn_unicode_len(s); unsigned int *buf = nn_malloc(sizeof(unsigned int) * l); if(buf == NULL) return NULL; size_t cur = 0; size_t bufidx = 0; while(s[cur] != 0) { unsigned int point = nn_unicode_codepointAt(s, cur); cur += nn_unicode_codepointSize(point); buf[bufidx++] = point; } return buf; } size_t nn_unicode_len(const char *b) { size_t count = 0; const unsigned char* s = (const unsigned char*)b; while (*s) { count++; if(s[0] <= 0x7F) { s++; } else if((s[0] >> 5) == 0b110) { s += 2; } else if((s[0] >> 4) == 0b1110) { s += 3; } else if((s[0] >> 3) == 0b11110) { s += 4; } } return count; } unsigned int nn_unicode_codepointAt(const char *s, size_t byteOffset) { unsigned int point = 0; const unsigned char *b = (const unsigned char *)s + byteOffset; const unsigned char subpartMask = 0b111111; // look into nn_unicode_codepointToChar as well. 
if(b[0] <= 0x7F) { return b[0]; } else if((b[0] >> 5) == 0b110) { point += ((unsigned int)(b[0] & 0b11111)) << 6; point += ((unsigned int)(b[1] & subpartMask)); } else if((b[0] >> 4) == 0b1110) { point += ((unsigned int)(b[0] & 0b1111)) << 12; point += ((unsigned int)(b[1] & subpartMask)) << 6; point += ((unsigned int)(b[2] & subpartMask)); } else if((b[0] >> 3) == 0b11110) { point += ((unsigned int)(b[0] & 0b111)) << 18; point += ((unsigned int)(b[1] & subpartMask)) << 12; point += ((unsigned int)(b[2] & subpartMask)) << 6; point += ((unsigned int)(b[3] & subpartMask)); } return point; } size_t nn_unicode_codepointSize(unsigned int codepoint) { if (codepoint <= 0x007f) { return 1; } else if (codepoint <= 0x07ff) { return 2; } else if (codepoint <= 0xffff) { return 3; } else if (codepoint <= 0x10ffff) { return 4; } return 1; } const char *nn_unicode_codepointToChar(unsigned int codepoint, size_t *len) { size_t codepointSize = nn_unicode_codepointSize(codepoint); *len = codepointSize; static char buffer[4]; memset(buffer, 0, 4); // Clear static array if (codepointSize == 1) { buffer[0] = (char)codepoint; } else if (codepointSize == 2) { buffer[0] = 0b11000000 + ((codepoint >> 6) & 0b11111); buffer[1] = 0b10000000 + (codepoint & 0b111111); } else if (codepointSize == 3) { buffer[0] = 0b11100000 + ((codepoint >> 12) & 0b1111); buffer[1] = 0b10000000 + ((codepoint >> 6) & 0b111111); buffer[2] = 0b10000000 + (codepoint & 0b111111); } else if (codepointSize == 4) { buffer[0] = 0b11110000 + ((codepoint >> 18) & 0b111); buffer[1] = 0b10000000 + ((codepoint >> 12) & 0b111111); buffer[2] = 0b10000000 + ((codepoint >> 6) & 0b111111); buffer[3] = 0b10000000 + (codepoint & 0b111111); } return buffer; } size_t nn_unicode_charWidth(unsigned int codepoint); size_t nn_unicode_wlen(const char *s); // NOT IMPLEMENTED YET void nn_unicode_upper(char *s); void nn_unicode_lower(char *s);