From 5b99c73a1a479c3dfc40ee1ca800371c38ccdb62 Mon Sep 17 00:00:00 2001 From: Blendi Date: Wed, 22 Apr 2026 17:29:48 +0200 Subject: [PATCH] lexer: strings --- src/lexer.c | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/lexer.h | 10 ++- src/main.c | 4 +- 3 files changed, 200 insertions(+), 3 deletions(-) diff --git a/src/lexer.c b/src/lexer.c index 7105218..df29bd0 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1,5 +1,6 @@ #include "types.h" #include "helper.h" +#include #include "lexer.h" int noomL_isalpha(char c) { @@ -259,6 +260,179 @@ noom_uint_t noomL_getcomment(const char* str, noomL_ErrorType* error) { return 0; } +noom_uint_t noomL_getstring(const char* s, noomL_ErrorType* error, noom_LuaVersion version) { + noom_uint_t len = 0; + if (s[len] == '"' || s[len] == '\'') { + char starter = s[len]; // either `'` or `"` + + len++; // double quoted string + + while (1) { + if (s[len] == starter) { + // it's over + len++; + return len; // found a whole string! + } else if (s[len] == '\\') { + len++; // oh boy! + if (s[len] == '\\') { + len++; + } else if (s[len] == 'a') { + len++; + } else if (s[len] == 'b') { + len++; + } else if (s[len] == 'f') { + len++; + } else if (s[len] == 'n') { + len++; + } else if (s[len] == 'r') { + len++; + } else if (s[len] == 't') { + len++; + } else if (s[len] == 'v') { + len++; + + // both string using single or double quote can have either escaped inside + } else if (s[len] == '"') { + len++; + } else if (s[len] == '\'') { + len++; + } else if (s[len] == '\n') { + len++; + } else if (s[len] == '\r' && s[len+1] == '\n') { // fuck windows :fire: + len += 2; + + } else if (noomL_isnumber(s[len])) { + // fuckkkk + noom_uint_t count = 0; + for (noom_uint_t i = 0; i < 3; i++) { + if (noomL_isnumber(s[len + i])) count++; else break; + } + if (count == 3) { // could be too big + if ((s[len] > '2') || (s[len] == '2' && (s[len+1] > '5' || (s[len+1] == '5' && s[len+2] > '5')))) { + // >255, i could also make it a number first but meh + *error = NOOML_ERROR_DECIMAL_ESCAPE_TOO_BIG; + return 0; + } + } + // count can't be 0 because this if wouldn't run. + len += count; + + } else if (s[len] == 'x' && version >= NOOM_VERSION_52) { + len++; + + if ((!noomL_ishex(s[len])) || (!noomL_ishex(s[len+1]))) { + *error = NOOML_ERROR_HEX_ESCAPE_INVALID; + return 0; + } + len += 2; + + } else if (s[len] == 'z' && version >= NOOM_VERSION_52) { + len++; + while (noomL_iswhitespace(s[len])) len++; + + } else if (s[len] == 'u' && version >= NOOM_VERSION_53) { + len++; + if (s[len] != '{') { + *error = NOOML_ERROR_UNICODE_ESCAPE_UNOPENED; + return 0; + } + len++; + + while (s[len] == '0') len++; // remove leading zeroes. + + noom_uint_t hexlen = 0; + while (noomL_ishex(s[len + hexlen])) hexlen++; + + // 5.3 allows <= 10ffff + // whereas 5.4 allows <= 7fffffff + // fuck my life + if (version == NOOM_VERSION_53) { + if (hexlen == 6) { + if (s[len] > '1' || (s[len] == '1' && s[len+1] > '0')) { + *error = NOOML_ERROR_UNICODE_ESCAPE_TOO_BIG; + return 0; + } + } else if (hexlen > 6) { + *error = NOOML_ERROR_UNICODE_ESCAPE_TOO_BIG; + return 0; + } + } else if (version >= NOOM_VERSION_54) { + if (hexlen == 8) { + if (s[len] > '7') { + *error = NOOML_ERROR_UNICODE_ESCAPE_TOO_BIG; + return 0; + } + } else if (hexlen > 8) { + *error = NOOML_ERROR_UNICODE_ESCAPE_TOO_BIG; + return 0; + } + } + len += hexlen; + + if (s[len] != '}') { + *error = NOOML_ERROR_UNICODE_ESCAPE_UNCLOSED; + return 0; + } + len++; + + } else if (s[len] == '\0') { + // no. just leave it for the string to find afterward. + } else { + len++; // allow any random escape + } + } else if (s[len] == '\0') { + *error = NOOML_ERROR_UNFINISHED_STRING; + return 0; + } else if (s[len] == '\n') { + // unfinished because you can't have a newline in it + *error = NOOML_ERROR_UNFINISHED_STRING; + return 0; + + } else { + len++; // anything else is just a thing in the string. + } + } + } else if (s[len] == '[') { // potential multi-line string + len++; + + noom_uint_t order = 0; + int succ = 0; + + while (s[len] == '=') { order++; len++; } + + if (s[len] == '[') { len++; succ = 1; } + + if (succ) { // it is a multi-line string. + while (1) { + if (s[len] == ']') { // potential ender + len++; + noom_uint_t order2 = 0; + noom_uint_t startp = len; // intentionally after the `]` + + while (s[len] == '=') { order2++; len++; } + + if (s[len] == ']' && order == order2) { // holy shit it's real + len++; + + return len; + } else { + // nope. + len = startp; // go back just in case like ]=]==] + } + + } else if (s[len] == '\0') { + *error = NOOML_ERROR_UNFINISHED_LONG_STRING; + return 0; + } else { + len++; + } + } + } + } + + return 0; +} + int noomL_iskeyword(const char* s, noom_uint_t len, noom_LuaVersion version) { if (noom_streql(s, len, "true", 4)) return 1; if (noom_streql(s, len, "false", 5)) return 1; @@ -379,6 +553,21 @@ noomL_ErrorType noomL_lex(const char* s, noom_uint_t start, noomL_Token* token, if (err != NOOML_ERROR_NONE) return err; } } + + { + noomL_ErrorType err = NOOML_ERROR_NONE; + noom_uint_t stringLen = noomL_getstring(str, &err, version); + + if (stringLen) { + token->type = NOOML_TOKEN_STRING; + token->offset = start; + token->length = stringLen; + + return NOOML_ERROR_NONE; + } else { + if (err != NOOML_ERROR_NONE) return err; + } + } { noom_uint_t symbolLen = noomL_getsymbol(str, version); diff --git a/src/lexer.h b/src/lexer.h index a22e89a..4c1f359 100644 --- a/src/lexer.h +++ b/src/lexer.h @@ -19,7 +19,15 @@ typedef enum noomL_ErrorType { NOOML_ERROR_UNKNOWN, NOOML_ERROR_MALFORMED_NUM, - NOOML_ERROR_UNFINISHED_COMMENT + NOOML_ERROR_UNFINISHED_COMMENT, + NOOML_ERROR_UNFINISHED_STRING, + NOOML_ERROR_UNFINISHED_LONG_STRING, + + NOOML_ERROR_DECIMAL_ESCAPE_TOO_BIG, + NOOML_ERROR_HEX_ESCAPE_INVALID, + NOOML_ERROR_UNICODE_ESCAPE_UNOPENED, + NOOML_ERROR_UNICODE_ESCAPE_UNCLOSED, + NOOML_ERROR_UNICODE_ESCAPE_TOO_BIG, } noomL_ErrorType; typedef struct noomL_Token { diff --git a/src/main.c b/src/main.c index ea8af11..1171d27 100644 --- a/src/main.c +++ b/src/main.c @@ -33,14 +33,14 @@ void print_node(noomP_Node* node, noom_uint_t depth) { int main(int argc, char** argv) { // uhh uhhh uhhhhh - const char* code = "--[=[i\nam\na\nlong\ncomment]]lololnotoveryet]==]nah lol]=] --local a = 2\nlocal b = 3"; + const char* code = "local a = [=[Hello, world!]]lol]==]]]=]"; noom_uint_t pos = 0; printf("LEX OUTPUT:\n"); noomL_Token token; while (1) { - noomL_lex(code, pos, &token, NOOM_VERSION_54); + noomL_lex(code, pos, &token, NOOM_VERSION_53); printf("%s ", noomL_formatTokenType(token.type)); for (noom_uint_t i = 0; i < token.length; i++) putchar((code + token.offset)[i]);