From e49291b7cfd8eabd8fc5a8b7043eff9032323078 Mon Sep 17 00:00:00 2001 From: Rangi42 Date: Tue, 28 Jan 2025 00:04:49 -0500 Subject: [PATCH] Refactor `readUTF8Char` into `charmap_ConvertNext` --- include/util.hpp | 7 ------- src/asm/charmap.cpp | 21 ++++++++++++++++++--- src/asm/parser.y | 10 +++++----- src/util.cpp | 23 ----------------------- 4 files changed, 23 insertions(+), 38 deletions(-) diff --git a/include/util.hpp b/include/util.hpp index a45de124f..6cf9e0784 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -3,13 +3,6 @@ #ifndef RGBDS_UTIL_HPP #define RGBDS_UTIL_HPP -#include -#include -#include - char const *printChar(int c); -// @return The number of bytes read, or 0 if invalid data was found -size_t readUTF8Char(std::vector *dest, char const *src); - #endif // RGBDS_UTIL_HPP diff --git a/src/asm/charmap.cpp b/src/asm/charmap.cpp index b5875ee1d..b7eadbca7 100644 --- a/src/asm/charmap.cpp +++ b/src/asm/charmap.cpp @@ -11,6 +11,7 @@ #include #include +#include "extern/utf8decoder.hpp" #include "helpers.hpp" #include "util.hpp" @@ -224,16 +225,30 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector *output matchLen = value.size(); } else if (inputIdx < input.length()) { // No match found, but there is some input left - int firstChar = input[inputIdx]; + size_t codepointLen = 0; // This will write the codepoint's value to `output`, little-endian - size_t codepointLen = readUTF8Char(output, input.data() + inputIdx); + for (uint32_t state = 0, codepoint = 0;;) { + if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == 1) { + codepointLen = 0; + break; + } + + if (output) { + output->push_back(input[inputIdx + codepointLen]); + } + codepointLen++; + + if (state == 0) { + break; + } + } if (codepointLen == 0) { error("Input string is not valid UTF-8\n"); } // Warn if this character is not mapped but any others are - if (charmap.nodes.size() > 1) { + if (int firstChar = input[inputIdx]; charmap.nodes.size() > 1) { warning(WARNING_UNMAPPED_CHAR_1, "Unmapped character %s\n", printChar(firstChar)); } else if (charmap.name != DEFAULT_CHARMAP_NAME) { warning( diff --git a/src/asm/parser.y b/src/asm/parser.y index 4150ed41c..215766e18 100644 --- a/src/asm/parser.y +++ b/src/asm/parser.y @@ -2527,10 +2527,10 @@ static size_t strlenUTF8(std::string const &str) { size_t len = 0; uint32_t state = 0; - for (uint32_t codep = 0; *ptr; ptr++) { + for (uint32_t codepoint = 0; *ptr; ptr++) { uint8_t byte = *ptr; - switch (decode(&state, &codep, byte)) { + switch (decode(&state, &codepoint, byte)) { case 1: errorInvalidUTF8Byte(byte, "STRLEN"); state = 0; @@ -2553,12 +2553,12 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len char const *ptr = str.c_str(); size_t index = 0; uint32_t state = 0; - uint32_t codep = 0; + uint32_t codepoint = 0; uint32_t curPos = 1; // RGBASM strings are 1-indexed! // Advance to starting position in source string. while (ptr[index] && curPos < pos) { - switch (decode(&state, &codep, ptr[index])) { + switch (decode(&state, &codepoint, ptr[index])) { case 1: errorInvalidUTF8Byte(ptr[index], "STRSUB"); state = 0; @@ -2583,7 +2583,7 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len // Compute the result length in bytes. while (ptr[index] && curLen < len) { - switch (decode(&state, &codep, ptr[index])) { + switch (decode(&state, &codepoint, ptr[index])) { case 1: errorInvalidUTF8Byte(ptr[index], "STRSUB"); state = 0; diff --git a/src/util.cpp b/src/util.cpp index 67f776fc9..d56762f98 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -5,9 +5,6 @@ #include #include #include -#include - -#include "extern/utf8decoder.hpp" char const *printChar(int c) { // "'A'" + '\0': 4 bytes @@ -53,23 +50,3 @@ char const *printChar(int c) { buf[4] = '\0'; return buf; } - -size_t readUTF8Char(std::vector *dest, char const *src) { - uint32_t state = 0, codepoint; - size_t i = 0; - - for (;;) { - if (decode(&state, &codepoint, src[i]) == 1) { - return 0; - } - - if (dest) { - dest->push_back(src[i]); - } - i++; - - if (state == 0) { - return i; - } - } -}