diff --git a/src/asm/parser.y b/src/asm/parser.y index 215766e18..5d984408f 100644 --- a/src/asm/parser.y +++ b/src/asm/parser.y @@ -72,7 +72,7 @@ static uint32_t strToNum(std::vector const &s); static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName); - static size_t strlenUTF8(std::string const &str); + static size_t strlenUTF8(std::string const &str, char const *functionName); static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len); static size_t charlenUTF8(std::string const &str); static std::string charsubUTF8(std::string const &str, uint32_t pos); @@ -1517,7 +1517,7 @@ relocexpr_no_str: $$.makeNumber(pos != std::string::npos ? pos + 1 : 0); } | OP_STRLEN LPAREN string RPAREN { - $$.makeNumber(strlenUTF8($3)); + $$.makeNumber(strlenUTF8($3, "STRLEN")); } | OP_CHARLEN LPAREN string RPAREN { $$.makeNumber(charlenUTF8($3)); @@ -1569,13 +1569,13 @@ string: $$ = std::move($1); } | OP_STRSUB LPAREN string COMMA iconst COMMA uconst RPAREN { - size_t len = strlenUTF8($3); + size_t len = strlenUTF8($3, "STRSUB"); uint32_t pos = adjustNegativePos($5, len, "STRSUB"); $$ = strsubUTF8($3, pos, $7); } | OP_STRSUB LPAREN string COMMA iconst RPAREN { - size_t len = strlenUTF8($3); + size_t len = strlenUTF8($3, "STRSUB"); uint32_t pos = adjustNegativePos($5, len, "STRSUB"); $$ = strsubUTF8($3, pos, pos > len ? 0 : len + 1 - pos); @@ -2522,7 +2522,7 @@ static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName) { error("%s: Invalid UTF-8 byte 0x%02hhX\n", functionName, byte); } -static size_t strlenUTF8(std::string const &str) { +static size_t strlenUTF8(std::string const &str, char const *functionName) { char const *ptr = str.c_str(); size_t len = 0; uint32_t state = 0; @@ -2532,7 +2532,7 @@ static size_t strlenUTF8(std::string const &str) { switch (decode(&state, &codepoint, byte)) { case 1: - errorInvalidUTF8Byte(byte, "STRLEN"); + errorInvalidUTF8Byte(byte, functionName); state = 0; // fallthrough case 0: @@ -2543,7 +2543,8 @@ static size_t strlenUTF8(std::string const &str) { // Check for partial code point. if (state != 0) { - error("STRLEN: Incomplete UTF-8 character\n"); + error("%s: Incomplete UTF-8 character\n", functionName); + len++; } return len; @@ -2595,13 +2596,14 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len index++; } - if (curLen < len) { - warning(WARNING_BUILTIN_ARG, "STRSUB: Length too big: %" PRIu32 "\n", len); - } - // Check for partial code point. if (state != 0) { error("STRSUB: Incomplete UTF-8 character\n"); + curLen++; + } + + if (curLen < len) { + warning(WARNING_BUILTIN_ARG, "STRSUB: Length too big: %" PRIu32 "\n", len); } return std::string(ptr + startIndex, ptr + index); diff --git a/test/asm/invalid-utf-8-strings.asm b/test/asm/invalid-utf-8-strings.asm index f8cc392f6..db722c596 100644 --- a/test/asm/invalid-utf-8-strings.asm +++ b/test/asm/invalid-utf-8-strings.asm @@ -16,11 +16,11 @@ DEF invalid EQUS "aäb漢,a DEF n = STRLEN("{invalid}") DEF copy EQUS STRSUB("{invalid}", 1) -println "\"{invalid}\" == \"{copy}\" ({d:n})" +println "\"{#s:invalid}\" == \"{#s:copy}\" ({d:n})" DEF mid1 EQUS STRSUB("{invalid}", 5, 2) DEF mid2 EQUS STRSUB("{invalid}", 9, 1) -println "\"{mid2}{mid1}\"" +println "\"{#s:mid2}{#s:mid1}\"" ; characters: ; 1: U+0041 A @@ -38,4 +38,18 @@ println "\"{#s:invalid}\": {d:n} == {d:r}" REDEF mid1 EQUS CHARSUB("{invalid}", 4) REDEF mid2 EQUS CHARSUB("{invalid}", 7) -println "\"{mid2}{mid1}\"" +println "\"{#s:mid2}{#s:mid1}\"" + +; characters: +; 1: U+0061 a +; 2: U+0062 b +; 3: U+0063 c +; 4: incomplete U+6F22 kanji (0xE6 0xBC without 0xA2) +REDEF invalid EQUS "abc" + +DEF n = STRLEN("{invalid}") +DEF r = CHARLEN("{invalid}") +println "\"{#s:invalid}\": {d:n} == {d:r}" + +DEF final EQUS STRSUB("{invalid}", 4, 1) +println "\"{#s:invalid}\" ends \"{#s:final}\"" diff --git a/test/asm/invalid-utf-8-strings.err b/test/asm/invalid-utf-8-strings.err index 529c7a9ee..8ac3be6f4 100644 --- a/test/asm/invalid-utf-8-strings.err +++ b/test/asm/invalid-utf-8-strings.err @@ -7,13 +7,13 @@ error: invalid-utf-8-strings.asm(16): error: invalid-utf-8-strings.asm(16): STRLEN: Invalid UTF-8 byte 0xA2 error: invalid-utf-8-strings.asm(17): - STRLEN: Invalid UTF-8 byte 0xA3 + STRSUB: Invalid UTF-8 byte 0xA3 error: invalid-utf-8-strings.asm(17): - STRLEN: Invalid UTF-8 byte 0xA4 + STRSUB: Invalid UTF-8 byte 0xA4 error: invalid-utf-8-strings.asm(17): - STRLEN: Invalid UTF-8 byte 0xF0 + STRSUB: Invalid UTF-8 byte 0xF0 error: invalid-utf-8-strings.asm(17): - STRLEN: Invalid UTF-8 byte 0xA2 + STRSUB: Invalid UTF-8 byte 0xA2 error: invalid-utf-8-strings.asm(17): STRSUB: Invalid UTF-8 byte 0xA3 error: invalid-utf-8-strings.asm(17): @@ -23,21 +23,21 @@ error: invalid-utf-8-strings.asm(17): error: invalid-utf-8-strings.asm(17): STRSUB: Invalid UTF-8 byte 0xA2 error: invalid-utf-8-strings.asm(21): - STRLEN: Invalid UTF-8 byte 0xA3 + STRSUB: Invalid UTF-8 byte 0xA3 error: invalid-utf-8-strings.asm(21): - STRLEN: Invalid UTF-8 byte 0xA4 + STRSUB: Invalid UTF-8 byte 0xA4 error: invalid-utf-8-strings.asm(21): - STRLEN: Invalid UTF-8 byte 0xF0 + STRSUB: Invalid UTF-8 byte 0xF0 error: invalid-utf-8-strings.asm(21): - STRLEN: Invalid UTF-8 byte 0xA2 + STRSUB: Invalid UTF-8 byte 0xA2 error: invalid-utf-8-strings.asm(22): - STRLEN: Invalid UTF-8 byte 0xA3 + STRSUB: Invalid UTF-8 byte 0xA3 error: invalid-utf-8-strings.asm(22): - STRLEN: Invalid UTF-8 byte 0xA4 + STRSUB: Invalid UTF-8 byte 0xA4 error: invalid-utf-8-strings.asm(22): - STRLEN: Invalid UTF-8 byte 0xF0 + STRSUB: Invalid UTF-8 byte 0xF0 error: invalid-utf-8-strings.asm(22): - STRLEN: Invalid UTF-8 byte 0xA2 + STRSUB: Invalid UTF-8 byte 0xA2 error: invalid-utf-8-strings.asm(22): STRSUB: Invalid UTF-8 byte 0xA3 error: invalid-utf-8-strings.asm(22): @@ -76,4 +76,10 @@ error: invalid-utf-8-strings.asm(40): Input string is not valid UTF-8 error: invalid-utf-8-strings.asm(40): Input string is not valid UTF-8 -error: Assembly aborted (39 errors)! +error: invalid-utf-8-strings.asm(50): + STRLEN: Incomplete UTF-8 character +error: invalid-utf-8-strings.asm(54): + STRSUB: Incomplete UTF-8 character +error: invalid-utf-8-strings.asm(54): + STRSUB: Incomplete UTF-8 character +error: Assembly aborted (42 errors)! diff --git a/test/asm/invalid-utf-8-strings.out b/test/asm/invalid-utf-8-strings.out index 2d9147a9a..5602df61d 100644 --- a/test/asm/invalid-utf-8-strings.out +++ b/test/asm/invalid-utf-8-strings.out @@ -2,3 +2,5 @@ "b,a" "A 漢": 7 == 7 "漢" +"abc": 4 == 4 +"abc" ends ""