Skip to content

Commit

Permalink
Fix STRLEN and STRSUB on incomplete UTF-8
Browse files: browse the repository at this point in the history
  • Loading branch information
Rangi42 committed Jan 28, 2025
1 parent 44caffe commit e32f37d
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 27 deletions.
24 changes: 13 additions & 11 deletions src/asm/parser.y
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@

static uint32_t strToNum(std::vector<int32_t> const &s);
static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName);
static size_t strlenUTF8(std::string const &str);
static size_t strlenUTF8(std::string const &str, char const *functionName);
static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len);
static size_t charlenUTF8(std::string const &str);
static std::string charsubUTF8(std::string const &str, uint32_t pos);
Expand Down Expand Up @@ -1517,7 +1517,7 @@ relocexpr_no_str:
$$.makeNumber(pos != std::string::npos ? pos + 1 : 0);
}
| OP_STRLEN LPAREN string RPAREN {
$$.makeNumber(strlenUTF8($3));
$$.makeNumber(strlenUTF8($3, "STRLEN"));
}
| OP_CHARLEN LPAREN string RPAREN {
$$.makeNumber(charlenUTF8($3));
Expand Down Expand Up @@ -1569,13 +1569,13 @@ string:
$$ = std::move($1);
}
| OP_STRSUB LPAREN string COMMA iconst COMMA uconst RPAREN {
size_t len = strlenUTF8($3);
size_t len = strlenUTF8($3, "STRSUB");
uint32_t pos = adjustNegativePos($5, len, "STRSUB");

$$ = strsubUTF8($3, pos, $7);
}
| OP_STRSUB LPAREN string COMMA iconst RPAREN {
size_t len = strlenUTF8($3);
size_t len = strlenUTF8($3, "STRSUB");
uint32_t pos = adjustNegativePos($5, len, "STRSUB");

$$ = strsubUTF8($3, pos, pos > len ? 0 : len + 1 - pos);
Expand Down Expand Up @@ -2522,7 +2522,7 @@ static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName) {
error("%s: Invalid UTF-8 byte 0x%02hhX\n", functionName, byte);
}

static size_t strlenUTF8(std::string const &str) {
static size_t strlenUTF8(std::string const &str, char const *functionName) {
char const *ptr = str.c_str();
size_t len = 0;
uint32_t state = 0;
Expand All @@ -2532,7 +2532,7 @@ static size_t strlenUTF8(std::string const &str) {

switch (decode(&state, &codepoint, byte)) {
case 1:
errorInvalidUTF8Byte(byte, "STRLEN");
errorInvalidUTF8Byte(byte, functionName);
state = 0;
// fallthrough
case 0:
Expand All @@ -2543,7 +2543,8 @@ static size_t strlenUTF8(std::string const &str) {

// Check for partial code point.
if (state != 0) {
error("STRLEN: Incomplete UTF-8 character\n");
error("%s: Incomplete UTF-8 character\n", functionName);
len++;
}

return len;
Expand Down Expand Up @@ -2595,13 +2596,14 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len
index++;
}

if (curLen < len) {
warning(WARNING_BUILTIN_ARG, "STRSUB: Length too big: %" PRIu32 "\n", len);
}

// Check for partial code point.
if (state != 0) {
error("STRSUB: Incomplete UTF-8 character\n");
curLen++;
}

if (curLen < len) {
warning(WARNING_BUILTIN_ARG, "STRSUB: Length too big: %" PRIu32 "\n", len);
}

return std::string(ptr + startIndex, ptr + index);
Expand Down
20 changes: 17 additions & 3 deletions test/asm/invalid-utf-8-strings.asm
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ DEF invalid EQUS "aäb漢,a
DEF n = STRLEN("{invalid}")
DEF copy EQUS STRSUB("{invalid}", 1)

println "\"{invalid}\" == \"{copy}\" ({d:n})"
println "\"{#s:invalid}\" == \"{#s:copy}\" ({d:n})"

DEF mid1 EQUS STRSUB("{invalid}", 5, 2)
DEF mid2 EQUS STRSUB("{invalid}", 9, 1)
println "\"{mid2}{mid1}\""
println "\"{#s:mid2}{#s:mid1}\""

; characters:
; 1: U+0041 A
Expand All @@ -38,4 +38,18 @@ println "\"{#s:invalid}\": {d:n} == {d:r}"

REDEF mid1 EQUS CHARSUB("{invalid}", 4)
REDEF mid2 EQUS CHARSUB("{invalid}", 7)
println "\"{mid2}{mid1}\""
println "\"{#s:mid2}{#s:mid1}\""

; characters:
; 1: U+0061 a
; 2: U+0062 b
; 3: U+0063 c
; 4: incomplete U+6F22 kanji (0xE6 0xBC without 0xA2)
REDEF invalid EQUS "abcæ¼"

DEF n = STRLEN("{invalid}")
DEF r = CHARLEN("{invalid}")
println "\"{#s:invalid}\": {d:n} == {d:r}"

DEF final EQUS STRSUB("{invalid}", 4, 1)
println "\"{#s:invalid}\" ends \"{#s:final}\""
32 changes: 19 additions & 13 deletions test/asm/invalid-utf-8-strings.err
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ error: invalid-utf-8-strings.asm(16):
error: invalid-utf-8-strings.asm(16):
STRLEN: Invalid UTF-8 byte 0xA2
error: invalid-utf-8-strings.asm(17):
STRLEN: Invalid UTF-8 byte 0xA3
STRSUB: Invalid UTF-8 byte 0xA3
error: invalid-utf-8-strings.asm(17):
STRLEN: Invalid UTF-8 byte 0xA4
STRSUB: Invalid UTF-8 byte 0xA4
error: invalid-utf-8-strings.asm(17):
STRLEN: Invalid UTF-8 byte 0xF0
STRSUB: Invalid UTF-8 byte 0xF0
error: invalid-utf-8-strings.asm(17):
STRLEN: Invalid UTF-8 byte 0xA2
STRSUB: Invalid UTF-8 byte 0xA2
error: invalid-utf-8-strings.asm(17):
STRSUB: Invalid UTF-8 byte 0xA3
error: invalid-utf-8-strings.asm(17):
Expand All @@ -23,21 +23,21 @@ error: invalid-utf-8-strings.asm(17):
error: invalid-utf-8-strings.asm(17):
STRSUB: Invalid UTF-8 byte 0xA2
error: invalid-utf-8-strings.asm(21):
STRLEN: Invalid UTF-8 byte 0xA3
STRSUB: Invalid UTF-8 byte 0xA3
error: invalid-utf-8-strings.asm(21):
STRLEN: Invalid UTF-8 byte 0xA4
STRSUB: Invalid UTF-8 byte 0xA4
error: invalid-utf-8-strings.asm(21):
STRLEN: Invalid UTF-8 byte 0xF0
STRSUB: Invalid UTF-8 byte 0xF0
error: invalid-utf-8-strings.asm(21):
STRLEN: Invalid UTF-8 byte 0xA2
STRSUB: Invalid UTF-8 byte 0xA2
error: invalid-utf-8-strings.asm(22):
STRLEN: Invalid UTF-8 byte 0xA3
STRSUB: Invalid UTF-8 byte 0xA3
error: invalid-utf-8-strings.asm(22):
STRLEN: Invalid UTF-8 byte 0xA4
STRSUB: Invalid UTF-8 byte 0xA4
error: invalid-utf-8-strings.asm(22):
STRLEN: Invalid UTF-8 byte 0xF0
STRSUB: Invalid UTF-8 byte 0xF0
error: invalid-utf-8-strings.asm(22):
STRLEN: Invalid UTF-8 byte 0xA2
STRSUB: Invalid UTF-8 byte 0xA2
error: invalid-utf-8-strings.asm(22):
STRSUB: Invalid UTF-8 byte 0xA3
error: invalid-utf-8-strings.asm(22):
Expand Down Expand Up @@ -76,4 +76,10 @@ error: invalid-utf-8-strings.asm(40):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(40):
Input string is not valid UTF-8
error: Assembly aborted (39 errors)!
error: invalid-utf-8-strings.asm(50):
STRLEN: Incomplete UTF-8 character
error: invalid-utf-8-strings.asm(54):
STRSUB: Incomplete UTF-8 character
error: invalid-utf-8-strings.asm(54):
STRSUB: Incomplete UTF-8 character
error: Assembly aborted (42 errors)!
2 changes: 2 additions & 0 deletions test/asm/invalid-utf-8-strings.out
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@
"b,a"
"A ��� 漢": 7 == 7
"漢�"
"abc�": 4 == 4
"abc�" ends "�"

0 comments on commit e32f37d

Please sign in to comment.