commit faeaa564686873e4720a0c1ef9879f58347d754e
parent d515a3d96e1301b7d9ba0d38a00038894ebefcd4
Author: Laslo Hunhold <dev@frign.de>
Date: Sat, 18 Dec 2021 01:04:37 +0100
Improve a small edge-case in lg_utf8_decode()
Okay, this case is really crazy but possible: Before this change,
when we encountered e.g. a 0xF0 (which indicates a 4-byte-UTF-8
sequence and implies 3 subsequent continuation bytes) but had a
string-length of e.g. 2, we would automatically return 4 (> 2), no
matter what the following bytes looked like, to indicate that we need
a larger buffer.
However, it's actually necessary to check the subsequent bytes until
the buffer-end as we might have a case like
0xF0 0x80 0x00
where 0xF0 is followed by a single continuation byte but then the
continuation stops and we have a NUL-byte. It makes more sense to
return 2 in such a situation because we obtain more information about
the string by inspecting the continuation bytes instead of throwing
our hands up so early.
Also add this to the test-cases of the decoder to prevent any
regressions.
Signed-off-by: Laslo Hunhold <dev@frign.de>
Diffstat:
2 files changed, 62 insertions(+), 3 deletions(-)
diff --git a/src/utf8.c b/src/utf8.c
@@ -84,11 +84,29 @@ lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp)
}
if (1 + off > n) {
/*
- * input is not long enough, set cp as invalid and
- * return number of bytes needed
+ * input is not long enough, set cp as invalid
*/
*cp = LG_INVALID_CODE_POINT;
- return 1 + off;
+
+ /*
+ * count the following continuation bytes, but nothing
+ * else in case we have a "rogue" case where e.g. such a
+ * sequence starter occurs right before a NUL-byte.
+ */
+ for (i = 0; 1 + i < n; i++) {
+ if(!BETWEEN(((const unsigned char *)s)[1 + i],
+ 0x80, 0xBF)) {
+ break;
+ }
+ }
+
+ /*
+ * if the continuation bytes do not continue until
+ * the end, return the incomplete sequence length.
+ * Otherwise return the number of bytes we actually
+ * expected, which is larger than n.
+ */
+ return ((1 + i) < n) ? (1 + i) : (1 + off);
}
/*
diff --git a/test/utf8-decode.c b/test/utf8-decode.c
@@ -114,6 +114,16 @@ static const struct {
.exp_cp = LG_INVALID_CODE_POINT,
},
{
+ /* invalid 3-byte sequence (short string, second byte malformed)
+ * [ 11100000 01111111 ] ->
+ * INVALID
+ */
+ .arr = (char *)(unsigned char[]){ 0xE0, 0x7F },
+ .len = 2,
+ .exp_len = 1,
+ .exp_cp = LG_INVALID_CODE_POINT,
+ },
+ {
/* invalid 3-byte sequence (third byte missing)
* [ 11100000 10111111 ] ->
* INVALID
@@ -184,6 +194,27 @@ static const struct {
.exp_cp = LG_INVALID_CODE_POINT,
},
{
+ /* invalid 4-byte sequence (short string 1, second byte malformed)
+ * [ 11110011 01111111 ] ->
+ * INVALID
+ */
+ .arr = (char *)(unsigned char[]){ 0xF3, 0x7F },
+ .len = 2,
+ .exp_len = 1,
+ .exp_cp = LG_INVALID_CODE_POINT,
+ },
+ {
+ /* invalid 4-byte sequence (short string 2, second byte malformed)
+ * [ 11110011 01111111 10111111 ] ->
+ * INVALID
+ */
+ .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF },
+ .len = 3,
+ .exp_len = 1,
+ .exp_cp = LG_INVALID_CODE_POINT,
+ },
+
+ {
/* invalid 4-byte sequence (third byte missing)
* [ 11110011 10111111 ] ->
* INVALID
@@ -204,6 +235,16 @@ static const struct {
.exp_cp = LG_INVALID_CODE_POINT,
},
{
+ /* invalid 4-byte sequence (short string, third byte malformed)
+ * [ 11110011 10111111 01111111 ] ->
+ * INVALID
+ */
+ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F },
+ .len = 3,
+ .exp_len = 2,
+ .exp_cp = LG_INVALID_CODE_POINT,
+ },
+ {
/* invalid 4-byte sequence (fourth byte missing)
* [ 11110011 10111111 10111111 ] ->
* INVALID