Improve a small edge-case in lg_utf8_decode() - libgrapheme

commit faeaa564686873e4720a0c1ef9879f58347d754e
parent d515a3d96e1301b7d9ba0d38a00038894ebefcd4
Author: Laslo Hunhold <dev@frign.de>
Date:   Sat, 18 Dec 2021 01:04:37 +0100

Improve a small edge-case in lg_utf8_decode()

Okay, this case is really crazy but possible: Before this change,
when we encountered e.g. a 0xF0 (which indicates a 4-byte-UTF-8
sequence and implies 3 subsequent continuation bytes) but had a
string-length of e.g. 2, we would automatically return 4 (> 2), no
matter what the following bytes looked like, to indicate that we need
a larger buffer.

However, it's actually necessary to check the subsequent bytes until
the buffer-end as we might have a case like

   0xF0 0x80 0x00

where 0xF0 is followed by a single continuation byte but then the
continuation stops and we have a NUL-byte. It makes more sense to
return 2 in such a situation because we obtain more information about
the string by inspecting the continuation bytes instead of throwing
our hands up so early.

Also add this to the test-cases of the decoder to prevent any
regressions.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M src/utf8.c  | 24 +++++++++++++++++++++---
M test/utf8-decode.c  | 41 +++++++++++++++++++++++++++++++++++++++++

2 files changed, 62 insertions(+), 3 deletions(-)
diff --git a/src/utf8.c b/src/utf8.c
@@ -84,11 +84,29 @@ lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp)
 	}
 	if (1 + off > n) {
 		/*
-		 * input is not long enough, set cp as invalid and
-		 * return number of bytes needed
+		 * input is not long enough, set cp as invalid
 		 */
 		*cp = LG_INVALID_CODE_POINT;
-		return 1 + off;
+
+		/*
+		 * count the following continuation bytes, but nothing
+		 * else in case we have a "rogue" case where e.g. such a
+		 * sequence starter occurs right before a NUL-byte.
+		 */
+		for (i = 0; 1 + i < n; i++) {
+			if(!BETWEEN(((const unsigned char *)s)[1 + i],
+			            0x80, 0xBF)) {
+				break;
+			}
+		}
+
+		/*
+		 * if the continuation bytes do not continue until
+		 * the end, return the incomplete sequence length.
+		 * Otherwise return the number of bytes we actually
+		 * expected, which is larger than n.
+		 */
+		return ((1 + i) < n) ? (1 + i) : (1 + off);
 	}
 
 	/*
diff --git a/test/utf8-decode.c b/test/utf8-decode.c
@@ -114,6 +114,16 @@ static const struct {
 		.exp_cp  = LG_INVALID_CODE_POINT,
 	},
 	{
+		/* invalid 3-byte sequence (short string, second byte malformed)
+		 * [ 11100000 01111111 ] ->
+		 * INVALID
+		 */
+		.arr     = (char *)(unsigned char[]){ 0xE0, 0x7F },
+		.len     = 2,
+		.exp_len = 1,
+		.exp_cp  = LG_INVALID_CODE_POINT,
+	},
+	{
 		/* invalid 3-byte sequence (third byte missing)
 		 * [ 11100000 10111111 ] ->
 		 * INVALID
@@ -184,6 +194,27 @@ static const struct {
 		.exp_cp  = LG_INVALID_CODE_POINT,
 	},
 	{
+		/* invalid 4-byte sequence (short string 1, second byte malformed)
+		 * [ 11110011 01111111 ] ->
+		 * INVALID
+		 */
+		.arr     = (char *)(unsigned char[]){ 0xF3, 0x7F },
+		.len     = 2,
+		.exp_len = 1,
+		.exp_cp  = LG_INVALID_CODE_POINT,
+	},
+	{
+		/* invalid 4-byte sequence (short string 2, second byte malformed)
+		 * [ 11110011 01111111 10111111 ] ->
+		 * INVALID
+		 */
+		.arr     = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF },
+		.len     = 3,
+		.exp_len = 1,
+		.exp_cp  = LG_INVALID_CODE_POINT,
+	},
+
+	{
 		/* invalid 4-byte sequence (third byte missing)
 		 * [ 11110011 10111111 ] ->
 		 * INVALID
@@ -204,6 +235,16 @@ static const struct {
 		.exp_cp  = LG_INVALID_CODE_POINT,
 	},
 	{
+		/* invalid 4-byte sequence (short string, third byte malformed)
+		 * [ 11110011 10111111 01111111 ] ->
+		 * INVALID
+		 */
+		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F },
+		.len     = 3,
+		.exp_len = 2,
+		.exp_cp  = LG_INVALID_CODE_POINT,
+	},
+	{
 		/* invalid 4-byte sequence (fourth byte missing)
 		 * [ 11110011 10111111 10111111 ] ->
 		 * INVALID

	libgrapheme unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log \| Files \| Refs \| README \| LICENSE

M	src/utf8.c	\|	24	+++++++++++++++++++++---
M	test/utf8-decode.c	\|	41	+++++++++++++++++++++++++++++++++++++++++