libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

commit faeaa564686873e4720a0c1ef9879f58347d754e
parent d515a3d96e1301b7d9ba0d38a00038894ebefcd4
Author: Laslo Hunhold <dev@frign.de>
Date:   Sat, 18 Dec 2021 01:04:37 +0100

Improve a small edge-case in lg_utf8_decode()

Okay, this case is really crazy but possible: Before this change,
when we encountered e.g. a 0xF0 (which indicates a 4-byte UTF-8
sequence and implies 3 subsequent continuation bytes) but had a
string-length of e.g. 2, we would automatically return 4 (> 2), no
matter what the following bytes looked like, to indicate that we need
a larger buffer.

However, it's actually necessary to check the subsequent bytes until
the buffer-end as we might have a case like

   0xF0 0x80 0x00

where, given a string-length of 3, 0xF0 is followed by a single
continuation byte but then the continuation stops and we have a
NUL-byte. It makes more sense to return 2 in such a situation because
we obtain more information about the string by inspecting the
continuation bytes instead of throwing our hands up so early.

Also add this to the test-cases of the decoder to prevent any
regressions.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
Msrc/utf8.c | 24+++++++++++++++++++++---
Mtest/utf8-decode.c | 41+++++++++++++++++++++++++++++++++++++++++
2 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/src/utf8.c b/src/utf8.c @@ -84,11 +84,29 @@ lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp) } if (1 + off > n) { /* - * input is not long enough, set cp as invalid and - * return number of bytes needed + * input is not long enough, set cp as invalid */ *cp = LG_INVALID_CODE_POINT; - return 1 + off; + + /* + * count the following continuation bytes, but nothing + * else in case we have a "rogue" case where e.g. such a + * sequence starter occurs right before a NUL-byte. + */ + for (i = 0; 1 + i < n; i++) { + if(!BETWEEN(((const unsigned char *)s)[1 + i], + 0x80, 0xBF)) { + break; + } + } + + /* + * if the continuation bytes do not continue until + * the end, return the incomplete sequence length. + * Otherwise return the number of bytes we actually + * expected, which is larger than n. + */ + return ((1 + i) < n) ? (1 + i) : (1 + off); } /* diff --git a/test/utf8-decode.c b/test/utf8-decode.c @@ -114,6 +114,16 @@ static const struct { .exp_cp = LG_INVALID_CODE_POINT, }, { + /* invalid 3-byte sequence (short string, second byte malformed) + * [ 11100000 01111111 ] -> + * INVALID + */ + .arr = (char *)(unsigned char[]){ 0xE0, 0x7F }, + .len = 2, + .exp_len = 1, + .exp_cp = LG_INVALID_CODE_POINT, + }, + { /* invalid 3-byte sequence (third byte missing) * [ 11100000 10111111 ] -> * INVALID @@ -184,6 +194,27 @@ static const struct { .exp_cp = LG_INVALID_CODE_POINT, }, { + /* invalid 4-byte sequence (short string 1, second byte malformed) + * [ 11110011 011111111 ] -> + * INVALID + */ + .arr = (char *)(unsigned char[]){ 0xF3, 0x7F }, + .len = 2, + .exp_len = 1, + .exp_cp = LG_INVALID_CODE_POINT, + }, + { + /* invalid 4-byte sequence (short string 2, second byte malformed) + * [ 11110011 011111111 10111111 ] -> + * INVALID + */ + .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF }, + .len = 3, + .exp_len = 1, + .exp_cp = LG_INVALID_CODE_POINT, + }, + + { /* invalid 4-byte sequence (third byte missing) * [ 11110011 10111111 ] -> * INVALID @@ -204,6 
+235,16 @@ static const struct { .exp_cp = LG_INVALID_CODE_POINT, }, { + /* invalid 4-byte sequence (short string, third byte malformed) + * [ 11110011 10111111 01111111 ] -> + * INVALID + */ + .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F }, + .len = 3, + .exp_len = 2, + .exp_cp = LG_INVALID_CODE_POINT, + }, + { /* invalid 4-byte sequence (fourth byte missing) * [ 11110011 10111111 10111111 ] -> * INVALID