Refactor UTF-8 decoder - libgrapheme - unicode string library

commit 7034bddcc4bc2262cec8a47d47272c1a2cd9bf9d
parent 2ee29acf9aac51c7b2b52d3fa955de411f619274
Author: Laslo Hunhold <dev@frign.de>
Date:   Thu, 28 May 2020 12:42:08 +0200

Refactor UTF-8 decoder

The old decoder was more or less written during a few hours of waiting
time together with Roberto at Budapest airport after slcon 2 in 2015.

Mattias rightfully pointed out some issues with it, namely

  - it contains unnecessary null byte handling
  - overlong encoding detection is inefficient
  - surrogate code points are not rejected
  - code points beyong U+10FFFF are not rejected

I personally had some notes on the decoder and what I didn't like,
especially its semantics. The problem with processing data is that
you can end up in a situation where you need more than is provided.
It is difficult to express these conditions with error codes, especially
when you also return a codepoint.

Another issue was error handling, that was inadequate in some cases.

Inspired by strlcat from OpenBSD, which returns a length that can be
larger than the total array capacity to indicate a truncation, I changed
the semantics of the decode-function as follows:
When there's any issue, the codepoint is set to CP_INVALID (naturally).
The length return value gives more indication for the error, though
it's not exhaustive, which doesn't matter though.

When the given array is shorter than necessary, the returned length
shows how much data _is_ needed to properly process it. In any other
case, the number of "consumed" bytes is returned (in error cases
excluding the "unexpected" byte). So, comparable to strlcat, one can
use the decode()-function safely like

   char *arr = ...; size_t arrlen = ...;
   uint32_t cp;
   size_t len;

   if ((len = grapheme_cp_decode(&cp, arr, arrlen)) > arrlen) {
	/*
	 * array too short; read in more data or just surrender and
	 * continue with cp regardless
	 */
   }

   ...

The check for CP_INVALID isn't even necessary in most cases and
checking the returned length alone is sufficient.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M src/codepoint.c  | 150 +++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
M src/codepoint.h  | 2 +-
M src/grapheme.c  | 28 +++++++++++++++++++++-------

3 files changed, 116 insertions(+), 64 deletions(-)
diff --git a/src/codepoint.c b/src/codepoint.c
@@ -5,76 +5,114 @@
 #define BETWEEN(c, l, u) (c >= l && c <= u)
 #define LEN(x) (sizeof(x) / sizeof(*x))
 
+/* lookup-table for the types of sequence first bytes */
+static const struct {
+	uint8_t  lower; /* lower bound of sequence first byte */
+	uint8_t  upper; /* upper bound of sequence first byte */
+	uint32_t mincp; /* smallest non-overlong encoded codepoint */
+	/*
+	 * implicit: table-offset represents the number of following
+	 * bytes of the form 10xxxxxx (6 bits capacity each)
+	 */
+} lut[] = {
+	[0] = {
+		/* 0xxxxxxx */
+		.lower = 0x00, /* 00000000 */
+		.upper = 0x7F, /* 01111111 */
+		.mincp = 0,
+	},
+	[1] = {
+		/* 110xxxxx */
+		.lower = 0xC0, /* 11000000 */
+		.upper = 0xDF, /* 11011111 */
+		.mincp = 1 << 7, /* [0] has 7 bits capacity */
+	},
+	[2] = {
+		/* 1110xxxx */
+		.lower = 0xE0, /* 11100000 */
+		.upper = 0xEF, /* 11101111 */
+		.mincp = 1 << 11, /* [1] has 5+6=11 bits capacity */
+	},
+	[3] = {
+		/* 11110xxx */
+		.lower = 0xF0, /* 11110000 */
+		.upper = 0xF7, /* 11110111 */
+		.mincp = 1 << 16, /* [2] has 4+6+6=16 bits capacity */
+	},
+};
+
 size_t
-cp_decode(const uint8_t *str, Codepoint *p)
+grapheme_cp_decode(uint32_t *cp, const uint8_t *s, size_t n)
 {
-	size_t off, j, k, l;
-
-	struct {
-		uint8_t lower;
-		uint8_t upper;
-		uint8_t mask;
-		uint8_t bits;
-	} lookup[] = {
-		{ 0x00, 0x7F, 0xFF,  7 }, /* 00000000 - 01111111, 01111111 */
-		{ 0xC0, 0xDF, 0x1F, 11 }, /* 11000000 - 11011111, 00011111 */
-		{ 0xE0, 0xEF, 0x0F, 16 }, /* 11100000 - 11101111, 00001111 */
-		{ 0xF0, 0xF7, 0x07, 21 }, /* 11110000 - 11110111, 00000111 */
-		{ 0xF8, 0xFB, 0x03, 26 }, /* 11111000 - 11111011, 00000011 */
-		{ 0xFC, 0xFD, 0x01, 31 }, /* 11111100 - 11111101, 00000001 */
-	};
+	size_t off, i;
 
-	/* empty string */
-	if (str[0] == '\0') {
-		*p = 0;
-		return 0;
+	if (n == 0) {
+		/* a sequence must be at least 1 byte long */
+		*cp = CP_INVALID;
+		return 1;
 	}
 
-	/* find out in which ranges str[0] is */
-	for (off = 0; off < LEN(lookup); off++) {
-		if (BETWEEN(str[0], lookup[off].lower, lookup[off].upper)) {
-			*p = str[0] & lookup[off].mask;
+	/* identify sequence type with the first byte */
+	for (off = 0; off < LEN(lut); off++) {
+		if (BETWEEN(s[0], lut[off].lower, lut[off].upper)) {
+			/*
+			 * first byte is within the bounds; fill
+			 * p with the the first bits contained in
+			 * the first byte ("upper-lower" is the bitmask)
+			 */
+			*cp = s[0] & (lut[off].upper - lut[off].lower);
 			break;
 		}
 	}
-	if (off == 0) {
-		/* ASCII */
-		return 1;
-	} else if (off == LEN(lookup)) {
-		/* not in ranges */
-		*p = CP_INVALID;
+	if (off == LEN(lut)) {
+		/*
+		 * first byte does not match a sequence type;
+		 * set cp as invalid and return 1 byte processed
+		 */
+		*cp = CP_INVALID;
 		return 1;
 	}
+	if (1 + off > n) {
+		/*
+		 * input is not long enough, set cp as invalid and
+		 * return number of bytes needed
+		 */
+		*cp = CP_INVALID;
+		return 1 + off;
+	}
 
-	/* off denotes the number of upcoming expected code units */
-	for (j = 0; j < off; j++) {
-		if (str[j] == '\0') {
-			*p = CP_INVALID;
-			return j;
-		}
-		if ((str[1 + j] & 0xC0) != 0x80) {
-			*p = CP_INVALID;
-			return 1 + j;
+	/*
+	 * process 'off' following bytes, each of the form 10xxxxxx
+	 * (i.e. between 0x80 (10000000) and 0xBF (10111111))
+	 */
+	for (i = 1; i <= off; i++) {
+		if(!BETWEEN(s[i], 0x80, 0xBF)) {
+			/*
+			 * byte does not match format; return
+			 * number of bytes processed excluding the
+			 * unexpected character as recommended since
+			 * Unicode 6 (chapter 3)
+			 */
+			*cp = CP_INVALID;
+			return 1 + (i - 1);
 		}
-		*p <<= 6;
-		*p |= str[1 + j] & 0x3F; /* 00111111 */
+		/*
+		 * shift codepoint by 6 bits and add the 6 stored bits
+		 * in s[i] to it using the bitmask 0x3F (00111111)
+		 */
+		*cp = (*cp << 6) | (s[i] & 0x3F);
 	}
 
-	if (*p == 0) {
-		if (off != 0) {
-			/* overencoded NUL */
-			*p = CP_INVALID;
-		}
-	} else {
-		/* determine effective bytes */
-		for (k = 0; ((*p << k) & (1 << 31)) == 0; k++)
-			;
-		for (l = 0; l < off; l++) {
-			if ((32 - k) <= lookup[l].bits) {
-				*p = CP_INVALID;
-			}
-		}
+	if (*cp < lut[off].mincp || BETWEEN(*cp, 0xD800, 0xDFFF) ||
+	    *cp > 0x10FFFF) {
+		/*
+		 * codepoint is overlong encoded in the sequence, is a
+		 * high or low UTF-16 surrogate half (0xD800..0xDFFF) or
+		 * not representable in UTF-16 (>0x10FFFF) (RFC-3629
+		 * specifies the latter two conditions)
+		 */
+		*cp = CP_INVALID;
 	}
 
-	return 1 + j;
+	return 1 + off;
 }
diff --git a/src/codepoint.h b/src/codepoint.h
@@ -9,6 +9,6 @@ typedef uint32_t Codepoint;
 
 #define CP_INVALID 0xFFFD
 
-size_t cp_decode(const uint8_t *, Codepoint *);
+size_t grapheme_cp_decode(uint32_t *, const uint8_t *, size_t);
 
 #endif /* CODEPOINT_H */
diff --git a/src/grapheme.c b/src/grapheme.c
@@ -12,20 +12,34 @@ grapheme_len(const char *str)
 	size_t ret, len = 0;
 	int state = 0;
 
+	if (str == NULL) {
+		return 0;
+	}
+
+	/*
+	 * grapheme_cp_decode, when it encounters an unexpected byte,
+	 * does not count it to the error and instead assumes that the
+	 * unexpected byte is the beginning of a new sequence.
+	 * This way, when the string ends with a null byte, we never
+	 * miss it, even if the previous UTF-8 sequence terminates
+	 * unexpectedly, as it would either act as an unexpected byte,
+	 * saved for later, or as a null byte itself, that we can catch.
+	 * We pass 5 to the length, as we will never read beyond
+	 * the null byte for the reasons given above.
+	 */
+
 	/* get first code point */
-	if ((ret = cp_decode((const uint8_t *)str, &cp0)) == 0) {
+	len += grapheme_cp_decode(&cp0, (uint8_t *)str, 5);
+	if (cp0 == CP_INVALID) {
 		return len;
 	}
-	len += ret;
 
 	while (cp0 != 0) {
 		/* get next codepoint */
-		if ((ret = cp_decode((const uint8_t *)(str + len), &cp1)) == 0) {
-			break;
-		}
+		ret = grapheme_cp_decode(&cp1, (uint8_t *)(str + len), 5);
 
-		if (boundary(cp0, cp1, &state)) {
-			/* we have a breakpoint */
+		if (cp1 == CP_INVALID || boundary(cp0, cp1, &state)) {
+			/* we read an invalid cp or have a breakpoint */
 			break;
 		} else {
 			/* we don't have a breakpoint, continue */

	libgrapheme unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log \| Files \| Refs \| README \| LICENSE

M	src/codepoint.c	\|	150	+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
M	src/codepoint.h	\|	2	+-
M	src/grapheme.c	\|	28	+++++++++++++++++++++-------