Encourage strict aliasing for library users (uint8_t * -> char *) - libgrapheme

commit b99a40eefc2ec1ad8714ed210a3aeedfb3283159
parent 20c105bcdd1c54401d4d23cdb9ded56ee7a2ffd4
Author: Laslo Hunhold <dev@frign.de>
Date:   Fri, 17 Dec 2021 00:34:27 +0100

Encourage strict aliasing for library users (uint8_t * -> char *)

In a long-winded discussion, Michael Forney, who has a really deep
understanding of the C specification, rightfully pointed out that
using uint8_t * might look good on paper, but leads to subtle
problems due to intricacies of the C99 specification.

While you can alias any pointer to character types (char, unsigned char,
signed char), uint8_t is not a character type and aliasing to it breaks
the strict aliasing rule. This is not a problem in practice as gcc
is the only big compiler enforcing strict aliasing and uint8_t is
usually defined as unsigned char, inheriting the aliasing property for
technical reasons, but strictly speaking uint8_t is not a character
type.

With uint8_t * in the API, library users would've been forced to cast
any input-string to uint8_t *, breaking the strict aliasing rule. A
lot of code relies on this or conveniently disables strict aliasing
through compiler flags, but using char-arrays is the only really
portable and safe way to work with it.
That char is usually 8 bits wide and strongly indicates that we're
dealing with a string is one strong point for using char *; another is
that C11 introduced UTF-8 string literals of the form u8"...", which
are of type char[]. In this sense, using char * ensures some form of
forward compatibility and fits nicely within the spec, which is slowly
converging towards UTF-8.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M grapheme.h  | 6 +++---
M man/lg_grapheme_nextbreak.3  | 4 ++--
M src/grapheme.c  | 2 +-
M src/utf8.c  | 25 +++++++++++++++++--------
M test/utf8-decode.c  | 46 +++++++++++++++++++++++-----------------------
M test/utf8-encode.c  | 16 ++++++++--------

6 files changed, 54 insertions(+), 45 deletions(-)
diff --git a/grapheme.h b/grapheme.h
@@ -19,11 +19,11 @@ typedef struct lg_internal_segmentation_state {
 
 #define LG_CODEPOINT_INVALID UINT32_C(0xFFFD)
 
-size_t lg_grapheme_nextbreak(const uint8_t *);
+size_t lg_grapheme_nextbreak(const char *);
 
 bool lg_grapheme_isbreak(uint_least32_t, uint_least32_t, LG_SEGMENTATION_STATE *);
 
-size_t lg_utf8_decode(const uint8_t *, size_t, uint_least32_t *);
-size_t lg_utf8_encode(uint_least32_t, uint8_t *, size_t);
+size_t lg_utf8_decode(const char *, size_t, uint_least32_t *);
+size_t lg_utf8_encode(uint_least32_t, char *, size_t);
 
 #endif /* GRAPHEME_H */
diff --git a/man/lg_grapheme_nextbreak.3 b/man/lg_grapheme_nextbreak.3
@@ -7,7 +7,7 @@
 .Sh SYNOPSIS
 .In grapheme.h
 .Ft size_t
-.Fn lg_grapheme_nextbreak "const uint8_t *str"
+.Fn lg_grapheme_nextbreak "const char *str"
 .Sh DESCRIPTION
 .Fn lg_grapheme_nextbreak
 computes the offset (in bytes) to the next grapheme
@@ -52,7 +52,7 @@ main(void)
 
 	/* print each grapheme cluster with byte-length */
 	for (; *s != '\\0';) {
-		len = lg_grapheme_nextbreak((uint8_t *)s);
+		len = lg_grapheme_nextbreak(s);
 		printf("%2zu bytes | %.*s\\n", len, (int)len, s, len);
 		s += len;
 	}
diff --git a/src/grapheme.c b/src/grapheme.c
@@ -179,7 +179,7 @@ hasbreak:
 }
 
 size_t
-lg_grapheme_nextbreak(const uint8_t *str)
+lg_grapheme_nextbreak(const char *str)
 {
 	uint_least32_t cp0, cp1;
 	size_t ret, len = 0;
diff --git a/src/utf8.c b/src/utf8.c
@@ -48,7 +48,7 @@ static const struct {
 };
 
 size_t
-lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
+lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp)
 {
 	size_t off, i;
 
@@ -60,13 +60,14 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
 
 	/* identify sequence type with the first byte */
 	for (off = 0; off < LEN(lut); off++) {
-		if (BETWEEN(s[0], lut[off].lower, lut[off].upper)) {
+		if (BETWEEN(((unsigned char *)s)[0], lut[off].lower,
+		            lut[off].upper)) {
 			/*
 			 * first byte is within the bounds; fill
 			 * p with the the first bits contained in
 			 * the first byte (by subtracting the high bits)
 			 */
-			*cp = s[0] - lut[off].lower;
+			*cp = ((unsigned char *)s)[0] - lut[off].lower;
 			break;
 		}
 	}
@@ -74,6 +75,9 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
 		/*
 		 * first byte does not match a sequence type;
 		 * set cp as invalid and return 1 byte processed
+		 *
+		 * this also includes the cases where bits higher than
+		 * the 8th are set on systems with CHAR_BIT > 8
 		 */
 		*cp = LG_CODEPOINT_INVALID;
 		return 1;
@@ -92,12 +96,16 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
 	 * (i.e. between 0x80 (10000000) and 0xBF (10111111))
 	 */
 	for (i = 1; i <= off; i++) {
-		if(!BETWEEN(s[i], 0x80, 0xBF)) {
+		if(!BETWEEN(((unsigned char *)s)[i], 0x80, 0xBF)) {
 			/*
 			 * byte does not match format; return
 			 * number of bytes processed excluding the
 			 * unexpected character as recommended since
 			 * Unicode 6 (chapter 3)
+			 *
+			 * this also includes the cases where bits
+			 * higher than the 8th are set on systems
+			 * with CHAR_BIT > 8
 			 */
 			*cp = LG_CODEPOINT_INVALID;
 			return 1 + (i - 1);
@@ -106,7 +114,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
 		 * shift code point by 6 bits and add the 6 stored bits
 		 * in s[i] to it using the bitmask 0x3F (00111111)
 		 */
-		*cp = (*cp << 6) | (s[i] & 0x3F);
+		*cp = (*cp << 6) | (((unsigned char *)s)[i] & 0x3F);
 	}
 
 	if (*cp < lut[off].mincp ||
@@ -125,7 +133,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
 }
 
 size_t
-lg_utf8_encode(uint_least32_t cp, uint8_t *s, size_t n)
+lg_utf8_encode(uint_least32_t cp, char *s, size_t n)
 {
 	size_t off, i;
 
@@ -165,7 +173,7 @@ lg_utf8_encode(uint_least32_t cp, uint8_t *s, size_t n)
 	 * We do not overwrite the mask because we guaranteed earlier
 	 * that there are no bits higher than the mask allows.
 	 */
-	s[0] = lut[off].lower | (uint8_t)(cp >> (6 * off));
+	((unsigned char *)s)[0] = lut[off].lower | (uint8_t)(cp >> (6 * off));
 
 	for (i = 1; i <= off; i++) {
 		/*
@@ -174,7 +182,8 @@ lg_utf8_encode(uint_least32_t cp, uint8_t *s, size_t n)
 		 * extract from the properly-shifted value using the
 		 * mask 00111111 (0x3F)
 		 */
-		s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F);
+		((unsigned char *)s)[i] = 0x80 |
+		                          ((cp >> (6 * (off - i))) & 0x3F);
 	}
 
 	return 1 + off;
diff --git a/test/utf8-decode.c b/test/utf8-decode.c
@@ -8,7 +8,7 @@
 #include "util.h"
 
 static const struct {
-	uint8_t       *arr;     /* UTF-8 byte sequence */
+	char          *arr;     /* UTF-8 byte sequence */
 	size_t         len;     /* length of UTF-8 byte sequence */
 	size_t         exp_len; /* expected length returned */
 	uint_least32_t exp_cp;  /* expected code point returned */
@@ -28,7 +28,7 @@ static const struct {
 		 * [ 11111101 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xFD },
+		.arr     = (char *)(unsigned char[]){ 0xFD },
 		.len     = 1,
 		.exp_len = 1,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -38,7 +38,7 @@ static const struct {
 		 * [ 00000001 ] ->
 		 * 0000001
 		 */
-		.arr     = (uint8_t[]){ 0x01 },
+		.arr     = (char *)(unsigned char[]){ 0x01 },
 		.len     = 1,
 		.exp_len = 1,
 		.exp_cp  = 0x1,
@@ -48,7 +48,7 @@ static const struct {
 		 * [ 11000011 10111111 ] ->
 		 * 00011111111
 		 */
-		.arr     = (uint8_t[]){ 0xC3, 0xBF },
+		.arr     = (char *)(unsigned char[]){ 0xC3, 0xBF },
 		.len     = 2,
 		.exp_len = 2,
 		.exp_cp  = 0xFF,
@@ -58,7 +58,7 @@ static const struct {
 		 * [ 11000011 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xC3 },
+		.arr     = (char *)(unsigned char[]){ 0xC3 },
 		.len     = 1,
 		.exp_len = 2,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -68,7 +68,7 @@ static const struct {
 		 * [ 11000011 11111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xC3, 0xFF },
+		.arr     = (char *)(unsigned char[]){ 0xC3, 0xFF },
 		.len     = 2,
 		.exp_len = 1,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -78,7 +78,7 @@ static const struct {
 		 * [ 11000001 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xC1, 0xBF },
+		.arr     = (char *)(unsigned char[]){ 0xC1, 0xBF },
 		.len     = 2,
 		.exp_len = 2,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -88,7 +88,7 @@ static const struct {
 		 * [ 11100000 10111111 10111111 ] ->
 		 * 0000111111111111
 		 */
-		.arr     = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+		.arr     = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF },
 		.len     = 3,
 		.exp_len = 3,
 		.exp_cp  = 0xFFF,
@@ -98,7 +98,7 @@ static const struct {
 		 * [ 11100000 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xE0 },
+		.arr     = (char *)(unsigned char[]){ 0xE0 },
 		.len     = 1,
 		.exp_len = 3,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -108,7 +108,7 @@ static const struct {
 		 * [ 11100000 01111111 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
+		.arr     = (char *)(unsigned char[]){ 0xE0, 0x7F, 0xBF },
 		.len     = 3,
 		.exp_len = 1,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -118,7 +118,7 @@ static const struct {
 		 * [ 11100000 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xE0, 0xBF },
+		.arr     = (char *)(unsigned char[]){ 0xE0, 0xBF },
 		.len     = 2,
 		.exp_len = 3,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -128,7 +128,7 @@ static const struct {
 		 * [ 11100000 10111111 01111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
+		.arr     = (char *)(unsigned char[]){ 0xE0, 0xBF, 0x7F },
 		.len     = 3,
 		.exp_len = 2,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -138,7 +138,7 @@ static const struct {
 		 * [ 11100000 10011111 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
+		.arr     = (char *)(unsigned char[]){ 0xE0, 0x9F, 0xBF },
 		.len     = 3,
 		.exp_len = 3,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -148,7 +148,7 @@ static const struct {
 		 * [ 11101101 10100000 10000000 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xED, 0xA0, 0x80 },
+		.arr     = (char *)(unsigned char[]){ 0xED, 0xA0, 0x80 },
 		.len     = 3,
 		.exp_len = 3,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -158,7 +158,7 @@ static const struct {
 		 * [ 11110011 10111111 10111111 10111111 ] ->
 		 * 011111111111111111111
 		 */
-		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF },
 		.len     = 4,
 		.exp_len = 4,
 		.exp_cp  = UINT32_C(0xFFFFF),
@@ -168,7 +168,7 @@ static const struct {
 		 * [ 11110011 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xF3 },
+		.arr     = (char *)(unsigned char[]){ 0xF3 },
 		.len     = 1,
 		.exp_len = 4,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -178,7 +178,7 @@ static const struct {
 		 * [ 11110011 01111111 10111111 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
+		.arr     = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF, 0xBF },
 		.len     = 4,
 		.exp_len = 1,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -188,7 +188,7 @@ static const struct {
 		 * [ 11110011 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xF3, 0xBF },
+		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF },
 		.len     = 2,
 		.exp_len = 4,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -198,7 +198,7 @@ static const struct {
 		 * [ 11110011 10111111 01111111 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
+		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F, 0xBF },
 		.len     = 4,
 		.exp_len = 2,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -208,7 +208,7 @@ static const struct {
 		 * [ 11110011 10111111 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
+		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF },
 		.len     = 3,
 		.exp_len = 4,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -218,7 +218,7 @@ static const struct {
 		 * [ 11110011 10111111 10111111 01111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
+		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0x7F },
 		.len     = 4,
 		.exp_len = 3,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -228,7 +228,7 @@ static const struct {
 		 * [ 11110000 10000000 10000001 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
+		.arr     = (char *)(unsigned char[]){ 0xF0, 0x80, 0x81, 0xBF },
 		.len     = 4,
 		.exp_len = 4,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -238,7 +238,7 @@ static const struct {
 		 * [ 11110100 10010000 10000000 10000000 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
+		.arr     = (char *)(unsigned char[]){ 0xF4, 0x90, 0x80, 0x80 },
 		.len     = 4,
 		.exp_len = 4,
 		.exp_cp  = LG_CODEPOINT_INVALID,
diff --git a/test/utf8-encode.c b/test/utf8-encode.c
@@ -9,43 +9,43 @@
 
 static const struct {
 	uint_least32_t cp;      /* input code point */
-	uint8_t       *exp_arr; /* expected UTF-8 byte sequence */
+	char          *exp_arr; /* expected UTF-8 byte sequence */
 	size_t         exp_len; /* expected length of UTF-8 sequence */
 } enc_test[] = {
 	{
 		/* invalid code point (UTF-16 surrogate half) */
 		.cp      = UINT32_C(0xD800),
-		.exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+		.exp_arr = (char *)(unsigned char[]){ 0xEF, 0xBF, 0xBD },
 		.exp_len = 3,
 	},
 	{
 		/* invalid code point (UTF-16-unrepresentable) */
 		.cp      = UINT32_C(0x110000),
-		.exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+		.exp_arr = (char *)(unsigned char[]){ 0xEF, 0xBF, 0xBD },
 		.exp_len = 3,
 	},
 	{
 		/* code point encoded to a 1-byte sequence */
 		.cp      = 0x01,
-		.exp_arr = (uint8_t[]){ 0x01 },
+		.exp_arr = (char *)(unsigned char[]){ 0x01 },
 		.exp_len = 1,
 	},
 	{
 		/* code point encoded to a 2-byte sequence */
 		.cp      = 0xFF,
-		.exp_arr = (uint8_t[]){ 0xC3, 0xBF },
+		.exp_arr = (char *)(unsigned char[]){ 0xC3, 0xBF },
 		.exp_len = 2,
 	},
 	{
 		/* code point encoded to a 3-byte sequence */
 		.cp      = 0xFFF,
-		.exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+		.exp_arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF },
 		.exp_len = 3,
 	},
 	{
 		/* code point encoded to a 4-byte sequence */
 		.cp      = UINT32_C(0xFFFFF),
-		.exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+		.exp_arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF },
 		.exp_len = 4,
 	},
 };
@@ -59,7 +59,7 @@ main(int argc, char *argv[])
 
 	/* UTF-8 encoder test */
 	for (i = 0, failed = 0; i < LEN(enc_test); i++) {
-		uint8_t arr[4];
+		char arr[4];
 		size_t len;
 
 		len = lg_utf8_encode(enc_test[i].cp, arr, LEN(arr));

	libgrapheme unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log \| Files \| Refs \| README \| LICENSE

M	grapheme.h	\|	6	+++---
M	man/lg_grapheme_nextbreak.3	\|	4	++--
M	src/grapheme.c	\|	2	+-
M	src/utf8.c	\|	25	+++++++++++++++++--------
M	test/utf8-decode.c	\|	46	+++++++++++++++++++++++-----------------------
M	test/utf8-encode.c	\|	16	++++++++--------