commit b99a40eefc2ec1ad8714ed210a3aeedfb3283159
parent 20c105bcdd1c54401d4d23cdb9ded56ee7a2ffd4
Author: Laslo Hunhold <dev@frign.de>
Date: Fri, 17 Dec 2021 00:34:27 +0100
Encourage strict aliasing for library users (uint8_t * -> char *)
After a long-winded discussion, Michael Forney, who has a really deep
understanding of the C specification, rightfully pointed out that
using uint8_t * might look good on paper, but leads to subtle problems
due to intricacies of the C99 specification.
While you can alias any pointer to character types (char, unsigned char,
signed char), uint8_t is not a character type and aliasing to it breaks
the strict aliasing rule. This is not a problem in practice as gcc
is the only big compiler enforcing strict aliasing and uint8_t is
usually defined as unsigned char, inheriting the aliasing property for
technical reasons, but strictly speaking uint8_t is not a character
type.
With uint8_t * in the API, library users would've been forced to cast
any input-string to uint8_t *, breaking the strict aliasing rule. A
lot of code relies on this or conveniently disables strict aliasing
through compiler flags, but using char-arrays is the only really
portable and safe way to work with it.
The fact that char is usually 8 bits and strongly indicates that we're
dealing with a string is one strong point for using char *; another is
that C11 introduced UTF-8 string literals of the form u8"..." which are
of type char[]. In this sense, using char * ensures some form of
forward-compatibility and fits nicely within the spec that's slowly
converging towards UTF-8.
Signed-off-by: Laslo Hunhold <dev@frign.de>
Diffstat:
6 files changed, 54 insertions(+), 45 deletions(-)
diff --git a/grapheme.h b/grapheme.h
@@ -19,11 +19,11 @@ typedef struct lg_internal_segmentation_state {
#define LG_CODEPOINT_INVALID UINT32_C(0xFFFD)
-size_t lg_grapheme_nextbreak(const uint8_t *);
+size_t lg_grapheme_nextbreak(const char *);
bool lg_grapheme_isbreak(uint_least32_t, uint_least32_t, LG_SEGMENTATION_STATE *);
-size_t lg_utf8_decode(const uint8_t *, size_t, uint_least32_t *);
-size_t lg_utf8_encode(uint_least32_t, uint8_t *, size_t);
+size_t lg_utf8_decode(const char *, size_t, uint_least32_t *);
+size_t lg_utf8_encode(uint_least32_t, char *, size_t);
#endif /* GRAPHEME_H */
diff --git a/man/lg_grapheme_nextbreak.3 b/man/lg_grapheme_nextbreak.3
@@ -7,7 +7,7 @@
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
-.Fn lg_grapheme_nextbreak "const uint8_t *str"
+.Fn lg_grapheme_nextbreak "const char *str"
.Sh DESCRIPTION
.Fn lg_grapheme_nextbreak
computes the offset (in bytes) to the next grapheme
@@ -52,7 +52,7 @@ main(void)
/* print each grapheme cluster with byte-length */
for (; *s != '\\0';) {
- len = lg_grapheme_nextbreak((uint8_t *)s);
+ len = lg_grapheme_nextbreak(s);
printf("%2zu bytes | %.*s\\n", len, (int)len, s, len);
s += len;
}
diff --git a/src/grapheme.c b/src/grapheme.c
@@ -179,7 +179,7 @@ hasbreak:
}
size_t
-lg_grapheme_nextbreak(const uint8_t *str)
+lg_grapheme_nextbreak(const char *str)
{
uint_least32_t cp0, cp1;
size_t ret, len = 0;
diff --git a/src/utf8.c b/src/utf8.c
@@ -48,7 +48,7 @@ static const struct {
};
size_t
-lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
+lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp)
{
size_t off, i;
@@ -60,13 +60,14 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
/* identify sequence type with the first byte */
for (off = 0; off < LEN(lut); off++) {
- if (BETWEEN(s[0], lut[off].lower, lut[off].upper)) {
+ if (BETWEEN(((unsigned char *)s)[0], lut[off].lower,
+ lut[off].upper)) {
/*
* first byte is within the bounds; fill
* p with the the first bits contained in
* the first byte (by subtracting the high bits)
*/
- *cp = s[0] - lut[off].lower;
+ *cp = ((unsigned char *)s)[0] - lut[off].lower;
break;
}
}
@@ -74,6 +75,9 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
/*
* first byte does not match a sequence type;
* set cp as invalid and return 1 byte processed
+ *
+ * this also includes the cases where bits higher than
+ * the 8th are set on systems with CHAR_BIT > 8
*/
*cp = LG_CODEPOINT_INVALID;
return 1;
@@ -92,12 +96,16 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
* (i.e. between 0x80 (10000000) and 0xBF (10111111))
*/
for (i = 1; i <= off; i++) {
- if(!BETWEEN(s[i], 0x80, 0xBF)) {
+ if(!BETWEEN(((unsigned char *)s)[i], 0x80, 0xBF)) {
/*
* byte does not match format; return
* number of bytes processed excluding the
* unexpected character as recommended since
* Unicode 6 (chapter 3)
+ *
+ * this also includes the cases where bits
+ * higher than the 8th are set on systems
+ * with CHAR_BIT > 8
*/
*cp = LG_CODEPOINT_INVALID;
return 1 + (i - 1);
@@ -106,7 +114,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
* shift code point by 6 bits and add the 6 stored bits
* in s[i] to it using the bitmask 0x3F (00111111)
*/
- *cp = (*cp << 6) | (s[i] & 0x3F);
+ *cp = (*cp << 6) | (((unsigned char *)s)[i] & 0x3F);
}
if (*cp < lut[off].mincp ||
@@ -125,7 +133,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
}
size_t
-lg_utf8_encode(uint_least32_t cp, uint8_t *s, size_t n)
+lg_utf8_encode(uint_least32_t cp, char *s, size_t n)
{
size_t off, i;
@@ -165,7 +173,7 @@ lg_utf8_encode(uint_least32_t cp, uint8_t *s, size_t n)
* We do not overwrite the mask because we guaranteed earlier
* that there are no bits higher than the mask allows.
*/
- s[0] = lut[off].lower | (uint8_t)(cp >> (6 * off));
+ ((unsigned char *)s)[0] = lut[off].lower | (uint8_t)(cp >> (6 * off));
for (i = 1; i <= off; i++) {
/*
@@ -174,7 +182,8 @@ lg_utf8_encode(uint_least32_t cp, uint8_t *s, size_t n)
* extract from the properly-shifted value using the
* mask 00111111 (0x3F)
*/
- s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F);
+ ((unsigned char *)s)[i] = 0x80 |
+ ((cp >> (6 * (off - i))) & 0x3F);
}
return 1 + off;
diff --git a/test/utf8-decode.c b/test/utf8-decode.c
@@ -8,7 +8,7 @@
#include "util.h"
static const struct {
- uint8_t *arr; /* UTF-8 byte sequence */
+ char *arr; /* UTF-8 byte sequence */
size_t len; /* length of UTF-8 byte sequence */
size_t exp_len; /* expected length returned */
uint_least32_t exp_cp; /* expected code point returned */
@@ -28,7 +28,7 @@ static const struct {
* [ 11111101 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xFD },
+ .arr = (char *)(unsigned char[]){ 0xFD },
.len = 1,
.exp_len = 1,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -38,7 +38,7 @@ static const struct {
* [ 00000001 ] ->
* 0000001
*/
- .arr = (uint8_t[]){ 0x01 },
+ .arr = (char *)(unsigned char[]){ 0x01 },
.len = 1,
.exp_len = 1,
.exp_cp = 0x1,
@@ -48,7 +48,7 @@ static const struct {
* [ 11000011 10111111 ] ->
* 00011111111
*/
- .arr = (uint8_t[]){ 0xC3, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xC3, 0xBF },
.len = 2,
.exp_len = 2,
.exp_cp = 0xFF,
@@ -58,7 +58,7 @@ static const struct {
* [ 11000011 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xC3 },
+ .arr = (char *)(unsigned char[]){ 0xC3 },
.len = 1,
.exp_len = 2,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -68,7 +68,7 @@ static const struct {
* [ 11000011 11111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xC3, 0xFF },
+ .arr = (char *)(unsigned char[]){ 0xC3, 0xFF },
.len = 2,
.exp_len = 1,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -78,7 +78,7 @@ static const struct {
* [ 11000001 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xC1, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xC1, 0xBF },
.len = 2,
.exp_len = 2,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -88,7 +88,7 @@ static const struct {
* [ 11100000 10111111 10111111 ] ->
* 0000111111111111
*/
- .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF },
.len = 3,
.exp_len = 3,
.exp_cp = 0xFFF,
@@ -98,7 +98,7 @@ static const struct {
* [ 11100000 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xE0 },
+ .arr = (char *)(unsigned char[]){ 0xE0 },
.len = 1,
.exp_len = 3,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -108,7 +108,7 @@ static const struct {
* [ 11100000 01111111 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xE0, 0x7F, 0xBF },
.len = 3,
.exp_len = 1,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -118,7 +118,7 @@ static const struct {
* [ 11100000 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xE0, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xE0, 0xBF },
.len = 2,
.exp_len = 3,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -128,7 +128,7 @@ static const struct {
* [ 11100000 10111111 01111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
+ .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0x7F },
.len = 3,
.exp_len = 2,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -138,7 +138,7 @@ static const struct {
* [ 11100000 10011111 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xE0, 0x9F, 0xBF },
.len = 3,
.exp_len = 3,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -148,7 +148,7 @@ static const struct {
* [ 11101101 10100000 10000000 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 },
+ .arr = (char *)(unsigned char[]){ 0xED, 0xA0, 0x80 },
.len = 3,
.exp_len = 3,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -158,7 +158,7 @@ static const struct {
* [ 11110011 10111111 10111111 10111111 ] ->
* 011111111111111111111
*/
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF },
.len = 4,
.exp_len = 4,
.exp_cp = UINT32_C(0xFFFFF),
@@ -168,7 +168,7 @@ static const struct {
* [ 11110011 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xF3 },
+ .arr = (char *)(unsigned char[]){ 0xF3 },
.len = 1,
.exp_len = 4,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -178,7 +178,7 @@ static const struct {
* [ 11110011 01111111 10111111 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF, 0xBF },
.len = 4,
.exp_len = 1,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -188,7 +188,7 @@ static const struct {
* [ 11110011 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xF3, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF },
.len = 2,
.exp_len = 4,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -198,7 +198,7 @@ static const struct {
* [ 11110011 10111111 01111111 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F, 0xBF },
.len = 4,
.exp_len = 2,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -208,7 +208,7 @@ static const struct {
* [ 11110011 10111111 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF },
.len = 3,
.exp_len = 4,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -218,7 +218,7 @@ static const struct {
* [ 11110011 10111111 10111111 01111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
+ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0x7F },
.len = 4,
.exp_len = 3,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -228,7 +228,7 @@ static const struct {
* [ 11110000 10000000 10000001 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xF0, 0x80, 0x81, 0xBF },
.len = 4,
.exp_len = 4,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -238,7 +238,7 @@ static const struct {
* [ 11110100 10010000 10000000 10000000 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
+ .arr = (char *)(unsigned char[]){ 0xF4, 0x90, 0x80, 0x80 },
.len = 4,
.exp_len = 4,
.exp_cp = LG_CODEPOINT_INVALID,
diff --git a/test/utf8-encode.c b/test/utf8-encode.c
@@ -9,43 +9,43 @@
static const struct {
uint_least32_t cp; /* input code point */
- uint8_t *exp_arr; /* expected UTF-8 byte sequence */
+ char *exp_arr; /* expected UTF-8 byte sequence */
size_t exp_len; /* expected length of UTF-8 sequence */
} enc_test[] = {
{
/* invalid code point (UTF-16 surrogate half) */
.cp = UINT32_C(0xD800),
- .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+ .exp_arr = (char *)(unsigned char[]){ 0xEF, 0xBF, 0xBD },
.exp_len = 3,
},
{
/* invalid code point (UTF-16-unrepresentable) */
.cp = UINT32_C(0x110000),
- .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+ .exp_arr = (char *)(unsigned char[]){ 0xEF, 0xBF, 0xBD },
.exp_len = 3,
},
{
/* code point encoded to a 1-byte sequence */
.cp = 0x01,
- .exp_arr = (uint8_t[]){ 0x01 },
+ .exp_arr = (char *)(unsigned char[]){ 0x01 },
.exp_len = 1,
},
{
/* code point encoded to a 2-byte sequence */
.cp = 0xFF,
- .exp_arr = (uint8_t[]){ 0xC3, 0xBF },
+ .exp_arr = (char *)(unsigned char[]){ 0xC3, 0xBF },
.exp_len = 2,
},
{
/* code point encoded to a 3-byte sequence */
.cp = 0xFFF,
- .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+ .exp_arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF },
.exp_len = 3,
},
{
/* code point encoded to a 4-byte sequence */
.cp = UINT32_C(0xFFFFF),
- .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+ .exp_arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF },
.exp_len = 4,
},
};
@@ -59,7 +59,7 @@ main(int argc, char *argv[])
/* UTF-8 encoder test */
for (i = 0, failed = 0; i < LEN(enc_test); i++) {
- uint8_t arr[4];
+ char arr[4];
size_t len;
len = lg_utf8_encode(enc_test[i].cp, arr, LEN(arr));