libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

commit 8394ef260030e8087ded6ead63c9dd2f51c07109
parent c0e14c9b89c1ac78b72b7d8840261fbb7285d07a
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun, 12 Dec 2021 12:48:34 +0100

Assume nothing, assert what we want in the API

After a long discussion with and a lot of input from Michael Forney,
which I am very thankful for, we came to the conclusion that accepting
"char *" is problematic and implies a lot of things that reach back
decades. In fact, "char" is so broken that the functions defined in
bloody "string.h" operate on "void *". Go figure...

It would have been possible to simply change the API to work with
"unsigned char *" to avoid signed integer representation shenanigans,
but at that point, if the user cannot use string literals and char-arrays
without a cast anyway, we might as well go all the way and be clear
about what we really want (array of octets). Supporting char doesn't
really make sense when char is not 8-bit, given that the user would then
have to do a lot of hand-crafting to make it work anyway. Even if
such systems with chars >8 bit were common (which they aren't at all), I
would still have probably made this decision here.

UTF-8 is an 8-bit encoding and it only makes sense on octet-arrays.
This might even motivate libgrapheme-users to be explicit about it
in their code as well. Otherwise, given that POSIX guarantees that char
is exactly 8 bits, simply casting to (uint8_t *) will work.

C's type system is already weak enough, so, where possible, ambiguities
like that should be resolved in favor of more robust solutions, in my
opinion.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
Mgrapheme.h | 6+++---
Msrc/grapheme.c | 2+-
Msrc/utf8.c | 25++++++++-----------------
Mtest/utf8-decode.c | 131++++++++++++++++---------------------------------------------------------------
Mtest/utf8-encode.c | 38++++++++------------------------------
5 files changed, 46 insertions(+), 156 deletions(-)

diff --git a/grapheme.h b/grapheme.h @@ -18,11 +18,11 @@ typedef struct lg_internal_segmentation_state { #define LG_CODEPOINT_INVALID UINT32_C(0xFFFD) -size_t lg_grapheme_nextbreak(const char *); +size_t lg_grapheme_nextbreak(const uint8_t *); int lg_grapheme_isbreak(uint_least32_t, uint_least32_t, LG_SEGMENTATION_STATE *); -size_t lg_utf8_decode(const char *, size_t, uint_least32_t *); -size_t lg_utf8_encode(uint_least32_t, char *, size_t); +size_t lg_utf8_decode(const uint8_t *, size_t, uint_least32_t *); +size_t lg_utf8_encode(uint_least32_t, uint8_t *, size_t); #endif /* GRAPHEME_H */ diff --git a/src/grapheme.c b/src/grapheme.c @@ -177,7 +177,7 @@ hasbreak: } size_t -lg_grapheme_nextbreak(const char *str) +lg_grapheme_nextbreak(const uint8_t *str) { uint_least32_t cp0, cp1; size_t ret, len = 0; diff --git a/src/utf8.c b/src/utf8.c @@ -48,17 +48,10 @@ static const struct { }; size_t -lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp) +lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp) { size_t off, i; - /* - * char is guaranteed to be at least 8 bits, but it could - * be more. We assume that the encoding is faithful such - * that any higher bits are zero. If we encounter anything - * else, we treat it as an encoding error. 
- */ - if (n == 0) { /* a sequence must be at least 1 byte long */ *cp = LG_CODEPOINT_INVALID; @@ -67,15 +60,13 @@ lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp) /* identify sequence type with the first byte */ for (off = 0; off < LEN(lut); off++) { - if (BETWEEN((unsigned char)s[0], lut[off].lower, - lut[off].upper)) { + if (BETWEEN(s[0], lut[off].lower, lut[off].upper)) { /* * first byte is within the bounds; fill * p with the the first bits contained in * the first byte (by subtracting the high bits) - * and discarding any higher bits than 8 */ - *cp = ((unsigned char)s[0] - lut[off].lower) & 0xff; + *cp = s[0] - lut[off].lower; break; } } @@ -101,7 +92,7 @@ lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp) * (i.e. between 0x80 (10000000) and 0xBF (10111111)) */ for (i = 1; i <= off; i++) { - if(!BETWEEN((unsigned char)s[i], 0x80, 0xBF)) { + if(!BETWEEN(s[i], 0x80, 0xBF)) { /* * byte does not match format; return * number of bytes processed excluding the @@ -115,7 +106,7 @@ lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp) * shift code point by 6 bits and add the 6 stored bits * in s[i] to it using the bitmask 0x3F (00111111) */ - *cp = (*cp << 6) | ((unsigned char)s[i] & 0x3F); + *cp = (*cp << 6) | (s[i] & 0x3F); } if (*cp < lut[off].mincp || @@ -134,7 +125,7 @@ lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp) } size_t -lg_utf8_encode(uint_least32_t cp, char *s, size_t n) +lg_utf8_encode(uint_least32_t cp, uint8_t *s, size_t n) { size_t off, i; @@ -170,7 +161,7 @@ lg_utf8_encode(uint_least32_t cp, char *s, size_t n) * We do not overwrite the mask because we guaranteed earlier * that there are no bits higher than the mask allows. 
*/ - s[0] = (unsigned char)(lut[off].lower | (cp >> (6 * off))); + s[0] = lut[off].lower | (cp >> (6 * off)); for (i = 1; i <= off; i++) { /* @@ -179,7 +170,7 @@ lg_utf8_encode(uint_least32_t cp, char *s, size_t n) * extract from the properly-shifted value using the * mask 00111111 (0x3F) */ - s[i] = (unsigned char)(0x80 | ((cp >> (6 * (off - i))) & 0x3F)); + s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F); } return 1 + off; diff --git a/test/utf8-decode.c b/test/utf8-decode.c @@ -9,10 +9,10 @@ #define LEN(x) (sizeof(x) / sizeof(*(x))) static const struct { - char *arr; /* UTF-8 byte sequence */ - size_t len; /* length of UTF-8 byte sequence */ - size_t exp_len; /* expected length returned */ - uint32_t exp_cp; /* expected code point returned */ + uint8_t *arr; /* UTF-8 byte sequence */ + size_t len; /* length of UTF-8 byte sequence */ + size_t exp_len; /* expected length returned */ + uint_least32_t exp_cp; /* expected code point returned */ } dec_test[] = { { /* empty sequence @@ -29,9 +29,7 @@ static const struct { * [ 11111101 ] -> * INVALID */ - .arr = (char[]){ - (unsigned char)0xFD, - }, + .arr = (uint8_t[]){ 0xFD }, .len = 1, .exp_len = 1, .exp_cp = LG_CODEPOINT_INVALID, @@ -41,9 +39,7 @@ static const struct { * [ 00000001 ] -> * 0000001 */ - .arr = (char[]){ - (unsigned char)0x01, - }, + .arr = (uint8_t[]){ 0x01 }, .len = 1, .exp_len = 1, .exp_cp = 0x1, @@ -53,10 +49,7 @@ static const struct { * [ 11000011 10111111 ] -> * 00011111111 */ - .arr = (char[]){ - (unsigned char)0xC3, - (unsigned char)0xBF, - }, + .arr = (uint8_t[]){ 0xC3, 0xBF }, .len = 2, .exp_len = 2, .exp_cp = 0xFF, @@ -66,9 +59,7 @@ static const struct { * [ 11000011 ] -> * INVALID */ - .arr = (char[]){ - (unsigned char)0xC3 - }, + .arr = (uint8_t[]){ 0xC3 }, .len = 1, .exp_len = 2, .exp_cp = LG_CODEPOINT_INVALID, @@ -78,10 +69,7 @@ static const struct { * [ 11000011 11111111 ] -> * INVALID */ - .arr = (char[]){ - (unsigned char)0xC3, - (unsigned char)0xFF, - }, + .arr = (uint8_t[]){ 0xC3, 
0xFF }, .len = 2, .exp_len = 1, .exp_cp = LG_CODEPOINT_INVALID, @@ -91,10 +79,7 @@ static const struct { * [ 11000001 10111111 ] -> * INVALID */ - .arr = (char[]){ - (unsigned char)0xC1, - (unsigned char)0xBF, - }, + .arr = (uint8_t[]){ 0xC1, 0xBF }, .len = 2, .exp_len = 2, .exp_cp = LG_CODEPOINT_INVALID, @@ -104,11 +89,7 @@ static const struct { * [ 11100000 10111111 10111111 ] -> * 0000111111111111 */ - .arr = (char[]){ - (unsigned char)0xE0, - (unsigned char)0xBF, - (unsigned char)0xBF, - }, + .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, .len = 3, .exp_len = 3, .exp_cp = 0xFFF, @@ -118,9 +99,7 @@ static const struct { * [ 11100000 ] -> * INVALID */ - .arr = (char[]){ - (unsigned char)0xE0, - }, + .arr = (uint8_t[]){ 0xE0 }, .len = 1, .exp_len = 3, .exp_cp = LG_CODEPOINT_INVALID, @@ -130,11 +109,7 @@ static const struct { * [ 11100000 01111111 10111111 ] -> * INVALID */ - .arr = (char[]){ - (unsigned char)0xE0, - (unsigned char)0x7F, - (unsigned char)0xBF, - }, + .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF }, .len = 3, .exp_len = 1, .exp_cp = LG_CODEPOINT_INVALID, @@ -144,10 +119,7 @@ static const struct { * [ 11100000 10111111 ] -> * INVALID */ - .arr = (char[]){ - (unsigned char)0xE0, - (unsigned char)0xBF, - }, + .arr = (uint8_t[]){ 0xE0, 0xBF }, .len = 2, .exp_len = 3, .exp_cp = LG_CODEPOINT_INVALID, @@ -157,11 +129,7 @@ static const struct { * [ 11100000 10111111 01111111 ] -> * INVALID */ - .arr = (char[]){ - (unsigned char)0xE0, - (unsigned char)0xBF, - (unsigned char)0x7F, - }, + .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F }, .len = 3, .exp_len = 2, .exp_cp = LG_CODEPOINT_INVALID, @@ -171,11 +139,7 @@ static const struct { * [ 11100000 10011111 10111111 ] -> * INVALID */ - .arr = (char[]){ - (unsigned char)0xE0, - (unsigned char)0x9F, - (unsigned char)0xBF, - }, + .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF }, .len = 3, .exp_len = 3, .exp_cp = LG_CODEPOINT_INVALID, @@ -185,11 +149,7 @@ static const struct { * [ 11101101 10100000 10000000 ] -> * INVALID */ - .arr = (char[]){ - 
(unsigned char)0xED, - (unsigned char)0xA0, - (unsigned char)0x80, - }, + .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 }, .len = 3, .exp_len = 3, .exp_cp = LG_CODEPOINT_INVALID, @@ -199,12 +159,7 @@ static const struct { * [ 11110011 10111111 10111111 10111111 ] -> * 011111111111111111111 */ - .arr = (char[]){ - (unsigned char)0xF3, - (unsigned char)0xBF, - (unsigned char)0xBF, - (unsigned char)0xBF, - }, + .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, .len = 4, .exp_len = 4, .exp_cp = UINT32_C(0xFFFFF), @@ -214,9 +169,7 @@ static const struct { * [ 11110011 ] -> * INVALID */ - .arr = (char[]){ - (unsigned char)0xF3, - }, + .arr = (uint8_t[]){ 0xF3 }, .len = 1, .exp_len = 4, .exp_cp = LG_CODEPOINT_INVALID, @@ -226,12 +179,7 @@ static const struct { * [ 11110011 01111111 10111111 10111111 ] -> * INVALID */ - .arr = (char[]){ - (unsigned char)0xF3, - (unsigned char)0x7F, - (unsigned char)0xBF, - (unsigned char)0xBF, - }, + .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF }, .len = 4, .exp_len = 1, .exp_cp = LG_CODEPOINT_INVALID, @@ -241,10 +189,7 @@ static const struct { * [ 11110011 10111111 ] -> * INVALID */ - .arr = (char[]){ - (unsigned char)0xF3, - (unsigned char)0xBF, - }, + .arr = (uint8_t[]){ 0xF3, 0xBF }, .len = 2, .exp_len = 4, .exp_cp = LG_CODEPOINT_INVALID, @@ -254,12 +199,7 @@ static const struct { * [ 11110011 10111111 01111111 10111111 ] -> * INVALID */ - .arr = (char[]){ - (unsigned char)0xF3, - (unsigned char)0xBF, - (unsigned char)0x7F, - (unsigned char)0xBF, - }, + .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF }, .len = 4, .exp_len = 2, .exp_cp = LG_CODEPOINT_INVALID, @@ -269,11 +209,7 @@ static const struct { * [ 11110011 10111111 10111111 ] -> * INVALID */ - .arr = (char[]){ - (unsigned char)0xF3, - (unsigned char)0xBF, - (unsigned char)0xBF, - }, + .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF }, .len = 3, .exp_len = 4, .exp_cp = LG_CODEPOINT_INVALID, @@ -283,12 +219,7 @@ static const struct { * [ 11110011 10111111 10111111 01111111 ] -> * INVALID */ - .arr = (char[]){ - 
(unsigned char)0xF3, - (unsigned char)0xBF, - (unsigned char)0xBF, - (unsigned char)0x7F, - }, + .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F }, .len = 4, .exp_len = 3, .exp_cp = LG_CODEPOINT_INVALID, @@ -298,12 +229,7 @@ static const struct { * [ 11110000 10000000 10000001 10111111 ] -> * INVALID */ - .arr = (char[]){ - (unsigned char)0xF0, - (unsigned char)0x80, - (unsigned char)0x81, - (unsigned char)0xBF, - }, + .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF }, .len = 4, .exp_len = 4, .exp_cp = LG_CODEPOINT_INVALID, @@ -313,12 +239,7 @@ static const struct { * [ 11110100 10010000 10000000 10000000 ] -> * INVALID */ - .arr = (char[]){ - (unsigned char)0xF4, - (unsigned char)0x90, - (unsigned char)0x80, - (unsigned char)0x80, - }, + .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 }, .len = 4, .exp_len = 4, .exp_cp = LG_CODEPOINT_INVALID, diff --git a/test/utf8-encode.c b/test/utf8-encode.c @@ -10,65 +10,43 @@ static const struct { uint_least32_t cp; /* input code point */ - char *exp_arr; /* expected UTF-8 byte sequence */ + uint8_t *exp_arr; /* expected UTF-8 byte sequence */ size_t exp_len; /* expected length of UTF-8 sequence */ } enc_test[] = { { /* invalid code point (UTF-16 surrogate half) */ .cp = UINT32_C(0xD800), - .exp_arr = (char[]){ - (unsigned char)0xEF, - (unsigned char)0xBF, - (unsigned char)0xBD, - }, + .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, .exp_len = 3, }, { /* invalid code point (UTF-16-unrepresentable) */ .cp = UINT32_C(0x110000), - .exp_arr = (char[]){ - (unsigned char)0xEF, - (unsigned char)0xBF, - (unsigned char)0xBD, - }, + .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, .exp_len = 3, }, { /* code point encoded to a 1-byte sequence */ .cp = 0x01, - .exp_arr = (char[]){ - (unsigned char)0x01 - }, + .exp_arr = (uint8_t[]){ 0x01 }, .exp_len = 1, }, { /* code point encoded to a 2-byte sequence */ .cp = 0xFF, - .exp_arr = (char[]){ - (unsigned char)0xC3, - (unsigned char)0xBF, - }, + .exp_arr = (uint8_t[]){ 0xC3, 0xBF }, .exp_len = 2, }, { /* code point 
encoded to a 3-byte sequence */ .cp = 0xFFF, - .exp_arr = (char[]){ - (unsigned char)0xE0, - (unsigned char)0xBF, - (unsigned char)0xBF, - }, + .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, .exp_len = 3, }, { /* code point encoded to a 4-byte sequence */ .cp = UINT32_C(0xFFFFF), - .exp_arr = (char[]){ - (unsigned char)0xF3, - (unsigned char)0xBF, - (unsigned char)0xBF, - (unsigned char)0xBF, - }, + .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, .exp_len = 4, }, }; @@ -80,7 +58,7 @@ main(void) /* UTF-8 encoder test */ for (i = 0, failed = 0; i < LEN(enc_test); i++) { - char arr[4]; + uint8_t arr[4]; size_t len; len = lg_utf8_encode(enc_test[i].cp, arr, LEN(arr));