libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

commit f8e8649a4fd88e61f9473400f44b9b1c5fce9e7c
parent cb7e9c00899ae0ed57a84991308b7f880f4ddef6
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun, 19 Dec 2021 00:52:23 +0100

Rewrite grapheme_next_character_break() and add size-parameter

You will not always have a NUL-terminated string to look at; sometimes
there is only a length-bounded "raw" array in memory. Comparable to how
we already do it in grapheme_decode_utf8() to handle NUL-terminated
strings, we add a len-parameter to grapheme_next_character_break()
that can be set to SIZE_MAX to indicate that the string doesn't have
a known bound but is instead NUL-terminated. Otherwise, if len is
not SIZE_MAX, we have a proper bound.

It was planned anyway, but this was a good point to rewrite the function
to make it more readable and simplify it. There was especially no reason
to call grapheme_decode_utf8() more than once.

This will bring 99% feature-parity with what most people do with
ICU without all the unnecessary cruft, boiler-plate and incantations
you need with ICU.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M grapheme.h | 2 +-
M man/grapheme_next_character_break.3 | 36 ++++++++++++++++++++++++++++--------
M src/character.c | 57 ++++++++++++++++++++++++---------------------------------
3 files changed, 53 insertions(+), 42 deletions(-)

diff --git a/grapheme.h b/grapheme.h @@ -19,7 +19,7 @@ typedef struct grapheme_internal_segmentation_state { #define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD) -size_t grapheme_next_character_break(const char *); +size_t grapheme_next_character_break(const char *, size_t); bool grapheme_is_character_break(uint_least32_t, uint_least32_t, GRAPHEME_STATE *); diff --git a/man/grapheme_next_character_break.3 b/man/grapheme_next_character_break.3 @@ -7,19 +7,30 @@ .Sh SYNOPSIS .In grapheme.h .Ft size_t -.Fn grapheme_next_character_break "const char *str" +.Fn grapheme_next_character_break "const char *str" "size_t len" .Sh DESCRIPTION The .Fn grapheme_next_character_break function computes the offset (in bytes) to the next grapheme cluster break (see .Xr libgrapheme 7 ) -in the UTF-8-encoded NUL-terminated string -.Va str . +in the UTF-8-encoded string +.Va str +of length +.Va len . If a grapheme cluster begins at .Va str this offset is equal to the length of said grapheme cluster. .Pp +If +.Va len +is set to +.Dv SIZE_MAX +(stdint.h is already included by grapheme.h) the string +.Va str +is interpreted to be NUL-terminated and processing stops when a +NUL-byte is encountered. +.Pp For non-UTF-8 input data .Xr grapheme_is_character_break 3 can be used instead. 
@@ -48,15 +59,24 @@ main(void) "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0" "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0" "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!"; - size_t len; + size_t ret, len, off; printf("Input: \\"%s\\"\\n", s); /* print each grapheme cluster with byte-length */ - for (; *s != '\\0';) { - len = grapheme_next_character_break(s); - printf("%2zu bytes | %.*s\\n", len, (int)len, s, len); - s += len; + printf("Grapheme clusters in NUL-delimited input:\\n"); + for (off = 0; s[off] != '\\0'; off += ret) { + ret = grapheme_next_character_break(s + off, SIZE_MAX); + printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret); + } + printf("\\n"); + + /* do the same, but this time string is length-delimited */ + len = 17; + printf("Grapheme clusters in input delimited to %zu bytes:\\n", len); + for (off = 0; off < len; off += ret) { + ret = grapheme_next_character_break(s + off, len - off); + printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret); } return 0; diff --git a/src/character.c b/src/character.c @@ -179,50 +179,41 @@ hasbreak: } size_t -grapheme_next_character_break(const char *str) +grapheme_next_character_break(const char *str, size_t len) { - uint_least32_t cp0, cp1; - size_t ret, len = 0; GRAPHEME_STATE state = { 0 }; + uint_least32_t cp0 = 0, cp1 = 0; + size_t off, ret; - if (str == NULL) { + if (str == NULL || len == 0) { return 0; } - /* - * grapheme_decode_utf8, when it encounters an unexpected byte, - * does not count it to the error and instead assumes that the - * unexpected byte is the beginning of a new sequence. - * This way, when the string ends with a null byte, we never - * miss it, even if the previous UTF-8 sequence terminates - * unexpectedly, as it would either act as an unexpected byte, - * saved for later, or as a null byte itself, that we can catch. - * We pass SIZE_MAX to the length, as we will never read beyond - * the null byte for the reasons given above. 
- */ - - /* get first codepoint */ - len += grapheme_decode_utf8(str, SIZE_MAX, &cp0); - if (cp0 == GRAPHEME_INVALID_CODEPOINT) { - return len; - } + for (off = 0; (len == SIZE_MAX) || off < len; off += ret) { + cp0 = cp1; + ret = grapheme_decode_utf8(str + off, (len == SIZE_MAX) ? + SIZE_MAX : len - off, &cp1); - while (cp0 != 0) { - /* get next codepoint */ - ret = grapheme_decode_utf8(str + len, SIZE_MAX, &cp1); + if (len != SIZE_MAX && ret > (len - off)) { + /* string ended abruptly, simply accept cropping */ + ret = len - off; + } - if (cp1 == GRAPHEME_INVALID_CODEPOINT || - grapheme_is_character_break(cp0, cp1, &state)) { - /* we read an invalid cp or have a breakpoint */ + if (len == SIZE_MAX && cp1 == 0) { + /* we hit a NUL-byte and are done */ break; - } else { - /* we don't have a breakpoint, continue */ - len += ret; } - /* prepare next round */ - cp0 = cp1; + if (off == 0) { + /* + * we skip the first round, as we need both + * cp0 and cp1 to be initialized + */ + continue; + } else if (grapheme_is_character_break(cp0, cp1, &state)) { + break; + } } - return len; + return off; }