commit f8e8649a4fd88e61f9473400f44b9b1c5fce9e7c
parent cb7e9c00899ae0ed57a84991308b7f880f4ddef6
Author: Laslo Hunhold <dev@frign.de>
Date: Sun, 19 Dec 2021 00:52:23 +0100
Rewrite grapheme_next_character_break() and add size-parameter
You will not always have a NUL-terminated string to look at; sometimes
there is only a length-bounded "raw" array in memory. Comparable to how
we already do it in grapheme_decode_utf8() to handle NUL-terminated
strings, we add a len-parameter to grapheme_next_character_break()
that can be set to SIZE_MAX to indicate that the string doesn't have
a known bound but is instead NUL-terminated. Otherwise, if len is
not SIZE_MAX, we have a proper bound.
A rewrite was planned anyway, and this was a good opportunity to make
the function simpler and more readable. In particular, there was no
reason to call grapheme_decode_utf8() from more than one place.
This brings us to 99% feature-parity with what most people use ICU
for, without all the unnecessary cruft, boilerplate and incantations
that ICU requires.
Signed-off-by: Laslo Hunhold <dev@frign.de>
Diffstat:
3 files changed, 53 insertions(+), 42 deletions(-)
diff --git a/grapheme.h b/grapheme.h
@@ -19,7 +19,7 @@ typedef struct grapheme_internal_segmentation_state {
#define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD)
-size_t grapheme_next_character_break(const char *);
+size_t grapheme_next_character_break(const char *, size_t);
bool grapheme_is_character_break(uint_least32_t, uint_least32_t, GRAPHEME_STATE *);
diff --git a/man/grapheme_next_character_break.3 b/man/grapheme_next_character_break.3
@@ -7,19 +7,30 @@
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
-.Fn grapheme_next_character_break "const char *str"
+.Fn grapheme_next_character_break "const char *str" "size_t len"
.Sh DESCRIPTION
The
.Fn grapheme_next_character_break
function computes the offset (in bytes) to the next grapheme
cluster break (see
.Xr libgrapheme 7 )
-in the UTF-8-encoded NUL-terminated string
-.Va str .
+in the UTF-8-encoded string
+.Va str
+of length
+.Va len .
If a grapheme cluster begins at
.Va str
this offset is equal to the length of said grapheme cluster.
.Pp
+If
+.Va len
+is set to
+.Dv SIZE_MAX
+(stdint.h is already included by grapheme.h) the string
+.Va str
+is interpreted to be NUL-terminated and processing stops when a
+NUL-byte is encountered.
+.Pp
For non-UTF-8 input data
.Xr grapheme_is_character_break 3
can be used instead.
@@ -48,15 +59,24 @@ main(void)
"\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
"\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
"\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
- size_t len;
+ size_t ret, len, off;
printf("Input: \\"%s\\"\\n", s);
/* print each grapheme cluster with byte-length */
- for (; *s != '\\0';) {
- len = grapheme_next_character_break(s);
- printf("%2zu bytes | %.*s\\n", len, (int)len, s, len);
- s += len;
+ printf("Grapheme clusters in NUL-delimited input:\\n");
+ for (off = 0; s[off] != '\\0'; off += ret) {
+ ret = grapheme_next_character_break(s + off, SIZE_MAX);
+ printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
+ }
+ printf("\\n");
+
+ /* do the same, but this time string is length-delimited */
+ len = 17;
+ printf("Grapheme clusters in input delimited to %zu bytes:\\n", len);
+ for (off = 0; off < len; off += ret) {
+ ret = grapheme_next_character_break(s + off, len - off);
+ printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
}
return 0;
diff --git a/src/character.c b/src/character.c
@@ -179,50 +179,41 @@ hasbreak:
}
size_t
-grapheme_next_character_break(const char *str)
+grapheme_next_character_break(const char *str, size_t len)
{
- uint_least32_t cp0, cp1;
- size_t ret, len = 0;
GRAPHEME_STATE state = { 0 };
+ uint_least32_t cp0 = 0, cp1 = 0;
+ size_t off, ret;
- if (str == NULL) {
+ if (str == NULL || len == 0) {
return 0;
}
- /*
- * grapheme_decode_utf8, when it encounters an unexpected byte,
- * does not count it to the error and instead assumes that the
- * unexpected byte is the beginning of a new sequence.
- * This way, when the string ends with a null byte, we never
- * miss it, even if the previous UTF-8 sequence terminates
- * unexpectedly, as it would either act as an unexpected byte,
- * saved for later, or as a null byte itself, that we can catch.
- * We pass SIZE_MAX to the length, as we will never read beyond
- * the null byte for the reasons given above.
- */
-
- /* get first codepoint */
- len += grapheme_decode_utf8(str, SIZE_MAX, &cp0);
- if (cp0 == GRAPHEME_INVALID_CODEPOINT) {
- return len;
- }
+ for (off = 0; (len == SIZE_MAX) || off < len; off += ret) {
+ cp0 = cp1;
+ ret = grapheme_decode_utf8(str + off, (len == SIZE_MAX) ?
+ SIZE_MAX : len - off, &cp1);
- while (cp0 != 0) {
- /* get next codepoint */
- ret = grapheme_decode_utf8(str + len, SIZE_MAX, &cp1);
+ if (len != SIZE_MAX && ret > (len - off)) {
+ /* string ended abruptly, simply accept cropping */
+ ret = len - off;
+ }
- if (cp1 == GRAPHEME_INVALID_CODEPOINT ||
- grapheme_is_character_break(cp0, cp1, &state)) {
- /* we read an invalid cp or have a breakpoint */
+ if (len == SIZE_MAX && cp1 == 0) {
+ /* we hit a NUL-byte and are done */
break;
- } else {
- /* we don't have a breakpoint, continue */
- len += ret;
}
- /* prepare next round */
- cp0 = cp1;
+ if (off == 0) {
+ /*
+ * we skip the first round, as we need both
+ * cp0 and cp1 to be initialized
+ */
+ continue;
+ } else if (grapheme_is_character_break(cp0, cp1, &state)) {
+ break;
+ }
}
- return len;
+ return off;
}