Rewrite grapheme_next_character_break() and add size-parameter - libgrapheme

commit f8e8649a4fd88e61f9473400f44b9b1c5fce9e7c
parent cb7e9c00899ae0ed57a84991308b7f880f4ddef6
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun, 19 Dec 2021 00:52:23 +0100

Rewrite grapheme_next_character_break() and add size-parameter

You will not always have a NUL-terminated string to look at; sometimes
there is only a length-bounded "raw" array in memory. Comparable to how
we already do it in grapheme_decode_utf8() to handle NUL-terminated
strings, we add a len-parameter to grapheme_next_character_break()
that can be set to SIZE_MAX to indicate that the string doesn't have
a known bound but is instead NUL-terminated. Otherwise, if len is
not SIZE_MAX, we have a proper bound.

A rewrite of the function was planned anyway, and this was a good
opportunity to simplify it and make it more readable. In particular,
there was no reason to call grapheme_decode_utf8() from more than one
place.

This will bring 99% feature-parity with what most people do with
ICU without all the unnecessary cruft, boiler-plate and incantations
you need with ICU.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M grapheme.h  | 2 +-
M man/grapheme_next_character_break.3  | 36 ++++++++++++++++++++++++++++--------
M src/character.c  | 57 ++++++++++++++++++++++++---------------------------------

3 files changed, 53 insertions(+), 42 deletions(-)
diff --git a/grapheme.h b/grapheme.h
@@ -19,7 +19,7 @@ typedef struct grapheme_internal_segmentation_state {
 
 #define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD)
 
-size_t grapheme_next_character_break(const char *);
+size_t grapheme_next_character_break(const char *, size_t);
 
 bool grapheme_is_character_break(uint_least32_t, uint_least32_t, GRAPHEME_STATE *);
 
diff --git a/man/grapheme_next_character_break.3 b/man/grapheme_next_character_break.3
@@ -7,19 +7,30 @@
 .Sh SYNOPSIS
 .In grapheme.h
 .Ft size_t
-.Fn grapheme_next_character_break "const char *str"
+.Fn grapheme_next_character_break "const char *str" "size_t len"
 .Sh DESCRIPTION
 The
 .Fn grapheme_next_character_break
 function computes the offset (in bytes) to the next grapheme
 cluster break (see
 .Xr libgrapheme 7 )
-in the UTF-8-encoded NUL-terminated string
-.Va str .
+in the UTF-8-encoded string
+.Va str
+of length
+.Va len .
 If a grapheme cluster begins at
 .Va str
 this offset is equal to the length of said grapheme cluster.
 .Pp
+If
+.Va len
+is set to
+.Dv SIZE_MAX
+(stdint.h is already included by grapheme.h) the string
+.Va str
+is interpreted to be NUL-terminated and processing stops when a
+NUL-byte is encountered.
+.Pp
 For non-UTF-8 input data
 .Xr grapheme_is_character_break 3
 can be used instead.
@@ -48,15 +59,24 @@ main(void)
 	          "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
 	          "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
 	          "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
-	size_t len;
+	size_t ret, len, off;
 
 	printf("Input: \\"%s\\"\\n", s);
 
 	/* print each grapheme cluster with byte-length */
-	for (; *s != '\\0';) {
-		len = grapheme_next_character_break(s);
-		printf("%2zu bytes | %.*s\\n", len, (int)len, s, len);
-		s += len;
+	printf("Grapheme clusters in NUL-delimited input:\\n");
+	for (off = 0; s[off] != '\\0'; off += ret) {
+		ret = grapheme_next_character_break(s + off, SIZE_MAX);
+		printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
+	}
+	printf("\\n");
+
+	/* do the same, but this time string is length-delimited */
+	len = 17;
+	printf("Grapheme clusters in input delimited to %zu bytes:\\n", len);
+	for (off = 0; off < len; off += ret) {
+		ret = grapheme_next_character_break(s + off, len - off);
+		printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
 	}
 
 	return 0;
diff --git a/src/character.c b/src/character.c
@@ -179,50 +179,41 @@ hasbreak:
 }
 
 size_t
-grapheme_next_character_break(const char *str)
+grapheme_next_character_break(const char *str, size_t len)
 {
-	uint_least32_t cp0, cp1;
-	size_t ret, len = 0;
 	GRAPHEME_STATE state = { 0 };
+	uint_least32_t cp0 = 0, cp1 = 0;
+	size_t off, ret;
 
-	if (str == NULL) {
+	if (str == NULL || len == 0) {
 		return 0;
 	}
 
-	/*
-	 * grapheme_decode_utf8, when it encounters an unexpected byte,
-	 * does not count it to the error and instead assumes that the
-	 * unexpected byte is the beginning of a new sequence.
-	 * This way, when the string ends with a null byte, we never
-	 * miss it, even if the previous UTF-8 sequence terminates
-	 * unexpectedly, as it would either act as an unexpected byte,
-	 * saved for later, or as a null byte itself, that we can catch.
-	 * We pass SIZE_MAX to the length, as we will never read beyond
-	 * the null byte for the reasons given above.
-	 */
-
-	/* get first codepoint */
-	len += grapheme_decode_utf8(str, SIZE_MAX, &cp0);
-	if (cp0 == GRAPHEME_INVALID_CODEPOINT) {
-		return len;
-	}
+	for (off = 0; (len == SIZE_MAX) || off < len; off += ret) {
+		cp0 = cp1;
+		ret = grapheme_decode_utf8(str + off, (len == SIZE_MAX) ?
+		                           SIZE_MAX : len - off, &cp1);
 
-	while (cp0 != 0) {
-		/* get next codepoint */
-		ret = grapheme_decode_utf8(str + len, SIZE_MAX, &cp1);
+		if (len != SIZE_MAX && ret > (len - off)) {
+			/* string ended abruptly, simply accept cropping */
+			ret = len - off;
+		}
 
-		if (cp1 == GRAPHEME_INVALID_CODEPOINT ||
-		    grapheme_is_character_break(cp0, cp1, &state)) {
-			/* we read an invalid cp or have a breakpoint */
+		if (len == SIZE_MAX && cp1 == 0) {
+			/* we hit a NUL-byte and are done */
 			break;
-		} else {
-			/* we don't have a breakpoint, continue */
-			len += ret;
 		}
 
-		/* prepare next round */
-		cp0 = cp1;
+		if (off == 0) {
+			/*
+			 * we skip the first round, as we need both
+			 * cp0 and cp1 to be initialized
+			 */
+			continue;
+		} else if (grapheme_is_character_break(cp0, cp1, &state)) {
+			break;
+		}
 	}
 
-	return len;
+	return off;
 }

	libgrapheme unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log \| Files \| Refs \| README \| LICENSE

M	grapheme.h	\|	2	+-
M	man/grapheme_next_character_break.3	\|	36	++++++++++++++++++++++++++++--------
M	src/character.c	\|	57	++++++++++++++++++++++++---------------------------------