Implement word-segmentation - libgrapheme

commit 1a67a7fc7df73df260f8b7a18effd3b3e2b161c7
parent 1930624b9a9703c3449d2a877640e33c6d71f190
Author: Laslo Hunhold <dev@frign.de>
Date:   Mon,  6 Jun 2022 22:16:46 +0200

Implement word-segmentation

This was a tough nut to crack and took a lot of hours and multiple
rewrites to get right.

The first issue was that some codepoints could be in multiple classes
at the same time, requiring the implementation of a "conflict-handler"
in the data parser.

The segmentation algorithm itself then was highly complicated, as it
parses the data on two levels involving ignoring certain character
property classes and doing so gracefully and simultaneously.

Now it works though and passes the 1800+ tests provided by the Unicode
consortium. The LUTs are highly compressed and the complete library
still only weighs in at around 92K, which is lightweight given what
it does. If you link statically, it will cut away most of it as well.

What needed to be rethought was the general API-structure. It is
impossible to do word-segmentation on a 2-codepoint-comparison-with-
state basis and the only "form" is a function taking the entire
array and returning the offset to the next break. The API was adapted
accordingly.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M Makefile  | 24 ++++++++++++++++++++++++
M benchmark/character.c  | 5 +++--
M benchmark/utf8-decode.c  | 4 ++--
A benchmark/word.c  | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
M gen/character-test.c  | 2 +-
M gen/character.c  | 4 +++-
M gen/util.c  | 27 ++++++++++++++++++++-------
M gen/util.h  | 6 ++++--
A gen/word-test.c  | 19 +++++++++++++++++++
A gen/word.c  | 158 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M grapheme.h  | 8 ++++++--
D man/grapheme_next_character_break.3  | 92 -------------------------------------------------------------------------------
A man/grapheme_next_character_break_utf8.3  | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/character.c  | 44 ++++++++++++++++++++++++++++++++++----------
A src/word.c  | 373 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M test/character.c  | 12 ++++--------
M test/util.c  | 31 +++++++++++++------------------
M test/util.h  | 6 +++---
A test/word.c  | 16 ++++++++++++++++

19 files changed, 827 insertions(+), 148 deletions(-)
diff --git a/Makefile b/Makefile
@@ -7,25 +7,32 @@ include config.mk
 BENCHMARK =\
 	benchmark/character\
 	benchmark/utf8-decode\
+	benchmark/word\
 
 DATA =\
 	data/emoji-data.txt\
 	data/GraphemeBreakProperty.txt\
 	data/GraphemeBreakTest.txt\
+	data/WordBreakProperty.txt\
+	data/WordBreakTest.txt\
 
 GEN =\
 	gen/character\
 	gen/character-test\
+	gen/word\
+	gen/word-test\
 
 SRC =\
 	src/character\
 	src/utf8\
 	src/util\
+	src/word\
 
 TEST =\
 	test/character\
 	test/utf8-decode\
 	test/utf8-encode\
+	test/word\
 
 MAN3 =\
 	man/grapheme_decode_utf8.3\
@@ -40,27 +47,38 @@ all: libgrapheme.a libgrapheme.so
 benchmark/character.o: benchmark/character.c config.mk gen/character-test.h grapheme.h benchmark/util.h
 benchmark/utf8-decode.o: benchmark/utf8-decode.c config.mk gen/character-test.h grapheme.h benchmark/util.h
 benchmark/util.o: benchmark/util.c config.mk benchmark/util.h
+benchmark/word.o: benchmark/word.c config.mk gen/word-test.h grapheme.h benchmark/util.h
 gen/character.o: gen/character.c config.mk gen/util.h
 gen/character-test.o: gen/character-test.c config.mk gen/util.h
+gen/word.o: gen/word.c config.mk gen/util.h
+gen/word-test.o: gen/word-test.c config.mk gen/util.h
 gen/util.o: gen/util.c config.mk gen/util.h
 src/character.o: src/character.c config.mk gen/character.h grapheme.h src/util.h
 src/utf8.o: src/utf8.c config.mk grapheme.h
 src/util.o: src/util.c config.mk gen/types.h grapheme.h src/util.h
+src/word.o: src/word.c config.mk gen/word.h grapheme.h src/util.h
 test/character.o: test/character.c config.mk gen/character-test.h grapheme.h test/util.h
 test/utf8-encode.o: test/utf8-encode.c config.mk grapheme.h test/util.h
 test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h test/util.h
 test/util.o: test/util.c config.mk test/util.h
+test/word.o: test/word.c config.mk gen/word-test.h grapheme.h test/util.h
 
 benchmark/character: benchmark/character.o benchmark/util.o libgrapheme.a
 benchmark/utf8-decode: benchmark/utf8-decode.o benchmark/util.o libgrapheme.a
+benchmark/word: benchmark/word.o benchmark/util.o libgrapheme.a
 gen/character: gen/character.o gen/util.o
 gen/character-test: gen/character-test.o gen/util.o
+gen/word: gen/word.o gen/util.o
+gen/word-test: gen/word-test.o gen/util.o
 test/character: test/character.o test/util.o libgrapheme.a
 test/utf8-encode: test/utf8-encode.o test/util.o libgrapheme.a
 test/utf8-decode: test/utf8-decode.o test/util.o libgrapheme.a
+test/word: test/word.o test/util.o libgrapheme.a
 
 gen/character.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/character
 gen/character-test.h: data/GraphemeBreakTest.txt gen/character-test
+gen/word.h: data/WordBreakProperty.txt gen/word
+gen/word-test.h: data/WordBreakTest.txt gen/word-test
 
 data/emoji-data.txt:
 	wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
@@ -71,6 +89,12 @@ data/GraphemeBreakProperty.txt:
 data/GraphemeBreakTest.txt:
 	wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakTest.txt
 
+data/WordBreakProperty.txt:
+	wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/WordBreakProperty.txt
+
+data/WordBreakTest.txt:
+	wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/WordBreakTest.txt
+
 $(BENCHMARK):
 	$(CC) -o $@ $(LDFLAGS) $@.o benchmark/util.o libgrapheme.a -lutf8proc
 
diff --git a/benchmark/character.c b/benchmark/character.c
@@ -12,7 +12,7 @@
 
 #include <utf8proc.h>
 
-#define NUM_ITERATIONS 1000000
+#define NUM_ITERATIONS 100000
 
 struct break_benchmark_payload {
 	uint_least32_t *buf;
@@ -56,7 +56,8 @@ main(int argc, char *argv[])
 
 	(void)argc;
 
-	if ((p.buf = generate_cp_test_buffer(character_test, LEN(character_test),
+	if ((p.buf = generate_cp_test_buffer(character_break_test,
+	                                     LEN(character_break_test),
 	                                     &(p.buflen))) == NULL) {
 		return 1;
 	}
diff --git a/benchmark/utf8-decode.c b/benchmark/utf8-decode.c
@@ -64,8 +64,8 @@ main(int argc, char *argv[])
 
 	(void)argc;
 
-	p.buf = generate_utf8_test_buffer(character_test,
-	                                  LEN(character_test),
+	p.buf = generate_utf8_test_buffer(character_break_test,
+	                                  LEN(character_break_test),
 	                                  &(p.buflen));
 
 	/* convert cp-buffer to stupid custom libutf8proc-uint8-type */
diff --git a/benchmark/word.c b/benchmark/word.c
@@ -0,0 +1,52 @@
+/* See LICENSE file for copyright and license details. */
+#include <errno.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../grapheme.h"
+#include "../gen/word-test.h"
+#include "util.h"
+
+#define NUM_ITERATIONS 100000
+
+struct break_benchmark_payload {
+	uint_least32_t *buf;
+	size_t buflen;
+};
+
+void
+libgrapheme(const void *payload)
+{
+	const struct break_benchmark_payload *p = payload;
+	size_t off;
+
+	for (off = 0; off < p->buflen; ) {
+		off += grapheme_next_word_break(p->buf + off, p->buflen - off);
+	}
+}
+
+int
+main(int argc, char *argv[])
+{
+	struct break_benchmark_payload p;
+	double baseline = (double)NAN;
+
+	(void)argc;
+
+	if ((p.buf = generate_cp_test_buffer(word_break_test,
+	                                     LEN(word_break_test),
+	                                     &(p.buflen))) == NULL) {
+		return 1;
+	}
+
+	printf("%s\n", argv[0]);
+	run_benchmark(libgrapheme, &p, "libgrapheme ", NULL, "codepoint",
+	              &baseline, NUM_ITERATIONS, p.buflen - 1);
+
+	free(p.buf);
+
+	return 0;
+}
diff --git a/gen/character-test.c b/gen/character-test.c
@@ -12,7 +12,7 @@ main(int argc, char *argv[])
 	(void)argc;
 
 	break_test_list_parse("data/GraphemeBreakTest.txt", &test, &testlen);
-	break_test_list_print(test, testlen, "character_test", argv[0]);
+	break_test_list_print(test, testlen, "character_break_test", argv[0]);
 	break_test_list_free(test, testlen);
 
 	return 0;
diff --git a/gen/character.c b/gen/character.c
@@ -1,4 +1,6 @@
 /* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+
 #include "util.h"
 
 #define FILE_EMOJI    "data/emoji-data.txt"
@@ -89,7 +91,7 @@ main(int argc, char *argv[])
 
 	properties_generate_break_property(char_break_property,
 	                                   LEN(char_break_property),
-	                                   "char", argv[0]);
+	                                   NULL, "char", argv[0]);
 
 	return 0;
 }
diff --git a/gen/util.c b/gen/util.c
@@ -3,6 +3,7 @@
 #include <errno.h>
 #include <inttypes.h>
 #include <stdbool.h>
+#include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <stdio.h>
@@ -20,6 +21,7 @@ struct properties_payload {
 	const struct property_spec *spec;
 	uint_least8_t speclen;
 	int (*set_value)(struct properties_payload *, uint_least32_t, uint_least8_t);
+	uint_least8_t (*handle_conflict)(uint_least32_t, uint_least8_t, uint_least8_t);
 };
 
 struct properties_compressed {
@@ -421,11 +423,18 @@ set_value_bp(struct properties_payload *payload, uint_least32_t cp,
              uint_least8_t value)
 {
 	if (payload->prop[cp].break_property != 0) {
-		fprintf(stderr, "set_value_bp: "
-	                "Character break property overwrite (%s <- %s).\n",
-		        payload->spec[payload->prop[cp].break_property].enumname,
-			payload->spec[value].enumname);
-		return 1;
+		if (payload->handle_conflict == NULL) {
+			fprintf(stderr, "set_value_bp: "
+	                        "Unhandled character break property "
+			        "overwrite for 0x%06X (%s <- %s).\n",
+		                cp, payload->spec[payload->prop[cp].
+			        break_property].enumname,
+			        payload->spec[value].enumname);
+			return 1;
+		} else {
+			value = payload->handle_conflict(cp,
+			        payload->prop[cp].break_property, value);
+		}
 	}
 	payload->prop[cp].break_property = value;
 
@@ -440,8 +449,11 @@ get_value_bp(const struct properties *prop, size_t offset)
 
 void
 properties_generate_break_property(const struct property_spec *spec,
-                                   uint_least8_t speclen, const char *prefix,
-				   const char *argv0)
+                                   uint_least8_t speclen,
+                                   uint_least8_t (*handle_conflict)(
+                                   uint_least32_t, uint_least8_t,
+                                   uint_least8_t), const char *prefix,
+                                   const char *argv0)
 {
 	struct properties_compressed comp;
 	struct properties_major_minor mm;
@@ -461,6 +473,7 @@ properties_generate_break_property(const struct property_spec *spec,
 	payload.spec = spec;
 	payload.speclen = speclen;
 	payload.set_value = set_value_bp;
+	payload.handle_conflict = handle_conflict;
 
 	/* parse each file exactly once and ignore NULL-fields */
 	for (i = 0; i < speclen; i++) {
diff --git a/gen/util.h b/gen/util.h
@@ -23,8 +23,10 @@ void parse_file_with_callback(const char *, int (*callback)(const char *,
                               char **, size_t, char *, void *), void *payload);
 
 void properties_generate_break_property(const struct property_spec *,
-                                        uint_least8_t, const char *,
-                                        const char *);
+                                        uint_least8_t, uint_least8_t
+                                        (*handle_conflict)(uint_least32_t,
+                                        uint_least8_t, uint_least8_t),
+                                        const char *, const char *);
 
 void break_test_list_parse(char *, struct break_test **, size_t *);
 void break_test_list_print(const struct break_test *, size_t,
diff --git a/gen/word-test.c b/gen/word-test.c
@@ -0,0 +1,19 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+
+#include "util.h"
+
+int
+main(int argc, char *argv[])
+{
+	struct break_test *test = NULL;
+	size_t testlen = 0;
+
+	(void)argc;
+
+	break_test_list_parse("data/WordBreakTest.txt", &test, &testlen);
+	break_test_list_print(test, testlen, "word_break_test", argv[0]);
+	break_test_list_free(test, testlen);
+
+	return 0;
+}
diff --git a/gen/word.c b/gen/word.c
@@ -0,0 +1,158 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "util.h"
+
+#define FILE_EMOJI "data/emoji-data.txt"
+#define FILE_WORD  "data/WordBreakProperty.txt"
+
+static const struct property_spec word_break_property[] = {
+	{
+		.enumname = "OTHER",
+		.file     = NULL,
+		.ucdname  = NULL,
+	},
+	{
+		.enumname = "ALETTER",
+		.file     = FILE_WORD,
+		.ucdname  = "ALetter",
+	},
+	{
+		.enumname = "BOTH_ALETTER_EXTPICT",
+		.file     = NULL,
+		.ucdname  = NULL,
+	},
+	{
+		.enumname = "CR",
+		.file     = FILE_WORD,
+		.ucdname  = "CR",
+	},
+	{
+		.enumname = "DOUBLE_QUOTE",
+		.file     = FILE_WORD,
+		.ucdname  = "Double_Quote",
+	},
+	{
+		.enumname = "EXTEND",
+		.file     = FILE_WORD,
+		.ucdname  = "Extend",
+	},
+	{
+		.enumname = "EXTENDED_PICTOGRAPHIC",
+		.file     = FILE_EMOJI,
+		.ucdname  = "Extended_Pictographic",
+	},
+	{
+		.enumname = "EXTENDNUMLET",
+		.file     = FILE_WORD,
+		.ucdname  = "ExtendNumLet",
+	},
+	{
+		.enumname = "FORMAT",
+		.file     = FILE_WORD,
+		.ucdname  = "Format",
+	},
+	{
+		.enumname = "HEBREW_LETTER",
+		.file     = FILE_WORD,
+		.ucdname  = "Hebrew_Letter",
+	},
+	{
+		.enumname = "KATAKANA",
+		.file     = FILE_WORD,
+		.ucdname  = "Katakana",
+	},
+	{
+		.enumname = "LF",
+		.file     = FILE_WORD,
+		.ucdname  = "LF",
+	},
+	{
+		.enumname = "MIDLETTER",
+		.file     = FILE_WORD,
+		.ucdname  = "MidLetter",
+	},
+	{
+		.enumname = "MIDNUM",
+		.file     = FILE_WORD,
+		.ucdname  = "MidNum",
+	},
+	{
+		.enumname = "MIDNUMLET",
+		.file     = FILE_WORD,
+		.ucdname  = "MidNumLet",
+	},
+	{
+		.enumname = "NEWLINE",
+		.file     = FILE_WORD,
+		.ucdname  = "Newline",
+	},
+	{
+		.enumname = "NUMERIC",
+		.file     = FILE_WORD,
+		.ucdname  = "Numeric",
+	},
+	{
+		.enumname = "REGIONAL_INDICATOR",
+		.file     = FILE_WORD,
+		.ucdname  = "Regional_Indicator",
+	},
+	{
+		.enumname = "SINGLE_QUOTE",
+		.file     = FILE_WORD,
+		.ucdname  = "Single_Quote",
+	},
+	{
+		.enumname = "WSEGSPACE",
+		.file     = FILE_WORD,
+		.ucdname  = "WSegSpace",
+	},
+	{
+		.enumname = "ZWJ",
+		.file     = FILE_WORD,
+		.ucdname  = "ZWJ",
+	},
+};
+
+static uint_least8_t
+handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
+{
+	uint_least8_t result;
+
+	(void)cp;
+
+	if ((!strcmp(word_break_property[prop1].enumname, "ALETTER") &&
+	     !strcmp(word_break_property[prop2].enumname, "EXTENDED_PICTOGRAPHIC")) ||
+	    (!strcmp(word_break_property[prop1].enumname, "EXTENDED_PICTOGRAPHIC") &&
+	     !strcmp(word_break_property[prop2].enumname, "ALETTER"))) {
+		for (result = 0; result < LEN(word_break_property); result++) {
+			if (!strcmp(word_break_property[result].enumname,
+			            "BOTH_ALETTER_EXTPICT")) {
+				break;
+			}
+		}
+		if (result == LEN(word_break_property)) {
+			fprintf(stderr, "handle_conflict: Internal error.\n");
+			exit(1);
+		}
+	} else {
+		fprintf(stderr, "handle_conflict: Cannot handle conflict.\n");
+		exit(1);
+	}
+
+	return result;
+}
+
+int
+main(int argc, char *argv[])
+{
+	(void)argc;
+
+	properties_generate_break_property(word_break_property,
+	                                   LEN(word_break_property),
+	                                   handle_conflict, "word", argv[0]);
+
+	return 0;
+}
diff --git a/grapheme.h b/grapheme.h
@@ -15,10 +15,14 @@ typedef struct grapheme_internal_segmentation_state {
 
 #define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD)
 
-size_t grapheme_next_character_break(const char *, size_t);
-
 bool grapheme_is_character_break(uint_least32_t, uint_least32_t, GRAPHEME_STATE *);
 
+size_t grapheme_next_character_break(const uint_least32_t *, size_t);
+size_t grapheme_next_word_break(const uint_least32_t *, size_t);
+
+size_t grapheme_next_character_break_utf8(const char *, size_t);
+size_t grapheme_next_word_break_utf8(const char *, size_t);
+
 size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *);
 size_t grapheme_encode_utf8(uint_least32_t, char *, size_t);
 
diff --git a/man/grapheme_next_character_break.3 b/man/grapheme_next_character_break.3
@@ -1,92 +0,0 @@
-.Dd 2021-12-22
-.Dt GRAPHEME_NEXT_CHARACTER_BREAK 3
-.Os suckless.org
-.Sh NAME
-.Nm grapheme_next_character_break
-.Nd determine byte-offset to next grapheme cluster break
-.Sh SYNOPSIS
-.In grapheme.h
-.Ft size_t
-.Fn grapheme_next_character_break "const char *str" "size_t len"
-.Sh DESCRIPTION
-The
-.Fn grapheme_next_character_break
-function computes the offset (in bytes) to the next grapheme
-cluster break (see
-.Xr libgrapheme 7 )
-in the UTF-8-encoded string
-.Va str
-of length
-.Va len .
-If a grapheme cluster begins at
-.Va str
-this offset is equal to the length of said grapheme cluster.
-.Pp
-If
-.Va len
-is set to
-.Dv SIZE_MAX
-(stdint.h is already included by grapheme.h) the string
-.Va str
-is interpreted to be NUL-terminated and processing stops when a
-NUL-byte is encountered.
-.Pp
-For non-UTF-8 input data
-.Xr grapheme_is_character_break 3
-can be used instead.
-.Sh RETURN VALUES
-The
-.Fn grapheme_next_character_break
-function returns the offset (in bytes) to the next grapheme cluster
-break in
-.Va str
-or 0 if
-.Va str
-is
-.Dv NULL .
-.Sh EXAMPLES
-.Bd -literal
-/* cc (-static) -o example example.c -lgrapheme */
-#include <grapheme.h>
-#include <stdint.h>
-#include <stdio.h>
-
-int
-main(void)
-{
-	/* UTF-8 encoded input */
-	char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0"
-	          "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
-	          "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
-	          "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
-	size_t ret, len, off;
-
-	printf("Input: \\"%s\\"\\n", s);
-
-	/* print each grapheme cluster with byte-length */
-	printf("Grapheme clusters in NUL-delimited input:\\n");
-	for (off = 0; s[off] != '\\0'; off += ret) {
-		ret = grapheme_next_character_break(s + off, SIZE_MAX);
-		printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
-	}
-	printf("\\n");
-
-	/* do the same, but this time string is length-delimited */
-	len = 17;
-	printf("Grapheme clusters in input delimited to %zu bytes:\\n", len);
-	for (off = 0; off < len; off += ret) {
-		ret = grapheme_next_character_break(s + off, len - off);
-		printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
-	}
-
-	return 0;
-}
-.Ed
-.Sh SEE ALSO
-.Xr grapheme_is_character_break 3 ,
-.Xr libgrapheme 7
-.Sh STANDARDS
-.Fn grapheme_next_character_break
-is compliant with the Unicode 14.0.0 specification.
-.Sh AUTHORS
-.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/man/grapheme_next_character_break_utf8.3 b/man/grapheme_next_character_break_utf8.3
@@ -0,0 +1,92 @@
+.Dd 2021-12-22
+.Dt GRAPHEME_NEXT_CHARACTER_BREAK_UTF8 3
+.Os suckless.org
+.Sh NAME
+.Nm grapheme_next_character_break_utf8
+.Nd determine byte-offset to next grapheme cluster break
+.Sh SYNOPSIS
+.In grapheme.h
+.Ft size_t
+.Fn grapheme_next_character_break_utf8 "const char *str" "size_t len"
+.Sh DESCRIPTION
+The
+.Fn grapheme_next_character_break_utf8
+function computes the offset (in bytes) to the next grapheme
+cluster break (see
+.Xr libgrapheme 7 )
+in the UTF-8-encoded string
+.Va str
+of length
+.Va len .
+If a grapheme cluster begins at
+.Va str
+this offset is equal to the length of said grapheme cluster.
+.Pp
+If
+.Va len
+is set to
+.Dv SIZE_MAX
+(stdint.h is already included by grapheme.h) the string
+.Va str
+is interpreted to be NUL-terminated and processing stops when a
+NUL-byte is encountered.
+.Pp
+For non-UTF-8 input data
+.Xr grapheme_is_character_break 3
+can be used instead.
+.Sh RETURN VALUES
+The
+.Fn grapheme_next_character_break_utf8
+function returns the offset (in bytes) to the next grapheme cluster
+break in
+.Va str
+or 0 if
+.Va str
+is
+.Dv NULL .
+.Sh EXAMPLES
+.Bd -literal
+/* cc (-static) -o example example.c -lgrapheme */
+#include <grapheme.h>
+#include <stdint.h>
+#include <stdio.h>
+
+int
+main(void)
+{
+	/* UTF-8 encoded input */
+	char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0"
+	          "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
+	          "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
+	          "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
+	size_t ret, len, off;
+
+	printf("Input: \\"%s\\"\\n", s);
+
+	/* print each grapheme cluster with byte-length */
+	printf("Grapheme clusters in NUL-delimited input:\\n");
+	for (off = 0; s[off] != '\\0'; off += ret) {
+		ret = grapheme_next_character_break_utf8(s + off, SIZE_MAX);
+		printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
+	}
+	printf("\\n");
+
+	/* do the same, but this time string is length-delimited */
+	len = 17;
+	printf("Grapheme clusters in input delimited to %zu bytes:\\n", len);
+	for (off = 0; off < len; off += ret) {
+		ret = grapheme_next_character_break_utf8(s + off, len - off);
+		printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
+	}
+
+	return 0;
+}
+.Ed
+.Sh SEE ALSO
+.Xr grapheme_is_character_break 3 ,
+.Xr libgrapheme 7
+.Sh STANDARDS
+.Fn grapheme_next_character_break_utf8
+is compliant with the Unicode 14.0.0 specification.
+.Sh AUTHORS
+.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/src/character.c b/src/character.c
@@ -102,7 +102,7 @@ static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
 		UINT16_C(1 << CHAR_BREAK_PROP_REGIONAL_INDICATOR),
 };
 
-static enum char_break_property
+static inline enum char_break_property
 get_break_prop(uint_least32_t cp)
 {
 	if (likely(cp <= 0x10FFFF)) {
@@ -132,14 +132,14 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA
 		state->prop_set = true;
 
 		/* update flags */
-		state->gb11_flag = flag_update_gb11[cp0_prop +
-		                                    NUM_CHAR_BREAK_PROPS *
-		                                    state->gb11_flag] &
-	                           UINT16_C(1 << cp1_prop);
-		state->gb12_13_flag = flag_update_gb12_13[cp0_prop +
-		                                          NUM_CHAR_BREAK_PROPS *
-		                                          state->gb12_13_flag] &
-		                      UINT16_C(1 << cp1_prop);
+		state->gb11_flag =
+			flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
+			                 state->gb11_flag] &
+			UINT16_C(1 << cp1_prop);
+		state->gb12_13_flag =
+			flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS *
+		                            state->gb12_13_flag] &
+		        UINT16_C(1 << cp1_prop);
 
 		/*
 		 * Apply grapheme cluster breaking algorithm (UAX #29), see
@@ -177,7 +177,31 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA
 }
 
 size_t
-grapheme_next_character_break(const char *str, size_t len)
+grapheme_next_character_break(const uint_least32_t *str, size_t len)
+{
+	GRAPHEME_STATE state = { 0 };
+	size_t off;
+
+	if (str == NULL || len == 0) {
+		return 0;
+	}
+
+	for (off = 1; off < len; off++) {
+		if (grapheme_is_character_break(str[off - 1], str[off], &state)) {
+			break;
+		}
+	}
+
+	/* with no breaks we break at the end */
+	if (off == len) {
+		return len;
+	} else {
+		return off;
+	}
+}
+
+size_t
+grapheme_next_character_break_utf8(const char *str, size_t len)
 {
 	GRAPHEME_STATE state = { 0 };
 	uint_least32_t cp0 = 0, cp1 = 0;
diff --git a/src/word.c b/src/word.c
@@ -0,0 +1,373 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../gen/word.h"
+#include "../grapheme.h"
+#include "util.h"
+
+static inline enum word_break_property
+get_break_prop(uint_least32_t cp)
+{
+	if (likely(cp <= 0x10FFFF)) {
+		return (enum word_break_property)
+		       word_break_minor[word_break_major[cp >> 8] + (cp & 0xff)];
+	} else {
+		return WORD_BREAK_PROP_OTHER;
+	}
+}
+
+static inline size_t
+get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp)
+{
+	if (offset < len) {
+		*cp = ((const uint_least32_t *)str)[offset];
+		return 1;
+	} else {
+		*cp = GRAPHEME_INVALID_CODEPOINT;
+		return 0;
+	}
+}
+
+static inline size_t
+get_codepoint_utf8(const void *str, size_t len, size_t offset, uint_least32_t *cp)
+{
+	if (offset < len) {
+		return grapheme_decode_utf8((const char *)str + offset,
+		                            len - offset, cp);
+	} else {
+		*cp = GRAPHEME_INVALID_CODEPOINT;
+		return 0;
+	}
+}
+
+static size_t
+next_word_break(const void *str, size_t len, size_t (*get_codepoint)
+                (const void *, size_t, size_t, uint_least32_t *))
+{
+	struct {
+		enum word_break_property a, b, c, d;
+	} raw, skip;
+	enum word_break_property res;
+	uint_least32_t cp;
+	size_t off, tmp, new_off;
+	bool ri_even = true;
+
+	/* check degenerate cases */
+	if (str == NULL || len == 0) {
+		return 0;
+	}
+
+	/*
+	 * Apply word breaking algorithm (UAX #29), see
+	 * https://unicode.org/reports/tr29/#Word_Boundary_Rules
+	 *
+	 * There are 4 slots (a, b, c, d) of "break" properties and
+	 * we check if there is a break in the middle between b and c.
+	 *
+	 * The position of this middle spot is determined by off,
+	 * which gives the offset of the first element on the right
+	 * hand side of said spot, or, in other words, gives the number
+	 * of elements on the left hand side.
+	 *
+	 * It is further complicated by the fact that the algorithm
+	 * expects you to skip certain characters for the second
+	 * half of the rules (after WB4). Thus, we do not only have
+	 * the "raw" properties as described above, but also the "skip"
+	 * properties, where the skip.a and skip.b, for instance,
+	 * give the two preceding character properties behind the
+	 * currently investigated breakpoint.
+	 *
+	 */
+
+	/*
+	 * Initialize the different properties such that we have
+	 * a good state after the state-update in the loop
+	 */
+	raw.b = NUM_WORD_BREAK_PROPS;
+	if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
+		return 1;
+	}
+	raw.c = get_break_prop(cp);
+	(void)get_codepoint(str, len, off, &cp);
+	raw.d = get_break_prop(cp);
+	skip.a = skip.b = NUM_WORD_BREAK_PROPS;
+
+	for (; off < len; off = new_off) {
+		/*
+		 * Update left side (a and b) of the skip state by
+		 * "shifting in" the raw.c property as long as it is
+		 * not one of the "ignored" character properties.
+		 * While at it, update the RI-counter.
+		 *
+		 */
+		if (raw.c != WORD_BREAK_PROP_EXTEND &&
+		    raw.c != WORD_BREAK_PROP_FORMAT &&
+		    raw.c != WORD_BREAK_PROP_ZWJ) {
+		    	skip.a = skip.b;
+			skip.b = raw.c;
+
+			if (skip.b == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
+				/*
+				 * The property we just shifted in is
+				 * a regional indicator, increasing the
+				 * number of consecutive RIs on the left
+				 * side of the breakpoint by one, changing
+				 * the oddness.
+				 *
+				 */
+				ri_even = !ri_even;
+			} else {
+				/*
+				 * We saw no regional indicator, so the
+				 * number of consecutive RIs on the left
+				 * side of the breakpoint is zero, which
+				 * is an even number.
+				 *
+				 */
+				ri_even = true;
+			}
+		}
+
+		/*
+		 * Update right side (b and c) of the skip state by
+		 * starting at the breakpoint and detecting the two
+		 * following non-ignored character classes
+		 *
+		 */
+		skip.c = NUM_WORD_BREAK_PROPS;
+		for (tmp = off; tmp < len; ) {
+			tmp += get_codepoint(str, len, tmp, &cp);
+			res = get_break_prop(cp);
+
+			if (res != WORD_BREAK_PROP_EXTEND &&
+			    res != WORD_BREAK_PROP_FORMAT &&
+			    res != WORD_BREAK_PROP_ZWJ) {
+				skip.c = res;
+				break;
+			}
+		}
+		skip.d = NUM_WORD_BREAK_PROPS;
+		for (; tmp < len; ) {
+			tmp += get_codepoint(str, len, tmp, &cp);
+			res = get_break_prop(cp);
+
+			if (res != WORD_BREAK_PROP_EXTEND &&
+			    res != WORD_BREAK_PROP_FORMAT &&
+			    res != WORD_BREAK_PROP_ZWJ) {
+				skip.d = res;
+				break;
+			}
+		}
+
+		/*
+		 * Update the raw state by simply shifting everything
+		 * in and, if we still have data left, determining
+		 * the character class of the next codepoint.
+		 *
+		 */
+		raw.a = raw.b;
+		raw.b = raw.c;
+		raw.c = raw.d;
+		if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) {
+			get_codepoint(str, len, new_off, &cp);
+			raw.d = get_break_prop(cp);
+		} else {
+			raw.d = NUM_WORD_BREAK_PROPS;
+		}
+
+		/* WB3 */
+		if (raw.b == WORD_BREAK_PROP_CR &&
+		    raw.c == WORD_BREAK_PROP_LF) {
+			continue;
+		}
+
+		/* WB3a */
+		if (raw.b == WORD_BREAK_PROP_NEWLINE ||
+		    raw.b == WORD_BREAK_PROP_CR      ||
+		    raw.b == WORD_BREAK_PROP_LF) {
+			break;
+		}
+
+		/* WB3b */
+		if (raw.c == WORD_BREAK_PROP_NEWLINE ||
+		    raw.c == WORD_BREAK_PROP_CR      ||
+		    raw.c == WORD_BREAK_PROP_LF) {
+			break;
+		}
+
+		/* WB3c */
+		if (raw.b == WORD_BREAK_PROP_ZWJ &&
+		    (raw.c == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
+		     raw.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
+			continue;
+		}
+
+		/* WB3d */
+		if (raw.b == WORD_BREAK_PROP_WSEGSPACE &&
+		    raw.c == WORD_BREAK_PROP_WSEGSPACE) {
+			continue;
+		}
+
+		/* WB4 */
+		if (raw.c == WORD_BREAK_PROP_EXTEND ||
+		    raw.c == WORD_BREAK_PROP_FORMAT ||
+		    raw.c == WORD_BREAK_PROP_ZWJ) {
+			continue;
+		}
+
+		/* WB5 */
+		if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
+		     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+		     skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
+		    (skip.c == WORD_BREAK_PROP_ALETTER              ||
+		     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+		     skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
+			continue;
+		}
+
+		/* WB6 */
+		if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
+		     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+		     skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
+		    (skip.c == WORD_BREAK_PROP_MIDLETTER    ||
+		     skip.c == WORD_BREAK_PROP_MIDNUMLET    ||
+		     skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+		    len > 2 &&
+		    (skip.d == WORD_BREAK_PROP_ALETTER              ||
+		     skip.d == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+		     skip.d == WORD_BREAK_PROP_HEBREW_LETTER)) {
+			continue;
+		}
+
+		/* WB7 */
+		if ((skip.b == WORD_BREAK_PROP_MIDLETTER    ||
+		     skip.b == WORD_BREAK_PROP_MIDNUMLET    ||
+		     skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+		    (skip.c == WORD_BREAK_PROP_ALETTER              ||
+		     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+		     skip.c == WORD_BREAK_PROP_HEBREW_LETTER) &&
+		    len > 2 &&
+		    (skip.a == WORD_BREAK_PROP_ALETTER              ||
+		     skip.a == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+		     skip.a == WORD_BREAK_PROP_HEBREW_LETTER)) {
+			continue;
+		}
+
+		/* WB7a */
+		if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
+		    skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) {
+			continue;
+		}
+
+		/* WB7b */
+		if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
+		    skip.c == WORD_BREAK_PROP_DOUBLE_QUOTE &&
+		    len > 2 &&
+		    skip.d == WORD_BREAK_PROP_HEBREW_LETTER) {
+			continue;
+		}
+
+		/* WB7c */
+		if (skip.b == WORD_BREAK_PROP_DOUBLE_QUOTE &&
+		    skip.c == WORD_BREAK_PROP_HEBREW_LETTER &&
+		    off > 1 &&
+		    skip.a == WORD_BREAK_PROP_HEBREW_LETTER) {
+			continue;
+		}
+
+		/* WB8 */
+		if (skip.b == WORD_BREAK_PROP_NUMERIC &&
+		    skip.c == WORD_BREAK_PROP_NUMERIC) {
+			continue;
+		}
+
+		/* WB9 */
+		if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
+		     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+		     skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
+		    skip.c == WORD_BREAK_PROP_NUMERIC) {
+			continue;
+		}
+
+		/* WB10 */
+		if (skip.b == WORD_BREAK_PROP_NUMERIC &&
+		    (skip.c == WORD_BREAK_PROP_ALETTER              ||
+		     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+		     skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
+			continue;
+		}
+
+		/* WB11 */
+		if ((skip.b == WORD_BREAK_PROP_MIDNUM       ||
+		     skip.b == WORD_BREAK_PROP_MIDNUMLET    ||
+		     skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+		    skip.c == WORD_BREAK_PROP_NUMERIC &&
+		    off > 1 &&
+		    skip.a == WORD_BREAK_PROP_NUMERIC) {
+			continue;
+		}
+
+		/* WB12 */
+		if (skip.b == WORD_BREAK_PROP_NUMERIC &&
+		    (skip.c == WORD_BREAK_PROP_MIDNUM       ||
+		     skip.c == WORD_BREAK_PROP_MIDNUMLET    ||
+		     skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+		    len > 2 &&
+		    skip.d == WORD_BREAK_PROP_NUMERIC) {
+			continue;
+		}
+
+		/* WB13 */
+		if (skip.b == WORD_BREAK_PROP_KATAKANA &&
+		    skip.c == WORD_BREAK_PROP_KATAKANA) {
+			continue;
+		}
+
+		/* WB13a */
+		if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
+		     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+		     skip.b == WORD_BREAK_PROP_HEBREW_LETTER        ||
+		     skip.b == WORD_BREAK_PROP_NUMERIC              ||
+		     skip.b == WORD_BREAK_PROP_KATAKANA             ||
+		     skip.b == WORD_BREAK_PROP_EXTENDNUMLET) &&
+		    skip.c == WORD_BREAK_PROP_EXTENDNUMLET) {
+			continue;
+		}
+
+		/* WB13b */
+		if (skip.b == WORD_BREAK_PROP_EXTENDNUMLET &&
+		    (skip.c == WORD_BREAK_PROP_ALETTER              ||
+		     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+		     skip.c == WORD_BREAK_PROP_HEBREW_LETTER        ||
+		     skip.c == WORD_BREAK_PROP_NUMERIC              ||
+		     skip.c == WORD_BREAK_PROP_KATAKANA)) {
+			continue;
+		}
+
+		/* WB15 and WB16 */
+		if (!ri_even &&
+		    skip.c == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
+			continue;
+		}
+
+		/* WB999 */
+		break;
+	}
+
+	return off;
+}
+
+size_t
+grapheme_next_word_break(const uint_least32_t *str, size_t len)
+{
+	return next_word_break(str, len, get_codepoint);
+}
+
+size_t
+grapheme_next_word_break_utf8(const char *str, size_t len)
+{
+	return next_word_break(str, len, get_codepoint_utf8);
+}
diff --git a/test/character.c b/test/character.c
@@ -3,19 +3,15 @@
 #include <stdint.h>
 
 #include "../gen/character-test.h"
+#include "../grapheme.h"
 #include "util.h"
 
-static bool
-is_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STATE *state)
-{
-	return grapheme_is_character_break(cp0, cp1, state);
-}
-
 int
 main(int argc, char *argv[])
 {
 	(void)argc;
 
-	return run_break_tests(is_break, character_test,
-	                       LEN(character_test), argv[0]);
+	return run_break_tests(grapheme_next_character_break,
+	                       character_break_test,
+	                       LEN(character_break_test), argv[0]);
 }
diff --git a/test/util.c b/test/util.c
@@ -10,30 +10,25 @@
 #include "util.h"
 
 int
-run_break_tests(bool (*is_break)(uint_least32_t, uint_least32_t, GRAPHEME_STATE *),
+run_break_tests(size_t (*next_break)(const uint_least32_t *, size_t),
                 const struct break_test *test, size_t testlen, const char *argv0)
 {
 	GRAPHEME_STATE state;
-	size_t i, j, k, len, failed;
+	size_t i, j, off, res, failed;
 
 	/* character break test */
 	for (i = 0, failed = 0; i < testlen; i++) {
-		memset(&state, 0, sizeof(state));
-		for (j = 0, k = 0, len = 1; j < test[i].cplen; j++) {
-			if ((j + 1) == test[i].cplen ||
-			    is_break(test[i].cp[j], test[i].cp[j + 1],
-			             &state)) {
-				/* check if our resulting length matches */
-				if (k == test[i].lenlen ||
-				    len != test[i].len[k++]) {
-					fprintf(stderr, "%s: Failed test \"%s\".\n",
-					        argv0, test[i].descr);
-					failed++;
-					break;
-				}
-				len = 1;
-			} else {
-				len++;
+		for (j = 0, off = 0; off < test[i].cplen; off += res) {
+			res = next_break(test[i].cp + off, test[i].cplen - off);
+
+			/* check if our resulting offset matches */
+			if (j == test[i].lenlen ||
+			    res != test[i].len[j++]) {
+				fprintf(stderr, "%s: Failed test %zu \"%s\".\n",
+				        argv0, i, test[i].descr);
+				fprintf(stderr, "J=%zu: EXPECTED len %zu, got %zu\n", j-1, test[i].len[j-1], res);
+				failed++;
+				break;
 			}
 		}
 	}
diff --git a/test/util.h b/test/util.h
@@ -7,8 +7,8 @@
 
 #define LEN(x) (sizeof(x) / sizeof(*(x)))
 
-int run_break_tests(bool (*is_break)(uint_least32_t, uint_least32_t,
-                    GRAPHEME_STATE *), const struct break_test *test,
-                    size_t testlen, const char *);
+int run_break_tests(size_t (*next_break)(const uint_least32_t *, size_t),
+                    const struct break_test *test, size_t testlen,
+                    const char *);
 
 #endif /* UTIL_H */
diff --git a/test/word.c b/test/word.c
@@ -0,0 +1,16 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "../gen/word-test.h"
+#include "../grapheme.h"
+#include "util.h"
+
+int
+main(int argc, char *argv[])
+{
+	(void)argc;
+
+	return run_break_tests(grapheme_next_word_break, word_break_test,
+	                       LEN(word_break_test), argv[0]);
+}

	libgrapheme unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log \| Files \| Refs \| README \| LICENSE

M	Makefile	\|	24	++++++++++++++++++++++++
M	benchmark/character.c	\|	5	+++--
M	benchmark/utf8-decode.c	\|	4	++--
A	benchmark/word.c	\|	52	++++++++++++++++++++++++++++++++++++++++++++++++++++
M	gen/character-test.c	\|	2	+-
M	gen/character.c	\|	4	+++-
M	gen/util.c	\|	27	++++++++++++++++++++-------
M	gen/util.h	\|	6	++++--
A	gen/word-test.c	\|	19	+++++++++++++++++++
A	gen/word.c	\|	158	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	grapheme.h	\|	8	++++++--
D	man/grapheme_next_character_break.3	\|	92	-------------------------------------------------------------------------------
A	man/grapheme_next_character_break_utf8.3	\|	92	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	src/character.c	\|	44	++++++++++++++++++++++++++++++++++----------
A	src/word.c	\|	373	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	test/character.c	\|	12	++++--------
M	test/util.c	\|	31	+++++++++++++------------------
M	test/util.h	\|	6	+++---
A	test/word.c	\|	16	++++++++++++++++