Refactor data-generation and library structure - libgrapheme

commit 0e3d5f60213ba55935364c73422b373ac380f574
parent f334f95e146045257631c605510413ba8de4639d
Author: Laslo Hunhold <dev@frign.de>
Date:   Wed,  8 Dec 2021 17:47:58 +0100

Refactor data-generation and library structure

What I never liked was the fact that you had to keep two heisenstates
in grapheme_boundary() (one for the grapheme-proptable and one for the
emoji-proptable), which unnecessarily complicated the handling a little
bit. Both property sets now live in a single table, so one heisenstate
per code point suffices, even though there is still room for improvement.

A new folder gen was created to contain the generation tools. The data
folder from now on only contains data files.

Now gen/util.c contains all functions necessary to properly parse
property files (and test files); you merely have to create an
"order list" (e.g. in gen/grapheme.c and gen/grapheme-test.c) and are
then good to go. This doesn't remove a great deal of code duplication
yet, but it will come in handy in the future.

Additionally, src/boundary.c was moved into src/grapheme.c so there's
only one object file pulling in the data-table. This separation makes
the structure of the program clearer and helps the linker discard
unused library elements.

The heisenstate was increased to 64 bits for future use.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M LICENSE  | 2 +-
M Makefile  | 54 +++++++++++++++++++++++++++---------------------------
R data/grapheme_boundary.txt -> data/GraphemeBreakProperty.txt  | 0 
R data/grapheme_boundary_test.txt -> data/GraphemeBreakTest.txt  | 0 
D data/datautil.c  | 159 -------------------------------------------------------------------------------
D data/datautil.h  | 20 --------------------
R data/emoji.txt -> data/emoji-data.txt  | 0 
D data/emoji.c  | 78 ------------------------------------------------------------------------------
D data/grapheme_boundary.c  | 138 -------------------------------------------------------------------------------
D data/grapheme_boundary_test.c  | 139 -------------------------------------------------------------------------------
A gen/grapheme-test.c  | 18 ++++++++++++++++++
A gen/grapheme.c  | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A gen/util.c  | 384 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A gen/util.h  | 37 +++++++++++++++++++++++++++++++++++++
D src/boundary.c  | 181 -------------------------------------------------------------------------------
M src/grapheme.c  | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/util.c  | 32 ++++++++++++++++++++++++++++++--
M src/util.h  | 9 ++++++---
A test/grapheme.c  | 43 +++++++++++++++++++++++++++++++++++++++++++
D test/grapheme_boundary.c  | 41 -----------------------------------------

20 files changed, 788 insertions(+), 789 deletions(-)
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 ISC-License
 
-Copyright 2019-2020 Laslo Hunhold <dev@frign.de>
+Copyright 2019-2021 Laslo Hunhold <dev@frign.de>
 
 Permission to use, copy, modify, and/or distribute this software for any
 purpose with or without fee is hereby granted, provided that the above
diff --git a/Makefile b/Makefile
@@ -4,52 +4,52 @@
 
 include config.mk
 
-LIB = src/boundary src/codepoint src/grapheme src/util
-TEST = test/grapheme_boundary test/utf8-decode test/utf8-encode
-DATA = data/emoji data/grapheme_boundary data/grapheme_boundary_test
+DATA =\
+	data/emoji-data.txt\
+	data/GraphemeBreakProperty.txt\
+	data/GraphemeBreakTest.txt
+GEN = gen/grapheme gen/grapheme-test
+LIB = src/codepoint src/grapheme src/util
+TEST = test/grapheme test/utf8-decode test/utf8-encode
 
 MAN3 = man/grapheme_bytelen.3
 MAN7 = man/libgrapheme.7
 
 all: libgrapheme.a libgrapheme.so
 
-data/emoji.h: data/emoji.txt data/emoji
-data/grapheme_boundary.h: data/grapheme_boundary.txt data/grapheme_boundary
-data/grapheme_boundary_test.h: data/grapheme_boundary_test.txt data/grapheme_boundary_test
-
-data/emoji.o: data/emoji.c config.mk data/datautil.h
-data/grapheme_boundary.o: data/grapheme_boundary.c config.mk data/datautil.h
-data/grapheme_boundary_test.o: data/grapheme_boundary_test.c config.mk data/datautil.h
-data/datautil.o: data/datautil.c config.mk data/datautil.h
-src/boundary.o: src/boundary.c config.mk data/emoji.h data/grapheme_boundary.h grapheme.h
+gen/grapheme.o: gen/grapheme.c config.mk gen/util.h
+gen/grapheme-test.o: gen/grapheme-test.c config.mk gen/util.h
+gen/util.o: gen/util.c config.mk gen/util.h
 src/codepoint.o: src/codepoint.c config.mk grapheme.h
-src/grapheme.o: src/grapheme.c config.mk grapheme.h
+src/grapheme.o: src/grapheme.c config.mk gen/grapheme.h grapheme.h src/util.h
 src/util.o: src/util.c config.mk src/util.h
-test/grapheme_boundary.o: test/grapheme_boundary.c config.mk data/grapheme_boundary_test.h grapheme.h
+test/grapheme.o: test/grapheme.c config.mk gen/grapheme-test.h grapheme.h
 test/utf8-encode.o: test/utf8-encode.c config.mk grapheme.h
 test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h
 
-data/emoji: data/emoji.o data/datautil.o
-data/grapheme_boundary: data/grapheme_boundary.o data/datautil.o
-data/grapheme_boundary_test: data/grapheme_boundary_test.o data/datautil.o
-test/grapheme_boundary: test/grapheme_boundary.o libgrapheme.a
+gen/grapheme: gen/grapheme.o gen/util.o
+gen/grapheme-test: gen/grapheme-test.o gen/util.o
+test/grapheme: test/grapheme.o libgrapheme.a
 test/utf8-encode: test/utf8-encode.o libgrapheme.a
 test/utf8-decode: test/utf8-decode.o libgrapheme.a
 
-data/emoji.txt:
+gen/grapheme.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/grapheme
+gen/grapheme-test.h: data/GraphemeBreakTest.txt gen/grapheme-test
+
+data/emoji-data.txt:
 	wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
 
-data/grapheme_boundary.txt:
+data/GraphemeBreakProperty.txt:
 	wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
 
-data/grapheme_boundary_test.txt:
+data/GraphemeBreakTest.txt:
 	wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakTest.txt
 
-$(DATA:=.h):
-	$(@:.h=) < $(@:.h=.txt) > $@
+$(GEN):
+	$(CC) -o $@ $(LDFLAGS) $@.o gen/util.o
 
-$(DATA):
-	$(CC) -o $@ $(LDFLAGS) $@.o data/datautil.o
+$(GEN:=.h):
+	$(@:.h=) > $@
 
 $(TEST):
 	$(CC) -o $@ $(LDFLAGS) $@.o libgrapheme.a
@@ -86,7 +86,7 @@ uninstall:
 	rm -f "$(DESTDIR)$(INCPREFIX)/grapheme.h"
 
 clean:
-	rm -f $(DATA:=.h) $(DATA:=.o) data/datautil.o $(LIB:=.o) $(TEST:=.o) $(DATA) $(TEST) libgrapheme.a libgrapheme.so
+	rm -f $(GEN:=.h) $(GEN:=.o) $(GEN) gen/util.o $(LIB:=.o) $(TEST:=.o) $(TEST) libgrapheme.a libgrapheme.so
 
 clean-data:
-	rm -f $(DATA:=.txt)
+	rm -f $(DATA)
diff --git a/data/grapheme_boundary.txt b/data/GraphemeBreakProperty.txt
diff --git a/data/grapheme_boundary_test.txt b/data/GraphemeBreakTest.txt
diff --git a/data/datautil.c b/data/datautil.c
@@ -1,159 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <errno.h>
-
-#include "datautil.h"
-
-void
-parse_input(int (*process_line)(char **, size_t, char *))
-{
-	char *line = NULL, **field = NULL, *comment;
-	size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields;
-	ssize_t len;
-
-	while ((len = getline(&line, &linebufsize, stdin)) >= 0) {
-		/* remove trailing newline */
-		if (len > 0 && line[len - 1] == '\n') {
-			line[len - 1] = '\0';
-			len--;
-		}
-
-		/* skip empty lines and comment lines */
-		if (len == 0 || line[0] == '#') {
-			continue;
-		}
-
-		/* tokenize line into fields */
-		for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) {
-			/* extend field buffer, if necessary */
-			if (++nfields > fieldbufsize) {
-				if ((field = realloc(field, nfields *
-				                     sizeof(*field))) == NULL) {
-					fprintf(stderr, "realloc: %s\n", strerror(errno));
-					exit(1);
-				}
-				fieldbufsize = nfields;
-			}
-
-			/* skip leading whitespace */
-			while (line[i] == ' ') {
-				i++;
-			}
-
-			/* set current position as field start */
-			field[nfields - 1] = &line[i];
-
-			/* continue until we reach ';' or '#' or end */
-			while (line[i] != ';' && line[i] != '#' &&
-			       line[i] != '\0') {
-				i++;
-			}
-			if (line [i] == '#') {
-				/* set comment-variable for later */
-				comment = &line[i + 1];
-			}
-
-			/* go back whitespace and terminate field there */
-			if (i > 0) {
-				for (j = i - 1; line[j] == ' '; j--)
-					;
-				line[j + 1] = '\0';
-			} else {
-				line[i] = '\0';
-			}
-
-			/* if comment is set, we are done */
-			if (comment != NULL) {
-				break;
-			}
-		}
-
-		/* skip leading whitespace in comment */
-		while (comment != NULL && comment[0] == ' ') {
-			comment++;
-		}
-
-		/* call line processing function */
-		if (process_line(field, nfields, comment)) {
-			exit(1);
-		}
-	}
-
-	free(line);
-	free(field);
-}
-
-static int
-valid_hexstring(const char *str)
-{
-	const char *p = str;
-
-	while ((*p >= '0' && *p <= '9') ||
-	       (*p >= 'a' && *p <= 'f') ||
-	       (*p >= 'A' && *p <= 'F')) {
-		p++;
-	}
-
-	if (*p != '\0') {
-		fprintf(stderr, "invalid code point range '%s'\n", str);
-		return 0;
-	}
-
-	return 1;
-}
-
-int
-cp_parse(const char *str, uint32_t *cp)
-{
-	if (!valid_hexstring(str)) {
-		return 1;
-	}
-	*cp = strtol(str, NULL, 16);
-
-	return 0;
-}
-
-int
-range_parse(const char *str, struct range *range)
-{
-	char *p;
-
-	if ((p = strstr(str, "..")) == NULL) {
-		/* input has the form "XXXXXX" */
-		if (!valid_hexstring(str)) {
-			return 1;
-		}
-		range->lower = range->upper = strtol(str, NULL, 16);
-	} else {
-		/* input has the form "XXXXXX..XXXXXX" */
-		*p = '\0';
-		p += 2;
-		if (!valid_hexstring(str) || !valid_hexstring(p)) {
-			return 1;
-		}
-		range->lower = strtol(str, NULL, 16);
-		range->upper = strtol(p, NULL, 16);
-	}
-
-	return 0;
-}
-
-void
-range_list_append(struct range **range, size_t *nranges, const struct range *new)
-{
-	if (*nranges > 0 && (*range)[*nranges - 1].upper == new->lower) {
-		/* we can merge with previous entry */
-		(*range)[*nranges - 1].upper = new->upper;
-	} else {
-		/* need to append new entry */
-		if ((*range = realloc(*range, (++(*nranges)) * sizeof(**range))) == NULL) {
-			fprintf(stderr, "realloc: %s\n", strerror(errno));
-			exit(1);
-		}
-		(*range)[*nranges - 1].lower = new->lower;
-		(*range)[*nranges - 1].upper = new->upper;
-	}
-}
diff --git a/data/datautil.h b/data/datautil.h
@@ -1,20 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#ifndef DATAUTIL_H
-#define DATAUTIL_H
-
-#include <stddef.h>
-#include <stdint.h>
-
-#define LEN(x) (sizeof (x) / sizeof *(x))
-
-struct range {
-	uint32_t lower;
-	uint32_t upper;
-};
-
-void parse_input(int (*process_line)(char **, size_t, char *));
-int cp_parse(const char *, uint32_t *);
-int range_parse(const char *, struct range *);
-void range_list_append(struct range **, size_t *, const struct range *);
-
-#endif /* DATAUTIL_H */
diff --git a/data/emoji.txt b/data/emoji-data.txt
diff --git a/data/emoji.c b/data/emoji.c
@@ -1,78 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "datautil.h"
-
-static struct {
-	char         *enumname;
-	char         *identifier;
-	struct range *table;
-	size_t        tablelen;
-} properties[] = {
-	{
-		/* extended pictographic */
-		.enumname   = "EMOJI_PROP_EXTPICT",
-		.identifier = "Extended_Pictographic",
-	},
-};
-
-int
-process_line(char **field, size_t nfields, char *comment)
-{
-	size_t i;
-	struct range r;
-
-	(void)comment;
-
-	if (nfields < 2) {
-		return 1;
-	}
-
-	for (i = 0; i < LEN(properties); i++) {
-		if (!strcmp(field[1], properties[i].identifier)) {
-			if (range_parse(field[0], &r)) {
-				return 1;
-			}
-			range_list_append(&(properties[i].table),
-			                  &(properties[i].tablelen), &r);
-			break;
-		}
-	}
-
-	return 0;
-}
-
-int
-main(void)
-{
-	size_t i, j;
-
-	printf("/* Automatically generated by data/emo */\n"
-	       "#include <stdint.h>\n\n#include \"../src/util.h\"\n\n");
-
-	parse_input(process_line);
-
-	/* output enum */
-	printf("enum emoji_prop {\n");
-	for (i = 0; i < LEN(properties); i++) {
-		printf("\t%s,\n", properties[i].enumname);
-	}
-	printf("};\n\n");
-
-	/* output table */
-	printf("static const struct range_list emoji_prop[] = {\n");
-	for (i = 0; i < LEN(properties); i++) {
-		printf("\t[%s] = {\n\t\t.data = (struct range[]){\n", properties[i].enumname);
-		for (j = 0; j < properties[i].tablelen; j++) {
-			printf("\t\t\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n",
-			       properties[i].table[j].lower,
-			       properties[i].table[j].upper);
-		}
-		printf("\t\t},\n\t\t.len = %zu,\n\t},\n", properties[i].tablelen);
-	}
-	printf("};\n");
-
-	return 0;
-}
diff --git a/data/grapheme_boundary.c b/data/grapheme_boundary.c
@@ -1,138 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "datautil.h"
-
-static struct {
-	char         *enumname;
-	char         *identifier;
-	struct range *table;
-	size_t        tablelen;
-} properties[] = {
-	{
-		/* carriage return */
-		.enumname   = "GB_PROP_CR",
-		.identifier = "CR",
-	},
-	{
-		/* line feed */
-		.enumname   = "GB_PROP_LF",
-		.identifier = "LF",
-	},
-	{
-		/* control character */
-		.enumname   = "GB_PROP_CONTROL",
-		.identifier = "Control",
-	},
-	{
-		/* grapheme extender */
-		.enumname   = "GB_PROP_EXTEND",
-		.identifier = "Extend",
-	},
-	{
-		/* zero width joiner */
-		.enumname   = "GB_PROP_ZWJ",
-		.identifier = "ZWJ",
-	},
-	{
-		/* regional indicator */
-		.enumname   = "GB_PROP_REGIONAL_INDICATOR",
-		.identifier = "Regional_Indicator",
-	},
-	{
-		/* prepend character */
-		.enumname   = "GB_PROP_PREPEND",
-		.identifier = "Prepend",
-	},
-	{
-		/* spacing mark */
-		.enumname   = "GB_PROP_SPACINGMARK",
-		.identifier = "SpacingMark",
-	},
-	{
-		/* hangul syllable type L */
-		.enumname   = "GB_PROP_L",
-		.identifier = "L",
-	},
-	{
-		/* hangul syllable type V */
-		.enumname   = "GB_PROP_V",
-		.identifier = "V",
-	},
-	{
-		/* hangul syllable type T */
-		.enumname   = "GB_PROP_T",
-		.identifier = "T",
-	},
-	{
-		/* hangul syllable type LV */
-		.enumname   = "GB_PROP_LV",
-		.identifier = "LV",
-	},
-	{
-		/* hangul syllable type LVT */
-		.enumname   = "GB_PROP_LVT",
-		.identifier = "LVT",
-	},
-};
-
-int
-process_line(char **field, size_t nfields, char *comment)
-{
-	size_t i;
-	struct range r;
-
-	(void)comment;
-
-	if (nfields < 2) {
-		return 1;
-	}
-
-	for (i = 0; i < LEN(properties); i++) {
-		if (!strcmp(field[1], properties[i].identifier)) {
-			if (range_parse(field[0], &r)) {
-				return 1;
-			}
-			range_list_append(&(properties[i].table),
-			                  &(properties[i].tablelen), &r);
-			break;
-		}
-	}
-
-	return 0;
-}
-
-int
-main(void)
-{
-	size_t i, j;
-
-	printf("/* Automatically generated by data/gbp */\n"
-	       "#include <stdint.h>\n\n#include \"../src/util.h\"\n\n");
-
-	parse_input(process_line);
-
-	/* output enum */
-	printf("enum gb_prop {\n");
-	for (i = 0; i < LEN(properties); i++) {
-		printf("\t%s,\n", properties[i].enumname);
-	}
-	printf("};\n\n");
-
-	/* output table */
-	printf("static const struct range_list gb_prop[] = {\n");
-	for (i = 0; i < LEN(properties); i++) {
-		printf("\t[%s] = {\n\t\t.data = (struct range[]){\n", properties[i].enumname);
-		for (j = 0; j < properties[i].tablelen; j++) {
-			printf("\t\t\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n",
-			       properties[i].table[j].lower,
-			       properties[i].table[j].upper);
-		}
-		printf("\t\t},\n\t\t.len = %zu,\n\t},\n", properties[i].tablelen);
-	}
-	printf("};\n");
-
-	return 0;
-}
diff --git a/data/grapheme_boundary_test.c b/data/grapheme_boundary_test.c
@@ -1,139 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-
-#include "datautil.h"
-
-struct break_test {
-	uint32_t *cp;
-	size_t cplen;
-	size_t *len;
-	size_t lenlen;
-	char *descr;
-};
-
-static struct break_test *test = NULL;
-static size_t ntests = 0;
-
-int
-process_line(char **field, size_t nfields, char *comment)
-{
-	struct break_test *t;
-	size_t i;
-	char *token;
-
-	if (nfields < 1) {
-		return 1;
-	}
-
-	/* append new testcase and initialize with zeroes */
-	if ((test = realloc(test, ++ntests * sizeof(*test))) == NULL) {
-		fprintf(stderr, "realloc: %s\n", strerror(errno));
-		return 1;
-	}
-	t = &test[ntests - 1];
-	memset(t, 0, sizeof(*t));
-
-	/* parse testcase "<÷|×> <cp> <÷|×> ... <cp> <÷|×>" */
-	for (token = strtok(field[0], " "), i = 0; token != NULL; i++,
-	     token = strtok(NULL, " ")) {
-		if (i % 2 == 0) {
-			/* delimiter */
-			if (!strncmp(token, "\xC3\xB7", 2)) { /* UTF-8 */
-				/*
-				 * '÷' indicates a breakpoint,
-				 * the current length is done; allocate
-				 * a new length field and set it to 0
-				 */
-				if ((t->len = realloc(t->len,
-				     ++t->lenlen * sizeof(*t->len))) == NULL) {
-					fprintf(stderr, "realloc: %s\n",
-					        strerror(errno));
-					return 1;
-				}
-				t->len[t->lenlen - 1] = 0;
-			} else if (!strncmp(token, "\xC3\x97", 2)) { /* UTF-8 */
-				/*
-				 * '×' indicates a non-breakpoint, do nothing
-				 */
-			} else {
-				fprintf(stderr, "malformed delimiter '%s'\n",
-				        token);
-				return 1;
-			}
-		} else {
-			/* add code point to cp-array */
-			if ((t->cp = realloc(t->cp, ++t->cplen *
-			                     sizeof(*t->cp))) == NULL) {
-				fprintf(stderr, "realloc: %s\n", strerror(errno));
-				return 1;
-			}
-			if (cp_parse(token, &t->cp[t->cplen - 1])) {
-				return 1;
-			}
-			if (t->lenlen > 0) {
-				t->len[t->lenlen - 1]++;
-			}
-		}
-	}
-	if (t->len[t->lenlen - 1] == 0) {
-		/* we allocated one more length than we needed */
-		t->lenlen--;
-	}
-
-	/* store comment */
-	if ((test[ntests - 1].descr = strdup(comment)) == NULL) {
-		fprintf(stderr, "strdup: %s\n", strerror(errno));
-		return 1;
-	}
-
-	return 0;
-}
-
-int
-main(void)
-{
-	size_t i, j;
-
-	printf("/* Automatically generated by data/gbt */\n"
-	       "#include <stdint.h>\n#include <stddef.h>\n\n");
-
-	parse_input(process_line);
-
-	printf("static const struct break_test {\n\tuint32_t *cp;\n"
-	       "\tsize_t cplen;\n\tsize_t *len;\n\tsize_t lenlen;\n"
-	       "\tchar *descr;\n} t[] = {\n");
-	for (i = 0; i < ntests; i++) {
-		printf("\t{\n");
-
-		printf("\t\t.cp     = (uint32_t[]){");
-		for (j = 0; j < test[i].cplen; j++) {
-			printf(" UINT32_C(0x%06X)", test[i].cp[j]);
-			if (j + 1 < test[i].cplen) {
-				putchar(',');
-			}
-		}
-		printf(" },\n");
-		printf("\t\t.cplen  = %zu,\n", test[i].cplen);
-
-		printf("\t\t.len    = (size_t[]){");
-		for (j = 0; j < test[i].lenlen; j++) {
-			printf(" %zu", test[i].len[j]);
-			if (j + 1 < test[i].lenlen) {
-				putchar(',');
-			}
-		}
-		printf(" },\n");
-		printf("\t\t.lenlen = %zu,\n", test[i].lenlen);
-
-		printf("\t\t.descr  = \"%s\",\n", test[i].descr);
-
-		printf("\t},\n");
-	}
-	printf("};\n");
-
-	return 0;
-}
diff --git a/gen/grapheme-test.c b/gen/grapheme-test.c
@@ -0,0 +1,18 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+
+#include "util.h"
+
+int
+main(int argc, char *argv[])
+{
+	struct segment_test *st = NULL;
+	size_t numsegtests = 0;
+
+	(void)argc;
+
+	segment_test_list_parse("data/GraphemeBreakTest.txt", &st, &numsegtests);
+	segment_test_list_print(st, numsegtests, "grapheme_test", argv[0]);
+
+	return 0;
+}
diff --git a/gen/grapheme.c b/gen/grapheme.c
@@ -0,0 +1,92 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+
+#include "util.h"
+
+#define FILE_EMOJI    "data/emoji-data.txt"
+#define FILE_GRAPHEME "data/GraphemeBreakProperty.txt"
+
+static struct property segment_property[] = {
+	{
+		.enumname   = "GRAPHEME_PROP_CONTROL",
+		.identifier = "Control",
+		.fname      = FILE_GRAPHEME,
+	},
+	{
+		.enumname   = "GRAPHEME_PROP_CR",
+		.identifier = "CR",
+		.fname      = FILE_GRAPHEME,
+	},
+	{
+		.enumname   = "GRAPHEME_PROP_EXTEND",
+		.identifier = "Extend",
+		.fname      = FILE_GRAPHEME,
+	},
+	{
+		.enumname   = "GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC",
+		.identifier = "Extended_Pictographic",
+		.fname      = FILE_EMOJI,
+	},
+	{
+		.enumname   = "GRAPHEME_PROP_HANGUL_L",
+		.identifier = "L",
+		.fname      = FILE_GRAPHEME,
+	},
+	{
+		.enumname   = "GRAPHEME_PROP_HANGUL_V",
+		.identifier = "V",
+		.fname      = FILE_GRAPHEME,
+	},
+	{
+		.enumname   = "GRAPHEME_PROP_HANGUL_T",
+		.identifier = "T",
+		.fname      = FILE_GRAPHEME,
+	},
+	{
+		.enumname   = "GRAPHEME_PROP_HANGUL_LV",
+		.identifier = "LV",
+		.fname      = FILE_GRAPHEME,
+	},
+	{
+		.enumname   = "GRAPHEME_PROP_HANGUL_LVT",
+		.identifier = "LVT",
+		.fname      = FILE_GRAPHEME,
+	},
+	{
+		.enumname   = "GRAPHEME_PROP_LF",
+		.identifier = "LF",
+		.fname      = FILE_GRAPHEME,
+	},
+	{
+		.enumname   = "GRAPHEME_PROP_PREPEND",
+		.identifier = "Prepend",
+		.fname      = FILE_GRAPHEME,
+	},
+	{
+		.enumname   = "GRAPHEME_PROP_REGIONAL_INDICATOR",
+		.identifier = "Regional_Indicator",
+		.fname      = FILE_GRAPHEME,
+	},
+	{
+		.enumname   = "GRAPHEME_PROP_SPACINGMARK",
+		.identifier = "SpacingMark",
+		.fname      = FILE_GRAPHEME,
+	},
+	{
+		.enumname   = "GRAPHEME_PROP_ZWJ",
+		.identifier = "ZWJ",
+		.fname      = FILE_GRAPHEME,
+	},
+};
+
+int
+main(int argc, char *argv[])
+{
+	(void)argc;
+
+	property_list_parse(segment_property, LEN(segment_property));
+	property_list_print(segment_property, LEN(segment_property),
+	                    "grapheme_prop", argv[0]);
+
+	return 0;
+}
diff --git a/gen/util.c b/gen/util.c
@@ -0,0 +1,384 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+#include "util.h"
+
+struct property_list_payload
+{
+	struct property *prop;
+	size_t numprops;
+};
+
+struct segment_test_payload
+{
+	struct segment_test **st;
+	size_t *numsegtests;
+};
+
+static int
+valid_hexstring(const char *str)
+{
+	const char *p = str;
+
+	while ((*p >= '0' && *p <= '9') ||
+	       (*p >= 'a' && *p <= 'f') ||
+	       (*p >= 'A' && *p <= 'F')) {
+		p++;
+	}
+
+	if (*p != '\0') {
+		fprintf(stderr, "valid_hexstring: Invalid code point range '%s'\n", str);
+		return 0;
+	}
+
+	return 1;
+}
+
+static int
+cp_parse(const char *str, uint32_t *cp)
+{
+	if (!valid_hexstring(str)) {
+		return 1;
+	}
+	*cp = strtol(str, NULL, 16);
+
+	return 0;
+}
+
+static int
+range_parse(const char *str, struct range *range)
+{
+	char *p;
+
+	if ((p = strstr(str, "..")) == NULL) {
+		/* input has the form "XXXXXX" */
+		if (!valid_hexstring(str)) {
+			return 1;
+		}
+		range->lower = range->upper = strtol(str, NULL, 16);
+	} else {
+		/* input has the form "XXXXXX..XXXXXX" */
+		*p = '\0';
+		p += 2;
+		if (!valid_hexstring(str) || !valid_hexstring(p)) {
+			return 1;
+		}
+		range->lower = strtol(str, NULL, 16);
+		range->upper = strtol(p, NULL, 16);
+	}
+
+	return 0;
+}
+
+void
+range_list_append(struct range **range, size_t *nranges, const struct range *new)
+{
+	if (*nranges > 0 && (*range)[*nranges - 1].upper == new->lower) {
+		/* we can merge with previous entry */
+		(*range)[*nranges - 1].upper = new->upper;
+	} else {
+		/* need to append new entry */
+		if ((*range = realloc(*range, (++(*nranges)) * sizeof(**range))) == NULL) {
+			fprintf(stderr, "realloc: %s\n", strerror(errno));
+			exit(1);
+		}
+		(*range)[*nranges - 1].lower = new->lower;
+		(*range)[*nranges - 1].upper = new->upper;
+	}
+}
+
+void parse_file_with_callback(char *fname, int (*callback)(char *, char **, size_t, char *, void *), void *payload)
+{
+	FILE *fp;
+	char *line = NULL, **field = NULL, *comment;
+	size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields;
+	ssize_t len;
+
+	/* open file */
+	if (!(fp = fopen(fname, "r"))) {
+		fprintf(stderr, "fopen '%s': %s\n", fname,
+		        strerror(errno));
+		exit(1);
+	}
+
+	while ((len = getline(&line, &linebufsize, fp)) >= 0) {
+		/* remove trailing newline */
+		if (len > 0 && line[len - 1] == '\n') {
+			line[len - 1] = '\0';
+			len--;
+		}
+
+		/* skip empty lines and comment lines */
+		if (len == 0 || line[0] == '#') {
+			continue;
+		}
+
+		/* tokenize line into fields */
+		for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) {
+			/* extend field buffer, if necessary */
+			if (++nfields > fieldbufsize) {
+				if ((field = realloc(field, nfields *
+                                     sizeof(*field))) == NULL) {
+					fprintf(stderr, "realloc: %s\n", strerror(errno));
+					exit(1);
+				}
+				fieldbufsize = nfields;
+			}
+
+			/* skip leading whitespace */
+			while (line[i] == ' ') {
+				i++;
+			}
+
+			/* set current position as field start */
+			field[nfields - 1] = &line[i];
+
+			/* continue until we reach ';' or '#' or end */
+			while (line[i] != ';' && line[i] != '#' &&
+			       line[i] != '\0') {
+				i++;
+			}
+			if (line[i] == '#') {
+				/* set comment-variable for later */
+				comment = &line[i + 1];
+			}
+
+			/* go back whitespace and terminate field there */
+			if (i > 0) {
+				for (j = i - 1; line[j] == ' '; j--)
+					;
+				line[j + 1] = '\0';
+			} else {
+				line[i] = '\0';
+			}
+
+			/* if comment is set, we are done */
+			if (comment != NULL) {
+				break;
+			}
+		}
+
+		/* skip leading whitespace in comment */
+		while (comment != NULL && comment[0] == ' ') {
+			comment++;
+		}
+
+		/* call callback function */
+		if (callback(fname, field, nfields, comment, payload)) {
+			fprintf(stderr, "parse_file_with_callback: Malformed input.\n");
+			exit(1);
+		}
+	}
+}
+
+int
+property_list_callback(char *fname, char **field, size_t nfields, char *comment, void *payload)
+{
+	struct property *prop = ((struct property_list_payload *)payload)->prop;
+	struct range r;
+	size_t i, numprops = ((struct property_list_payload *)payload)->numprops;
+
+	(void)comment;
+
+	if (nfields < 2) {
+		return 1;
+	}
+
+	for (i = 0; i < numprops; i++) {
+		if (!strcmp(field[1], prop[i].identifier) &&
+		    !strcmp(fname, prop[i].fname)) {
+			if (range_parse(field[0], &r)) {
+				return 1;
+			}
+			range_list_append(&(prop[i].table),
+			                  &(prop[i].tablelen), &r);
+			break;
+		}
+	}
+
+	return 0;
+}
+
+void
+property_list_parse(struct property *prop, size_t numprops)
+{
+	struct property_list_payload pl = { .prop = prop, .numprops = numprops };
+	size_t i;
+
+	/* make sure to parse each file only once */
+	for (i = 0; i < numprops; i++) {
+		if (prop[i].tablelen > 0) {
+			/* property's file was already parsed */
+			continue;
+		}
+
+		parse_file_with_callback(prop[i].fname, property_list_callback, &pl);
+	}
+}
+
+void
+property_list_print(const struct property *prop, size_t numprops,
+                    const char *identifier, const char *progname)
+{
+	size_t i, j;
+
+	printf("/* Automatically generated by %s */\n"
+	       "#include <stdint.h>\n\n#include \"../src/util.h\"\n\n",
+	       progname);
+
+	/* print enum */
+	printf("enum %s {\n", identifier);
+	for (i = 0; i < numprops; i++) {
+		printf("\t%s,\n", prop[i].enumname);
+	}
+	printf("};\n\n");
+
+	/* print table */
+	printf("static const struct range_list %s[] = {\n", identifier);
+	for (i = 0; i < numprops; i++) {
+		printf("\t[%s] = {\n\t\t.data = (struct range[]){\n",
+		       prop[i].enumname);
+		for (j = 0; j < prop[i].tablelen; j++) {
+			printf("\t\t\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n",
+			       prop[i].table[j].lower,
+			       prop[i].table[j].upper);
+		}
+		printf("\t\t},\n\t\t.len = %zu,\n\t},\n", prop[i].tablelen);
+	}
+	printf("};\n");
+}
+
+int
+segment_test_callback(char *fname, char **field, size_t nfields, char *comment, void *payload)
+{
+	struct segment_test *t, **test = ((struct segment_test_payload *)payload)->st;
+	size_t i, *ntests = ((struct segment_test_payload *)payload)->numsegtests;
+	char *token;
+
+	(void)fname;
+
+	if (nfields < 1) {
+		return 1;
+	}
+
+	/* append new testcase and initialize with zeroes */
+	if ((*test = realloc(*test, ++(*ntests) * sizeof(**test))) == NULL) {
+		fprintf(stderr, "realloc: %s\n", strerror(errno));
+		return 1;
+	}
+	t = &(*test)[*ntests - 1];
+	memset(t, 0, sizeof(*t));
+
+	/* parse testcase "<÷|×> <cp> <÷|×> ... <cp> <÷|×>" */
+	for (token = strtok(field[0], " "), i = 0; token != NULL; i++,
+	     token = strtok(NULL, " ")) {
+		if (i % 2 == 0) {
+			/* delimiter */
+			if (!strncmp(token, "\xC3\xB7", 2)) { /* UTF-8 */
+				/*
+				 * '÷' indicates a breakpoint,
+				 * the current length is done; allocate
+				 * a new length field and set it to 0
+				 */
+				if ((t->len = realloc(t->len,
+				     ++t->lenlen * sizeof(*t->len))) == NULL) {
+					fprintf(stderr, "realloc: %s\n",
+					        strerror(errno));
+					return 1;
+				}
+				t->len[t->lenlen - 1] = 0;
+			} else if (!strncmp(token, "\xC3\x97", 2)) { /* UTF-8 */
+				/*
+				 * '×' indicates a non-breakpoint, do nothing
+				 */
+			} else {
+				fprintf(stderr, "malformed delimiter '%s'\n",
+				        token);
+				return 1;
+			}
+		} else {
+			/* add code point to cp-array */
+			if ((t->cp = realloc(t->cp, ++t->cplen *
+			                     sizeof(*t->cp))) == NULL) {
+				fprintf(stderr, "realloc: %s\n", strerror(errno));
+				return 1;
+			}
+			if (cp_parse(token, &t->cp[t->cplen - 1])) {
+				return 1;
+			}
+			if (t->lenlen > 0) {
+				t->len[t->lenlen - 1]++;
+			}
+		}
+	}
+	if (t->len[t->lenlen - 1] == 0) {
+		/* we allocated one more length than we needed */
+		t->lenlen--;
+	}
+
+	/* store comment */
+	if (((*test)[*ntests - 1].descr = strdup(comment)) == NULL) {
+		fprintf(stderr, "strdup: %s\n", strerror(errno));
+		return 1;
+	}
+
+	return 0;
+}
+
+void
+segment_test_list_parse(char *fname, struct segment_test **st, size_t *numsegtests)
+{
+	struct segment_test_payload pl = { .st = st, .numsegtests = numsegtests };
+	*st = NULL;
+	*numsegtests = 0;
+
+	parse_file_with_callback(fname, segment_test_callback, &pl);
+}
+
+void
+segment_test_list_print(struct segment_test *st, size_t numsegtests,
+                        const char *identifier, const char *progname)
+{
+	size_t i, j;
+
+	printf("/* Automatically generated by %s */\n"
+	       "#include <stdint.h>\n#include <stddef.h>\n\n", progname);
+
+	printf("static const struct {\n\tuint32_t *cp;\n"
+	       "\tsize_t cplen;\n\tsize_t *len;\n\tsize_t lenlen;\n"
+	       "\tchar *descr;\n} %s[] = {\n", identifier);
+	for (i = 0; i < numsegtests; i++) {
+		printf("\t{\n");
+
+		printf("\t\t.cp     = (uint32_t[]){");
+		for (j = 0; j < st[i].cplen; j++) {
+			printf(" UINT32_C(0x%06X)", st[i].cp[j]);
+			if (j + 1 < st[i].cplen) {
+				putchar(',');
+			}
+		}
+		printf(" },\n");
+		printf("\t\t.cplen  = %zu,\n", st[i].cplen);
+
+		printf("\t\t.len    = (size_t[]){");
+		for (j = 0; j < st[i].lenlen; j++) {
+			printf(" %zu", st[i].len[j]);
+			if (j + 1 < st[i].lenlen) {
+				putchar(',');
+			}
+		}
+		printf(" },\n");
+		printf("\t\t.lenlen = %zu,\n", st[i].lenlen);
+
+		printf("\t\t.descr  = \"%s\",\n", st[i].descr);
+
+		printf("\t},\n");
+	}
+	printf("};\n");
+}
+
+
diff --git a/gen/util.h b/gen/util.h
@@ -0,0 +1,37 @@
+/* See LICENSE file for copyright and license details. */
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define LEN(x) (sizeof (x) / sizeof *(x))
+
+struct range {
+	uint32_t lower;
+	uint32_t upper;
+};
+
+struct property {
+	char         *enumname;
+	char         *identifier;
+	char         *fname;
+	struct range *table;
+	size_t        tablelen;
+};
+
+struct segment_test {
+	uint32_t *cp;
+	size_t cplen;
+	size_t *len;
+	size_t lenlen;
+	char *descr;
+};
+
+void property_list_parse(struct property *, size_t);
+void property_list_print(const struct property *, size_t, const char *, const char *);
+
+void segment_test_list_parse(char *, struct segment_test **, size_t *);
+void segment_test_list_print(struct segment_test *, size_t, const char *, const char *);
+
+#endif /* UTIL_H */
diff --git a/src/boundary.c b/src/boundary.c
@@ -1,181 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-#include <stdint.h>
-#include <stdlib.h>
-
-#include "../data/emoji.h"
-#include "../data/grapheme_boundary.h"
-
-enum {
-	GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */
-	GRAPHEME_STATE_EMOJI  = 1 << 1, /* within emoji modifier or zwj sequence */
-};
-
-static int
-cp_cmp(const void *a, const void *b)
-{
-	uint32_t cp = *(uint32_t *)a;
-	uint32_t *range = (uint32_t *)b;
-
-	return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]);
-}
-
-static int
-has_property(uint32_t cp, struct heisenstate *cpstate,
-             const struct range_list *proptable, int property)
-{
-	if (heisenstate_get(cpstate, property) == -1) {
-		/* state undetermined, make a lookup and set it */
-		heisenstate_set(cpstate, property, bsearch(&cp,
-		                proptable[property].data,
-		                proptable[property].len,
-				sizeof(*proptable[property].data),
-		                cp_cmp) ? 1 : 0);
-	}
-
-	return heisenstate_get(cpstate, property);
-}
-
-int
-grapheme_boundary(uint32_t a, uint32_t b, int *state)
-{
-	struct heisenstate gb[2] = { 0 }, emoji[2] = { 0 };
-	int s;
-
-	/* skip printable ASCII */
-	if ((a >= 0x20 && a <= 0x7E) &&
-	    (b >= 0x20 && b <= 0x7E)) {
-		return 1;
-	}
-
-	/* set internal state based on given state-pointer */
-	s = (state != NULL) ? *state : 0;
-
-	/*
-	 * Apply grapheme cluster breaking algorithm (UAX #29), see
-	 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
-	 */
-
-	/*
-	 * update state
-	 */
-	if (has_property(b, &gb[1], gb_prop, GB_PROP_REGIONAL_INDICATOR)) {
-		if (has_property(a, &gb[0], gb_prop, GB_PROP_REGIONAL_INDICATOR)) {
-			/* one more RI is on the left side of the seam */
-			s ^= GRAPHEME_STATE_RI_ODD;
-		} else {
-			/* an RI appeared on the right side but the left
-			   side is not an RI, reset state (0 is even) */
-			s &= ~GRAPHEME_STATE_RI_ODD;
-		}
-	}
-	if (!(*state & GRAPHEME_STATE_EMOJI) &&
-	    ((has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
-	      has_property(b, &gb[1],    gb_prop,    GB_PROP_ZWJ)) ||
-             (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
-	      has_property(b, &gb[1],    gb_prop,    GB_PROP_EXTEND)))) {
-		s |= GRAPHEME_STATE_EMOJI;
-	} else if ((*state & GRAPHEME_STATE_EMOJI) &&
-	           ((has_property(a, &gb[0],    gb_prop,    GB_PROP_ZWJ) &&
-		     has_property(b, &emoji[1], emoji_prop, EMOJI_PROP_EXTPICT)) ||
-	            (has_property(a, &gb[0],    gb_prop,    GB_PROP_EXTEND) &&
-		     has_property(b, &gb[1],    gb_prop,    GB_PROP_EXTEND)) ||
-	            (has_property(a, &gb[0],    gb_prop,    GB_PROP_EXTEND) &&
-		     has_property(b, &gb[1],    gb_prop,    GB_PROP_ZWJ)) ||
-	            (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
-		     has_property(b, &gb[1],    gb_prop,    GB_PROP_ZWJ)) ||
-	            (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
-		     has_property(b, &gb[1],    gb_prop,    GB_PROP_EXTEND)))) {
-		/* GRAPHEME_STATE_EMOJI remains */
-	} else {
-		s &= ~GRAPHEME_STATE_EMOJI;
-	}
-
-	/* write updated state to state-pointer, if given */
-	if (state != NULL) {
-		*state = s;
-	}
-
-	/*
-	 * apply rules
-	 */
-
-	/* skip GB1 and GB2, as they are never satisfied here */
-
-	/* GB3 */
-	if (has_property(a, &gb[0], gb_prop, GB_PROP_CR) &&
-	    has_property(b, &gb[1], gb_prop, GB_PROP_LF)) {
-		return 0;
-	}
-
-	/* GB4 */
-	if (has_property(a, &gb[0], gb_prop, GB_PROP_CONTROL) ||
-	    has_property(a, &gb[0], gb_prop, GB_PROP_CR) ||
-	    has_property(a, &gb[0], gb_prop, GB_PROP_LF)) {
-		return 1;
-	}
-
-	/* GB5 */
-	if (has_property(b, &gb[1], gb_prop, GB_PROP_CONTROL) ||
-	    has_property(b, &gb[1], gb_prop, GB_PROP_CR) ||
-	    has_property(b, &gb[1], gb_prop, GB_PROP_LF)) {
-		return 1;
-	}
-
-	/* GB6 */
-	if (has_property(a, &gb[0], gb_prop, GB_PROP_L) &&
-	    (has_property(b, &gb[1], gb_prop, GB_PROP_L) ||
-	     has_property(b, &gb[1], gb_prop, GB_PROP_V) ||
-	     has_property(b, &gb[1], gb_prop, GB_PROP_LV) ||
-	     has_property(b, &gb[1], gb_prop, GB_PROP_LVT))) {
-		return 0;
-	}
-
-	/* GB7 */
-	if ((has_property(a, &gb[0], gb_prop, GB_PROP_LV) ||
-	     has_property(a, &gb[0], gb_prop, GB_PROP_V)) &&
-	    (has_property(b, &gb[1], gb_prop, GB_PROP_V) ||
-	     has_property(b, &gb[1], gb_prop, GB_PROP_T))) {
-		return 0;
-	}
-
-	/* GB8 */
-	if ((has_property(a, &gb[0], gb_prop, GB_PROP_LVT) ||
-	     has_property(a, &gb[0], gb_prop, GB_PROP_T)) &&
-	    has_property(b, &gb[1], gb_prop, GB_PROP_T)) {
-		return 0;
-	}
-
-	/* GB9 */
-	if (has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND) ||
-	    has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) {
-		return 0;
-	}
-
-	/* GB9a */
-	if (has_property(b, &gb[1], gb_prop, GB_PROP_SPACINGMARK)) {
-		return 0;
-	}
-
-	/* GB9b */
-	if (has_property(a, &gb[0], gb_prop, GB_PROP_PREPEND)) {
-		return 0;
-	}
-
-	/* GB11 */
-	if ((s & GRAPHEME_STATE_EMOJI) &&
-	    has_property(a, &gb[0], gb_prop, GB_PROP_ZWJ) &&
-	    has_property(b, &emoji[1], emoji_prop, EMOJI_PROP_EXTPICT)) {
-		return 0;
-	}
-
-	/* GB12/GB13 */
-	if (has_property(a, &gb[0], gb_prop, GB_PROP_REGIONAL_INDICATOR) &&
-	    has_property(b, &gb[1], gb_prop, GB_PROP_REGIONAL_INDICATOR) &&
-	    (s & GRAPHEME_STATE_RI_ODD)) {
-		return 0;
-	}
-
-	/* GB999 */
-	return 1;
-}
diff --git a/src/grapheme.c b/src/grapheme.c
@@ -2,8 +2,158 @@
 #include <stddef.h>
 #include <stdlib.h>
 
+#include "../gen/grapheme.h"
 #include "../grapheme.h"
 
+enum {
+	GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */
+	GRAPHEME_STATE_EMOJI  = 1 << 1, /* within emoji modifier or zwj sequence */
+};
+
+/*
+ * Decide whether there is a grapheme cluster boundary between the two
+ * adjacent code points a (left of the seam) and b (right of the seam).
+ *
+ * state optionally points to an integer carrying context from one call
+ * to the next (the RI parity and emoji-sequence flags above). It should
+ * be 0 at the start of a string. It may be NULL, in which case every
+ * call starts from an empty state and nothing is written back.
+ *
+ * Returns 1 if there is a boundary between a and b, 0 otherwise.
+ */
+int
+grapheme_boundary(uint32_t a, uint32_t b, int *state)
+{
+	struct heisenstate prop[2] = { 0 };
+	int s;
+
+	/* skip printable ASCII */
+	if ((a >= 0x20 && a <= 0x7E) &&
+	    (b >= 0x20 && b <= 0x7E)) {
+		return 1;
+	}
+
+	/* set internal state based on given state-pointer */
+	s = (state != NULL) ? *state : 0;
+
+	/*
+	 * Apply grapheme cluster breaking algorithm (UAX #29), see
+	 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
+	 */
+
+	/*
+	 * update state
+	 */
+	if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) {
+		if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) {
+			/* one more RI is on the left side of the seam */
+			s ^= GRAPHEME_STATE_RI_ODD;
+		} else {
+			/* an RI appeared on the right side but the left
+			   side is not an RI, reset state (0 is even) */
+			s &= ~GRAPHEME_STATE_RI_ODD;
+		}
+	}
+
+	/*
+	 * The emoji flag is read from the local copy s rather than from
+	 * *state, because state is allowed to be NULL. The RI update
+	 * above never touches the emoji bit, so s still holds the value
+	 * the caller passed in for it.
+	 */
+	if (!(s & GRAPHEME_STATE_EMOJI) &&
+	    ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
+	      has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
+	     (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
+	      has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) {
+		s |= GRAPHEME_STATE_EMOJI;
+	} else if ((s & GRAPHEME_STATE_EMOJI) &&
+	           ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_ZWJ) &&
+		     has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) ||
+	            (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTEND) &&
+		     has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)) ||
+	            (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTEND) &&
+		     has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
+	            (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
+		     has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
+	            (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
+		     has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) {
+		/* GRAPHEME_STATE_EMOJI remains */
+	} else {
+		s &= ~GRAPHEME_STATE_EMOJI;
+	}
+
+	/* write updated state to state-pointer, if given */
+	if (state != NULL) {
+		*state = s;
+	}
+
+	/*
+	 * apply rules
+	 */
+
+	/* skip GB1 and GB2, as they are never satisfied here */
+
+	/* GB3 */
+	if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CR) &&
+	    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_LF)) {
+		return 0;
+	}
+
+	/* GB4 */
+	if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CONTROL) ||
+	    has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CR) ||
+	    has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_LF)) {
+		return 1;
+	}
+
+	/* GB5 */
+	if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_CONTROL) ||
+	    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_CR) ||
+	    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_LF)) {
+		return 1;
+	}
+
+	/* GB6 */
+	if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_L) &&
+	    (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_L) ||
+	     has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) ||
+	     has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) ||
+	     has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT))) {
+		return 0;
+	}
+
+	/* GB7 */
+	if ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) ||
+	     has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_V)) &&
+	    (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) ||
+	     has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T))) {
+		return 0;
+	}
+
+	/* GB8 */
+	if ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT) ||
+	     has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) &&
+	    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) {
+		return 0;
+	}
+
+	/* GB9 */
+	if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND) ||
+	    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) {
+		return 0;
+	}
+
+	/* GB9a */
+	if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_SPACINGMARK)) {
+		return 0;
+	}
+
+	/* GB9b */
+	if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_PREPEND)) {
+		return 0;
+	}
+
+	/* GB11 */
+	if ((s & GRAPHEME_STATE_EMOJI) &&
+	    has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_ZWJ) &&
+	    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) {
+		return 0;
+	}
+
+	/* GB12/GB13 */
+	if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) &&
+	    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) &&
+	    (s & GRAPHEME_STATE_RI_ODD)) {
+		return 0;
+	}
+
+	/* GB999 */
+	return 1;
+}
+
 size_t
 grapheme_bytelen(const char *str)
 {
diff --git a/src/util.c b/src/util.c
@@ -1,10 +1,13 @@
 /* See LICENSE file for copyright and license details. */
+#include <stdint.h>
+#include <stdlib.h>
+
 #include "util.h"
 
 int
 heisenstate_get(struct heisenstate *h, int slot)
 {
-	if (h == NULL || slot >= 16 || slot < 0 ||
+	if (h == NULL || slot >= 64 || slot < 0 ||
 	    !(h->determined & (1 << slot))) {
 		/* no state given, slot out of range or undetermined */
 		return -1;
@@ -17,7 +20,7 @@ heisenstate_get(struct heisenstate *h, int slot)
 int
 heisenstate_set(struct heisenstate *h, int slot, int state)
 {
-	if (h == NULL || slot >= 16 || slot < 0) {
+	if (h == NULL || slot >= 64 || slot < 0) {
 		/* no state given or slot out of range */
 		return 1;
 	} else {
@@ -31,3 +34,28 @@ heisenstate_set(struct heisenstate *h, int slot, int state)
 
 	return 0;
 }
+
+/*
+ * bsearch() comparator for looking up a code point in a range table.
+ * a points to the uint32_t code point being searched for, b to a table
+ * element whose first two uint32_t values are the lower and upper bound
+ * of a range (both inclusive).
+ *
+ * Returns 0 if the code point lies within the range, a negative value
+ * if it lies below the range and a positive value if it lies above it.
+ * The sign is computed by comparison instead of subtraction, as an
+ * unsigned difference converted to int does not reliably carry it.
+ */
+static int
+cp_cmp(const void *a, const void *b)
+{
+	const uint32_t cp = *(const uint32_t *)a;
+	const uint32_t *range = b;
+
+	if (cp < range[0]) {
+		return -1;
+	}
+
+	return (cp > range[1]) ? 1 : 0;
+}
+
+/*
+ * Report whether code point cp has the given property, using cpstate
+ * as a per-code-point cache: the range table proptable[property] is
+ * only searched when the slot for that property is still undetermined,
+ * and the outcome (1 or 0) is then stored in the slot.
+ *
+ * Returns whatever heisenstate_get() yields for the slot afterwards.
+ */
+int
+has_property(uint32_t cp, struct heisenstate *cpstate,
+             const struct range_list *proptable, int property)
+{
+	const struct range_list *list;
+	int found;
+
+	if (heisenstate_get(cpstate, property) == -1) {
+		/* nothing cached yet: do the lookup and remember it */
+		list = &proptable[property];
+		found = (bsearch(&cp, list->data, list->len,
+		                 sizeof(*list->data), cp_cmp) != NULL);
+		heisenstate_set(cpstate, property, found);
+	}
+
+	return heisenstate_get(cpstate, property);
+}
diff --git a/src/util.h b/src/util.h
@@ -17,13 +17,16 @@ struct range_list {
 	size_t len;
 };
 
-/* 16-slot (0,...,15) optionally undetermined binary state */
+/* 64-slot (0,...,63) optionally undetermined binary state */
 struct heisenstate {
-	uint_least16_t determined;
-	uint_least16_t state;
+	uint_least64_t determined;
+	uint_least64_t state;
 };
 
 int heisenstate_get(struct heisenstate *, int);
 int heisenstate_set(struct heisenstate *, int, int);
 
+int has_property(uint32_t, struct heisenstate *,
+                 const struct range_list *, int);
+
 #endif /* UTIL_H */
diff --git a/test/grapheme.c b/test/grapheme.c
@@ -0,0 +1,43 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "../grapheme.h"
+#include "../gen/grapheme-test.h"
+
+#define LEN(x) (sizeof(x) / sizeof(*x))
+
+/*
+ * Run every entry of grapheme_test through grapheme_boundary() and
+ * compare the resulting segment lengths with the expected ones. The
+ * description of each failing test is written to stderr and a summary
+ * to stdout. Returns 1 if at least one test failed, 0 otherwise.
+ */
+int
+main(void)
+{
+	size_t t, pos, seg, seglen, nfailed = 0;
+	int state;
+
+	/* grapheme break test */
+	for (t = 0; t < LEN(grapheme_test); t++) {
+		state = 0;
+		seg = 0;
+		seglen = 1;
+
+		for (pos = 0; pos < grapheme_test[t].cplen; pos++) {
+			/* the end of input always terminates a segment */
+			if ((pos + 1) != grapheme_test[t].cplen &&
+			    !grapheme_boundary(grapheme_test[t].cp[pos],
+			                       grapheme_test[t].cp[pos + 1],
+			                       &state)) {
+				/* no boundary, the segment keeps growing */
+				seglen++;
+				continue;
+			}
+
+			/* a segment ended, compare it with the expectation */
+			if (seg == grapheme_test[t].lenlen ||
+			    seglen != grapheme_test[t].len[seg++]) {
+				fprintf(stderr, "Failed \"%s\"\n",
+				        grapheme_test[t].descr);
+				nfailed++;
+				break;
+			}
+			seglen = 1;
+		}
+	}
+	printf("Grapheme break test: Passed %zu out of %zu tests.\n",
+	       LEN(grapheme_test) - nfailed, LEN(grapheme_test));
+
+	return (nfailed > 0) ? 1 : 0;
+}
diff --git a/test/grapheme_boundary.c b/test/grapheme_boundary.c
@@ -1,41 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "../grapheme.h"
-#include "../data/grapheme_boundary_test.h"
-
-#define LEN(x) (sizeof(x) / sizeof(*x))
-
-int
-main(void)
-{
-	int state;
-	size_t i, j, k, len, failed;
-
-	/* grapheme break test */
-	for (i = 0, failed = 0; i < LEN(t); i++) {
-		for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) {
-			if ((j + 1) == t[i].cplen ||
-			    grapheme_boundary(t[i].cp[j], t[i].cp[j + 1],
-			                      &state)) {
-				/* check if our resulting length matches */
-				if (k == t[i].lenlen || len != t[i].len[k++]) {
-					fprintf(stderr, "Failed \"%s\"\n",
-					        t[i].descr);
-					failed++;
-					break;
-				}
-				len = 1;
-			} else {
-				len++;
-			}
-		}
-	}
-	printf("Grapheme break test: Passed %zu out of %zu tests.\n",
-	       LEN(t) - failed, LEN(t));
-
-	return (failed > 0) ? 1 : 0;
-}

	libgrapheme unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log \| Files \| Refs \| README \| LICENSE

M	LICENSE	\|	2	+-
M	Makefile	\|	54	+++++++++++++++++++++++++++---------------------------
R	data/grapheme_boundary.txt -> data/GraphemeBreakProperty.txt	\|	0
R	data/grapheme_boundary_test.txt -> data/GraphemeBreakTest.txt	\|	0
D	data/datautil.c	\|	159	-------------------------------------------------------------------------------
D	data/datautil.h	\|	20	--------------------
R	data/emoji.txt -> data/emoji-data.txt	\|	0
D	data/emoji.c	\|	78	------------------------------------------------------------------------------
D	data/grapheme_boundary.c	\|	138	-------------------------------------------------------------------------------
D	data/grapheme_boundary_test.c	\|	139	-------------------------------------------------------------------------------
A	gen/grapheme-test.c	\|	18	++++++++++++++++++
A	gen/grapheme.c	\|	92	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	gen/util.c	\|	384	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	gen/util.h	\|	37	+++++++++++++++++++++++++++++++++++++
D	src/boundary.c	\|	181	-------------------------------------------------------------------------------
M	src/grapheme.c	\|	150	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	src/util.c	\|	32	++++++++++++++++++++++++++++++--
M	src/util.h	\|	9	++++++---
A	test/grapheme.c	\|	43	+++++++++++++++++++++++++++++++++++++++++++
D	test/grapheme_boundary.c	\|	41	-----------------------------------------