libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

commit 0e3d5f60213ba55935364c73422b373ac380f574
parent f334f95e146045257631c605510413ba8de4639d
Author: Laslo Hunhold <dev@frign.de>
Date:   Wed,  8 Dec 2021 17:47:58 +0100

Refactor data-generation and library structure

What I never liked was the fact that you had to keep two heisenstates
in grapheme_boundary() (one for the grapheme-proptable and one for the
emoji-proptable). This unnecessarily complicated the handling a little
bit. Both property sets are now generated into a single table, even
though there is still room for improvement.

A new folder gen was created to contain the generation tools. The data
folder from now on only contains data files.

Now gen/util.c contains all necessary functions to properly parse
property files (and test files) and you merely have to create an
"order list" (e.g. in gen/grapheme.c and gen/grapheme-test.c) and then
are good to go. This doesn't remove much code duplication yet, but it
will come in handy in the future.

Additionally, src/boundary.c was moved into src/grapheme.c so there's
only one object file pulling in the data-table. This separation makes
the structure of the program clearer and helps the linker discard
unused library elements.

The heisenstate was increased to 64 bits for future use.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
MLICENSE | 2+-
MMakefile | 54+++++++++++++++++++++++++++---------------------------
Rdata/grapheme_boundary.txt -> data/GraphemeBreakProperty.txt | 0
Rdata/grapheme_boundary_test.txt -> data/GraphemeBreakTest.txt | 0
Ddata/datautil.c | 159-------------------------------------------------------------------------------
Ddata/datautil.h | 20--------------------
Rdata/emoji.txt -> data/emoji-data.txt | 0
Ddata/emoji.c | 78------------------------------------------------------------------------------
Ddata/grapheme_boundary.c | 138-------------------------------------------------------------------------------
Ddata/grapheme_boundary_test.c | 139-------------------------------------------------------------------------------
Agen/grapheme-test.c | 18++++++++++++++++++
Agen/grapheme.c | 92+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Agen/util.c | 384+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Agen/util.h | 37+++++++++++++++++++++++++++++++++++++
Dsrc/boundary.c | 181-------------------------------------------------------------------------------
Msrc/grapheme.c | 150+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/util.c | 32++++++++++++++++++++++++++++++--
Msrc/util.h | 9++++++---
Atest/grapheme.c | 43+++++++++++++++++++++++++++++++++++++++++++
Dtest/grapheme_boundary.c | 41-----------------------------------------
20 files changed, 788 insertions(+), 789 deletions(-)

diff --git a/LICENSE b/LICENSE @@ -1,6 +1,6 @@ ISC-License -Copyright 2019-2020 Laslo Hunhold <dev@frign.de> +Copyright 2019-2021 Laslo Hunhold <dev@frign.de> Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above diff --git a/Makefile b/Makefile @@ -4,52 +4,52 @@ include config.mk -LIB = src/boundary src/codepoint src/grapheme src/util -TEST = test/grapheme_boundary test/utf8-decode test/utf8-encode -DATA = data/emoji data/grapheme_boundary data/grapheme_boundary_test +DATA =\ + data/emoji-data.txt\ + data/GraphemeBreakProperty.txt\ + data/GraphemeBreakTest.txt +GEN = gen/grapheme gen/grapheme-test +LIB = src/codepoint src/grapheme src/util +TEST = test/grapheme test/utf8-decode test/utf8-encode MAN3 = man/grapheme_bytelen.3 MAN7 = man/libgrapheme.7 all: libgrapheme.a libgrapheme.so -data/emoji.h: data/emoji.txt data/emoji -data/grapheme_boundary.h: data/grapheme_boundary.txt data/grapheme_boundary -data/grapheme_boundary_test.h: data/grapheme_boundary_test.txt data/grapheme_boundary_test - -data/emoji.o: data/emoji.c config.mk data/datautil.h -data/grapheme_boundary.o: data/grapheme_boundary.c config.mk data/datautil.h -data/grapheme_boundary_test.o: data/grapheme_boundary_test.c config.mk data/datautil.h -data/datautil.o: data/datautil.c config.mk data/datautil.h -src/boundary.o: src/boundary.c config.mk data/emoji.h data/grapheme_boundary.h grapheme.h +gen/grapheme.o: gen/grapheme.c config.mk gen/util.h +gen/grapheme-test.o: gen/grapheme-test.c config.mk gen/util.h +gen/util.o: gen/util.c config.mk gen/util.h src/codepoint.o: src/codepoint.c config.mk grapheme.h -src/grapheme.o: src/grapheme.c config.mk grapheme.h +src/grapheme.o: src/grapheme.c config.mk gen/grapheme.h grapheme.h src/util.h src/util.o: src/util.c config.mk src/util.h -test/grapheme_boundary.o: test/grapheme_boundary.c config.mk data/grapheme_boundary_test.h grapheme.h +test/grapheme.o: test/grapheme.c 
config.mk gen/grapheme-test.h grapheme.h test/utf8-encode.o: test/utf8-encode.c config.mk grapheme.h test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h -data/emoji: data/emoji.o data/datautil.o -data/grapheme_boundary: data/grapheme_boundary.o data/datautil.o -data/grapheme_boundary_test: data/grapheme_boundary_test.o data/datautil.o -test/grapheme_boundary: test/grapheme_boundary.o libgrapheme.a +gen/grapheme: gen/grapheme.o gen/util.o +gen/grapheme-test: gen/grapheme-test.o gen/util.o +test/grapheme: test/grapheme.o libgrapheme.a test/utf8-encode: test/utf8-encode.o libgrapheme.a test/utf8-decode: test/utf8-decode.o libgrapheme.a -data/emoji.txt: +gen/grapheme.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/grapheme +gen/grapheme-test.h: data/GraphemeBreakTest.txt gen/grapheme-test + +data/emoji-data.txt: wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt -data/grapheme_boundary.txt: +data/GraphemeBreakProperty.txt: wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakProperty.txt -data/grapheme_boundary_test.txt: +data/GraphemeBreakTest.txt: wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakTest.txt -$(DATA:=.h): - $(@:.h=) < $(@:.h=.txt) > $@ +$(GEN): + $(CC) -o $@ $(LDFLAGS) $@.o gen/util.o -$(DATA): - $(CC) -o $@ $(LDFLAGS) $@.o data/datautil.o +$(GEN:=.h): + $(@:.h=) > $@ $(TEST): $(CC) -o $@ $(LDFLAGS) $@.o libgrapheme.a @@ -86,7 +86,7 @@ uninstall: rm -f "$(DESTDIR)$(INCPREFIX)/grapheme.h" clean: - rm -f $(DATA:=.h) $(DATA:=.o) data/datautil.o $(LIB:=.o) $(TEST:=.o) $(DATA) $(TEST) libgrapheme.a libgrapheme.so + rm -f $(GEN:=.h) $(GEN:=.o) $(GEN) gen/util.o $(LIB:=.o) $(TEST:=.o) $(TEST) libgrapheme.a libgrapheme.so clean-data: - rm -f $(DATA:=.txt) + rm -f $(DATA) diff --git a/data/grapheme_boundary.txt b/data/GraphemeBreakProperty.txt diff --git a/data/grapheme_boundary_test.txt b/data/GraphemeBreakTest.txt diff --git a/data/datautil.c b/data/datautil.c @@ -1,159 
+0,0 @@ -/* See LICENSE file for copyright and license details. */ -#include <stdint.h> -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <errno.h> - -#include "datautil.h" - -void -parse_input(int (*process_line)(char **, size_t, char *)) -{ - char *line = NULL, **field = NULL, *comment; - size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields; - ssize_t len; - - while ((len = getline(&line, &linebufsize, stdin)) >= 0) { - /* remove trailing newline */ - if (len > 0 && line[len - 1] == '\n') { - line[len - 1] = '\0'; - len--; - } - - /* skip empty lines and comment lines */ - if (len == 0 || line[0] == '#') { - continue; - } - - /* tokenize line into fields */ - for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) { - /* extend field buffer, if necessary */ - if (++nfields > fieldbufsize) { - if ((field = realloc(field, nfields * - sizeof(*field))) == NULL) { - fprintf(stderr, "realloc: %s\n", strerror(errno)); - exit(1); - } - fieldbufsize = nfields; - } - - /* skip leading whitespace */ - while (line[i] == ' ') { - i++; - } - - /* set current position as field start */ - field[nfields - 1] = &line[i]; - - /* continue until we reach ';' or '#' or end */ - while (line[i] != ';' && line[i] != '#' && - line[i] != '\0') { - i++; - } - if (line [i] == '#') { - /* set comment-variable for later */ - comment = &line[i + 1]; - } - - /* go back whitespace and terminate field there */ - if (i > 0) { - for (j = i - 1; line[j] == ' '; j--) - ; - line[j + 1] = '\0'; - } else { - line[i] = '\0'; - } - - /* if comment is set, we are done */ - if (comment != NULL) { - break; - } - } - - /* skip leading whitespace in comment */ - while (comment != NULL && comment[0] == ' ') { - comment++; - } - - /* call line processing function */ - if (process_line(field, nfields, comment)) { - exit(1); - } - } - - free(line); - free(field); -} - -static int -valid_hexstring(const char *str) -{ - const char *p = str; - - while ((*p >= '0' && *p <= '9') || - (*p 
>= 'a' && *p <= 'f') || - (*p >= 'A' && *p <= 'F')) { - p++; - } - - if (*p != '\0') { - fprintf(stderr, "invalid code point range '%s'\n", str); - return 0; - } - - return 1; -} - -int -cp_parse(const char *str, uint32_t *cp) -{ - if (!valid_hexstring(str)) { - return 1; - } - *cp = strtol(str, NULL, 16); - - return 0; -} - -int -range_parse(const char *str, struct range *range) -{ - char *p; - - if ((p = strstr(str, "..")) == NULL) { - /* input has the form "XXXXXX" */ - if (!valid_hexstring(str)) { - return 1; - } - range->lower = range->upper = strtol(str, NULL, 16); - } else { - /* input has the form "XXXXXX..XXXXXX" */ - *p = '\0'; - p += 2; - if (!valid_hexstring(str) || !valid_hexstring(p)) { - return 1; - } - range->lower = strtol(str, NULL, 16); - range->upper = strtol(p, NULL, 16); - } - - return 0; -} - -void -range_list_append(struct range **range, size_t *nranges, const struct range *new) -{ - if (*nranges > 0 && (*range)[*nranges - 1].upper == new->lower) { - /* we can merge with previous entry */ - (*range)[*nranges - 1].upper = new->upper; - } else { - /* need to append new entry */ - if ((*range = realloc(*range, (++(*nranges)) * sizeof(**range))) == NULL) { - fprintf(stderr, "realloc: %s\n", strerror(errno)); - exit(1); - } - (*range)[*nranges - 1].lower = new->lower; - (*range)[*nranges - 1].upper = new->upper; - } -} diff --git a/data/datautil.h b/data/datautil.h @@ -1,20 +0,0 @@ -/* See LICENSE file for copyright and license details. 
*/ -#ifndef DATAUTIL_H -#define DATAUTIL_H - -#include <stddef.h> -#include <stdint.h> - -#define LEN(x) (sizeof (x) / sizeof *(x)) - -struct range { - uint32_t lower; - uint32_t upper; -}; - -void parse_input(int (*process_line)(char **, size_t, char *)); -int cp_parse(const char *, uint32_t *); -int range_parse(const char *, struct range *); -void range_list_append(struct range **, size_t *, const struct range *); - -#endif /* DATAUTIL_H */ diff --git a/data/emoji.txt b/data/emoji-data.txt diff --git a/data/emoji.c b/data/emoji.c @@ -1,78 +0,0 @@ -/* See LICENSE file for copyright and license details. */ -#include <stddef.h> -#include <stdio.h> -#include <string.h> - -#include "datautil.h" - -static struct { - char *enumname; - char *identifier; - struct range *table; - size_t tablelen; -} properties[] = { - { - /* extended pictographic */ - .enumname = "EMOJI_PROP_EXTPICT", - .identifier = "Extended_Pictographic", - }, -}; - -int -process_line(char **field, size_t nfields, char *comment) -{ - size_t i; - struct range r; - - (void)comment; - - if (nfields < 2) { - return 1; - } - - for (i = 0; i < LEN(properties); i++) { - if (!strcmp(field[1], properties[i].identifier)) { - if (range_parse(field[0], &r)) { - return 1; - } - range_list_append(&(properties[i].table), - &(properties[i].tablelen), &r); - break; - } - } - - return 0; -} - -int -main(void) -{ - size_t i, j; - - printf("/* Automatically generated by data/emo */\n" - "#include <stdint.h>\n\n#include \"../src/util.h\"\n\n"); - - parse_input(process_line); - - /* output enum */ - printf("enum emoji_prop {\n"); - for (i = 0; i < LEN(properties); i++) { - printf("\t%s,\n", properties[i].enumname); - } - printf("};\n\n"); - - /* output table */ - printf("static const struct range_list emoji_prop[] = {\n"); - for (i = 0; i < LEN(properties); i++) { - printf("\t[%s] = {\n\t\t.data = (struct range[]){\n", properties[i].enumname); - for (j = 0; j < properties[i].tablelen; j++) { - printf("\t\t\t{ 
UINT32_C(0x%06X), UINT32_C(0x%06X) },\n", - properties[i].table[j].lower, - properties[i].table[j].upper); - } - printf("\t\t},\n\t\t.len = %zu,\n\t},\n", properties[i].tablelen); - } - printf("};\n"); - - return 0; -} diff --git a/data/grapheme_boundary.c b/data/grapheme_boundary.c @@ -1,138 +0,0 @@ -/* See LICENSE file for copyright and license details. */ -#include <stddef.h> -#include <stdio.h> -#include <string.h> - -#include "datautil.h" - -static struct { - char *enumname; - char *identifier; - struct range *table; - size_t tablelen; -} properties[] = { - { - /* carriage return */ - .enumname = "GB_PROP_CR", - .identifier = "CR", - }, - { - /* line feed */ - .enumname = "GB_PROP_LF", - .identifier = "LF", - }, - { - /* control character */ - .enumname = "GB_PROP_CONTROL", - .identifier = "Control", - }, - { - /* grapheme extender */ - .enumname = "GB_PROP_EXTEND", - .identifier = "Extend", - }, - { - /* zero width joiner */ - .enumname = "GB_PROP_ZWJ", - .identifier = "ZWJ", - }, - { - /* regional indicator */ - .enumname = "GB_PROP_REGIONAL_INDICATOR", - .identifier = "Regional_Indicator", - }, - { - /* prepend character */ - .enumname = "GB_PROP_PREPEND", - .identifier = "Prepend", - }, - { - /* spacing mark */ - .enumname = "GB_PROP_SPACINGMARK", - .identifier = "SpacingMark", - }, - { - /* hangul syllable type L */ - .enumname = "GB_PROP_L", - .identifier = "L", - }, - { - /* hangul syllable type V */ - .enumname = "GB_PROP_V", - .identifier = "V", - }, - { - /* hangul syllable type T */ - .enumname = "GB_PROP_T", - .identifier = "T", - }, - { - /* hangul syllable type LV */ - .enumname = "GB_PROP_LV", - .identifier = "LV", - }, - { - /* hangul syllable type LVT */ - .enumname = "GB_PROP_LVT", - .identifier = "LVT", - }, -}; - -int -process_line(char **field, size_t nfields, char *comment) -{ - size_t i; - struct range r; - - (void)comment; - - if (nfields < 2) { - return 1; - } - - for (i = 0; i < LEN(properties); i++) { - if (!strcmp(field[1], 
properties[i].identifier)) { - if (range_parse(field[0], &r)) { - return 1; - } - range_list_append(&(properties[i].table), - &(properties[i].tablelen), &r); - break; - } - } - - return 0; -} - -int -main(void) -{ - size_t i, j; - - printf("/* Automatically generated by data/gbp */\n" - "#include <stdint.h>\n\n#include \"../src/util.h\"\n\n"); - - parse_input(process_line); - - /* output enum */ - printf("enum gb_prop {\n"); - for (i = 0; i < LEN(properties); i++) { - printf("\t%s,\n", properties[i].enumname); - } - printf("};\n\n"); - - /* output table */ - printf("static const struct range_list gb_prop[] = {\n"); - for (i = 0; i < LEN(properties); i++) { - printf("\t[%s] = {\n\t\t.data = (struct range[]){\n", properties[i].enumname); - for (j = 0; j < properties[i].tablelen; j++) { - printf("\t\t\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n", - properties[i].table[j].lower, - properties[i].table[j].upper); - } - printf("\t\t},\n\t\t.len = %zu,\n\t},\n", properties[i].tablelen); - } - printf("};\n"); - - return 0; -} diff --git a/data/grapheme_boundary_test.c b/data/grapheme_boundary_test.c @@ -1,139 +0,0 @@ -/* See LICENSE file for copyright and license details. */ -#include <stddef.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <errno.h> - -#include "datautil.h" - -struct break_test { - uint32_t *cp; - size_t cplen; - size_t *len; - size_t lenlen; - char *descr; -}; - -static struct break_test *test = NULL; -static size_t ntests = 0; - -int -process_line(char **field, size_t nfields, char *comment) -{ - struct break_test *t; - size_t i; - char *token; - - if (nfields < 1) { - return 1; - } - - /* append new testcase and initialize with zeroes */ - if ((test = realloc(test, ++ntests * sizeof(*test))) == NULL) { - fprintf(stderr, "realloc: %s\n", strerror(errno)); - return 1; - } - t = &test[ntests - 1]; - memset(t, 0, sizeof(*t)); - - /* parse testcase "<÷|×> <cp> <÷|×> ... 
<cp> <÷|×>" */ - for (token = strtok(field[0], " "), i = 0; token != NULL; i++, - token = strtok(NULL, " ")) { - if (i % 2 == 0) { - /* delimiter */ - if (!strncmp(token, "\xC3\xB7", 2)) { /* UTF-8 */ - /* - * '÷' indicates a breakpoint, - * the current length is done; allocate - * a new length field and set it to 0 - */ - if ((t->len = realloc(t->len, - ++t->lenlen * sizeof(*t->len))) == NULL) { - fprintf(stderr, "realloc: %s\n", - strerror(errno)); - return 1; - } - t->len[t->lenlen - 1] = 0; - } else if (!strncmp(token, "\xC3\x97", 2)) { /* UTF-8 */ - /* - * '×' indicates a non-breakpoint, do nothing - */ - } else { - fprintf(stderr, "malformed delimiter '%s'\n", - token); - return 1; - } - } else { - /* add code point to cp-array */ - if ((t->cp = realloc(t->cp, ++t->cplen * - sizeof(*t->cp))) == NULL) { - fprintf(stderr, "realloc: %s\n", strerror(errno)); - return 1; - } - if (cp_parse(token, &t->cp[t->cplen - 1])) { - return 1; - } - if (t->lenlen > 0) { - t->len[t->lenlen - 1]++; - } - } - } - if (t->len[t->lenlen - 1] == 0) { - /* we allocated one more length than we needed */ - t->lenlen--; - } - - /* store comment */ - if ((test[ntests - 1].descr = strdup(comment)) == NULL) { - fprintf(stderr, "strdup: %s\n", strerror(errno)); - return 1; - } - - return 0; -} - -int -main(void) -{ - size_t i, j; - - printf("/* Automatically generated by data/gbt */\n" - "#include <stdint.h>\n#include <stddef.h>\n\n"); - - parse_input(process_line); - - printf("static const struct break_test {\n\tuint32_t *cp;\n" - "\tsize_t cplen;\n\tsize_t *len;\n\tsize_t lenlen;\n" - "\tchar *descr;\n} t[] = {\n"); - for (i = 0; i < ntests; i++) { - printf("\t{\n"); - - printf("\t\t.cp = (uint32_t[]){"); - for (j = 0; j < test[i].cplen; j++) { - printf(" UINT32_C(0x%06X)", test[i].cp[j]); - if (j + 1 < test[i].cplen) { - putchar(','); - } - } - printf(" },\n"); - printf("\t\t.cplen = %zu,\n", test[i].cplen); - - printf("\t\t.len = (size_t[]){"); - for (j = 0; j < test[i].lenlen; j++) { 
- printf(" %zu", test[i].len[j]); - if (j + 1 < test[i].lenlen) { - putchar(','); - } - } - printf(" },\n"); - printf("\t\t.lenlen = %zu,\n", test[i].lenlen); - - printf("\t\t.descr = \"%s\",\n", test[i].descr); - - printf("\t},\n"); - } - printf("};\n"); - - return 0; -} diff --git a/gen/grapheme-test.c b/gen/grapheme-test.c @@ -0,0 +1,18 @@ +/* See LICENSE file for copyright and license details. */ +#include <stddef.h> + +#include "util.h" + +int +main(int argc, char *argv[]) +{ + struct segment_test *st = NULL; + size_t numsegtests = 0; + + (void)argc; + + segment_test_list_parse("data/GraphemeBreakTest.txt", &st, &numsegtests); + segment_test_list_print(st, numsegtests, "grapheme_test", argv[0]); + + return 0; +} diff --git a/gen/grapheme.c b/gen/grapheme.c @@ -0,0 +1,92 @@ +/* See LICENSE file for copyright and license details. */ +#include <stddef.h> + +#include "util.h" + +#define FILE_EMOJI "data/emoji-data.txt" +#define FILE_GRAPHEME "data/GraphemeBreakProperty.txt" + +static struct property segment_property[] = { + { + .enumname = "GRAPHEME_PROP_CONTROL", + .identifier = "Control", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "GRAPHEME_PROP_CR", + .identifier = "CR", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "GRAPHEME_PROP_EXTEND", + .identifier = "Extend", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC", + .identifier = "Extended_Pictographic", + .fname = FILE_EMOJI, + }, + { + .enumname = "GRAPHEME_PROP_HANGUL_L", + .identifier = "L", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "GRAPHEME_PROP_HANGUL_V", + .identifier = "V", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "GRAPHEME_PROP_HANGUL_T", + .identifier = "T", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "GRAPHEME_PROP_HANGUL_LV", + .identifier = "LV", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "GRAPHEME_PROP_HANGUL_LVT", + .identifier = "LVT", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "GRAPHEME_PROP_LF", + .identifier = 
"LF", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "GRAPHEME_PROP_PREPEND", + .identifier = "Prepend", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "GRAPHEME_PROP_REGIONAL_INDICATOR", + .identifier = "Regional_Indicator", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "GRAPHEME_PROP_SPACINGMARK", + .identifier = "SpacingMark", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "GRAPHEME_PROP_ZWJ", + .identifier = "ZWJ", + .fname = FILE_GRAPHEME, + }, +}; + +int +main(int argc, char *argv[]) +{ + (void)argc; + + property_list_parse(segment_property, LEN(segment_property)); + property_list_print(segment_property, LEN(segment_property), + "grapheme_prop", argv[0]); + + return 0; +} diff --git a/gen/util.c b/gen/util.c @@ -0,0 +1,384 @@ +/* See LICENSE file for copyright and license details. */ +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> + +#include "util.h" + +struct property_list_payload +{ + struct property *prop; + size_t numprops; +}; + +struct segment_test_payload +{ + struct segment_test **st; + size_t *numsegtests; +}; + +static int +valid_hexstring(const char *str) +{ + const char *p = str; + + while ((*p >= '0' && *p <= '9') || + (*p >= 'a' && *p <= 'f') || + (*p >= 'A' && *p <= 'F')) { + p++; + } + + if (*p != '\0') { + fprintf(stderr, "valid_hexstring: Invalid code point range '%s'\n", str); + return 0; + } + + return 1; +} + +static int +cp_parse(const char *str, uint32_t *cp) +{ + if (!valid_hexstring(str)) { + return 1; + } + *cp = strtol(str, NULL, 16); + + return 0; +} + +static int +range_parse(const char *str, struct range *range) +{ + char *p; + + if ((p = strstr(str, "..")) == NULL) { + /* input has the form "XXXXXX" */ + if (!valid_hexstring(str)) { + return 1; + } + range->lower = range->upper = strtol(str, NULL, 16); + } else { + /* input has the form "XXXXXX..XXXXXX" */ + *p = '\0'; + p += 2; + if (!valid_hexstring(str) || !valid_hexstring(p)) { + return 1; + } + range->lower = 
strtol(str, NULL, 16); + range->upper = strtol(p, NULL, 16); + } + + return 0; +} + +void +range_list_append(struct range **range, size_t *nranges, const struct range *new) +{ + if (*nranges > 0 && (*range)[*nranges - 1].upper == new->lower) { + /* we can merge with previous entry */ + (*range)[*nranges - 1].upper = new->upper; + } else { + /* need to append new entry */ + if ((*range = realloc(*range, (++(*nranges)) * sizeof(**range))) == NULL) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + (*range)[*nranges - 1].lower = new->lower; + (*range)[*nranges - 1].upper = new->upper; + } +} + +void parse_file_with_callback(char *fname, int (*callback)(char *, char **, size_t, char *, void *), void *payload) +{ + FILE *fp; + char *line = NULL, **field = NULL, *comment; + size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields; + ssize_t len; + + /* open file */ + if (!(fp = fopen(fname, "r"))) { + fprintf(stderr, "fopen '%s': %s\n", fname, + strerror(errno)); + exit(1); + } + + while ((len = getline(&line, &linebufsize, fp)) >= 0) { + /* remove trailing newline */ + if (len > 0 && line[len - 1] == '\n') { + line[len - 1] = '\0'; + len--; + } + + /* skip empty lines and comment lines */ + if (len == 0 || line[0] == '#') { + continue; + } + + /* tokenize line into fields */ + for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) { + /* extend field buffer, if necessary */ + if (++nfields > fieldbufsize) { + if ((field = realloc(field, nfields * + sizeof(*field))) == NULL) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + fieldbufsize = nfields; + } + + /* skip leading whitespace */ + while (line[i] == ' ') { + i++; + } + + /* set current position as field start */ + field[nfields - 1] = &line[i]; + + /* continue until we reach ';' or '#' or end */ + while (line[i] != ';' && line[i] != '#' && + line[i] != '\0') { + i++; + } + if (line[i] == '#') { + /* set comment-variable for later */ + comment = &line[i + 1]; + } + + 
/* go back whitespace and terminate field there */ + if (i > 0) { + for (j = i - 1; line[j] == ' '; j--) + ; + line[j + 1] = '\0'; + } else { + line[i] = '\0'; + } + + /* if comment is set, we are done */ + if (comment != NULL) { + break; + } + } + + /* skip leading whitespace in comment */ + while (comment != NULL && comment[0] == ' ') { + comment++; + } + + /* call callback function */ + if (callback(fname, field, nfields, comment, payload)) { + fprintf(stderr, "parse_file_with_callback: Malformed input.\n"); + exit(1); + } + } +} + +int +property_list_callback(char *fname, char **field, size_t nfields, char *comment, void *payload) +{ + struct property *prop = ((struct property_list_payload *)payload)->prop; + struct range r; + size_t i, numprops = ((struct property_list_payload *)payload)->numprops; + + (void)comment; + + if (nfields < 2) { + return 1; + } + + for (i = 0; i < numprops; i++) { + if (!strcmp(field[1], prop[i].identifier) && + !strcmp(fname, prop[i].fname)) { + if (range_parse(field[0], &r)) { + return 1; + } + range_list_append(&(prop[i].table), + &(prop[i].tablelen), &r); + break; + } + } + + return 0; +} + +void +property_list_parse(struct property *prop, size_t numprops) +{ + struct property_list_payload pl = { .prop = prop, .numprops = numprops }; + size_t i; + + /* make sure to parse each file only once */ + for (i = 0; i < numprops; i++) { + if (prop[i].tablelen > 0) { + /* property's file was already parsed */ + continue; + } + + parse_file_with_callback(prop[i].fname, property_list_callback, &pl); + } +} + +void +property_list_print(const struct property *prop, size_t numprops, + const char *identifier, const char *progname) +{ + size_t i, j; + + printf("/* Automatically generated by %s */\n" + "#include <stdint.h>\n\n#include \"../src/util.h\"\n\n", + progname); + + /* print enum */ + printf("enum %s {\n", identifier); + for (i = 0; i < numprops; i++) { + printf("\t%s,\n", prop[i].enumname); + } + printf("};\n\n"); + + /* print table */ 
+ printf("static const struct range_list %s[] = {\n", identifier); + for (i = 0; i < numprops; i++) { + printf("\t[%s] = {\n\t\t.data = (struct range[]){\n", + prop[i].enumname); + for (j = 0; j < prop[i].tablelen; j++) { + printf("\t\t\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n", + prop[i].table[j].lower, + prop[i].table[j].upper); + } + printf("\t\t},\n\t\t.len = %zu,\n\t},\n", prop[i].tablelen); + } + printf("};\n"); +} + +int +segment_test_callback(char *fname, char **field, size_t nfields, char *comment, void *payload) +{ + struct segment_test *t, **test = ((struct segment_test_payload *)payload)->st; + size_t i, *ntests = ((struct segment_test_payload *)payload)->numsegtests; + char *token; + + (void)fname; + + if (nfields < 1) { + return 1; + } + + /* append new testcase and initialize with zeroes */ + if ((*test = realloc(*test, ++(*ntests) * sizeof(**test))) == NULL) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + return 1; + } + t = &(*test)[*ntests - 1]; + memset(t, 0, sizeof(*t)); + + /* parse testcase "<÷|×> <cp> <÷|×> ... 
<cp> <÷|×>" */ + for (token = strtok(field[0], " "), i = 0; token != NULL; i++, + token = strtok(NULL, " ")) { + if (i % 2 == 0) { + /* delimiter */ + if (!strncmp(token, "\xC3\xB7", 2)) { /* UTF-8 */ + /* + * '÷' indicates a breakpoint, + * the current length is done; allocate + * a new length field and set it to 0 + */ + if ((t->len = realloc(t->len, + ++t->lenlen * sizeof(*t->len))) == NULL) { + fprintf(stderr, "realloc: %s\n", + strerror(errno)); + return 1; + } + t->len[t->lenlen - 1] = 0; + } else if (!strncmp(token, "\xC3\x97", 2)) { /* UTF-8 */ + /* + * '×' indicates a non-breakpoint, do nothing + */ + } else { + fprintf(stderr, "malformed delimiter '%s'\n", + token); + return 1; + } + } else { + /* add code point to cp-array */ + if ((t->cp = realloc(t->cp, ++t->cplen * + sizeof(*t->cp))) == NULL) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + return 1; + } + if (cp_parse(token, &t->cp[t->cplen - 1])) { + return 1; + } + if (t->lenlen > 0) { + t->len[t->lenlen - 1]++; + } + } + } + if (t->len[t->lenlen - 1] == 0) { + /* we allocated one more length than we needed */ + t->lenlen--; + } + + /* store comment */ + if (((*test)[*ntests - 1].descr = strdup(comment)) == NULL) { + fprintf(stderr, "strdup: %s\n", strerror(errno)); + return 1; + } + + return 0; +} + +void +segment_test_list_parse(char *fname, struct segment_test **st, size_t *numsegtests) +{ + struct segment_test_payload pl = { .st = st, .numsegtests = numsegtests }; + *st = NULL; + *numsegtests = 0; + + parse_file_with_callback(fname, segment_test_callback, &pl); +} + +void +segment_test_list_print(struct segment_test *st, size_t numsegtests, + const char *identifier, const char *progname) +{ + size_t i, j; + + printf("/* Automatically generated by %s */\n" + "#include <stdint.h>\n#include <stddef.h>\n\n", progname); + + printf("static const struct {\n\tuint32_t *cp;\n" + "\tsize_t cplen;\n\tsize_t *len;\n\tsize_t lenlen;\n" + "\tchar *descr;\n} %s[] = {\n", identifier); + for (i = 0; i < 
numsegtests; i++) { + printf("\t{\n"); + + printf("\t\t.cp = (uint32_t[]){"); + for (j = 0; j < st[i].cplen; j++) { + printf(" UINT32_C(0x%06X)", st[i].cp[j]); + if (j + 1 < st[i].cplen) { + putchar(','); + } + } + printf(" },\n"); + printf("\t\t.cplen = %zu,\n", st[i].cplen); + + printf("\t\t.len = (size_t[]){"); + for (j = 0; j < st[i].lenlen; j++) { + printf(" %zu", st[i].len[j]); + if (j + 1 < st[i].lenlen) { + putchar(','); + } + } + printf(" },\n"); + printf("\t\t.lenlen = %zu,\n", st[i].lenlen); + + printf("\t\t.descr = \"%s\",\n", st[i].descr); + + printf("\t},\n"); + } + printf("};\n"); +} + + diff --git a/gen/util.h b/gen/util.h @@ -0,0 +1,37 @@ +/* See LICENSE file for copyright and license details. */ +#ifndef UTIL_H +#define UTIL_H + +#include <stddef.h> +#include <stdint.h> + +#define LEN(x) (sizeof (x) / sizeof *(x)) + +struct range { + uint32_t lower; + uint32_t upper; +}; + +struct property { + char *enumname; + char *identifier; + char *fname; + struct range *table; + size_t tablelen; +}; + +struct segment_test { + uint32_t *cp; + size_t cplen; + size_t *len; + size_t lenlen; + char *descr; +}; + +void property_list_parse(struct property *, size_t); +void property_list_print(const struct property *, size_t, const char *, const char *); + +void segment_test_list_parse(char *, struct segment_test **, size_t *); +void segment_test_list_print(struct segment_test *, size_t, const char *, const char *); + +#endif /* UTIL_H */ diff --git a/src/boundary.c b/src/boundary.c @@ -1,181 +0,0 @@ -/* See LICENSE file for copyright and license details. 
*/ -#include <stddef.h> -#include <stdint.h> -#include <stdlib.h> - -#include "../data/emoji.h" -#include "../data/grapheme_boundary.h" - -enum { - GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */ - GRAPHEME_STATE_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */ -}; - -static int -cp_cmp(const void *a, const void *b) -{ - uint32_t cp = *(uint32_t *)a; - uint32_t *range = (uint32_t *)b; - - return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]); -} - -static int -has_property(uint32_t cp, struct heisenstate *cpstate, - const struct range_list *proptable, int property) -{ - if (heisenstate_get(cpstate, property) == -1) { - /* state undetermined, make a lookup and set it */ - heisenstate_set(cpstate, property, bsearch(&cp, - proptable[property].data, - proptable[property].len, - sizeof(*proptable[property].data), - cp_cmp) ? 1 : 0); - } - - return heisenstate_get(cpstate, property); -} - -int -grapheme_boundary(uint32_t a, uint32_t b, int *state) -{ - struct heisenstate gb[2] = { 0 }, emoji[2] = { 0 }; - int s; - - /* skip printable ASCII */ - if ((a >= 0x20 && a <= 0x7E) && - (b >= 0x20 && b <= 0x7E)) { - return 1; - } - - /* set internal state based on given state-pointer */ - s = (state != NULL) ? 
*state : 0; - - /* - * Apply grapheme cluster breaking algorithm (UAX #29), see - * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules - */ - - /* - * update state - */ - if (has_property(b, &gb[1], gb_prop, GB_PROP_REGIONAL_INDICATOR)) { - if (has_property(a, &gb[0], gb_prop, GB_PROP_REGIONAL_INDICATOR)) { - /* one more RI is on the left side of the seam */ - s ^= GRAPHEME_STATE_RI_ODD; - } else { - /* an RI appeared on the right side but the left - side is not an RI, reset state (0 is even) */ - s &= ~GRAPHEME_STATE_RI_ODD; - } - } - if (!(*state & GRAPHEME_STATE_EMOJI) && - ((has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) && - has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) || - (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) && - has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND)))) { - s |= GRAPHEME_STATE_EMOJI; - } else if ((*state & GRAPHEME_STATE_EMOJI) && - ((has_property(a, &gb[0], gb_prop, GB_PROP_ZWJ) && - has_property(b, &emoji[1], emoji_prop, EMOJI_PROP_EXTPICT)) || - (has_property(a, &gb[0], gb_prop, GB_PROP_EXTEND) && - has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND)) || - (has_property(a, &gb[0], gb_prop, GB_PROP_EXTEND) && - has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) || - (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) && - has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) || - (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) && - has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND)))) { - /* GRAPHEME_STATE_EMOJI remains */ - } else { - s &= ~GRAPHEME_STATE_EMOJI; - } - - /* write updated state to state-pointer, if given */ - if (state != NULL) { - *state = s; - } - - /* - * apply rules - */ - - /* skip GB1 and GB2, as they are never satisfied here */ - - /* GB3 */ - if (has_property(a, &gb[0], gb_prop, GB_PROP_CR) && - has_property(b, &gb[1], gb_prop, GB_PROP_LF)) { - return 0; - } - - /* GB4 */ - if (has_property(a, &gb[0], gb_prop, GB_PROP_CONTROL) || - has_property(a, &gb[0], gb_prop, 
GB_PROP_CR) || - has_property(a, &gb[0], gb_prop, GB_PROP_LF)) { - return 1; - } - - /* GB5 */ - if (has_property(b, &gb[1], gb_prop, GB_PROP_CONTROL) || - has_property(b, &gb[1], gb_prop, GB_PROP_CR) || - has_property(b, &gb[1], gb_prop, GB_PROP_LF)) { - return 1; - } - - /* GB6 */ - if (has_property(a, &gb[0], gb_prop, GB_PROP_L) && - (has_property(b, &gb[1], gb_prop, GB_PROP_L) || - has_property(b, &gb[1], gb_prop, GB_PROP_V) || - has_property(b, &gb[1], gb_prop, GB_PROP_LV) || - has_property(b, &gb[1], gb_prop, GB_PROP_LVT))) { - return 0; - } - - /* GB7 */ - if ((has_property(a, &gb[0], gb_prop, GB_PROP_LV) || - has_property(a, &gb[0], gb_prop, GB_PROP_V)) && - (has_property(b, &gb[1], gb_prop, GB_PROP_V) || - has_property(b, &gb[1], gb_prop, GB_PROP_T))) { - return 0; - } - - /* GB8 */ - if ((has_property(a, &gb[0], gb_prop, GB_PROP_LVT) || - has_property(a, &gb[0], gb_prop, GB_PROP_T)) && - has_property(b, &gb[1], gb_prop, GB_PROP_T)) { - return 0; - } - - /* GB9 */ - if (has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND) || - has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) { - return 0; - } - - /* GB9a */ - if (has_property(b, &gb[1], gb_prop, GB_PROP_SPACINGMARK)) { - return 0; - } - - /* GB9b */ - if (has_property(a, &gb[0], gb_prop, GB_PROP_PREPEND)) { - return 0; - } - - /* GB11 */ - if ((s & GRAPHEME_STATE_EMOJI) && - has_property(a, &gb[0], gb_prop, GB_PROP_ZWJ) && - has_property(b, &emoji[1], emoji_prop, EMOJI_PROP_EXTPICT)) { - return 0; - } - - /* GB12/GB13 */ - if (has_property(a, &gb[0], gb_prop, GB_PROP_REGIONAL_INDICATOR) && - has_property(b, &gb[1], gb_prop, GB_PROP_REGIONAL_INDICATOR) && - (s & GRAPHEME_STATE_RI_ODD)) { - return 0; - } - - /* GB999 */ - return 1; -} diff --git a/src/grapheme.c b/src/grapheme.c @@ -2,8 +2,158 @@ #include <stddef.h> #include <stdlib.h> +#include "../gen/grapheme.h" #include "../grapheme.h" +enum { + GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */ + GRAPHEME_STATE_EMOJI = 1 << 1, /* within 
emoji modifier or zwj sequence */ +}; + +int +grapheme_boundary(uint32_t a, uint32_t b, int *state) +{ + struct heisenstate prop[2] = { 0 }; + int s; + + /* skip printable ASCII */ + if ((a >= 0x20 && a <= 0x7E) && + (b >= 0x20 && b <= 0x7E)) { + return 1; + } + + /* set internal state based on given state-pointer */ + s = (state != NULL) ? *state : 0; + + /* + * Apply grapheme cluster breaking algorithm (UAX #29), see + * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules + */ + + /* + * update state + */ + if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) { + if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) { + /* one more RI is on the left side of the seam */ + s ^= GRAPHEME_STATE_RI_ODD; + } else { + /* an RI appeared on the right side but the left + side is not an RI, reset state (0 is even) */ + s &= ~GRAPHEME_STATE_RI_ODD; + } + } + if (!(*state & GRAPHEME_STATE_EMOJI) && + ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) || + (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) { + s |= GRAPHEME_STATE_EMOJI; + } else if ((*state & GRAPHEME_STATE_EMOJI) && + ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_ZWJ) && + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) || + (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTEND) && + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)) || + (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTEND) && + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) || + (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) || + (has_property(a, &prop[0], grapheme_prop, 
GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) { + /* GRAPHEME_STATE_EMOJI remains */ + } else { + s &= ~GRAPHEME_STATE_EMOJI; + } + + /* write updated state to state-pointer, if given */ + if (state != NULL) { + *state = s; + } + + /* + * apply rules + */ + + /* skip GB1 and GB2, as they are never satisfied here */ + + /* GB3 */ + if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CR) && + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_LF)) { + return 0; + } + + /* GB4 */ + if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CONTROL) || + has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CR) || + has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_LF)) { + return 1; + } + + /* GB5 */ + if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_CONTROL) || + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_CR) || + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_LF)) { + return 1; + } + + /* GB6 */ + if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_L) && + (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_L) || + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) || + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) || + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT))) { + return 0; + } + + /* GB7 */ + if ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) || + has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_V)) && + (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) || + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T))) { + return 0; + } + + /* GB8 */ + if ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT) || + has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) && + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) { + return 0; + } + + /* GB9 */ + if (has_property(b, 
&prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND) || + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) { + return 0; + } + + /* GB9a */ + if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_SPACINGMARK)) { + return 0; + } + + /* GB9b */ + if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_PREPEND)) { + return 0; + } + + /* GB11 */ + if ((s & GRAPHEME_STATE_EMOJI) && + has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_ZWJ) && + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) { + return 0; + } + + /* GB12/GB13 */ + if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) && + has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) && + (s & GRAPHEME_STATE_RI_ODD)) { + return 0; + } + + /* GB999 */ + return 1; +} + size_t grapheme_bytelen(const char *str) { diff --git a/src/util.c b/src/util.c @@ -1,10 +1,13 @@ /* See LICENSE file for copyright and license details. */ +#include <stdint.h> +#include <stdlib.h> + #include "util.h" int heisenstate_get(struct heisenstate *h, int slot) { - if (h == NULL || slot >= 16 || slot < 0 || + if (h == NULL || slot >= 64 || slot < 0 || !(h->determined & (1 << slot))) { /* no state given, slot out of range or undetermined */ return -1; @@ -17,7 +20,7 @@ heisenstate_get(struct heisenstate *h, int slot) int heisenstate_set(struct heisenstate *h, int slot, int state) { - if (h == NULL || slot >= 16 || slot < 0) { + if (h == NULL || slot >= 64 || slot < 0) { /* no state given or slot out of range */ return 1; } else { @@ -31,3 +34,28 @@ heisenstate_set(struct heisenstate *h, int slot, int state) return 0; } + +static int +cp_cmp(const void *a, const void *b) +{ + uint32_t cp = *(uint32_t *)a; + uint32_t *range = (uint32_t *)b; + + return (cp >= range[0] && cp <= range[1]) ? 
0 : (cp - range[0]); +} + +int +has_property(uint32_t cp, struct heisenstate *cpstate, + const struct range_list *proptable, int property) +{ + if (heisenstate_get(cpstate, property) == -1) { + /* state undetermined, make a lookup and set it */ + heisenstate_set(cpstate, property, bsearch(&cp, + proptable[property].data, + proptable[property].len, + sizeof(*proptable[property].data), + cp_cmp) ? 1 : 0); + } + + return heisenstate_get(cpstate, property); +} diff --git a/src/util.h b/src/util.h @@ -17,13 +17,16 @@ struct range_list { size_t len; }; -/* 16-slot (0,...,15) optionally undetermined binary state */ +/* 64-slot (0,...,63) optionally undetermined binary state */ struct heisenstate { - uint_least16_t determined; - uint_least16_t state; + uint_least64_t determined; + uint_least64_t state; }; int heisenstate_get(struct heisenstate *, int); int heisenstate_set(struct heisenstate *, int, int); +int has_property(uint32_t, struct heisenstate *, + const struct range_list *, int); + #endif /* UTIL_H */ diff --git a/test/grapheme.c b/test/grapheme.c @@ -0,0 +1,43 @@ +/* See LICENSE file for copyright and license details. 
*/ +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +#include "../grapheme.h" +#include "../gen/grapheme-test.h" + +#define LEN(x) (sizeof(x) / sizeof(*x)) + +int +main(void) +{ + int state; + size_t i, j, k, len, failed; + + /* grapheme break test */ + for (i = 0, failed = 0; i < LEN(grapheme_test); i++) { + for (j = 0, k = 0, state = 0, len = 1; j < grapheme_test[i].cplen; j++) { + if ((j + 1) == grapheme_test[i].cplen || + grapheme_boundary(grapheme_test[i].cp[j], + grapheme_test[i].cp[j + 1], + &state)) { + /* check if our resulting length matches */ + if (k == grapheme_test[i].lenlen || + len != grapheme_test[i].len[k++]) { + fprintf(stderr, "Failed \"%s\"\n", + grapheme_test[i].descr); + failed++; + break; + } + len = 1; + } else { + len++; + } + } + } + printf("Grapheme break test: Passed %zu out of %zu tests.\n", + LEN(grapheme_test) - failed, LEN(grapheme_test)); + + return (failed > 0) ? 1 : 0; +} diff --git a/test/grapheme_boundary.c b/test/grapheme_boundary.c @@ -1,41 +0,0 @@ -/* See LICENSE file for copyright and license details. */ -#include <stddef.h> -#include <stdint.h> -#include <stdio.h> -#include <string.h> - -#include "../grapheme.h" -#include "../data/grapheme_boundary_test.h" - -#define LEN(x) (sizeof(x) / sizeof(*x)) - -int -main(void) -{ - int state; - size_t i, j, k, len, failed; - - /* grapheme break test */ - for (i = 0, failed = 0; i < LEN(t); i++) { - for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) { - if ((j + 1) == t[i].cplen || - grapheme_boundary(t[i].cp[j], t[i].cp[j + 1], - &state)) { - /* check if our resulting length matches */ - if (k == t[i].lenlen || len != t[i].len[k++]) { - fprintf(stderr, "Failed \"%s\"\n", - t[i].descr); - failed++; - break; - } - len = 1; - } else { - len++; - } - } - } - printf("Grapheme break test: Passed %zu out of %zu tests.\n", - LEN(t) - failed, LEN(t)); - - return (failed > 0) ? 1 : 0; -}