libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

commit ea318c404e67e71aaf9aeb2dab671eee57ed2766
parent eee3a6e193f42bc9cb98e85920d194b800435ab8
Author: Laslo Hunhold <dev@frign.de>
Date:   Fri,  4 Dec 2020 13:35:13 +0100

Refactor data-tables and lookup-code to be more universal

Previously, we had an explicit list of property-tables in boundary.c,
but this shouldn't be the place to put them, especially if we plan to
use the tables somewhere else, too. Instead, structure the data
better by emitting an enum for each datatype and base the rest of the
code on it.

For boundary, this turns out to require two heisenstates (see below),
one each for the emoji- and grapheme-break-tables. This is more than
before, but getting by with a single state was only possible because
we had reduced the generalizability of the code.

The advantage is that if the Unicode specification adds another type
of character and uses it in the algorithm, supporting it is as simple
as adding an element to the properties-array in the data-generator
(e.g. data/grapheme_boundary.c), which in turn automatically adds it
to the enum, and you are then free to access it from the code.

The specific changes are summarized below:

- Add heisenstate-struct to handle partially-known states to prevent
  multiple identical lookups. Previously, we already kept track of
  which properties had been determined, but handled the states by hand.
- Remove explicit table-listing in src/boundary.c and adapt code
- Add new datatable-format that is necessary to have an enum-indexed table
- Rename data/util.h to data/datautil.h

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M Makefile | 21 +++++++++++----------
A data/datautil.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A data/datautil.h | 20 ++++++++++++++++++++
M data/emoji.c | 26 ++++++++++++++++++--------
M data/grapheme_boundary.c | 62 ++++++++++++++++++++++++++++++++++++++++++--------------------
M data/grapheme_boundary_test.c | 2 +-
D data/util.c | 159 -------------------------------------------------------------------------------
D data/util.h | 20 --------------------
M src/boundary.c | 209 +++++++++++++++++++++----------------------------------------------------------
A src/util.c | 33 +++++++++++++++++++++++++++++++++
A src/util.h | 29 +++++++++++++++++++++++++++++
11 files changed, 368 insertions(+), 372 deletions(-)

diff --git a/Makefile b/Makefile @@ -4,7 +4,7 @@ include config.mk -LIB = src/boundary src/codepoint src/grapheme +LIB = src/boundary src/codepoint src/grapheme src/util TEST = test/grapheme_boundary test/utf8-decode test/utf8-encode DATA = data/emoji data/grapheme_boundary data/grapheme_boundary_test @@ -17,20 +17,21 @@ data/emoji.h: data/emoji.txt data/emoji data/grapheme_boundary.h: data/grapheme_boundary.txt data/grapheme_boundary data/grapheme_boundary_test.h: data/grapheme_boundary_test.txt data/grapheme_boundary_test -data/emoji.o: data/emoji.c config.mk data/util.h -data/grapheme_boundary.o: data/grapheme_boundary.c config.mk data/util.h -data/grapheme_boundary_test.o: data/grapheme_boundary_test.c config.mk data/util.h -data/util.o: data/util.c config.mk data/util.h +data/emoji.o: data/emoji.c config.mk data/datautil.h +data/grapheme_boundary.o: data/grapheme_boundary.c config.mk data/datautil.h +data/grapheme_boundary_test.o: data/grapheme_boundary_test.c config.mk data/datautil.h +data/datautil.o: data/datautil.c config.mk data/datautil.h src/boundary.o: src/boundary.c config.mk data/emoji.h data/grapheme_boundary.h grapheme.h src/codepoint.o: src/codepoint.c config.mk grapheme.h src/grapheme.o: src/grapheme.c config.mk grapheme.h +src/util.o: src/util.c config.mk src/util.h test/grapheme_boundary.o: test/grapheme_boundary.c config.mk data/grapheme_boundary_test.h grapheme.h test/utf8-encode.o: test/utf8-encode.c config.mk grapheme.h test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h -data/emoji: data/emoji.o data/util.o -data/grapheme_boundary: data/grapheme_boundary.o data/util.o -data/grapheme_boundary_test: data/grapheme_boundary_test.o data/util.o +data/emoji: data/emoji.o data/datautil.o +data/grapheme_boundary: data/grapheme_boundary.o data/datautil.o +data/grapheme_boundary_test: data/grapheme_boundary_test.o data/datautil.o test/grapheme_boundary: test/grapheme_boundary.o libgrapheme.a test/utf8-encode: test/utf8-encode.o libgrapheme.a 
test/utf8-decode: test/utf8-decode.o libgrapheme.a @@ -48,7 +49,7 @@ $(DATA:=.h): $(@:.h=) < $(@:.h=.txt) > $@ $(DATA): - $(CC) -o $@ $(LDFLAGS) $@.o data/util.o + $(CC) -o $@ $(LDFLAGS) $@.o data/datautil.o $(TEST): $(CC) -o $@ $(LDFLAGS) $@.o libgrapheme.a @@ -85,7 +86,7 @@ uninstall: rm -f "$(DESTDIR)$(INCPREFIX)/grapheme.h" clean: - rm -f $(DATA:=.h) $(DATA:=.o) data/util.o $(LIB:=.o) $(TEST:=.o) $(DATA) $(TEST) libgrapheme.a libgrapheme.so + rm -f $(DATA:=.h) $(DATA:=.o) data/datautil.o $(LIB:=.o) $(TEST:=.o) $(DATA) $(TEST) libgrapheme.a libgrapheme.so clean-data: rm -f $(DATA:=.txt) diff --git a/data/datautil.c b/data/datautil.c @@ -0,0 +1,159 @@ +/* See LICENSE file for copyright and license details. */ +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> + +#include "datautil.h" + +void +parse_input(int (*process_line)(char **, size_t, char *)) +{ + char *line = NULL, **field = NULL, *comment; + size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields; + ssize_t len; + + while ((len = getline(&line, &linebufsize, stdin)) >= 0) { + /* remove trailing newline */ + if (len > 0 && line[len - 1] == '\n') { + line[len - 1] = '\0'; + len--; + } + + /* skip empty lines and comment lines */ + if (len == 0 || line[0] == '#') { + continue; + } + + /* tokenize line into fields */ + for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) { + /* extend field buffer, if necessary */ + if (++nfields > fieldbufsize) { + if ((field = realloc(field, nfields * + sizeof(*field))) == NULL) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + fieldbufsize = nfields; + } + + /* skip leading whitespace */ + while (line[i] == ' ') { + i++; + } + + /* set current position as field start */ + field[nfields - 1] = &line[i]; + + /* continue until we reach ';' or '#' or end */ + while (line[i] != ';' && line[i] != '#' && + line[i] != '\0') { + i++; + } + if (line [i] == '#') { + /* set comment-variable for later 
*/ + comment = &line[i + 1]; + } + + /* go back whitespace and terminate field there */ + if (i > 0) { + for (j = i - 1; line[j] == ' '; j--) + ; + line[j + 1] = '\0'; + } else { + line[i] = '\0'; + } + + /* if comment is set, we are done */ + if (comment != NULL) { + break; + } + } + + /* skip leading whitespace in comment */ + while (comment != NULL && comment[0] == ' ') { + comment++; + } + + /* call line processing function */ + if (process_line(field, nfields, comment)) { + exit(1); + } + } + + free(line); + free(field); +} + +static int +valid_hexstring(const char *str) +{ + const char *p = str; + + while ((*p >= '0' && *p <= '9') || + (*p >= 'a' && *p <= 'f') || + (*p >= 'A' && *p <= 'F')) { + p++; + } + + if (*p != '\0') { + fprintf(stderr, "invalid code point range '%s'\n", str); + return 0; + } + + return 1; +} + +int +cp_parse(const char *str, uint32_t *cp) +{ + if (!valid_hexstring(str)) { + return 1; + } + *cp = strtol(str, NULL, 16); + + return 0; +} + +int +range_parse(const char *str, struct range *range) +{ + char *p; + + if ((p = strstr(str, "..")) == NULL) { + /* input has the form "XXXXXX" */ + if (!valid_hexstring(str)) { + return 1; + } + range->lower = range->upper = strtol(str, NULL, 16); + } else { + /* input has the form "XXXXXX..XXXXXX" */ + *p = '\0'; + p += 2; + if (!valid_hexstring(str) || !valid_hexstring(p)) { + return 1; + } + range->lower = strtol(str, NULL, 16); + range->upper = strtol(p, NULL, 16); + } + + return 0; +} + +void +range_list_append(struct range **range, size_t *nranges, const struct range *new) +{ + if (*nranges > 0 && (*range)[*nranges - 1].upper == new->lower) { + /* we can merge with previous entry */ + (*range)[*nranges - 1].upper = new->upper; + } else { + /* need to append new entry */ + if ((*range = realloc(*range, (++(*nranges)) * sizeof(**range))) == NULL) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + (*range)[*nranges - 1].lower = new->lower; + (*range)[*nranges - 1].upper = 
new->upper; + } +} diff --git a/data/datautil.h b/data/datautil.h @@ -0,0 +1,20 @@ +/* See LICENSE file for copyright and license details. */ +#ifndef DATAUTIL_H +#define DATAUTIL_H + +#include <stddef.h> +#include <stdint.h> + +#define LEN(x) (sizeof (x) / sizeof *(x)) + +struct range { + uint32_t lower; + uint32_t upper; +}; + +void parse_input(int (*process_line)(char **, size_t, char *)); +int cp_parse(const char *, uint32_t *); +int range_parse(const char *, struct range *); +void range_list_append(struct range **, size_t *, const struct range *); + +#endif /* DATAUTIL_H */ diff --git a/data/emoji.c b/data/emoji.c @@ -3,17 +3,18 @@ #include <stdio.h> #include <string.h> -#include "util.h" +#include "datautil.h" static struct { + char *enumname; char *identifier; - char *tablename; struct range *table; size_t tablelen; } properties[] = { { + /* extended pictographic */ + .enumname = "EMOJI_PROP_EXTPICT", .identifier = "Extended_Pictographic", - .tablename = "extpict_table", }, }; @@ -49,20 +50,29 @@ main(void) size_t i, j; printf("/* Automatically generated by data/emo */\n" - "#include <stdint.h>\n"); + "#include <stdint.h>\n\n#include \"../src/util.h\"\n\n"); parse_input(process_line); + /* output enum */ + printf("enum emoji_prop {\n"); for (i = 0; i < LEN(properties); i++) { - printf("\nstatic const uint32_t %s[][2] = {\n", - properties[i].tablename); + printf("\t%s,\n", properties[i].enumname); + } + printf("};\n\n"); + + /* output table */ + printf("static const struct range_list emoji_prop[] = {\n"); + for (i = 0; i < LEN(properties); i++) { + printf("\t[%s] = {\n\t\t.data = (struct range[]){\n", properties[i].enumname); for (j = 0; j < properties[i].tablelen; j++) { - printf("\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n", + printf("\t\t\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n", properties[i].table[j].lower, properties[i].table[j].upper); } - printf("};\n"); + printf("\t\t},\n\t\t.len = %zu,\n\t},\n", properties[i].tablelen); } + printf("};\n"); return 
0; } diff --git a/data/grapheme_boundary.c b/data/grapheme_boundary.c @@ -3,65 +3,78 @@ #include <stdio.h> #include <string.h> -#include "util.h" +#include "datautil.h" static struct { + char *enumname; char *identifier; - char *tablename; struct range *table; size_t tablelen; } properties[] = { { + /* carriage return */ + .enumname = "GB_PROP_CR", .identifier = "CR", - .tablename = "cr_table", }, { + /* line feed */ + .enumname = "GB_PROP_LF", .identifier = "LF", - .tablename = "lf_table", }, { + /* control character */ + .enumname = "GB_PROP_CONTROL", .identifier = "Control", - .tablename = "control_table", }, { + /* grapheme extender */ + .enumname = "GB_PROP_EXTEND", .identifier = "Extend", - .tablename = "extend_table", }, { + /* zero width joiner */ + .enumname = "GB_PROP_ZWJ", .identifier = "ZWJ", - .tablename = "zwj_table", }, { + /* regional indicator */ + .enumname = "GB_PROP_REGIONAL_INDICATOR", .identifier = "Regional_Indicator", - .tablename = "ri_table", }, { + /* prepend character */ + .enumname = "GB_PROP_PREPEND", .identifier = "Prepend", - .tablename = "prepend_table", }, { + /* spacing mark */ + .enumname = "GB_PROP_SPACINGMARK", .identifier = "SpacingMark", - .tablename = "spacingmark_table", }, { + /* hangul syllable type L */ + .enumname = "GB_PROP_L", .identifier = "L", - .tablename = "l_table", }, { + /* hangul syllable type V */ + .enumname = "GB_PROP_V", .identifier = "V", - .tablename = "v_table", }, { + /* hangul syllable type T */ + .enumname = "GB_PROP_T", .identifier = "T", - .tablename = "t_table", }, { + /* hangul syllable type LV */ + .enumname = "GB_PROP_LV", .identifier = "LV", - .tablename = "lv_table", }, { + /* hangul syllable type LVT */ + .enumname = "GB_PROP_LVT", .identifier = "LVT", - .tablename = "lvt_table", }, }; @@ -97,20 +110,29 @@ main(void) size_t i, j; printf("/* Automatically generated by data/gbp */\n" - "#include <stdint.h>\n"); + "#include <stdint.h>\n\n#include \"../src/util.h\"\n\n"); 
parse_input(process_line); + /* output enum */ + printf("enum gb_prop {\n"); for (i = 0; i < LEN(properties); i++) { - printf("\nstatic const uint32_t %s[][2] = {\n", - properties[i].tablename); + printf("\t%s,\n", properties[i].enumname); + } + printf("};\n\n"); + + /* output table */ + printf("static const struct range_list gb_prop[] = {\n"); + for (i = 0; i < LEN(properties); i++) { + printf("\t[%s] = {\n\t\t.data = (struct range[]){\n", properties[i].enumname); for (j = 0; j < properties[i].tablelen; j++) { - printf("\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n", + printf("\t\t\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n", properties[i].table[j].lower, properties[i].table[j].upper); } - printf("};\n"); + printf("\t\t},\n\t\t.len = %zu,\n\t},\n", properties[i].tablelen); } + printf("};\n"); return 0; } diff --git a/data/grapheme_boundary_test.c b/data/grapheme_boundary_test.c @@ -5,7 +5,7 @@ #include <string.h> #include <errno.h> -#include "util.h" +#include "datautil.h" struct break_test { uint32_t *cp; diff --git a/data/util.c b/data/util.c @@ -1,159 +0,0 @@ -/* See LICENSE file for copyright and license details. 
*/ -#include <stdint.h> -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <errno.h> - -#include "util.h" - -void -parse_input(int (*process_line)(char **, size_t, char *)) -{ - char *line = NULL, **field = NULL, *comment; - size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields; - ssize_t len; - - while ((len = getline(&line, &linebufsize, stdin)) >= 0) { - /* remove trailing newline */ - if (len > 0 && line[len - 1] == '\n') { - line[len - 1] = '\0'; - len--; - } - - /* skip empty lines and comment lines */ - if (len == 0 || line[0] == '#') { - continue; - } - - /* tokenize line into fields */ - for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) { - /* extend field buffer, if necessary */ - if (++nfields > fieldbufsize) { - if ((field = realloc(field, nfields * - sizeof(*field))) == NULL) { - fprintf(stderr, "realloc: %s\n", strerror(errno)); - exit(1); - } - fieldbufsize = nfields; - } - - /* skip leading whitespace */ - while (line[i] == ' ') { - i++; - } - - /* set current position as field start */ - field[nfields - 1] = &line[i]; - - /* continue until we reach ';' or '#' or end */ - while (line[i] != ';' && line[i] != '#' && - line[i] != '\0') { - i++; - } - if (line [i] == '#') { - /* set comment-variable for later */ - comment = &line[i + 1]; - } - - /* go back whitespace and terminate field there */ - if (i > 0) { - for (j = i - 1; line[j] == ' '; j--) - ; - line[j + 1] = '\0'; - } else { - line[i] = '\0'; - } - - /* if comment is set, we are done */ - if (comment != NULL) { - break; - } - } - - /* skip leading whitespace in comment */ - while (comment != NULL && comment[0] == ' ') { - comment++; - } - - /* call line processing function */ - if (process_line(field, nfields, comment)) { - exit(1); - } - } - - free(line); - free(field); -} - -static int -valid_hexstring(const char *str) -{ - const char *p = str; - - while ((*p >= '0' && *p <= '9') || - (*p >= 'a' && *p <= 'f') || - (*p >= 'A' && *p <= 'F')) { - p++; - } - - 
if (*p != '\0') { - fprintf(stderr, "invalid code point range '%s'\n", str); - return 0; - } - - return 1; -} - -int -cp_parse(const char *str, uint32_t *cp) -{ - if (!valid_hexstring(str)) { - return 1; - } - *cp = strtol(str, NULL, 16); - - return 0; -} - -int -range_parse(const char *str, struct range *range) -{ - char *p; - - if ((p = strstr(str, "..")) == NULL) { - /* input has the form "XXXXXX" */ - if (!valid_hexstring(str)) { - return 1; - } - range->lower = range->upper = strtol(str, NULL, 16); - } else { - /* input has the form "XXXXXX..XXXXXX" */ - *p = '\0'; - p += 2; - if (!valid_hexstring(str) || !valid_hexstring(p)) { - return 1; - } - range->lower = strtol(str, NULL, 16); - range->upper = strtol(p, NULL, 16); - } - - return 0; -} - -void -range_list_append(struct range **range, size_t *nranges, const struct range *new) -{ - if (*nranges > 0 && (*range)[*nranges - 1].upper == new->lower) { - /* we can merge with previous entry */ - (*range)[*nranges - 1].upper = new->upper; - } else { - /* need to append new entry */ - if ((*range = realloc(*range, (++(*nranges)) * sizeof(**range))) == NULL) { - fprintf(stderr, "realloc: %s\n", strerror(errno)); - exit(1); - } - (*range)[*nranges - 1].lower = new->lower; - (*range)[*nranges - 1].upper = new->upper; - } -} diff --git a/data/util.h b/data/util.h @@ -1,20 +0,0 @@ -/* See LICENSE file for copyright and license details. 
*/ -#ifndef UTIL_H -#define UTIL_H - -#include <stddef.h> -#include <stdint.h> - -#define LEN(x) (sizeof (x) / sizeof *(x)) - -struct range { - uint32_t lower; - uint32_t upper; -}; - -void parse_input(int (*process_line)(char **, size_t, char *)); -int cp_parse(const char *, uint32_t *); -int range_parse(const char *, struct range *); -void range_list_append(struct range **, size_t *, const struct range *); - -#endif /* UTIL_H */ diff --git a/src/boundary.c b/src/boundary.c @@ -6,98 +6,11 @@ #include "../data/emoji.h" #include "../data/grapheme_boundary.h" -#define LEN(x) (sizeof(x) / sizeof(*x)) - enum { GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */ GRAPHEME_STATE_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */ }; -enum cp_property { - PROP_CR, /* carriage return */ - PROP_LF, /* line feed */ - PROP_CONTROL, /* control character */ - PROP_EXTEND, /* grapheme extender (TODO Emoji_Modifier=Yes) */ - PROP_ZWJ, /* zero width joiner */ - PROP_RI, /* regional indicator */ - PROP_PREPEND, /* prepend character */ - PROP_SPACINGMARK, /* spacing mark */ - PROP_L, /* hangul syllable type L */ - PROP_V, /* hangul syllable type V */ - PROP_T, /* hangul syllable type T */ - PROP_LV, /* hangul syllable type LV */ - PROP_LVT, /* hangul syllable type LVT */ - PROP_EXTPICT, /* extended pictographic */ -}; - -struct { - const uint32_t (*table)[2]; - size_t tablelen; -} cp_property_tables[] = { - [PROP_CR] = { - .table = cr_table, - .tablelen = LEN(cr_table), - }, - [PROP_LF] = { - .table = lf_table, - .tablelen = LEN(lf_table), - }, - [PROP_CONTROL] = { - .table = control_table, - .tablelen = LEN(control_table), - }, - [PROP_EXTEND] = { - .table = extend_table, - .tablelen = LEN(extend_table), - }, - [PROP_ZWJ] = { - .table = zwj_table, - .tablelen = LEN(zwj_table), - }, - [PROP_RI] = { - .table = ri_table, - .tablelen = LEN(ri_table), - }, - [PROP_PREPEND] = { - .table = prepend_table, - .tablelen = LEN(prepend_table), - }, - 
[PROP_SPACINGMARK] = { - .table = spacingmark_table, - .tablelen = LEN(spacingmark_table), - }, - [PROP_L] = { - .table = l_table, - .tablelen = LEN(l_table), - }, - [PROP_V] = { - .table = v_table, - .tablelen = LEN(v_table), - }, - [PROP_T] = { - .table = t_table, - .tablelen = LEN(t_table), - }, - [PROP_LV] = { - .table = lv_table, - .tablelen = LEN(lv_table), - }, - [PROP_LVT] = { - .table = lvt_table, - .tablelen = LEN(lvt_table), - }, - [PROP_EXTPICT] = { - .table = extpict_table, - .tablelen = LEN(extpict_table), - }, -}; - -struct cp_properties { - uint32_t cp; - int_least16_t determined; - int_least16_t state; -}; - static int cp_cmp(const void *a, const void *b) { @@ -108,37 +21,25 @@ cp_cmp(const void *a, const void *b) } static int -has_property(struct cp_properties *props, enum cp_property p) +has_property(uint32_t cp, struct heisenstate *cpstate, + const struct range_list *proptable, int property) { - if (!(props->determined & (1 << p))) { - /* not determined yet, do a lookup and set the state */ - if (bsearch(&props->cp, cp_property_tables[p].table, - cp_property_tables[p].tablelen, - sizeof(*cp_property_tables[p].table), - cp_cmp)) { - props->state |= (1 << p); - } else { - props->state &= ~(1 << p); - } - - /* now it's determined */ - props->determined |= (1 << p); + if (heisenstate_get(cpstate, property) == -1) { + /* state undetermined, make a lookup and set it */ + heisenstate_set(cpstate, property, bsearch(&cp, + proptable[property].data, + proptable[property].len, + sizeof(*proptable[property].data), + cp_cmp) ? 
1 : 0); } - return (props->state & (1 << p)); + return heisenstate_get(cpstate, property); } int grapheme_boundary(uint32_t a, uint32_t b, int *state) { - struct cp_properties props[] = { - { - .cp = a, - }, - { - .cp = b, - }, - }; + struct heisenstate gb[2] = { 0 }, emoji[2] = { 0 }; int s; /* skip printable ASCII */ @@ -158,8 +59,8 @@ grapheme_boundary(uint32_t a, uint32_t b, int *state) /* * update state */ - if (has_property(&props[1], PROP_RI)) { - if (has_property(&props[0], PROP_RI)) { + if (has_property(b, &gb[1], gb_prop, GB_PROP_REGIONAL_INDICATOR)) { + if (has_property(a, &gb[0], gb_prop, GB_PROP_REGIONAL_INDICATOR)) { /* one more RI is on the left side of the seam */ s ^= GRAPHEME_STATE_RI_ODD; } else { @@ -169,22 +70,22 @@ grapheme_boundary(uint32_t a, uint32_t b, int *state) } } if (!(*state & GRAPHEME_STATE_EMOJI) && - ((has_property(&props[0], PROP_EXTPICT) && - has_property(&props[1], PROP_ZWJ)) || - (has_property(&props[0], PROP_EXTPICT) && - has_property(&props[1], PROP_EXTEND)))) { + ((has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) && + has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) || + (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) && + has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND)))) { s |= GRAPHEME_STATE_EMOJI; } else if ((*state & GRAPHEME_STATE_EMOJI) && - ((has_property(&props[0], PROP_ZWJ) && - has_property(&props[1], PROP_EXTPICT)) || - (has_property(&props[0], PROP_EXTEND) && - has_property(&props[1], PROP_EXTEND)) || - (has_property(&props[0], PROP_EXTEND) && - has_property(&props[1], PROP_ZWJ)) || - (has_property(&props[0], PROP_EXTPICT) && - has_property(&props[1], PROP_ZWJ)) || - (has_property(&props[0], PROP_EXTPICT) && - has_property(&props[1], PROP_EXTEND)))) { + ((has_property(a, &gb[0], gb_prop, GB_PROP_ZWJ) && + has_property(b, &emoji[1], emoji_prop, EMOJI_PROP_EXTPICT)) || + (has_property(a, &gb[0], gb_prop, GB_PROP_EXTEND) && + has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND)) || + 
(has_property(a, &gb[0], gb_prop, GB_PROP_EXTEND) && + has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) || + (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) && + has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) || + (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) && + has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND)))) { /* GRAPHEME_STATE_EMOJI remains */ } else { s &= ~GRAPHEME_STATE_EMOJI; @@ -202,75 +103,75 @@ grapheme_boundary(uint32_t a, uint32_t b, int *state) /* skip GB1 and GB2, as they are never satisfied here */ /* GB3 */ - if (has_property(&props[0], PROP_CR) && - has_property(&props[1], PROP_LF)) { + if (has_property(a, &gb[0], gb_prop, GB_PROP_CR) && + has_property(b, &gb[1], gb_prop, GB_PROP_LF)) { return 0; } /* GB4 */ - if (has_property(&props[0], PROP_CONTROL) || - has_property(&props[0], PROP_CR) || - has_property(&props[0], PROP_LF)) { + if (has_property(a, &gb[0], gb_prop, GB_PROP_CONTROL) || + has_property(a, &gb[0], gb_prop, GB_PROP_CR) || + has_property(a, &gb[0], gb_prop, GB_PROP_LF)) { return 1; } /* GB5 */ - if (has_property(&props[1], PROP_CONTROL) || - has_property(&props[1], PROP_CR) || - has_property(&props[1], PROP_LF)) { + if (has_property(b, &gb[1], gb_prop, GB_PROP_CONTROL) || + has_property(b, &gb[1], gb_prop, GB_PROP_CR) || + has_property(b, &gb[1], gb_prop, GB_PROP_LF)) { return 1; } /* GB6 */ - if (has_property(&props[0], PROP_L) && - (has_property(&props[1], PROP_L) || - has_property(&props[1], PROP_V) || - has_property(&props[1], PROP_LV) || - has_property(&props[1], PROP_LVT))) { + if (has_property(a, &gb[0], gb_prop, GB_PROP_L) && + (has_property(b, &gb[1], gb_prop, GB_PROP_L) || + has_property(b, &gb[1], gb_prop, GB_PROP_V) || + has_property(b, &gb[1], gb_prop, GB_PROP_LV) || + has_property(b, &gb[1], gb_prop, GB_PROP_LVT))) { return 0; } /* GB7 */ - if ((has_property(&props[0], PROP_LV) || - has_property(&props[0], PROP_V)) && - (has_property(&props[1], PROP_V) || - has_property(&props[1], PROP_T))) { 
+ if ((has_property(a, &gb[0], gb_prop, GB_PROP_LV) || + has_property(a, &gb[0], gb_prop, GB_PROP_V)) && + (has_property(b, &gb[1], gb_prop, GB_PROP_V) || + has_property(b, &gb[1], gb_prop, GB_PROP_T))) { return 0; } /* GB8 */ - if ((has_property(&props[0], PROP_LVT) || - has_property(&props[0], PROP_T)) && - has_property(&props[1], PROP_T)) { + if ((has_property(a, &gb[0], gb_prop, GB_PROP_LVT) || + has_property(a, &gb[0], gb_prop, GB_PROP_T)) && + has_property(b, &gb[1], gb_prop, GB_PROP_T)) { return 0; } /* GB9 */ - if (has_property(&props[1], PROP_EXTEND) || - has_property(&props[1], PROP_ZWJ)) { + if (has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND) || + has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) { return 0; } /* GB9a */ - if (has_property(&props[1], PROP_SPACINGMARK)) { + if (has_property(b, &gb[1], gb_prop, GB_PROP_SPACINGMARK)) { return 0; } /* GB9b */ - if (has_property(&props[0], PROP_PREPEND)) { + if (has_property(a, &gb[0], gb_prop, GB_PROP_PREPEND)) { return 0; } /* GB11 */ if ((s & GRAPHEME_STATE_EMOJI) && - has_property(&props[0], PROP_ZWJ) && - has_property(&props[1], PROP_EXTPICT)) { + has_property(a, &gb[0], gb_prop, GB_PROP_ZWJ) && + has_property(b, &emoji[1], emoji_prop, EMOJI_PROP_EXTPICT)) { return 0; } /* GB12/GB13 */ - if (has_property(&props[0], PROP_RI) && - has_property(&props[1], PROP_RI) && + if (has_property(a, &gb[0], gb_prop, GB_PROP_REGIONAL_INDICATOR) && + has_property(b, &gb[1], gb_prop, GB_PROP_REGIONAL_INDICATOR) && (s & GRAPHEME_STATE_RI_ODD)) { return 0; } diff --git a/src/util.c b/src/util.c @@ -0,0 +1,33 @@ +/* See LICENSE file for copyright and license details. */ +#include "util.h" + +int +heisenstate_get(struct heisenstate *h, int slot) +{ + if (h == NULL || slot >= 16 || slot < 0 || + !(h->determined & (1 << slot))) { + /* no state given, slot out of range or undetermined */ + return -1; + } else { + /* slot determined, return state (0 or 1) */ + return (h->state & (1 << slot)) ? 
1 : 0; + } +} + +int +heisenstate_set(struct heisenstate *h, int slot, int state) +{ + if (h == NULL || slot >= 16 || slot < 0) { + /* no state given or slot out of range */ + return 1; + } else { + h->determined |= (1 << slot); + if (state) { + h->state |= (1 << slot); + } else { + h->state &= ~(1 << slot); + } + } + + return 0; +} diff --git a/src/util.h b/src/util.h @@ -0,0 +1,29 @@ +/* See LICENSE file for copyright and license details. */ +#ifndef UTIL_H +#define UTIL_H + +#include <stddef.h> +#include <stdint.h> + +#define LEN(x) (sizeof (x) / sizeof *(x)) + +struct range { + uint32_t lower; + uint32_t upper; +}; + +struct range_list { + struct range *data; + size_t len; +}; + +/* 16-slot (0,...,15) optionally undetermined binary state */ +struct heisenstate { + uint_least16_t determined; + uint_least16_t state; +}; + +int heisenstate_get(struct heisenstate *, int); +int heisenstate_set(struct heisenstate *, int, int); + +#endif /* UTIL_H */