libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

commit 65b354f0fcb1d925f4340dbb4415ea06e8af2bec
parent 3ee106e4ab1d5fe4696ab9089f052706d7cb9a48
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun,  1 Sep 2024 22:42:18 +0200

Update grapheme break algorithm to Unicode version 15.1.0

While the change to the algorithm looks harmless in the specification,
it comes at the price of more complexity because we have to keep track
of a relatively complex state for a sequence of Indic conjunct breaks.

Fortunately, adding so many additional classes only decreases the
compression ratio for the grapheme cluster LUTs by ~0.5%.

We now pass all 1187 character tests.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
MMakefile | 4++--
Mgen/character.c | 104++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mgen/util.c | 5++++-
Mgen/util.h | 1+
Msrc/character.c | 376+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
5 files changed, 409 insertions(+), 81 deletions(-)

diff --git a/Makefile b/Makefile @@ -196,7 +196,7 @@ src/sentence.o: src/sentence.c Makefile config.mk gen/sentence.h grapheme.h src/ src/utf8.o: src/utf8.c Makefile config.mk grapheme.h src/util.o: src/util.c Makefile config.mk gen/types.h grapheme.h src/util.h src/word.o: src/word.c Makefile config.mk gen/word.h grapheme.h src/util.h -test/bidirectional.o: test/bidirectional.c Makefile config.mk gen/bidirectional-test.h grapheme.h test/util.h +test/bidirectional.o: test/bidirectional.c Makefile config.mk gen/bidirectional.h gen/bidirectional-test.h grapheme.h test/util.h test/case.o: test/case.c Makefile config.mk grapheme.h test/util.h test/character.o: test/character.c Makefile config.mk gen/character-test.h grapheme.h test/util.h test/line.o: test/line.c Makefile config.mk gen/line-test.h grapheme.h test/util.h @@ -236,7 +236,7 @@ test/word$(BINSUFFIX): test/word.o test/util.o $(ANAME) gen/bidirectional.h: data/BidiBrackets.txt data/BidiMirroring.txt data/DerivedBidiClass.txt data/UnicodeData.txt gen/bidirectional$(BINSUFFIX) gen/bidirectional-test.h: data/BidiCharacterTest.txt data/BidiTest.txt gen/bidirectional-test$(BINSUFFIX) gen/case.h: data/DerivedCoreProperties.txt data/UnicodeData.txt data/SpecialCasing.txt gen/case$(BINSUFFIX) -gen/character.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/character$(BINSUFFIX) +gen/character.h: data/DerivedCoreProperties.txt data/emoji-data.txt data/GraphemeBreakProperty.txt gen/character$(BINSUFFIX) gen/character-test.h: data/GraphemeBreakTest.txt gen/character-test$(BINSUFFIX) gen/line.h: data/emoji-data.txt data/EastAsianWidth.txt data/LineBreak.txt gen/line$(BINSUFFIX) gen/line-test.h: data/LineBreakTest.txt gen/line-test$(BINSUFFIX) diff --git a/gen/character.c b/gen/character.c @@ -1,8 +1,12 @@ /* See LICENSE file for copyright and license details. 
*/ #include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #include "util.h" +#define FILE_DCP "data/DerivedCoreProperties.txt" #define FILE_EMOJI "data/emoji-data.txt" #define FILE_GRAPHEME "data/GraphemeBreakProperty.txt" @@ -13,6 +17,21 @@ static const struct property_spec char_break_property[] = { .ucdname = NULL, }, { + .enumname = "BOTH_EXTEND_ICB_EXTEND", + .file = NULL, + .ucdname = NULL, + }, + { + .enumname = "BOTH_EXTEND_ICB_LINKER", + .file = NULL, + .ucdname = NULL, + }, + { + .enumname = "BOTH_ZWJ_ICB_EXTEND", + .file = NULL, + .ucdname = NULL, + }, + { .enumname = "CONTROL", .file = FILE_GRAPHEME, .ucdname = "Control", @@ -58,6 +77,24 @@ static const struct property_spec char_break_property[] = { .ucdname = "LVT", }, { + .enumname = "ICB_CONSONANT", + .file = FILE_DCP, + .ucdname = "InCB", + .ucdsubname = "Consonant", + }, + { + .enumname = "ICB_EXTEND", + .file = FILE_DCP, + .ucdname = "InCB", + .ucdsubname = "Extend", + }, + { + .enumname = "ICB_LINKER", + .file = FILE_DCP, + .ucdname = "InCB", + .ucdsubname = "Linker", + }, + { .enumname = "LF", .file = FILE_GRAPHEME, .ucdname = "LF", @@ -84,14 +121,75 @@ static const struct property_spec char_break_property[] = { }, }; +static uint_least8_t +handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2) +{ + uint_least8_t result; + + (void)cp; + + if ((!strcmp(char_break_property[prop1].enumname, "EXTEND") && + !strcmp(char_break_property[prop2].enumname, "ICB_EXTEND")) || + (!strcmp(char_break_property[prop1].enumname, "ICB_EXTEND") && + !strcmp(char_break_property[prop2].enumname, "EXTEND"))) { + for (result = 0; result < LEN(char_break_property); result++) { + if (!strcmp(char_break_property[result].enumname, + "BOTH_EXTEND_ICB_EXTEND")) { + break; + } + } + if (result == LEN(char_break_property)) { + fprintf(stderr, "handle_conflict: Internal error.\n"); + exit(1); + } + } else if ((!strcmp(char_break_property[prop1].enumname, "EXTEND") && + 
!strcmp(char_break_property[prop2].enumname, + "ICB_LINKER")) || + (!strcmp(char_break_property[prop1].enumname, + "ICB_LINKER") && + !strcmp(char_break_property[prop2].enumname, "EXTEND"))) { + for (result = 0; result < LEN(char_break_property); result++) { + if (!strcmp(char_break_property[result].enumname, + "BOTH_EXTEND_ICB_LINKER")) { + break; + } + } + if (result == LEN(char_break_property)) { + fprintf(stderr, "handle_conflict: Internal error.\n"); + exit(1); + } + } else if ((!strcmp(char_break_property[prop1].enumname, "ZWJ") && + !strcmp(char_break_property[prop2].enumname, + "ICB_EXTEND")) || + (!strcmp(char_break_property[prop1].enumname, + "ICB_EXTEND") && + !strcmp(char_break_property[prop2].enumname, "ZWJ"))) { + for (result = 0; result < LEN(char_break_property); result++) { + if (!strcmp(char_break_property[result].enumname, + "BOTH_ZWJ_ICB_EXTEND")) { + break; + } + } + if (result == LEN(char_break_property)) { + fprintf(stderr, "handle_conflict: Internal error.\n"); + exit(1); + } + } else { + fprintf(stderr, "handle_conflict: Cannot handle conflict.\n"); + exit(1); + } + + return result; +} + int main(int argc, char *argv[]) { (void)argc; - properties_generate_break_property(char_break_property, - LEN(char_break_property), NULL, NULL, - NULL, "char_break", argv[0]); + properties_generate_break_property( + char_break_property, LEN(char_break_property), NULL, + handle_conflict, NULL, "char_break", argv[0]); return 0; } diff --git a/gen/util.c b/gen/util.c @@ -317,7 +317,10 @@ properties_callback(const char *file, char **field, size_t nfields, (comment != NULL && !strncmp(p->spec[i].ucdname, comment, strlen(p->spec[i].ucdname)) && - comment[strlen(p->spec[i].ucdname)] == ' '))) { + comment[strlen(p->spec[i].ucdname)] == ' ')) && + (p->spec[i].ucdsubname == NULL || + (nfields >= 3 && + !strcmp(p->spec[i].ucdsubname, field[2])))) { /* parse range in first field */ if (range_parse(field[0], &r)) { return 1; diff --git a/gen/util.h b/gen/util.h @@ 
-13,6 +13,7 @@ struct property_spec { const char *enumname; const char *file; const char *ucdname; + const char *ucdsubname; }; struct properties { diff --git a/src/character.c b/src/character.c @@ -1,3 +1,5 @@ +#include <stdio.h> + /* See LICENSE file for copyright and license details. */ #include <limits.h> #include <stdbool.h> @@ -12,97 +14,239 @@ struct character_break_state { bool prop_set; bool gb11_flag; bool gb12_13_flag; + uint_least8_t gb9c_level; }; -static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = { +static const uint_least32_t dont_break[NUM_CHAR_BREAK_PROPS] = { [CHAR_BREAK_PROP_OTHER] = - UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ - [CHAR_BREAK_PROP_CR] = UINT16_C(1) << CHAR_BREAK_PROP_LF, /* GB3 */ + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + [CHAR_BREAK_PROP_ICB_CONSONANT] = + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + [CHAR_BREAK_PROP_ICB_EXTEND] = + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, 
/* GB9a */ + [CHAR_BREAK_PROP_ICB_LINKER] = + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + [CHAR_BREAK_PROP_CR] = UINT32_C(1) << CHAR_BREAK_PROP_LF, /* GB3 */ [CHAR_BREAK_PROP_EXTEND] = - UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND] = + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER] = + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = - UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ - UINT16_C(1) 
<< CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_HANGUL_L] = - UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */ - UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */ - UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */ - UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */ - UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */ + UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */ + UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */ + UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */ + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_HANGUL_V] = - UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ - UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ - UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ + UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + 
UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_HANGUL_T] = - UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ - UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_HANGUL_LV] = - UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ - UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ - UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ + UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_HANGUL_LVT] = - UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ - UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << 
CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_PREPEND] = - UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */ - (UINT16_C(0xFFFF) & - ~(UINT16_C(1) << CHAR_BREAK_PROP_CR | - UINT16_C(1) << CHAR_BREAK_PROP_LF | - UINT16_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */ + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */ + (UINT32_C(0xFFFFFFFF) & + ~(UINT32_C(1) << CHAR_BREAK_PROP_CR | + UINT32_C(1) << CHAR_BREAK_PROP_LF | + UINT32_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */ [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = - UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_SPACINGMARK] = - UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << 
CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_ZWJ] = - UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ - UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND] = + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ + }; -static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = { +static const uint_least32_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = { [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = - UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | - UINT16_C(1) << CHAR_BREAK_PROP_EXTEND, + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT32_C(1) + << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER, /* GB9 */ [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = - UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, + UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, + 
[CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] = + UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] = - UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | - UINT16_C(1) << CHAR_BREAK_PROP_ZWJ, + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND, + [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] = + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND, + [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER + NUM_CHAR_BREAK_PROPS] = + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND, [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] = - UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | - UINT16_C(1) << CHAR_BREAK_PROP_EXTEND, + UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | + UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | + UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER, }; -static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = { +static const uint_least32_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = { [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = - UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, + UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, + [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] = + UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, }; -static const uint_least16_t 
flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { +static const uint_least32_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = - UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, + UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, }; -static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { +static const uint_least32_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] = - UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, + UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, }; static inline enum char_break_property @@ -126,7 +270,9 @@ state_serialize(const struct character_break_state *in, uint_least16_t *out) (uint_least16_t)(((uint_least16_t)(in->gb11_flag)) << 9) | /* 10th bit */ (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) - << 10); /* 11th bit */ + << 10) | /* 11th bit */ + (uint_least16_t)(((uint_least16_t)(in->gb9c_level & 0x3)) + << 11); /* 12th and 13th bit */ } static inline void @@ -136,6 +282,7 @@ state_deserialize(uint_least16_t in, struct character_break_state *out) out->prop_set = in & (UINT16_C(1) << 8); out->gb11_flag = in & (UINT16_C(1) << 9); out->gb12_13_flag = in & (UINT16_C(1) << 10); + out->gb9c_level = (uint_least8_t)(in >> 11) & UINT8_C(0x3); } bool @@ -164,26 +311,105 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, state.gb11_flag = flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS * state.gb11_flag] & - UINT16_C(1) << cp1_prop; + UINT32_C(1) << cp1_prop; state.gb12_13_flag = flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS * state.gb12_13_flag] & - UINT16_C(1) << cp1_prop; + UINT32_C(1) << cp1_prop; + + /* + * update GB9c state, which deals with indic conjunct breaks. 
+ * We want to detect the following prefix: + * + * ICB_CONSONANT + * [ICB_EXTEND ICB_LINKER]* + * ICB_LINKER + * [ICB_EXTEND ICB_LINKER]* + * + * This representation is not ideal: In reality, what is + * meant is that the prefix is a sequence of [ICB_EXTEND + * ICB_LINKER]*, following an ICB_CONSONANT, that contains at + * least one ICB_LINKER. We thus use the following equivalent + * representation that allows us to store the levels 0..3 in 2 + * bits. + * + * ICB_CONSONANT -- Level 1 + * ICB_EXTEND* -- Level 2 + * ICB_LINKER -- Level 3 + * [ICB_EXTEND ICB_LINKER]* -- Level 3 + * + * The following chain of if-else-blocks is a bit redundant and + * of course could be optimised, but this is kept as is for + * best readability. + */ + if (state.gb9c_level == 0 && + cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) { + /* the sequence has begun */ + state.gb9c_level = 1; + } else if ((state.gb9c_level == 1 || state.gb9c_level == 2) && + (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND || + cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND || + cp0_prop == + CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND)) { + /* + * either the level is 1 and thus the ICB consonant is + * followed by an ICB extend, where we jump + * to level 2, or we are at level 2 and just witness + * more ICB extends, staying at level 2. 
+ */ + state.gb9c_level = 2; + } else if ((state.gb9c_level == 1 || state.gb9c_level == 2) && + (cp0_prop == CHAR_BREAK_PROP_ICB_LINKER || + cp0_prop == + CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) { + /* + * witnessing an ICB linker directly lifts us up to + * level 3 + */ + state.gb9c_level = 3; + } else if (state.gb9c_level == 3 && + (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND || + cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND || + cp0_prop == + CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND || + cp0_prop == CHAR_BREAK_PROP_ICB_LINKER || + cp0_prop == + CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) { + /* + * we stay at level 3 when we observe either ICB + * extends or linkers + */ + state.gb9c_level = 3; + } else { + /* + * the sequence has collapsed, but it could be + * that the left property is ICB consonant, which + * means that we jump right back to level 1 instead + * of 0 + */ + if (cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) { + state.gb9c_level = 1; + } else { + state.gb9c_level = 0; + } + } /* * Apply grapheme cluster breaking algorithm (UAX #29), see * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules */ - notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) || + notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) || + (state.gb9c_level == 3 && + cp1_prop == CHAR_BREAK_PROP_ICB_CONSONANT) || (dont_break_gb11[cp0_prop + state.gb11_flag * NUM_CHAR_BREAK_PROPS] & - (UINT16_C(1) << cp1_prop)) || + (UINT32_C(1) << cp1_prop)) || (dont_break_gb12_13[cp0_prop + state.gb12_13_flag * NUM_CHAR_BREAK_PROPS] & - (UINT16_C(1) << cp1_prop)); + (UINT32_C(1) << cp1_prop)); /* update or reset flags (when we have a break) */ if (likely(!notbreak)) { @@ -202,11 +428,11 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, * Given we have no state, this behaves as if the state-booleans * were all set to false */ - notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) || + notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << 
cp1_prop)) || (dont_break_gb11[cp0_prop] & - (UINT16_C(1) << cp1_prop)) || + (UINT32_C(1) << cp1_prop)) || (dont_break_gb12_13[cp0_prop] & - (UINT16_C(1) << cp1_prop)); + (UINT32_C(1) << cp1_prop)); } return !notbreak;