commit efb2f452b6d1327ba091ac8a69556a060401afed
parent f2783665bc71b9b1f1b72830629c3724bd8e1ae4
Author: Laslo Hunhold <dev@frign.de>
Date: Thu, 13 Oct 2022 23:54:28 +0200
Merge branch 'master' into bidirectional
This brings this branch up to speed with the previous work.
Signed-off-by: Laslo Hunhold <dev@frign.de>
Diffstat:
70 files changed, 4228 insertions(+), 1827 deletions(-)
diff --git a/Makefile b/Makefile
@@ -1,9 +1,18 @@
# See LICENSE file for copyright and license details
# libgrapheme - unicode string library
.POSIX:
+.SUFFIXES:
+
+VERSION_MAJOR = 2
+VERSION_MINOR = 0
+VERSION_PATCH = 1
+UNICODE_VERSION = 15.0.0
+MAN_DATE = 2022-10-06
include config.mk
+VERSION = $(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)
+
BENCHMARK =\
benchmark/case\
benchmark/character\
@@ -52,6 +61,7 @@ SRC =\
src/word\
TEST =\
+ test/case\
test/character\
test/line\
test/sentence\
@@ -59,48 +69,120 @@ TEST =\
test/utf8-encode\
test/word\
+MAN_TEMPLATE =\
+ man/template/is_case.sh\
+ man/template/next_break.sh\
+ man/template/to_case.sh\
+
MAN3 =\
- man/grapheme_decode_utf8.3\
- man/grapheme_encode_utf8.3\
- man/grapheme_is_character_break.3\
-
-MAN7 = man/libgrapheme.7
-
-all: data/LICENSE libgrapheme.a libgrapheme.so
-
-benchmark/case.o: benchmark/case.c config.mk gen/word-test.h grapheme.h benchmark/util.h
-benchmark/character.o: benchmark/character.c config.mk gen/character-test.h grapheme.h benchmark/util.h
-benchmark/line.o: benchmark/line.c config.mk gen/line-test.h grapheme.h benchmark/util.h
-benchmark/utf8-decode.o: benchmark/utf8-decode.c config.mk gen/character-test.h grapheme.h benchmark/util.h
-benchmark/sentence.o: benchmark/sentence.c config.mk gen/sentence-test.h grapheme.h benchmark/util.h
-benchmark/util.o: benchmark/util.c config.mk benchmark/util.h
-benchmark/word.o: benchmark/word.c config.mk gen/word-test.h grapheme.h benchmark/util.h
-gen/bidirectional.o: gen/bidirectional.c config.mk gen/util.h
-gen/case.o: gen/case.c config.mk gen/util.h
-gen/character.o: gen/character.c config.mk gen/util.h
-gen/character-test.o: gen/character-test.c config.mk gen/util.h
-gen/line.o: gen/line.c config.mk gen/util.h
-gen/line-test.o: gen/line-test.c config.mk gen/util.h
-gen/sentence.o: gen/sentence.c config.mk gen/util.h
-gen/sentence-test.o: gen/sentence-test.c config.mk gen/util.h
-gen/word.o: gen/word.c config.mk gen/util.h
-gen/word-test.o: gen/word-test.c config.mk gen/util.h
-gen/util.o: gen/util.c config.mk gen/util.h
-src/bidirectional.o: src/bidirectional.c config.mk gen/bidirectional.h grapheme.h src/util.h
-src/case.o: src/case.c config.mk gen/case.h grapheme.h src/util.h
-src/character.o: src/character.c config.mk gen/character.h grapheme.h src/util.h
-src/line.o: src/line.c config.mk gen/line.h grapheme.h src/util.h
-src/sentence.o: src/sentence.c config.mk gen/sentence.h grapheme.h src/util.h
-src/utf8.o: src/utf8.c config.mk grapheme.h
-src/util.o: src/util.c config.mk gen/types.h grapheme.h src/util.h
-src/word.o: src/word.c config.mk gen/word.h grapheme.h src/util.h
-test/character.o: test/character.c config.mk gen/character-test.h grapheme.h test/util.h
-test/line.o: test/line.c config.mk gen/line-test.h grapheme.h test/util.h
-test/sentence.o: test/sentence.c config.mk gen/sentence-test.h grapheme.h test/util.h
-test/utf8-encode.o: test/utf8-encode.c config.mk grapheme.h test/util.h
-test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h test/util.h
-test/util.o: test/util.c config.mk test/util.h
-test/word.o: test/word.c config.mk gen/word-test.h grapheme.h test/util.h
+ man/grapheme_decode_utf8\
+ man/grapheme_encode_utf8\
+ man/grapheme_is_character_break\
+ man/grapheme_is_uppercase\
+ man/grapheme_is_uppercase_utf8\
+ man/grapheme_is_lowercase\
+ man/grapheme_is_lowercase_utf8\
+ man/grapheme_is_titlecase\
+ man/grapheme_is_titlecase_utf8\
+ man/grapheme_next_character_break\
+ man/grapheme_next_line_break\
+ man/grapheme_next_sentence_break\
+ man/grapheme_next_word_break\
+ man/grapheme_next_character_break_utf8\
+ man/grapheme_next_line_break_utf8\
+ man/grapheme_next_sentence_break_utf8\
+ man/grapheme_next_word_break_utf8\
+ man/grapheme_to_uppercase\
+ man/grapheme_to_uppercase_utf8\
+ man/grapheme_to_lowercase\
+ man/grapheme_to_lowercase_utf8\
+ man/grapheme_to_titlecase\
+ man/grapheme_to_titlecase_utf8\
+
+MAN7 =\
+ man/libgrapheme\
+
+all: data/LICENSE $(MAN3:=.3) $(MAN7:=.7) libgrapheme.a $(SONAME)
+
+# DerivedBidiClass.txt is published in the UCD's extracted/ subdirectory
+data/DerivedBidiClass.txt:
+ wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/extracted/DerivedBidiClass.txt
+
+data/DerivedCoreProperties.txt:
+ wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/DerivedCoreProperties.txt
+
+data/EastAsianWidth.txt:
+ wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/EastAsianWidth.txt
+
+data/emoji-data.txt:
+ wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/emoji/emoji-data.txt
+
+data/GraphemeBreakProperty.txt:
+ wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakProperty.txt
+
+data/GraphemeBreakTest.txt:
+ wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakTest.txt
+
+data/LICENSE:
+ wget -O $@ https://www.unicode.org/license.txt
+
+data/LineBreak.txt:
+ wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/LineBreak.txt
+
+data/LineBreakTest.txt:
+ wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/LineBreakTest.txt
+
+data/SentenceBreakProperty.txt:
+ wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/SentenceBreakProperty.txt
+
+data/SentenceBreakTest.txt:
+ wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/SentenceBreakTest.txt
+
+data/SpecialCasing.txt:
+ wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/SpecialCasing.txt
+
+data/UnicodeData.txt:
+ wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
+
+data/WordBreakProperty.txt:
+ wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/WordBreakProperty.txt
+
+data/WordBreakTest.txt:
+ wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/WordBreakTest.txt
+
+benchmark/case.o: benchmark/case.c Makefile config.mk gen/word-test.h grapheme.h benchmark/util.h
+benchmark/character.o: benchmark/character.c Makefile config.mk gen/character-test.h grapheme.h benchmark/util.h
+benchmark/line.o: benchmark/line.c Makefile config.mk gen/line-test.h grapheme.h benchmark/util.h
+benchmark/utf8-decode.o: benchmark/utf8-decode.c Makefile config.mk gen/character-test.h grapheme.h benchmark/util.h
+benchmark/sentence.o: benchmark/sentence.c Makefile config.mk gen/sentence-test.h grapheme.h benchmark/util.h
+benchmark/util.o: benchmark/util.c Makefile config.mk benchmark/util.h
+benchmark/word.o: benchmark/word.c Makefile config.mk gen/word-test.h grapheme.h benchmark/util.h
+gen/bidirectional.o: gen/bidirectional.c Makefile config.mk gen/util.h
+gen/case.o: gen/case.c Makefile config.mk gen/util.h
+gen/character.o: gen/character.c Makefile config.mk gen/util.h
+gen/character-test.o: gen/character-test.c Makefile config.mk gen/util.h
+gen/line.o: gen/line.c Makefile config.mk gen/util.h
+gen/line-test.o: gen/line-test.c Makefile config.mk gen/util.h
+gen/sentence.o: gen/sentence.c Makefile config.mk gen/util.h
+gen/sentence-test.o: gen/sentence-test.c Makefile config.mk gen/util.h
+gen/word.o: gen/word.c Makefile config.mk gen/util.h
+gen/word-test.o: gen/word-test.c Makefile config.mk gen/util.h
+gen/util.o: gen/util.c Makefile config.mk gen/util.h
+src/bidirectional.o: src/bidirectional.c Makefile config.mk gen/bidirectional.h grapheme.h src/util.h
+src/case.o: src/case.c Makefile config.mk gen/case.h grapheme.h src/util.h
+src/character.o: src/character.c Makefile config.mk gen/character.h grapheme.h src/util.h
+src/line.o: src/line.c Makefile config.mk gen/line.h grapheme.h src/util.h
+src/sentence.o: src/sentence.c Makefile config.mk gen/sentence.h grapheme.h src/util.h
+src/utf8.o: src/utf8.c Makefile config.mk grapheme.h
+src/util.o: src/util.c Makefile config.mk gen/types.h grapheme.h src/util.h
+src/word.o: src/word.c Makefile config.mk gen/word.h grapheme.h src/util.h
+test/case.o: test/case.c Makefile config.mk grapheme.h test/util.h
+test/character.o: test/character.c Makefile config.mk gen/character-test.h grapheme.h test/util.h
+test/line.o: test/line.c Makefile config.mk gen/line-test.h grapheme.h test/util.h
+test/sentence.o: test/sentence.c Makefile config.mk gen/sentence-test.h grapheme.h test/util.h
+test/utf8-encode.o: test/utf8-encode.c Makefile config.mk grapheme.h test/util.h
+test/utf8-decode.o: test/utf8-decode.c Makefile config.mk grapheme.h test/util.h
+test/util.o: test/util.c Makefile config.mk test/util.h
+test/word.o: test/word.c Makefile config.mk gen/word-test.h grapheme.h test/util.h
benchmark/case: benchmark/case.o benchmark/util.o libgrapheme.a
benchmark/character: benchmark/character.o benchmark/util.o libgrapheme.a
@@ -118,6 +200,7 @@ gen/sentence: gen/sentence.o gen/util.o
gen/sentence-test: gen/sentence-test.o gen/util.o
gen/word: gen/word.o gen/util.o
gen/word-test: gen/word-test.o gen/util.o
+test/case: test/case.o test/util.o libgrapheme.a
test/character: test/character.o test/util.o libgrapheme.a
test/line: test/line.o test/util.o libgrapheme.a
test/sentence: test/sentence.o test/util.o libgrapheme.a
@@ -136,71 +219,65 @@ gen/sentence-test.h: data/SentenceBreakTest.txt gen/sentence-test
gen/word.h: data/WordBreakProperty.txt gen/word
gen/word-test.h: data/WordBreakTest.txt gen/word-test
-data/DerivedBidiClass.txt:
- wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/extracted/DerivedBidiClass.txt
-
-data/DerivedCoreProperties.txt:
- wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/DerivedCoreProperties.txt
-
-data/EastAsianWidth.txt:
- wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/EastAsianWidth.txt
-
-data/emoji-data.txt:
- wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
-
-data/GraphemeBreakProperty.txt:
- wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
-
-data/GraphemeBreakTest.txt:
- wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakTest.txt
-
-data/LICENSE:
- wget -O $@ https://www.unicode.org/license.txt
-
-data/LineBreak.txt:
- wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/LineBreak.txt
-
-data/LineBreakTest.txt:
- wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/LineBreakTest.txt
-
-data/SentenceBreakProperty.txt:
- wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/SentenceBreakProperty.txt
-
-data/SentenceBreakTest.txt:
- wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/SentenceBreakTest.txt
-
-data/SpecialCasing.txt:
- wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/SpecialCasing.txt
-
-data/UnicodeData.txt:
- wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt
-
-data/WordBreakProperty.txt:
- wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/WordBreakProperty.txt
-
-data/WordBreakTest.txt:
- wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/WordBreakTest.txt
+man/grapheme_is_character_break.3: man/grapheme_is_character_break.sh Makefile config.mk
+man/grapheme_is_uppercase.3: man/grapheme_is_uppercase.sh man/template/is_case.sh Makefile config.mk
+man/grapheme_is_uppercase_utf8.3: man/grapheme_is_uppercase_utf8.sh man/template/is_case.sh Makefile config.mk
+man/grapheme_is_lowercase.3: man/grapheme_is_lowercase.sh man/template/is_case.sh Makefile config.mk
+man/grapheme_is_lowercase_utf8.3: man/grapheme_is_lowercase_utf8.sh man/template/is_case.sh Makefile config.mk
+man/grapheme_is_titlecase.3: man/grapheme_is_titlecase.sh man/template/is_case.sh Makefile config.mk
+man/grapheme_is_titlecase_utf8.3: man/grapheme_is_titlecase_utf8.sh man/template/is_case.sh Makefile config.mk
+man/grapheme_next_character_break.3: man/grapheme_next_character_break.sh man/template/next_break.sh Makefile config.mk
+man/grapheme_next_line_break.3: man/grapheme_next_line_break.sh man/template/next_break.sh Makefile config.mk
+man/grapheme_next_sentence_break.3: man/grapheme_next_sentence_break.sh man/template/next_break.sh Makefile config.mk
+man/grapheme_next_word_break.3: man/grapheme_next_word_break.sh man/template/next_break.sh Makefile config.mk
+man/grapheme_next_character_break_utf8.3: man/grapheme_next_character_break_utf8.sh man/template/next_break.sh Makefile config.mk
+man/grapheme_next_line_break_utf8.3: man/grapheme_next_line_break_utf8.sh man/template/next_break.sh Makefile config.mk
+man/grapheme_next_sentence_break_utf8.3: man/grapheme_next_sentence_break_utf8.sh man/template/next_break.sh Makefile config.mk
+man/grapheme_next_word_break_utf8.3: man/grapheme_next_word_break_utf8.sh man/template/next_break.sh Makefile config.mk
+man/grapheme_to_uppercase.3: man/grapheme_to_uppercase.sh man/template/to_case.sh Makefile config.mk
+man/grapheme_to_lowercase.3: man/grapheme_to_lowercase.sh man/template/to_case.sh Makefile config.mk
+man/grapheme_to_titlecase.3: man/grapheme_to_titlecase.sh man/template/to_case.sh Makefile config.mk
+man/grapheme_to_uppercase_utf8.3: man/grapheme_to_uppercase_utf8.sh man/template/to_case.sh Makefile config.mk
+man/grapheme_to_lowercase_utf8.3: man/grapheme_to_lowercase_utf8.sh man/template/to_case.sh Makefile config.mk
+man/grapheme_to_titlecase_utf8.3: man/grapheme_to_titlecase_utf8.sh man/template/to_case.sh Makefile config.mk
+man/grapheme_decode_utf8.3: man/grapheme_decode_utf8.sh Makefile config.mk
+man/grapheme_encode_utf8.3: man/grapheme_encode_utf8.sh Makefile config.mk
+
+man/libgrapheme.7: man/libgrapheme.sh Makefile config.mk
+
+$(GEN:=.o) gen/util.o:
+ $(BUILD_CC) -c -o $@ $(BUILD_CPPFLAGS) $(BUILD_CFLAGS) $(@:.o=.c)
+
+$(BENCHMARK:=.o) benchmark/util.o $(TEST:=.o) test/util.o:
+ $(CC) -c -o $@ $(CPPFLAGS) $(CFLAGS) $(@:.o=.c)
+
+$(SRC:=.o):
+ $(CC) -c -o $@ $(CPPFLAGS) $(CFLAGS) $(SHFLAGS) $(@:.o=.c)
$(BENCHMARK):
$(CC) -o $@ $(LDFLAGS) $@.o benchmark/util.o libgrapheme.a -lutf8proc
$(GEN):
- $(CC) -o $@ $(LDFLAGS) $@.o gen/util.o
-
-$(GEN:=.h):
- $(@:.h=) > $@
+ $(BUILD_CC) -o $@ $(BUILD_LDFLAGS) $@.o gen/util.o
$(TEST):
$(CC) -o $@ $(LDFLAGS) $@.o test/util.o libgrapheme.a
-.c.o:
- $(CC) -c -o $@ $(CPPFLAGS) $(CFLAGS) $<
+$(GEN:=.h):
+ $(@:.h=) > $@
libgrapheme.a: $(SRC:=.o)
- $(AR) -rcs $@ $?
+ $(AR) -rc $@ $?
+ $(RANLIB) $@
+
+$(SONAME): $(SRC:=.o)
+ $(CC) -o $@ $(SOFLAGS) $(LDFLAGS) $(SRC:=.o)
-libgrapheme.so: $(SRC:=.o)
- $(CC) -o $@ -shared $(SRC:=.o)
+$(MAN3:=.3):
+ SH="$(SH)" MAN_DATE="$(MAN_DATE)" UNICODE_VERSION="$(UNICODE_VERSION)" $(SH) $(@:.3=.sh) > $@
+
+$(MAN7:=.7):
+ SH="$(SH)" MAN_DATE="$(MAN_DATE)" UNICODE_VERSION="$(UNICODE_VERSION)" $(SH) $(@:.7=.sh) > $@
benchmark: $(BENCHMARK)
for m in $(BENCHMARK); do ./$$m; done
@@ -213,39 +290,43 @@ install: all
mkdir -p "$(DESTDIR)$(INCPREFIX)"
mkdir -p "$(DESTDIR)$(MANPREFIX)/man3"
mkdir -p "$(DESTDIR)$(MANPREFIX)/man7"
- cp -f $(MAN3) "$(DESTDIR)$(MANPREFIX)/man3"
- cp -f $(MAN7) "$(DESTDIR)$(MANPREFIX)/man7"
+ cp -f $(MAN3:=.3) "$(DESTDIR)$(MANPREFIX)/man3"
+ cp -f $(MAN7:=.7) "$(DESTDIR)$(MANPREFIX)/man7"
cp -f libgrapheme.a "$(DESTDIR)$(LIBPREFIX)"
- cp -f libgrapheme.so "$(DESTDIR)$(LIBPREFIX)"
+ cp -f $(SONAME) "$(DESTDIR)$(LIBPREFIX)/$(SONAME)"
+ if [ "$(SOSYMLINK)" = "true" ]; then i=0; while [ "$$i" -le $(VERSION_MINOR) ]; do ln -sf "$(SONAME)" "$(DESTDIR)$(LIBPREFIX)/libgrapheme.so.$(VERSION_MAJOR).$$i"; i=$$((i+1)); done; fi
+ if [ "$(SOSYMLINK)" = "true" ]; then ln -sf "$(SONAME)" "$(DESTDIR)$(LIBPREFIX)/libgrapheme.so.$(VERSION_MAJOR)"; fi
+ if [ "$(SOSYMLINK)" = "true" ]; then ln -sf "$(SONAME)" "$(DESTDIR)$(LIBPREFIX)/libgrapheme.so"; fi
cp -f grapheme.h "$(DESTDIR)$(INCPREFIX)"
- ldconfig || true
+ if ! [ -z "$(LDCONFIG)" ]; then $(SHELL) -c "$(LDCONFIG)"; fi
uninstall:
- for m in $(MAN3); do rm -f "$(DESTDIR)$(MANPREFIX)/man3/`basename $$m`"; done
- for m in $(MAN7); do rm -f "$(DESTDIR)$(MANPREFIX)/man7/`basename $$m`"; done
+ for m in $(MAN3:=.3); do rm -f "$(DESTDIR)$(MANPREFIX)/man3/`basename $$m`"; done
+ for m in $(MAN7:=.7); do rm -f "$(DESTDIR)$(MANPREFIX)/man7/`basename $$m`"; done
rm -f "$(DESTDIR)$(LIBPREFIX)/libgrapheme.a"
- rm -f "$(DESTDIR)$(LIBPREFIX)/libgrapheme.so"
+ rm -f "$(DESTDIR)$(LIBPREFIX)/$(SONAME)"
+ if [ "$(SOSYMLINK)" = "true" ]; then i=0; while [ "$$i" -le $(VERSION_MINOR) ]; do rm -f "$(DESTDIR)$(LIBPREFIX)/libgrapheme.so.$(VERSION_MAJOR).$$i"; i=$$((i+1)); done; fi
+ if [ "$(SOSYMLINK)" = "true" ]; then rm -f "$(DESTDIR)$(LIBPREFIX)/libgrapheme.so.$(VERSION_MAJOR)"; fi
+ if [ "$(SOSYMLINK)" = "true" ]; then rm -f "$(DESTDIR)$(LIBPREFIX)/libgrapheme.so"; fi
rm -f "$(DESTDIR)$(INCPREFIX)/grapheme.h"
- ldconfig || true
+ if ! [ -z "$(LDCONFIG)" ]; then $(SHELL) -c "$(LDCONFIG)"; fi
clean:
- rm -f $(BENCHMARK:=.o) benchmark/util.o $(BENCHMARK) $(GEN:=.h) $(GEN:=.o) gen/util.o $(GEN) $(SRC:=.o) src/util.o $(TEST:=.o) test/util.o $(TEST) libgrapheme.a libgrapheme.so
+ rm -f $(BENCHMARK:=.o) benchmark/util.o $(BENCHMARK) $(GEN:=.h) $(GEN:=.o) gen/util.o $(GEN) $(SRC:=.o) src/util.o $(TEST:=.o) test/util.o $(TEST) libgrapheme.a $(SONAME) $(MAN3:=.3) $(MAN7:=.7)
clean-data:
rm -f $(DATA)
-print:
- @echo $(PREFIX)
-
dist:
rm -rf "libgrapheme-$(VERSION)"
mkdir "libgrapheme-$(VERSION)"
- for m in benchmark data gen man src test; do mkdir "libgrapheme-$(VERSION)/$$m"; done
+ for m in benchmark data gen man man/template src test; do mkdir "libgrapheme-$(VERSION)/$$m"; done
cp config.mk grapheme.h LICENSE Makefile README "libgrapheme-$(VERSION)"
cp $(BENCHMARK:=.c) benchmark/util.c benchmark/util.h "libgrapheme-$(VERSION)/benchmark"
cp $(DATA) "libgrapheme-$(VERSION)/data"
cp $(GEN:=.c) gen/util.c gen/types.h gen/util.h "libgrapheme-$(VERSION)/gen"
- cp $(MAN3) $(MAN7) "libgrapheme-$(VERSION)/man"
+ cp $(MAN3:=.sh) $(MAN7:=.sh) "libgrapheme-$(VERSION)/man"
+ cp $(MAN_TEMPLATE) "libgrapheme-$(VERSION)/man/template"
cp $(SRC:=.c) src/util.h "libgrapheme-$(VERSION)/src"
cp $(TEST:=.c) test/util.c test/util.h "libgrapheme-$(VERSION)/test"
tar -cf - "libgrapheme-$(VERSION)" | gzip -c > "libgrapheme-$(VERSION).tar.gz"
diff --git a/README b/README
@@ -1,25 +1,34 @@
libgrapheme
===========
-The libgrapheme library provides functions to properly handle Unicode
-strings according to the Unicode specification. Unicode strings are made
-up of user-perceived characters (so-called "grapheme clusters") that are
-made up of one or more Unicode codepoints, which in turn are encoded in
-one or more bytes in an encoding like UTF-8.
-
-There is a widespread misconception that it was enough to simply
-determine codepoints in a string and treat them as user-perceived
-characters to be Unicode compliant. While this may work in some cases,
-this assumption quickly breaks, especially for non-Western languages and
-decomposed Unicode strings where user-perceived characters are usually
-represented using multiple codepoints.
-
-Despite the complicated multilevel structure of Unicode strings,
-libgrapheme provides methods to work with them at the byte-level (i.e.
-UTF-8 ‘char’ arrays) while also providing codepoint-level methods.
-
-See libgrapheme(7) to get started and try out the self-contained examples
-given on the manual pages for each function.
+libgrapheme is an extremely simple freestanding C99 library providing
+utilities for properly handling strings according to the latest Unicode
+standard 15.0.0. It offers fully Unicode compliant
+
+ - grapheme cluster (i.e. user-perceived character) segmentation
+ - word segmentation
+ - sentence segmentation
+ - detection of permissible line break opportunities
+ - case detection (lower-, upper- and title-case)
+ - case conversion (to lower-, upper- and title-case)
+
+on UTF-8 strings and codepoint arrays, both of which can also be
+null-terminated.
+
+The necessary lookup-tables are automatically generated from the Unicode
+standard data (contained in the tarball) and heavily compressed. Over
+10,000 automatically generated conformance tests and over 150 unit tests
+ensure conformance and correctness.
+
+There is no complicated build-system involved and it's all done using one
+POSIX-compliant Makefile. All you need is a C99 compiler, given the
+lookup-table-generators and compressors are also written in C99. The
+resulting library is freestanding and thus not even dependent on a
+standard library to be present at runtime, making it a suitable choice
+for bare metal applications.
+
+It is also way smaller and much faster than the other established
+Unicode string libraries (ICU, GNU's libunistring, libutf8proc).
Requirements
------------
@@ -27,8 +36,9 @@ A C99-compiler and POSIX make.
Installation
------------
-Edit config.mk to match your local setup (usually not necessary, the
-default prefix is /usr/local).
+Run ./configure, which automatically edits config.mk to match your local
+setup. Edit config.mk by hand if necessary or desired for further
+customization.
Afterwards enter the following command to build and install libgrapheme
(if necessary as root):
@@ -37,16 +47,12 @@ Afterwards enter the following command to build and install libgrapheme
Conformance
-----------
-The libgrapheme library is compliant with the Unicode 14.0.0
-specification (September 2021).
-
-To ensure conformance, libgrapheme includes hundreds of tests including
-all provided with the standard-provided test-data that is parsed
-automatically. The tests can be run with
+The libgrapheme library is compliant with the Unicode 15.0.0
+specification (September 2022). The tests can be run with
make test
-to check standard conformance.
+to check standard conformance and correctness.
Usage
-----
diff --git a/benchmark/character.c b/benchmark/character.c
@@ -23,7 +23,7 @@ struct break_benchmark_payload {
void
libgrapheme(const void *payload)
{
- GRAPHEME_STATE state = { 0 };
+ uint_least16_t state = 0;
const struct break_benchmark_payload *p = payload;
size_t i;
@@ -80,6 +80,7 @@ main(int argc, char *argv[])
&baseline, NUM_ITERATIONS, p.buflen - 1);
free(p.buf);
+ free(p.buf_utf8proc);
return 0;
}
diff --git a/config.mk b/config.mk
@@ -1,7 +1,4 @@
-# libgrapheme version
-VERSION = 1
-
-# Customize below to fit your system
+# Customize below to fit your system (run ./configure for automatic presets)
# paths
PREFIX = /usr/local
@@ -11,9 +8,23 @@ MANPREFIX = $(PREFIX)/share/man
# flags
CPPFLAGS = -D_DEFAULT_SOURCE
-CFLAGS = -std=c99 -Os -fPIC -Wall -Wextra -Wpedantic
+CFLAGS = -std=c99 -Os -Wall -Wextra -Wpedantic
LDFLAGS = -s
+BUILD_CPPFLAGS = $(CPPFLAGS)
+BUILD_CFLAGS = $(CFLAGS)
+BUILD_LDFLAGS = $(LDFLAGS)
+
+SHFLAGS = -fPIC -ffreestanding
+
+SOFLAGS = -shared -nostdlib -Wl,--soname=libgrapheme.so.$(VERSION_MAJOR).$(VERSION_MINOR)
+SONAME = libgrapheme.so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)
+SOSYMLINK = true
+
# tools
-CC = cc
-AR = ar
+CC = cc
+BUILD_CC = $(CC)
+AR = ar
+RANLIB = ranlib
+LDCONFIG = ldconfig # unset to not call ldconfig(1) after install/uninstall
+SH = sh
diff --git a/configure b/configure
@@ -0,0 +1,39 @@
+#!/bin/sh
+# See LICENSE file for copyright and license details.
+
+replace_line()
+{
+ VAR=$1
+ ALIGNMENT=$2
+ VALUE=$3
+ awk "/^${VAR}[ ]*=/ { print \"${VAR}${ALIGNMENT} = ${VALUE}\"; next }; { print; }" config.mk > config.mk.tmp
+ mv config.mk.tmp config.mk
+}
+
+case $(uname) in
+ DragonFly|FreeBSD|Linux|NetBSD)
+ # the default
+ replace_line 'SOFLAGS' ' ' '-shared -nostdlib -Wl,--soname=libgrapheme.so.$(VERSION_MAJOR).$(VERSION_MINOR)'
+ replace_line 'SONAME' ' ' 'libgrapheme.so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)'
+ replace_line 'SOSYMLINK' '' 'true'
+ replace_line 'LDCONFIG' '' 'ldconfig \# unset to not call ldconfig(1) after install/uninstall'
+ ;;
+ OpenBSD)
+ replace_line 'SOFLAGS' ' ' '-shared -nostdlib'
+ replace_line 'SONAME' ' ' 'libgrapheme.so.$(VERSION_MAJOR).$(VERSION_MINOR)'
+ replace_line 'SOSYMLINK' '' 'false'
+ replace_line 'LDCONFIG' '' ''
+ ;;
+ Darwin)
+ replace_line 'SOFLAGS' ' ' '-dynamiclib -install_name libgrapheme.$(VERSION_MAJOR).dylib -current_version $(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH) -compatibility_version $(VERSION_MAJOR).$(VERSION_MINOR).0'
+ replace_line 'SONAME' ' ' 'libgrapheme.$(VERSION_MAJOR).dylib'
+ replace_line 'SOSYMLINK' '' 'false'
+ replace_line 'LDCONFIG' '' ''
+ ;;
+ *)
+ echo "Your system does not have a preset. Edit config.mk and send a patch please! :)"
+ exit 1
+ ;;
+esac
+
+exit 0
diff --git a/data/DerivedCoreProperties.txt b/data/DerivedCoreProperties.txt
@@ -1,11 +1,11 @@
-# DerivedCoreProperties-14.0.0.txt
-# Date: 2021-08-12, 23:12:53 GMT
-# © 2021 Unicode®, Inc.
+# DerivedCoreProperties-15.0.0.txt
+# Date: 2022-08-05, 22:17:05 GMT
+# © 2022 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
-# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For terms of use, see https://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
-# For documentation, see http://www.unicode.org/reports/tr44/
+# For documentation, see https://www.unicode.org/reports/tr44/
# ================================================
@@ -462,6 +462,7 @@ FFE9..FFEC ; Math # Sm [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS A
0BD7 ; Alphabetic # Mc TAMIL AU LENGTH MARK
0C00 ; Alphabetic # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE
0C01..0C03 ; Alphabetic # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA
+0C04 ; Alphabetic # Mn TELUGU SIGN COMBINING ANUSVARA ABOVE
0C05..0C0C ; Alphabetic # Lo [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L
0C0E..0C10 ; Alphabetic # Lo [3] TELUGU LETTER E..TELUGU LETTER AI
0C12..0C28 ; Alphabetic # Lo [23] TELUGU LETTER O..TELUGU LETTER NA
@@ -497,6 +498,7 @@ FFE9..FFEC ; Math # Sm [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS A
0CE0..0CE1 ; Alphabetic # Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL
0CE2..0CE3 ; Alphabetic # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
0CF1..0CF2 ; Alphabetic # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
+0CF3 ; Alphabetic # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
0D00..0D01 ; Alphabetic # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
0D02..0D03 ; Alphabetic # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D04..0D0C ; Alphabetic # Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
@@ -552,7 +554,7 @@ FFE9..FFEC ; Math # Sm [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS A
0F49..0F6C ; Alphabetic # Lo [36] TIBETAN LETTER NYA..TIBETAN LETTER RRA
0F71..0F7E ; Alphabetic # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO
0F7F ; Alphabetic # Mc TIBETAN SIGN RNAM BCAD
-0F80..0F81 ; Alphabetic # Mn [2] TIBETAN VOWEL SIGN REVERSED I..TIBETAN VOWEL SIGN REVERSED II
+0F80..0F83 ; Alphabetic # Mn [4] TIBETAN VOWEL SIGN REVERSED I..TIBETAN SIGN SNA LDAN
0F88..0F8C ; Alphabetic # Lo [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN
0F8D..0F97 ; Alphabetic # Mn [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA
0F99..0FBC ; Alphabetic # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA
@@ -1053,6 +1055,7 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG
11071..11072 ; Alphabetic # Lo [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O
11073..11074 ; Alphabetic # Mn [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O
11075 ; Alphabetic # Lo BRAHMI LETTER OLD TAMIL LLA
+11080..11081 ; Alphabetic # Mn [2] KAITHI SIGN CANDRABINDU..KAITHI SIGN ANUSVARA
11082 ; Alphabetic # Mc KAITHI SIGN VISARGA
11083..110AF ; Alphabetic # Lo [45] KAITHI LETTER A..KAITHI LETTER HA
110B0..110B2 ; Alphabetic # Mc [3] KAITHI VOWEL SIGN AA..KAITHI VOWEL SIGN II
@@ -1089,6 +1092,8 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG
11234 ; Alphabetic # Mn KHOJKI SIGN ANUSVARA
11237 ; Alphabetic # Mn KHOJKI SIGN SHADDA
1123E ; Alphabetic # Mn KHOJKI SIGN SUKUN
+1123F..11240 ; Alphabetic # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
+11241 ; Alphabetic # Mn KHOJKI VOWEL SIGN VOCALIC R
11280..11286 ; Alphabetic # Lo [7] MULTANI LETTER A..MULTANI LETTER GA
11288 ; Alphabetic # Lo MULTANI LETTER GHA
1128A..1128D ; Alphabetic # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA
@@ -1243,12 +1248,22 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG
11EE0..11EF2 ; Alphabetic # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA
11EF3..11EF4 ; Alphabetic # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
11EF5..11EF6 ; Alphabetic # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
+11F00..11F01 ; Alphabetic # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
+11F02 ; Alphabetic # Lo KAWI SIGN REPHA
+11F03 ; Alphabetic # Mc KAWI SIGN VISARGA
+11F04..11F10 ; Alphabetic # Lo [13] KAWI LETTER A..KAWI LETTER O
+11F12..11F33 ; Alphabetic # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA
+11F34..11F35 ; Alphabetic # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
+11F36..11F3A ; Alphabetic # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
+11F3E..11F3F ; Alphabetic # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
+11F40 ; Alphabetic # Mn KAWI VOWEL SIGN EU
11FB0 ; Alphabetic # Lo LISU LETTER YHA
12000..12399 ; Alphabetic # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
12400..1246E ; Alphabetic # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
12480..12543 ; Alphabetic # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
12F90..12FF0 ; Alphabetic # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
-13000..1342E ; Alphabetic # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
+13000..1342F ; Alphabetic # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
+13441..13446 ; Alphabetic # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
14400..14646 ; Alphabetic # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
16800..16A38 ; Alphabetic # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
16A40..16A5E ; Alphabetic # Lo [31] MRO LETTER TA..MRO LETTER TEK
@@ -1275,7 +1290,9 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG
1AFF5..1AFFB ; Alphabetic # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
1AFFD..1AFFE ; Alphabetic # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
1B000..1B122 ; Alphabetic # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
+1B132 ; Alphabetic # Lo HIRAGANA LETTER SMALL KO
1B150..1B152 ; Alphabetic # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
+1B155 ; Alphabetic # Lo KATAKANA LETTER SMALL KO
1B164..1B167 ; Alphabetic # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
1B170..1B2FB ; Alphabetic # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
1BC00..1BC6A ; Alphabetic # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
@@ -1316,16 +1333,21 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG
1DF00..1DF09 ; Alphabetic # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
1DF0A ; Alphabetic # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
1DF0B..1DF1E ; Alphabetic # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
+1DF25..1DF2A ; Alphabetic # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
1E000..1E006 ; Alphabetic # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
1E008..1E018 ; Alphabetic # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
1E01B..1E021 ; Alphabetic # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
1E023..1E024 ; Alphabetic # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
1E026..1E02A ; Alphabetic # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
+1E030..1E06D ; Alphabetic # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
+1E08F ; Alphabetic # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
1E100..1E12C ; Alphabetic # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
1E137..1E13D ; Alphabetic # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
1E14E ; Alphabetic # Lo NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
1E290..1E2AD ; Alphabetic # Lo [30] TOTO LETTER PA..TOTO LETTER A
1E2C0..1E2EB ; Alphabetic # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH
+1E4D0..1E4EA ; Alphabetic # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
+1E4EB ; Alphabetic # Lm NAG MUNDARI SIGN OJOD
1E7E0..1E7E6 ; Alphabetic # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
1E7E8..1E7EB ; Alphabetic # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
1E7ED..1E7EE ; Alphabetic # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
@@ -1371,14 +1393,15 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG
1F150..1F169 ; Alphabetic # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
1F170..1F189 ; Alphabetic # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z
20000..2A6DF ; Alphabetic # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
-2A700..2B738 ; Alphabetic # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
+2A700..2B739 ; Alphabetic # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
2B740..2B81D ; Alphabetic # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2B820..2CEA1 ; Alphabetic # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
2CEB0..2EBE0 ; Alphabetic # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
2F800..2FA1D ; Alphabetic # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
30000..3134A ; Alphabetic # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
+31350..323AF ; Alphabetic # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
-# Total code points: 133396
+# Total code points: 137765
# ================================================
@@ -1663,6 +1686,7 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG
052F ; Lowercase # L& CYRILLIC SMALL LETTER EL WITH DESCENDER
0560..0588 ; Lowercase # L& [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE
10D0..10FA ; Lowercase # L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN
+10FC ; Lowercase # Lm MODIFIER LETTER GEORGIAN NAR
10FD..10FF ; Lowercase # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN
13F8..13FD ; Lowercase # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV
1C80..1C88 ; Lowercase # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK
@@ -2012,12 +2036,14 @@ A7D3 ; Lowercase # L& LATIN SMALL LETTER DOUBLE THORN
A7D5 ; Lowercase # L& LATIN SMALL LETTER DOUBLE WYNN
A7D7 ; Lowercase # L& LATIN SMALL LETTER MIDDLE SCOTS S
A7D9 ; Lowercase # L& LATIN SMALL LETTER SIGMOID S
+A7F2..A7F4 ; Lowercase # Lm [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
A7F6 ; Lowercase # L& LATIN SMALL LETTER REVERSED HALF H
A7F8..A7F9 ; Lowercase # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
A7FA ; Lowercase # L& LATIN LETTER SMALL CAPITAL TURNED M
AB30..AB5A ; Lowercase # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
AB5C..AB5F ; Lowercase # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
AB60..AB68 ; Lowercase # L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE
+AB69 ; Lowercase # Lm MODIFIER LETTER SMALL TURNED W
AB70..ABBF ; Lowercase # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA
FB00..FB06 ; Lowercase # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST
FB13..FB17 ; Lowercase # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH
@@ -2065,9 +2091,11 @@ FF41..FF5A ; Lowercase # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH L
1D7CB ; Lowercase # L& MATHEMATICAL BOLD SMALL DIGAMMA
1DF00..1DF09 ; Lowercase # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
1DF0B..1DF1E ; Lowercase # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
+1DF25..1DF2A ; Lowercase # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
+1E030..1E06D ; Lowercase # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
1E922..1E943 ; Lowercase # L& [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA
-# Total code points: 2471
+# Total code points: 2544
# ================================================
@@ -2767,6 +2795,7 @@ FF21..FF3A ; Uppercase # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH
10C7 ; Cased # L& GEORGIAN CAPITAL LETTER YN
10CD ; Cased # L& GEORGIAN CAPITAL LETTER AEN
10D0..10FA ; Cased # L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN
+10FC ; Cased # Lm MODIFIER LETTER GEORGIAN NAR
10FD..10FF ; Cased # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN
13A0..13F5 ; Cased # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV
13F8..13FD ; Cased # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV
@@ -2837,12 +2866,14 @@ A790..A7CA ; Cased # L& [59] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SM
A7D0..A7D1 ; Cased # L& [2] LATIN CAPITAL LETTER CLOSED INSULAR G..LATIN SMALL LETTER CLOSED INSULAR G
A7D3 ; Cased # L& LATIN SMALL LETTER DOUBLE THORN
A7D5..A7D9 ; Cased # L& [5] LATIN SMALL LETTER DOUBLE WYNN..LATIN SMALL LETTER SIGMOID S
+A7F2..A7F4 ; Cased # Lm [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
A7F5..A7F6 ; Cased # L& [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H
A7F8..A7F9 ; Cased # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
A7FA ; Cased # L& LATIN LETTER SMALL CAPITAL TURNED M
AB30..AB5A ; Cased # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
AB5C..AB5F ; Cased # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
AB60..AB68 ; Cased # L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE
+AB69 ; Cased # Lm MODIFIER LETTER SMALL TURNED W
AB70..ABBF ; Cased # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA
FB00..FB06 ; Cased # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST
FB13..FB17 ; Cased # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH
@@ -2899,12 +2930,14 @@ FF41..FF5A ; Cased # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN
1D7C4..1D7CB ; Cased # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA
1DF00..1DF09 ; Cased # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
1DF0B..1DF1E ; Cased # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
+1DF25..1DF2A ; Cased # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
+1E030..1E06D ; Cased # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
1E900..1E943 ; Cased # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA
1F130..1F149 ; Cased # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z
1F150..1F169 ; Cased # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
1F170..1F189 ; Cased # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z
-# Total code points: 4453
+# Total code points: 4526
# ================================================
@@ -3054,7 +3087,7 @@ FF41..FF5A ; Cased # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN
0EB1 ; Case_Ignorable # Mn LAO VOWEL SIGN MAI KAN
0EB4..0EBC ; Case_Ignorable # Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO
0EC6 ; Case_Ignorable # Lm LAO KO LA
-0EC8..0ECD ; Case_Ignorable # Mn [6] LAO TONE MAI EK..LAO NIGGAHITA
+0EC8..0ECE ; Case_Ignorable # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN
0F18..0F19 ; Case_Ignorable # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
0F35 ; Case_Ignorable # Mn TIBETAN MARK NGAS BZUNG NYI ZLA
0F37 ; Case_Ignorable # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS
@@ -3263,6 +3296,7 @@ FFF9..FFFB ; Case_Ignorable # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLI
10AE5..10AE6 ; Case_Ignorable # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
10D24..10D27 ; Case_Ignorable # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
10EAB..10EAC ; Case_Ignorable # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
+10EFD..10EFF ; Case_Ignorable # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
10F46..10F50 ; Case_Ignorable # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
10F82..10F85 ; Case_Ignorable # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
11001 ; Case_Ignorable # Mn BRAHMI SIGN ANUSVARA
@@ -3287,6 +3321,7 @@ FFF9..FFFB ; Case_Ignorable # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLI
11234 ; Case_Ignorable # Mn KHOJKI SIGN ANUSVARA
11236..11237 ; Case_Ignorable # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
1123E ; Case_Ignorable # Mn KHOJKI SIGN SUKUN
+11241 ; Case_Ignorable # Mn KHOJKI VOWEL SIGN VOCALIC R
112DF ; Case_Ignorable # Mn KHUDAWADI SIGN ANUSVARA
112E3..112EA ; Case_Ignorable # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
11300..11301 ; Case_Ignorable # Mn [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU
@@ -3348,7 +3383,13 @@ FFF9..FFFB ; Case_Ignorable # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLI
11D95 ; Case_Ignorable # Mn GUNJALA GONDI SIGN ANUSVARA
11D97 ; Case_Ignorable # Mn GUNJALA GONDI VIRAMA
11EF3..11EF4 ; Case_Ignorable # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
-13430..13438 ; Case_Ignorable # Cf [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT
+11F00..11F01 ; Case_Ignorable # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
+11F36..11F3A ; Case_Ignorable # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
+11F40 ; Case_Ignorable # Mn KAWI VOWEL SIGN EU
+11F42 ; Case_Ignorable # Mn KAWI CONJOINER
+13430..1343F ; Case_Ignorable # Cf [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
+13440 ; Case_Ignorable # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
+13447..13455 ; Case_Ignorable # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
16AF0..16AF4 ; Case_Ignorable # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
16B30..16B36 ; Case_Ignorable # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
16B40..16B43 ; Case_Ignorable # Lm [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM
@@ -3382,10 +3423,14 @@ FFF9..FFFB ; Case_Ignorable # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLI
1E01B..1E021 ; Case_Ignorable # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
1E023..1E024 ; Case_Ignorable # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
1E026..1E02A ; Case_Ignorable # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
+1E030..1E06D ; Case_Ignorable # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
+1E08F ; Case_Ignorable # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
1E130..1E136 ; Case_Ignorable # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
1E137..1E13D ; Case_Ignorable # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
1E2AE ; Case_Ignorable # Mn TOTO SIGN RISING TONE
1E2EC..1E2EF ; Case_Ignorable # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
+1E4EB ; Case_Ignorable # Lm NAG MUNDARI SIGN OJOD
+1E4EC..1E4EF ; Case_Ignorable # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
1E8D0..1E8D6 ; Case_Ignorable # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
1E944..1E94A ; Case_Ignorable # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
1E94B ; Case_Ignorable # Lm ADLAM NASALIZATION MARK
@@ -3394,7 +3439,7 @@ E0001 ; Case_Ignorable # Cf LANGUAGE TAG
E0020..E007F ; Case_Ignorable # Cf [96] TAG SPACE..CANCEL TAG
E0100..E01EF ; Case_Ignorable # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 2602
+# Total code points: 2707
# ================================================
@@ -6617,6 +6662,7 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
111DC ; ID_Start # Lo SHARADA HEADSTROKE
11200..11211 ; ID_Start # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA
11213..1122B ; ID_Start # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA
+1123F..11240 ; ID_Start # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
11280..11286 ; ID_Start # Lo [7] MULTANI LETTER A..MULTANI LETTER GA
11288 ; ID_Start # Lo MULTANI LETTER GHA
1128A..1128D ; ID_Start # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA
@@ -6679,12 +6725,16 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
11D6A..11D89 ; ID_Start # Lo [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA
11D98 ; ID_Start # Lo GUNJALA GONDI OM
11EE0..11EF2 ; ID_Start # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA
+11F02 ; ID_Start # Lo KAWI SIGN REPHA
+11F04..11F10 ; ID_Start # Lo [13] KAWI LETTER A..KAWI LETTER O
+11F12..11F33 ; ID_Start # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA
11FB0 ; ID_Start # Lo LISU LETTER YHA
12000..12399 ; ID_Start # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
12400..1246E ; ID_Start # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
12480..12543 ; ID_Start # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
12F90..12FF0 ; ID_Start # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
-13000..1342E ; ID_Start # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
+13000..1342F ; ID_Start # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
+13441..13446 ; ID_Start # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
14400..14646 ; ID_Start # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
16800..16A38 ; ID_Start # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
16A40..16A5E ; ID_Start # Lo [31] MRO LETTER TA..MRO LETTER TEK
@@ -6707,7 +6757,9 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
1AFF5..1AFFB ; ID_Start # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
1AFFD..1AFFE ; ID_Start # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
1B000..1B122 ; ID_Start # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
+1B132 ; ID_Start # Lo HIRAGANA LETTER SMALL KO
1B150..1B152 ; ID_Start # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
+1B155 ; ID_Start # Lo KATAKANA LETTER SMALL KO
1B164..1B167 ; ID_Start # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
1B170..1B2FB ; ID_Start # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
1BC00..1BC6A ; ID_Start # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
@@ -6747,11 +6799,15 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
1DF00..1DF09 ; ID_Start # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
1DF0A ; ID_Start # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
1DF0B..1DF1E ; ID_Start # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
+1DF25..1DF2A ; ID_Start # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
+1E030..1E06D ; ID_Start # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
1E100..1E12C ; ID_Start # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
1E137..1E13D ; ID_Start # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
1E14E ; ID_Start # Lo NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
1E290..1E2AD ; ID_Start # Lo [30] TOTO LETTER PA..TOTO LETTER A
1E2C0..1E2EB ; ID_Start # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH
+1E4D0..1E4EA ; ID_Start # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
+1E4EB ; ID_Start # Lm NAG MUNDARI SIGN OJOD
1E7E0..1E7E6 ; ID_Start # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
1E7E8..1E7EB ; ID_Start # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
1E7ED..1E7EE ; ID_Start # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
@@ -6793,14 +6849,15 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
1EEA5..1EEA9 ; ID_Start # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
1EEAB..1EEBB ; ID_Start # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
20000..2A6DF ; ID_Start # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
-2A700..2B738 ; ID_Start # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
+2A700..2B739 ; ID_Start # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
2B740..2B81D ; ID_Start # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2B820..2CEA1 ; ID_Start # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
2CEB0..2EBE0 ; ID_Start # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
2F800..2FA1D ; ID_Start # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
30000..3134A ; ID_Start # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
+31350..323AF ; ID_Start # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
-# Total code points: 131997
+# Total code points: 136345
# ================================================
@@ -7083,6 +7140,7 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
0CE2..0CE3 ; ID_Continue # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
0CE6..0CEF ; ID_Continue # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
0CF1..0CF2 ; ID_Continue # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
+0CF3 ; ID_Continue # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
0D00..0D01 ; ID_Continue # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
0D02..0D03 ; ID_Continue # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D04..0D0C ; ID_Continue # Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
@@ -7136,7 +7194,7 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
0EBD ; ID_Continue # Lo LAO SEMIVOWEL SIGN NYO
0EC0..0EC4 ; ID_Continue # Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI
0EC6 ; ID_Continue # Lm LAO KO LA
-0EC8..0ECD ; ID_Continue # Mn [6] LAO TONE MAI EK..LAO NIGGAHITA
+0EC8..0ECE ; ID_Continue # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN
0ED0..0ED9 ; ID_Continue # Nd [10] LAO DIGIT ZERO..LAO DIGIT NINE
0EDC..0EDF ; ID_Continue # Lo [4] LAO HO NO..LAO LETTER KHMU NYO
0F00 ; ID_Continue # Lo TIBETAN SYLLABLE OM
@@ -7719,6 +7777,7 @@ FFDA..FFDC ; ID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
10E80..10EA9 ; ID_Continue # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
10EAB..10EAC ; ID_Continue # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
10EB0..10EB1 ; ID_Continue # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EFD..10EFF ; ID_Continue # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
10F00..10F1C ; ID_Continue # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
10F27 ; ID_Continue # Lo OLD SOGDIAN LIGATURE AYIN-DALETH
10F30..10F45 ; ID_Continue # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
@@ -7781,6 +7840,8 @@ FFDA..FFDC ; ID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
11235 ; ID_Continue # Mc KHOJKI SIGN VIRAMA
11236..11237 ; ID_Continue # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
1123E ; ID_Continue # Mn KHOJKI SIGN SUKUN
+1123F..11240 ; ID_Continue # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
+11241 ; ID_Continue # Mn KHOJKI VOWEL SIGN VOCALIC R
11280..11286 ; ID_Continue # Lo [7] MULTANI LETTER A..MULTANI LETTER GA
11288 ; ID_Continue # Lo MULTANI LETTER GHA
1128A..1128D ; ID_Continue # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA
@@ -7963,12 +8024,27 @@ FFDA..FFDC ; ID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
11EE0..11EF2 ; ID_Continue # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA
11EF3..11EF4 ; ID_Continue # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
11EF5..11EF6 ; ID_Continue # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
+11F00..11F01 ; ID_Continue # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
+11F02 ; ID_Continue # Lo KAWI SIGN REPHA
+11F03 ; ID_Continue # Mc KAWI SIGN VISARGA
+11F04..11F10 ; ID_Continue # Lo [13] KAWI LETTER A..KAWI LETTER O
+11F12..11F33 ; ID_Continue # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA
+11F34..11F35 ; ID_Continue # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
+11F36..11F3A ; ID_Continue # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
+11F3E..11F3F ; ID_Continue # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
+11F40 ; ID_Continue # Mn KAWI VOWEL SIGN EU
+11F41 ; ID_Continue # Mc KAWI SIGN KILLER
+11F42 ; ID_Continue # Mn KAWI CONJOINER
+11F50..11F59 ; ID_Continue # Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
11FB0 ; ID_Continue # Lo LISU LETTER YHA
12000..12399 ; ID_Continue # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
12400..1246E ; ID_Continue # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
12480..12543 ; ID_Continue # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
12F90..12FF0 ; ID_Continue # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
-13000..1342E ; ID_Continue # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
+13000..1342F ; ID_Continue # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
+13440 ; ID_Continue # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
+13441..13446 ; ID_Continue # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
+13447..13455 ; ID_Continue # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
14400..14646 ; ID_Continue # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
16800..16A38 ; ID_Continue # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
16A40..16A5E ; ID_Continue # Lo [31] MRO LETTER TA..MRO LETTER TEK
@@ -8001,7 +8077,9 @@ FFDA..FFDC ; ID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
1AFF5..1AFFB ; ID_Continue # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
1AFFD..1AFFE ; ID_Continue # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
1B000..1B122 ; ID_Continue # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
+1B132 ; ID_Continue # Lo HIRAGANA LETTER SMALL KO
1B150..1B152 ; ID_Continue # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
+1B155 ; ID_Continue # Lo KATAKANA LETTER SMALL KO
1B164..1B167 ; ID_Continue # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
1B170..1B2FB ; ID_Continue # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
1BC00..1BC6A ; ID_Continue # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
@@ -8058,11 +8136,14 @@ FFDA..FFDC ; ID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
1DF00..1DF09 ; ID_Continue # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
1DF0A ; ID_Continue # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
1DF0B..1DF1E ; ID_Continue # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
+1DF25..1DF2A ; ID_Continue # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
1E000..1E006 ; ID_Continue # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
1E008..1E018 ; ID_Continue # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
1E01B..1E021 ; ID_Continue # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
1E023..1E024 ; ID_Continue # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
1E026..1E02A ; ID_Continue # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
+1E030..1E06D ; ID_Continue # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
+1E08F ; ID_Continue # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
1E100..1E12C ; ID_Continue # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
1E130..1E136 ; ID_Continue # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
1E137..1E13D ; ID_Continue # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
@@ -8073,6 +8154,10 @@ FFDA..FFDC ; ID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
1E2C0..1E2EB ; ID_Continue # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH
1E2EC..1E2EF ; ID_Continue # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
1E2F0..1E2F9 ; ID_Continue # Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
+1E4D0..1E4EA ; ID_Continue # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
+1E4EB ; ID_Continue # Lm NAG MUNDARI SIGN OJOD
+1E4EC..1E4EF ; ID_Continue # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
+1E4F0..1E4F9 ; ID_Continue # Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
1E7E0..1E7E6 ; ID_Continue # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
1E7E8..1E7EB ; ID_Continue # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
1E7ED..1E7EE ; ID_Continue # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
@@ -8118,15 +8203,16 @@ FFDA..FFDC ; ID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
1EEAB..1EEBB ; ID_Continue # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
1FBF0..1FBF9 ; ID_Continue # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
20000..2A6DF ; ID_Continue # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
-2A700..2B738 ; ID_Continue # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
+2A700..2B739 ; ID_Continue # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
2B740..2B81D ; ID_Continue # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2B820..2CEA1 ; ID_Continue # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
2CEB0..2EBE0 ; ID_Continue # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
2F800..2FA1D ; ID_Continue # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
30000..3134A ; ID_Continue # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
+31350..323AF ; ID_Continue # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
E0100..E01EF ; ID_Continue # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 135072
+# Total code points: 139482
# ================================================
@@ -8685,6 +8771,7 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
111DC ; XID_Start # Lo SHARADA HEADSTROKE
11200..11211 ; XID_Start # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA
11213..1122B ; XID_Start # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA
+1123F..11240 ; XID_Start # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
11280..11286 ; XID_Start # Lo [7] MULTANI LETTER A..MULTANI LETTER GA
11288 ; XID_Start # Lo MULTANI LETTER GHA
1128A..1128D ; XID_Start # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA
@@ -8747,12 +8834,16 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
11D6A..11D89 ; XID_Start # Lo [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA
11D98 ; XID_Start # Lo GUNJALA GONDI OM
11EE0..11EF2 ; XID_Start # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA
+11F02 ; XID_Start # Lo KAWI SIGN REPHA
+11F04..11F10 ; XID_Start # Lo [13] KAWI LETTER A..KAWI LETTER O
+11F12..11F33 ; XID_Start # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA
11FB0 ; XID_Start # Lo LISU LETTER YHA
12000..12399 ; XID_Start # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
12400..1246E ; XID_Start # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
12480..12543 ; XID_Start # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
12F90..12FF0 ; XID_Start # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
-13000..1342E ; XID_Start # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
+13000..1342F ; XID_Start # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
+13441..13446 ; XID_Start # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
14400..14646 ; XID_Start # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
16800..16A38 ; XID_Start # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
16A40..16A5E ; XID_Start # Lo [31] MRO LETTER TA..MRO LETTER TEK
@@ -8775,7 +8866,9 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
1AFF5..1AFFB ; XID_Start # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
1AFFD..1AFFE ; XID_Start # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
1B000..1B122 ; XID_Start # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
+1B132 ; XID_Start # Lo HIRAGANA LETTER SMALL KO
1B150..1B152 ; XID_Start # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
+1B155 ; XID_Start # Lo KATAKANA LETTER SMALL KO
1B164..1B167 ; XID_Start # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
1B170..1B2FB ; XID_Start # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
1BC00..1BC6A ; XID_Start # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
@@ -8815,11 +8908,15 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
1DF00..1DF09 ; XID_Start # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
1DF0A ; XID_Start # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
1DF0B..1DF1E ; XID_Start # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
+1DF25..1DF2A ; XID_Start # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
+1E030..1E06D ; XID_Start # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
1E100..1E12C ; XID_Start # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
1E137..1E13D ; XID_Start # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
1E14E ; XID_Start # Lo NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
1E290..1E2AD ; XID_Start # Lo [30] TOTO LETTER PA..TOTO LETTER A
1E2C0..1E2EB ; XID_Start # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH
+1E4D0..1E4EA ; XID_Start # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
+1E4EB ; XID_Start # Lm NAG MUNDARI SIGN OJOD
1E7E0..1E7E6 ; XID_Start # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
1E7E8..1E7EB ; XID_Start # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
1E7ED..1E7EE ; XID_Start # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
@@ -8861,14 +8958,15 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
1EEA5..1EEA9 ; XID_Start # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
1EEAB..1EEBB ; XID_Start # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
20000..2A6DF ; XID_Start # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
-2A700..2B738 ; XID_Start # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
+2A700..2B739 ; XID_Start # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
2B740..2B81D ; XID_Start # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2B820..2CEA1 ; XID_Start # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
2CEB0..2EBE0 ; XID_Start # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
2F800..2FA1D ; XID_Start # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
30000..3134A ; XID_Start # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
+31350..323AF ; XID_Start # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
-# Total code points: 131974
+# Total code points: 136322
# ================================================
@@ -9147,6 +9245,7 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
0CE2..0CE3 ; XID_Continue # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
0CE6..0CEF ; XID_Continue # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
0CF1..0CF2 ; XID_Continue # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
+0CF3 ; XID_Continue # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
0D00..0D01 ; XID_Continue # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
0D02..0D03 ; XID_Continue # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D04..0D0C ; XID_Continue # Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
@@ -9200,7 +9299,7 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
0EBD ; XID_Continue # Lo LAO SEMIVOWEL SIGN NYO
0EC0..0EC4 ; XID_Continue # Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI
0EC6 ; XID_Continue # Lm LAO KO LA
-0EC8..0ECD ; XID_Continue # Mn [6] LAO TONE MAI EK..LAO NIGGAHITA
+0EC8..0ECE ; XID_Continue # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN
0ED0..0ED9 ; XID_Continue # Nd [10] LAO DIGIT ZERO..LAO DIGIT NINE
0EDC..0EDF ; XID_Continue # Lo [4] LAO HO NO..LAO LETTER KHMU NYO
0F00 ; XID_Continue # Lo TIBETAN SYLLABLE OM
@@ -9788,6 +9887,7 @@ FFDA..FFDC ; XID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA
10E80..10EA9 ; XID_Continue # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
10EAB..10EAC ; XID_Continue # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
10EB0..10EB1 ; XID_Continue # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EFD..10EFF ; XID_Continue # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
10F00..10F1C ; XID_Continue # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
10F27 ; XID_Continue # Lo OLD SOGDIAN LIGATURE AYIN-DALETH
10F30..10F45 ; XID_Continue # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
@@ -9850,6 +9950,8 @@ FFDA..FFDC ; XID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA
11235 ; XID_Continue # Mc KHOJKI SIGN VIRAMA
11236..11237 ; XID_Continue # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
1123E ; XID_Continue # Mn KHOJKI SIGN SUKUN
+1123F..11240 ; XID_Continue # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
+11241 ; XID_Continue # Mn KHOJKI VOWEL SIGN VOCALIC R
11280..11286 ; XID_Continue # Lo [7] MULTANI LETTER A..MULTANI LETTER GA
11288 ; XID_Continue # Lo MULTANI LETTER GHA
1128A..1128D ; XID_Continue # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA
@@ -10032,12 +10134,27 @@ FFDA..FFDC ; XID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA
11EE0..11EF2 ; XID_Continue # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA
11EF3..11EF4 ; XID_Continue # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
11EF5..11EF6 ; XID_Continue # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
+11F00..11F01 ; XID_Continue # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
+11F02 ; XID_Continue # Lo KAWI SIGN REPHA
+11F03 ; XID_Continue # Mc KAWI SIGN VISARGA
+11F04..11F10 ; XID_Continue # Lo [13] KAWI LETTER A..KAWI LETTER O
+11F12..11F33 ; XID_Continue # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA
+11F34..11F35 ; XID_Continue # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
+11F36..11F3A ; XID_Continue # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
+11F3E..11F3F ; XID_Continue # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
+11F40 ; XID_Continue # Mn KAWI VOWEL SIGN EU
+11F41 ; XID_Continue # Mc KAWI SIGN KILLER
+11F42 ; XID_Continue # Mn KAWI CONJOINER
+11F50..11F59 ; XID_Continue # Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
11FB0 ; XID_Continue # Lo LISU LETTER YHA
12000..12399 ; XID_Continue # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
12400..1246E ; XID_Continue # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
12480..12543 ; XID_Continue # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
12F90..12FF0 ; XID_Continue # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
-13000..1342E ; XID_Continue # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
+13000..1342F ; XID_Continue # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
+13440 ; XID_Continue # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
+13441..13446 ; XID_Continue # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
+13447..13455 ; XID_Continue # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
14400..14646 ; XID_Continue # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
16800..16A38 ; XID_Continue # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
16A40..16A5E ; XID_Continue # Lo [31] MRO LETTER TA..MRO LETTER TEK
@@ -10070,7 +10187,9 @@ FFDA..FFDC ; XID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA
1AFF5..1AFFB ; XID_Continue # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
1AFFD..1AFFE ; XID_Continue # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
1B000..1B122 ; XID_Continue # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
+1B132 ; XID_Continue # Lo HIRAGANA LETTER SMALL KO
1B150..1B152 ; XID_Continue # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
+1B155 ; XID_Continue # Lo KATAKANA LETTER SMALL KO
1B164..1B167 ; XID_Continue # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
1B170..1B2FB ; XID_Continue # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
1BC00..1BC6A ; XID_Continue # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
@@ -10127,11 +10246,14 @@ FFDA..FFDC ; XID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA
1DF00..1DF09 ; XID_Continue # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
1DF0A ; XID_Continue # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
1DF0B..1DF1E ; XID_Continue # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
+1DF25..1DF2A ; XID_Continue # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
1E000..1E006 ; XID_Continue # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
1E008..1E018 ; XID_Continue # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
1E01B..1E021 ; XID_Continue # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
1E023..1E024 ; XID_Continue # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
1E026..1E02A ; XID_Continue # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
+1E030..1E06D ; XID_Continue # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
+1E08F ; XID_Continue # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
1E100..1E12C ; XID_Continue # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
1E130..1E136 ; XID_Continue # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
1E137..1E13D ; XID_Continue # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
@@ -10142,6 +10264,10 @@ FFDA..FFDC ; XID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA
1E2C0..1E2EB ; XID_Continue # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH
1E2EC..1E2EF ; XID_Continue # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
1E2F0..1E2F9 ; XID_Continue # Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
+1E4D0..1E4EA ; XID_Continue # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
+1E4EB ; XID_Continue # Lm NAG MUNDARI SIGN OJOD
+1E4EC..1E4EF ; XID_Continue # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
+1E4F0..1E4F9 ; XID_Continue # Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
1E7E0..1E7E6 ; XID_Continue # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
1E7E8..1E7EB ; XID_Continue # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
1E7ED..1E7EE ; XID_Continue # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
@@ -10187,15 +10313,16 @@ FFDA..FFDC ; XID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA
1EEAB..1EEBB ; XID_Continue # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
1FBF0..1FBF9 ; XID_Continue # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
20000..2A6DF ; XID_Continue # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
-2A700..2B738 ; XID_Continue # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
+2A700..2B739 ; XID_Continue # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
2B740..2B81D ; XID_Continue # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2B820..2CEA1 ; XID_Continue # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
2CEB0..2EBE0 ; XID_Continue # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
2F800..2FA1D ; XID_Continue # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
30000..3134A ; XID_Continue # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
+31350..323AF ; XID_Continue # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
E0100..E01EF ; XID_Continue # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 135053
+# Total code points: 139463
# ================================================
@@ -10206,7 +10333,7 @@ E0100..E01EF ; XID_Continue # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTO
# + Variation_Selector
# - White_Space
# - FFF9..FFFB (Interlinear annotation format characters)
-# - 13430..13438 (Egyptian hieroglyph format characters)
+# - 13430..13440 (Egyptian hieroglyph format characters)
# - Prepended_Concatenation_Mark (Exceptional format characters that should be visible)
00AD ; Default_Ignorable_Code_Point # Cf SOFT HYPHEN
@@ -10351,7 +10478,7 @@ E01F0..E0FFF ; Default_Ignorable_Code_Point # Cn [3600] <reserved-E01F0>..<rese
0E47..0E4E ; Grapheme_Extend # Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN
0EB1 ; Grapheme_Extend # Mn LAO VOWEL SIGN MAI KAN
0EB4..0EBC ; Grapheme_Extend # Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO
-0EC8..0ECD ; Grapheme_Extend # Mn [6] LAO TONE MAI EK..LAO NIGGAHITA
+0EC8..0ECE ; Grapheme_Extend # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN
0F18..0F19 ; Grapheme_Extend # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
0F35 ; Grapheme_Extend # Mn TIBETAN MARK NGAS BZUNG NYI ZLA
0F37 ; Grapheme_Extend # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS
@@ -10490,6 +10617,7 @@ FF9E..FF9F ; Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK.
10AE5..10AE6 ; Grapheme_Extend # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
10D24..10D27 ; Grapheme_Extend # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
10EAB..10EAC ; Grapheme_Extend # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
+10EFD..10EFF ; Grapheme_Extend # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
10F46..10F50 ; Grapheme_Extend # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
10F82..10F85 ; Grapheme_Extend # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
11001 ; Grapheme_Extend # Mn BRAHMI SIGN ANUSVARA
@@ -10512,6 +10640,7 @@ FF9E..FF9F ; Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK.
11234 ; Grapheme_Extend # Mn KHOJKI SIGN ANUSVARA
11236..11237 ; Grapheme_Extend # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
1123E ; Grapheme_Extend # Mn KHOJKI SIGN SUKUN
+11241 ; Grapheme_Extend # Mn KHOJKI VOWEL SIGN VOCALIC R
112DF ; Grapheme_Extend # Mn KHUDAWADI SIGN ANUSVARA
112E3..112EA ; Grapheme_Extend # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
11300..11301 ; Grapheme_Extend # Mn [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU
@@ -10579,6 +10708,12 @@ FF9E..FF9F ; Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK.
11D95 ; Grapheme_Extend # Mn GUNJALA GONDI SIGN ANUSVARA
11D97 ; Grapheme_Extend # Mn GUNJALA GONDI VIRAMA
11EF3..11EF4 ; Grapheme_Extend # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
+11F00..11F01 ; Grapheme_Extend # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
+11F36..11F3A ; Grapheme_Extend # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
+11F40 ; Grapheme_Extend # Mn KAWI VOWEL SIGN EU
+11F42 ; Grapheme_Extend # Mn KAWI CONJOINER
+13440 ; Grapheme_Extend # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
+13447..13455 ; Grapheme_Extend # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
16AF0..16AF4 ; Grapheme_Extend # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
16B30..16B36 ; Grapheme_Extend # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
16F4F ; Grapheme_Extend # Mn MIAO SIGN CONSONANT MODIFIER BAR
@@ -10605,15 +10740,17 @@ FF9E..FF9F ; Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK.
1E01B..1E021 ; Grapheme_Extend # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
1E023..1E024 ; Grapheme_Extend # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
1E026..1E02A ; Grapheme_Extend # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
+1E08F ; Grapheme_Extend # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
1E130..1E136 ; Grapheme_Extend # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
1E2AE ; Grapheme_Extend # Mn TOTO SIGN RISING TONE
1E2EC..1E2EF ; Grapheme_Extend # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
+1E4EC..1E4EF ; Grapheme_Extend # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
1E8D0..1E8D6 ; Grapheme_Extend # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
1E944..1E94A ; Grapheme_Extend # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
E0020..E007F ; Grapheme_Extend # Cf [96] TAG SPACE..CANCEL TAG
E0100..E01EF ; Grapheme_Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 2090
+# Total code points: 2125
# ================================================
@@ -10913,6 +11050,7 @@ E0100..E01EF ; Grapheme_Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELE
0CE0..0CE1 ; Grapheme_Base # Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL
0CE6..0CEF ; Grapheme_Base # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
0CF1..0CF2 ; Grapheme_Base # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
+0CF3 ; Grapheme_Base # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
0D02..0D03 ; Grapheme_Base # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D04..0D0C ; Grapheme_Base # Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
0D0E..0D10 ; Grapheme_Base # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI
@@ -11965,6 +12103,7 @@ FFFC..FFFD ; Grapheme_Base # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
11232..11233 ; Grapheme_Base # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU
11235 ; Grapheme_Base # Mc KHOJKI SIGN VIRAMA
11238..1123D ; Grapheme_Base # Po [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN
+1123F..11240 ; Grapheme_Base # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
11280..11286 ; Grapheme_Base # Lo [7] MULTANI LETTER A..MULTANI LETTER GA
11288 ; Grapheme_Base # Lo MULTANI LETTER GHA
1128A..1128D ; Grapheme_Base # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA
@@ -12080,6 +12219,7 @@ FFFC..FFFD ; Grapheme_Base # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
11A9D ; Grapheme_Base # Lo SOYOMBO MARK PLUTA
11A9E..11AA2 ; Grapheme_Base # Po [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2
11AB0..11AF8 ; Grapheme_Base # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL
+11B00..11B09 ; Grapheme_Base # Po [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU
11C00..11C08 ; Grapheme_Base # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L
11C0A..11C2E ; Grapheme_Base # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA
11C2F ; Grapheme_Base # Mc BHAIKSUKI VOWEL SIGN AA
@@ -12109,6 +12249,15 @@ FFFC..FFFD ; Grapheme_Base # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
11EE0..11EF2 ; Grapheme_Base # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA
11EF5..11EF6 ; Grapheme_Base # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
11EF7..11EF8 ; Grapheme_Base # Po [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION
+11F02 ; Grapheme_Base # Lo KAWI SIGN REPHA
+11F03 ; Grapheme_Base # Mc KAWI SIGN VISARGA
+11F04..11F10 ; Grapheme_Base # Lo [13] KAWI LETTER A..KAWI LETTER O
+11F12..11F33 ; Grapheme_Base # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA
+11F34..11F35 ; Grapheme_Base # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
+11F3E..11F3F ; Grapheme_Base # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
+11F41 ; Grapheme_Base # Mc KAWI SIGN KILLER
+11F43..11F4F ; Grapheme_Base # Po [13] KAWI DANDA..KAWI PUNCTUATION CLOSING SPIRAL
+11F50..11F59 ; Grapheme_Base # Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
11FB0 ; Grapheme_Base # Lo LISU LETTER YHA
11FC0..11FD4 ; Grapheme_Base # No [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH
11FD5..11FDC ; Grapheme_Base # So [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI
@@ -12121,7 +12270,8 @@ FFFC..FFFD ; Grapheme_Base # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
12480..12543 ; Grapheme_Base # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
12F90..12FF0 ; Grapheme_Base # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
12FF1..12FF2 ; Grapheme_Base # Po [2] CYPRO-MINOAN SIGN CM301..CYPRO-MINOAN SIGN CM302
-13000..1342E ; Grapheme_Base # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
+13000..1342F ; Grapheme_Base # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
+13441..13446 ; Grapheme_Base # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
14400..14646 ; Grapheme_Base # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
16800..16A38 ; Grapheme_Base # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
16A40..16A5E ; Grapheme_Base # Lo [31] MRO LETTER TA..MRO LETTER TEK
@@ -12159,7 +12309,9 @@ FFFC..FFFD ; Grapheme_Base # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
1AFF5..1AFFB ; Grapheme_Base # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
1AFFD..1AFFE ; Grapheme_Base # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
1B000..1B122 ; Grapheme_Base # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
+1B132 ; Grapheme_Base # Lo HIRAGANA LETTER SMALL KO
1B150..1B152 ; Grapheme_Base # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
+1B155 ; Grapheme_Base # Lo KATAKANA LETTER SMALL KO
1B164..1B167 ; Grapheme_Base # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
1B170..1B2FB ; Grapheme_Base # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
1BC00..1BC6A ; Grapheme_Base # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
@@ -12180,6 +12332,7 @@ FFFC..FFFD ; Grapheme_Base # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
1D1AE..1D1EA ; Grapheme_Base # So [61] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KORON
1D200..1D241 ; Grapheme_Base # So [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
1D245 ; Grapheme_Base # So GREEK MUSICAL LEIMMA
+1D2C0..1D2D3 ; Grapheme_Base # No [20] KAKTOVIK NUMERAL ZERO..KAKTOVIK NUMERAL NINETEEN
1D2E0..1D2F3 ; Grapheme_Base # No [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN
1D300..1D356 ; Grapheme_Base # So [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING
1D360..1D378 ; Grapheme_Base # No [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE
@@ -12233,6 +12386,8 @@ FFFC..FFFD ; Grapheme_Base # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
1DF00..1DF09 ; Grapheme_Base # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
1DF0A ; Grapheme_Base # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
1DF0B..1DF1E ; Grapheme_Base # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
+1DF25..1DF2A ; Grapheme_Base # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
+1E030..1E06D ; Grapheme_Base # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
1E100..1E12C ; Grapheme_Base # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
1E137..1E13D ; Grapheme_Base # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
1E140..1E149 ; Grapheme_Base # Nd [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE
@@ -12242,6 +12397,9 @@ FFFC..FFFD ; Grapheme_Base # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
1E2C0..1E2EB ; Grapheme_Base # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH
1E2F0..1E2F9 ; Grapheme_Base # Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
1E2FF ; Grapheme_Base # Sc WANCHO NGUN SIGN
+1E4D0..1E4EA ; Grapheme_Base # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
+1E4EB ; Grapheme_Base # Lm NAG MUNDARI SIGN OJOD
+1E4F0..1E4F9 ; Grapheme_Base # Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
1E7E0..1E7E6 ; Grapheme_Base # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
1E7E8..1E7EB ; Grapheme_Base # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
1E7ED..1E7EE ; Grapheme_Base # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
@@ -12310,10 +12468,10 @@ FFFC..FFFD ; Grapheme_Base # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
1F300..1F3FA ; Grapheme_Base # So [251] CYCLONE..AMPHORA
1F3FB..1F3FF ; Grapheme_Base # Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
1F400..1F6D7 ; Grapheme_Base # So [728] RAT..ELEVATOR
-1F6DD..1F6EC ; Grapheme_Base # So [16] PLAYGROUND SLIDE..AIRPLANE ARRIVING
+1F6DC..1F6EC ; Grapheme_Base # So [17] WIRELESS..AIRPLANE ARRIVING
1F6F0..1F6FC ; Grapheme_Base # So [13] SATELLITE..ROLLER SKATE
-1F700..1F773 ; Grapheme_Base # So [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
-1F780..1F7D8 ; Grapheme_Base # So [89] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NEGATIVE CIRCLED SQUARE
+1F700..1F776 ; Grapheme_Base # So [119] ALCHEMICAL SYMBOL FOR QUINTESSENCE..LUNAR ECLIPSE
+1F77B..1F7D9 ; Grapheme_Base # So [95] HAUMEA..NINE POINTED WHITE STAR
1F7E0..1F7EB ; Grapheme_Base # So [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
1F7F0 ; Grapheme_Base # So HEAVY EQUALS SIGN
1F800..1F80B ; Grapheme_Base # So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD
@@ -12324,27 +12482,26 @@ FFFC..FFFD ; Grapheme_Base # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
1F8B0..1F8B1 ; Grapheme_Base # So [2] ARROW POINTING UPWARDS THEN NORTH WEST..ARROW POINTING RIGHTWARDS THEN CURVING SOUTH WEST
1F900..1FA53 ; Grapheme_Base # So [340] CIRCLED CROSS FORMEE WITH FOUR DOTS..BLACK CHESS KNIGHT-BISHOP
1FA60..1FA6D ; Grapheme_Base # So [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER
-1FA70..1FA74 ; Grapheme_Base # So [5] BALLET SHOES..THONG SANDAL
-1FA78..1FA7C ; Grapheme_Base # So [5] DROP OF BLOOD..CRUTCH
-1FA80..1FA86 ; Grapheme_Base # So [7] YO-YO..NESTING DOLLS
-1FA90..1FAAC ; Grapheme_Base # So [29] RINGED PLANET..HAMSA
-1FAB0..1FABA ; Grapheme_Base # So [11] FLY..NEST WITH EGGS
-1FAC0..1FAC5 ; Grapheme_Base # So [6] ANATOMICAL HEART..PERSON WITH CROWN
-1FAD0..1FAD9 ; Grapheme_Base # So [10] BLUEBERRIES..JAR
-1FAE0..1FAE7 ; Grapheme_Base # So [8] MELTING FACE..BUBBLES
-1FAF0..1FAF6 ; Grapheme_Base # So [7] HAND WITH INDEX FINGER AND THUMB CROSSED..HEART HANDS
+1FA70..1FA7C ; Grapheme_Base # So [13] BALLET SHOES..CRUTCH
+1FA80..1FA88 ; Grapheme_Base # So [9] YO-YO..FLUTE
+1FA90..1FABD ; Grapheme_Base # So [46] RINGED PLANET..WING
+1FABF..1FAC5 ; Grapheme_Base # So [7] GOOSE..PERSON WITH CROWN
+1FACE..1FADB ; Grapheme_Base # So [14] MOOSE..PEA POD
+1FAE0..1FAE8 ; Grapheme_Base # So [9] MELTING FACE..SHAKING FACE
+1FAF0..1FAF8 ; Grapheme_Base # So [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND
1FB00..1FB92 ; Grapheme_Base # So [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK
1FB94..1FBCA ; Grapheme_Base # So [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON
1FBF0..1FBF9 ; Grapheme_Base # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
20000..2A6DF ; Grapheme_Base # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
-2A700..2B738 ; Grapheme_Base # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
+2A700..2B739 ; Grapheme_Base # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
2B740..2B81D ; Grapheme_Base # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2B820..2CEA1 ; Grapheme_Base # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
2CEB0..2EBE0 ; Grapheme_Base # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
2F800..2FA1D ; Grapheme_Base # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
30000..3134A ; Grapheme_Base # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
+31350..323AF ; Grapheme_Base # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
-# Total code points: 142539
+# Total code points: 146986
# ================================================
@@ -12410,7 +12567,9 @@ ABED ; Grapheme_Link # Mn MEETEI MAYEK APUN IYEK
11C3F ; Grapheme_Link # Mn BHAIKSUKI SIGN VIRAMA
11D44..11D45 ; Grapheme_Link # Mn [2] MASARAM GONDI SIGN HALANTA..MASARAM GONDI VIRAMA
11D97 ; Grapheme_Link # Mn GUNJALA GONDI VIRAMA
+11F41 ; Grapheme_Link # Mc KAWI SIGN KILLER
+11F42 ; Grapheme_Link # Mn KAWI CONJOINER
-# Total code points: 63
+# Total code points: 65
# EOF
diff --git a/data/EastAsianWidth.txt b/data/EastAsianWidth.txt
@@ -1,6 +1,6 @@
-# EastAsianWidth-14.0.0.txt
-# Date: 2021-07-06, 09:58:53 GMT [KW, LI]
-# © 2021 Unicode®, Inc.
+# EastAsianWidth-15.0.0.txt
+# Date: 2022-05-24, 17:40:20 GMT [KW, LI]
+# © 2022 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
#
@@ -534,6 +534,7 @@
0CE2..0CE3;N # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
0CE6..0CEF;N # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
0CF1..0CF2;N # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
+0CF3;N # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
0D00..0D01;N # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
0D02..0D03;N # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D04..0D0C;N # Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
@@ -595,7 +596,7 @@
0EBD;N # Lo LAO SEMIVOWEL SIGN NYO
0EC0..0EC4;N # Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI
0EC6;N # Lm LAO KO LA
-0EC8..0ECD;N # Mn [6] LAO TONE MAI EK..LAO NIGGAHITA
+0EC8..0ECE;N # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN
0ED0..0ED9;N # Nd [10] LAO DIGIT ZERO..LAO DIGIT NINE
0EDC..0EDF;N # Lo [4] LAO HO NO..LAO LETTER KHMU NYO
0F00;N # Lo TIBETAN SYLLABLE OM
@@ -1946,6 +1947,7 @@ FFFD;A # So REPLACEMENT CHARACTER
10EAB..10EAC;N # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
10EAD;N # Pd YEZIDI HYPHENATION MARK
10EB0..10EB1;N # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EFD..10EFF;N # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
10F00..10F1C;N # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
10F1D..10F26;N # No [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF
10F27;N # Lo OLD SOGDIAN LIGATURE AYIN-DALETH
@@ -2028,6 +2030,8 @@ FFFD;A # So REPLACEMENT CHARACTER
11236..11237;N # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
11238..1123D;N # Po [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN
1123E;N # Mn KHOJKI SIGN SUKUN
+1123F..11240;N # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
+11241;N # Mn KHOJKI VOWEL SIGN VOCALIC R
11280..11286;N # Lo [7] MULTANI LETTER A..MULTANI LETTER GA
11288;N # Lo MULTANI LETTER GHA
1128A..1128D;N # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA
@@ -2190,6 +2194,7 @@ FFFD;A # So REPLACEMENT CHARACTER
11A9E..11AA2;N # Po [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2
11AB0..11ABF;N # Lo [16] CANADIAN SYLLABICS NATTILIK HI..CANADIAN SYLLABICS SPA
11AC0..11AF8;N # Lo [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL
+11B00..11B09;N # Po [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU
11C00..11C08;N # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L
11C0A..11C2E;N # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA
11C2F;N # Mc BHAIKSUKI VOWEL SIGN AA
@@ -2235,6 +2240,19 @@ FFFD;A # So REPLACEMENT CHARACTER
11EF3..11EF4;N # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
11EF5..11EF6;N # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
11EF7..11EF8;N # Po [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION
+11F00..11F01;N # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
+11F02;N # Lo KAWI SIGN REPHA
+11F03;N # Mc KAWI SIGN VISARGA
+11F04..11F10;N # Lo [13] KAWI LETTER A..KAWI LETTER O
+11F12..11F33;N # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA
+11F34..11F35;N # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
+11F36..11F3A;N # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
+11F3E..11F3F;N # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
+11F40;N # Mn KAWI VOWEL SIGN EU
+11F41;N # Mc KAWI SIGN KILLER
+11F42;N # Mn KAWI CONJOINER
+11F43..11F4F;N # Po [13] KAWI DANDA..KAWI PUNCTUATION CLOSING SPIRAL
+11F50..11F59;N # Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
11FB0;N # Lo LISU LETTER YHA
11FC0..11FD4;N # No [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH
11FD5..11FDC;N # So [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI
@@ -2247,8 +2265,11 @@ FFFD;A # So REPLACEMENT CHARACTER
12480..12543;N # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
12F90..12FF0;N # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
12FF1..12FF2;N # Po [2] CYPRO-MINOAN SIGN CM301..CYPRO-MINOAN SIGN CM302
-13000..1342E;N # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
-13430..13438;N # Cf [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT
+13000..1342F;N # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
+13430..1343F;N # Cf [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
+13440;N # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
+13441..13446;N # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
+13447..13455;N # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
14400..14646;N # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
16800..16A38;N # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
16A40..16A5E;N # Lo [31] MRO LETTER TA..MRO LETTER TEK
@@ -2293,7 +2314,9 @@ FFFD;A # So REPLACEMENT CHARACTER
1AFFD..1AFFE;W # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
1B000..1B0FF;W # Lo [256] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER RE-2
1B100..1B122;W # Lo [35] HENTAIGANA LETTER RE-3..KATAKANA LETTER ARCHAIC WU
+1B132;W # Lo HIRAGANA LETTER SMALL KO
1B150..1B152;W # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
+1B155;W # Lo KATAKANA LETTER SMALL KO
1B164..1B167;W # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
1B170..1B2FB;W # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
1BC00..1BC6A;N # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
@@ -2324,6 +2347,7 @@ FFFD;A # So REPLACEMENT CHARACTER
1D200..1D241;N # So [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
1D242..1D244;N # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME
1D245;N # So GREEK MUSICAL LEIMMA
+1D2C0..1D2D3;N # No [20] KAKTOVIK NUMERAL ZERO..KAKTOVIK NUMERAL NINETEEN
1D2E0..1D2F3;N # No [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN
1D300..1D356;N # So [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING
1D360..1D378;N # No [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE
@@ -2383,11 +2407,14 @@ FFFD;A # So REPLACEMENT CHARACTER
1DF00..1DF09;N # Ll [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
1DF0A;N # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
1DF0B..1DF1E;N # Ll [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
+1DF25..1DF2A;N # Ll [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
1E000..1E006;N # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
1E008..1E018;N # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
1E01B..1E021;N # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
1E023..1E024;N # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
1E026..1E02A;N # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
+1E030..1E06D;N # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
+1E08F;N # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
1E100..1E12C;N # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
1E130..1E136;N # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
1E137..1E13D;N # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
@@ -2400,6 +2427,10 @@ FFFD;A # So REPLACEMENT CHARACTER
1E2EC..1E2EF;N # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
1E2F0..1E2F9;N # Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
1E2FF;N # Sc WANCHO NGUN SIGN
+1E4D0..1E4EA;N # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
+1E4EB;N # Lm NAG MUNDARI SIGN OJOD
+1E4EC..1E4EF;N # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
+1E4F0..1E4F9;N # Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
1E7E0..1E7E6;N # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
1E7E8..1E7EB;N # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
1E7ED..1E7EE;N # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
@@ -2528,13 +2559,14 @@ FFFD;A # So REPLACEMENT CHARACTER
1F6D0..1F6D2;W # So [3] PLACE OF WORSHIP..SHOPPING TROLLEY
1F6D3..1F6D4;N # So [2] STUPA..PAGODA
1F6D5..1F6D7;W # So [3] HINDU TEMPLE..ELEVATOR
-1F6DD..1F6DF;W # So [3] PLAYGROUND SLIDE..RING BUOY
+1F6DC..1F6DF;W # So [4] WIRELESS..RING BUOY
1F6E0..1F6EA;N # So [11] HAMMER AND WRENCH..NORTHEAST-POINTING AIRPLANE
1F6EB..1F6EC;W # So [2] AIRPLANE DEPARTURE..AIRPLANE ARRIVING
1F6F0..1F6F3;N # So [4] SATELLITE..PASSENGER SHIP
1F6F4..1F6FC;W # So [9] SCOOTER..ROLLER SKATE
-1F700..1F773;N # So [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
-1F780..1F7D8;N # So [89] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NEGATIVE CIRCLED SQUARE
+1F700..1F776;N # So [119] ALCHEMICAL SYMBOL FOR QUINTESSENCE..LUNAR ECLIPSE
+1F77B..1F77F;N # So [5] HAUMEA..ORCUS
+1F780..1F7D9;N # So [90] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NINE POINTED WHITE STAR
1F7E0..1F7EB;W # So [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
1F7F0;W # So HEAVY EQUALS SIGN
1F800..1F80B;N # So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD
@@ -2551,22 +2583,20 @@ FFFD;A # So REPLACEMENT CHARACTER
1F947..1F9FF;W # So [185] FIRST PLACE MEDAL..NAZAR AMULET
1FA00..1FA53;N # So [84] NEUTRAL CHESS KING..BLACK CHESS KNIGHT-BISHOP
1FA60..1FA6D;N # So [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER
-1FA70..1FA74;W # So [5] BALLET SHOES..THONG SANDAL
-1FA78..1FA7C;W # So [5] DROP OF BLOOD..CRUTCH
-1FA80..1FA86;W # So [7] YO-YO..NESTING DOLLS
-1FA90..1FAAC;W # So [29] RINGED PLANET..HAMSA
-1FAB0..1FABA;W # So [11] FLY..NEST WITH EGGS
-1FAC0..1FAC5;W # So [6] ANATOMICAL HEART..PERSON WITH CROWN
-1FAD0..1FAD9;W # So [10] BLUEBERRIES..JAR
-1FAE0..1FAE7;W # So [8] MELTING FACE..BUBBLES
-1FAF0..1FAF6;W # So [7] HAND WITH INDEX FINGER AND THUMB CROSSED..HEART HANDS
+1FA70..1FA7C;W # So [13] BALLET SHOES..CRUTCH
+1FA80..1FA88;W # So [9] YO-YO..FLUTE
+1FA90..1FABD;W # So [46] RINGED PLANET..WING
+1FABF..1FAC5;W # So [7] GOOSE..PERSON WITH CROWN
+1FACE..1FADB;W # So [14] MOOSE..PEA POD
+1FAE0..1FAE8;W # So [9] MELTING FACE..SHAKING FACE
+1FAF0..1FAF8;W # So [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND
1FB00..1FB92;N # So [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK
1FB94..1FBCA;N # So [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON
1FBF0..1FBF9;N # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
20000..2A6DF;W # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
2A6E0..2A6FF;W # Cn [32] <reserved-2A6E0>..<reserved-2A6FF>
-2A700..2B738;W # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
-2B739..2B73F;W # Cn [7] <reserved-2B739>..<reserved-2B73F>
+2A700..2B739;W # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
+2B73A..2B73F;W # Cn [6] <reserved-2B73A>..<reserved-2B73F>
2B740..2B81D;W # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2B81E..2B81F;W # Cn [2] <reserved-2B81E>..<reserved-2B81F>
2B820..2CEA1;W # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
@@ -2577,7 +2607,9 @@ FFFD;A # So REPLACEMENT CHARACTER
2FA1E..2FA1F;W # Cn [2] <reserved-2FA1E>..<reserved-2FA1F>
2FA20..2FFFD;W # Cn [1502] <reserved-2FA20>..<reserved-2FFFD>
30000..3134A;W # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
-3134B..3FFFD;W # Cn [60595] <reserved-3134B>..<reserved-3FFFD>
+3134B..3134F;W # Cn [5] <reserved-3134B>..<reserved-3134F>
+31350..323AF;W # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
+323B0..3FFFD;W # Cn [56398] <reserved-323B0>..<reserved-3FFFD>
E0001;N # Cf LANGUAGE TAG
E0020..E007F;N # Cf [96] TAG SPACE..CANCEL TAG
E0100..E01EF;A # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
diff --git a/data/GraphemeBreakProperty.txt b/data/GraphemeBreakProperty.txt
@@ -1,11 +1,11 @@
-# GraphemeBreakProperty-14.0.0.txt
-# Date: 2021-08-12, 23:13:02 GMT
-# © 2021 Unicode®, Inc.
+# GraphemeBreakProperty-15.0.0.txt
+# Date: 2022-04-27, 17:07:38 GMT
+# © 2022 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
-# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For terms of use, see https://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
-# For documentation, see http://www.unicode.org/reports/tr44/
+# For documentation, see https://www.unicode.org/reports/tr44/
# ================================================
@@ -32,8 +32,9 @@
11A3A ; Prepend # Lo ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA
11A84..11A89 ; Prepend # Lo [6] SOYOMBO SIGN JIHVAMULIYA..SOYOMBO CLUSTER-INITIAL LETTER SA
11D46 ; Prepend # Lo MASARAM GONDI REPHA
+11F02 ; Prepend # Lo KAWI SIGN REPHA
-# Total code points: 26
+# Total code points: 27
# ================================================
@@ -67,7 +68,7 @@
FEFF ; Control # Cf ZERO WIDTH NO-BREAK SPACE
FFF0..FFF8 ; Control # Cn [9] <reserved-FFF0>..<reserved-FFF8>
FFF9..FFFB ; Control # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR
-13430..13438 ; Control # Cf [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT
+13430..1343F ; Control # Cf [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
1BCA0..1BCA3 ; Control # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
1D173..1D17A ; Control # Cf [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE
E0000 ; Control # Cn <reserved-E0000>
@@ -76,7 +77,7 @@ E0002..E001F ; Control # Cn [30] <reserved-E0002>..<reserved-E001F>
E0080..E00FF ; Control # Cn [128] <reserved-E0080>..<reserved-E00FF>
E01F0..E0FFF ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
-# Total code points: 3886
+# Total code points: 3893
# ================================================
@@ -185,7 +186,7 @@ E01F0..E0FFF ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
0E47..0E4E ; Extend # Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN
0EB1 ; Extend # Mn LAO VOWEL SIGN MAI KAN
0EB4..0EBC ; Extend # Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO
-0EC8..0ECD ; Extend # Mn [6] LAO TONE MAI EK..LAO NIGGAHITA
+0EC8..0ECE ; Extend # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN
0F18..0F19 ; Extend # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
0F35 ; Extend # Mn TIBETAN MARK NGAS BZUNG NYI ZLA
0F37 ; Extend # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS
@@ -324,6 +325,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
10AE5..10AE6 ; Extend # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
10D24..10D27 ; Extend # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
10EAB..10EAC ; Extend # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
+10EFD..10EFF ; Extend # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
10F46..10F50 ; Extend # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
10F82..10F85 ; Extend # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
11001 ; Extend # Mn BRAHMI SIGN ANUSVARA
@@ -346,6 +348,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
11234 ; Extend # Mn KHOJKI SIGN ANUSVARA
11236..11237 ; Extend # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
1123E ; Extend # Mn KHOJKI SIGN SUKUN
+11241 ; Extend # Mn KHOJKI VOWEL SIGN VOCALIC R
112DF ; Extend # Mn KHUDAWADI SIGN ANUSVARA
112E3..112EA ; Extend # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
11300..11301 ; Extend # Mn [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU
@@ -413,6 +416,12 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
11D95 ; Extend # Mn GUNJALA GONDI SIGN ANUSVARA
11D97 ; Extend # Mn GUNJALA GONDI VIRAMA
11EF3..11EF4 ; Extend # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
+11F00..11F01 ; Extend # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
+11F36..11F3A ; Extend # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
+11F40 ; Extend # Mn KAWI VOWEL SIGN EU
+11F42 ; Extend # Mn KAWI CONJOINER
+13440 ; Extend # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
+13447..13455 ; Extend # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
16AF0..16AF4 ; Extend # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
16B30..16B36 ; Extend # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
16F4F ; Extend # Mn MIAO SIGN CONSONANT MODIFIER BAR
@@ -439,16 +448,18 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
1E01B..1E021 ; Extend # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
1E023..1E024 ; Extend # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
1E026..1E02A ; Extend # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
+1E08F ; Extend # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
1E130..1E136 ; Extend # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
1E2AE ; Extend # Mn TOTO SIGN RISING TONE
1E2EC..1E2EF ; Extend # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
+1E4EC..1E4EF ; Extend # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
1E8D0..1E8D6 ; Extend # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
1E944..1E94A ; Extend # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
1F3FB..1F3FF ; Extend # Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
E0020..E007F ; Extend # Cf [96] TAG SPACE..CANCEL TAG
E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 2095
+# Total code points: 2130
# ================================================
@@ -489,6 +500,7 @@ E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
0CC3..0CC4 ; SpacingMark # Mc [2] KANNADA VOWEL SIGN VOCALIC R..KANNADA VOWEL SIGN VOCALIC RR
0CC7..0CC8 ; SpacingMark # Mc [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI
0CCA..0CCB ; SpacingMark # Mc [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO
+0CF3 ; SpacingMark # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
0D02..0D03 ; SpacingMark # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D3F..0D40 ; SpacingMark # Mc [2] MALAYALAM VOWEL SIGN I..MALAYALAM VOWEL SIGN II
0D46..0D48 ; SpacingMark # Mc [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI
@@ -614,12 +626,16 @@ ABEC ; SpacingMark # Mc MEETEI MAYEK LUM IYEK
11D93..11D94 ; SpacingMark # Mc [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU
11D96 ; SpacingMark # Mc GUNJALA GONDI SIGN VISARGA
11EF5..11EF6 ; SpacingMark # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
+11F03 ; SpacingMark # Mc KAWI SIGN VISARGA
+11F34..11F35 ; SpacingMark # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
+11F3E..11F3F ; SpacingMark # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
+11F41 ; SpacingMark # Mc KAWI SIGN KILLER
16F51..16F87 ; SpacingMark # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI
16FF0..16FF1 ; SpacingMark # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY
1D166 ; SpacingMark # Mc MUSICAL SYMBOL COMBINING SPRECHGESANG STEM
1D16D ; SpacingMark # Mc MUSICAL SYMBOL COMBINING AUGMENTATION DOT
-# Total code points: 388
+# Total code points: 395
# ================================================
diff --git a/data/GraphemeBreakTest.txt b/data/GraphemeBreakTest.txt
@@ -1,11 +1,11 @@
-# GraphemeBreakTest-14.0.0.txt
-# Date: 2021-03-08, 06:22:32 GMT
-# © 2021 Unicode®, Inc.
+# GraphemeBreakTest-15.0.0.txt
+# Date: 2022-02-26, 00:38:37 GMT
+# © 2022 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
-# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For terms of use, see https://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
-# For documentation, see http://www.unicode.org/reports/tr44/
+# For documentation, see https://www.unicode.org/reports/tr44/
#
# Default Grapheme_Cluster_Break Test
#
diff --git a/data/LineBreak.txt b/data/LineBreak.txt
@@ -1,6 +1,6 @@
-# LineBreak-14.0.0.txt
-# Date: 2021-07-06, 09:58:55 GMT [KW, LI]
-# © 2021 Unicode®, Inc.
+# LineBreak-15.0.0.txt
+# Date: 2022-07-28, 09:20:42 GMT [KW, LI]
+# © 2022 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
#
@@ -481,6 +481,7 @@
0CE2..0CE3;CM # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
0CE6..0CEF;NU # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
0CF1..0CF2;AL # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
+0CF3;CM # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
0D00..0D01;CM # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
0D02..0D03;CM # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D04..0D0C;AL # Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
@@ -542,7 +543,7 @@
0EBD;SA # Lo LAO SEMIVOWEL SIGN NYO
0EC0..0EC4;SA # Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI
0EC6;SA # Lm LAO KO LA
-0EC8..0ECD;SA # Mn [6] LAO TONE MAI EK..LAO NIGGAHITA
+0EC8..0ECE;SA # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN
0ED0..0ED9;NU # Nd [10] LAO DIGIT ZERO..LAO DIGIT NINE
0EDC..0EDF;SA # Lo [4] LAO HO NO..LAO LETTER KHMU NYO
0F00;AL # Lo TIBETAN SYLLABLE OM
@@ -855,7 +856,11 @@
1D79..1D7F;AL # Ll [7] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER UPSILON WITH STROKE
1D80..1D9A;AL # Ll [27] LATIN SMALL LETTER B WITH PALATAL HOOK..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK
1D9B..1DBF;AL # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA
-1DC0..1DFF;CM # Mn [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
+1DC0..1DCC;CM # Mn [13] COMBINING DOTTED GRAVE ACCENT..COMBINING MACRON-BREVE
+1DCD;GL # Mn COMBINING DOUBLE CIRCUMFLEX ABOVE
+1DCE..1DFB;CM # Mn [46] COMBINING OGONEK ABOVE..COMBINING DELETION MARK
+1DFC;GL # Mn COMBINING DOUBLE INVERTED BREVE BELOW
+1DFD..1DFF;CM # Mn [3] COMBINING ALMOST EQUAL TO BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
1E00..1EFF;AL # L& [256] LATIN CAPITAL LETTER A WITH RING BELOW..LATIN SMALL LETTER Y WITH LOOP
1F00..1F15;AL # L& [22] GREEK SMALL LETTER ALPHA WITH PSILI..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA
1F18..1F1D;AL # Lu [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA
@@ -931,7 +936,7 @@
2054;AL # Pc INVERTED UNDERTIE
2055;AL # Po FLOWER PUNCTUATION MARK
2056;BA # Po THREE DOT PUNCTUATION
-2057;AL # Po QUADRUPLE PRIME
+2057;PO # Po QUADRUPLE PRIME
2058..205B;BA # Po [4] FOUR DOT PUNCTUATION..FOUR DOT MARK
205C;AL # Po DOTTED CROSS
205D..205E;BA # Po [2] TRICOLON..VERTICAL FOUR DOTS
@@ -2793,6 +2798,7 @@ FFFD;AI # So REPLACEMENT CHARACTER
10EAB..10EAC;CM # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
10EAD;BA # Pd YEZIDI HYPHENATION MARK
10EB0..10EB1;AL # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EFD..10EFF;CM # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
10F00..10F1C;AL # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
10F1D..10F26;AL # No [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF
10F27;AL # Lo OLD SOGDIAN LIGATURE AYIN-DALETH
@@ -2882,6 +2888,8 @@ FFFD;AI # So REPLACEMENT CHARACTER
1123B..1123C;BA # Po [2] KHOJKI SECTION MARK..KHOJKI DOUBLE SECTION MARK
1123D;AL # Po KHOJKI ABBREVIATION SIGN
1123E;CM # Mn KHOJKI SIGN SUKUN
+1123F..11240;AL # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
+11241;CM # Mn KHOJKI VOWEL SIGN VOCALIC R
11280..11286;AL # Lo [7] MULTANI LETTER A..MULTANI LETTER GA
11288;AL # Lo MULTANI LETTER GHA
1128A..1128D;AL # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA
@@ -3055,6 +3063,7 @@ FFFD;AI # So REPLACEMENT CHARACTER
11AA1..11AA2;BA # Po [2] SOYOMBO TERMINAL MARK-1..SOYOMBO TERMINAL MARK-2
11AB0..11ABF;AL # Lo [16] CANADIAN SYLLABICS NATTILIK HI..CANADIAN SYLLABICS SPA
11AC0..11AF8;AL # Lo [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL
+11B00..11B09;BB # Po [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU
11C00..11C08;AL # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L
11C0A..11C2E;AL # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA
11C2F;CM # Mc BHAIKSUKI VOWEL SIGN AA
@@ -3101,6 +3110,20 @@ FFFD;AI # So REPLACEMENT CHARACTER
11EF3..11EF4;CM # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
11EF5..11EF6;CM # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
11EF7..11EF8;AL # Po [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION
+11F00..11F01;CM # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
+11F02;AL # Lo KAWI SIGN REPHA
+11F03;CM # Mc KAWI SIGN VISARGA
+11F04..11F10;AL # Lo [13] KAWI LETTER A..KAWI LETTER O
+11F12..11F33;AL # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA
+11F34..11F35;CM # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
+11F36..11F3A;CM # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
+11F3E..11F3F;CM # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
+11F40;CM # Mn KAWI VOWEL SIGN EU
+11F41;CM # Mc KAWI SIGN KILLER
+11F42;CM # Mn KAWI CONJOINER
+11F43..11F44;BA # Po [2] KAWI DANDA..KAWI DOUBLE DANDA
+11F45..11F4F;ID # Po [11] KAWI PUNCTUATION SECTION MARKER..KAWI PUNCTUATION CLOSING SPIRAL
+11F50..11F59;NU # Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
11FB0;AL # Lo LISU LETTER YHA
11FC0..11FD4;AL # No [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH
11FD5..11FDC;AL # So [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI
@@ -3126,10 +3149,18 @@ FFFD;AI # So REPLACEMENT CHARACTER
1328A..13378;AL # Lo [239] EGYPTIAN HIEROGLYPH O037..EGYPTIAN HIEROGLYPH V011
13379;OP # Lo EGYPTIAN HIEROGLYPH V011A
1337A..1337B;CL # Lo [2] EGYPTIAN HIEROGLYPH V011B..EGYPTIAN HIEROGLYPH V011C
-1337C..1342E;AL # Lo [179] EGYPTIAN HIEROGLYPH V012..EGYPTIAN HIEROGLYPH AA032
+1337C..1342F;AL # Lo [180] EGYPTIAN HIEROGLYPH V012..EGYPTIAN HIEROGLYPH V011D
13430..13436;GL # Cf [7] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH OVERLAY MIDDLE
13437;OP # Cf EGYPTIAN HIEROGLYPH BEGIN SEGMENT
13438;CL # Cf EGYPTIAN HIEROGLYPH END SEGMENT
+13439..1343B;GL # Cf [3] EGYPTIAN HIEROGLYPH INSERT AT MIDDLE..EGYPTIAN HIEROGLYPH INSERT AT BOTTOM
+1343C;OP # Cf EGYPTIAN HIEROGLYPH BEGIN ENCLOSURE
+1343D;CL # Cf EGYPTIAN HIEROGLYPH END ENCLOSURE
+1343E;OP # Cf EGYPTIAN HIEROGLYPH BEGIN WALLED ENCLOSURE
+1343F;CL # Cf EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
+13440;CM # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
+13441..13446;AL # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
+13447..13455;CM # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
14400..145CD;AL # Lo [462] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A409
145CE;OP # Lo ANATOLIAN HIEROGLYPH A410 BEGIN LOGOGRAM MARK
145CF;CL # Lo ANATOLIAN HIEROGLYPH A410A END LOGOGRAM MARK
@@ -3179,7 +3210,9 @@ FFFD;AI # So REPLACEMENT CHARACTER
1AFFD..1AFFE;AL # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
1B000..1B0FF;ID # Lo [256] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER RE-2
1B100..1B122;ID # Lo [35] HENTAIGANA LETTER RE-3..KATAKANA LETTER ARCHAIC WU
+1B132;CJ # Lo HIRAGANA LETTER SMALL KO
1B150..1B152;CJ # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
+1B155;CJ # Lo KATAKANA LETTER SMALL KO
1B164..1B167;CJ # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
1B170..1B2FB;ID # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
1BC00..1BC6A;AL # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
@@ -3210,6 +3243,7 @@ FFFD;AI # So REPLACEMENT CHARACTER
1D200..1D241;AL # So [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
1D242..1D244;CM # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME
1D245;AL # So GREEK MUSICAL LEIMMA
+1D2C0..1D2D3;AL # No [20] KAKTOVIK NUMERAL ZERO..KAKTOVIK NUMERAL NINETEEN
1D2E0..1D2F3;AL # No [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN
1D300..1D356;AL # So [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING
1D360..1D378;AL # No [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE
@@ -3270,11 +3304,14 @@ FFFD;AI # So REPLACEMENT CHARACTER
1DF00..1DF09;AL # Ll [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
1DF0A;AL # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
1DF0B..1DF1E;AL # Ll [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
+1DF25..1DF2A;AL # Ll [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
1E000..1E006;CM # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
1E008..1E018;CM # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
1E01B..1E021;CM # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
1E023..1E024;CM # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
1E026..1E02A;CM # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
+1E030..1E06D;AL # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
+1E08F;CM # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
1E100..1E12C;AL # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
1E130..1E136;CM # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
1E137..1E13D;AL # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
@@ -3287,6 +3324,10 @@ FFFD;AI # So REPLACEMENT CHARACTER
1E2EC..1E2EF;CM # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
1E2F0..1E2F9;NU # Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
1E2FF;PR # Sc WANCHO NGUN SIGN
+1E4D0..1E4EA;AL # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
+1E4EB;AL # Lm NAG MUNDARI SIGN OJOD
+1E4EC..1E4EF;CM # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
+1E4F0..1E4F9;NU # Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
1E7E0..1E7E6;AL # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
1E7E8..1E7EB;AL # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
1E7ED..1E7EE;AL # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
@@ -3454,16 +3495,18 @@ FFFD;AI # So REPLACEMENT CHARACTER
1F6C1..1F6CB;ID # So [11] BATHTUB..COUCH AND LAMP
1F6CC;EB # So SLEEPING ACCOMMODATION
1F6CD..1F6D7;ID # So [11] SHOPPING BAGS..ELEVATOR
-1F6D8..1F6DC;ID # Cn [5] <reserved-1F6D8>..<reserved-1F6DC>
-1F6DD..1F6EC;ID # So [16] PLAYGROUND SLIDE..AIRPLANE ARRIVING
+1F6D8..1F6DB;ID # Cn [4] <reserved-1F6D8>..<reserved-1F6DB>
+1F6DC..1F6EC;ID # So [17] WIRELESS..AIRPLANE ARRIVING
1F6ED..1F6EF;ID # Cn [3] <reserved-1F6ED>..<reserved-1F6EF>
1F6F0..1F6FC;ID # So [13] SATELLITE..ROLLER SKATE
1F6FD..1F6FF;ID # Cn [3] <reserved-1F6FD>..<reserved-1F6FF>
1F700..1F773;AL # So [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
-1F774..1F77F;ID # Cn [12] <reserved-1F774>..<reserved-1F77F>
+1F774..1F776;ID # So [3] LOT OF FORTUNE..LUNAR ECLIPSE
+1F777..1F77A;ID # Cn [4] <reserved-1F777>..<reserved-1F77A>
+1F77B..1F77F;ID # So [5] HAUMEA..ORCUS
1F780..1F7D4;AL # So [85] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..HEAVY TWELVE POINTED PINWHEEL STAR
-1F7D5..1F7D8;ID # So [4] CIRCLED TRIANGLE..NEGATIVE CIRCLED SQUARE
-1F7D9..1F7DF;ID # Cn [7] <reserved-1F7D9>..<reserved-1F7DF>
+1F7D5..1F7D9;ID # So [5] CIRCLED TRIANGLE..NINE POINTED WHITE STAR
+1F7DA..1F7DF;ID # Cn [6] <reserved-1F7DA>..<reserved-1F7DF>
1F7E0..1F7EB;ID # So [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
1F7EC..1F7EF;ID # Cn [4] <reserved-1F7EC>..<reserved-1F7EF>
1F7F0;ID # So HEAVY EQUALS SIGN
@@ -3509,33 +3552,29 @@ FFFD;AI # So REPLACEMENT CHARACTER
1FA54..1FA5F;ID # Cn [12] <reserved-1FA54>..<reserved-1FA5F>
1FA60..1FA6D;ID # So [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER
1FA6E..1FA6F;ID # Cn [2] <reserved-1FA6E>..<reserved-1FA6F>
-1FA70..1FA74;ID # So [5] BALLET SHOES..THONG SANDAL
-1FA75..1FA77;ID # Cn [3] <reserved-1FA75>..<reserved-1FA77>
-1FA78..1FA7C;ID # So [5] DROP OF BLOOD..CRUTCH
+1FA70..1FA7C;ID # So [13] BALLET SHOES..CRUTCH
1FA7D..1FA7F;ID # Cn [3] <reserved-1FA7D>..<reserved-1FA7F>
-1FA80..1FA86;ID # So [7] YO-YO..NESTING DOLLS
-1FA87..1FA8F;ID # Cn [9] <reserved-1FA87>..<reserved-1FA8F>
-1FA90..1FAAC;ID # So [29] RINGED PLANET..HAMSA
-1FAAD..1FAAF;ID # Cn [3] <reserved-1FAAD>..<reserved-1FAAF>
-1FAB0..1FABA;ID # So [11] FLY..NEST WITH EGGS
-1FABB..1FABF;ID # Cn [5] <reserved-1FABB>..<reserved-1FABF>
-1FAC0..1FAC2;ID # So [3] ANATOMICAL HEART..PEOPLE HUGGING
+1FA80..1FA88;ID # So [9] YO-YO..FLUTE
+1FA89..1FA8F;ID # Cn [7] <reserved-1FA89>..<reserved-1FA8F>
+1FA90..1FABD;ID # So [46] RINGED PLANET..WING
+1FABE;ID # Cn <reserved-1FABE>
+1FABF..1FAC2;ID # So [4] GOOSE..PEOPLE HUGGING
1FAC3..1FAC5;EB # So [3] PREGNANT MAN..PERSON WITH CROWN
-1FAC6..1FACF;ID # Cn [10] <reserved-1FAC6>..<reserved-1FACF>
-1FAD0..1FAD9;ID # So [10] BLUEBERRIES..JAR
-1FADA..1FADF;ID # Cn [6] <reserved-1FADA>..<reserved-1FADF>
-1FAE0..1FAE7;ID # So [8] MELTING FACE..BUBBLES
-1FAE8..1FAEF;ID # Cn [8] <reserved-1FAE8>..<reserved-1FAEF>
-1FAF0..1FAF6;EB # So [7] HAND WITH INDEX FINGER AND THUMB CROSSED..HEART HANDS
-1FAF7..1FAFF;ID # Cn [9] <reserved-1FAF7>..<reserved-1FAFF>
+1FAC6..1FACD;ID # Cn [8] <reserved-1FAC6>..<reserved-1FACD>
+1FACE..1FADB;ID # So [14] MOOSE..PEA POD
+1FADC..1FADF;ID # Cn [4] <reserved-1FADC>..<reserved-1FADF>
+1FAE0..1FAE8;ID # So [9] MELTING FACE..SHAKING FACE
+1FAE9..1FAEF;ID # Cn [7] <reserved-1FAE9>..<reserved-1FAEF>
+1FAF0..1FAF8;EB # So [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND
+1FAF9..1FAFF;ID # Cn [7] <reserved-1FAF9>..<reserved-1FAFF>
1FB00..1FB92;AL # So [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK
1FB94..1FBCA;AL # So [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON
1FBF0..1FBF9;NU # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
1FC00..1FFFD;ID # Cn [1022] <reserved-1FC00>..<reserved-1FFFD>
20000..2A6DF;ID # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
2A6E0..2A6FF;ID # Cn [32] <reserved-2A6E0>..<reserved-2A6FF>
-2A700..2B738;ID # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
-2B739..2B73F;ID # Cn [7] <reserved-2B739>..<reserved-2B73F>
+2A700..2B739;ID # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
+2B73A..2B73F;ID # Cn [6] <reserved-2B73A>..<reserved-2B73F>
2B740..2B81D;ID # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2B81E..2B81F;ID # Cn [2] <reserved-2B81E>..<reserved-2B81F>
2B820..2CEA1;ID # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
@@ -3546,7 +3585,9 @@ FFFD;AI # So REPLACEMENT CHARACTER
2FA1E..2FA1F;ID # Cn [2] <reserved-2FA1E>..<reserved-2FA1F>
2FA20..2FFFD;ID # Cn [1502] <reserved-2FA20>..<reserved-2FFFD>
30000..3134A;ID # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
-3134B..3FFFD;ID # Cn [60595] <reserved-3134B>..<reserved-3FFFD>
+3134B..3134F;ID # Cn [5] <reserved-3134B>..<reserved-3134F>
+31350..323AF;ID # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
+323B0..3FFFD;ID # Cn [56398] <reserved-323B0>..<reserved-3FFFD>
E0001;CM # Cf LANGUAGE TAG
E0020..E007F;CM # Cf [96] TAG SPACE..CANCEL TAG
E0100..E01EF;CM # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
diff --git a/data/LineBreakTest.txt b/data/LineBreakTest.txt
@@ -1,11 +1,11 @@
-# LineBreakTest-14.0.0.txt
-# Date: 2021-08-20, 21:08:45 GMT
-# © 2021 Unicode®, Inc.
+# LineBreakTest-15.0.0.txt
+# Date: 2022-02-26, 00:38:39 GMT
+# © 2022 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
-# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For terms of use, see https://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
-# For documentation, see http://www.unicode.org/reports/tr44/
+# For documentation, see https://www.unicode.org/reports/tr44/
#
# Default Line_Break Test
#
diff --git a/data/SentenceBreakProperty.txt b/data/SentenceBreakProperty.txt
@@ -1,11 +1,11 @@
-# SentenceBreakProperty-14.0.0.txt
-# Date: 2021-08-12, 23:13:21 GMT
-# © 2021 Unicode®, Inc.
+# SentenceBreakProperty-15.0.0.txt
+# Date: 2022-08-05, 22:17:35 GMT
+# © 2022 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
-# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For terms of use, see https://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
-# For documentation, see http://www.unicode.org/reports/tr44/
+# For documentation, see https://www.unicode.org/reports/tr44/
# ================================================
@@ -144,6 +144,7 @@
0CCC..0CCD ; Extend # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA
0CD5..0CD6 ; Extend # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
0CE2..0CE3 ; Extend # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
+0CF3 ; Extend # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
0D00..0D01 ; Extend # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
0D02..0D03 ; Extend # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D3B..0D3C ; Extend # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA
@@ -167,7 +168,7 @@
0E47..0E4E ; Extend # Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN
0EB1 ; Extend # Mn LAO VOWEL SIGN MAI KAN
0EB4..0EBC ; Extend # Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO
-0EC8..0ECD ; Extend # Mn [6] LAO TONE MAI EK..LAO NIGGAHITA
+0EC8..0ECE ; Extend # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN
0F18..0F19 ; Extend # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
0F35 ; Extend # Mn TIBETAN MARK NGAS BZUNG NYI ZLA
0F37 ; Extend # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS
@@ -371,6 +372,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
10AE5..10AE6 ; Extend # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
10D24..10D27 ; Extend # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
10EAB..10EAC ; Extend # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
+10EFD..10EFF ; Extend # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
10F46..10F50 ; Extend # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
10F82..10F85 ; Extend # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
11000 ; Extend # Mc BRAHMI SIGN CANDRABINDU
@@ -407,6 +409,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
11235 ; Extend # Mc KHOJKI SIGN VIRAMA
11236..11237 ; Extend # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
1123E ; Extend # Mn KHOJKI SIGN SUKUN
+11241 ; Extend # Mn KHOJKI VOWEL SIGN VOCALIC R
112DF ; Extend # Mn KHUDAWADI SIGN ANUSVARA
112E0..112E2 ; Extend # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II
112E3..112EA ; Extend # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
@@ -516,6 +519,16 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
11D97 ; Extend # Mn GUNJALA GONDI VIRAMA
11EF3..11EF4 ; Extend # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
11EF5..11EF6 ; Extend # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
+11F00..11F01 ; Extend # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
+11F03 ; Extend # Mc KAWI SIGN VISARGA
+11F34..11F35 ; Extend # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
+11F36..11F3A ; Extend # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
+11F3E..11F3F ; Extend # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
+11F40 ; Extend # Mn KAWI VOWEL SIGN EU
+11F41 ; Extend # Mc KAWI SIGN KILLER
+11F42 ; Extend # Mn KAWI CONJOINER
+13440 ; Extend # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
+13447..13455 ; Extend # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
16AF0..16AF4 ; Extend # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
16B30..16B36 ; Extend # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
16F4F ; Extend # Mn MIAO SIGN CONSONANT MODIFIER BAR
@@ -544,15 +557,17 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
1E01B..1E021 ; Extend # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
1E023..1E024 ; Extend # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
1E026..1E02A ; Extend # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
+1E08F ; Extend # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
1E130..1E136 ; Extend # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
1E2AE ; Extend # Mn TOTO SIGN RISING TONE
1E2EC..1E2EF ; Extend # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
+1E4EC..1E4EF ; Extend # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
1E8D0..1E8D6 ; Extend # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
1E944..1E94A ; Extend # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
E0020..E007F ; Extend # Cf [96] TAG SPACE..CANCEL TAG
E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 2508
+# Total code points: 2550
# ================================================
@@ -581,12 +596,12 @@ FEFF ; Format # Cf ZERO WIDTH NO-BREAK SPACE
FFF9..FFFB ; Format # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR
110BD ; Format # Cf KAITHI NUMBER SIGN
110CD ; Format # Cf KAITHI NUMBER SIGN ABOVE
-13430..13438 ; Format # Cf [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT
+13430..1343F ; Format # Cf [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
1BCA0..1BCA3 ; Format # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
1D173..1D17A ; Format # Cf [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE
E0001 ; Format # Cf LANGUAGE TAG
-# Total code points: 65
+# Total code points: 72
# ================================================
@@ -880,6 +895,7 @@ E0001 ; Format # Cf LANGUAGE TAG
052D ; Lower # L& CYRILLIC SMALL LETTER DCHE
052F ; Lower # L& CYRILLIC SMALL LETTER EL WITH DESCENDER
0560..0588 ; Lower # L& [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE
+10FC ; Lower # Lm MODIFIER LETTER GEORGIAN NAR
13F8..13FD ; Lower # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV
1C80..1C88 ; Lower # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK
1D00..1D2B ; Lower # L& [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL
@@ -1228,12 +1244,14 @@ A7D3 ; Lower # L& LATIN SMALL LETTER DOUBLE THORN
A7D5 ; Lower # L& LATIN SMALL LETTER DOUBLE WYNN
A7D7 ; Lower # L& LATIN SMALL LETTER MIDDLE SCOTS S
A7D9 ; Lower # L& LATIN SMALL LETTER SIGMOID S
+A7F2..A7F4 ; Lower # Lm [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
A7F6 ; Lower # L& LATIN SMALL LETTER REVERSED HALF H
A7F8..A7F9 ; Lower # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
A7FA ; Lower # L& LATIN LETTER SMALL CAPITAL TURNED M
AB30..AB5A ; Lower # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
AB5C..AB5F ; Lower # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
AB60..AB68 ; Lower # L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE
+AB69 ; Lower # Lm MODIFIER LETTER SMALL TURNED W
AB70..ABBF ; Lower # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA
FB00..FB06 ; Lower # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST
FB13..FB17 ; Lower # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH
@@ -1281,9 +1299,11 @@ FF41..FF5A ; Lower # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN
1D7CB ; Lower # L& MATHEMATICAL BOLD SMALL DIGAMMA
1DF00..1DF09 ; Lower # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
1DF0B..1DF1E ; Lower # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
+1DF25..1DF2A ; Lower # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
+1E030..1E06D ; Lower # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
1E922..1E943 ; Lower # L& [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA
-# Total code points: 2424
+# Total code points: 2497
# ================================================
@@ -2102,7 +2122,6 @@ FF21..FF3A ; Upper # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LAT
1075..1081 ; OLetter # Lo [13] MYANMAR LETTER SHAN KA..MYANMAR LETTER SHAN HA
108E ; OLetter # Lo MYANMAR LETTER RUMAI PALAUNG FA
10D0..10FA ; OLetter # L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN
-10FC ; OLetter # Lm MODIFIER LETTER GEORGIAN NAR
10FD..10FF ; OLetter # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN
1100..1248 ; OLetter # Lo [329] HANGUL CHOSEONG KIYEOK..ETHIOPIC SYLLABLE QWA
124A..124D ; OLetter # Lo [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE
@@ -2215,7 +2234,6 @@ A6E6..A6EF ; OLetter # Nl [10] BAMUM LETTER MO..BAMUM LETTER KOGHOM
A717..A71F ; OLetter # Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK
A788 ; OLetter # Lm MODIFIER LETTER LOW CIRCUMFLEX ACCENT
A78F ; OLetter # Lo LATIN LETTER SINOLOGICAL DOT
-A7F2..A7F4 ; OLetter # Lm [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
A7F7 ; OLetter # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I
A7FB..A801 ; OLetter # Lo [7] LATIN EPIGRAPHIC LETTER REVERSED F..SYLOTI NAGRI LETTER I
A803..A805 ; OLetter # Lo [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O
@@ -2258,7 +2276,6 @@ AB09..AB0E ; OLetter # Lo [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDH
AB11..AB16 ; OLetter # Lo [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO
AB20..AB26 ; OLetter # Lo [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO
AB28..AB2E ; OLetter # Lo [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO
-AB69 ; OLetter # Lm MODIFIER LETTER SMALL TURNED W
ABC0..ABE2 ; OLetter # Lo [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM
AC00..D7A3 ; OLetter # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
D7B0..D7C6 ; OLetter # Lo [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E
@@ -2366,6 +2383,7 @@ FFDA..FFDC ; OLetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
111DC ; OLetter # Lo SHARADA HEADSTROKE
11200..11211 ; OLetter # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA
11213..1122B ; OLetter # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA
+1123F..11240 ; OLetter # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
11280..11286 ; OLetter # Lo [7] MULTANI LETTER A..MULTANI LETTER GA
11288 ; OLetter # Lo MULTANI LETTER GHA
1128A..1128D ; OLetter # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA
@@ -2427,12 +2445,16 @@ FFDA..FFDC ; OLetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
11D6A..11D89 ; OLetter # Lo [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA
11D98 ; OLetter # Lo GUNJALA GONDI OM
11EE0..11EF2 ; OLetter # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA
+11F02 ; OLetter # Lo KAWI SIGN REPHA
+11F04..11F10 ; OLetter # Lo [13] KAWI LETTER A..KAWI LETTER O
+11F12..11F33 ; OLetter # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA
11FB0 ; OLetter # Lo LISU LETTER YHA
12000..12399 ; OLetter # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
12400..1246E ; OLetter # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
12480..12543 ; OLetter # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
12F90..12FF0 ; OLetter # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
-13000..1342E ; OLetter # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
+13000..1342F ; OLetter # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
+13441..13446 ; OLetter # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
14400..14646 ; OLetter # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
16800..16A38 ; OLetter # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
16A40..16A5E ; OLetter # Lo [31] MRO LETTER TA..MRO LETTER TEK
@@ -2454,7 +2476,9 @@ FFDA..FFDC ; OLetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
1AFF5..1AFFB ; OLetter # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
1AFFD..1AFFE ; OLetter # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
1B000..1B122 ; OLetter # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
+1B132 ; OLetter # Lo HIRAGANA LETTER SMALL KO
1B150..1B152 ; OLetter # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
+1B155 ; OLetter # Lo KATAKANA LETTER SMALL KO
1B164..1B167 ; OLetter # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
1B170..1B2FB ; OLetter # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
1BC00..1BC6A ; OLetter # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
@@ -2467,6 +2491,8 @@ FFDA..FFDC ; OLetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
1E14E ; OLetter # Lo NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
1E290..1E2AD ; OLetter # Lo [30] TOTO LETTER PA..TOTO LETTER A
1E2C0..1E2EB ; OLetter # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH
+1E4D0..1E4EA ; OLetter # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
+1E4EB ; OLetter # Lm NAG MUNDARI SIGN OJOD
1E7E0..1E7E6 ; OLetter # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
1E7E8..1E7EB ; OLetter # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
1E7ED..1E7EE ; OLetter # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
@@ -2507,14 +2533,15 @@ FFDA..FFDC ; OLetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
1EEA5..1EEA9 ; OLetter # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
1EEAB..1EEBB ; OLetter # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
20000..2A6DF ; OLetter # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
-2A700..2B738 ; OLetter # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
+2A700..2B739 ; OLetter # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
2B740..2B81D ; OLetter # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2B820..2CEA1 ; OLetter # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
2CEB0..2EBE0 ; OLetter # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
2F800..2FA1D ; OLetter # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
30000..3134A ; OLetter # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
+31350..323AF ; OLetter # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
-# Total code points: 127761
+# Total code points: 132036
# ================================================
@@ -2573,16 +2600,18 @@ FF10..FF19 ; Numeric # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
11C50..11C59 ; Numeric # Nd [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE
11D50..11D59 ; Numeric # Nd [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE
11DA0..11DA9 ; Numeric # Nd [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE
+11F50..11F59 ; Numeric # Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
16A60..16A69 ; Numeric # Nd [10] MRO DIGIT ZERO..MRO DIGIT NINE
16AC0..16AC9 ; Numeric # Nd [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE
16B50..16B59 ; Numeric # Nd [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE
1D7CE..1D7FF ; Numeric # Nd [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE
1E140..1E149 ; Numeric # Nd [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE
1E2F0..1E2F9 ; Numeric # Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
+1E4F0..1E4F9 ; Numeric # Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
1E950..1E959 ; Numeric # Nd [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE
1FBF0..1FBF9 ; Numeric # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
-# Total code points: 662
+# Total code points: 682
# ================================================
@@ -2664,6 +2693,7 @@ FF61 ; STerm # Po HALFWIDTH IDEOGRAPHIC FULL STOP
11A9B..11A9C ; STerm # Po [2] SOYOMBO MARK SHAD..SOYOMBO MARK DOUBLE SHAD
11C41..11C42 ; STerm # Po [2] BHAIKSUKI DANDA..BHAIKSUKI DOUBLE DANDA
11EF7..11EF8 ; STerm # Po [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION
+11F43..11F44 ; STerm # Po [2] KAWI DANDA..KAWI DOUBLE DANDA
16A6E..16A6F ; STerm # Po [2] MRO DANDA..MRO DOUBLE DANDA
16AF5 ; STerm # Po BASSA VAH FULL STOP
16B37..16B38 ; STerm # Po [2] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS TSHAB CEEB
@@ -2672,7 +2702,7 @@ FF61 ; STerm # Po HALFWIDTH IDEOGRAPHIC FULL STOP
1BC9F ; STerm # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
1DA88 ; STerm # Po SIGNWRITING FULL STOP
-# Total code points: 149
+# Total code points: 151
# ================================================
diff --git a/data/SentenceBreakTest.txt b/data/SentenceBreakTest.txt
@@ -1,11 +1,11 @@
-# SentenceBreakTest-14.0.0.txt
-# Date: 2021-03-08, 06:22:40 GMT
-# © 2021 Unicode®, Inc.
+# SentenceBreakTest-15.0.0.txt
+# Date: 2022-02-26, 00:39:00 GMT
+# © 2022 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
-# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For terms of use, see https://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
-# For documentation, see http://www.unicode.org/reports/tr44/
+# For documentation, see https://www.unicode.org/reports/tr44/
#
# Default Sentence_Break Test
#
diff --git a/data/SpecialCasing.txt b/data/SpecialCasing.txt
@@ -1,11 +1,11 @@
-# SpecialCasing-14.0.0.txt
-# Date: 2021-03-08, 19:35:55 GMT
-# © 2021 Unicode®, Inc.
+# SpecialCasing-15.0.0.txt
+# Date: 2022-02-02, 23:35:52 GMT
+# © 2022 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
-# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For terms of use, see https://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
-# For documentation, see http://www.unicode.org/reports/tr44/
+# For documentation, see https://www.unicode.org/reports/tr44/
#
# Special Casing
#
diff --git a/data/UnicodeData.txt b/data/UnicodeData.txt
@@ -2975,6 +2975,7 @@
0CEF;KANNADA DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
0CF1;KANNADA SIGN JIHVAMULIYA;Lo;0;L;;;;;N;;;;;
0CF2;KANNADA SIGN UPADHMANIYA;Lo;0;L;;;;;N;;;;;
+0CF3;KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT;Mc;0;L;;;;;N;;;;;
0D00;MALAYALAM SIGN COMBINING ANUSVARA ABOVE;Mn;0;NSM;;;;;N;;;;;
0D01;MALAYALAM SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
0D02;MALAYALAM SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;
@@ -3339,6 +3340,7 @@
0ECB;LAO TONE MAI CATAWA;Mn;122;NSM;;;;;N;;;;;
0ECC;LAO CANCELLATION MARK;Mn;0;NSM;;;;;N;;;;;
0ECD;LAO NIGGAHITA;Mn;0;NSM;;;;;N;;;;;
+0ECE;LAO YAMAKKAN;Mn;0;NSM;;;;;N;;;;;
0ED0;LAO DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
0ED1;LAO DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
0ED2;LAO DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
@@ -19393,6 +19395,9 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
10EAD;YEZIDI HYPHENATION MARK;Pd;0;R;;;;;N;;;;;
10EB0;YEZIDI LETTER LAM WITH DOT ABOVE;Lo;0;R;;;;;N;;;;;
10EB1;YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE;Lo;0;R;;;;;N;;;;;
+10EFD;ARABIC SMALL LOW WORD SAKTA;Mn;220;NSM;;;;;N;;;;;
+10EFE;ARABIC SMALL LOW WORD QASR;Mn;220;NSM;;;;;N;;;;;
+10EFF;ARABIC SMALL LOW WORD MADDA;Mn;220;NSM;;;;;N;;;;;
10F00;OLD SOGDIAN LETTER ALEPH;Lo;0;R;;;;;N;;;;;
10F01;OLD SOGDIAN LETTER FINAL ALEPH;Lo;0;R;;;;;N;;;;;
10F02;OLD SOGDIAN LETTER BETH;Lo;0;R;;;;;N;;;;;
@@ -20058,6 +20063,9 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1123C;KHOJKI DOUBLE SECTION MARK;Po;0;L;;;;;N;;;;;
1123D;KHOJKI ABBREVIATION SIGN;Po;0;L;;;;;N;;;;;
1123E;KHOJKI SIGN SUKUN;Mn;0;NSM;;;;;N;;;;;
+1123F;KHOJKI LETTER QA;Lo;0;L;;;;;N;;;;;
+11240;KHOJKI LETTER SHORT I;Lo;0;L;;;;;N;;;;;
+11241;KHOJKI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;
11280;MULTANI LETTER A;Lo;0;L;;;;;N;;;;;
11281;MULTANI LETTER I;Lo;0;L;;;;;N;;;;;
11282;MULTANI LETTER U;Lo;0;L;;;;;N;;;;;
@@ -21256,6 +21264,16 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
11AF6;PAU CIN HAU LOW-FALLING TONE LONG FINAL;Lo;0;L;;;;;N;;;;;
11AF7;PAU CIN HAU LOW-FALLING TONE FINAL;Lo;0;L;;;;;N;;;;;
11AF8;PAU CIN HAU GLOTTAL STOP FINAL;Lo;0;L;;;;;N;;;;;
+11B00;DEVANAGARI HEAD MARK;Po;0;L;;;;;N;;;;;
+11B01;DEVANAGARI HEAD MARK WITH HEADSTROKE;Po;0;L;;;;;N;;;;;
+11B02;DEVANAGARI SIGN BHALE;Po;0;L;;;;;N;;;;;
+11B03;DEVANAGARI SIGN BHALE WITH HOOK;Po;0;L;;;;;N;;;;;
+11B04;DEVANAGARI SIGN EXTENDED BHALE;Po;0;L;;;;;N;;;;;
+11B05;DEVANAGARI SIGN EXTENDED BHALE WITH HOOK;Po;0;L;;;;;N;;;;;
+11B06;DEVANAGARI SIGN WESTERN FIVE-LIKE BHALE;Po;0;L;;;;;N;;;;;
+11B07;DEVANAGARI SIGN WESTERN NINE-LIKE BHALE;Po;0;L;;;;;N;;;;;
+11B08;DEVANAGARI SIGN REVERSED NINE-LIKE BHALE;Po;0;L;;;;;N;;;;;
+11B09;DEVANAGARI SIGN MINDU;Po;0;L;;;;;N;;;;;
11C00;BHAIKSUKI LETTER A;Lo;0;L;;;;;N;;;;;
11C01;BHAIKSUKI LETTER AA;Lo;0;L;;;;;N;;;;;
11C02;BHAIKSUKI LETTER I;Lo;0;L;;;;;N;;;;;
@@ -21584,6 +21602,92 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
11EF6;MAKASAR VOWEL SIGN O;Mc;0;L;;;;;N;;;;;
11EF7;MAKASAR PASSIMBANG;Po;0;L;;;;;N;;;;;
11EF8;MAKASAR END OF SECTION;Po;0;L;;;;;N;;;;;
+11F00;KAWI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
+11F01;KAWI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;
+11F02;KAWI SIGN REPHA;Lo;0;L;;;;;N;;;;;
+11F03;KAWI SIGN VISARGA;Mc;0;L;;;;;N;;;;;
+11F04;KAWI LETTER A;Lo;0;L;;;;;N;;;;;
+11F05;KAWI LETTER AA;Lo;0;L;;;;;N;;;;;
+11F06;KAWI LETTER I;Lo;0;L;;;;;N;;;;;
+11F07;KAWI LETTER II;Lo;0;L;;;;;N;;;;;
+11F08;KAWI LETTER U;Lo;0;L;;;;;N;;;;;
+11F09;KAWI LETTER UU;Lo;0;L;;;;;N;;;;;
+11F0A;KAWI LETTER VOCALIC R;Lo;0;L;;;;;N;;;;;
+11F0B;KAWI LETTER VOCALIC RR;Lo;0;L;;;;;N;;;;;
+11F0C;KAWI LETTER VOCALIC L;Lo;0;L;;;;;N;;;;;
+11F0D;KAWI LETTER VOCALIC LL;Lo;0;L;;;;;N;;;;;
+11F0E;KAWI LETTER E;Lo;0;L;;;;;N;;;;;
+11F0F;KAWI LETTER AI;Lo;0;L;;;;;N;;;;;
+11F10;KAWI LETTER O;Lo;0;L;;;;;N;;;;;
+11F12;KAWI LETTER KA;Lo;0;L;;;;;N;;;;;
+11F13;KAWI LETTER KHA;Lo;0;L;;;;;N;;;;;
+11F14;KAWI LETTER GA;Lo;0;L;;;;;N;;;;;
+11F15;KAWI LETTER GHA;Lo;0;L;;;;;N;;;;;
+11F16;KAWI LETTER NGA;Lo;0;L;;;;;N;;;;;
+11F17;KAWI LETTER CA;Lo;0;L;;;;;N;;;;;
+11F18;KAWI LETTER CHA;Lo;0;L;;;;;N;;;;;
+11F19;KAWI LETTER JA;Lo;0;L;;;;;N;;;;;
+11F1A;KAWI LETTER JHA;Lo;0;L;;;;;N;;;;;
+11F1B;KAWI LETTER NYA;Lo;0;L;;;;;N;;;;;
+11F1C;KAWI LETTER TTA;Lo;0;L;;;;;N;;;;;
+11F1D;KAWI LETTER TTHA;Lo;0;L;;;;;N;;;;;
+11F1E;KAWI LETTER DDA;Lo;0;L;;;;;N;;;;;
+11F1F;KAWI LETTER DDHA;Lo;0;L;;;;;N;;;;;
+11F20;KAWI LETTER NNA;Lo;0;L;;;;;N;;;;;
+11F21;KAWI LETTER TA;Lo;0;L;;;;;N;;;;;
+11F22;KAWI LETTER THA;Lo;0;L;;;;;N;;;;;
+11F23;KAWI LETTER DA;Lo;0;L;;;;;N;;;;;
+11F24;KAWI LETTER DHA;Lo;0;L;;;;;N;;;;;
+11F25;KAWI LETTER NA;Lo;0;L;;;;;N;;;;;
+11F26;KAWI LETTER PA;Lo;0;L;;;;;N;;;;;
+11F27;KAWI LETTER PHA;Lo;0;L;;;;;N;;;;;
+11F28;KAWI LETTER BA;Lo;0;L;;;;;N;;;;;
+11F29;KAWI LETTER BHA;Lo;0;L;;;;;N;;;;;
+11F2A;KAWI LETTER MA;Lo;0;L;;;;;N;;;;;
+11F2B;KAWI LETTER YA;Lo;0;L;;;;;N;;;;;
+11F2C;KAWI LETTER RA;Lo;0;L;;;;;N;;;;;
+11F2D;KAWI LETTER LA;Lo;0;L;;;;;N;;;;;
+11F2E;KAWI LETTER WA;Lo;0;L;;;;;N;;;;;
+11F2F;KAWI LETTER SHA;Lo;0;L;;;;;N;;;;;
+11F30;KAWI LETTER SSA;Lo;0;L;;;;;N;;;;;
+11F31;KAWI LETTER SA;Lo;0;L;;;;;N;;;;;
+11F32;KAWI LETTER HA;Lo;0;L;;;;;N;;;;;
+11F33;KAWI LETTER JNYA;Lo;0;L;;;;;N;;;;;
+11F34;KAWI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
+11F35;KAWI VOWEL SIGN ALTERNATE AA;Mc;0;L;;;;;N;;;;;
+11F36;KAWI VOWEL SIGN I;Mn;0;NSM;;;;;N;;;;;
+11F37;KAWI VOWEL SIGN II;Mn;0;NSM;;;;;N;;;;;
+11F38;KAWI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;
+11F39;KAWI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;
+11F3A;KAWI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;
+11F3E;KAWI VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
+11F3F;KAWI VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;
+11F40;KAWI VOWEL SIGN EU;Mn;0;NSM;;;;;N;;;;;
+11F41;KAWI SIGN KILLER;Mc;9;L;;;;;N;;;;;
+11F42;KAWI CONJOINER;Mn;9;NSM;;;;;N;;;;;
+11F43;KAWI DANDA;Po;0;L;;;;;N;;;;;
+11F44;KAWI DOUBLE DANDA;Po;0;L;;;;;N;;;;;
+11F45;KAWI PUNCTUATION SECTION MARKER;Po;0;L;;;;;N;;;;;
+11F46;KAWI PUNCTUATION ALTERNATE SECTION MARKER;Po;0;L;;;;;N;;;;;
+11F47;KAWI PUNCTUATION FLOWER;Po;0;L;;;;;N;;;;;
+11F48;KAWI PUNCTUATION SPACE FILLER;Po;0;L;;;;;N;;;;;
+11F49;KAWI PUNCTUATION DOT;Po;0;L;;;;;N;;;;;
+11F4A;KAWI PUNCTUATION DOUBLE DOT;Po;0;L;;;;;N;;;;;
+11F4B;KAWI PUNCTUATION TRIPLE DOT;Po;0;L;;;;;N;;;;;
+11F4C;KAWI PUNCTUATION CIRCLE;Po;0;L;;;;;N;;;;;
+11F4D;KAWI PUNCTUATION FILLED CIRCLE;Po;0;L;;;;;N;;;;;
+11F4E;KAWI PUNCTUATION SPIRAL;Po;0;L;;;;;N;;;;;
+11F4F;KAWI PUNCTUATION CLOSING SPIRAL;Po;0;L;;;;;N;;;;;
+11F50;KAWI DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
+11F51;KAWI DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
+11F52;KAWI DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
+11F53;KAWI DIGIT THREE;Nd;0;L;;3;3;3;N;;;;;
+11F54;KAWI DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;;
+11F55;KAWI DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;;
+11F56;KAWI DIGIT SIX;Nd;0;L;;6;6;6;N;;;;;
+11F57;KAWI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;;
+11F58;KAWI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
+11F59;KAWI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
11FB0;LISU LETTER YHA;Lo;0;L;;;;;N;;;;;
11FC0;TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH;No;0;L;;;;1/320;N;;;;;
11FC1;TAMIL FRACTION ONE ONE-HUNDRED-AND-SIXTIETH;No;0;L;;;;1/160;N;;;;;
@@ -24040,6 +24144,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1342C;EGYPTIAN HIEROGLYPH AA030;Lo;0;L;;;;;N;;;;;
1342D;EGYPTIAN HIEROGLYPH AA031;Lo;0;L;;;;;N;;;;;
1342E;EGYPTIAN HIEROGLYPH AA032;Lo;0;L;;;;;N;;;;;
+1342F;EGYPTIAN HIEROGLYPH V011D;Lo;0;L;;;;;N;;;;;
13430;EGYPTIAN HIEROGLYPH VERTICAL JOINER;Cf;0;L;;;;;N;;;;;
13431;EGYPTIAN HIEROGLYPH HORIZONTAL JOINER;Cf;0;L;;;;;N;;;;;
13432;EGYPTIAN HIEROGLYPH INSERT AT TOP START;Cf;0;L;;;;;N;;;;;
@@ -24049,6 +24154,35 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
13436;EGYPTIAN HIEROGLYPH OVERLAY MIDDLE;Cf;0;L;;;;;N;;;;;
13437;EGYPTIAN HIEROGLYPH BEGIN SEGMENT;Cf;0;L;;;;;N;;;;;
13438;EGYPTIAN HIEROGLYPH END SEGMENT;Cf;0;L;;;;;N;;;;;
+13439;EGYPTIAN HIEROGLYPH INSERT AT MIDDLE;Cf;0;L;;;;;N;;;;;
+1343A;EGYPTIAN HIEROGLYPH INSERT AT TOP;Cf;0;L;;;;;N;;;;;
+1343B;EGYPTIAN HIEROGLYPH INSERT AT BOTTOM;Cf;0;L;;;;;N;;;;;
+1343C;EGYPTIAN HIEROGLYPH BEGIN ENCLOSURE;Cf;0;L;;;;;N;;;;;
+1343D;EGYPTIAN HIEROGLYPH END ENCLOSURE;Cf;0;L;;;;;N;;;;;
+1343E;EGYPTIAN HIEROGLYPH BEGIN WALLED ENCLOSURE;Cf;0;L;;;;;N;;;;;
+1343F;EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE;Cf;0;L;;;;;N;;;;;
+13440;EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY;Mn;0;NSM;;;;;N;;;;;
+13441;EGYPTIAN HIEROGLYPH FULL BLANK;Lo;0;L;;;;;N;;;;;
+13442;EGYPTIAN HIEROGLYPH HALF BLANK;Lo;0;L;;;;;N;;;;;
+13443;EGYPTIAN HIEROGLYPH LOST SIGN;Lo;0;L;;;;;N;;;;;
+13444;EGYPTIAN HIEROGLYPH HALF LOST SIGN;Lo;0;L;;;;;N;;;;;
+13445;EGYPTIAN HIEROGLYPH TALL LOST SIGN;Lo;0;L;;;;;N;;;;;
+13446;EGYPTIAN HIEROGLYPH WIDE LOST SIGN;Lo;0;L;;;;;N;;;;;
+13447;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START;Mn;0;NSM;;;;;N;;;;;
+13448;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT BOTTOM START;Mn;0;NSM;;;;;N;;;;;
+13449;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT START;Mn;0;NSM;;;;;N;;;;;
+1344A;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP END;Mn;0;NSM;;;;;N;;;;;
+1344B;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP;Mn;0;NSM;;;;;N;;;;;
+1344C;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT BOTTOM START AND TOP END;Mn;0;NSM;;;;;N;;;;;
+1344D;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT START AND TOP;Mn;0;NSM;;;;;N;;;;;
+1344E;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT BOTTOM END;Mn;0;NSM;;;;;N;;;;;
+1344F;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START AND BOTTOM END;Mn;0;NSM;;;;;N;;;;;
+13450;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT BOTTOM;Mn;0;NSM;;;;;N;;;;;
+13451;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT START AND BOTTOM;Mn;0;NSM;;;;;N;;;;;
+13452;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT END;Mn;0;NSM;;;;;N;;;;;
+13453;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP AND END;Mn;0;NSM;;;;;N;;;;;
+13454;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT BOTTOM AND END;Mn;0;NSM;;;;;N;;;;;
+13455;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED;Mn;0;NSM;;;;;N;;;;;
14400;ANATOLIAN HIEROGLYPH A001;Lo;0;L;;;;;N;;;;;
14401;ANATOLIAN HIEROGLYPH A002;Lo;0;L;;;;;N;;;;;
14402;ANATOLIAN HIEROGLYPH A003;Lo;0;L;;;;;N;;;;;
@@ -27289,9 +27423,11 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1B120;KATAKANA LETTER ARCHAIC YI;Lo;0;L;;;;;N;;;;;
1B121;KATAKANA LETTER ARCHAIC YE;Lo;0;L;;;;;N;;;;;
1B122;KATAKANA LETTER ARCHAIC WU;Lo;0;L;;;;;N;;;;;
+1B132;HIRAGANA LETTER SMALL KO;Lo;0;L;;;;;N;;;;;
1B150;HIRAGANA LETTER SMALL WI;Lo;0;L;;;;;N;;;;;
1B151;HIRAGANA LETTER SMALL WE;Lo;0;L;;;;;N;;;;;
1B152;HIRAGANA LETTER SMALL WO;Lo;0;L;;;;;N;;;;;
+1B155;KATAKANA LETTER SMALL KO;Lo;0;L;;;;;N;;;;;
1B164;KATAKANA LETTER SMALL WI;Lo;0;L;;;;;N;;;;;
1B165;KATAKANA LETTER SMALL WE;Lo;0;L;;;;;N;;;;;
1B166;KATAKANA LETTER SMALL WO;Lo;0;L;;;;;N;;;;;
@@ -28573,6 +28709,26 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1D243;COMBINING GREEK MUSICAL TETRASEME;Mn;230;NSM;;;;;N;;;;;
1D244;COMBINING GREEK MUSICAL PENTASEME;Mn;230;NSM;;;;;N;;;;;
1D245;GREEK MUSICAL LEIMMA;So;0;ON;;;;;N;;;;;
+1D2C0;KAKTOVIK NUMERAL ZERO;No;0;L;;;;0;N;;;;;
+1D2C1;KAKTOVIK NUMERAL ONE;No;0;L;;;;1;N;;;;;
+1D2C2;KAKTOVIK NUMERAL TWO;No;0;L;;;;2;N;;;;;
+1D2C3;KAKTOVIK NUMERAL THREE;No;0;L;;;;3;N;;;;;
+1D2C4;KAKTOVIK NUMERAL FOUR;No;0;L;;;;4;N;;;;;
+1D2C5;KAKTOVIK NUMERAL FIVE;No;0;L;;;;5;N;;;;;
+1D2C6;KAKTOVIK NUMERAL SIX;No;0;L;;;;6;N;;;;;
+1D2C7;KAKTOVIK NUMERAL SEVEN;No;0;L;;;;7;N;;;;;
+1D2C8;KAKTOVIK NUMERAL EIGHT;No;0;L;;;;8;N;;;;;
+1D2C9;KAKTOVIK NUMERAL NINE;No;0;L;;;;9;N;;;;;
+1D2CA;KAKTOVIK NUMERAL TEN;No;0;L;;;;10;N;;;;;
+1D2CB;KAKTOVIK NUMERAL ELEVEN;No;0;L;;;;11;N;;;;;
+1D2CC;KAKTOVIK NUMERAL TWELVE;No;0;L;;;;12;N;;;;;
+1D2CD;KAKTOVIK NUMERAL THIRTEEN;No;0;L;;;;13;N;;;;;
+1D2CE;KAKTOVIK NUMERAL FOURTEEN;No;0;L;;;;14;N;;;;;
+1D2CF;KAKTOVIK NUMERAL FIFTEEN;No;0;L;;;;15;N;;;;;
+1D2D0;KAKTOVIK NUMERAL SIXTEEN;No;0;L;;;;16;N;;;;;
+1D2D1;KAKTOVIK NUMERAL SEVENTEEN;No;0;L;;;;17;N;;;;;
+1D2D2;KAKTOVIK NUMERAL EIGHTEEN;No;0;L;;;;18;N;;;;;
+1D2D3;KAKTOVIK NUMERAL NINETEEN;No;0;L;;;;19;N;;;;;
1D2E0;MAYAN NUMERAL ZERO;No;0;L;;;;0;N;;;;;
1D2E1;MAYAN NUMERAL ONE;No;0;L;;;;1;N;;;;;
1D2E2;MAYAN NUMERAL TWO;No;0;L;;;;2;N;;;;;
@@ -30404,6 +30560,12 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1DF1C;LATIN SMALL LETTER TESH DIGRAPH WITH RETROFLEX HOOK;Ll;0;L;;;;;N;;;;;
1DF1D;LATIN SMALL LETTER C WITH RETROFLEX HOOK;Ll;0;L;;;;;N;;;;;
1DF1E;LATIN SMALL LETTER S WITH CURL;Ll;0;L;;;;;N;;;;;
+1DF25;LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK;Ll;0;L;;;;;N;;;;;
+1DF26;LATIN SMALL LETTER L WITH MID-HEIGHT LEFT HOOK;Ll;0;L;;;;;N;;;;;
+1DF27;LATIN SMALL LETTER N WITH MID-HEIGHT LEFT HOOK;Ll;0;L;;;;;N;;;;;
+1DF28;LATIN SMALL LETTER R WITH MID-HEIGHT LEFT HOOK;Ll;0;L;;;;;N;;;;;
+1DF29;LATIN SMALL LETTER S WITH MID-HEIGHT LEFT HOOK;Ll;0;L;;;;;N;;;;;
+1DF2A;LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK;Ll;0;L;;;;;N;;;;;
1E000;COMBINING GLAGOLITIC LETTER AZU;Mn;230;NSM;;;;;N;;;;;
1E001;COMBINING GLAGOLITIC LETTER BUKY;Mn;230;NSM;;;;;N;;;;;
1E002;COMBINING GLAGOLITIC LETTER VEDE;Mn;230;NSM;;;;;N;;;;;
@@ -30442,6 +30604,69 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1E028;COMBINING GLAGOLITIC LETTER BIG YUS;Mn;230;NSM;;;;;N;;;;;
1E029;COMBINING GLAGOLITIC LETTER IOTATED BIG YUS;Mn;230;NSM;;;;;N;;;;;
1E02A;COMBINING GLAGOLITIC LETTER FITA;Mn;230;NSM;;;;;N;;;;;
+1E030;MODIFIER LETTER CYRILLIC SMALL A;Lm;0;L;<super> 0430;;;;N;;;;;
+1E031;MODIFIER LETTER CYRILLIC SMALL BE;Lm;0;L;<super> 0431;;;;N;;;;;
+1E032;MODIFIER LETTER CYRILLIC SMALL VE;Lm;0;L;<super> 0432;;;;N;;;;;
+1E033;MODIFIER LETTER CYRILLIC SMALL GHE;Lm;0;L;<super> 0433;;;;N;;;;;
+1E034;MODIFIER LETTER CYRILLIC SMALL DE;Lm;0;L;<super> 0434;;;;N;;;;;
+1E035;MODIFIER LETTER CYRILLIC SMALL IE;Lm;0;L;<super> 0435;;;;N;;;;;
+1E036;MODIFIER LETTER CYRILLIC SMALL ZHE;Lm;0;L;<super> 0436;;;;N;;;;;
+1E037;MODIFIER LETTER CYRILLIC SMALL ZE;Lm;0;L;<super> 0437;;;;N;;;;;
+1E038;MODIFIER LETTER CYRILLIC SMALL I;Lm;0;L;<super> 0438;;;;N;;;;;
+1E039;MODIFIER LETTER CYRILLIC SMALL KA;Lm;0;L;<super> 043A;;;;N;;;;;
+1E03A;MODIFIER LETTER CYRILLIC SMALL EL;Lm;0;L;<super> 043B;;;;N;;;;;
+1E03B;MODIFIER LETTER CYRILLIC SMALL EM;Lm;0;L;<super> 043C;;;;N;;;;;
+1E03C;MODIFIER LETTER CYRILLIC SMALL O;Lm;0;L;<super> 043E;;;;N;;;;;
+1E03D;MODIFIER LETTER CYRILLIC SMALL PE;Lm;0;L;<super> 043F;;;;N;;;;;
+1E03E;MODIFIER LETTER CYRILLIC SMALL ER;Lm;0;L;<super> 0440;;;;N;;;;;
+1E03F;MODIFIER LETTER CYRILLIC SMALL ES;Lm;0;L;<super> 0441;;;;N;;;;;
+1E040;MODIFIER LETTER CYRILLIC SMALL TE;Lm;0;L;<super> 0442;;;;N;;;;;
+1E041;MODIFIER LETTER CYRILLIC SMALL U;Lm;0;L;<super> 0443;;;;N;;;;;
+1E042;MODIFIER LETTER CYRILLIC SMALL EF;Lm;0;L;<super> 0444;;;;N;;;;;
+1E043;MODIFIER LETTER CYRILLIC SMALL HA;Lm;0;L;<super> 0445;;;;N;;;;;
+1E044;MODIFIER LETTER CYRILLIC SMALL TSE;Lm;0;L;<super> 0446;;;;N;;;;;
+1E045;MODIFIER LETTER CYRILLIC SMALL CHE;Lm;0;L;<super> 0447;;;;N;;;;;
+1E046;MODIFIER LETTER CYRILLIC SMALL SHA;Lm;0;L;<super> 0448;;;;N;;;;;
+1E047;MODIFIER LETTER CYRILLIC SMALL YERU;Lm;0;L;<super> 044B;;;;N;;;;;
+1E048;MODIFIER LETTER CYRILLIC SMALL E;Lm;0;L;<super> 044D;;;;N;;;;;
+1E049;MODIFIER LETTER CYRILLIC SMALL YU;Lm;0;L;<super> 044E;;;;N;;;;;
+1E04A;MODIFIER LETTER CYRILLIC SMALL DZZE;Lm;0;L;<super> A689;;;;N;;;;;
+1E04B;MODIFIER LETTER CYRILLIC SMALL SCHWA;Lm;0;L;<super> 04D9;;;;N;;;;;
+1E04C;MODIFIER LETTER CYRILLIC SMALL BYELORUSSIAN-UKRAINIAN I;Lm;0;L;<super> 0456;;;;N;;;;;
+1E04D;MODIFIER LETTER CYRILLIC SMALL JE;Lm;0;L;<super> 0458;;;;N;;;;;
+1E04E;MODIFIER LETTER CYRILLIC SMALL BARRED O;Lm;0;L;<super> 04E9;;;;N;;;;;
+1E04F;MODIFIER LETTER CYRILLIC SMALL STRAIGHT U;Lm;0;L;<super> 04AF;;;;N;;;;;
+1E050;MODIFIER LETTER CYRILLIC SMALL PALOCHKA;Lm;0;L;<super> 04CF;;;;N;;;;;
+1E051;CYRILLIC SUBSCRIPT SMALL LETTER A;Lm;0;L;<sub> 0430;;;;N;;;;;
+1E052;CYRILLIC SUBSCRIPT SMALL LETTER BE;Lm;0;L;<sub> 0431;;;;N;;;;;
+1E053;CYRILLIC SUBSCRIPT SMALL LETTER VE;Lm;0;L;<sub> 0432;;;;N;;;;;
+1E054;CYRILLIC SUBSCRIPT SMALL LETTER GHE;Lm;0;L;<sub> 0433;;;;N;;;;;
+1E055;CYRILLIC SUBSCRIPT SMALL LETTER DE;Lm;0;L;<sub> 0434;;;;N;;;;;
+1E056;CYRILLIC SUBSCRIPT SMALL LETTER IE;Lm;0;L;<sub> 0435;;;;N;;;;;
+1E057;CYRILLIC SUBSCRIPT SMALL LETTER ZHE;Lm;0;L;<sub> 0436;;;;N;;;;;
+1E058;CYRILLIC SUBSCRIPT SMALL LETTER ZE;Lm;0;L;<sub> 0437;;;;N;;;;;
+1E059;CYRILLIC SUBSCRIPT SMALL LETTER I;Lm;0;L;<sub> 0438;;;;N;;;;;
+1E05A;CYRILLIC SUBSCRIPT SMALL LETTER KA;Lm;0;L;<sub> 043A;;;;N;;;;;
+1E05B;CYRILLIC SUBSCRIPT SMALL LETTER EL;Lm;0;L;<sub> 043B;;;;N;;;;;
+1E05C;CYRILLIC SUBSCRIPT SMALL LETTER O;Lm;0;L;<sub> 043E;;;;N;;;;;
+1E05D;CYRILLIC SUBSCRIPT SMALL LETTER PE;Lm;0;L;<sub> 043F;;;;N;;;;;
+1E05E;CYRILLIC SUBSCRIPT SMALL LETTER ES;Lm;0;L;<sub> 0441;;;;N;;;;;
+1E05F;CYRILLIC SUBSCRIPT SMALL LETTER U;Lm;0;L;<sub> 0443;;;;N;;;;;
+1E060;CYRILLIC SUBSCRIPT SMALL LETTER EF;Lm;0;L;<sub> 0444;;;;N;;;;;
+1E061;CYRILLIC SUBSCRIPT SMALL LETTER HA;Lm;0;L;<sub> 0445;;;;N;;;;;
+1E062;CYRILLIC SUBSCRIPT SMALL LETTER TSE;Lm;0;L;<sub> 0446;;;;N;;;;;
+1E063;CYRILLIC SUBSCRIPT SMALL LETTER CHE;Lm;0;L;<sub> 0447;;;;N;;;;;
+1E064;CYRILLIC SUBSCRIPT SMALL LETTER SHA;Lm;0;L;<sub> 0448;;;;N;;;;;
+1E065;CYRILLIC SUBSCRIPT SMALL LETTER HARD SIGN;Lm;0;L;<sub> 044A;;;;N;;;;;
+1E066;CYRILLIC SUBSCRIPT SMALL LETTER YERU;Lm;0;L;<sub> 044B;;;;N;;;;;
+1E067;CYRILLIC SUBSCRIPT SMALL LETTER GHE WITH UPTURN;Lm;0;L;<sub> 0491;;;;N;;;;;
+1E068;CYRILLIC SUBSCRIPT SMALL LETTER BYELORUSSIAN-UKRAINIAN I;Lm;0;L;<sub> 0456;;;;N;;;;;
+1E069;CYRILLIC SUBSCRIPT SMALL LETTER DZE;Lm;0;L;<sub> 0455;;;;N;;;;;
+1E06A;CYRILLIC SUBSCRIPT SMALL LETTER DZHE;Lm;0;L;<sub> 045F;;;;N;;;;;
+1E06B;MODIFIER LETTER CYRILLIC SMALL ES WITH DESCENDER;Lm;0;L;<super> 04AB;;;;N;;;;;
+1E06C;MODIFIER LETTER CYRILLIC SMALL YERU WITH BACK YER;Lm;0;L;<super> A651;;;;N;;;;;
+1E06D;MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE;Lm;0;L;<super> 04B1;;;;N;;;;;
+1E08F;COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I;Mn;230;NSM;;;;;N;;;;;
1E100;NYIAKENG PUACHUE HMONG LETTER MA;Lo;0;L;;;;;N;;;;;
1E101;NYIAKENG PUACHUE HMONG LETTER TSA;Lo;0;L;;;;;N;;;;;
1E102;NYIAKENG PUACHUE HMONG LETTER NTA;Lo;0;L;;;;;N;;;;;
@@ -30603,6 +30828,48 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1E2F8;WANCHO DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
1E2F9;WANCHO DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
1E2FF;WANCHO NGUN SIGN;Sc;0;ET;;;;;N;;;;;
+1E4D0;NAG MUNDARI LETTER O;Lo;0;L;;;;;N;;;;;
+1E4D1;NAG MUNDARI LETTER OP;Lo;0;L;;;;;N;;;;;
+1E4D2;NAG MUNDARI LETTER OL;Lo;0;L;;;;;N;;;;;
+1E4D3;NAG MUNDARI LETTER OY;Lo;0;L;;;;;N;;;;;
+1E4D4;NAG MUNDARI LETTER ONG;Lo;0;L;;;;;N;;;;;
+1E4D5;NAG MUNDARI LETTER A;Lo;0;L;;;;;N;;;;;
+1E4D6;NAG MUNDARI LETTER AJ;Lo;0;L;;;;;N;;;;;
+1E4D7;NAG MUNDARI LETTER AB;Lo;0;L;;;;;N;;;;;
+1E4D8;NAG MUNDARI LETTER ANY;Lo;0;L;;;;;N;;;;;
+1E4D9;NAG MUNDARI LETTER AH;Lo;0;L;;;;;N;;;;;
+1E4DA;NAG MUNDARI LETTER I;Lo;0;L;;;;;N;;;;;
+1E4DB;NAG MUNDARI LETTER IS;Lo;0;L;;;;;N;;;;;
+1E4DC;NAG MUNDARI LETTER IDD;Lo;0;L;;;;;N;;;;;
+1E4DD;NAG MUNDARI LETTER IT;Lo;0;L;;;;;N;;;;;
+1E4DE;NAG MUNDARI LETTER IH;Lo;0;L;;;;;N;;;;;
+1E4DF;NAG MUNDARI LETTER U;Lo;0;L;;;;;N;;;;;
+1E4E0;NAG MUNDARI LETTER UC;Lo;0;L;;;;;N;;;;;
+1E4E1;NAG MUNDARI LETTER UD;Lo;0;L;;;;;N;;;;;
+1E4E2;NAG MUNDARI LETTER UK;Lo;0;L;;;;;N;;;;;
+1E4E3;NAG MUNDARI LETTER UR;Lo;0;L;;;;;N;;;;;
+1E4E4;NAG MUNDARI LETTER E;Lo;0;L;;;;;N;;;;;
+1E4E5;NAG MUNDARI LETTER ENN;Lo;0;L;;;;;N;;;;;
+1E4E6;NAG MUNDARI LETTER EG;Lo;0;L;;;;;N;;;;;
+1E4E7;NAG MUNDARI LETTER EM;Lo;0;L;;;;;N;;;;;
+1E4E8;NAG MUNDARI LETTER EN;Lo;0;L;;;;;N;;;;;
+1E4E9;NAG MUNDARI LETTER ETT;Lo;0;L;;;;;N;;;;;
+1E4EA;NAG MUNDARI LETTER ELL;Lo;0;L;;;;;N;;;;;
+1E4EB;NAG MUNDARI SIGN OJOD;Lm;0;L;;;;;N;;;;;
+1E4EC;NAG MUNDARI SIGN MUHOR;Mn;232;NSM;;;;;N;;;;;
+1E4ED;NAG MUNDARI SIGN TOYOR;Mn;232;NSM;;;;;N;;;;;
+1E4EE;NAG MUNDARI SIGN IKIR;Mn;220;NSM;;;;;N;;;;;
+1E4EF;NAG MUNDARI SIGN SUTUH;Mn;230;NSM;;;;;N;;;;;
+1E4F0;NAG MUNDARI DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
+1E4F1;NAG MUNDARI DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
+1E4F2;NAG MUNDARI DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
+1E4F3;NAG MUNDARI DIGIT THREE;Nd;0;L;;3;3;3;N;;;;;
+1E4F4;NAG MUNDARI DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;;
+1E4F5;NAG MUNDARI DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;;
+1E4F6;NAG MUNDARI DIGIT SIX;Nd;0;L;;6;6;6;N;;;;;
+1E4F7;NAG MUNDARI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;;
+1E4F8;NAG MUNDARI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
+1E4F9;NAG MUNDARI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
1E7E0;ETHIOPIC SYLLABLE HHYA;Lo;0;L;;;;;N;;;;;
1E7E1;ETHIOPIC SYLLABLE HHYU;Lo;0;L;;;;;N;;;;;
1E7E2;ETHIOPIC SYLLABLE HHYI;Lo;0;L;;;;;N;;;;;
@@ -32678,6 +32945,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1F6D5;HINDU TEMPLE;So;0;ON;;;;;N;;;;;
1F6D6;HUT;So;0;ON;;;;;N;;;;;
1F6D7;ELEVATOR;So;0;ON;;;;;N;;;;;
+1F6DC;WIRELESS;So;0;ON;;;;;N;;;;;
1F6DD;PLAYGROUND SLIDE;So;0;ON;;;;;N;;;;;
1F6DE;WHEEL;So;0;ON;;;;;N;;;;;
1F6DF;RING BUOY;So;0;ON;;;;;N;;;;;
@@ -32823,6 +33091,14 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1F771;ALCHEMICAL SYMBOL FOR MONTH;So;0;ON;;;;;N;;;;;
1F772;ALCHEMICAL SYMBOL FOR HALF DRAM;So;0;ON;;;;;N;;;;;
1F773;ALCHEMICAL SYMBOL FOR HALF OUNCE;So;0;ON;;;;;N;;;;;
+1F774;LOT OF FORTUNE;So;0;ON;;;;;N;;;;;
+1F775;OCCULTATION;So;0;ON;;;;;N;;;;;
+1F776;LUNAR ECLIPSE;So;0;ON;;;;;N;;;;;
+1F77B;HAUMEA;So;0;ON;;;;;N;;;;;
+1F77C;MAKEMAKE;So;0;ON;;;;;N;;;;;
+1F77D;GONGGONG;So;0;ON;;;;;N;;;;;
+1F77E;QUAOAR;So;0;ON;;;;;N;;;;;
+1F77F;ORCUS;So;0;ON;;;;;N;;;;;
1F780;BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE;So;0;ON;;;;;N;;;;;
1F781;BLACK UP-POINTING ISOSCELES RIGHT TRIANGLE;So;0;ON;;;;;N;;;;;
1F782;BLACK RIGHT-POINTING ISOSCELES RIGHT TRIANGLE;So;0;ON;;;;;N;;;;;
@@ -32912,6 +33188,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1F7D6;NEGATIVE CIRCLED TRIANGLE;So;0;ON;;;;;N;;;;;
1F7D7;CIRCLED SQUARE;So;0;ON;;;;;N;;;;;
1F7D8;NEGATIVE CIRCLED SQUARE;So;0;ON;;;;;N;;;;;
+1F7D9;NINE POINTED WHITE STAR;So;0;ON;;;;;N;;;;;
1F7E0;LARGE ORANGE CIRCLE;So;0;ON;;;;;N;;;;;
1F7E1;LARGE YELLOW CIRCLE;So;0;ON;;;;;N;;;;;
1F7E2;LARGE GREEN CIRCLE;So;0;ON;;;;;N;;;;;
@@ -33434,6 +33711,9 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1FA72;BRIEFS;So;0;ON;;;;;N;;;;;
1FA73;SHORTS;So;0;ON;;;;;N;;;;;
1FA74;THONG SANDAL;So;0;ON;;;;;N;;;;;
+1FA75;LIGHT BLUE HEART;So;0;ON;;;;;N;;;;;
+1FA76;GREY HEART;So;0;ON;;;;;N;;;;;
+1FA77;PINK HEART;So;0;ON;;;;;N;;;;;
1FA78;DROP OF BLOOD;So;0;ON;;;;;N;;;;;
1FA79;ADHESIVE BANDAGE;So;0;ON;;;;;N;;;;;
1FA7A;STETHOSCOPE;So;0;ON;;;;;N;;;;;
@@ -33446,6 +33726,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1FA84;MAGIC WAND;So;0;ON;;;;;N;;;;;
1FA85;PINATA;So;0;ON;;;;;N;;;;;
1FA86;NESTING DOLLS;So;0;ON;;;;;N;;;;;
+1FA87;MARACAS;So;0;ON;;;;;N;;;;;
+1FA88;FLUTE;So;0;ON;;;;;N;;;;;
1FA90;RINGED PLANET;So;0;ON;;;;;N;;;;;
1FA91;CHAIR;So;0;ON;;;;;N;;;;;
1FA92;RAZOR;So;0;ON;;;;;N;;;;;
@@ -33475,6 +33757,9 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1FAAA;IDENTIFICATION CARD;So;0;ON;;;;;N;;;;;
1FAAB;LOW BATTERY;So;0;ON;;;;;N;;;;;
1FAAC;HAMSA;So;0;ON;;;;;N;;;;;
+1FAAD;FOLDING HAND FAN;So;0;ON;;;;;N;;;;;
+1FAAE;HAIR PICK;So;0;ON;;;;;N;;;;;
+1FAAF;KHANDA;So;0;ON;;;;;N;;;;;
1FAB0;FLY;So;0;ON;;;;;N;;;;;
1FAB1;WORM;So;0;ON;;;;;N;;;;;
1FAB2;BEETLE;So;0;ON;;;;;N;;;;;
@@ -33486,12 +33771,18 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1FAB8;CORAL;So;0;ON;;;;;N;;;;;
1FAB9;EMPTY NEST;So;0;ON;;;;;N;;;;;
1FABA;NEST WITH EGGS;So;0;ON;;;;;N;;;;;
+1FABB;HYACINTH;So;0;ON;;;;;N;;;;;
+1FABC;JELLYFISH;So;0;ON;;;;;N;;;;;
+1FABD;WING;So;0;ON;;;;;N;;;;;
+1FABF;GOOSE;So;0;ON;;;;;N;;;;;
1FAC0;ANATOMICAL HEART;So;0;ON;;;;;N;;;;;
1FAC1;LUNGS;So;0;ON;;;;;N;;;;;
1FAC2;PEOPLE HUGGING;So;0;ON;;;;;N;;;;;
1FAC3;PREGNANT MAN;So;0;ON;;;;;N;;;;;
1FAC4;PREGNANT PERSON;So;0;ON;;;;;N;;;;;
1FAC5;PERSON WITH CROWN;So;0;ON;;;;;N;;;;;
+1FACE;MOOSE;So;0;ON;;;;;N;;;;;
+1FACF;DONKEY;So;0;ON;;;;;N;;;;;
1FAD0;BLUEBERRIES;So;0;ON;;;;;N;;;;;
1FAD1;BELL PEPPER;So;0;ON;;;;;N;;;;;
1FAD2;OLIVE;So;0;ON;;;;;N;;;;;
@@ -33502,6 +33793,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1FAD7;POURING LIQUID;So;0;ON;;;;;N;;;;;
1FAD8;BEANS;So;0;ON;;;;;N;;;;;
1FAD9;JAR;So;0;ON;;;;;N;;;;;
+1FADA;GINGER ROOT;So;0;ON;;;;;N;;;;;
+1FADB;PEA POD;So;0;ON;;;;;N;;;;;
1FAE0;MELTING FACE;So;0;ON;;;;;N;;;;;
1FAE1;SALUTING FACE;So;0;ON;;;;;N;;;;;
1FAE2;FACE WITH OPEN EYES AND HAND OVER MOUTH;So;0;ON;;;;;N;;;;;
@@ -33510,6 +33803,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1FAE5;DOTTED LINE FACE;So;0;ON;;;;;N;;;;;
1FAE6;BITING LIP;So;0;ON;;;;;N;;;;;
1FAE7;BUBBLES;So;0;ON;;;;;N;;;;;
+1FAE8;SHAKING FACE;So;0;ON;;;;;N;;;;;
1FAF0;HAND WITH INDEX FINGER AND THUMB CROSSED;So;0;ON;;;;;N;;;;;
1FAF1;RIGHTWARDS HAND;So;0;ON;;;;;N;;;;;
1FAF2;LEFTWARDS HAND;So;0;ON;;;;;N;;;;;
@@ -33517,6 +33811,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
1FAF4;PALM UP HAND;So;0;ON;;;;;N;;;;;
1FAF5;INDEX POINTING AT THE VIEWER;So;0;ON;;;;;N;;;;;
1FAF6;HEART HANDS;So;0;ON;;;;;N;;;;;
+1FAF7;LEFTWARDS PUSHING HAND;So;0;ON;;;;;N;;;;;
+1FAF8;RIGHTWARDS PUSHING HAND;So;0;ON;;;;;N;;;;;
1FB00;BLOCK SEXTANT-1;So;0;ON;;;;;N;;;;;
1FB01;BLOCK SEXTANT-2;So;0;ON;;;;;N;;;;;
1FB02;BLOCK SEXTANT-12;So;0;ON;;;;;N;;;;;
@@ -33732,7 +34028,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
2A6DF;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
-2B738;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
+2B739;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
2B820;<CJK Ideograph Extension E, First>;Lo;0;L;;;;;N;;;;;
@@ -34283,6 +34579,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
2FA1D;CJK COMPATIBILITY IDEOGRAPH-2FA1D;Lo;0;L;2A600;;;;N;;;;;
30000;<CJK Ideograph Extension G, First>;Lo;0;L;;;;;N;;;;;
3134A;<CJK Ideograph Extension G, Last>;Lo;0;L;;;;;N;;;;;
+31350;<CJK Ideograph Extension H, First>;Lo;0;L;;;;;N;;;;;
+323AF;<CJK Ideograph Extension H, Last>;Lo;0;L;;;;;N;;;;;
E0001;LANGUAGE TAG;Cf;0;BN;;;;;N;;;;;
E0020;TAG SPACE;Cf;0;BN;;;;;N;;;;;
E0021;TAG EXCLAMATION MARK;Cf;0;BN;;;;;N;;;;;
diff --git a/data/WordBreakProperty.txt b/data/WordBreakProperty.txt
@@ -1,11 +1,11 @@
-# WordBreakProperty-14.0.0.txt
-# Date: 2021-07-10, 00:35:32 GMT
-# © 2021 Unicode®, Inc.
+# WordBreakProperty-15.0.0.txt
+# Date: 2022-04-27, 02:41:26 GMT
+# © 2022 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
-# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For terms of use, see https://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
-# For documentation, see http://www.unicode.org/reports/tr44/
+# For documentation, see https://www.unicode.org/reports/tr44/
# ================================================
@@ -180,6 +180,7 @@ FB46..FB4F ; Hebrew_Letter # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW
0CCC..0CCD ; Extend # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA
0CD5..0CD6 ; Extend # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
0CE2..0CE3 ; Extend # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
+0CF3 ; Extend # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
0D00..0D01 ; Extend # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
0D02..0D03 ; Extend # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D3B..0D3C ; Extend # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA
@@ -203,7 +204,7 @@ FB46..FB4F ; Hebrew_Letter # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW
0E47..0E4E ; Extend # Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN
0EB1 ; Extend # Mn LAO VOWEL SIGN MAI KAN
0EB4..0EBC ; Extend # Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO
-0EC8..0ECD ; Extend # Mn [6] LAO TONE MAI EK..LAO NIGGAHITA
+0EC8..0ECE ; Extend # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN
0F18..0F19 ; Extend # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
0F35 ; Extend # Mn TIBETAN MARK NGAS BZUNG NYI ZLA
0F37 ; Extend # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS
@@ -407,6 +408,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
10AE5..10AE6 ; Extend # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
10D24..10D27 ; Extend # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
10EAB..10EAC ; Extend # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
+10EFD..10EFF ; Extend # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
10F46..10F50 ; Extend # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
10F82..10F85 ; Extend # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
11000 ; Extend # Mc BRAHMI SIGN CANDRABINDU
@@ -443,6 +445,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
11235 ; Extend # Mc KHOJKI SIGN VIRAMA
11236..11237 ; Extend # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
1123E ; Extend # Mn KHOJKI SIGN SUKUN
+11241 ; Extend # Mn KHOJKI VOWEL SIGN VOCALIC R
112DF ; Extend # Mn KHUDAWADI SIGN ANUSVARA
112E0..112E2 ; Extend # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II
112E3..112EA ; Extend # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
@@ -552,6 +555,16 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
11D97 ; Extend # Mn GUNJALA GONDI VIRAMA
11EF3..11EF4 ; Extend # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
11EF5..11EF6 ; Extend # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
+11F00..11F01 ; Extend # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
+11F03 ; Extend # Mc KAWI SIGN VISARGA
+11F34..11F35 ; Extend # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
+11F36..11F3A ; Extend # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
+11F3E..11F3F ; Extend # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
+11F40 ; Extend # Mn KAWI VOWEL SIGN EU
+11F41 ; Extend # Mc KAWI SIGN KILLER
+11F42 ; Extend # Mn KAWI CONJOINER
+13440 ; Extend # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
+13447..13455 ; Extend # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
16AF0..16AF4 ; Extend # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
16B30..16B36 ; Extend # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
16F4F ; Extend # Mn MIAO SIGN CONSONANT MODIFIER BAR
@@ -580,16 +593,18 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
1E01B..1E021 ; Extend # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
1E023..1E024 ; Extend # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
1E026..1E02A ; Extend # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
+1E08F ; Extend # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
1E130..1E136 ; Extend # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
1E2AE ; Extend # Mn TOTO SIGN RISING TONE
1E2EC..1E2EF ; Extend # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
+1E4EC..1E4EF ; Extend # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
1E8D0..1E8D6 ; Extend # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
1E944..1E94A ; Extend # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
1F3FB..1F3FF ; Extend # Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
E0020..E007F ; Extend # Cf [96] TAG SPACE..CANCEL TAG
E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 2512
+# Total code points: 2554
# ================================================
@@ -615,12 +630,12 @@ FEFF ; Format # Cf ZERO WIDTH NO-BREAK SPACE
FFF9..FFFB ; Format # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR
110BD ; Format # Cf KAITHI NUMBER SIGN
110CD ; Format # Cf KAITHI NUMBER SIGN ABOVE
-13430..13438 ; Format # Cf [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT
+13430..1343F ; Format # Cf [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
1BCA0..1BCA3 ; Format # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
1D173..1D17A ; Format # Cf [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE
E0001 ; Format # Cf LANGUAGE TAG
-# Total code points: 64
+# Total code points: 71
# ================================================
@@ -641,9 +656,10 @@ FF71..FF9D ; Katakana # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAK
1AFFD..1AFFE ; Katakana # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
1B000 ; Katakana # Lo KATAKANA LETTER ARCHAIC E
1B120..1B122 ; Katakana # Lo [3] KATAKANA LETTER ARCHAIC YI..KATAKANA LETTER ARCHAIC WU
+1B155 ; Katakana # Lo KATAKANA LETTER SMALL KO
1B164..1B167 ; Katakana # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
-# Total code points: 330
+# Total code points: 331
# ================================================
@@ -1127,6 +1143,7 @@ FFDA..FFDC ; ALetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
111DC ; ALetter # Lo SHARADA HEADSTROKE
11200..11211 ; ALetter # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA
11213..1122B ; ALetter # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA
+1123F..11240 ; ALetter # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
11280..11286 ; ALetter # Lo [7] MULTANI LETTER A..MULTANI LETTER GA
11288 ; ALetter # Lo MULTANI LETTER GHA
1128A..1128D ; ALetter # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA
@@ -1187,12 +1204,16 @@ FFDA..FFDC ; ALetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
11D6A..11D89 ; ALetter # Lo [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA
11D98 ; ALetter # Lo GUNJALA GONDI OM
11EE0..11EF2 ; ALetter # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA
+11F02 ; ALetter # Lo KAWI SIGN REPHA
+11F04..11F10 ; ALetter # Lo [13] KAWI LETTER A..KAWI LETTER O
+11F12..11F33 ; ALetter # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA
11FB0 ; ALetter # Lo LISU LETTER YHA
12000..12399 ; ALetter # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
12400..1246E ; ALetter # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
12480..12543 ; ALetter # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
12F90..12FF0 ; ALetter # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
-13000..1342E ; ALetter # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
+13000..1342F ; ALetter # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
+13441..13446 ; ALetter # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
14400..14646 ; ALetter # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
16800..16A38 ; ALetter # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
16A40..16A5E ; ALetter # Lo [31] MRO LETTER TA..MRO LETTER TEK
@@ -1245,11 +1266,15 @@ FFDA..FFDC ; ALetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
1DF00..1DF09 ; ALetter # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
1DF0A ; ALetter # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
1DF0B..1DF1E ; ALetter # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
+1DF25..1DF2A ; ALetter # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
+1E030..1E06D ; ALetter # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
1E100..1E12C ; ALetter # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
1E137..1E13D ; ALetter # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
1E14E ; ALetter # Lo NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
1E290..1E2AD ; ALetter # Lo [30] TOTO LETTER PA..TOTO LETTER A
1E2C0..1E2EB ; ALetter # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH
+1E4D0..1E4EA ; ALetter # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
+1E4EB ; ALetter # Lm NAG MUNDARI SIGN OJOD
1E7E0..1E7E6 ; ALetter # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
1E7E8..1E7EB ; ALetter # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
1E7ED..1E7EE ; ALetter # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
@@ -1294,7 +1319,7 @@ FFDA..FFDC ; ALetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
1F150..1F169 ; ALetter # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
1F170..1F189 ; ALetter # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z
-# Total code points: 29336
+# Total code points: 29489
# ================================================
@@ -1398,16 +1423,18 @@ FF10..FF19 ; Numeric # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
11C50..11C59 ; Numeric # Nd [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE
11D50..11D59 ; Numeric # Nd [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE
11DA0..11DA9 ; Numeric # Nd [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE
+11F50..11F59 ; Numeric # Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
16A60..16A69 ; Numeric # Nd [10] MRO DIGIT ZERO..MRO DIGIT NINE
16AC0..16AC9 ; Numeric # Nd [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE
16B50..16B59 ; Numeric # Nd [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE
1D7CE..1D7FF ; Numeric # Nd [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE
1E140..1E149 ; Numeric # Nd [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE
1E2F0..1E2F9 ; Numeric # Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
+1E4F0..1E4F9 ; Numeric # Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
1E950..1E959 ; Numeric # Nd [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE
1FBF0..1FBF9 ; Numeric # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
-# Total code points: 661
+# Total code points: 681
# ================================================
diff --git a/data/WordBreakTest.txt b/data/WordBreakTest.txt
@@ -1,11 +1,11 @@
-# WordBreakTest-14.0.0.txt
-# Date: 2021-03-08, 06:22:40 GMT
-# © 2021 Unicode®, Inc.
+# WordBreakTest-15.0.0.txt
+# Date: 2022-02-26, 00:39:00 GMT
+# © 2022 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
-# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For terms of use, see https://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
-# For documentation, see http://www.unicode.org/reports/tr44/
+# For documentation, see https://www.unicode.org/reports/tr44/
#
# Default Word_Break Test
#
diff --git a/data/emoji-data.txt b/data/emoji-data.txt
@@ -1,13 +1,13 @@
-# emoji-data-14.0.0.txt
-# Date: 2021-08-26, 17:22:22 GMT
-# © 2021 Unicode®, Inc.
+# emoji-data.txt
+# Date: 2022-08-02, 00:26:10 GMT
+# © 2022 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
-# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For terms of use, see https://www.unicode.org/terms_of_use.html
#
# Emoji Data for UTS #51
-# Used with Emoji Version 14.0 and subsequent minor revisions (if any)
+# Used with Emoji Version 15.0 and subsequent minor revisions (if any)
#
-# For documentation and usage, see http://www.unicode.org/reports/tr51
+# For documentation and usage, see https://www.unicode.org/reports/tr51
#
# Format:
# <codepoint(s)> ; <property> # <comments>
@@ -19,8 +19,7 @@
# ================================================
-# All omitted code points have Emoji=No
-# @missing: 0000..10FFFF ; Emoji ; No
+# All omitted code points have Emoji=No
0023 ; Emoji # E0.0 [1] (#️) hash sign
002A ; Emoji # E0.0 [1] (*️) asterisk
@@ -341,6 +340,7 @@
1F6D1..1F6D2 ; Emoji # E3.0 [2] (🛑..🛒) stop sign..shopping cart
1F6D5 ; Emoji # E12.0 [1] (🛕) hindu temple
1F6D6..1F6D7 ; Emoji # E13.0 [2] (🛖..🛗) hut..elevator
+1F6DC ; Emoji # E15.0 [1] (🛜) wireless
1F6DD..1F6DF ; Emoji # E14.0 [3] (🛝..🛟) playground slide..ring buoy
1F6E0..1F6E5 ; Emoji # E0.7 [6] (🛠️..🛥️) hammer and wrench..motor boat
1F6E9 ; Emoji # E0.7 [1] (🛩️) small airplane
@@ -401,28 +401,36 @@
1F9E7..1F9FF ; Emoji # E11.0 [25] (🧧..🧿) red envelope..nazar amulet
1FA70..1FA73 ; Emoji # E12.0 [4] (🩰..🩳) ballet shoes..shorts
1FA74 ; Emoji # E13.0 [1] (🩴) thong sandal
+1FA75..1FA77 ; Emoji # E15.0 [3] (🩵..🩷) light blue heart..pink heart
1FA78..1FA7A ; Emoji # E12.0 [3] (🩸..🩺) drop of blood..stethoscope
1FA7B..1FA7C ; Emoji # E14.0 [2] (🩻..🩼) x-ray..crutch
1FA80..1FA82 ; Emoji # E12.0 [3] (🪀..🪂) yo-yo..parachute
1FA83..1FA86 ; Emoji # E13.0 [4] (🪃..🪆) boomerang..nesting dolls
+1FA87..1FA88 ; Emoji # E15.0 [2] (🪇..🪈) maracas..flute
1FA90..1FA95 ; Emoji # E12.0 [6] (🪐..🪕) ringed planet..banjo
1FA96..1FAA8 ; Emoji # E13.0 [19] (🪖..🪨) military helmet..rock
1FAA9..1FAAC ; Emoji # E14.0 [4] (🪩..🪬) mirror ball..hamsa
+1FAAD..1FAAF ; Emoji # E15.0 [3] (🪭..🪯) folding hand fan..khanda
1FAB0..1FAB6 ; Emoji # E13.0 [7] (🪰..🪶) fly..feather
1FAB7..1FABA ; Emoji # E14.0 [4] (🪷..🪺) lotus..nest with eggs
+1FABB..1FABD ; Emoji # E15.0 [3] (🪻..🪽) hyacinth..wing
+1FABF ; Emoji # E15.0 [1] (🪿) goose
1FAC0..1FAC2 ; Emoji # E13.0 [3] (🫀..🫂) anatomical heart..people hugging
1FAC3..1FAC5 ; Emoji # E14.0 [3] (🫃..🫅) pregnant man..person with crown
+1FACE..1FACF ; Emoji # E15.0 [2] (🫎..🫏) moose..donkey
1FAD0..1FAD6 ; Emoji # E13.0 [7] (🫐..🫖) blueberries..teapot
1FAD7..1FAD9 ; Emoji # E14.0 [3] (🫗..🫙) pouring liquid..jar
+1FADA..1FADB ; Emoji # E15.0 [2] (🫚..🫛) ginger root..pea pod
1FAE0..1FAE7 ; Emoji # E14.0 [8] (🫠..🫧) melting face..bubbles
+1FAE8 ; Emoji # E15.0 [1] (🫨) shaking face
1FAF0..1FAF6 ; Emoji # E14.0 [7] (🫰..🫶) hand with index finger and thumb crossed..heart hands
+1FAF7..1FAF8 ; Emoji # E15.0 [2] (🫷..🫸) leftwards pushing hand..rightwards pushing hand
-# Total elements: 1404
+# Total elements: 1424
# ================================================
-# All omitted code points have Emoji_Presentation=No
-# @missing: 0000..10FFFF ; Emoji_Presentation ; No
+# All omitted code points have Emoji_Presentation=No
231A..231B ; Emoji_Presentation # E0.6 [2] (⌚..⌛) watch..hourglass done
23E9..23EC ; Emoji_Presentation # E0.6 [4] (⏩..⏬) fast-forward button..fast down button
@@ -625,6 +633,7 @@
1F6D1..1F6D2 ; Emoji_Presentation # E3.0 [2] (🛑..🛒) stop sign..shopping cart
1F6D5 ; Emoji_Presentation # E12.0 [1] (🛕) hindu temple
1F6D6..1F6D7 ; Emoji_Presentation # E13.0 [2] (🛖..🛗) hut..elevator
+1F6DC ; Emoji_Presentation # E15.0 [1] (🛜) wireless
1F6DD..1F6DF ; Emoji_Presentation # E14.0 [3] (🛝..🛟) playground slide..ring buoy
1F6EB..1F6EC ; Emoji_Presentation # E1.0 [2] (🛫..🛬) airplane departure..airplane arrival
1F6F4..1F6F6 ; Emoji_Presentation # E3.0 [3] (🛴..🛶) kick scooter..canoe
@@ -681,28 +690,36 @@
1F9E7..1F9FF ; Emoji_Presentation # E11.0 [25] (🧧..🧿) red envelope..nazar amulet
1FA70..1FA73 ; Emoji_Presentation # E12.0 [4] (🩰..🩳) ballet shoes..shorts
1FA74 ; Emoji_Presentation # E13.0 [1] (🩴) thong sandal
+1FA75..1FA77 ; Emoji_Presentation # E15.0 [3] (🩵..🩷) light blue heart..pink heart
1FA78..1FA7A ; Emoji_Presentation # E12.0 [3] (🩸..🩺) drop of blood..stethoscope
1FA7B..1FA7C ; Emoji_Presentation # E14.0 [2] (🩻..🩼) x-ray..crutch
1FA80..1FA82 ; Emoji_Presentation # E12.0 [3] (🪀..🪂) yo-yo..parachute
1FA83..1FA86 ; Emoji_Presentation # E13.0 [4] (🪃..🪆) boomerang..nesting dolls
+1FA87..1FA88 ; Emoji_Presentation # E15.0 [2] (🪇..🪈) maracas..flute
1FA90..1FA95 ; Emoji_Presentation # E12.0 [6] (🪐..🪕) ringed planet..banjo
1FA96..1FAA8 ; Emoji_Presentation # E13.0 [19] (🪖..🪨) military helmet..rock
1FAA9..1FAAC ; Emoji_Presentation # E14.0 [4] (🪩..🪬) mirror ball..hamsa
+1FAAD..1FAAF ; Emoji_Presentation # E15.0 [3] (🪭..🪯) folding hand fan..khanda
1FAB0..1FAB6 ; Emoji_Presentation # E13.0 [7] (🪰..🪶) fly..feather
1FAB7..1FABA ; Emoji_Presentation # E14.0 [4] (🪷..🪺) lotus..nest with eggs
+1FABB..1FABD ; Emoji_Presentation # E15.0 [3] (🪻..🪽) hyacinth..wing
+1FABF ; Emoji_Presentation # E15.0 [1] (🪿) goose
1FAC0..1FAC2 ; Emoji_Presentation # E13.0 [3] (🫀..🫂) anatomical heart..people hugging
1FAC3..1FAC5 ; Emoji_Presentation # E14.0 [3] (🫃..🫅) pregnant man..person with crown
+1FACE..1FACF ; Emoji_Presentation # E15.0 [2] (🫎..🫏) moose..donkey
1FAD0..1FAD6 ; Emoji_Presentation # E13.0 [7] (🫐..🫖) blueberries..teapot
1FAD7..1FAD9 ; Emoji_Presentation # E14.0 [3] (🫗..🫙) pouring liquid..jar
+1FADA..1FADB ; Emoji_Presentation # E15.0 [2] (🫚..🫛) ginger root..pea pod
1FAE0..1FAE7 ; Emoji_Presentation # E14.0 [8] (🫠..🫧) melting face..bubbles
+1FAE8 ; Emoji_Presentation # E15.0 [1] (🫨) shaking face
1FAF0..1FAF6 ; Emoji_Presentation # E14.0 [7] (🫰..🫶) hand with index finger and thumb crossed..heart hands
+1FAF7..1FAF8 ; Emoji_Presentation # E15.0 [2] (🫷..🫸) leftwards pushing hand..rightwards pushing hand
-# Total elements: 1185
+# Total elements: 1205
# ================================================
-# All omitted code points have Emoji_Modifier=No
-# @missing: 0000..10FFFF ; Emoji_Modifier ; No
+# All omitted code points have Emoji_Modifier=No
1F3FB..1F3FF ; Emoji_Modifier # E1.0 [5] (🏻..🏿) light skin tone..dark skin tone
@@ -710,8 +727,7 @@
# ================================================
-# All omitted code points have Emoji_Modifier_Base=No
-# @missing: 0000..10FFFF ; Emoji_Modifier_Base ; No
+# All omitted code points have Emoji_Modifier_Base=No
261D ; Emoji_Modifier_Base # E0.6 [1] (☝️) index pointing up
26F9 ; Emoji_Modifier_Base # E0.7 [1] (⛹️) person bouncing ball
@@ -762,13 +778,13 @@
1F9D1..1F9DD ; Emoji_Modifier_Base # E5.0 [13] (🧑..🧝) person..elf
1FAC3..1FAC5 ; Emoji_Modifier_Base # E14.0 [3] (🫃..🫅) pregnant man..person with crown
1FAF0..1FAF6 ; Emoji_Modifier_Base # E14.0 [7] (🫰..🫶) hand with index finger and thumb crossed..heart hands
+1FAF7..1FAF8 ; Emoji_Modifier_Base # E15.0 [2] (🫷..🫸) leftwards pushing hand..rightwards pushing hand
-# Total elements: 132
+# Total elements: 134
# ================================================
-# All omitted code points have Emoji_Component=No
-# @missing: 0000..10FFFF ; Emoji_Component ; No
+# All omitted code points have Emoji_Component=No
0023 ; Emoji_Component # E0.0 [1] (#️) hash sign
002A ; Emoji_Component # E0.0 [1] (*️) asterisk
@@ -785,8 +801,7 @@ E0020..E007F ; Emoji_Component # E0.0 [96] (..) tag space..c
# ================================================
-# All omitted code points have Extended_Pictographic=No
-# @missing: 0000..10FFFF ; Extended_Pictographic ; No
+# All omitted code points have Extended_Pictographic=No
00A9 ; Extended_Pictographic# E0.6 [1] (©️) copyright
00AE ; Extended_Pictographic# E0.6 [1] (®️) registered
@@ -1190,7 +1205,8 @@ E0020..E007F ; Emoji_Component # E0.0 [96] (..) tag space..c
1F6D3..1F6D4 ; Extended_Pictographic# E0.0 [2] (🛓..🛔) STUPA..PAGODA
1F6D5 ; Extended_Pictographic# E12.0 [1] (🛕) hindu temple
1F6D6..1F6D7 ; Extended_Pictographic# E13.0 [2] (🛖..🛗) hut..elevator
-1F6D8..1F6DC ; Extended_Pictographic# E0.0 [5] (..🛜) <reserved-1F6D8>..<reserved-1F6DC>
+1F6D8..1F6DB ; Extended_Pictographic# E0.0 [4] (..) <reserved-1F6D8>..<reserved-1F6DB>
+1F6DC ; Extended_Pictographic# E15.0 [1] (🛜) wireless
1F6DD..1F6DF ; Extended_Pictographic# E14.0 [3] (🛝..🛟) playground slide..ring buoy
1F6E0..1F6E5 ; Extended_Pictographic# E0.7 [6] (🛠️..🛥️) hammer and wrench..motor boat
1F6E6..1F6E8 ; Extended_Pictographic# E0.0 [3] (🛦..🛨) UP-POINTING MILITARY AIRPLANE..UP-POINTING SMALL AIRPLANE
@@ -1207,7 +1223,7 @@ E0020..E007F ; Emoji_Component # E0.0 [96] (..) tag space..c
1F6FA ; Extended_Pictographic# E12.0 [1] (🛺) auto rickshaw
1F6FB..1F6FC ; Extended_Pictographic# E13.0 [2] (🛻..🛼) pickup truck..roller skate
1F6FD..1F6FF ; Extended_Pictographic# E0.0 [3] (..) <reserved-1F6FD>..<reserved-1F6FF>
-1F774..1F77F ; Extended_Pictographic# E0.0 [12] (🝴..🝿) <reserved-1F774>..<reserved-1F77F>
+1F774..1F77F ; Extended_Pictographic# E0.0 [12] (🝴..🝿) LOT OF FORTUNE..ORCUS
1F7D5..1F7DF ; Extended_Pictographic# E0.0 [11] (🟕..) CIRCLED TRIANGLE..<reserved-1F7DF>
1F7E0..1F7EB ; Extended_Pictographic# E12.0 [12] (🟠..🟫) orange circle..brown square
1F7EC..1F7EF ; Extended_Pictographic# E0.0 [4] (..) <reserved-1F7EC>..<reserved-1F7EF>
@@ -1266,30 +1282,37 @@ E0020..E007F ; Emoji_Component # E0.0 [96] (..) tag space..c
1FA00..1FA6F ; Extended_Pictographic# E0.0 [112] (🨀..) NEUTRAL CHESS KING..<reserved-1FA6F>
1FA70..1FA73 ; Extended_Pictographic# E12.0 [4] (🩰..🩳) ballet shoes..shorts
1FA74 ; Extended_Pictographic# E13.0 [1] (🩴) thong sandal
-1FA75..1FA77 ; Extended_Pictographic# E0.0 [3] (🩵..🩷) <reserved-1FA75>..<reserved-1FA77>
+1FA75..1FA77 ; Extended_Pictographic# E15.0 [3] (🩵..🩷) light blue heart..pink heart
1FA78..1FA7A ; Extended_Pictographic# E12.0 [3] (🩸..🩺) drop of blood..stethoscope
1FA7B..1FA7C ; Extended_Pictographic# E14.0 [2] (🩻..🩼) x-ray..crutch
1FA7D..1FA7F ; Extended_Pictographic# E0.0 [3] (..) <reserved-1FA7D>..<reserved-1FA7F>
1FA80..1FA82 ; Extended_Pictographic# E12.0 [3] (🪀..🪂) yo-yo..parachute
1FA83..1FA86 ; Extended_Pictographic# E13.0 [4] (🪃..🪆) boomerang..nesting dolls
-1FA87..1FA8F ; Extended_Pictographic# E0.0 [9] (🪇..) <reserved-1FA87>..<reserved-1FA8F>
+1FA87..1FA88 ; Extended_Pictographic# E15.0 [2] (🪇..🪈) maracas..flute
+1FA89..1FA8F ; Extended_Pictographic# E0.0 [7] (..) <reserved-1FA89>..<reserved-1FA8F>
1FA90..1FA95 ; Extended_Pictographic# E12.0 [6] (🪐..🪕) ringed planet..banjo
1FA96..1FAA8 ; Extended_Pictographic# E13.0 [19] (🪖..🪨) military helmet..rock
1FAA9..1FAAC ; Extended_Pictographic# E14.0 [4] (🪩..🪬) mirror ball..hamsa
-1FAAD..1FAAF ; Extended_Pictographic# E0.0 [3] (🪭..🪯) <reserved-1FAAD>..<reserved-1FAAF>
+1FAAD..1FAAF ; Extended_Pictographic# E15.0 [3] (🪭..🪯) folding hand fan..khanda
1FAB0..1FAB6 ; Extended_Pictographic# E13.0 [7] (🪰..🪶) fly..feather
1FAB7..1FABA ; Extended_Pictographic# E14.0 [4] (🪷..🪺) lotus..nest with eggs
-1FABB..1FABF ; Extended_Pictographic# E0.0 [5] (🪻..🪿) <reserved-1FABB>..<reserved-1FABF>
+1FABB..1FABD ; Extended_Pictographic# E15.0 [3] (🪻..🪽) hyacinth..wing
+1FABE ; Extended_Pictographic# E0.0 [1] () <reserved-1FABE>
+1FABF ; Extended_Pictographic# E15.0 [1] (🪿) goose
1FAC0..1FAC2 ; Extended_Pictographic# E13.0 [3] (🫀..🫂) anatomical heart..people hugging
1FAC3..1FAC5 ; Extended_Pictographic# E14.0 [3] (🫃..🫅) pregnant man..person with crown
-1FAC6..1FACF ; Extended_Pictographic# E0.0 [10] (..🫏) <reserved-1FAC6>..<reserved-1FACF>
+1FAC6..1FACD ; Extended_Pictographic# E0.0 [8] (..) <reserved-1FAC6>..<reserved-1FACD>
+1FACE..1FACF ; Extended_Pictographic# E15.0 [2] (🫎..🫏) moose..donkey
1FAD0..1FAD6 ; Extended_Pictographic# E13.0 [7] (🫐..🫖) blueberries..teapot
1FAD7..1FAD9 ; Extended_Pictographic# E14.0 [3] (🫗..🫙) pouring liquid..jar
-1FADA..1FADF ; Extended_Pictographic# E0.0 [6] (🫚..) <reserved-1FADA>..<reserved-1FADF>
+1FADA..1FADB ; Extended_Pictographic# E15.0 [2] (🫚..🫛) ginger root..pea pod
+1FADC..1FADF ; Extended_Pictographic# E0.0 [4] (..) <reserved-1FADC>..<reserved-1FADF>
1FAE0..1FAE7 ; Extended_Pictographic# E14.0 [8] (🫠..🫧) melting face..bubbles
-1FAE8..1FAEF ; Extended_Pictographic# E0.0 [8] (🫨..) <reserved-1FAE8>..<reserved-1FAEF>
+1FAE8 ; Extended_Pictographic# E15.0 [1] (🫨) shaking face
+1FAE9..1FAEF ; Extended_Pictographic# E0.0 [7] (..) <reserved-1FAE9>..<reserved-1FAEF>
1FAF0..1FAF6 ; Extended_Pictographic# E14.0 [7] (🫰..🫶) hand with index finger and thumb crossed..heart hands
-1FAF7..1FAFF ; Extended_Pictographic# E0.0 [9] (🫷..) <reserved-1FAF7>..<reserved-1FAFF>
+1FAF7..1FAF8 ; Extended_Pictographic# E15.0 [2] (🫷..🫸) leftwards pushing hand..rightwards pushing hand
+1FAF9..1FAFF ; Extended_Pictographic# E0.0 [7] (..) <reserved-1FAF9>..<reserved-1FAFF>
1FC00..1FFFD ; Extended_Pictographic# E0.0[1022] (..) <reserved-1FC00>..<reserved-1FFFD>
# Total elements: 3537
diff --git a/gen/case.c b/gen/case.c
@@ -119,11 +119,14 @@ parse_cp_list(const char *str, uint_least32_t **cp, size_t *cplen)
}
/* go through the string again, parsing the numbers */
- for (i = 0, tmp1 = tmp2 = str; tmp2 != NULL; i++, tmp1 = tmp2 + 1) {
+ for (i = 0, tmp1 = tmp2 = str; tmp2 != NULL; i++) {
tmp2 = strchr(tmp1, ' ');
if (hextocp(tmp1, tmp2 ? (size_t)(tmp2 - tmp1) : strlen(tmp1), &((*cp)[i]))) {
return 1;
}
+ if (tmp2 != NULL) {
+ tmp1 = tmp2 + 1;
+ }
}
return 0;
@@ -166,7 +169,8 @@ specialcasing_callback(const char *file, char **field, size_t nfields,
/*
* overwrite value in "single mapping" property table by the
- * special value 0x110000 + (offset in special case array)
+ * special value 0x110000 + (offset in special case array),
+ * even if the special case has length 1
*/
prop_upper[cp].property = (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
prop_lower[cp].property = (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
@@ -297,5 +301,18 @@ main(int argc, char *argv[])
}
printf("};\n\n");
+ free(comp_lower.data);
+ free(comp_lower.offset);
+ free(comp_title.data);
+ free(comp_title.offset);
+ free(comp_upper.data);
+ free(comp_upper.offset);
+ free(mm_lower.major);
+ free(mm_lower.minor);
+ free(mm_title.major);
+ free(mm_title.minor);
+ free(mm_upper.major);
+ free(mm_upper.minor);
+
return 0;
}
diff --git a/gen/util.c b/gen/util.c
@@ -34,7 +34,7 @@ struct break_test_payload
static void *
reallocate_array(void *p, size_t len, size_t size)
{
- if (len > 0 && size > (size_t)(-1) / len) {
+ if (len > 0 && size > SIZE_MAX / len) {
errno = ENOMEM;
return NULL;
}
@@ -76,7 +76,7 @@ hextocp(const char *str, size_t len, uint_least32_t *cp)
(uint_least32_t)(str[i] - relative + off);
}
- if (*cp > 0x10ffff) {
+ if (*cp > UINT32_C(0x10FFFF)) {
fprintf(stderr, "hextocp: '%.*s' is too large.\n",
(int)len, str);
return 1;
@@ -251,14 +251,14 @@ properties_compress(const struct properties *prop,
uint_least32_t cp, i;
/* initialization */
- if (!(comp->offset = malloc((size_t)0x110000 * sizeof(*(comp->offset))))) {
+ if (!(comp->offset = malloc((size_t)UINT32_C(0x110000) * sizeof(*(comp->offset))))) {
fprintf(stderr, "malloc: %s\n", strerror(errno));
exit(1);
}
comp->data = NULL;
comp->datalen = 0;
- for (cp = 0; cp < 0x110000; cp++) {
+ for (cp = 0; cp < UINT32_C(0x110000); cp++) {
for (i = 0; i < comp->datalen; i++) {
if (!memcmp(&(prop[cp]), &(comp->data[i]), sizeof(*prop))) {
/* found a match! */
@@ -692,7 +692,13 @@ break_test_list_print(const struct break_test *test, size_t testlen,
void
break_test_list_free(struct break_test *test, size_t testlen)
{
- (void)testlen;
+ size_t i;
+
+ for (i = 0; i < testlen; i++) {
+ free(test[i].cp);
+ free(test[i].len);
+ free(test[i].descr);
+ }
free(test);
}
diff --git a/grapheme.h b/grapheme.h
@@ -6,12 +6,7 @@
#include <stddef.h>
#include <stdint.h>
-typedef struct grapheme_internal_segmentation_state {
- uint_least8_t prop;
- bool prop_set;
- bool gb11_flag;
- bool gb12_13_flag;
-} GRAPHEME_STATE;
+#define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD)
enum grapheme_bidirectional_override {
GRAPHEME_BIDIRECTIONAL_OVERRIDE_NONE,
@@ -19,9 +14,25 @@ enum grapheme_bidirectional_override {
GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL,
};
-#define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD)
+size_t grapheme_bidirectional_logical_to_visual(const uint_least32_t *, size_t,
+ enum grapheme_bidirectional_override,
+ uint_least32_t *, size_t);
+size_t grapheme_bidirectional_logical_to_visual_utf8(const char *, size_t,
+ enum grapheme_bidirectional_override,
+ char *, size_t);
+
+size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *);
+size_t grapheme_encode_utf8(uint_least32_t, char *, size_t);
-bool grapheme_is_character_break(uint_least32_t, uint_least32_t, GRAPHEME_STATE *);
+bool grapheme_is_character_break(uint_least32_t, uint_least32_t, uint_least16_t *);
+
+bool grapheme_is_lowercase(const uint_least32_t *, size_t, size_t *);
+bool grapheme_is_titlecase(const uint_least32_t *, size_t, size_t *);
+bool grapheme_is_uppercase(const uint_least32_t *, size_t, size_t *);
+
+bool grapheme_is_lowercase_utf8(const char *, size_t, size_t *);
+bool grapheme_is_titlecase_utf8(const char *, size_t, size_t *);
+bool grapheme_is_uppercase_utf8(const char *, size_t, size_t *);
size_t grapheme_next_character_break(const uint_least32_t *, size_t);
size_t grapheme_next_line_break(const uint_least32_t *, size_t);
@@ -33,30 +44,12 @@ size_t grapheme_next_line_break_utf8(const char *, size_t);
size_t grapheme_next_sentence_break_utf8(const char *, size_t);
size_t grapheme_next_word_break_utf8(const char *, size_t);
-size_t grapheme_to_uppercase(const uint_least32_t *, size_t, uint_least32_t *, size_t);
size_t grapheme_to_lowercase(const uint_least32_t *, size_t, uint_least32_t *, size_t);
size_t grapheme_to_titlecase(const uint_least32_t *, size_t, uint_least32_t *, size_t);
+size_t grapheme_to_uppercase(const uint_least32_t *, size_t, uint_least32_t *, size_t);
-size_t grapheme_to_uppercase_utf8(const char *, size_t, char *, size_t);
size_t grapheme_to_lowercase_utf8(const char *, size_t, char *, size_t);
size_t grapheme_to_titlecase_utf8(const char *, size_t, char *, size_t);
-
-bool grapheme_is_uppercase(const uint_least32_t *, size_t, size_t *);
-bool grapheme_is_lowercase(const uint_least32_t *, size_t, size_t *);
-bool grapheme_is_titlecase(const uint_least32_t *, size_t, size_t *);
-
-bool grapheme_is_uppercase_utf8(const char *, size_t, size_t *);
-bool grapheme_is_lowercase_utf8(const char *, size_t, size_t *);
-bool grapheme_is_titlecase_utf8(const char *, size_t, size_t *);
-
-size_t grapheme_bidirectional_logical_to_visual(const uint_least32_t *, size_t,
- enum grapheme_bidirectional_override,
- uint_least32_t *, size_t);
-size_t grapheme_bidirectional_logical_to_visual_utf8(const char *, size_t,
- enum grapheme_bidirectional_override,
- char *, size_t);
-
-size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *);
-size_t grapheme_encode_utf8(uint_least32_t, char *, size_t);
+size_t grapheme_to_uppercase_utf8(const char *, size_t, char *, size_t);
#endif /* GRAPHEME_H */
diff --git a/man/grapheme_decode_utf8.3 b/man/grapheme_decode_utf8.3
@@ -1,101 +0,0 @@
-.Dd 2021-12-22
-.Dt GRAPHEME_DECODE_UTF8 3
-.Os suckless.org
-.Sh NAME
-.Nm grapheme_decode_utf8
-.Nd decode first codepoint in UTF-8-encoded string
-.Sh SYNOPSIS
-.In grapheme.h
-.Ft size_t
-.Fn grapheme_decode_utf8 "const char *str" "size_t len" "uint_least32_t *cp"
-.Sh DESCRIPTION
-The
-.Fn grapheme_decode_utf8
-function decodes the next codepoint in the UTF-8-encoded string
-.Va str
-of length
-.Va len .
-If the UTF-8-sequence is invalid (overlong encoding, unexpected byte,
-string ends unexpectedly, empty string, etc.) the decoding is stopped
-at the last processed byte and the decoded codepoint set to
-.Dv GRAPHEME_INVALID_CODEPOINT .
-.Pp
-If
-.Va cp
-is not
-.Dv NULL
-the decoded codepoint is stored in the memory pointed to by
-.Va cp .
-.Pp
-Given NUL has a unique 1 byte representation, it is safe to operate on
-NUL-terminated strings by setting
-.Va len
-to
-.Dv SIZE_MAX
-(stdint.h is already included by grapheme.h) and terminating when
-.Va cp
-is 0 (see
-.Sx EXAMPLES
-for an example).
-.Sh RETURN VALUES
-The
-.Fn grapheme_decode_utf8
-function returns the number of processed bytes and 0 if
-.Va str
-is
-.Dv NULL
-or
-.Va len
-is 0.
-If the string ends unexpectedly in a multibyte sequence, the desired
-length (that is larger than
-.Va len )
-is returned.
-.Sh EXAMPLES
-.Bd -literal
-/* cc (-static) -o example example.c -lgrapheme */
-#include <grapheme.h>
-#include <inttypes.h>
-#include <stdio.h>
-
-void
-print_cps(const char *str, size_t len)
-{
- size_t ret, off;
- uint_least32_t cp;
-
- for (off = 0; off < len; off += ret) {
- if ((ret = grapheme_decode_utf8(str + off,
- len - off, &cp)) > (len - off)) {
- /*
- * string ended unexpectedly in the middle of a
- * multibyte sequence and we have the choice
- * here to possibly expand str by ret - len + off
- * bytes to get a full sequence, but we just
- * bail out in this case.
- */
- break;
- }
- printf("%"PRIxLEAST32"\\n", cp);
- }
-}
-
-void
-print_cps_nul_terminated(const char *str)
-{
- size_t ret, off;
- uint_least32_t cp;
-
- for (off = 0; (ret = grapheme_decode_utf8(str + off,
- SIZE_MAX, &cp)) > 0 &&
- cp != 0; off += ret) {
- printf("%"PRIxLEAST32"\\n", cp);
- }
-}
-.Ed
-.Sh SEE ALSO
-.Xr grapheme_encode_utf8 3 ,
-.Xr grapheme_is_character_break 3 ,
-.Xr libgrapheme 7
-.Sh AUTHORS
-.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/man/grapheme_decode_utf8.sh b/man/grapheme_decode_utf8.sh
@@ -0,0 +1,102 @@
+cat << EOF
+.Dd ${MAN_DATE}
+.Dt GRAPHEME_DECODE_UTF8 3
+.Os suckless.org
+.Sh NAME
+.Nm grapheme_decode_utf8
+.Nd decode first codepoint in UTF-8-encoded string
+.Sh SYNOPSIS
+.In grapheme.h
+.Ft size_t
+.Fn grapheme_decode_utf8 "const char *str" "size_t len" "uint_least32_t *cp"
+.Sh DESCRIPTION
+The
+.Fn grapheme_decode_utf8
+function decodes the first codepoint in the UTF-8-encoded string
+.Va str
+of length
+.Va len .
+If the UTF-8-sequence is invalid (overlong encoding, unexpected byte,
+string ends unexpectedly, empty string, etc.) the decoding is stopped
+at the last processed byte and the decoded codepoint set to
+.Dv GRAPHEME_INVALID_CODEPOINT .
+.Pp
+If
+.Va cp
+is not
+.Dv NULL
+the decoded codepoint is stored in the memory pointed to by
+.Va cp .
+.Pp
+Given NUL has a unique 1 byte representation, it is safe to operate on
+NUL-terminated strings by setting
+.Va len
+to
+.Dv SIZE_MAX
+(stdint.h is already included by grapheme.h) and terminating when
+.Va cp
+is 0 (see
+.Sx EXAMPLES
+for an example).
+.Sh RETURN VALUES
+The
+.Fn grapheme_decode_utf8
+function returns the number of processed bytes and 0 if
+.Va str
+is
+.Dv NULL
+or
+.Va len
+is 0.
+If the string ends unexpectedly in a multibyte sequence, the desired
+length (that is larger than
+.Va len )
+is returned.
+.Sh EXAMPLES
+.Bd -literal
+/* cc (-static) -o example example.c -lgrapheme */
+#include <grapheme.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+void
+print_cps(const char *str, size_t len)
+{
+ size_t ret, off;
+ uint_least32_t cp;
+
+ for (off = 0; off < len; off += ret) {
+ if ((ret = grapheme_decode_utf8(str + off,
+ len - off, &cp)) > (len - off)) {
+ /*
+ * string ended unexpectedly in the middle of a
+ * multibyte sequence and we have the choice
+ * here to possibly expand str by ret - len + off
+ * bytes to get a full sequence, but we just
+ * bail out in this case.
+ */
+ break;
+ }
+ printf("%"PRIxLEAST32"\\\\n", cp);
+ }
+}
+
+void
+print_cps_nul_terminated(const char *str)
+{
+ size_t ret, off;
+ uint_least32_t cp;
+
+ for (off = 0; (ret = grapheme_decode_utf8(str + off,
+ SIZE_MAX, &cp)) > 0 &&
+ cp != 0; off += ret) {
+ printf("%"PRIxLEAST32"\\\\n", cp);
+ }
+}
+.Ed
+.Sh SEE ALSO
+.Xr grapheme_encode_utf8 3 ,
+.Xr libgrapheme 7
+.Sh AUTHORS
+.An Laslo Hunhold Aq Mt dev@frign.de
+EOF
diff --git a/man/grapheme_encode_utf8.3 b/man/grapheme_encode_utf8.3
@@ -1,98 +0,0 @@
-.Dd 2021-12-22
-.Dt GRAPHEME_ENCODE_UTF8 3
-.Os suckless.org
-.Sh NAME
-.Nm grapheme_encode_utf8
-.Nd encode codepoint into UTF-8 string
-.Sh SYNOPSIS
-.In grapheme.h
-.Ft size_t
-.Fn grapheme_encode_utf8 "uint_least32_t cp" "char *str" "size_t len"
-.Sh DESCRIPTION
-The
-.Fn grapheme_encode_utf8
-function encodes the codepoint
-.Va cp
-into a UTF-8-string.
-If
-.Va str
-is not
-.Dv NULL
-and
-.Va len
-is large enough it writes the UTF-8-string to the memory pointed to by
-.Va str .
-.Sh RETURN VALUES
-The
-.Fn grapheme_encode_utf8
-function returns the length (in bytes) of the UTF-8-string resulting
-from encoding
-.Va cp .
-When the returned value is larger than
-.Va len
-it is indicated that the output string is too small and no data has been
-written.
-.Sh EXAMPLES
-.Bd -literal
-/* cc (-static) -o example example.c -lgrapheme */
-#include <grapheme.h>
-#include <stddef.h>
-#include <stdlib.h>
-
-size_t
-cps_to_utf8(const uint_least32_t *cp, size_t cplen, char *str, size_t len)
-{
- size_t i, off, ret;
-
- for (i = 0, off = 0; i < cplen; i++, off += ret) {
- if ((ret = grapheme_encode_utf8(cp[i], str + off,
- len - off)) > (len - off)) {
- /* buffer too small */
- break;
- }
- }
-
- return off;
-}
-
-size_t
-cps_bytelen(const uint_least32_t *cp, size_t cplen)
-{
- size_t i, len;
-
- for (i = 0, len = 0; i < cplen; i++) {
- len += grapheme_encode_utf8(cp[i], NULL, 0);
- }
-
- return len;
-}
-
-char *
-cps_to_utf8_alloc(const uint_least32_t *cp, size_t cplen)
-{
- char *str;
- size_t len, i, ret, off;
-
- len = cps_bytelen(cp, cplen);
-
- if (!(str = malloc(len))) {
- return NULL;
- }
-
- for (i = 0, off = 0; i < cplen; i++, off += ret) {
- if ((ret = grapheme_encode_utf8(cp[i], str + off,
- len - off)) > (len - off)) {
- /* buffer too small */
- break;
- }
- }
- str[off] = '\\0';
-
- return str;
-}
-.Ed
-.Sh SEE ALSO
-.Xr grapheme_decode_utf8 3 ,
-.Xr libgrapheme 7
-.Sh AUTHORS
-.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/man/grapheme_encode_utf8.sh b/man/grapheme_encode_utf8.sh
@@ -0,0 +1,103 @@
+cat << EOF
+.Dd ${MAN_DATE}
+.Dt GRAPHEME_ENCODE_UTF8 3
+.Os suckless.org
+.Sh NAME
+.Nm grapheme_encode_utf8
+.Nd encode codepoint into UTF-8 string
+.Sh SYNOPSIS
+.In grapheme.h
+.Ft size_t
+.Fn grapheme_encode_utf8 "uint_least32_t cp" "char *str" "size_t len"
+.Sh DESCRIPTION
+The
+.Fn grapheme_encode_utf8
+function encodes the codepoint
+.Va cp
+into a UTF-8-string.
+If
+.Va str
+is not
+.Dv NULL
+and
+.Va len
+is large enough it writes the UTF-8-string to the memory pointed to by
+.Va str .
+Otherwise no data is written.
+.Sh RETURN VALUES
+The
+.Fn grapheme_encode_utf8
+function returns the length (in bytes) of the UTF-8-string resulting
+from encoding
+.Va cp ,
+even if
+.Va len
+is not large enough or
+.Va str
+is
+.Dv NULL .
+.Sh EXAMPLES
+.Bd -literal
+/* cc (-static) -o example example.c -lgrapheme */
+#include <grapheme.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+size_t
+cps_to_utf8(const uint_least32_t *cp, size_t cplen, char *str, size_t len)
+{
+ size_t i, off, ret;
+
+ for (i = 0, off = 0; i < cplen; i++, off += ret) {
+ if ((ret = grapheme_encode_utf8(cp[i], str + off,
+ len - off)) > (len - off)) {
+ /* buffer too small */
+ break;
+ }
+ }
+
+ return off;
+}
+
+size_t
+cps_bytelen(const uint_least32_t *cp, size_t cplen)
+{
+ size_t i, len;
+
+ for (i = 0, len = 0; i < cplen; i++) {
+ len += grapheme_encode_utf8(cp[i], NULL, 0);
+ }
+
+ return len;
+}
+
+char *
+cps_to_utf8_alloc(const uint_least32_t *cp, size_t cplen)
+{
+ char *str;
+ size_t len, i, ret, off;
+
+ len = cps_bytelen(cp, cplen);
+
+ if (!(str = malloc(len))) {
+ return NULL;
+ }
+
+ for (i = 0, off = 0; i < cplen; i++, off += ret) {
+ if ((ret = grapheme_encode_utf8(cp[i], str + off,
+ len - off)) > (len - off)) {
+ /* buffer too small */
+ break;
+ }
+ }
+ str[off] = '\\\\0';
+
+ return str;
+}
+.Ed
+.Sh SEE ALSO
+.Xr grapheme_decode_utf8 3 ,
+.Xr libgrapheme 7
+.Sh AUTHORS
+.An Laslo Hunhold Aq Mt dev@frign.de
+EOF
diff --git a/man/grapheme_is_character_break.3 b/man/grapheme_is_character_break.3
@@ -1,80 +0,0 @@
-.Dd 2021-12-22
-.Dt GRAPHEME_IS_CHARACTER_BREAK 3
-.Os suckless.org
-.Sh NAME
-.Nm grapheme_is_character_break
-.Nd test for a grapheme cluster break between two codepoints
-.Sh SYNOPSIS
-.In grapheme.h
-.Ft size_t
-.Fn grapheme_is_character_break "uint_least32_t cp1" "uint_least32_t cp2" "GRAPHEME_STATE *state"
-.Sh DESCRIPTION
-The
-.Fn grapheme_is_character_break
-function determines if there is a grapheme cluster break (see
-.Xr libgrapheme 7 )
-between the two codepoints
-.Va cp1
-and
-.Va cp2 .
-By specification this decision depends on a
-.Va state
-that can at most be completely reset after detecting a break and must
-be reset every time one deviates from sequential processing.
-.Pp
-If
-.Va state
-is
-.Dv NULL
-.Fn grapheme_is_character_break
-behaves as if it was called with a fully reset state.
-.Sh RETURN VALUES
-The
-.Fn grapheme_is_character_break
-function returns
-.Va true
-if there is a grapheme cluster break between the codepoints
-.Va cp1
-and
-.Va cp2
-and
-.Va false
-if there is not.
-.Sh EXAMPLES
-.Bd -literal
-/* cc (-static) -o example example.c -lgrapheme */
-#include <grapheme.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-int
-main(void)
-{
- GRAPHEME_STATE state = { 0 };
- uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */
- size_t i;
-
- for (i = 0; i + 1 < sizeof(s1) / sizeof(*s1); i++) {
- if (grapheme_is_character_break(s[i], s[i + 1], &state)) {
- printf("break in s1 at offset %zu\n", i);
- }
- }
- memset(&state, 0, sizeof(state)); /* reset state */
- for (i = 0; i + 1 < sizeof(s2) / sizeof(*s2); i++) {
- if (grapheme_is_character_break(s[i], s[i + 1], &state)) {
- printf("break in s2 at offset %zu\n", i);
- }
- }
-
- return 0;
-}
-.Ed
-.Sh SEE ALSO
-.Xr grapheme_next_character_break 3 ,
-.Xr libgrapheme 7
-.Sh STANDARDS
-.Fn grapheme_is_character_break
-is compliant with the Unicode 14.0.0 specification.
-.Sh AUTHORS
-.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/man/grapheme_is_character_break.sh b/man/grapheme_is_character_break.sh
@@ -0,0 +1,83 @@
+cat << EOF
+.Dd ${MAN_DATE}
+.Dt GRAPHEME_IS_CHARACTER_BREAK 3
+.Os suckless.org
+.Sh NAME
+.Nm grapheme_is_character_break
+.Nd test for a grapheme cluster break between two codepoints
+.Sh SYNOPSIS
+.In grapheme.h
+.Ft bool
+.Fn grapheme_is_character_break "uint_least32_t cp1" "uint_least32_t cp2" "uint_least16_t *state"
+.Sh DESCRIPTION
+The
+.Fn grapheme_is_character_break
+function determines if there is a grapheme cluster break (see
+.Xr libgrapheme 7 )
+between the two codepoints
+.Va cp1
+and
+.Va cp2 .
+By specification this decision depends on a
+.Va state
+that can at most be completely reset after detecting a break and must
+be reset every time one deviates from sequential processing.
+.Pp
+If
+.Va state
+is
+.Dv NULL
+.Fn grapheme_is_character_break
+behaves as if it was called with a fully reset state.
+.Sh RETURN VALUES
+The
+.Fn grapheme_is_character_break
+function returns
+.Va true
+if there is a grapheme cluster break between the codepoints
+.Va cp1
+and
+.Va cp2
+and
+.Va false
+if there is not.
+.Sh EXAMPLES
+.Bd -literal
+/* cc (-static) -o example example.c -lgrapheme */
+#include <grapheme.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int
+main(void)
+{
+ uint_least16_t state = 0;
+ uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */
+ size_t i;
+
+ for (i = 0; i + 1 < sizeof(s1) / sizeof(*s1); i++) {
+ if (grapheme_is_character_break(s1[i], s1[i + 1], &state)) {
+ printf("break in s1 at offset %zu\\\\n", i);
+ }
+ }
+ state = 0; /* reset state */
+ for (i = 0; i + 1 < sizeof(s2) / sizeof(*s2); i++) {
+ if (grapheme_is_character_break(s2[i], s2[i + 1], &state)) {
+ printf("break in s2 at offset %zu\\\\n", i);
+ }
+ }
+
+ return 0;
+}
+.Ed
+.Sh SEE ALSO
+.Xr grapheme_next_character_break 3 ,
+.Xr grapheme_next_character_break_utf8 3 ,
+.Xr libgrapheme 7
+.Sh STANDARDS
+.Fn grapheme_is_character_break
+is compliant with the Unicode ${UNICODE_VERSION} specification.
+.Sh AUTHORS
+.An Laslo Hunhold Aq Mt dev@frign.de
+EOF
diff --git a/man/grapheme_is_lowercase.sh b/man/grapheme_is_lowercase.sh
@@ -0,0 +1,3 @@
+ENCODING="codepoint" \
+CASE="lowercase" \
+ $SH man/template/is_case.sh
diff --git a/man/grapheme_is_lowercase_utf8.sh b/man/grapheme_is_lowercase_utf8.sh
@@ -0,0 +1,3 @@
+ENCODING="utf8" \
+CASE="lowercase" \
+ $SH man/template/is_case.sh
diff --git a/man/grapheme_is_titlecase.sh b/man/grapheme_is_titlecase.sh
@@ -0,0 +1,3 @@
+ENCODING="codepoint" \
+CASE="titlecase" \
+ $SH man/template/is_case.sh
diff --git a/man/grapheme_is_titlecase_utf8.sh b/man/grapheme_is_titlecase_utf8.sh
@@ -0,0 +1,3 @@
+ENCODING="utf8" \
+CASE="titlecase" \
+ $SH man/template/is_case.sh
diff --git a/man/grapheme_is_uppercase.sh b/man/grapheme_is_uppercase.sh
@@ -0,0 +1,3 @@
+ENCODING="codepoint" \
+CASE="uppercase" \
+ $SH man/template/is_case.sh
diff --git a/man/grapheme_is_uppercase_utf8.sh b/man/grapheme_is_uppercase_utf8.sh
@@ -0,0 +1,3 @@
+ENCODING="utf8" \
+CASE="uppercase" \
+ $SH man/template/is_case.sh
diff --git a/man/grapheme_next_character_break.sh b/man/grapheme_next_character_break.sh
@@ -0,0 +1,4 @@
+ENCODING="codepoint" \
+TYPE="character" \
+REALTYPE="grapheme cluster" \
+ $SH man/template/next_break.sh
diff --git a/man/grapheme_next_character_break_utf8.3 b/man/grapheme_next_character_break_utf8.3
@@ -1,92 +0,0 @@
-.Dd 2021-12-22
-.Dt GRAPHEME_NEXT_CHARACTER_BREAK_UTF8 3
-.Os suckless.org
-.Sh NAME
-.Nm grapheme_next_character_break_utf8
-.Nd determine byte-offset to next grapheme cluster break
-.Sh SYNOPSIS
-.In grapheme.h
-.Ft size_t
-.Fn grapheme_next_character_break_utf8 "const char *str" "size_t len"
-.Sh DESCRIPTION
-The
-.Fn grapheme_next_character_break_utf8
-function computes the offset (in bytes) to the next grapheme
-cluster break (see
-.Xr libgrapheme 7 )
-in the UTF-8-encoded string
-.Va str
-of length
-.Va len .
-If a grapheme cluster begins at
-.Va str
-this offset is equal to the length of said grapheme cluster.
-.Pp
-If
-.Va len
-is set to
-.Dv SIZE_MAX
-(stdint.h is already included by grapheme.h) the string
-.Va str
-is interpreted to be NUL-terminated and processing stops when a
-NUL-byte is encountered.
-.Pp
-For non-UTF-8 input data
-.Xr grapheme_is_character_break 3
-can be used instead.
-.Sh RETURN VALUES
-The
-.Fn grapheme_next_character_break_utf8
-function returns the offset (in bytes) to the next grapheme cluster
-break in
-.Va str
-or 0 if
-.Va str
-is
-.Dv NULL .
-.Sh EXAMPLES
-.Bd -literal
-/* cc (-static) -o example example.c -lgrapheme */
-#include <grapheme.h>
-#include <stdint.h>
-#include <stdio.h>
-
-int
-main(void)
-{
- /* UTF-8 encoded input */
- char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0"
- "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
- "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
- "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
- size_t ret, len, off;
-
- printf("Input: \\"%s\\"\\n", s);
-
- /* print each grapheme cluster with byte-length */
- printf("Grapheme clusters in NUL-delimited input:\\n");
- for (off = 0; s[off] != '\\0'; off += ret) {
- ret = grapheme_next_character_break_utf8(s + off, SIZE_MAX);
- printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
- }
- printf("\\n");
-
- /* do the same, but this time string is length-delimited */
- len = 17;
- printf("Grapheme clusters in input delimited to %zu bytes:\\n", len);
- for (off = 0; off < len; off += ret) {
- ret = grapheme_next_character_break_utf8(s + off, len - off);
- printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
- }
-
- return 0;
-}
-.Ed
-.Sh SEE ALSO
-.Xr grapheme_is_character_break 3 ,
-.Xr libgrapheme 7
-.Sh STANDARDS
-.Fn grapheme_next_character_break_utf8
-is compliant with the Unicode 14.0.0 specification.
-.Sh AUTHORS
-.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/man/grapheme_next_character_break_utf8.sh b/man/grapheme_next_character_break_utf8.sh
@@ -0,0 +1,4 @@
+ENCODING="utf8" \
+TYPE="character" \
+REALTYPE="grapheme cluster" \
+ $SH man/template/next_break.sh
diff --git a/man/grapheme_next_line_break.sh b/man/grapheme_next_line_break.sh
@@ -0,0 +1,4 @@
+ENCODING="codepoint" \
+TYPE="line" \
+REALTYPE="possible line" \
+ $SH man/template/next_break.sh
diff --git a/man/grapheme_next_line_break_utf8.sh b/man/grapheme_next_line_break_utf8.sh
@@ -0,0 +1,4 @@
+ENCODING="utf8" \
+TYPE="line" \
+REALTYPE="possible line" \
+ $SH man/template/next_break.sh
diff --git a/man/grapheme_next_sentence_break.sh b/man/grapheme_next_sentence_break.sh
@@ -0,0 +1,4 @@
+ENCODING="codepoint" \
+TYPE="sentence" \
+REALTYPE="sentence" \
+ $SH man/template/next_break.sh
diff --git a/man/grapheme_next_sentence_break_utf8.sh b/man/grapheme_next_sentence_break_utf8.sh
@@ -0,0 +1,4 @@
+ENCODING="utf8" \
+TYPE="sentence" \
+REALTYPE="sentence" \
+ $SH man/template/next_break.sh
diff --git a/man/grapheme_next_word_break.sh b/man/grapheme_next_word_break.sh
@@ -0,0 +1,4 @@
+ENCODING="codepoint" \
+TYPE="word" \
+REALTYPE="word" \
+ $SH man/template/next_break.sh
diff --git a/man/grapheme_next_word_break_utf8.sh b/man/grapheme_next_word_break_utf8.sh
@@ -0,0 +1,4 @@
+ENCODING="utf8" \
+TYPE="word" \
+REALTYPE="word" \
+ $SH man/template/next_break.sh
diff --git a/man/grapheme_to_lowercase.sh b/man/grapheme_to_lowercase.sh
@@ -0,0 +1,3 @@
+ENCODING="codepoint" \
+CASE="lowercase" \
+ $SH man/template/to_case.sh
diff --git a/man/grapheme_to_lowercase_utf8.sh b/man/grapheme_to_lowercase_utf8.sh
@@ -0,0 +1,3 @@
+ENCODING="utf8" \
+CASE="lowercase" \
+ $SH man/template/to_case.sh
diff --git a/man/grapheme_to_titlecase.sh b/man/grapheme_to_titlecase.sh
@@ -0,0 +1,3 @@
+ENCODING="codepoint" \
+CASE="titlecase" \
+ $SH man/template/to_case.sh
diff --git a/man/grapheme_to_titlecase_utf8.sh b/man/grapheme_to_titlecase_utf8.sh
@@ -0,0 +1,3 @@
+ENCODING="utf8" \
+CASE="titlecase" \
+ $SH man/template/to_case.sh
diff --git a/man/grapheme_to_uppercase.sh b/man/grapheme_to_uppercase.sh
@@ -0,0 +1,3 @@
+ENCODING="codepoint" \
+CASE="uppercase" \
+ $SH man/template/to_case.sh
diff --git a/man/grapheme_to_uppercase_utf8.sh b/man/grapheme_to_uppercase_utf8.sh
@@ -0,0 +1,3 @@
+ENCODING="utf8" \
+CASE="uppercase" \
+ $SH man/template/to_case.sh
diff --git a/man/libgrapheme.7 b/man/libgrapheme.7
@@ -1,140 +0,0 @@
-.Dd 2021-12-22
-.Dt LIBGRAPHEME 7
-.Os suckless.org
-.Sh NAME
-.Nm libgrapheme
-.Nd unicode string library
-.Sh SYNOPSIS
-.In grapheme.h
-.Sh DESCRIPTION
-The
-.Nm
-library provides functions to properly handle Unicode strings according
-to the Unicode specification.
-Unicode strings are made up of user-perceived characters (so-called
-.Dq grapheme clusters ,
-see
-.Sx MOTIVATION )
-that are made up of one or more Unicode codepoints, which in turn
-are encoded in one or more bytes in an encoding like UTF-8.
-.Pp
-There is a widespread misconception that it was enough to simply
-determine codepoints in a string and treat them as user-perceived
-characters to be Unicode compliant.
-While this may work in some cases, this assumption quickly breaks,
-especially for non-Western languages and decomposed Unicode strings
-where user-perceived characters are usually represented using multiple
-codepoints.
-.Pp
-Despite this complicated multilevel structure of Unicode strings,
-.Nm
-provides methods to work with them at the byte-level (i.e. UTF-8
-.Sq char
-arrays) while also offering codepoint-level methods.
-.Pp
-Every documented function's manual page provides a self-contained
-example illustrating the possible usage.
-.Sh SEE ALSO
-.Xr grapheme_decode_utf8 3 ,
-.Xr grapheme_encode_utf8 3 ,
-.Xr grapheme_is_character_break 3 ,
-.Xr grapheme_next_character_break 3
-.Sh STANDARDS
-.Nm
-is compliant with the Unicode 14.0.0 specification.
-.Sh MOTIVATION
-The idea behind every character encoding scheme like ASCII or Unicode
-is to express abstract characters (which can be thought of as shapes
-making up a written language). ASCII for instance, which comprises the
-range 0 to 127, assigns the number 65 (0x41) to the abstract character
-.Sq A .
-This number is called a
-.Dq codepoint ,
-and all codepoints of an encoding make up its so-called
-.Dq code space .
-.Pp
-Unicode's code space is much larger, ranging from 0 to 0x10FFFF, but its
-first 128 codepoints are identical to ASCII's. The additional code
-points are needed as Unicode's goal is to express all writing systems
-of the world.
-To give an example, the abstract character
-.Sq \[u00C4]
-is not expressable in ASCII, given no ASCII codepoint has been assigned
-to it.
-It can be expressed in Unicode, though, with the codepoint 196 (0xC4).
-.Pp
-One may assume that this process is straightfoward, but as more and
-more codepoints were assigned to abstract characters, the Unicode
-Consortium (that defines the Unicode standard) was facing a problem:
-Many (mostly non-European) languages have such a large amount of
-abstract characters that it would exhaust the available Unicode code
-space if one tried to assign a codepoint to each abstract character.
-The solution to that problem is best introduced with an example: Consider
-the abstract character
-.Sq \[u01DE] ,
-which is
-.Sq A
-with an umlaut and a macron added to it.
-In this sense, one can consider
-.Sq \[u01DE]
-as a two-fold modification (namely
-.Dq add umlaut
-and
-.Dq add macron )
-of the
-.Dq base character
-.Sq A .
-.Pp
-The Unicode Consortium adapted this idea by assigning codepoints to
-modifications.
-For example, the codepoint 0x308 represents adding an umlaut and 0x304
-represents adding a macron, and thus, the codepoint sequence
-.Dq 0x41 0x308 0x304 ,
-namely the base character
-.Sq A
-followed by the umlaut and macron modifiers, represents the abstract
-character
-.Sq \[u01DE] .
-As a side-note, the single codepoint 0x1DE was also assigned to
-.Sq \[u01DE] ,
-which is a good example for the fact that there can be multiple
-representations of a single abstract character in Unicode.
-.Pp
-Expressing a single abstract character with multiple codepoints solved
-the code space exhaustion-problem, and the concept has been greatly
-expanded since its first introduction (emojis, joiners, etc.). A sequence
-(which can also have the length 1) of codepoints that belong together
-this way and represents an abstract character is called a
-.Dq grapheme cluster .
-.Pp
-In many applications it is necessary to count the number of
-user-perceived characters, i.e. grapheme clusters, in a string.
-A good example for this is a terminal text editor, which needs to
-properly align characters on a grid.
-This is pretty simple with ASCII-strings, where you just count the number
-of bytes (as each byte is a codepoint and each codepoint is a grapheme
-cluster).
-With Unicode-strings, it is a common mistake to simply adapt the
-ASCII-approach and count the number of code points.
-This is wrong, as, for example, the sequence
-.Dq 0x41 0x308 0x304 ,
-while made up of 3 codepoints, is a single grapheme cluster and
-represents the user-perceived character
-.Sq \[u01DE] .
-.Pp
-The proper way to segment a string into user-perceived characters
-is to segment it into its grapheme clusters by applying the Unicode
-grapheme cluster breaking algorithm (UAX #29).
-It is based on a complex ruleset and lookup-tables and determines if a
-grapheme cluster ends or is continued between two codepoints.
-Libraries like ICU and libunistring, which also offer this functionality,
-are often bloated, not correct, difficult to use or not reasonably
-statically linkable.
-.Pp
-Analogously, the standard provides algorithms to separate strings by
-words, sentences and lines, convert cases and compare strings.
-The motivation behind
-.Nm
-is to make unicode handling suck less and abide by the UNIX philosophy.
-.Sh AUTHORS
-.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/man/libgrapheme.sh b/man/libgrapheme.sh
@@ -0,0 +1,167 @@
+cat << EOF
+.Dd ${MAN_DATE}
+.Dt LIBGRAPHEME 7
+.Os suckless.org
+.Sh NAME
+.Nm libgrapheme
+.Nd unicode string library
+.Sh SYNOPSIS
+.In grapheme.h
+.Sh DESCRIPTION
+The
+.Nm
+library provides functions to properly handle Unicode strings according
+to the Unicode specification in regard to character, word, sentence and
+line segmentation and case detection and conversion.
+.Pp
+Unicode strings are made up of user-perceived characters (so-called
+.Dq grapheme clusters ,
+see
+.Sx MOTIVATION )
+that are composed of one or more Unicode codepoints, which in turn
+are encoded in one or more bytes in an encoding like UTF-8.
+.Pp
+There is a widespread misconception that it is enough to simply
+determine codepoints in a string and treat them as user-perceived
+characters to be Unicode compliant.
+While this may work in some cases, this assumption quickly breaks,
+especially for non-Western languages and decomposed Unicode strings
+where user-perceived characters are usually represented using multiple
+codepoints.
+.Pp
+Despite this complicated multilevel structure of Unicode strings,
+.Nm
+provides methods to work with them at the byte-level (i.e. UTF-8
+.Sq char
+arrays) while also offering codepoint-level methods.
+Additionally, it is a
+.Dq freestanding
+library (see ISO/IEC 9899:1999 section 4.6) and thus does not depend on
+a standard library. This makes it easy to use in bare metal environments.
+.Pp
+Every documented function's manual page provides a self-contained
+example illustrating the possible usage.
+.Sh SEE ALSO
+.Xr grapheme_decode_utf8 3 ,
+.Xr grapheme_encode_utf8 3 ,
+.Xr grapheme_is_character_break 3 ,
+.Xr grapheme_is_lowercase 3 ,
+.Xr grapheme_is_lowercase_utf8 3 ,
+.Xr grapheme_is_titlecase 3 ,
+.Xr grapheme_is_titlecase_utf8 3 ,
+.Xr grapheme_is_uppercase 3 ,
+.Xr grapheme_is_uppercase_utf8 3 ,
+.Xr grapheme_next_character_break 3 ,
+.Xr grapheme_next_character_break_utf8 3 ,
+.Xr grapheme_next_line_break 3 ,
+.Xr grapheme_next_line_break_utf8 3 ,
+.Xr grapheme_next_sentence_break 3 ,
+.Xr grapheme_next_sentence_break_utf8 3 ,
+.Xr grapheme_next_word_break 3 ,
+.Xr grapheme_next_word_break_utf8 3 ,
+.Xr grapheme_to_lowercase 3 ,
+.Xr grapheme_to_lowercase_utf8 3 ,
+.Xr grapheme_to_titlecase 3 ,
+.Xr grapheme_to_titlecase_utf8 3 ,
+.Xr grapheme_to_uppercase 3 ,
+.Xr grapheme_to_uppercase_utf8 3
+.Sh STANDARDS
+.Nm
+is compliant with the Unicode ${UNICODE_VERSION} specification.
+.Sh MOTIVATION
+The idea behind every character encoding scheme like ASCII or Unicode
+is to express abstract characters (which can be thought of as shapes
+making up a written language). ASCII for instance, which comprises the
+range 0 to 127, assigns the number 65 (0x41) to the abstract character
+.Sq A .
+This number is called a
+.Dq codepoint ,
+and all codepoints of an encoding make up its so-called
+.Dq code space .
+.Pp
+Unicode's code space is much larger, ranging from 0 to 0x10FFFF, but its
+first 128 codepoints are identical to ASCII's. The additional code
+points are needed as Unicode's goal is to express all writing systems
+of the world.
+To give an example, the abstract character
+.Sq \[u00C4]
+is not expressible in ASCII, given no ASCII codepoint has been assigned
+to it.
+It can be expressed in Unicode, though, with the codepoint 196 (0xC4).
+.Pp
+One may assume that this process is straightforward, but as more and
+more codepoints were assigned to abstract characters, the Unicode
+Consortium (that defines the Unicode standard) was facing a problem:
+Many (mostly non-European) languages have such a large amount of
+abstract characters that it would exhaust the available Unicode code
+space if one tried to assign a codepoint to each abstract character.
+The solution to that problem is best introduced with an example: Consider
+the abstract character
+.Sq \[u01DE] ,
+which is
+.Sq A
+with an umlaut and a macron added to it.
+In this sense, one can consider
+.Sq \[u01DE]
+as a two-fold modification (namely
+.Dq add umlaut
+and
+.Dq add macron )
+of the
+.Dq base character
+.Sq A .
+.Pp
+The Unicode Consortium adapted this idea by assigning codepoints to
+modifications.
+For example, the codepoint 0x308 represents adding an umlaut and 0x304
+represents adding a macron, and thus, the codepoint sequence
+.Dq 0x41 0x308 0x304 ,
+namely the base character
+.Sq A
+followed by the umlaut and macron modifiers, represents the abstract
+character
+.Sq \[u01DE] .
+As a side-note, the single codepoint 0x1DE was also assigned to
+.Sq \[u01DE] ,
+which is a good example for the fact that there can be multiple
+representations of a single abstract character in Unicode.
+.Pp
+Expressing a single abstract character with multiple codepoints solved
+the code space exhaustion-problem, and the concept has been greatly
+expanded since its first introduction (emojis, joiners, etc.). A sequence
+(which can also have the length 1) of codepoints that belong together
+this way and represents an abstract character is called a
+.Dq grapheme cluster .
+.Pp
+In many applications it is necessary to count the number of
+user-perceived characters, i.e. grapheme clusters, in a string.
+A good example for this is a terminal text editor, which needs to
+properly align characters on a grid.
+This is pretty simple with ASCII-strings, where you just count the number
+of bytes (as each byte is a codepoint and each codepoint is a grapheme
+cluster).
+With Unicode-strings, it is a common mistake to simply adapt the
+ASCII-approach and count the number of code points.
+This is wrong, as, for example, the sequence
+.Dq 0x41 0x308 0x304 ,
+while made up of 3 codepoints, is a single grapheme cluster and
+represents the user-perceived character
+.Sq \[u01DE] .
+.Pp
+The proper way to segment a string into user-perceived characters
+is to segment it into its grapheme clusters by applying the Unicode
+grapheme cluster breaking algorithm (UAX #29).
+It is based on a complex ruleset and lookup-tables and determines if a
+grapheme cluster ends or is continued between two codepoints.
+Libraries like ICU and libunistring, which also offer this functionality,
+are often bloated, not correct, difficult to use or not reasonably
+statically linkable.
+.Pp
+Analogously, the standard provides algorithms to separate strings by
+words, sentences and lines, convert cases and compare strings.
+The motivation behind
+.Nm
+is to make unicode handling suck less and abide by the UNIX philosophy.
+.Sh AUTHORS
+.An Laslo Hunhold Aq Mt dev@frign.de
+EOF
diff --git a/man/template/is_case.sh b/man/template/is_case.sh
@@ -0,0 +1,72 @@
+# Template for the grapheme_is_${CASE}(3) manual pages; writes mdoc source
+# to standard output. Reads the variables ENCODING ("utf8" selects the
+# UTF-8 variant, any other value the codepoint variant), CASE, MAN_DATE
+# and UNICODE_VERSION. The tr character classes are quoted so the shell
+# cannot expand them as glob patterns.
+if [ "$ENCODING" = "utf8" ]; then
+ UNIT="byte"
+ ARRAYTYPE="UTF-8-encoded string"
+ SUFFIX="_utf8"
+ ANTISUFFIX=""
+ DATATYPE="char"
+else
+ UNIT="codepoint"
+ ARRAYTYPE="codepoint array"
+ SUFFIX=""
+ ANTISUFFIX="_utf8"
+ DATATYPE="uint_least32_t"
+fi
+
+cat << EOF
+.Dd ${MAN_DATE}
+.Dt GRAPHEME_IS_$(printf "%s%s" "$CASE" "$SUFFIX" | tr '[:lower:]' '[:upper:]') 3
+.Os suckless.org
+.Sh NAME
+.Nm grapheme_is_${CASE}${SUFFIX}
+.Nd check if ${ARRAYTYPE} is ${CASE}
+.Sh SYNOPSIS
+.In grapheme.h
+.Ft bool
+.Fn grapheme_is_${CASE}${SUFFIX} "const ${DATATYPE} *str" "size_t len" "size_t *caselen"
+.Sh DESCRIPTION
+The
+.Fn grapheme_is_${CASE}${SUFFIX}
+function checks if the ${ARRAYTYPE}
+.Va str
+is ${CASE} and writes the length of the matching ${CASE}-sequence to the integer pointed to by
+.Va caselen ,
+unless
+.Va caselen
+is set to
+.Dv NULL .
+.Pp
+If
+.Va len
+is set to
+.Dv SIZE_MAX
+(stdint.h is already included by grapheme.h) the ${ARRAYTYPE}
+.Va str
+is interpreted to be NUL-terminated and processing stops when a
+NUL-byte is encountered.
+.Pp
+For $(if [ "$ENCODING" != "utf8" ]; then printf "UTF-8-encoded"; else printf "non-UTF-8"; fi) input data
+.Xr grapheme_is_${CASE}${ANTISUFFIX} 3
+can be used instead.
+.Sh RETURN VALUES
+The
+.Fn grapheme_is_${CASE}${SUFFIX}
+function returns
+.Dv true
+if the ${ARRAYTYPE}
+.Va str
+is ${CASE}, otherwise
+.Dv false .
+.Sh SEE ALSO
+.Xr grapheme_is_${CASE}${ANTISUFFIX} 3 ,
+.Xr libgrapheme 7
+.Sh STANDARDS
+.Fn grapheme_is_${CASE}${SUFFIX}
+is compliant with the Unicode ${UNICODE_VERSION} specification.
+.Sh AUTHORS
+.An Laslo Hunhold Aq Mt dev@frign.de
+EOF
diff --git a/man/template/next_break.sh b/man/template/next_break.sh
@@ -0,0 +1,117 @@
+# Template for the grapheme_next_${TYPE}_break(3) manual pages; writes mdoc
+# source to standard output. Reads the variables ENCODING ("utf8" selects
+# the UTF-8 variant, any other value the codepoint variant), TYPE,
+# REALTYPE, MAN_DATE and UNICODE_VERSION. The tr character classes are
+# quoted so the shell cannot expand them as glob patterns.
+if [ "$ENCODING" = "utf8" ]; then
+ UNIT="byte"
+ SUFFIX="_utf8"
+ ANTISUFFIX=""
+else
+ UNIT="codepoint"
+ SUFFIX=""
+ ANTISUFFIX="_utf8"
+fi
+
+cat << EOF
+.Dd ${MAN_DATE}
+.Dt GRAPHEME_NEXT_$(printf "%s_break%s" "$TYPE" "$SUFFIX" | tr '[:lower:]' '[:upper:]') 3
+.Os suckless.org
+.Sh NAME
+.Nm grapheme_next_${TYPE}_break${SUFFIX}
+.Nd determine ${UNIT}-offset to next ${REALTYPE} break
+.Sh SYNOPSIS
+.In grapheme.h
+.Ft size_t
+.Fn grapheme_next_${TYPE}_break${SUFFIX} "const $(if [ "$ENCODING" = "utf8" ]; then printf "char"; else printf "uint_least32_t"; fi) *str" "size_t len"
+.Sh DESCRIPTION
+The
+.Fn grapheme_next_${TYPE}_break${SUFFIX}
+function computes the offset (in ${UNIT}s) to the next ${REALTYPE}
+break (see
+.Xr libgrapheme 7 )
+in the $(if [ "$ENCODING" = "utf8" ]; then printf "UTF-8-encoded string"; else printf "codepoint array"; fi)
+.Va str
+of length
+.Va len .$(if [ "$TYPE" != "line" ]; then printf "\nIf a ${REALTYPE} begins at
+.Va str
+this offset is equal to the length of said ${REALTYPE}."; fi)
+.Pp
+If
+.Va len
+is set to
+.Dv SIZE_MAX
+(stdint.h is already included by grapheme.h) the string
+.Va str
+is interpreted to be NUL-terminated and processing stops when
+a $(if [ "$ENCODING" = "utf8" ]; then printf "NUL-byte"; else printf "codepoint with the value 0"; fi) is encountered.
+.Pp
+For $(if [ "$ENCODING" != "utf8" ]; then printf "UTF-8-encoded"; else printf "non-UTF-8"; fi) input
+data$(if [ "$TYPE" = "character" ] && [ "$ENCODING" = "utf8" ]; then printf "\n.Xr grapheme_is_character_break 3 and"; fi)
+.Xr grapheme_next_${TYPE}_break${ANTISUFFIX} 3
+can be used instead.
+.Sh RETURN VALUES
+The
+.Fn grapheme_next_${TYPE}_break${SUFFIX}
+function returns the offset (in ${UNIT}s) to the next ${REALTYPE}
+break in
+.Va str
+or 0 if
+.Va str
+is
+.Dv NULL .
+EOF
+
+if [ "$ENCODING" = "utf8" ]; then
+cat << EOF
+.Sh EXAMPLES
+.Bd -literal
+/* cc (-static) -o example example.c -lgrapheme */
+#include <grapheme.h>
+#include <stdint.h>
+#include <stdio.h>
+
+int
+main(void)
+{
+ /* UTF-8 encoded input */
+ char *s = "T\\\\xC3\\\\xABst \\\\xF0\\\\x9F\\\\x91\\\\xA8\\\\xE2\\\\x80\\\\x8D\\\\xF0"
+ "\\\\x9F\\\\x91\\\\xA9\\\\xE2\\\\x80\\\\x8D\\\\xF0\\\\x9F\\\\x91\\\\xA6 \\\\xF0"
+ "\\\\x9F\\\\x87\\\\xBA\\\\xF0\\\\x9F\\\\x87\\\\xB8 \\\\xE0\\\\xA4\\\\xA8\\\\xE0"
+ "\\\\xA5\\\\x80 \\\\xE0\\\\xAE\\\\xA8\\\\xE0\\\\xAE\\\\xBF!";
+ size_t ret, len, off;
+
+ printf("Input: \\\\"%s\\\\"\\\\n", s);
+
+ /* print each ${REALTYPE} with byte-length */
+ printf("${REALTYPE}s in NUL-delimited input:\\\\n");
+ for (off = 0; s[off] != '\\\\0'; off += ret) {
+ ret = grapheme_next_${TYPE}_break_utf8(s + off, SIZE_MAX);
+ printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off);
+ }
+ printf("\\\\n");
+
+ /* do the same, but this time string is length-delimited */
+ len = 17;
+ printf("${REALTYPE}s in input delimited to %zu bytes:\\\\n", len);
+ for (off = 0; off < len; off += ret) {
+ ret = grapheme_next_${TYPE}_break_utf8(s + off, len - off);
+ printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off);
+ }
+
+ return 0;
+}
+.Ed
+EOF
+fi
+
+cat << EOF
+.Sh SEE ALSO$(if [ "$TYPE" = "character" ] && [ "$ENCODING" != "utf8" ]; then printf "\n.Xr grapheme_is_character_break 3 ,"; fi)
+.Xr grapheme_next_${TYPE}_break${ANTISUFFIX} 3 ,
+.Xr libgrapheme 7
+.Sh STANDARDS
+.Fn grapheme_next_${TYPE}_break${SUFFIX}
+is compliant with the Unicode ${UNICODE_VERSION} specification.
+.Sh AUTHORS
+.An Laslo Hunhold Aq Mt dev@frign.de
+EOF
diff --git a/man/template/to_case.sh b/man/template/to_case.sh
@@ -0,0 +1,77 @@
+# Template for the grapheme_to_${CASE}(3) manual pages; writes mdoc source
+# to standard output. Reads the variables ENCODING ("utf8" selects the
+# UTF-8 variant, any other value the codepoint variant), CASE, MAN_DATE
+# and UNICODE_VERSION. The tr character classes are quoted so the shell
+# cannot expand them as glob patterns.
+if [ "$ENCODING" = "utf8" ]; then
+ UNIT="byte"
+ ARRAYTYPE="UTF-8-encoded string"
+ SUFFIX="_utf8"
+ ANTISUFFIX=""
+ DATATYPE="char"
+else
+ UNIT="codepoint"
+ ARRAYTYPE="codepoint array"
+ SUFFIX=""
+ ANTISUFFIX="_utf8"
+ DATATYPE="uint_least32_t"
+fi
+
+cat << EOF
+.Dd ${MAN_DATE}
+.Dt GRAPHEME_TO_$(printf "%s%s" "$CASE" "$SUFFIX" | tr '[:lower:]' '[:upper:]') 3
+.Os suckless.org
+.Sh NAME
+.Nm grapheme_to_${CASE}${SUFFIX}
+.Nd convert ${ARRAYTYPE} to ${CASE}
+.Sh SYNOPSIS
+.In grapheme.h
+.Ft size_t
+.Fn grapheme_to_${CASE}${SUFFIX} "const ${DATATYPE} *src" "size_t srclen" "${DATATYPE} *dest" "size_t destlen"
+.Sh DESCRIPTION
+The
+.Fn grapheme_to_${CASE}${SUFFIX}
+function converts the ${ARRAYTYPE}
+.Va src
+to ${CASE} and writes the result to
+.Va dest
+up to
+.Va destlen ,
+unless
+.Va dest
+is set to
+.Dv NULL .
+.Pp
+If
+.Va srclen
+is set to
+.Dv SIZE_MAX
+(stdint.h is already included by grapheme.h) the ${ARRAYTYPE}
+.Va src
+is interpreted to be NUL-terminated and processing stops when a
+NUL-byte is encountered.
+.Pp
+For $(if [ "$ENCODING" != "utf8" ]; then printf "UTF-8-encoded"; else printf "non-UTF-8"; fi) input data
+.Xr grapheme_to_${CASE}${ANTISUFFIX} 3
+can be used instead.
+.Sh RETURN VALUES
+The
+.Fn grapheme_to_${CASE}${SUFFIX}
+function returns the number of ${UNIT}s in the array resulting
+from converting
+.Va src
+to ${CASE}, even if
+.Va destlen
+is not large enough or
+.Va dest
+is
+.Dv NULL .
+.Sh SEE ALSO
+.Xr grapheme_to_${CASE}${ANTISUFFIX} 3 ,
+.Xr libgrapheme 7
+.Sh STANDARDS
+.Fn grapheme_to_${CASE}${SUFFIX}
+is compliant with the Unicode ${UNICODE_VERSION} specification.
+.Sh AUTHORS
+.An Laslo Hunhold Aq Mt dev@frign.de
+EOF
diff --git a/src/case.c b/src/case.c
@@ -1,4 +1,5 @@
/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
#include <stdint.h>
#include "../grapheme.h"
@@ -8,9 +9,9 @@
static inline enum case_property
get_case_property(uint_least32_t cp)
{
- if (likely(cp <= 0x10FFFF)) {
+ if (likely(cp <= UINT32_C(0x10FFFF))) {
return (enum case_property)
- case_minor[case_major[cp >> 8] + (cp & 0xff)];
+ case_minor[case_major[cp >> 8] + (cp & 0xFF)];
} else {
return CASE_PROP_OTHER;
}
@@ -20,35 +21,31 @@ static inline int_least32_t
get_case_offset(uint_least32_t cp, const uint_least16_t *major,
const int_least32_t *minor)
{
- if (likely(cp <= 0x10FFFF)) {
+ if (likely(cp <= UINT32_C(0x10FFFF))) {
/*
* this value might be larger than or equal to 0x110000
* for the special-case-mapping. This needs to be handled
* separately
*/
- return minor[major[cp >> 8] + (cp & 0xff)];
+ return minor[major[cp >> 8] + (cp & 0xFF)];
} else {
return 0;
}
}
static inline size_t
-to_case(const void *src, size_t srclen, void *dest, size_t destlen,
- size_t srcnumprocess, uint_least8_t final_sigma_level,
- size_t (*get_codepoint)(const void *, size_t, size_t, uint_least32_t *),
- size_t (*set_codepoint)(uint_least32_t, void *, size_t, size_t),
- const uint_least16_t *major, const int_least32_t *minor,
- const struct special_case *sc)
+to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
+ uint_least8_t final_sigma_level, const uint_least16_t *major,
+ const int_least32_t *minor, const struct special_case *sc)
{
+ HERODOTUS_READER tmp;
enum case_property prop;
- size_t srcoff, destoff, res, tmp, off, i;
+ enum herodotus_status s;
+ size_t off, i;
uint_least32_t cp, tmp_cp;
int_least32_t map;
- for (srcoff = 0, destoff = 0; srcoff < srcnumprocess; srcoff += res) {
- /* read in next source codepoint */
- res = get_codepoint((const char *)src, srclen, srcoff, &cp);
-
+ for (; herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;) {
if (sc == lower_special) {
/*
* For the special Final_Sigma-rule (see SpecialCasing.txt),
@@ -72,8 +69,10 @@ to_case(const void *src, size_t srclen, void *dest, size_t destlen,
* if the succeeding character is cased, invalidating
* the after-condition
*/
- for (tmp = srcoff + res, prop = NUM_CASE_PROPS; tmp < srclen; ) {
- tmp += get_codepoint(src, srclen, tmp, &tmp_cp);
+ herodotus_reader_copy(r, &tmp);
+ for (prop = NUM_CASE_PROPS;
+ (s = herodotus_read_codepoint(&tmp, true, &tmp_cp)) ==
+ HERODOTUS_STATUS_SUCCESS; ) {
prop = get_case_property(tmp_cp);
if (prop != CASE_PROP_CASE_IGNORABLE &&
@@ -83,20 +82,19 @@ to_case(const void *src, size_t srclen, void *dest, size_t destlen,
}
/*
- * Now prop is something other than case-ignorable.
+ * Now prop is something other than case-ignorable or
+ * the source-string ended.
* If it is something other than cased, we know
* that the after-condition holds
*/
- if (prop != CASE_PROP_CASED &&
- prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
+ if (s != HERODOTUS_STATUS_SUCCESS ||
+ (prop != CASE_PROP_CASED &&
+ prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
/*
* write GREEK SMALL LETTER FINAL SIGMA to
* destination
*/
- destoff += set_codepoint(UINT32_C(0x03C2),
- dest,
- destlen,
- destoff);
+ herodotus_write_codepoint(w, UINT32_C(0x03C2));
/* reset Final_Sigma-state and continue */
final_sigma_level = 0;
@@ -132,208 +130,176 @@ to_case(const void *src, size_t srclen, void *dest, size_t destlen,
off = (uint_least32_t)map - UINT32_C(0x110000);
for (i = 0; i < sc[off].cplen; i++) {
- if (likely(destoff < destlen)) {
- /*
- * write special mapping to destination
- */
- destoff += set_codepoint(sc[off].cp[i],
- dest,
- destlen,
- destoff);
- } else {
- /*
- * further increase destoff to indicate
- * how much buffer space we need
- */
- destoff += set_codepoint(sc[off].cp[i],
- NULL, 0, 0);
- }
+ herodotus_write_codepoint(w, sc[off].cp[i]);
}
} else {
/* we have a simple mapping */
- if (likely(destoff < destlen)) {
- destoff += set_codepoint((uint_least32_t)((int_least32_t)cp + map),
- dest, destlen, destoff);
- } else {
- destoff += set_codepoint((uint_least32_t)((int_least32_t)cp + map),
- NULL, 0, 0);
- }
+ herodotus_write_codepoint(w, (uint_least32_t)
+ ((int_least32_t)cp + map));
}
}
- if (set_codepoint == set_codepoint_utf8 && destlen > 0) {
- /*
- * NUL-terminate destination to always ensure NUL-termination,
- * unless in check mode.
- * Just like with snprintf() a return value >= destlen indicates
- * truncation.
- */
- ((char *)dest)[(destoff < destlen) ? destoff : (destlen - 1)] = '\0';
- }
+ herodotus_writer_nul_terminate(w);
- return destoff;
+ return herodotus_writer_number_written(w);
+}
+
+static size_t
+herodotus_next_word_break(const HERODOTUS_READER *r)
+{
+ HERODOTUS_READER tmp;
+
+ herodotus_reader_copy(r, &tmp);
+
+ if (r->type == HERODOTUS_TYPE_CODEPOINT) {
+ return grapheme_next_word_break(tmp.src, tmp.srclen);
+ } else { /* r->type == HERODOTUS_TYPE_UTF8 */
+ return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
+ }
}
static inline size_t
-to_titlecase(const void *src, size_t srclen, void *dest, size_t destlen,
- size_t (*get_codepoint)(const void *, size_t, size_t, uint_least32_t *),
- size_t (*set_codepoint)(uint_least32_t, void *, size_t, size_t))
+to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
{
enum case_property prop;
- size_t next_wb, srcoff, destoff, res;
+ enum herodotus_status s;
uint_least32_t cp;
+ size_t nwb;
- for (srcoff = destoff = 0; ; ) {
- if (get_codepoint == get_codepoint_utf8) {
- if ((next_wb = grapheme_next_word_break_utf8((const char *)src + srcoff,
- srclen - srcoff)) == 0) {
- /* we consumed all of the string */
- break;
- }
- } else {
- if ((next_wb = grapheme_next_word_break((const uint_least32_t *)src + srcoff,
- srclen - srcoff)) == 0) {
- /* we consumed all of the string */
- break;
- }
- }
-
- for (; next_wb > 0 && srcoff < srclen; next_wb -= res, srcoff += res) {
+ for (; (nwb = herodotus_next_word_break(r)) > 0;) {
+ herodotus_reader_push_advance_limit(r, nwb);
+ for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
/* check if we have a cased character */
- res = get_codepoint(src, srclen, srcoff, &cp);
prop = get_case_property(cp);
if (prop == CASE_PROP_CASED ||
prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
break;
} else {
/* write the data to the output verbatim, it if permits */
- destoff += set_codepoint_utf8(cp, dest, destlen, destoff);
- }
- }
+ herodotus_write_codepoint(w, cp);
- if (next_wb > 0) {
- /* get character length */
- res = get_codepoint(src, srclen, srcoff, &cp);
-
- /* we have a cased character at srcoff, map it to titlecase */
- if (get_codepoint == get_codepoint_utf8) {
- destoff += to_case((const char *)src + srcoff,
- srclen - srcoff,
- (char *)dest + destoff,
- (destoff < destlen) ? (destlen - destoff) : 0,
- res, 0,
- get_codepoint_utf8,
- set_codepoint_utf8, title_major,
- title_minor, title_special);
- } else {
- destoff += to_case((const uint_least32_t *)src + srcoff,
- srclen - srcoff,
- (uint_least32_t *)dest + destoff,
- (destoff < destlen) ? (destlen - destoff) : 0,
- res, 0,
- get_codepoint,
- set_codepoint, title_major,
- title_minor, title_special);
+ /* increment reader */
+ herodotus_read_codepoint(r, true, &cp);
}
-
- /* we consumed a character */
- srcoff += res;
- next_wb -= res;
}
- /* cast the rest of the codepoints in the word to lowercase */
- if (get_codepoint == get_codepoint_utf8) {
- destoff += to_case((const char *)src + srcoff,
- srclen - srcoff,
- (char *)dest + destoff,
- (destoff < destlen) ? (destlen - destoff) : 0,
- next_wb, 1,
- get_codepoint_utf8,
- set_codepoint_utf8, lower_major,
- lower_minor, lower_special);
+ if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
+ /* we are done */
+ herodotus_reader_pop_limit(r);
+ break;
+ } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
+ /*
+ * we did not encounter any cased character
+ * up to the word break
+ */
+ herodotus_reader_pop_limit(r);
+ continue;
} else {
- destoff += to_case((const uint_least32_t *)src + srcoff,
- srclen - srcoff,
- (uint_least32_t *)dest + destoff,
- (destoff < destlen) ? (destlen - destoff) : 0,
- next_wb, 1,
- get_codepoint,
- set_codepoint, lower_major,
- lower_minor, lower_special);
+ /*
+ * we encountered a cased character before the word
+ * break, convert it to titlecase
+ */
+ herodotus_reader_push_advance_limit(r,
+ herodotus_reader_next_codepoint_break(r));
+ to_case(r, w, 0, title_major, title_minor, title_special);
+ herodotus_reader_pop_limit(r);
}
- srcoff += next_wb;
- }
- if (set_codepoint == set_codepoint_utf8) {
- /*
- * NUL-terminate destination to always ensure NUL-termination.
- * Just like with snprintf() a return value >= destlen indicates
- * truncation.
- */
- ((char *)dest)[(destoff < destlen) ? destoff : (destlen - 1)] = '\0';
+ /* cast the rest of the codepoints in the word to lowercase */
+ to_case(r, w, 1, lower_major, lower_minor, lower_special);
+
+ /* remove the limit on the word before the next iteration */
+ herodotus_reader_pop_limit(r);
}
- return destoff;
+ herodotus_writer_nul_terminate(w);
+
+ return herodotus_writer_number_written(w);
}
size_t
grapheme_to_uppercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
{
- return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint, set_codepoint,
- upper_major, upper_minor, upper_special);
+ HERODOTUS_READER r;
+ HERODOTUS_WRITER w;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
+ herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
+
+ return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
}
size_t
grapheme_to_lowercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
{
- return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint, set_codepoint,
- lower_major, lower_minor, lower_special);
+ HERODOTUS_READER r;
+ HERODOTUS_WRITER w;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
+ herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
+
+ return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
}
size_t
grapheme_to_titlecase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
{
- return to_titlecase(src, srclen, dest, destlen, get_codepoint,
- set_codepoint);
+ HERODOTUS_READER r;
+ HERODOTUS_WRITER w;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
+ herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
+
+ return to_titlecase(&r, &w);
}
size_t
grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
{
- return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint_utf8, set_codepoint_utf8,
- upper_major, upper_minor, upper_special);
+ HERODOTUS_READER r;
+ HERODOTUS_WRITER w;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
+ herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
+
+ return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
}
size_t
grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
{
- return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint_utf8, set_codepoint_utf8,
- lower_major, lower_minor, lower_special);
+ HERODOTUS_READER r;
+ HERODOTUS_WRITER w;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
+ herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
+ return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
}
size_t
grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
{
- return to_titlecase(src, srclen, dest, destlen, get_codepoint_utf8,
- set_codepoint_utf8);
+ HERODOTUS_READER r;
+ HERODOTUS_WRITER w;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
+ herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
+
+ return to_titlecase(&r, &w);
}
static inline bool
-is_case(const void *src, size_t srclen,
- size_t srcnumprocess,
- size_t (*get_codepoint)(const void *, size_t, size_t, uint_least32_t *),
- const uint_least16_t *major, const int_least32_t *minor,
- const struct special_case *sc, size_t *output)
+is_case(HERODOTUS_READER *r, const uint_least16_t *major,
+ const int_least32_t *minor, const struct special_case *sc,
+ size_t *output)
{
- size_t srcoff, new_srcoff, tmp, res, off, i;
- uint_least32_t cp, tmp_cp;
+ size_t off, i;
+ bool ret = true;
+ uint_least32_t cp;
int_least32_t map;
- for (srcoff = 0; srcoff < srcnumprocess; srcoff = new_srcoff) {
- /* read in next source codepoint */
- new_srcoff = srcoff + get_codepoint(src, srclen, srcoff, &cp);
-
+ for (; herodotus_read_codepoint(r, false, &cp) == HERODOTUS_STATUS_SUCCESS;) {
/* get and handle case mapping */
if (unlikely((map = get_case_offset(cp, major, minor)) >=
INT32_C(0x110000))) {
@@ -341,169 +307,164 @@ is_case(const void *src, size_t srclen,
* is the difference to 0x110000*/
off = (uint_least32_t)map - UINT32_C(0x110000);
- for (i = 0, tmp = srcoff; i < sc[off].cplen; i++, tmp += res) {
- res = get_codepoint(src, srclen, srcoff, &tmp_cp);
- if (tmp_cp != sc[off].cp[i]) {
- /* we have a difference */
- if (output) {
- *output = tmp;
+ for (i = 0; i < sc[off].cplen; i++) {
+ if (herodotus_read_codepoint(r, false, &cp) ==
+ HERODOTUS_STATUS_SUCCESS) {
+ if (cp != sc[off].cp[i]) {
+ ret = false;
+ goto done;
+ } else {
+ /* move forward */
+ herodotus_read_codepoint(r, true, &cp);
}
- return false;
+ } else {
+ /*
+ * input ended and we didn't see
+ * any difference so far, so this
+ * string is in fact okay
+ */
+ ret = true;
+ goto done;
}
}
- new_srcoff = tmp;
} else {
/* we have a simple mapping */
if (cp != (uint_least32_t)((int_least32_t)cp + map)) {
/* we have a difference */
- if (output) {
- *output = srcoff;
- }
- return false;
+ ret = false;
+ goto done;
+ } else {
+ /* move forward */
+ herodotus_read_codepoint(r, true, &cp);
}
}
}
-
+done:
if (output) {
- *output = srcoff;
+ *output = herodotus_reader_number_read(r);
}
- return true;
+ return ret;
}
static inline bool
-is_titlecase(const void *src, size_t srclen,
- size_t (*get_codepoint)(const void *, size_t, size_t, uint_least32_t *),
- size_t *output)
+is_titlecase(HERODOTUS_READER *r, size_t *output)
{
enum case_property prop;
- size_t next_wb, srcoff, res, tmp_output;
+ enum herodotus_status s;
+ bool ret = true;
uint_least32_t cp;
+ size_t nwb;
- for (srcoff = 0; ; ) {
- if (get_codepoint == get_codepoint_utf8) {
- if ((next_wb = grapheme_next_word_break_utf8((const char *)src + srcoff,
- srclen - srcoff)) == 0) {
- /* we consumed all of the string */
- break;
- }
- } else {
- if ((next_wb = grapheme_next_word_break((const uint_least32_t *)src + srcoff,
- srclen - srcoff)) == 0) {
- /* we consumed all of the string */
- break;
- }
- }
-
- for (; next_wb > 0 && srcoff < srclen; next_wb -= res, srcoff += res) {
+ for (; (nwb = herodotus_next_word_break(r)) > 0;) {
+ herodotus_reader_push_advance_limit(r, nwb);
+ for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
/* check if we have a cased character */
- res = get_codepoint(src, srclen, srcoff, &cp);
prop = get_case_property(cp);
if (prop == CASE_PROP_CASED ||
prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
break;
+ } else {
+ /* increment reader */
+ herodotus_read_codepoint(r, true, &cp);
}
}
- if (next_wb > 0) {
- /* get character length */
- res = get_codepoint(src, srclen, srcoff, &cp);
-
- /* we have a cased character at srcoff, check if it's titlecase */
- if (get_codepoint == get_codepoint_utf8) {
- if (!is_case((const char *)src + srcoff,
- srclen - srcoff, res,
- get_codepoint_utf8, title_major,
- title_minor, title_special, &tmp_output)) {
- if (output) {
- *output = srcoff + tmp_output;
- }
- return false;
- }
- } else {
- if (!is_case((const uint_least32_t *)src + srcoff,
- srclen - srcoff, res,
- get_codepoint, title_major,
- title_minor, title_special, &tmp_output)) {
- if (output) {
- *output = srcoff + tmp_output;
- }
- return false;
- }
+ if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
+ /* we are done */
+ break;
+ } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
+ /*
+ * we did not encounter any cased character
+ * up to the word break
+ */
+ herodotus_reader_pop_limit(r);
+ continue;
+ } else {
+ /*
+ * we encountered a cased character before the word
+ * break, check if it's titlecase
+ */
+ herodotus_reader_push_advance_limit(r,
+ herodotus_reader_next_codepoint_break(r));
+ if (!is_case(r, title_major, title_minor, title_special, NULL)) {
+ ret = false;
+ goto done;
}
-
- /* we consumed a character */
- srcoff += res;
- next_wb -= res;
+ herodotus_reader_pop_limit(r);
}
/* check if the rest of the codepoints in the word are lowercase */
- if (get_codepoint == get_codepoint_utf8) {
- if (!is_case((const char *)src + srcoff,
- srclen - srcoff, next_wb,
- get_codepoint_utf8, lower_major,
- lower_minor, lower_special, &tmp_output)) {
- if (output) {
- *output = srcoff + tmp_output;
- }
- return false;
- }
- } else {
- if (!is_case((const uint_least32_t *)src + srcoff,
- srclen - srcoff, next_wb,
- get_codepoint, lower_major,
- lower_minor, lower_special, &tmp_output)) {
- if (output) {
- *output = srcoff + tmp_output;
- }
- return false;
- }
+ if (!is_case(r, lower_major, lower_minor, lower_special, NULL)) {
+ ret = false;
+ goto done;
}
- srcoff += next_wb;
- }
+ /* remove the limit on the word before the next iteration */
+ herodotus_reader_pop_limit(r);
+ }
+done:
if (output) {
- *output = srcoff;
+ *output = herodotus_reader_number_read(r);
}
- return true;
+ return ret;
}
bool
grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
{
- return is_case(src, srclen, srclen, get_codepoint,
- upper_major, upper_minor, upper_special, caselen);
+ HERODOTUS_READER r;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
+
+ return is_case(&r, upper_major, upper_minor, upper_special, caselen);
}
bool
grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
{
- return is_case(src, srclen, srclen, get_codepoint,
- lower_major, lower_minor, lower_special, caselen);
+ HERODOTUS_READER r;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
+
+ return is_case(&r, lower_major, lower_minor, lower_special, caselen);
}
bool
grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *caselen)
{
- return is_titlecase(src, srclen, get_codepoint, caselen);
+ HERODOTUS_READER r;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
+
+ return is_titlecase(&r, caselen);
}
bool
grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *caselen)
{
- return is_case(src, srclen, srclen, get_codepoint_utf8,
- upper_major, upper_minor, upper_special, caselen);
+ HERODOTUS_READER r;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
+
+ return is_case(&r, upper_major, upper_minor, upper_special, caselen);
}
bool
grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *caselen)
{
- return is_case(src, srclen, srclen, get_codepoint_utf8,
- lower_major, lower_minor, lower_special, caselen);
+ HERODOTUS_READER r;
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
+
+ return is_case(&r, lower_major, lower_minor, lower_special, caselen);
}
bool
grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *caselen)
{
- return is_titlecase(src, srclen, get_codepoint_utf8, caselen);
+ HERODOTUS_READER r;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
+
+ return is_titlecase(&r, caselen);
}
diff --git a/src/character.c b/src/character.c
@@ -1,162 +1,191 @@
/* See LICENSE file for copyright and license details. */
+#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
#include "../gen/character.h"
#include "../grapheme.h"
#include "util.h"
+struct character_break_state {
+ uint_least8_t prop;
+ bool prop_set;
+ bool gb11_flag;
+ bool gb12_13_flag;
+};
+
static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
[CHAR_BREAK_PROP_OTHER] =
- UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_CR] =
- UINT16_C(1 << CHAR_BREAK_PROP_LF), /* GB3 */
+ UINT16_C(1) << CHAR_BREAK_PROP_LF, /* GB3 */
[CHAR_BREAK_PROP_EXTEND] =
- UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
- UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_HANGUL_L] =
- UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_L) | /* GB6 */
- UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V) | /* GB6 */
- UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_LV) | /* GB6 */
- UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_LVT) | /* GB6 */
- UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */
+ UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */
+ UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */
+ UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_HANGUL_V] =
- UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V) | /* GB7 */
- UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB7 */
- UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
+ UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_HANGUL_T] =
- UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB8 */
- UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_HANGUL_LV] =
- UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V) | /* GB7 */
- UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB7 */
- UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
+ UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_HANGUL_LVT] =
- UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB8 */
- UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_PREPEND] =
- UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK) | /* GB9a */
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */
(UINT16_C(0xFFFF) &
- ~(UINT16_C(1 << CHAR_BREAK_PROP_CR) |
- UINT16_C(1 << CHAR_BREAK_PROP_LF) |
- UINT16_C(1 << CHAR_BREAK_PROP_CONTROL)
+ ~(UINT16_C(1) << CHAR_BREAK_PROP_CR |
+ UINT16_C(1) << CHAR_BREAK_PROP_LF |
+ UINT16_C(1) << CHAR_BREAK_PROP_CONTROL
)
), /* GB9b */
[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
- UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_SPACINGMARK] =
- UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_ZWJ] =
- UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
};
static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
- UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) |
- UINT16_C(1 << CHAR_BREAK_PROP_EXTEND),
+ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
- UINT16_C(1 << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC),
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
[CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
- UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) |
- UINT16_C(1 << CHAR_BREAK_PROP_ZWJ),
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |
+ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ,
[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
- UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) |
- UINT16_C(1 << CHAR_BREAK_PROP_EXTEND),
+ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
};
static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
- UINT16_C(1 << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC),
+ UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
};
static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
- UINT16_C(1 << CHAR_BREAK_PROP_REGIONAL_INDICATOR),
+ UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
};
static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
[CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
- UINT16_C(1 << CHAR_BREAK_PROP_REGIONAL_INDICATOR),
+ UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
};
static inline enum char_break_property
get_break_prop(uint_least32_t cp)
{
- if (likely(cp <= 0x10FFFF)) {
+ if (likely(cp <= UINT32_C(0x10FFFF))) {
return (enum char_break_property)
- char_break_minor[char_break_major[cp >> 8] + (cp & 0xff)];
+ char_break_minor[char_break_major[cp >> 8] + (cp & 0xFF)];
} else {
return CHAR_BREAK_PROP_OTHER;
}
}
+static inline void
+state_serialize(const struct character_break_state *in, uint_least16_t *out)
+{
+ *out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 bits */
+ (uint_least16_t)(((uint_least16_t)(in->prop_set)) << 8) | /* 9th bit */
+ (uint_least16_t)(((uint_least16_t)(in->gb11_flag)) << 9) | /* 10th bit */
+ (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) << 10); /* 11th bit */
+}
+
+static inline void
+state_deserialize(uint_least16_t in, struct character_break_state *out)
+{
+ out->prop = in & UINT8_C(0xFF);
+ out->prop_set = in & (UINT16_C(1) << 8);
+ out->gb11_flag = in & (UINT16_C(1) << 9);
+ out->gb12_13_flag = in & (UINT16_C(1) << 10);
+}
+
bool
-grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STATE *state)
+grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, uint_least16_t *s)
{
+ struct character_break_state state;
enum char_break_property cp0_prop, cp1_prop;
bool notbreak = false;
- if (likely(state)) {
- if (likely(state->prop_set)) {
- cp0_prop = state->prop;
+ if (likely(s)) {
+ state_deserialize(*s, &state);
+
+ if (likely(state.prop_set)) {
+ cp0_prop = state.prop;
} else {
cp0_prop = get_break_prop(cp0);
}
cp1_prop = get_break_prop(cp1);
/* preserve prop of right codepoint for next iteration */
- state->prop = (uint_least8_t)cp1_prop;
- state->prop_set = true;
+ state.prop = (uint_least8_t)cp1_prop;
+ state.prop_set = true;
/* update flags */
- state->gb11_flag =
+ state.gb11_flag =
flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
- state->gb11_flag] &
- UINT16_C(1 << cp1_prop);
- state->gb12_13_flag =
+ state.gb11_flag] &
+ UINT16_C(1) << cp1_prop;
+ state.gb12_13_flag =
flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS *
- state->gb12_13_flag] &
- UINT16_C(1 << cp1_prop);
+ state.gb12_13_flag] &
+ UINT16_C(1) << cp1_prop;
/*
* Apply grapheme cluster breaking algorithm (UAX #29), see
* http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
*/
- notbreak = (dont_break[cp0_prop] & UINT16_C(1 << cp1_prop)) ||
- (dont_break_gb11[cp0_prop + state->gb11_flag *
+ notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
+ (dont_break_gb11[cp0_prop + state.gb11_flag *
NUM_CHAR_BREAK_PROPS] &
- UINT16_C(1 << cp1_prop)) ||
- (dont_break_gb12_13[cp0_prop + state->gb12_13_flag *
+ (UINT16_C(1) << cp1_prop)) ||
+ (dont_break_gb12_13[cp0_prop + state.gb12_13_flag *
NUM_CHAR_BREAK_PROPS] &
- UINT16_C(1 << cp1_prop));
+ (UINT16_C(1) << cp1_prop));
/* update or reset flags (when we have a break) */
if (likely(!notbreak)) {
- state->gb11_flag = state->gb12_13_flag = false;
+ state.gb11_flag = state.gb12_13_flag = false;
}
+
+ state_serialize(&state, s);
} else {
cp0_prop = get_break_prop(cp0);
cp1_prop = get_break_prop(cp1);
@@ -168,69 +197,47 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA
* Given we have no state, this behaves as if the state-booleans
* were all set to false
*/
- notbreak = (dont_break[cp0_prop] & UINT16_C(1 << cp1_prop)) ||
- (dont_break_gb11[cp0_prop] & UINT16_C(1 << cp1_prop)) ||
- (dont_break_gb12_13[cp0_prop] & UINT16_C(1 << cp1_prop));
+ notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
+ (dont_break_gb11[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
+ (dont_break_gb12_13[cp0_prop] & (UINT16_C(1) << cp1_prop));
}
return !notbreak;
}
-size_t
-grapheme_next_character_break(const uint_least32_t *str, size_t len)
+static size_t
+next_character_break(HERODOTUS_READER *r)
{
- GRAPHEME_STATE state = { 0 };
- size_t off;
-
- if (str == NULL || len == 0) {
- return 0;
- }
+ uint_least16_t state = 0;
+ uint_least32_t cp0 = 0, cp1 = 0;
- for (off = 1; off < len; off++) {
- if (grapheme_is_character_break(str[off - 1], str[off], &state)) {
+ for (herodotus_read_codepoint(r, true, &cp0);
+ herodotus_read_codepoint(r, false, &cp1) == HERODOTUS_STATUS_SUCCESS;
+ herodotus_read_codepoint(r, true, &cp0)) {
+ if (grapheme_is_character_break(cp0, cp1, &state)) {
break;
}
}
- return off;
+ return herodotus_reader_number_read(r);
}
size_t
-grapheme_next_character_break_utf8(const char *str, size_t len)
+grapheme_next_character_break(const uint_least32_t *str, size_t len)
{
- GRAPHEME_STATE state = { 0 };
- uint_least32_t cp0 = 0, cp1 = 0;
- size_t off, ret;
-
- if (str == NULL || len == 0) {
- return 0;
- }
+ HERODOTUS_READER r;
- for (off = 0; (len == SIZE_MAX) || off < len; off += ret) {
- cp0 = cp1;
- ret = grapheme_decode_utf8(str + off, (len == SIZE_MAX) ?
- SIZE_MAX : len - off, &cp1);
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
- if (len != SIZE_MAX && ret > (len - off)) {
- /* string ended abruptly, simply accept cropping */
- ret = len - off;
- }
+ return next_character_break(&r);
+}
- if (len == SIZE_MAX && cp1 == 0) {
- /* we hit a NUL-byte and are done */
- break;
- }
+size_t
+grapheme_next_character_break_utf8(const char *str, size_t len)
+{
+ HERODOTUS_READER r;
- if (off == 0) {
- /*
- * we skip the first round, as we need both
- * cp0 and cp1 to be initialized
- */
- continue;
- } else if (grapheme_is_character_break(cp0, cp1, &state)) {
- break;
- }
- }
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
- return off;
+ return next_character_break(&r);
}
diff --git a/src/line.c b/src/line.c
@@ -1,8 +1,6 @@
/* See LICENSE file for copyright and license details. */
#include <stdbool.h>
#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
#include "../gen/line.h"
#include "../grapheme.h"
@@ -11,7 +9,7 @@
static inline enum line_break_property
get_break_prop(uint_least32_t cp)
{
- if (likely(cp <= 0x10FFFF)) {
+ if (likely(cp <= UINT32_C(0x10FFFF))) {
return (enum line_break_property)
line_break_minor[line_break_major[cp >> 8] + (cp & 0xff)];
} else {
@@ -20,22 +18,15 @@ get_break_prop(uint_least32_t cp)
}
static size_t
-next_line_break(const void *str, size_t len, size_t (*get_codepoint)
- (const void *, size_t, size_t, uint_least32_t *))
+next_line_break(HERODOTUS_READER *r)
{
+ HERODOTUS_READER tmp;
enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_prop,
last_non_sp_prop, last_non_sp_cm_or_zwj_prop;
- enum line_break_property res;
uint_least32_t cp;
uint_least8_t lb25_level = 0;
- size_t off, new_off;
bool lb21a_flag = false, ri_even = true;
- /* check degenerate cases */
- if (str == NULL || len == 0) {
- return 0;
- }
-
/*
* Apply line breaking algorithm (UAX #14), see
* https://unicode.org/reports/tr14/#Algorithm and tailoring
@@ -49,24 +40,14 @@ next_line_break(const void *str, size_t len, size_t (*get_codepoint)
* Initialize the different properties such that we have
* a good state after the state-update in the loop
*/
- cp0_prop = NUM_LINE_BREAK_PROPS;
- if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
- return 1;
- }
- cp1_prop = get_break_prop(cp);
last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */
last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_PROPS;
- for (; off < len; off = new_off) {
- /* update state */
- cp0_prop = cp1_prop;
- if ((new_off = off + get_codepoint(str, len, off, &cp)) <= len) {
- get_codepoint(str, len, off, &cp);
- cp1_prop = get_break_prop(cp);
- } else {
- /* LB3 */
- break;
- }
+ for (herodotus_read_codepoint(r, true, &cp), cp0_prop = get_break_prop(cp);
+ herodotus_read_codepoint(r, false, &cp) == HERODOTUS_STATUS_SUCCESS;
+ herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop) {
+ /* get property of the right codepoint */
+ cp1_prop = get_break_prop(cp);
/* update retention-states */
@@ -378,14 +359,14 @@ next_line_break(const void *str, size_t len, size_t (*get_codepoint)
* two adjacent codepoints as we have it with
* characters.
*/
- if (new_off < len &&
+ herodotus_reader_copy(r, &tmp);
+ herodotus_read_codepoint(&tmp, true, &cp);
+ if (herodotus_read_codepoint(&tmp, true, &cp) ==
+ HERODOTUS_STATUS_SUCCESS &&
(cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
cp1_prop == LINE_BREAK_PROP_HY)) {
- get_codepoint(str, len, new_off, &cp);
- res = get_break_prop(cp);
-
- if (res == LINE_BREAK_PROP_NU) {
+ if (get_break_prop(cp) == LINE_BREAK_PROP_NU) {
continue;
}
}
@@ -505,17 +486,25 @@ next_line_break(const void *str, size_t len, size_t (*get_codepoint)
break;
}
- return off;
+ return herodotus_reader_number_read(r);
}
size_t
grapheme_next_line_break(const uint_least32_t *str, size_t len)
{
- return next_line_break(str, len, get_codepoint);
+ HERODOTUS_READER r;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
+
+ return next_line_break(&r);
}
size_t
grapheme_next_line_break_utf8(const char *str, size_t len)
{
- return next_line_break(str, len, get_codepoint_utf8);
+ HERODOTUS_READER r;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
+
+ return next_line_break(&r);
}
diff --git a/src/sentence.c b/src/sentence.c
@@ -1,18 +1,22 @@
/* See LICENSE file for copyright and license details. */
#include <stdbool.h>
#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
#include "../gen/sentence.h"
#include "../grapheme.h"
#include "util.h"
-static inline enum sentence_break_property
-get_break_prop(uint_least32_t cp)
+struct sentence_break_state
{
- if (likely(cp <= 0x10FFFF)) {
- return (enum sentence_break_property)
+ uint_least8_t aterm_close_sp_level;
+ uint_least8_t saterm_close_sp_parasep_level;
+};
+
+static inline uint_least8_t
+get_sentence_break_prop(uint_least32_t cp)
+{
+ if (likely(cp <= UINT32_C(0x10FFFF))) {
+ return (uint_least8_t)
sentence_break_minor[sentence_break_major[cp >> 8] +
(cp & 0xff)];
} else {
@@ -20,239 +24,157 @@ get_break_prop(uint_least32_t cp)
}
}
-static size_t
-next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
- (const void *, size_t, size_t, uint_least32_t *))
+static bool
+is_skippable_sentence_prop(uint_least8_t prop)
{
- struct {
- enum sentence_break_property a, b, c, d;
- } raw, skip;
- enum sentence_break_property res;
- uint_least32_t cp;
- uint_least8_t aterm_close_sp_level = 0,
- saterm_close_sp_parasep_level = 0;
- size_t off, tmp, new_off;
+ return prop == SENTENCE_BREAK_PROP_EXTEND ||
+ prop == SENTENCE_BREAK_PROP_FORMAT;
+}
- /* check degenerate cases */
- if (str == NULL || len == 0) {
- return 0;
- }
+static void
+sentence_skip_shift_callback(uint_least8_t prop, void *s)
+{
+ struct sentence_break_state *state = (struct sentence_break_state *)s;
/*
- * Apply sentence breaking algorithm (UAX #29), see
- * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
+ * Here comes a bit of magic. The rules
+ * SB8, SB8a, SB9 and SB10 have very complicated
+ * left-hand-side-rules of the form
*
- * There are 4 slots (a, b, c, d) of "break" properties and
- * we check if there is a break in the middle between b and c.
+ * ATerm Close* Sp*
+ * SATerm Close*
+ * SATerm Close* Sp*
+ * SATerm Close* Sp* ParaSep?
*
- * The position of this middle spot is determined by off,
- * which gives the offset of the first element on the right
- * hand side of said spot, or, in other words, gives the number
- * of elements on the left hand side.
+ * but instead of backtracking, we keep the
+ * state as some kind of "power level" in
+ * two state-variables
*
- * It is further complicated by the fact that the algorithm
- * expects you to skip certain characters for the second
- * half of the rules (after SB5). Thus, we do not only have
- * the "raw" properties as described above, but also the "skip"
- * properties, where the skip.a and skip.b, for instance,
- * give the two preceding character properties behind the
- * currently investigated breakpoint.
+ * aterm_close_sp_level
+ * saterm_close_sp_parasep_level
+ *
+ * that go from 0 to 3/4:
+ *
+ * 0: we are not in the sequence
+ * 1: we have one ATerm/SATerm to the left of
+ * the middle spot
+ * 2: we have one ATerm/SATerm and one or more
+ * Close to the left of the middle spot
+ * 3: we have one ATerm/SATerm, zero or more
+ * Close and one or more Sp to the left of
+ * the middle spot.
+ * 4: we have one SATerm, zero or more Close,
+ * zero or more Sp and one ParaSep to the
+ * left of the middle spot.
*
*/
-
- /*
- * Initialize the different properties such that we have
- * a good state after the state-update in the loop
- */
- raw.b = NUM_SENTENCE_BREAK_PROPS;
- if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
- return 1;
+ if ((state->aterm_close_sp_level == 0 ||
+ state->aterm_close_sp_level == 1) &&
+ prop == SENTENCE_BREAK_PROP_ATERM) {
+ /* sequence has begun */
+ state->aterm_close_sp_level = 1;
+ } else if ((state->aterm_close_sp_level == 1 ||
+ state->aterm_close_sp_level == 2) &&
+ prop == SENTENCE_BREAK_PROP_CLOSE) {
+ /* close-sequence begins or continues */
+ state->aterm_close_sp_level = 2;
+ } else if ((state->aterm_close_sp_level == 1 ||
+ state->aterm_close_sp_level == 2 ||
+ state->aterm_close_sp_level == 3) &&
+ prop == SENTENCE_BREAK_PROP_SP) {
+ /* sp-sequence begins or continues */
+ state->aterm_close_sp_level = 3;
+ } else {
+ /* sequence broke */
+ state->aterm_close_sp_level = 0;
}
- raw.c = get_break_prop(cp);
- (void)get_codepoint(str, len, off, &cp);
- raw.d = get_break_prop(cp);
- skip.a = skip.b = NUM_SENTENCE_BREAK_PROPS;
-
- for (; off < len; off = new_off) {
- /*
- * Update left side (a and b) of the skip state by
- * "shifting in" the raw.c property as long as it is
- * not one of the "ignored" character properties.
- * While at it, update the RI-counter.
- *
- */
- if (raw.c != SENTENCE_BREAK_PROP_EXTEND &&
- raw.c != SENTENCE_BREAK_PROP_FORMAT) {
- skip.a = skip.b;
- skip.b = raw.c;
-
- /*
- * Here comes a bit of magic. The rules
- * SB8, SB8a, SB9 and SB10 have very complicated
- * left-hand-side-rules of the form
- *
- * ATerm Close* Sp*
- * SATerm Close*
- * SATerm Close* Sp*
- * SATerm Close* Sp* ParaSep?
- *
- * but instead of backtracking, we keep the
- * state as some kind of "power level" in
- * two variables
- *
- * aterm_close_sp_level
- * saterm_close_sp_parasep_level
- *
- * that go from 0 to 3/4:
- *
- * 0: we are not in the sequence
- * 1: we have one ATerm/SATerm to the left of
- * the middle spot
- * 2: we have one ATerm/SATerm and one or more
- * Close to the left of the middle spot
- * 3: we have one ATerm/SATerm, zero or more
- * Close and one or more Sp to the left of
- * the middle spot.
- * 4: we have one SATerm, zero or more Close,
- * zero or more Sp and one ParaSep to the
- * left of the middle spot.
- *
- */
- if ((aterm_close_sp_level == 0 ||
- aterm_close_sp_level == 1) &&
- skip.b == SENTENCE_BREAK_PROP_ATERM) {
- /* sequence has begun */
- aterm_close_sp_level = 1;
- } else if ((aterm_close_sp_level == 1 ||
- aterm_close_sp_level == 2) &&
- skip.b == SENTENCE_BREAK_PROP_CLOSE) {
- /* close-sequence begins or continued */
- aterm_close_sp_level = 2;
- } else if ((aterm_close_sp_level == 1 ||
- aterm_close_sp_level == 2 ||
- aterm_close_sp_level == 3) &&
- skip.b == SENTENCE_BREAK_PROP_SP) {
- /* sp-sequence begins or continued */
- aterm_close_sp_level = 3;
- } else {
- /* sequence broke */
- aterm_close_sp_level = 0;
- }
- if ((saterm_close_sp_parasep_level == 0 ||
- saterm_close_sp_parasep_level == 1) &&
- (skip.b == SENTENCE_BREAK_PROP_STERM ||
- skip.b == SENTENCE_BREAK_PROP_ATERM)) {
- /* sequence has begun */
- saterm_close_sp_parasep_level = 1;
- } else if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2) &&
- skip.b == SENTENCE_BREAK_PROP_CLOSE) {
- /* close-sequence begins or continued */
- saterm_close_sp_parasep_level = 2;
- } else if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2 ||
- saterm_close_sp_parasep_level == 3) &&
- skip.b == SENTENCE_BREAK_PROP_SP) {
- /* sp-sequence begins or continued */
- saterm_close_sp_parasep_level = 3;
- } else if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2 ||
- saterm_close_sp_parasep_level == 3) &&
- (skip.b == SENTENCE_BREAK_PROP_SEP ||
- skip.b == SENTENCE_BREAK_PROP_CR ||
- skip.b == SENTENCE_BREAK_PROP_LF)) {
- /* ParaSep at the end of the sequence */
- saterm_close_sp_parasep_level = 4;
- } else {
- /* sequence broke */
- saterm_close_sp_parasep_level = 0;
- }
- }
-
- /*
- * Update right side (b and c) of the skip state by
- * starting at the breakpoint and detecting the two
- * following non-ignored character classes
- *
- */
- skip.c = NUM_SENTENCE_BREAK_PROPS;
- for (tmp = off; tmp < len; ) {
- tmp += get_codepoint(str, len, tmp, &cp);
- res = get_break_prop(cp);
-
- if (res != SENTENCE_BREAK_PROP_EXTEND &&
- res != SENTENCE_BREAK_PROP_FORMAT) {
- skip.c = res;
- break;
- }
- }
- skip.d = NUM_SENTENCE_BREAK_PROPS;
- for (; tmp < len; ) {
- tmp += get_codepoint(str, len, tmp, &cp);
- res = get_break_prop(cp);
+ if ((state->saterm_close_sp_parasep_level == 0 ||
+ state->saterm_close_sp_parasep_level == 1) &&
+ (prop == SENTENCE_BREAK_PROP_STERM ||
+ prop == SENTENCE_BREAK_PROP_ATERM)) {
+ /* sequence has begun */
+ state->saterm_close_sp_parasep_level = 1;
+ } else if ((state->saterm_close_sp_parasep_level == 1 ||
+ state->saterm_close_sp_parasep_level == 2) &&
+ prop == SENTENCE_BREAK_PROP_CLOSE) {
+ /* close-sequence begins or continues */
+ state->saterm_close_sp_parasep_level = 2;
+ } else if ((state->saterm_close_sp_parasep_level == 1 ||
+ state->saterm_close_sp_parasep_level == 2 ||
+ state->saterm_close_sp_parasep_level == 3) &&
+ prop == SENTENCE_BREAK_PROP_SP) {
+ /* sp-sequence begins or continues */
+ state->saterm_close_sp_parasep_level = 3;
+ } else if ((state->saterm_close_sp_parasep_level == 1 ||
+ state->saterm_close_sp_parasep_level == 2 ||
+ state->saterm_close_sp_parasep_level == 3) &&
+ (prop == SENTENCE_BREAK_PROP_SEP ||
+ prop == SENTENCE_BREAK_PROP_CR ||
+ prop == SENTENCE_BREAK_PROP_LF)) {
+ /* ParaSep at the end of the sequence */
+ state->saterm_close_sp_parasep_level = 4;
+ } else {
+ /* sequence broke */
+ state->saterm_close_sp_parasep_level = 0;
+ }
+}
- if (res != SENTENCE_BREAK_PROP_EXTEND &&
- res != SENTENCE_BREAK_PROP_FORMAT) {
- skip.d = res;
- break;
- }
- }
+static size_t
+next_sentence_break(HERODOTUS_READER *r)
+{
+ HERODOTUS_READER tmp;
+ enum sentence_break_property prop;
+ struct proper p;
+ struct sentence_break_state state = { 0 };
+ uint_least32_t cp;
- /*
- * Update the raw state by simply shifting everything
- * in and, if we still have data left, determining
- * the character class of the next codepoint.
- *
- */
- raw.a = raw.b;
- raw.b = raw.c;
- raw.c = raw.d;
- if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) {
- get_codepoint(str, len, new_off, &cp);
- raw.d = get_break_prop(cp);
- } else {
- raw.d = NUM_SENTENCE_BREAK_PROPS;
- }
+ /*
+ * Apply sentence breaking algorithm (UAX #29), see
+ * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
+ */
+ proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
+ get_sentence_break_prop, is_skippable_sentence_prop,
+ sentence_skip_shift_callback, &p);
+ while (!proper_advance(&p)) {
/* SB3 */
- if (raw.b == SENTENCE_BREAK_PROP_CR &&
- raw.c == SENTENCE_BREAK_PROP_LF) {
+ if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
+ p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
continue;
}
/* SB4 */
- if (raw.b == SENTENCE_BREAK_PROP_SEP ||
- raw.b == SENTENCE_BREAK_PROP_CR ||
- raw.b == SENTENCE_BREAK_PROP_LF) {
+ if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
+ p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR ||
+ p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
break;
}
/* SB5 */
- if (raw.c == SENTENCE_BREAK_PROP_EXTEND ||
- raw.c == SENTENCE_BREAK_PROP_FORMAT) {
+ if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
+ p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
continue;
}
/* SB6 */
- if (skip.b == SENTENCE_BREAK_PROP_ATERM &&
- skip.c == SENTENCE_BREAK_PROP_NUMERIC) {
+ if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
continue;
}
/* SB7 */
- if (off > 1 &&
- (skip.a == SENTENCE_BREAK_PROP_UPPER ||
- skip.a == SENTENCE_BREAK_PROP_LOWER) &&
- skip.b == SENTENCE_BREAK_PROP_ATERM &&
- skip.c == SENTENCE_BREAK_PROP_UPPER) {
+ if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
+ p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
+ p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
continue;
}
/* SB8 */
- if (aterm_close_sp_level == 1 ||
- aterm_close_sp_level == 2 ||
- aterm_close_sp_level == 3) {
+ if (state.aterm_close_sp_level == 1 ||
+ state.aterm_close_sp_level == 2 ||
+ state.aterm_close_sp_level == 3) {
/*
* This is the most complicated rule, requiring
* the right-hand-side to satisfy the regular expression
@@ -260,67 +182,75 @@ next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
* ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
*
* which we simply check "manually" given LUT-lookups
- * are very cheap.
+ * are very cheap, starting at the mid_reader.
*
*/
- for (tmp = off, res = NUM_SENTENCE_BREAK_PROPS; tmp < len; ) {
- tmp += get_codepoint(str, len, tmp, &cp);
- res = get_break_prop(cp);
+ herodotus_reader_copy(&(p.mid_reader), &tmp);
+
+ prop = NUM_SENTENCE_BREAK_PROPS;
+ while (herodotus_read_codepoint(&tmp, true, &cp) ==
+ HERODOTUS_STATUS_SUCCESS) {
+ prop = get_sentence_break_prop(cp);
- if (res == SENTENCE_BREAK_PROP_OLETTER ||
- res == SENTENCE_BREAK_PROP_UPPER ||
- res == SENTENCE_BREAK_PROP_LOWER ||
- res == SENTENCE_BREAK_PROP_SEP ||
- res == SENTENCE_BREAK_PROP_CR ||
- res == SENTENCE_BREAK_PROP_LF ||
- res == SENTENCE_BREAK_PROP_STERM ||
- res == SENTENCE_BREAK_PROP_ATERM) {
+ /*
+ * the skippable properties are ignored
+ * automatically here given they do not
+ * match the following condition
+ */
+ if (prop == SENTENCE_BREAK_PROP_OLETTER ||
+ prop == SENTENCE_BREAK_PROP_UPPER ||
+ prop == SENTENCE_BREAK_PROP_LOWER ||
+ prop == SENTENCE_BREAK_PROP_SEP ||
+ prop == SENTENCE_BREAK_PROP_CR ||
+ prop == SENTENCE_BREAK_PROP_LF ||
+ prop == SENTENCE_BREAK_PROP_STERM ||
+ prop == SENTENCE_BREAK_PROP_ATERM) {
break;
}
}
- if (res == SENTENCE_BREAK_PROP_LOWER) {
+ if (prop == SENTENCE_BREAK_PROP_LOWER) {
continue;
}
}
/* SB8a */
- if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2 ||
- saterm_close_sp_parasep_level == 3) &&
- (skip.c == SENTENCE_BREAK_PROP_SCONTINUE ||
- skip.c == SENTENCE_BREAK_PROP_STERM ||
- skip.c == SENTENCE_BREAK_PROP_ATERM)) {
+ if ((state.saterm_close_sp_parasep_level == 1 ||
+ state.saterm_close_sp_parasep_level == 2 ||
+ state.saterm_close_sp_parasep_level == 3) &&
+ (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
continue;
}
/* SB9 */
- if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2) &&
- (skip.c == SENTENCE_BREAK_PROP_CLOSE ||
- skip.c == SENTENCE_BREAK_PROP_SP ||
- skip.c == SENTENCE_BREAK_PROP_SEP ||
- skip.c == SENTENCE_BREAK_PROP_CR ||
- skip.c == SENTENCE_BREAK_PROP_LF)) {
+ if ((state.saterm_close_sp_parasep_level == 1 ||
+ state.saterm_close_sp_parasep_level == 2) &&
+ (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
continue;
}
/* SB10 */
- if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2 ||
- saterm_close_sp_parasep_level == 3) &&
- (skip.c == SENTENCE_BREAK_PROP_SP ||
- skip.c == SENTENCE_BREAK_PROP_SEP ||
- skip.c == SENTENCE_BREAK_PROP_CR ||
- skip.c == SENTENCE_BREAK_PROP_LF)) {
+ if ((state.saterm_close_sp_parasep_level == 1 ||
+ state.saterm_close_sp_parasep_level == 2 ||
+ state.saterm_close_sp_parasep_level == 3) &&
+ (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
continue;
}
/* SB11 */
- if (saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2 ||
- saterm_close_sp_parasep_level == 3 ||
- saterm_close_sp_parasep_level == 4) {
+ if (state.saterm_close_sp_parasep_level == 1 ||
+ state.saterm_close_sp_parasep_level == 2 ||
+ state.saterm_close_sp_parasep_level == 3 ||
+ state.saterm_close_sp_parasep_level == 4) {
break;
}
@@ -328,17 +258,25 @@ next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
continue;
}
- return off;
+ return herodotus_reader_number_read(&(p.mid_reader));
}
size_t
grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
{
- return next_sentence_break(str, len, get_codepoint);
+ HERODOTUS_READER r;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
+
+ return next_sentence_break(&r);
}
size_t
grapheme_next_sentence_break_utf8(const char *str, size_t len)
{
- return next_sentence_break(str, len, get_codepoint_utf8);
+ HERODOTUS_READER r;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
+
+ return next_sentence_break(&r);
}
diff --git a/src/utf8.c b/src/utf8.c
@@ -1,5 +1,6 @@
/* See LICENSE file for copyright and license details. */
-#include <stdio.h>
+#include <stddef.h>
+#include <stdint.h>
#include "../grapheme.h"
#include "util.h"
diff --git a/src/util.c b/src/util.c
@@ -1,70 +1,417 @@
/* See LICENSE file for copyright and license details. */
+#include <limits.h>
#include <stdbool.h>
+#include <stddef.h>
#include <stdint.h>
-#include <stdlib.h>
#include "../gen/types.h"
#include "../grapheme.h"
#include "util.h"
-inline size_t
-get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp)
+void
+herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type,
+ const void *src, size_t srclen)
{
- if (offset < len) {
- *cp = ((const uint_least32_t *)str)[offset];
- return 1;
- } else {
- *cp = GRAPHEME_INVALID_CODEPOINT;
- return 0;
+ size_t i;
+
+ r->type = type;
+ r->src = src;
+ r->srclen = srclen;
+ r->off = 0;
+ r->terminated_by_null = false;
+
+ for (i = 0; i < LEN(r->soft_limit); i++) {
+ r->soft_limit[i] = SIZE_MAX;
}
}
-inline size_t
-get_codepoint_utf8(const void *str, size_t len, size_t offset, uint_least32_t *cp)
+void
+herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest)
{
- size_t ret;
+ size_t i;
- if (offset < len) {
- ret = grapheme_decode_utf8((const char *)str + offset,
- len - offset, cp);
+ /*
+ * we copy such that we have a "fresh" start and build on the
+ * fact that src->soft_limit[i] for any i and src->srclen are
+ * always larger than or equal to src->off
+ */
+ dest->type = src->type;
+ if (src->type == HERODOTUS_TYPE_CODEPOINT) {
+ dest->src = (src->src == NULL) ? NULL :
+ ((const uint_least32_t *)(src->src)) + src->off;
+ } else { /* src->type == HERODOTUS_TYPE_UTF8 */
+ dest->src = (src->src == NULL) ? NULL :
+ ((const char *)(src->src)) + src->off;
+ }
+ if (src->srclen == SIZE_MAX) {
+ dest->srclen = SIZE_MAX;
+ } else {
+ dest->srclen = (src->off < src->srclen) ? src->srclen - src->off : 0;
+ }
+ dest->off = 0;
+ dest->terminated_by_null = src->terminated_by_null;
- if (unlikely(len == SIZE_MAX && cp == 0)) {
- return 0;
+ for (i = 0; i < LEN(src->soft_limit); i++) {
+ if (src->soft_limit[i] == SIZE_MAX) {
+ dest->soft_limit[i] = SIZE_MAX;
} else {
- return ret;
+ /*
+ * if we have a degenerate case where the offset is
+ * higher than the soft-limit, we simply clamp the
+ * soft-limit to zero given we can't decide here
+ * to release the limit and, instead, we just
+ * prevent any more reads
+ */
+ dest->soft_limit[i] = (src->off < src->soft_limit[i]) ?
+ src->soft_limit[i] - src->off : 0;
}
- } else {
+ }
+}
+
+void
+herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count)
+{
+ size_t i;
+
+ for (i = LEN(r->soft_limit) - 1; i >= 1; i--) {
+ r->soft_limit[i] = r->soft_limit[i - 1];
+ }
+ r->soft_limit[0] = r->off + count;
+}
+
+void
+herodotus_reader_pop_limit(HERODOTUS_READER *r)
+{
+ size_t i;
+
+ for (i = 0; i < LEN(r->soft_limit) - 1; i++) {
+ r->soft_limit[i] = r->soft_limit[i + 1];
+ }
+ r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX;
+}
+
+size_t
+herodotus_reader_next_word_break(const HERODOTUS_READER *r)
+{
+ if (r->type == HERODOTUS_TYPE_CODEPOINT) {
+ return grapheme_next_word_break(
+ (const uint_least32_t *)(r->src) + r->off,
+ MIN(r->srclen, r->soft_limit[0]) - r->off);
+ } else { /* r->type == HERODOTUS_TYPE_UTF8 */
+ return grapheme_next_word_break_utf8(
+ (const char *)(r->src) + r->off,
+ MIN(r->srclen, r->soft_limit[0]) - r->off);
+ }
+}
+
+size_t
+herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r)
+{
+ if (r->type == HERODOTUS_TYPE_CODEPOINT) {
+ return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 : 0;
+ } else { /* r->type == HERODOTUS_TYPE_UTF8 */
+ return grapheme_decode_utf8(
+ (const char *)(r->src) + r->off,
+ MIN(r->srclen, r->soft_limit[0]) - r->off, NULL);
+ }
+}
+
+size_t
+herodotus_reader_number_read(const HERODOTUS_READER *r)
+{
+ return r->off;
+}
+
+enum herodotus_status
+herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp)
+{
+ size_t ret;
+
+ if (r->terminated_by_null || r->off >= r->srclen || r->src == NULL) {
*cp = GRAPHEME_INVALID_CODEPOINT;
- return 0;
+ return HERODOTUS_STATUS_END_OF_BUFFER;
}
+
+ if (r->off >= r->soft_limit[0]) {
+ *cp = GRAPHEME_INVALID_CODEPOINT;
+ return HERODOTUS_STATUS_SOFT_LIMIT_REACHED;
+ }
+
+ if (r->type == HERODOTUS_TYPE_CODEPOINT) {
+ *cp = ((const uint_least32_t *)(r->src))[r->off];
+ ret = 1;
+ } else { /* r->type == HERODOTUS_TYPE_UTF8 */
+ ret = grapheme_decode_utf8((const char *)r->src + r->off,
+ MIN(r->srclen, r->soft_limit[0]) -
+ r->off, cp);
+ }
+
+ if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) {
+ /*
+ * We encountered a null-codepoint. Don't increment
+ * offset and return as if the buffer had ended here all
+ * along
+ */
+ r->terminated_by_null = true;
+ return HERODOTUS_STATUS_END_OF_BUFFER;
+ }
+
+ if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) {
+ /*
+ * we want more than we have; instead of returning
+ * garbage we terminate here.
+ */
+ return HERODOTUS_STATUS_END_OF_BUFFER;
+ }
+
+ /*
+ * Unless the caller asked us not to advance, increase the offset,
+ * which we now know won't surpass the limits
+ */
+ if (advance) {
+ r->off += ret;
+ }
+
+ return HERODOTUS_STATUS_SUCCESS;
}
-inline size_t
-set_codepoint(uint_least32_t cp, void *str, size_t len, size_t offset)
+void
+herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type,
+ void *dest, size_t destlen)
{
- if (str == NULL || len == 0) {
- return 1;
+ w->type = type;
+ w->dest = dest;
+ w->destlen = destlen;
+ w->off = 0;
+ w->first_unwritable_offset = SIZE_MAX;
+}
+
+void
+herodotus_writer_nul_terminate(HERODOTUS_WRITER *w)
+{
+ if (w->dest == NULL) {
+ return;
}
- if (offset < len) {
- ((uint_least32_t *)str)[offset] = cp;
- return 1;
- } else {
- return 0;
+ if (w->off < w->destlen) {
+ /* We still have space in the buffer. Simply use it */
+ if (w->type == HERODOTUS_TYPE_CODEPOINT) {
+ ((uint_least32_t *)(w->dest))[w->off] = 0;
+ } else { /* w->type == HERODOTUS_TYPE_UTF8 */
+ ((char *)(w->dest))[w->off] = '\0';
+ }
+ } else if (w->first_unwritable_offset < w->destlen) {
+ /*
+ * There is no more space in the buffer. However,
+ * we have noted down the first offset we couldn't
+ * use to write into the buffer and it's smaller than
+ * destlen. Thus we bailed writing into the
+ * destination when a multibyte-codepoint couldn't be
+ * written. So the last "real" byte might be at
+ * destlen-4, destlen-3, destlen-2 or destlen-1
+ * (the last case meaning truncation).
+ */
+ if (w->type == HERODOTUS_TYPE_CODEPOINT) {
+ ((uint_least32_t *)(w->dest))
+ [w->first_unwritable_offset] = 0;
+ } else { /* w->type == HERODOTUS_TYPE_UTF8 */
+ ((char *)(w->dest))[w->first_unwritable_offset] = '\0';
+ }
+ } else if (w->destlen > 0) {
+ /*
+ * In this case, there is no more space in the buffer and
+ * the first unwritable offset is larger than
+ * or equal to the destination buffer length. This means
+ * that we are forced to simply write into the last
+ * byte.
+ */
+ if (w->type == HERODOTUS_TYPE_CODEPOINT) {
+ ((uint_least32_t *)(w->dest))
+ [w->destlen - 1] = 0;
+ } else { /* w->type == HERODOTUS_TYPE_UTF8 */
+ ((char *)(w->dest))[w->destlen - 1] = '\0';
+ }
}
+
+ /* w->off is not incremented in any case */
+}
+
+size_t
+herodotus_writer_number_written(const HERODOTUS_WRITER *w)
+{
+ return w->off;
}
-inline size_t
-set_codepoint_utf8(uint_least32_t cp, void *str, size_t len, size_t offset)
+void
+herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp)
{
- if (str == NULL || len == 0) {
- return grapheme_encode_utf8(cp, NULL, 0);
+ size_t ret;
+
+ /*
+ * This function always advances the offset by the full encoded
+ * length (codepoints or bytes), even if the buffer ends. This is
+ * used to enable truncation detection.
+ */
+ if (w->type == HERODOTUS_TYPE_CODEPOINT) {
+ if (w->dest != NULL && w->off < w->destlen) {
+ ((uint_least32_t *)(w->dest))[w->off] = cp;
+ }
+
+ w->off += 1;
+ } else { /* w->type == HERODOTUS_TYPE_UTF8 */
+ /*
+ * First determine how many bytes we need to encode the
+ * codepoint
+ */
+ ret = grapheme_encode_utf8(cp, NULL, 0);
+
+ if (w->dest != NULL && w->off + ret < w->destlen) {
+ /* we still have enough room in the buffer */
+ grapheme_encode_utf8(cp, (char *)(w->dest) +
+ w->off, w->destlen - w->off);
+ } else if (w->first_unwritable_offset == SIZE_MAX) {
+ /*
+ * the first unwritable offset has not been
+ * noted down, so this is the first time we can't
+ * write (completely) to an offset
+ */
+ w->first_unwritable_offset = w->off;
+ }
+
+ w->off += ret;
}
+}
+
+void
+proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
+ uint_least8_t (*get_break_prop)(uint_least32_t),
+ bool (*is_skippable_prop)(uint_least8_t),
+ void (*skip_shift_callback)(uint_least8_t, void *),
+ struct proper *p)
+{
+ uint_least8_t prop;
+ uint_least32_t cp;
+ size_t i;
+
+ /* set internal variables */
+ p->state = state;
+ p->no_prop = no_prop;
+ p->get_break_prop = get_break_prop;
+ p->is_skippable_prop = is_skippable_prop;
+ p->skip_shift_callback = skip_shift_callback;
+
+ /*
+ * Initialize mid-reader, which is basically just there
+ * to reflect the current position of the viewing-line
+ */
+ herodotus_reader_copy(r, &(p->mid_reader));
- if (offset < len) {
- return grapheme_encode_utf8(cp, (char *)str + offset,
- len - offset);
+ /*
+ * In the initialization, we simply (try to) fill in next_prop.
+ * If we cannot read in more (due to the buffer ending), we
+ * fill in the prop as invalid
+ */
+
+ /*
+ * initialize the previous properties to have no property
+ * (given we are at the start of the buffer)
+ */
+ p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
+ p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
+
+ /*
+ * initialize the next properties
+ */
+
+ /* initialize the raw reader */
+ herodotus_reader_copy(r, &(p->raw_reader));
+
+ /* fill in the two next raw properties (after initializing them to no_prop) */
+ p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
+ for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
+ HERODOTUS_STATUS_SUCCESS; ) {
+ p->raw.next_prop[i++] = p->get_break_prop(cp);
+ }
+
+ /* initialize the skip reader */
+ herodotus_reader_copy(r, &(p->skip_reader));
+
+ /* fill in the two next skip properties (after initializing them to no_prop) */
+ p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
+ for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
+ HERODOTUS_STATUS_SUCCESS; ) {
+ prop = p->get_break_prop(cp);
+ if (!p->is_skippable_prop(prop)) {
+ p->skip.next_prop[i++] = prop;
+ }
+ }
+}
+
+int
+proper_advance(struct proper *p)
+{
+ uint_least8_t prop;
+ uint_least32_t cp;
+
+ /* read in next "raw" property */
+ if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
+ HERODOTUS_STATUS_SUCCESS) {
+ prop = p->get_break_prop(cp);
} else {
- return grapheme_encode_utf8(cp, NULL, 0);
+ prop = p->no_prop;
+ }
+
+ /*
+ * do a shift-in, unless we find that the property that is to
+ * be moved past the "raw-viewing-line" (this property is stored
+ * in p->raw.next_prop[0]) is a no_prop, indicating that
+ * we are at the end of the buffer.
+ */
+ if (p->raw.next_prop[0] == p->no_prop) {
+ return 1;
+ }
+
+ /* shift in the properties */
+ p->raw.prev_prop[1] = p->raw.prev_prop[0];
+ p->raw.prev_prop[0] = p->raw.next_prop[0];
+ p->raw.next_prop[0] = p->raw.next_prop[1];
+ p->raw.next_prop[1] = prop;
+
+ /* advance the middle reader viewing-line */
+ (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
+
+ /* check skippability-property */
+ if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
+ /*
+ * the property that has moved past the "raw-viewing-line"
+ * (this property is now (after the raw-shift) stored in
+ * p->raw.prev_prop[0] and guaranteed not to be a no-prop,
+ * guaranteeing that we won't shift a no-prop past the
+ * "viewing-line" in the skip-properties) is not a skippable
+ * property, thus we need to shift the skip property as well.
+ */
+ p->skip.prev_prop[1] = p->skip.prev_prop[0];
+ p->skip.prev_prop[0] = p->skip.next_prop[0];
+ p->skip.next_prop[0] = p->skip.next_prop[1];
+
+ /*
+ * call the skip-shift-callback on the property that
+ * passed the skip-viewing-line (this property is now
+ * stored in p->skip.prev_prop[0]).
+ */
+ p->skip_shift_callback(p->skip.prev_prop[0], p->state);
+
+ /* determine the next skip property */
+ p->skip.next_prop[1] = p->no_prop;
+ while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
+ HERODOTUS_STATUS_SUCCESS) {
+ prop = p->get_break_prop(cp);
+ if (!p->is_skippable_prop(prop)) {
+ p->skip.next_prop[1] = prop;
+ break;
+ }
+ }
}
+
+ return 0;
}
diff --git a/src/util.h b/src/util.h
@@ -2,12 +2,16 @@
#ifndef UTIL_H
#define UTIL_H
+#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "../gen/types.h"
#include "../grapheme.h"
+#undef MIN
+#define MIN(x,y) ((x) < (y) ? (x) : (y))
+#undef LEN
#define LEN(x) (sizeof(x) / sizeof(*(x)))
#undef likely
@@ -25,10 +29,88 @@
#define unlikely(expr) (expr)
#endif
-size_t get_codepoint(const void *, size_t, size_t, uint_least32_t *);
-size_t get_codepoint_utf8(const void *, size_t, size_t, uint_least32_t *);
+/*
+ * Herodotus, the ancient Greek historian and geographer,
+ * was criticized for including legends and other fantastic
+ * accounts into his works, among others by his contemporary
+ * Thucydides.
+ *
+ * The Herodotus readers and writers are tailored towards the needs
+ * of the library interface, doing all the dirty work behind the
+ * scenes. While the reader is relatively faithful in his accounts,
+ * the Herodotus writer will never fail and always claim to write the
+ * data. Internally, it only writes as much as it can, and will simply
+ * keep account of the rest. This way, we can properly signal truncation.
+ *
+ * In this sense, explaining the naming, the writer is always a bit
+ * inaccurate in his accounts.
+ *
+ */
+enum herodotus_status {
+ HERODOTUS_STATUS_SUCCESS,
+ HERODOTUS_STATUS_END_OF_BUFFER,
+ HERODOTUS_STATUS_SOFT_LIMIT_REACHED,
+};
-size_t set_codepoint(uint_least32_t, void *, size_t, size_t);
-size_t set_codepoint_utf8(uint_least32_t, void *, size_t, size_t);
+enum herodotus_type {
+ HERODOTUS_TYPE_CODEPOINT,
+ HERODOTUS_TYPE_UTF8,
+};
+
+typedef struct herodotus_reader {
+ enum herodotus_type type;
+ const void *src;
+ size_t srclen;
+ size_t off;
+ bool terminated_by_null;
+ size_t soft_limit[10];
+} HERODOTUS_READER;
+
+typedef struct herodotus_writer {
+ enum herodotus_type type;
+ void *dest;
+ size_t destlen;
+ size_t off;
+ size_t first_unwritable_offset;
+} HERODOTUS_WRITER;
+
+struct proper {
+ /*
+ * prev_prop[1] prev_prop[0] | next_prop[0] next_prop[1]
+ */
+ struct {
+ uint_least8_t prev_prop[2];
+ uint_least8_t next_prop[2];
+ } raw, skip;
+ HERODOTUS_READER mid_reader, raw_reader, skip_reader;
+ void *state;
+ uint_least8_t no_prop;
+ uint_least8_t (*get_break_prop)(uint_least32_t);
+ bool (*is_skippable_prop)(uint_least8_t);
+ void (*skip_shift_callback)(uint_least8_t, void *);
+};
+
+void herodotus_reader_init(HERODOTUS_READER *, enum herodotus_type,
+ const void *, size_t);
+void herodotus_reader_copy(const HERODOTUS_READER *, HERODOTUS_READER *);
+void herodotus_reader_push_advance_limit(HERODOTUS_READER *, size_t);
+void herodotus_reader_pop_limit(HERODOTUS_READER *);
+size_t herodotus_reader_number_read(const HERODOTUS_READER *);
+size_t herodotus_reader_next_word_break(const HERODOTUS_READER *);
+size_t herodotus_reader_next_codepoint_break(const HERODOTUS_READER *);
+enum herodotus_status herodotus_read_codepoint(HERODOTUS_READER *, bool, uint_least32_t *);
+
+void herodotus_writer_init(HERODOTUS_WRITER *, enum herodotus_type, void *,
+ size_t);
+void herodotus_writer_nul_terminate(HERODOTUS_WRITER *);
+size_t herodotus_writer_number_written(const HERODOTUS_WRITER *);
+void herodotus_write_codepoint(HERODOTUS_WRITER *, uint_least32_t);
+
+void proper_init(const HERODOTUS_READER *, void *, uint_least8_t,
+ uint_least8_t (*get_break_prop)(uint_least32_t),
+ bool (*is_skippable_prop)(uint_least8_t),
+ void (*skip_shift_callback)(uint_least8_t, void *),
+ struct proper *);
+int proper_advance(struct proper *);
#endif /* UTIL_H */
diff --git a/src/word.c b/src/word.c
@@ -1,331 +1,242 @@
/* See LICENSE file for copyright and license details. */
#include <stdbool.h>
#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
#include "../gen/word.h"
#include "../grapheme.h"
#include "util.h"
-static inline enum word_break_property
-get_break_prop(uint_least32_t cp)
+struct word_break_state
+{
+ bool ri_even;
+};
+
+static inline uint_least8_t
+get_word_break_prop(uint_least32_t cp)
{
if (likely(cp <= 0x10FFFF)) {
- return (enum word_break_property)
+ return (uint_least8_t)
word_break_minor[word_break_major[cp >> 8] + (cp & 0xff)];
} else {
return WORD_BREAK_PROP_OTHER;
}
}
-static size_t
-next_word_break(const void *str, size_t len, size_t (*get_codepoint)
- (const void *, size_t, size_t, uint_least32_t *))
+static bool
+is_skippable_word_prop(uint_least8_t prop)
{
- struct {
- enum word_break_property a, b, c, d;
- } raw, skip;
- enum word_break_property res;
- uint_least32_t cp;
- size_t off, tmp, new_off;
- bool ri_even = true;
-
- /* check degenerate cases */
- if (str == NULL || len == 0) {
- return 0;
- }
-
- /*
- * Apply word breaking algorithm (UAX #29), see
- * https://unicode.org/reports/tr29/#Word_Boundary_Rules
- *
- * There are 4 slots (a, b, c, d) of "break" properties and
- * we check if there is a break in the middle between b and c.
- *
- * The position of this middle spot is determined by off,
- * which gives the offset of the first element on the right
- * hand side of said spot, or, in other words, gives the number
- * of elements on the left hand side.
- *
- * It is further complicated by the fact that the algorithm
- * expects you to skip certain characters for the second
- * half of the rules (after WB4). Thus, we do not only have
- * the "raw" properties as described above, but also the "skip"
- * properties, where the skip.a and skip.b, for instance,
- * give the two preceding character properties behind the
- * currently investigated breakpoint.
- *
- */
+ return prop == WORD_BREAK_PROP_EXTEND ||
+ prop == WORD_BREAK_PROP_FORMAT ||
+ prop == WORD_BREAK_PROP_ZWJ;
+}
- /*
- * Initialize the different properties such that we have
- * a good state after the state-update in the loop
- */
- raw.b = NUM_WORD_BREAK_PROPS;
- if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
- return 1;
- }
- raw.c = get_break_prop(cp);
- (void)get_codepoint(str, len, off, &cp);
- raw.d = get_break_prop(cp);
- skip.a = skip.b = NUM_WORD_BREAK_PROPS;
+static void
+word_skip_shift_callback(uint_least8_t prop, void *s)
+{
+ struct word_break_state *state = (struct word_break_state *)s;
- for (; off < len; off = new_off) {
+ if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
/*
- * Update left side (a and b) of the skip state by
- * "shifting in" the raw.c property as long as it is
- * not one of the "ignored" character properties.
- * While at it, update the RI-counter.
+ * The property we just shifted in is
+ * a regional indicator, increasing the
+ * number of consecutive RIs on the left
+ * side of the breakpoint by one, changing
+ * the oddness.
*
*/
- if (raw.c != WORD_BREAK_PROP_EXTEND &&
- raw.c != WORD_BREAK_PROP_FORMAT &&
- raw.c != WORD_BREAK_PROP_ZWJ) {
- skip.a = skip.b;
- skip.b = raw.c;
-
- if (skip.b == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
- /*
- * The property we just shifted in is
- * a regional indicator, increasing the
- * number of consecutive RIs on the left
- * side of the breakpoint by one, changing
- * the oddness.
- *
- */
- ri_even = !ri_even;
- } else {
- /*
- * We saw no regional indicator, so the
- * number of consecutive RIs on the left
- * side of the breakpoint is zero, which
- * is an even number.
- *
- */
- ri_even = true;
- }
- }
-
+ state->ri_even = !(state->ri_even);
+ } else {
/*
- * Update right side (b and c) of the skip state by
- * starting at the breakpoint and detecting the two
- * following non-ignored character classes
+ * We saw no regional indicator, so the
+ * number of consecutive RIs on the left
+ * side of the breakpoint is zero, which
+ * is an even number.
*
*/
- skip.c = NUM_WORD_BREAK_PROPS;
- for (tmp = off; tmp < len; ) {
- tmp += get_codepoint(str, len, tmp, &cp);
- res = get_break_prop(cp);
-
- if (res != WORD_BREAK_PROP_EXTEND &&
- res != WORD_BREAK_PROP_FORMAT &&
- res != WORD_BREAK_PROP_ZWJ) {
- skip.c = res;
- break;
- }
- }
- skip.d = NUM_WORD_BREAK_PROPS;
- for (; tmp < len; ) {
- tmp += get_codepoint(str, len, tmp, &cp);
- res = get_break_prop(cp);
-
- if (res != WORD_BREAK_PROP_EXTEND &&
- res != WORD_BREAK_PROP_FORMAT &&
- res != WORD_BREAK_PROP_ZWJ) {
- skip.d = res;
- break;
- }
- }
+ state->ri_even = true;
+ }
+}
- /*
- * Update the raw state by simply shifting everything
- * in and, if we still have data left, determining
- * the character class of the next codepoint.
- *
- */
- raw.a = raw.b;
- raw.b = raw.c;
- raw.c = raw.d;
- if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) {
- get_codepoint(str, len, new_off, &cp);
- raw.d = get_break_prop(cp);
- } else {
- raw.d = NUM_WORD_BREAK_PROPS;
- }
+static size_t
+next_word_break(HERODOTUS_READER *r)
+{
+ struct proper p;
+ struct word_break_state state = { .ri_even = true };
+ /*
+ * Apply word breaking algorithm (UAX #29), see
+ * https://unicode.org/reports/tr29/#Word_Boundary_Rules
+ */
+ proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop,
+ is_skippable_word_prop, word_skip_shift_callback, &p);
+
+ while (!proper_advance(&p)) {
/* WB3 */
- if (raw.b == WORD_BREAK_PROP_CR &&
- raw.c == WORD_BREAK_PROP_LF) {
+ if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR &&
+ p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
continue;
}
/* WB3a */
- if (raw.b == WORD_BREAK_PROP_NEWLINE ||
- raw.b == WORD_BREAK_PROP_CR ||
- raw.b == WORD_BREAK_PROP_LF) {
+ if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE ||
+ p.raw.prev_prop[0] == WORD_BREAK_PROP_CR ||
+ p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) {
break;
}
/* WB3b */
- if (raw.c == WORD_BREAK_PROP_NEWLINE ||
- raw.c == WORD_BREAK_PROP_CR ||
- raw.c == WORD_BREAK_PROP_LF) {
+ if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE ||
+ p.raw.next_prop[0] == WORD_BREAK_PROP_CR ||
+ p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
break;
}
/* WB3c */
- if (raw.b == WORD_BREAK_PROP_ZWJ &&
- (raw.c == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
- raw.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
+ if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ &&
+ (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
+ p.raw.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
continue;
}
/* WB3d */
- if (raw.b == WORD_BREAK_PROP_WSEGSPACE &&
- raw.c == WORD_BREAK_PROP_WSEGSPACE) {
+ if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE &&
+ p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) {
continue;
}
/* WB4 */
- if (raw.c == WORD_BREAK_PROP_EXTEND ||
- raw.c == WORD_BREAK_PROP_FORMAT ||
- raw.c == WORD_BREAK_PROP_ZWJ) {
+ if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND ||
+ p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT ||
+ p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) {
continue;
}
/* WB5 */
- if ((skip.b == WORD_BREAK_PROP_ALETTER ||
- skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
- (skip.c == WORD_BREAK_PROP_ALETTER ||
- skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
continue;
}
/* WB6 */
- if ((skip.b == WORD_BREAK_PROP_ALETTER ||
- skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
- (skip.c == WORD_BREAK_PROP_MIDLETTER ||
- skip.c == WORD_BREAK_PROP_MIDNUMLET ||
- skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
- len > 2 &&
- (skip.d == WORD_BREAK_PROP_ALETTER ||
- skip.d == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.d == WORD_BREAK_PROP_HEBREW_LETTER)) {
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+ (p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.next_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
continue;
}
/* WB7 */
- if ((skip.b == WORD_BREAK_PROP_MIDLETTER ||
- skip.b == WORD_BREAK_PROP_MIDNUMLET ||
- skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
- (skip.c == WORD_BREAK_PROP_ALETTER ||
- skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.c == WORD_BREAK_PROP_HEBREW_LETTER) &&
- len > 2 &&
- (skip.a == WORD_BREAK_PROP_ALETTER ||
- skip.a == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.a == WORD_BREAK_PROP_HEBREW_LETTER)) {
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
+ (p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.prev_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
continue;
}
/* WB7a */
- if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
- skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) {
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) {
continue;
}
/* WB7b */
- if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
- skip.c == WORD_BREAK_PROP_DOUBLE_QUOTE &&
- len > 2 &&
- skip.d == WORD_BREAK_PROP_HEBREW_LETTER) {
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
+ p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
continue;
}
/* WB7c */
- if (skip.b == WORD_BREAK_PROP_DOUBLE_QUOTE &&
- skip.c == WORD_BREAK_PROP_HEBREW_LETTER &&
- off > 1 &&
- skip.a == WORD_BREAK_PROP_HEBREW_LETTER) {
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
+ p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
continue;
}
/* WB8 */
- if (skip.b == WORD_BREAK_PROP_NUMERIC &&
- skip.c == WORD_BREAK_PROP_NUMERIC) {
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
continue;
}
/* WB9 */
- if ((skip.b == WORD_BREAK_PROP_ALETTER ||
- skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
- skip.c == WORD_BREAK_PROP_NUMERIC) {
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
continue;
}
/* WB10 */
- if (skip.b == WORD_BREAK_PROP_NUMERIC &&
- (skip.c == WORD_BREAK_PROP_ALETTER ||
- skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
continue;
}
/* WB11 */
- if ((skip.b == WORD_BREAK_PROP_MIDNUM ||
- skip.b == WORD_BREAK_PROP_MIDNUMLET ||
- skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
- skip.c == WORD_BREAK_PROP_NUMERIC &&
- off > 1 &&
- skip.a == WORD_BREAK_PROP_NUMERIC) {
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC &&
+ p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) {
continue;
}
/* WB12 */
- if (skip.b == WORD_BREAK_PROP_NUMERIC &&
- (skip.c == WORD_BREAK_PROP_MIDNUM ||
- skip.c == WORD_BREAK_PROP_MIDNUMLET ||
- skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
- len > 2 &&
- skip.d == WORD_BREAK_PROP_NUMERIC) {
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+ p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) {
continue;
}
/* WB13 */
- if (skip.b == WORD_BREAK_PROP_KATAKANA &&
- skip.c == WORD_BREAK_PROP_KATAKANA) {
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) {
continue;
}
/* WB13a */
- if ((skip.b == WORD_BREAK_PROP_ALETTER ||
- skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.b == WORD_BREAK_PROP_HEBREW_LETTER ||
- skip.b == WORD_BREAK_PROP_NUMERIC ||
- skip.b == WORD_BREAK_PROP_KATAKANA ||
- skip.b == WORD_BREAK_PROP_EXTENDNUMLET) &&
- skip.c == WORD_BREAK_PROP_EXTENDNUMLET) {
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) {
continue;
}
/* WB13b */
- if (skip.b == WORD_BREAK_PROP_EXTENDNUMLET &&
- (skip.c == WORD_BREAK_PROP_ALETTER ||
- skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.c == WORD_BREAK_PROP_HEBREW_LETTER ||
- skip.c == WORD_BREAK_PROP_NUMERIC ||
- skip.c == WORD_BREAK_PROP_KATAKANA)) {
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET &&
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) {
continue;
}
/* WB15 and WB16 */
- if (!ri_even &&
- skip.c == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
+ if (!state.ri_even &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
continue;
}
@@ -333,17 +244,25 @@ next_word_break(const void *str, size_t len, size_t (*get_codepoint)
break;
}
- return off;
+ return herodotus_reader_number_read(&(p.mid_reader));
}
size_t
grapheme_next_word_break(const uint_least32_t *str, size_t len)
{
- return next_word_break(str, len, get_codepoint);
+ HERODOTUS_READER r;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
+
+ return next_word_break(&r);
}
size_t
grapheme_next_word_break_utf8(const char *str, size_t len)
{
- return next_word_break(str, len, get_codepoint_utf8);
+ HERODOTUS_READER r;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
+
+ return next_word_break(&r);
}
diff --git a/test/case.c b/test/case.c
@@ -0,0 +1,580 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "../grapheme.h"
+#include "util.h"
+
+struct unit_test_is_case_utf8 {
+ const char *description;
+ struct {
+ const char *src;
+ size_t srclen;
+ } input;
+ struct {
+ bool ret;
+ size_t caselen;
+ } output;
+};
+
+struct unit_test_to_case_utf8 {
+ const char *description;
+ struct {
+ const char *src;
+ size_t srclen;
+ size_t destlen;
+ } input;
+ struct {
+ const char *dest;
+ size_t ret;
+ } output;
+};
+
+static const struct unit_test_is_case_utf8 is_lowercase_utf8[] = {
+ {
+ .description = "empty input",
+ .input = { "", 0 },
+ .output = { true, 0 },
+ },
+ {
+ .description = "one character, violation",
+ .input = { "A", 1 },
+ .output = { false, 0 },
+ },
+ {
+ .description = "one character, confirmation",
+ .input = { "\xC3\x9F", 2 },
+ .output = { true, 2 },
+ },
+ {
+ .description = "one character, violation, NUL-terminated",
+ .input = { "A", SIZE_MAX },
+ .output = { false, 0 },
+ },
+ {
+ .description = "one character, confirmation, NUL-terminated",
+ .input = { "\xC3\x9F", SIZE_MAX },
+ .output = { true, 2 },
+ },
+ {
+ .description = "one word, violation",
+ .input = { "Hello", 5 },
+ .output = { false, 0 },
+ },
+ {
+ .description = "one word, partial confirmation",
+ .input = { "gru" "\xC3\x9F" "fOrmel", 11 },
+ .output = { false, 6 },
+ },
+ {
+ .description = "one word, full confirmation",
+ .input = { "gru" "\xC3\x9F" "formel", 11 },
+ .output = { true, 11 },
+ },
+ {
+ .description = "one word, violation, NUL-terminated",
+ .input = { "Hello", SIZE_MAX },
+ .output = { false, 0 },
+ },
+ {
+ .description = "one word, partial confirmation, NUL-terminated",
+ .input = { "gru" "\xC3\x9F" "fOrmel", SIZE_MAX },
+ .output = { false, 6 },
+ },
+ {
+ .description = "one word, full confirmation, NUL-terminated",
+ .input = { "gru" "\xC3\x9F" "formel", SIZE_MAX },
+ .output = { true, 11 },
+ },
+};
+
+static const struct unit_test_is_case_utf8 is_uppercase_utf8[] = {
+ {
+ .description = "empty input",
+ .input = { "", 0 },
+ .output = { true, 0 },
+ },
+ {
+ .description = "one character, violation",
+ .input = { "\xC3\x9F", 2 },
+ .output = { false, 0 },
+ },
+ {
+ .description = "one character, confirmation",
+ .input = { "A", 1 },
+ .output = { true, 1 },
+ },
+ {
+ .description = "one character, violation, NUL-terminated",
+ .input = { "\xC3\x9F", SIZE_MAX },
+ .output = { false, 0 },
+ },
+ {
+ .description = "one character, confirmation, NUL-terminated",
+ .input = { "A", SIZE_MAX },
+ .output = { true, 1 },
+ },
+ {
+ .description = "one word, violation",
+ .input = { "hello", 5 },
+ .output = { false, 0 },
+ },
+ {
+ .description = "one word, partial confirmation",
+ .input = { "GRU" "\xC3\x9F" "formel", 11 },
+ .output = { false, 3 },
+ },
+ {
+ .description = "one word, full confirmation",
+ .input = { "HELLO", 5 },
+ .output = { true, 5 },
+ },
+ {
+ .description = "one word, violation, NUL-terminated",
+ .input = { "hello", SIZE_MAX },
+ .output = { false, 0 },
+ },
+ {
+ .description = "one word, partial confirmation, NUL-terminated",
+ .input = { "GRU" "\xC3\x9F" "formel", SIZE_MAX },
+ .output = { false, 3 },
+ },
+ {
+ .description = "one word, full confirmation, NUL-terminated",
+ .input = { "HELLO", SIZE_MAX },
+ .output = { true, 5 },
+ },
+};
+
+static const struct unit_test_is_case_utf8 is_titlecase_utf8[] = {
+ {
+ .description = "empty input",
+ .input = { "", 0 },
+ .output = { true, 0 },
+ },
+ {
+ .description = "one character, violation",
+ .input = { "\xC3\x9F", 2 },
+ .output = { false, 0 },
+ },
+ {
+ .description = "one character, confirmation",
+ .input = { "A", 1 },
+ .output = { true, 1 },
+ },
+ {
+ .description = "one character, violation, NUL-terminated",
+ .input = { "\xC3\x9F", SIZE_MAX },
+ .output = { false, 0 },
+ },
+ {
+ .description = "one character, confirmation, NUL-terminated",
+ .input = { "A", SIZE_MAX },
+ .output = { true, 1 },
+ },
+ {
+ .description = "one word, violation",
+ .input = { "hello", 5 },
+ .output = { false, 0 },
+ },
+ {
+ .description = "one word, partial confirmation",
+ .input = { "Gru" "\xC3\x9F" "fOrmel", 11 },
+ .output = { false, 6 },
+ },
+ {
+ .description = "one word, full confirmation",
+ .input = { "Gru" "\xC3\x9F" "formel", 11 },
+ .output = { true, 11 },
+ },
+ {
+ .description = "one word, violation, NUL-terminated",
+ .input = { "hello", SIZE_MAX },
+ .output = { false, 0 },
+ },
+ {
+ .description = "one word, partial confirmation, NUL-terminated",
+ .input = { "Gru" "\xC3\x9F" "fOrmel", SIZE_MAX },
+ .output = { false, 6 },
+ },
+ {
+ .description = "one word, full confirmation, NUL-terminated",
+ .input = { "Gru" "\xC3\x9F" "formel", SIZE_MAX },
+ .output = { true, 11 },
+ },
+ {
+ .description = "multiple words, partial confirmation",
+ .input = { "Hello Gru" "\xC3\x9F" "fOrmel!", 18 },
+ .output = { false, 12 },
+ },
+ {
+ .description = "multiple words, full confirmation",
+ .input = { "Hello Gru" "\xC3\x9F" "formel!", 18 },
+ .output = { true, 18 },
+ },
+ {
+ .description = "multiple words, partial confirmation, NUL-terminated",
+ .input = { "Hello Gru" "\xC3\x9F" "fOrmel!", SIZE_MAX },
+ .output = { false, 12 },
+ },
+ {
+ .description = "multiple words, full confirmation, NUL-terminated",
+ .input = { "Hello Gru" "\xC3\x9F" "formel!", SIZE_MAX },
+ .output = { true, 18 },
+ },
+};
+
+static const struct unit_test_to_case_utf8 to_lowercase_utf8[] = {
+ {
+ .description = "empty input",
+ .input = { "", 0, 10 },
+ .output = { "", 0 },
+ },
+ {
+ .description = "empty output",
+ .input = { "hello", 5, 0 },
+ .output = { "", 5 },
+ },
+ {
+ .description = "one character, conversion",
+ .input = { "A", 1, 10 },
+ .output = { "a", 1 },
+ },
+ {
+ .description = "one character, no conversion",
+ .input = { "\xC3\x9F", 2, 10 },
+ .output = { "\xC3\x9F", 2 },
+ },
+ {
+ .description = "one character, conversion, truncation",
+ .input = { "A", 1, 0 },
+ .output = { "", 1 },
+ },
+ {
+ .description = "one character, conversion, NUL-terminated",
+ .input = { "A", SIZE_MAX, 10 },
+ .output = { "a", 1 },
+ },
+ {
+ .description = "one character, no conversion, NUL-terminated",
+ .input = { "\xC3\x9F", SIZE_MAX, 10 },
+ .output = { "\xC3\x9F", 2 },
+ },
+ {
+ .description = "one character, conversion, NUL-terminated, truncation",
+ .input = { "A", SIZE_MAX, 0 },
+ .output = { "", 1 },
+ },
+ {
+ .description = "one word, conversion",
+ .input = { "wOrD", 4, 10 },
+ .output = { "word", 4 },
+ },
+ {
+ .description = "one word, no conversion",
+ .input = { "word", 4, 10 },
+ .output = { "word", 4 },
+ },
+ {
+ .description = "one word, conversion, truncation",
+ .input = { "wOrD", 4, 3 },
+ .output = { "wo", 4 },
+ },
+ {
+ .description = "one word, conversion, NUL-terminated",
+ .input = { "wOrD", SIZE_MAX, 10 },
+ .output = { "word", 4 },
+ },
+ {
+ .description = "one word, no conversion, NUL-terminated",
+ .input = { "word", SIZE_MAX, 10 },
+ .output = { "word", 4 },
+ },
+ {
+ .description = "one word, conversion, NUL-terminated, truncation",
+ .input = { "wOrD", SIZE_MAX, 3 },
+ .output = { "wo", 4 },
+ },
+};
+
+static const struct unit_test_to_case_utf8 to_uppercase_utf8[] = {
+ {
+ .description = "empty input",
+ .input = { "", 0, 10 },
+ .output = { "", 0 },
+ },
+ {
+ .description = "empty output",
+ .input = { "hello", 5, 0 },
+ .output = { "", 5 },
+ },
+ {
+ .description = "one character, conversion",
+ .input = { "\xC3\x9F", 2, 10 },
+ .output = { "SS", 2 },
+ },
+ {
+ .description = "one character, no conversion",
+ .input = { "A", 1, 10 },
+ .output = { "A", 1 },
+ },
+ {
+ .description = "one character, conversion, truncation",
+ .input = { "\xC3\x9F", 2, 0 },
+ .output = { "", 2 },
+ },
+ {
+ .description = "one character, conversion, NUL-terminated",
+ .input = { "\xC3\x9F", SIZE_MAX, 10 },
+ .output = { "SS", 2 },
+ },
+ {
+ .description = "one character, no conversion, NUL-terminated",
+ .input = { "A", SIZE_MAX, 10 },
+ .output = { "A", 1 },
+ },
+ {
+ .description = "one character, conversion, NUL-terminated, truncation",
+ .input = { "\xC3\x9F", SIZE_MAX, 0 },
+ .output = { "", 2 },
+ },
+ {
+ .description = "one word, conversion",
+ .input = { "gRu" "\xC3\x9F" "fOrMel", 11, 15 },
+ .output = { "GRUSSFORMEL", 11 },
+ },
+ {
+ .description = "one word, no conversion",
+ .input = { "WORD", 4, 10 },
+ .output = { "WORD", 4 },
+ },
+ {
+ .description = "one word, conversion, truncation",
+ .input = { "gRu" "\xC3\x9F" "formel", 11, 5 },
+ .output = { "GRUS", 11 },
+ },
+ {
+ .description = "one word, conversion, NUL-terminated",
+ .input = { "gRu" "\xC3\x9F" "formel", SIZE_MAX, 15 },
+ .output = { "GRUSSFORMEL", 11 },
+ },
+ {
+ .description = "one word, no conversion, NUL-terminated",
+ .input = { "WORD", SIZE_MAX, 10 },
+ .output = { "WORD", 4 },
+ },
+ {
+ .description = "one word, conversion, NUL-terminated, truncation",
+ .input = { "gRu" "\xC3\x9F" "formel", SIZE_MAX, 5 },
+ .output = { "GRUS", 11 },
+ },
+};
+
+static const struct unit_test_to_case_utf8 to_titlecase_utf8[] = {
+ {
+ .description = "empty input",
+ .input = { "", 0, 10 },
+ .output = { "", 0 },
+ },
+ {
+ .description = "empty output",
+ .input = { "hello", 5, 0 },
+ .output = { "", 5 },
+ },
+ {
+ .description = "one character, conversion",
+ .input = { "a", 1, 10 },
+ .output = { "A", 1 },
+ },
+ {
+ .description = "one character, no conversion",
+ .input = { "A", 1, 10 },
+ .output = { "A", 1 },
+ },
+ {
+ .description = "one character, conversion, truncation",
+ .input = { "a", 1, 0 },
+ .output = { "", 1 },
+ },
+ {
+ .description = "one character, conversion, NUL-terminated",
+ .input = { "a", SIZE_MAX, 10 },
+ .output = { "A", 1 },
+ },
+ {
+ .description = "one character, no conversion, NUL-terminated",
+ .input = { "A", SIZE_MAX, 10 },
+ .output = { "A", 1 },
+ },
+ {
+ .description = "one character, conversion, NUL-terminated, truncation",
+ .input = { "a", SIZE_MAX, 0 },
+ .output = { "", 1 },
+ },
+ {
+ .description = "one word, conversion",
+ .input = { "heLlo", 5, 10 },
+ .output = { "Hello", 5 },
+ },
+ {
+ .description = "one word, no conversion",
+ .input = { "Hello", 5, 10 },
+ .output = { "Hello", 5 },
+ },
+ {
+ .description = "one word, conversion, truncation",
+ .input = { "heLlo", 5, 2 },
+ .output = { "H", 5 },
+ },
+ {
+ .description = "one word, conversion, NUL-terminated",
+ .input = { "heLlo", SIZE_MAX, 10 },
+ .output = { "Hello", 5 },
+ },
+ {
+ .description = "one word, no conversion, NUL-terminated",
+ .input = { "Hello", SIZE_MAX, 10 },
+ .output = { "Hello", 5 },
+ },
+ {
+ .description = "one word, conversion, NUL-terminated, truncation",
+ .input = { "heLlo", SIZE_MAX, 3 },
+ .output = { "He", 5 },
+ },
+ {
+ .description = "two words, conversion",
+ .input = { "heLlo wORLd!", 12, 20 },
+ .output = { "Hello World!", 12 },
+ },
+ {
+ .description = "two words, no conversion",
+ .input = { "Hello World!", 12, 20 },
+ .output = { "Hello World!", 12 },
+ },
+ {
+ .description = "two words, conversion, truncation",
+ .input = { "heLlo wORLd!", 12, 8 },
+ .output = { "Hello W", 12 },
+ },
+ {
+ .description = "two words, conversion, NUL-terminated",
+ .input = { "heLlo wORLd!", SIZE_MAX, 20 },
+ .output = { "Hello World!", 12 },
+ },
+ {
+ .description = "two words, no conversion, NUL-terminated",
+ .input = { "Hello World!", SIZE_MAX, 20 },
+ .output = { "Hello World!", 12 },
+ },
+ {
+ .description = "two words, conversion, NUL-terminated, truncation",
+ .input = { "heLlo wORLd!", SIZE_MAX, 4 },
+ .output = { "Hel", 12 },
+ },
+};
+
+static int
+unit_test_callback_is_case_utf8(const void *t, size_t off, const char *name,
+ const char *argv0)
+{
+ const struct unit_test_is_case_utf8 *test =
+ (const struct unit_test_is_case_utf8 *)t + off;
+ bool ret = false;
+ size_t caselen = 0x7f;
+
+ if (t == is_lowercase_utf8) {
+ ret = grapheme_is_lowercase_utf8(test->input.src, test->input.srclen,
+ &caselen);
+ } else if (t == is_uppercase_utf8) {
+ ret = grapheme_is_uppercase_utf8(test->input.src, test->input.srclen,
+ &caselen);
+ } else if (t == is_titlecase_utf8) {
+ ret = grapheme_is_titlecase_utf8(test->input.src, test->input.srclen,
+ &caselen);
+
+ } else {
+ goto err;
+ }
+
+ /* check results */
+ if (ret != test->output.ret || caselen != test->output.caselen) {
+ goto err;
+ }
+
+ return 0;
+err:
+ fprintf(stderr, "%s: %s: Failed unit test %zu \"%s\" "
+ "(returned (%s, %zu) instead of (%s, %zu)).\n", argv0,
+ name, off, test->description, ret ? "true" : "false",
+ caselen, test->output.ret ? "true" : "false",
+ test->output.caselen);
+ return 1;
+}
+
+static int
+unit_test_callback_to_case_utf8(const void *t, size_t off, const char *name,
+ const char *argv0)
+{
+ const struct unit_test_to_case_utf8 *test =
+ (const struct unit_test_to_case_utf8 *)t + off;
+ size_t ret = 0, i;
+ char buf[512];
+
+ /* fill the array with canary values */
+ memset(buf, 0x7f, LEN(buf));
+
+ if (t == to_lowercase_utf8) {
+ ret = grapheme_to_lowercase_utf8(test->input.src, test->input.srclen,
+ buf, test->input.destlen);
+ } else if (t == to_uppercase_utf8) {
+ ret = grapheme_to_uppercase_utf8(test->input.src, test->input.srclen,
+ buf, test->input.destlen);
+ } else if (t == to_titlecase_utf8) {
+ ret = grapheme_to_titlecase_utf8(test->input.src, test->input.srclen,
+ buf, test->input.destlen);
+ } else {
+ goto err;
+ }
+
+ /* check results */
+ if (ret != test->output.ret ||
+ memcmp(buf, test->output.dest, MIN(test->input.destlen, test->output.ret))) {
+ goto err;
+ }
+
+ /* check that none of the canary values have been overwritten */
+ for (i = test->input.destlen; i < LEN(buf); i++) {
+ if (buf[i] != 0x7f) {
+ goto err;
+ }
+ }
+
+ return 0;
+err:
+ fprintf(stderr, "%s: %s: Failed unit test %zu \"%s\" "
+ "(returned (\"%.*s\", %zu) instead of (\"%.*s\", %zu)).\n", argv0,
+ name, off, test->description, (int)ret, buf, ret,
+ (int)test->output.ret, test->output.dest, test->output.ret);
+ return 1;
+}
+
+int
+main(int argc, char *argv[])
+{
+ (void)argc;
+
+ return run_unit_tests(unit_test_callback_is_case_utf8, is_lowercase_utf8,
+ LEN(is_lowercase_utf8), "grapheme_is_lowercase_utf8", argv[0]) +
+ run_unit_tests(unit_test_callback_is_case_utf8, is_uppercase_utf8,
+ LEN(is_uppercase_utf8), "grapheme_is_uppercase_utf8", argv[0]) +
+ run_unit_tests(unit_test_callback_is_case_utf8, is_titlecase_utf8,
+ LEN(is_titlecase_utf8), "grapheme_is_titlecase_utf8", argv[0]) +
+ run_unit_tests(unit_test_callback_to_case_utf8, to_lowercase_utf8,
+ LEN(to_lowercase_utf8), "grapheme_to_lowercase_utf8", argv[0]) +
+ run_unit_tests(unit_test_callback_to_case_utf8, to_uppercase_utf8,
+ LEN(to_uppercase_utf8), "grapheme_to_uppercase_utf8", argv[0]) +
+ run_unit_tests(unit_test_callback_to_case_utf8, to_titlecase_utf8,
+ LEN(to_titlecase_utf8), "grapheme_to_titlecase_utf8", argv[0]);
+}
diff --git a/test/character.c b/test/character.c
@@ -6,12 +6,121 @@
#include "../grapheme.h"
#include "util.h"
+static const struct unit_test_next_break next_character_break[] = {
+ {
+ .description = "NULL input",
+ .input = {
+ .src = NULL,
+ .srclen = 0,
+ },
+ .output = { 0 },
+ },
+ {
+ .description = "empty input",
+ .input = {
+ .src = (uint_least32_t *)(uint_least32_t[]){ 0x0 },
+ .srclen = 0,
+ },
+ .output = { 0 },
+ },
+ {
+ .description = "empty input, null-terminated",
+ .input = {
+ .src = (uint_least32_t *)(uint_least32_t[]){ 0x0 },
+ .srclen = SIZE_MAX,
+ },
+ .output = { 0 },
+ },
+ {
+ .description = "one character",
+ .input = {
+ .src = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x2A },
+ .srclen = 3,
+ },
+ .output = { 2 },
+ },
+ {
+ .description = "one character, null-terminated",
+ .input = {
+ .src = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x0 },
+ .srclen = SIZE_MAX,
+ },
+ .output = { 2 },
+ },
+};
+
+static const struct unit_test_next_break_utf8 next_character_break_utf8[] = {
+ {
+ .description = "NULL input",
+ .input = {
+ .src = NULL,
+ .srclen = 0,
+ },
+ .output = { 0 },
+ },
+ {
+ .description = "empty input",
+ .input = { "", 0 },
+ .output = { 0 },
+ },
+ {
+ .description = "empty input, NUL-terminated",
+ .input = { "", SIZE_MAX },
+ .output = { 0 },
+ },
+ {
+ .description = "one character",
+ .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA*", 9 },
+ .output = { 8 },
+ },
+ {
+ .description = "one character, fragment",
+ .input = { "\xF0\x9F\x87\xA9\xF0", 5 },
+ .output = { 4 },
+ },
+ {
+ .description = "one character, NUL-terminated",
+ .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA", SIZE_MAX },
+ .output = { 8 },
+ },
+ {
+ .description = "one character, fragment, NUL-terminated",
+ .input = { "\xF0\x9F\x87\xA9\xF0\x9F", SIZE_MAX },
+ .output = { 4 },
+ },
+};
+
+static int
+unit_test_callback_next_character_break(const void *t, size_t off,
+ const char *name,
+ const char *argv0)
+{
+ return unit_test_callback_next_break(t, off,
+ grapheme_next_character_break,
+ name, argv0);
+}
+
+static int
+unit_test_callback_next_character_break_utf8(const void *t, size_t off,
+ const char *name,
+ const char *argv0)
+{
+ return unit_test_callback_next_break_utf8(t, off,
+ grapheme_next_character_break_utf8,
+ name, argv0);
+}
+
int
main(int argc, char *argv[])
{
(void)argc;
return run_break_tests(grapheme_next_character_break,
- character_break_test,
- LEN(character_break_test), argv[0]);
+ character_break_test, LEN(character_break_test), argv[0]) +
+ run_unit_tests(unit_test_callback_next_character_break,
+ next_character_break, LEN(next_character_break),
+ "grapheme_next_character_break", argv[0]) +
+ run_unit_tests(unit_test_callback_next_character_break_utf8,
+ next_character_break_utf8, LEN(next_character_break_utf8),
+ "grapheme_next_character_break_utf8", argv[0]);
}
diff --git a/test/line.c b/test/line.c
@@ -6,6 +6,110 @@
#include "../grapheme.h"
#include "util.h"
+static const struct unit_test_next_break next_line_break[] = {
+ {
+ .description = "NULL input",
+ .input = {
+ .src = NULL,
+ .srclen = 0,
+ },
+ .output = { 0 },
+ },
+ {
+ .description = "empty input",
+ .input = {
+ .src = (uint_least32_t *)(uint_least32_t[]){ 0x0 },
+ .srclen = 0,
+ },
+ .output = { 0 },
+ },
+ {
+ .description = "empty input, null-terminated",
+ .input = {
+ .src = (uint_least32_t *)(uint_least32_t[]){ 0x0 },
+ .srclen = SIZE_MAX,
+ },
+ .output = { 0 },
+ },
+ {
+ .description = "one opportunity",
+ .input = {
+ .src = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x20, 0x2A },
+ .srclen = 4,
+ },
+ .output = { 3 },
+ },
+ {
+ .description = "one opportunity, null-terminated",
+ .input = {
+ .src = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x20, 0x2A, 0x0 },
+ .srclen = SIZE_MAX,
+ },
+ .output = { 3 },
+ },
+};
+
+static const struct unit_test_next_break_utf8 next_line_break_utf8[] = {
+ {
+ .description = "NULL input",
+ .input = {
+ .src = NULL,
+ .srclen = 0,
+ },
+ .output = { 0 },
+ },
+ {
+ .description = "empty input",
+ .input = { "", 0 },
+ .output = { 0 },
+ },
+ {
+ .description = "empty input, NUL-terminated",
+ .input = { "", SIZE_MAX },
+ .output = { 0 },
+ },
+ {
+ .description = "one opportunity",
+ .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA *", 10 },
+ .output = { 9 },
+ },
+ {
+ .description = "one opportunity, fragment",
+ .input = { "\xF0\x9F\x87\xA9\xF0", 5 },
+ .output = { 4 },
+ },
+ {
+ .description = "one opportunity, NUL-terminated",
+ .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA A", SIZE_MAX },
+ .output = { 9 },
+ },
+ {
+ .description = "one opportunity, fragment, NUL-terminated",
+ .input = { "\xF0\x9F\x87\xA9\xF0\x9F", SIZE_MAX },
+ .output = { 4 },
+ },
+};
+
+static int
+unit_test_callback_next_line_break(const void *t, size_t off,
+ const char *name,
+ const char *argv0)
+{
+ return unit_test_callback_next_break(t, off,
+ grapheme_next_line_break,
+ name, argv0);
+}
+
+static int
+unit_test_callback_next_line_break_utf8(const void *t, size_t off,
+ const char *name,
+ const char *argv0)
+{
+ return unit_test_callback_next_break_utf8(t, off,
+ grapheme_next_line_break_utf8,
+ name, argv0);
+}
+
int
main(int argc, char *argv[])
{
@@ -13,5 +117,11 @@ main(int argc, char *argv[])
return run_break_tests(grapheme_next_line_break,
line_break_test, LEN(line_break_test),
- argv[0]);
+ argv[0]) +
+ run_unit_tests(unit_test_callback_next_line_break,
+ next_line_break, LEN(next_line_break),
+ "grapheme_next_line_break", argv[0]) +
+ run_unit_tests(unit_test_callback_next_line_break_utf8,
+ next_line_break_utf8, LEN(next_line_break_utf8),
+ "grapheme_next_line_break_utf8", argv[0]);
}
diff --git a/test/sentence.c b/test/sentence.c
@@ -6,6 +6,110 @@
#include "../grapheme.h"
#include "util.h"
+static const struct unit_test_next_break next_sentence_break[] = {
+ {
+ .description = "NULL input",
+ .input = {
+ .src = NULL,
+ .srclen = 0,
+ },
+ .output = { 0 },
+ },
+ {
+ .description = "empty input",
+ .input = {
+ .src = (uint_least32_t *)(uint_least32_t[]){ 0x0 },
+ .srclen = 0,
+ },
+ .output = { 0 },
+ },
+ {
+ .description = "empty input, null-terminated",
+ .input = {
+ .src = (uint_least32_t *)(uint_least32_t[]){ 0x0 },
+ .srclen = SIZE_MAX,
+ },
+ .output = { 0 },
+ },
+ {
+ .description = "one sentence",
+ .input = {
+ .src = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x2E, 0x20, 0x2A },
+ .srclen = 5,
+ },
+ .output = { 4 },
+ },
+ {
+ .description = "one sentence, null-terminated",
+ .input = {
+ .src = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x2E, 0x20, 0x2A, 0x0 },
+ .srclen = SIZE_MAX,
+ },
+ .output = { 4 },
+ },
+};
+
+static const struct unit_test_next_break_utf8 next_sentence_break_utf8[] = {
+ {
+ .description = "NULL input",
+ .input = {
+ .src = NULL,
+ .srclen = 0,
+ },
+ .output = { 0 },
+ },
+ {
+ .description = "empty input",
+ .input = { "", 0 },
+ .output = { 0 },
+ },
+ {
+ .description = "empty input, NUL-terminated",
+ .input = { "", SIZE_MAX },
+ .output = { 0 },
+ },
+ {
+ .description = "one sentence",
+ .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA is the flag of Germany. It", 36 },
+ .output = { 34 },
+ },
+ {
+ .description = "one sentence, fragment",
+ .input = { "\xF0\x9F\x87\xA9\xF0", 5 },
+ .output = { 4 },
+ },
+ {
+ .description = "one sentence, NUL-terminated",
+ .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA is the flag of Germany. It", SIZE_MAX },
+ .output = { 34 },
+ },
+ {
+ .description = "one sentence, fragment, NUL-terminated",
+ .input = { "\xF0\x9F\x87\xA9\xF0\x9F", SIZE_MAX },
+ .output = { 6 },
+ },
+};
+
+static int
+unit_test_callback_next_sentence_break(const void *t, size_t off,
+ const char *name,
+ const char *argv0)
+{
+ return unit_test_callback_next_break(t, off,
+ grapheme_next_sentence_break,
+ name, argv0);
+}
+
+static int
+unit_test_callback_next_sentence_break_utf8(const void *t, size_t off,
+ const char *name,
+ const char *argv0)
+{
+ return unit_test_callback_next_break_utf8(t, off,
+ grapheme_next_sentence_break_utf8,
+ name, argv0);
+}
+
int
main(int argc, char *argv[])
{
@@ -13,5 +117,11 @@ main(int argc, char *argv[])
return run_break_tests(grapheme_next_sentence_break,
sentence_break_test,
- LEN(sentence_break_test), argv[0]);
+ LEN(sentence_break_test), argv[0]) +
+ run_unit_tests(unit_test_callback_next_sentence_break,
+ next_sentence_break, LEN(next_sentence_break),
+ "grapheme_next_sentence_break", argv[0]) +
+ run_unit_tests(unit_test_callback_next_sentence_break_utf8,
+ next_sentence_break_utf8, LEN(next_sentence_break_utf8),
+ "grapheme_next_sentence_break_utf8", argv[0]);
}
diff --git a/test/utf8-decode.c b/test/utf8-decode.c
@@ -310,7 +310,7 @@ main(int argc, char *argv[])
failed++;
}
}
- printf("%s: %zu/%zu tests passed.\n", argv[0],
+ printf("%s: %zu/%zu unit tests passed.\n", argv[0],
LEN(dec_test) - failed, LEN(dec_test));
return (failed > 0) ? 1 : 0;
diff --git a/test/utf8-encode.c b/test/utf8-encode.c
@@ -86,7 +86,7 @@ main(int argc, char *argv[])
failed++;
}
}
- printf("%s: %zu/%zu tests passed.\n", argv[0],
+ printf("%s: %zu/%zu unit tests passed.\n", argv[0],
LEN(enc_test) - failed, LEN(enc_test));
return (failed > 0) ? 1 : 0;
diff --git a/test/util.c b/test/util.c
@@ -23,7 +23,7 @@ run_break_tests(size_t (*next_break)(const uint_least32_t *, size_t),
/* check if our resulting offset matches */
if (j == test[i].lenlen ||
res != test[i].len[j++]) {
- fprintf(stderr, "%s: Failed test %zu \"%s\".\n",
+ fprintf(stderr, "%s: Failed conformance test %zu \"%s\".\n",
argv0, i, test[i].descr);
fprintf(stderr, "J=%zu: EXPECTED len %zu, got %zu\n", j-1, test[i].len[j-1], res);
failed++;
@@ -31,8 +31,68 @@ run_break_tests(size_t (*next_break)(const uint_least32_t *, size_t),
}
}
}
- printf("%s: %zu/%zu tests passed.\n", argv0,
+ printf("%s: %zu/%zu conformance tests passed.\n", argv0,
testlen - failed, testlen);
return (failed > 0) ? 1 : 0;
}
+
+int
+run_unit_tests(int (*unit_test_callback)(const void *, size_t, const char *,
+ const char *), const void *test, size_t testlen, const char *name,
+ const char *argv0)
+{
+ size_t i, failed;
+
+ for (i = 0, failed = 0; i < testlen; i++) {
+ failed += (unit_test_callback(test, i, name, argv0) == 0) ? 0 : 1;
+ }
+
+ printf("%s: %s: %zu/%zu unit tests passed.\n", argv0, name,
+ testlen - failed, testlen);
+
+ return (failed > 0) ? 1 : 0;
+}
+
+int
+unit_test_callback_next_break(const struct unit_test_next_break *t, size_t off,
+ size_t (*next_break)(const uint_least32_t *, size_t),
+ const char *name, const char *argv0)
+{
+ const struct unit_test_next_break *test = t + off;
+
+ size_t ret = next_break(test->input.src, test->input.srclen);
+
+ if (ret != test->output.ret) {
+ goto err;
+ }
+
+ return 0;
+err:
+ fprintf(stderr, "%s: %s: Failed unit test %zu \"%s\" "
+ "(returned %zu instead of %zu).\n", argv0,
+ name, off, test->description, ret, test->output.ret);
+ return 1;
+}
+
+int
+unit_test_callback_next_break_utf8(const struct unit_test_next_break_utf8 *t,
+ size_t off,
+ size_t (*next_break_utf8)(const char *, size_t),
+ const char *name, const char *argv0)
+{
+ const struct unit_test_next_break_utf8 *test = t + off;
+
+ size_t ret = next_break_utf8(test->input.src, test->input.srclen);
+
+ if (ret != test->output.ret) {
+ goto err;
+ }
+
+ return 0;
+err:
+ fprintf(stderr, "%s: %s: Failed unit test %zu \"%s\" "
+ "(returned %zu instead of %zu).\n", argv0,
+ name, off, test->description, ret, test->output.ret);
+ return 1;
+}
diff --git a/test/util.h b/test/util.h
@@ -5,10 +5,45 @@
#include "../gen/types.h"
#include "../grapheme.h"
+#undef MIN
+#define MIN(x,y) ((x) < (y) ? (x) : (y))
+#undef LEN
#define LEN(x) (sizeof(x) / sizeof(*(x)))
+struct unit_test_next_break {
+ const char *description;
+ struct {
+ const uint_least32_t *src;
+ size_t srclen;
+ } input;
+ struct {
+ size_t ret;
+ } output;
+};
+
+struct unit_test_next_break_utf8 {
+ const char *description;
+ struct {
+ const char *src;
+ size_t srclen;
+ } input;
+ struct {
+ size_t ret;
+ } output;
+};
+
int run_break_tests(size_t (*next_break)(const uint_least32_t *, size_t),
const struct break_test *test, size_t testlen,
const char *);
+int run_unit_tests(int (*unit_test_callback)(const void *, size_t, const char *,
+ const char *), const void *, size_t, const char *, const char *);
+
+int unit_test_callback_next_break(const struct unit_test_next_break *, size_t,
+ size_t (*next_break)(const uint_least32_t *, size_t),
+ const char *, const char *);
+int unit_test_callback_next_break_utf8(const struct unit_test_next_break_utf8 *,
+ size_t,
+ size_t (*next_break_utf8)(const char *, size_t),
+ const char *, const char *);
#endif /* UTIL_H */
diff --git a/test/word.c b/test/word.c
@@ -6,11 +6,121 @@
#include "../grapheme.h"
#include "util.h"
+static const struct unit_test_next_break next_word_break[] = {
+ {
+ .description = "NULL input",
+ .input = {
+ .src = NULL,
+ .srclen = 0,
+ },
+ .output = { 0 },
+ },
+ {
+ .description = "empty input",
+ .input = {
+ .src = (uint_least32_t *)(uint_least32_t[]){ 0x0 },
+ .srclen = 0,
+ },
+ .output = { 0 },
+ },
+ {
+ .description = "empty input, null-terminated",
+ .input = {
+ .src = (uint_least32_t *)(uint_least32_t[]){ 0x0 },
+ .srclen = SIZE_MAX,
+ },
+ .output = { 0 },
+ },
+ {
+ .description = "one word",
+ .input = {
+ .src = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x20, 0x2A },
+ .srclen = 4,
+ },
+ .output = { 2 },
+ },
+ {
+ .description = "one word, null-terminated",
+ .input = {
+ .src = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x20, 0x2A, 0x0 },
+ .srclen = SIZE_MAX,
+ },
+ .output = { 2 },
+ },
+};
+
+static const struct unit_test_next_break_utf8 next_word_break_utf8[] = {
+ {
+ .description = "NULL input",
+ .input = {
+ .src = NULL,
+ .srclen = 0,
+ },
+ .output = { 0 },
+ },
+ {
+ .description = "empty input",
+ .input = { "", 0 },
+ .output = { 0 },
+ },
+ {
+ .description = "empty input, NUL-terminated",
+ .input = { "", SIZE_MAX },
+ .output = { 0 },
+ },
+ {
+ .description = "one word",
+ .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA is", 11 },
+ .output = { 8 },
+ },
+ {
+ .description = "one word, fragment",
+ .input = { "\xF0\x9F\x87\xA9\xF0", 5 },
+ .output = { 4 },
+ },
+ {
+ .description = "one word, NUL-terminated",
+ .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA is", SIZE_MAX },
+ .output = { 8 },
+ },
+ {
+ .description = "one word, fragment, NUL-terminated",
+ .input = { "\xF0\x9F\x87\xA9\xF0\x9F", SIZE_MAX },
+ .output = { 4 },
+ },
+};
+
+static int
+unit_test_callback_next_word_break(const void *t, size_t off,
+ const char *name,
+ const char *argv0)
+{
+ return unit_test_callback_next_break(t, off,
+ grapheme_next_word_break,
+ name, argv0);
+}
+
+static int
+unit_test_callback_next_word_break_utf8(const void *t, size_t off,
+ const char *name,
+ const char *argv0)
+{
+ return unit_test_callback_next_break_utf8(t, off,
+ grapheme_next_word_break_utf8,
+ name, argv0);
+}
+
int
main(int argc, char *argv[])
{
(void)argc;
return run_break_tests(grapheme_next_word_break, word_break_test,
- LEN(word_break_test), argv[0]);
+ LEN(word_break_test), argv[0]) +
+ run_unit_tests(unit_test_callback_next_word_break,
+ next_word_break, LEN(next_word_break),
+ "grapheme_next_word_break", argv[0]) +
+ run_unit_tests(unit_test_callback_next_word_break_utf8,
+ next_word_break_utf8, LEN(next_word_break_utf8),
+ "grapheme_next_word_break_utf8", argv[0]);
}