commit 52b0e29e02068d6a8123042ef901f73e37b2f38f
parent b899fd685c50cbc61999296ce1e0a03a45e74f52
Author: Laslo Hunhold <dev@frign.de>
Date: Sun, 2 Oct 2022 21:17:03 +0200
Refactor word-functions with Proper (using Herodotus in the background)
As promised, this leads to a heavy simplification and separation of
concerns in the code. Additionally, this fixes some known quirks in
regard to handling NUL-terminated strings.
Signed-off-by: Laslo Hunhold <dev@frign.de>
Diffstat:
M | src/word.c | | | 367 | +++++++++++++++++++++++++++++++------------------------------------------------ |
1 file changed, 142 insertions(+), 225 deletions(-)
diff --git a/src/word.c b/src/word.c
@@ -6,328 +6,237 @@
#include "../grapheme.h"
#include "util.h"
-static inline enum word_break_property
-get_break_prop(uint_least32_t cp)
+struct word_break_state
+{
+ bool ri_even;
+};
+
+static inline uint_least8_t
+get_word_break_prop(uint_least32_t cp)
{
if (likely(cp <= 0x10FFFF)) {
- return (enum word_break_property)
+ return (uint_least8_t)
word_break_minor[word_break_major[cp >> 8] + (cp & 0xff)];
} else {
return WORD_BREAK_PROP_OTHER;
}
}
-static size_t
-next_word_break(const void *str, size_t len, size_t (*get_codepoint)
- (const void *, size_t, size_t, uint_least32_t *))
+static bool
+is_skippable_word_prop(uint_least8_t prop)
{
- struct {
- enum word_break_property a, b, c, d;
- } raw, skip;
- enum word_break_property res;
- uint_least32_t cp;
- size_t off, tmp, new_off;
- bool ri_even = true;
-
- /* check degenerate cases */
- if (str == NULL || len == 0) {
- return 0;
- }
-
- /*
- * Apply word breaking algorithm (UAX #29), see
- * https://unicode.org/reports/tr29/#Word_Boundary_Rules
- *
- * There are 4 slots (a, b, c, d) of "break" properties and
- * we check if there is a break in the middle between b and c.
- *
- * The position of this middle spot is determined by off,
- * which gives the offset of the first element on the right
- * hand side of said spot, or, in other words, gives the number
- * of elements on the left hand side.
- *
- * It is further complicated by the fact that the algorithm
- * expects you to skip certain characters for the second
- * half of the rules (after WB4). Thus, we do not only have
- * the "raw" properties as described above, but also the "skip"
- * properties, where the skip.a and skip.b, for instance,
- * give the two preceding character properties behind the
- * currently investigated breakpoint.
- *
- */
+ return prop == WORD_BREAK_PROP_EXTEND ||
+ prop == WORD_BREAK_PROP_FORMAT ||
+ prop == WORD_BREAK_PROP_ZWJ;
+}
- /*
- * Initialize the different properties such that we have
- * a good state after the state-update in the loop
- */
- raw.b = NUM_WORD_BREAK_PROPS;
- if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
- /*
- * A line is at least one codepoint long, so we can
- * safely return here
- */
- return len;
- }
- raw.c = get_break_prop(cp);
- (void)get_codepoint(str, len, off, &cp);
- raw.d = get_break_prop(cp);
- skip.a = skip.b = NUM_WORD_BREAK_PROPS;
+static void
+word_skip_shift_callback(uint_least8_t prop, void *s)
+{
+ struct word_break_state *state = (struct word_break_state *)s;
- for (; off < len; off = new_off) {
+ if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
/*
- * Update left side (a and b) of the skip state by
- * "shifting in" the raw.c property as long as it is
- * not one of the "ignored" character properties.
- * While at it, update the RI-counter.
+ * The property we just shifted in is
+ * a regional indicator, increasing the
+ * number of consecutive RIs on the left
+ * side of the breakpoint by one, changing
+ * the oddness.
*
*/
- if (raw.c != WORD_BREAK_PROP_EXTEND &&
- raw.c != WORD_BREAK_PROP_FORMAT &&
- raw.c != WORD_BREAK_PROP_ZWJ) {
- skip.a = skip.b;
- skip.b = raw.c;
-
- if (skip.b == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
- /*
- * The property we just shifted in is
- * a regional indicator, increasing the
- * number of consecutive RIs on the left
- * side of the breakpoint by one, changing
- * the oddness.
- *
- */
- ri_even = !ri_even;
- } else {
- /*
- * We saw no regional indicator, so the
- * number of consecutive RIs on the left
- * side of the breakpoint is zero, which
- * is an even number.
- *
- */
- ri_even = true;
- }
- }
-
+ state->ri_even = !(state->ri_even);
+ } else {
/*
- * Update right side (b and c) of the skip state by
- * starting at the breakpoint and detecting the two
- * following non-ignored character classes
+ * We saw no regional indicator, so the
+ * number of consecutive RIs on the left
+ * side of the breakpoint is zero, which
+ * is an even number.
*
*/
- skip.c = NUM_WORD_BREAK_PROPS;
- for (tmp = off; tmp < len; ) {
- tmp += get_codepoint(str, len, tmp, &cp);
- res = get_break_prop(cp);
-
- if (res != WORD_BREAK_PROP_EXTEND &&
- res != WORD_BREAK_PROP_FORMAT &&
- res != WORD_BREAK_PROP_ZWJ) {
- skip.c = res;
- break;
- }
- }
- skip.d = NUM_WORD_BREAK_PROPS;
- for (; tmp < len; ) {
- tmp += get_codepoint(str, len, tmp, &cp);
- res = get_break_prop(cp);
-
- if (res != WORD_BREAK_PROP_EXTEND &&
- res != WORD_BREAK_PROP_FORMAT &&
- res != WORD_BREAK_PROP_ZWJ) {
- skip.d = res;
- break;
- }
- }
+ state->ri_even = true;
+ }
+}
- /*
- * Update the raw state by simply shifting everything
- * in and, if we still have data left, determining
- * the character class of the next codepoint.
- *
- */
- raw.a = raw.b;
- raw.b = raw.c;
- raw.c = raw.d;
- if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) {
- get_codepoint(str, len, new_off, &cp);
- raw.d = get_break_prop(cp);
- } else {
- raw.d = NUM_WORD_BREAK_PROPS;
- }
+static size_t
+next_word_break(HERODOTUS_READER *r)
+{
+ struct proper p;
+ struct word_break_state state = { .ri_even = true };
+ /*
+ * Apply word breaking algorithm (UAX #29), see
+ * https://unicode.org/reports/tr29/#Word_Boundary_Rules
+ */
+ proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop,
+ is_skippable_word_prop, word_skip_shift_callback, &p);
+
+ while (!proper_advance(&p)) {
/* WB3 */
- if (raw.b == WORD_BREAK_PROP_CR &&
- raw.c == WORD_BREAK_PROP_LF) {
+ if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR &&
+ p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
continue;
}
/* WB3a */
- if (raw.b == WORD_BREAK_PROP_NEWLINE ||
- raw.b == WORD_BREAK_PROP_CR ||
- raw.b == WORD_BREAK_PROP_LF) {
+ if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE ||
+ p.raw.prev_prop[0] == WORD_BREAK_PROP_CR ||
+ p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) {
break;
}
/* WB3b */
- if (raw.c == WORD_BREAK_PROP_NEWLINE ||
- raw.c == WORD_BREAK_PROP_CR ||
- raw.c == WORD_BREAK_PROP_LF) {
+ if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE ||
+ p.raw.next_prop[0] == WORD_BREAK_PROP_CR ||
+ p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
break;
}
/* WB3c */
- if (raw.b == WORD_BREAK_PROP_ZWJ &&
- (raw.c == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
- raw.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
+ if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ &&
+ (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
+ p.raw.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
continue;
}
/* WB3d */
- if (raw.b == WORD_BREAK_PROP_WSEGSPACE &&
- raw.c == WORD_BREAK_PROP_WSEGSPACE) {
+ if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE &&
+ p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) {
continue;
}
/* WB4 */
- if (raw.c == WORD_BREAK_PROP_EXTEND ||
- raw.c == WORD_BREAK_PROP_FORMAT ||
- raw.c == WORD_BREAK_PROP_ZWJ) {
+ if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND ||
+ p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT ||
+ p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) {
continue;
}
/* WB5 */
- if ((skip.b == WORD_BREAK_PROP_ALETTER ||
- skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
- (skip.c == WORD_BREAK_PROP_ALETTER ||
- skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
continue;
}
/* WB6 */
- if ((skip.b == WORD_BREAK_PROP_ALETTER ||
- skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
- (skip.c == WORD_BREAK_PROP_MIDLETTER ||
- skip.c == WORD_BREAK_PROP_MIDNUMLET ||
- skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
- len > 2 &&
- (skip.d == WORD_BREAK_PROP_ALETTER ||
- skip.d == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.d == WORD_BREAK_PROP_HEBREW_LETTER)) {
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+ (p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.next_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
continue;
}
/* WB7 */
- if ((skip.b == WORD_BREAK_PROP_MIDLETTER ||
- skip.b == WORD_BREAK_PROP_MIDNUMLET ||
- skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
- (skip.c == WORD_BREAK_PROP_ALETTER ||
- skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.c == WORD_BREAK_PROP_HEBREW_LETTER) &&
- len > 2 &&
- (skip.a == WORD_BREAK_PROP_ALETTER ||
- skip.a == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.a == WORD_BREAK_PROP_HEBREW_LETTER)) {
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
+ (p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.prev_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
continue;
}
/* WB7a */
- if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
- skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) {
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) {
continue;
}
/* WB7b */
- if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
- skip.c == WORD_BREAK_PROP_DOUBLE_QUOTE &&
- len > 2 &&
- skip.d == WORD_BREAK_PROP_HEBREW_LETTER) {
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
+ p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
continue;
}
/* WB7c */
- if (skip.b == WORD_BREAK_PROP_DOUBLE_QUOTE &&
- skip.c == WORD_BREAK_PROP_HEBREW_LETTER &&
- off > 1 &&
- skip.a == WORD_BREAK_PROP_HEBREW_LETTER) {
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
+ p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
continue;
}
/* WB8 */
- if (skip.b == WORD_BREAK_PROP_NUMERIC &&
- skip.c == WORD_BREAK_PROP_NUMERIC) {
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
continue;
}
/* WB9 */
- if ((skip.b == WORD_BREAK_PROP_ALETTER ||
- skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
- skip.c == WORD_BREAK_PROP_NUMERIC) {
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
continue;
}
/* WB10 */
- if (skip.b == WORD_BREAK_PROP_NUMERIC &&
- (skip.c == WORD_BREAK_PROP_ALETTER ||
- skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
continue;
}
/* WB11 */
- if ((skip.b == WORD_BREAK_PROP_MIDNUM ||
- skip.b == WORD_BREAK_PROP_MIDNUMLET ||
- skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
- skip.c == WORD_BREAK_PROP_NUMERIC &&
- off > 1 &&
- skip.a == WORD_BREAK_PROP_NUMERIC) {
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC &&
+ p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) {
continue;
}
/* WB12 */
- if (skip.b == WORD_BREAK_PROP_NUMERIC &&
- (skip.c == WORD_BREAK_PROP_MIDNUM ||
- skip.c == WORD_BREAK_PROP_MIDNUMLET ||
- skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
- len > 2 &&
- skip.d == WORD_BREAK_PROP_NUMERIC) {
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+ p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) {
continue;
}
/* WB13 */
- if (skip.b == WORD_BREAK_PROP_KATAKANA &&
- skip.c == WORD_BREAK_PROP_KATAKANA) {
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) {
continue;
}
/* WB13a */
- if ((skip.b == WORD_BREAK_PROP_ALETTER ||
- skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.b == WORD_BREAK_PROP_HEBREW_LETTER ||
- skip.b == WORD_BREAK_PROP_NUMERIC ||
- skip.b == WORD_BREAK_PROP_KATAKANA ||
- skip.b == WORD_BREAK_PROP_EXTENDNUMLET) &&
- skip.c == WORD_BREAK_PROP_EXTENDNUMLET) {
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA ||
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) {
continue;
}
/* WB13b */
- if (skip.b == WORD_BREAK_PROP_EXTENDNUMLET &&
- (skip.c == WORD_BREAK_PROP_ALETTER ||
- skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
- skip.c == WORD_BREAK_PROP_HEBREW_LETTER ||
- skip.c == WORD_BREAK_PROP_NUMERIC ||
- skip.c == WORD_BREAK_PROP_KATAKANA)) {
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET &&
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC ||
+ p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) {
continue;
}
/* WB15 and WB16 */
- if (!ri_even &&
- skip.c == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
+ if (!state.ri_even &&
+ p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
continue;
}
@@ -335,17 +244,25 @@ next_word_break(const void *str, size_t len, size_t (*get_codepoint)
break;
}
- return off;
+ return herodotus_reader_number_read(&(p.mid_reader));
}
size_t
grapheme_next_word_break(const uint_least32_t *str, size_t len)
{
- return next_word_break(str, len, get_codepoint);
+ HERODOTUS_READER r;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
+
+ return next_word_break(&r);
}
size_t
grapheme_next_word_break_utf8(const char *str, size_t len)
{
- return next_word_break(str, len, get_codepoint_utf8);
+ HERODOTUS_READER r;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
+
+ return next_word_break(&r);
}