commit a5b1b0c0c7bc1576b5893175b27585fa963f4433
parent 52b0e29e02068d6a8123042ef901f73e37b2f38f
Author: Laslo Hunhold <dev@frign.de>
Date: Sun, 2 Oct 2022 22:05:11 +0200
Refactor sentence-functions with Proper (using Herodotus in the background)
This refactor was a breeze and it passed all conformance tests on the
first try. This, just like with the word-functions, leads to a massive
simplification and separation of concerns in the code. And as with the
word-functions, this fixes some known quirks.
Signed-off-by: Laslo Hunhold <dev@frign.de>
Diffstat:
M | src/sentence.c | | | 426 | ++++++++++++++++++++++++++++++++++--------------------------------------------- |
1 file changed, 181 insertions(+), 245 deletions(-)
diff --git a/src/sentence.c b/src/sentence.c
@@ -6,11 +6,17 @@
#include "../grapheme.h"
#include "util.h"
-static inline enum sentence_break_property
-get_break_prop(uint_least32_t cp)
+struct sentence_break_state /* rule state of one next_sentence_break() run; its address is passed to proper_init() */
+{
+ uint_least8_t aterm_close_sp_level; /* progress through "ATerm Close* Sp*": 0 (not in the sequence) up to 3 */
+ uint_least8_t saterm_close_sp_parasep_level; /* progress through "SATerm Close* Sp* ParaSep?": 0 (not in the sequence) up to 4 */
+};
+
+static inline uint_least8_t
+get_sentence_break_prop(uint_least32_t cp)
{
if (likely(cp <= 0x10FFFF)) {
- return (enum sentence_break_property)
+ return (uint_least8_t)
sentence_break_minor[sentence_break_major[cp >> 8] +
(cp & 0xff)];
} else {
@@ -18,243 +24,157 @@ get_break_prop(uint_least32_t cp)
}
}
-static size_t
-next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
- (const void *, size_t, size_t, uint_least32_t *))
+static bool
+is_skippable_sentence_prop(uint_least8_t prop) /* skip predicate handed to proper_init() by next_sentence_break() */
{
- struct {
- enum sentence_break_property a, b, c, d;
- } raw, skip;
- enum sentence_break_property res;
- uint_least32_t cp;
- uint_least8_t aterm_close_sp_level = 0,
- saterm_close_sp_parasep_level = 0;
- size_t off, tmp, new_off;
+ return prop == SENTENCE_BREAK_PROP_EXTEND || /* true exactly for the Extend ... */
+ prop == SENTENCE_BREAK_PROP_FORMAT; /* ... and Format properties, false for everything else */
+}
- /* check degenerate cases */
- if (str == NULL || len == 0) {
- return 0;
- }
+/*
+ * Fold the sentence-break property prop into the two level counters
+ * of the struct sentence_break_state that s points to. This function
+ * is handed to proper_init() together with that state; presumably it
+ * is invoked once for every non-skippable property that moves to the
+ * left of the middle spot (confirm against Proper in util).
+ *
+ * prop: sentence-break property as returned by get_sentence_break_prop()
+ * s:    opaque pointer, expected to be a struct sentence_break_state *
+ */
+static void
+sentence_skip_shift_callback(uint_least8_t prop, void *s)
+{
+ struct sentence_break_state *state = (struct sentence_break_state *)s;
/*
- * Apply sentence breaking algorithm (UAX #29), see
- * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
+ * Here comes a bit of magic. The rules
+ * SB8, SB8a, SB9, SB10 and SB11 have very complicated
+ * left-hand-side-rules of the form
*
- * There are 4 slots (a, b, c, d) of "break" properties and
- * we check if there is a break in the middle between b and c.
+ * ATerm Close* Sp*
+ * SATerm Close*
+ * SATerm Close* Sp*
+ * SATerm Close* Sp* ParaSep?
*
- * The position of this middle spot is determined by off,
- * which gives the offset of the first element on the right
- * hand side of said spot, or, in other words, gives the number
- * of elements on the left hand side.
+ * but instead of backtracking, we keep the
+ * state as some kind of "power level" in
+ * two state-variables
*
- * It is further complicated by the fact that the algorithm
- * expects you to skip certain characters for the second
- * half of the rules (after SB5). Thus, we do not only have
- * the "raw" properties as described above, but also the "skip"
- * properties, where the skip.a and skip.b, for instance,
- * give the two preceding character properties behind the
- * currently investigated breakpoint.
+ * aterm_close_sp_level
+ * saterm_close_sp_parasep_level
+ *
+ * that go from 0 to 3/4:
+ *
+ * 0: we are not in the sequence
+ * 1: we have one ATerm/SATerm to the left of
+ * the middle spot
+ * 2: we have one ATerm/SATerm and one or more
+ * Close to the left of the middle spot
+ * 3: we have one ATerm/SATerm, zero or more
+ * Close and one or more Sp to the left of
+ * the middle spot.
+ * 4: we have one SATerm, zero or more Close,
+ * zero or more Sp and one ParaSep to the
+ * left of the middle spot.
*
*/
-
- /*
- * Initialize the different properties such that we have
- * a good state after the state-update in the loop
- */
- raw.b = NUM_SENTENCE_BREAK_PROPS;
- if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
- /*
- * A line is at least one codepoint long, so we can
- * safely return here
- */
- return len;
+ if (prop == SENTENCE_BREAK_PROP_ATERM) {
+ /*
+ * sequence has begun; an ATerm restarts it from any
+ * level (e.g. in "ATerm Sp ATerm"), because the ATerm
+ * alone already matches "ATerm Close* Sp*"
+ */
+ state->aterm_close_sp_level = 1;
+ } else if ((state->aterm_close_sp_level == 1 ||
+ state->aterm_close_sp_level == 2) &&
+ prop == SENTENCE_BREAK_PROP_CLOSE) {
+ /* close-sequence begins or continued */
+ state->aterm_close_sp_level = 2;
+ } else if ((state->aterm_close_sp_level == 1 ||
+ state->aterm_close_sp_level == 2 ||
+ state->aterm_close_sp_level == 3) &&
+ prop == SENTENCE_BREAK_PROP_SP) {
+ /* sp-sequence begins or continued */
+ state->aterm_close_sp_level = 3;
+ } else {
+ /* sequence broke */
+ state->aterm_close_sp_level = 0;
}
- raw.c = get_break_prop(cp);
- (void)get_codepoint(str, len, off, &cp);
- raw.d = get_break_prop(cp);
- skip.a = skip.b = NUM_SENTENCE_BREAK_PROPS;
-
- for (; off < len; off = new_off) {
- /*
- * Update left side (a and b) of the skip state by
- * "shifting in" the raw.c property as long as it is
- * not one of the "ignored" character properties.
- * While at it, update the RI-counter.
- *
- */
- if (raw.c != SENTENCE_BREAK_PROP_EXTEND &&
- raw.c != SENTENCE_BREAK_PROP_FORMAT) {
- skip.a = skip.b;
- skip.b = raw.c;
-
- /*
- * Here comes a bit of magic. The rules
- * SB8, SB8a, SB9 and SB10 have very complicated
- * left-hand-side-rules of the form
- *
- * ATerm Close* Sp*
- * SATerm Close*
- * SATerm Close* Sp*
- * SATerm Close* Sp* ParaSep?
- *
- * but instead of backtracking, we keep the
- * state as some kind of "power level" in
- * two variables
- *
- * aterm_close_sp_level
- * saterm_close_sp_parasep_level
- *
- * that go from 0 to 3/4:
- *
- * 0: we are not in the sequence
- * 1: we have one ATerm/SATerm to the left of
- * the middle spot
- * 2: we have one ATerm/SATerm and one or more
- * Close to the left of the middle spot
- * 3: we have one ATerm/SATerm, zero or more
- * Close and one or more Sp to the left of
- * the middle spot.
- * 4: we have one SATerm, zero or more Close,
- * zero or more Sp and one ParaSep to the
- * left of the middle spot.
- *
- */
- if ((aterm_close_sp_level == 0 ||
- aterm_close_sp_level == 1) &&
- skip.b == SENTENCE_BREAK_PROP_ATERM) {
- /* sequence has begun */
- aterm_close_sp_level = 1;
- } else if ((aterm_close_sp_level == 1 ||
- aterm_close_sp_level == 2) &&
- skip.b == SENTENCE_BREAK_PROP_CLOSE) {
- /* close-sequence begins or continued */
- aterm_close_sp_level = 2;
- } else if ((aterm_close_sp_level == 1 ||
- aterm_close_sp_level == 2 ||
- aterm_close_sp_level == 3) &&
- skip.b == SENTENCE_BREAK_PROP_SP) {
- /* sp-sequence begins or continued */
- aterm_close_sp_level = 3;
- } else {
- /* sequence broke */
- aterm_close_sp_level = 0;
- }
- if ((saterm_close_sp_parasep_level == 0 ||
- saterm_close_sp_parasep_level == 1) &&
- (skip.b == SENTENCE_BREAK_PROP_STERM ||
- skip.b == SENTENCE_BREAK_PROP_ATERM)) {
- /* sequence has begun */
- saterm_close_sp_parasep_level = 1;
- } else if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2) &&
- skip.b == SENTENCE_BREAK_PROP_CLOSE) {
- /* close-sequence begins or continued */
- saterm_close_sp_parasep_level = 2;
- } else if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2 ||
- saterm_close_sp_parasep_level == 3) &&
- skip.b == SENTENCE_BREAK_PROP_SP) {
- /* sp-sequence begins or continued */
- saterm_close_sp_parasep_level = 3;
- } else if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2 ||
- saterm_close_sp_parasep_level == 3) &&
- (skip.b == SENTENCE_BREAK_PROP_SEP ||
- skip.b == SENTENCE_BREAK_PROP_CR ||
- skip.b == SENTENCE_BREAK_PROP_LF)) {
- /* ParaSep at the end of the sequence */
- saterm_close_sp_parasep_level = 4;
- } else {
- /* sequence broke */
- saterm_close_sp_parasep_level = 0;
- }
- }
-
- /*
- * Update right side (b and c) of the skip state by
- * starting at the breakpoint and detecting the two
- * following non-ignored character classes
- *
- */
- skip.c = NUM_SENTENCE_BREAK_PROPS;
- for (tmp = off; tmp < len; ) {
- tmp += get_codepoint(str, len, tmp, &cp);
- res = get_break_prop(cp);
-
- if (res != SENTENCE_BREAK_PROP_EXTEND &&
- res != SENTENCE_BREAK_PROP_FORMAT) {
- skip.c = res;
- break;
- }
- }
- skip.d = NUM_SENTENCE_BREAK_PROPS;
- for (; tmp < len; ) {
- tmp += get_codepoint(str, len, tmp, &cp);
- res = get_break_prop(cp);
+ if (prop == SENTENCE_BREAK_PROP_STERM ||
+ prop == SENTENCE_BREAK_PROP_ATERM) {
+ /*
+ * sequence has begun; an STerm/ATerm restarts it from
+ * any level (e.g. in "STerm Close STerm"), because the
+ * terminator alone already matches
+ * "SATerm Close* Sp* ParaSep?"
+ */
+ state->saterm_close_sp_parasep_level = 1;
+ } else if ((state->saterm_close_sp_parasep_level == 1 ||
+ state->saterm_close_sp_parasep_level == 2) &&
+ prop == SENTENCE_BREAK_PROP_CLOSE) {
+ /* close-sequence begins or continued */
+ state->saterm_close_sp_parasep_level = 2;
+ } else if ((state->saterm_close_sp_parasep_level == 1 ||
+ state->saterm_close_sp_parasep_level == 2 ||
+ state->saterm_close_sp_parasep_level == 3) &&
+ prop == SENTENCE_BREAK_PROP_SP) {
+ /* sp-sequence begins or continued */
+ state->saterm_close_sp_parasep_level = 3;
+ } else if ((state->saterm_close_sp_parasep_level == 1 ||
+ state->saterm_close_sp_parasep_level == 2 ||
+ state->saterm_close_sp_parasep_level == 3) &&
+ (prop == SENTENCE_BREAK_PROP_SEP ||
+ prop == SENTENCE_BREAK_PROP_CR ||
+ prop == SENTENCE_BREAK_PROP_LF)) {
+ /* ParaSep at the end of the sequence */
+ state->saterm_close_sp_parasep_level = 4;
+ } else {
+ /* sequence broke */
+ state->saterm_close_sp_parasep_level = 0;
+ }
+}
- if (res != SENTENCE_BREAK_PROP_EXTEND &&
- res != SENTENCE_BREAK_PROP_FORMAT) {
- skip.d = res;
- break;
- }
- }
+static size_t
+next_sentence_break(HERODOTUS_READER *r)
+{
+ HERODOTUS_READER tmp;
+ enum sentence_break_property prop;
+ struct proper p;
+ struct sentence_break_state state = { 0 };
+ uint_least32_t cp;
- /*
- * Update the raw state by simply shifting everything
- * in and, if we still have data left, determining
- * the character class of the next codepoint.
- *
- */
- raw.a = raw.b;
- raw.b = raw.c;
- raw.c = raw.d;
- if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) {
- get_codepoint(str, len, new_off, &cp);
- raw.d = get_break_prop(cp);
- } else {
- raw.d = NUM_SENTENCE_BREAK_PROPS;
- }
+ /*
+ * Apply sentence breaking algorithm (UAX #29), see
+ * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
+ */
+ proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
+ get_sentence_break_prop, is_skippable_sentence_prop,
+ sentence_skip_shift_callback, &p);
+ while (!proper_advance(&p)) {
/* SB3 */
- if (raw.b == SENTENCE_BREAK_PROP_CR &&
- raw.c == SENTENCE_BREAK_PROP_LF) {
+ if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
+ p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
continue;
}
/* SB4 */
- if (raw.b == SENTENCE_BREAK_PROP_SEP ||
- raw.b == SENTENCE_BREAK_PROP_CR ||
- raw.b == SENTENCE_BREAK_PROP_LF) {
+ if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
+ p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR ||
+ p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
break;
}
/* SB5 */
- if (raw.c == SENTENCE_BREAK_PROP_EXTEND ||
- raw.c == SENTENCE_BREAK_PROP_FORMAT) {
+ if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
+ p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
continue;
}
/* SB6 */
- if (skip.b == SENTENCE_BREAK_PROP_ATERM &&
- skip.c == SENTENCE_BREAK_PROP_NUMERIC) {
+ if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
continue;
}
/* SB7 */
- if (off > 1 &&
- (skip.a == SENTENCE_BREAK_PROP_UPPER ||
- skip.a == SENTENCE_BREAK_PROP_LOWER) &&
- skip.b == SENTENCE_BREAK_PROP_ATERM &&
- skip.c == SENTENCE_BREAK_PROP_UPPER) {
+ if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
+ p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
+ p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
continue;
}
/* SB8 */
- if (aterm_close_sp_level == 1 ||
- aterm_close_sp_level == 2 ||
- aterm_close_sp_level == 3) {
+ if (state.aterm_close_sp_level == 1 ||
+ state.aterm_close_sp_level == 2 ||
+ state.aterm_close_sp_level == 3) {
/*
* This is the most complicated rule, requiring
* the right-hand-side to satisfy the regular expression
@@ -262,67 +182,75 @@ next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
* ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
*
* which we simply check "manually" given LUT-lookups
- * are very cheap.
+ * are very cheap; the scan starts at the mid_reader.
*
*/
- for (tmp = off, res = NUM_SENTENCE_BREAK_PROPS; tmp < len; ) {
- tmp += get_codepoint(str, len, tmp, &cp);
- res = get_break_prop(cp);
+ herodotus_reader_copy(&(p.mid_reader), &tmp);
+
+ prop = NUM_SENTENCE_BREAK_PROPS;
+ while (herodotus_read_codepoint(&tmp, true, &cp) ==
+ HERODOTUS_STATUS_SUCCESS) {
+ prop = get_sentence_break_prop(cp);
- if (res == SENTENCE_BREAK_PROP_OLETTER ||
- res == SENTENCE_BREAK_PROP_UPPER ||
- res == SENTENCE_BREAK_PROP_LOWER ||
- res == SENTENCE_BREAK_PROP_SEP ||
- res == SENTENCE_BREAK_PROP_CR ||
- res == SENTENCE_BREAK_PROP_LF ||
- res == SENTENCE_BREAK_PROP_STERM ||
- res == SENTENCE_BREAK_PROP_ATERM) {
+ /*
+ * the skippable properties are ignored
+ * automatically here given they do not
+ * match the following condition
+ */
+ if (prop == SENTENCE_BREAK_PROP_OLETTER ||
+ prop == SENTENCE_BREAK_PROP_UPPER ||
+ prop == SENTENCE_BREAK_PROP_LOWER ||
+ prop == SENTENCE_BREAK_PROP_SEP ||
+ prop == SENTENCE_BREAK_PROP_CR ||
+ prop == SENTENCE_BREAK_PROP_LF ||
+ prop == SENTENCE_BREAK_PROP_STERM ||
+ prop == SENTENCE_BREAK_PROP_ATERM) {
break;
}
}
- if (res == SENTENCE_BREAK_PROP_LOWER) {
+ if (prop == SENTENCE_BREAK_PROP_LOWER) {
continue;
}
}
/* SB8a */
- if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2 ||
- saterm_close_sp_parasep_level == 3) &&
- (skip.c == SENTENCE_BREAK_PROP_SCONTINUE ||
- skip.c == SENTENCE_BREAK_PROP_STERM ||
- skip.c == SENTENCE_BREAK_PROP_ATERM)) {
+ if ((state.saterm_close_sp_parasep_level == 1 ||
+ state.saterm_close_sp_parasep_level == 2 ||
+ state.saterm_close_sp_parasep_level == 3) &&
+ (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
continue;
}
/* SB9 */
- if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2) &&
- (skip.c == SENTENCE_BREAK_PROP_CLOSE ||
- skip.c == SENTENCE_BREAK_PROP_SP ||
- skip.c == SENTENCE_BREAK_PROP_SEP ||
- skip.c == SENTENCE_BREAK_PROP_CR ||
- skip.c == SENTENCE_BREAK_PROP_LF)) {
+ if ((state.saterm_close_sp_parasep_level == 1 ||
+ state.saterm_close_sp_parasep_level == 2) &&
+ (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
continue;
}
/* SB10 */
- if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2 ||
- saterm_close_sp_parasep_level == 3) &&
- (skip.c == SENTENCE_BREAK_PROP_SP ||
- skip.c == SENTENCE_BREAK_PROP_SEP ||
- skip.c == SENTENCE_BREAK_PROP_CR ||
- skip.c == SENTENCE_BREAK_PROP_LF)) {
+ if ((state.saterm_close_sp_parasep_level == 1 ||
+ state.saterm_close_sp_parasep_level == 2 ||
+ state.saterm_close_sp_parasep_level == 3) &&
+ (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
continue;
}
/* SB11 */
- if (saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2 ||
- saterm_close_sp_parasep_level == 3 ||
- saterm_close_sp_parasep_level == 4) {
+ if (state.saterm_close_sp_parasep_level == 1 ||
+ state.saterm_close_sp_parasep_level == 2 ||
+ state.saterm_close_sp_parasep_level == 3 ||
+ state.saterm_close_sp_parasep_level == 4) {
break;
}
@@ -330,17 +258,25 @@ next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
continue;
}
- return off;
+ return herodotus_reader_number_read(&(p.mid_reader));
}
size_t
grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
{
- return next_sentence_break(str, len, get_codepoint);
+ HERODOTUS_READER r; /* reader set up below over the codepoint array str of length len */
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
+
+ return next_sentence_break(&r); /* the amount the reader reports as read up to the break spot */
}
size_t
grapheme_next_sentence_break_utf8(const char *str, size_t len)
{
- return next_sentence_break(str, len, get_codepoint_utf8);
+ HERODOTUS_READER r; /* reader set up below over the UTF-8 buffer str of length len */
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
+
+ return next_sentence_break(&r); /* the amount the reader reports as read up to the break spot */
}