commit b899fd685c50cbc61999296ce1e0a03a45e74f52
parent a4d42053f13e8471ee3903522f964fc0a1d3161a
Author: Laslo Hunhold <dev@frign.de>
Date: Sun, 2 Oct 2022 21:09:08 +0200
Add "proper"-property-reader
The word- and sentence-segmentation algorithms make use of a complicated
logic to accommodate "raw" and "skip" properties. The code is barely
readable and doesn't separate abstractions away nicely. Moreover, there
is a high probability that certain edge-cases are not handled properly.
To fix this, this commit adds a "proper"-property-reader, which
basically handles all the dirty details in the background using
well-commented and transparent code that builds on top of the
herodotus-reader instead of doing this by hand. This ensures that we
will (provably) never have buffer overflows unless there is a mistake
in the implementation itself, which can be verified relatively easily
given each function has a limited scope.
Signed-off-by: Laslo Hunhold <dev@frign.de>
Diffstat:
M | src/case.c | | | 25 | ++++++++++++------------- |
M | src/util.c | | | 159 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- |
M | src/util.h | | | 23 | +++++++++++++++++++++++ |
3 files changed, 190 insertions(+), 17 deletions(-)
diff --git a/src/case.c b/src/case.c
@@ -147,18 +147,14 @@ to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
static size_t
herodotus_next_word_break(const HERODOTUS_READER *r)
{
- if (r->src == NULL || r->off > r->srclen) {
- return 0;
- }
+ HERODOTUS_READER tmp;
+
+ herodotus_reader_copy(r, &tmp);
if (r->type == HERODOTUS_TYPE_CODEPOINT) {
- return grapheme_next_word_break(
- ((const uint_least32_t *)(r->src)) + r->off,
- r->srclen - r->off);
+ return grapheme_next_word_break(tmp.src, tmp.srclen);
} else { /* r->type == HERODOTUS_TYPE_UTF8 */
- return grapheme_next_word_break_utf8(
- ((const char *)(r->src)) + r->off,
- r->srclen - r->off);
+ return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
}
}
@@ -168,9 +164,10 @@ to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
enum case_property prop;
enum herodotus_status s;
uint_least32_t cp;
+ size_t nwb;
- for (;;) {
- herodotus_reader_push_advance_limit(r, herodotus_next_word_break(r));
+ for (; (nwb = herodotus_next_word_break(r)) > 0;) {
+ herodotus_reader_push_advance_limit(r, nwb);
for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
/* check if we have a cased character */
prop = get_case_property(cp);
@@ -354,9 +351,10 @@ is_titlecase(HERODOTUS_READER *r, size_t *output)
enum herodotus_status s;
bool ret = true;
uint_least32_t cp;
+ size_t nwb;
- for (;;) {
- herodotus_reader_push_advance_limit(r, herodotus_next_word_break(r));
+ for (; (nwb = herodotus_next_word_break(r)) > 0;) {
+ herodotus_reader_push_advance_limit(r, nwb);
for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
/* check if we have a cased character */
prop = get_case_property(cp);
@@ -377,6 +375,7 @@ is_titlecase(HERODOTUS_READER *r, size_t *output)
* we did not encounter any cased character
* up to the word break
*/
+ herodotus_reader_pop_limit(r);
continue;
} else {
/*
diff --git a/src/util.c b/src/util.c
@@ -30,14 +30,31 @@ herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest)
{
size_t i;
+ /*
+ * we copy such that we have a "fresh" start and build
+ * on the fact that src->soft_limit[i] for any i and src->srclen
+ * are always larger or equal to src->off
+ */
dest->type = src->type;
- dest->src = src->src;
- dest->srclen = src->srclen;
- dest->off = src->off;
+ if (src->type == HERODOTUS_TYPE_CODEPOINT) {
+ dest->src = ((const uint_least32_t *)(src->src)) + src->off;
+ } else { /* src->type == HERODOTUS_TYPE_UTF8 */
+ dest->src = ((const char *)(src->src)) + src->off;
+ }
+ if (src->srclen == SIZE_MAX) {
+ dest->srclen = SIZE_MAX;
+ } else {
+ dest->srclen = src->srclen - src->off;
+ }
+ dest->off = 0;
dest->terminated_by_null = src->terminated_by_null;
for (i = 0; i < LEN(src->soft_limit); i++) {
- dest->soft_limit[i] = src->soft_limit[i];
+ if (src->soft_limit[i] == SIZE_MAX) {
+ dest->soft_limit[i] = src->soft_limit[i];
+ } else {
+ dest->soft_limit[i] = src->soft_limit[i] - src->off;
+ }
}
}
@@ -258,6 +275,140 @@ herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp)
}
}
+void
+proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
+ uint_least8_t (*get_break_prop)(uint_least32_t),
+ bool (*is_skippable_prop)(uint_least8_t),
+ void (*skip_shift_callback)(uint_least8_t, void *),
+ struct proper *p)
+{
+ uint_least8_t prop;
+ uint_least32_t cp;
+ size_t i;
+
+ /*
+ * Set up p as a property window over the reader r: store the
+ * caller-supplied state pointer, the no_prop placeholder value
+ * and the three callbacks, then prime the previous/next
+ * property slots of both the raw and the skip view below.
+ * r itself is only copied from, never advanced.
+ */
+ p->state = state;
+ p->no_prop = no_prop;
+ p->get_break_prop = get_break_prop;
+ p->is_skippable_prop = is_skippable_prop;
+ p->skip_shift_callback = skip_shift_callback;
+
+ /*
+ * Initialize mid-reader, which is basically just there
+ * to reflect the current position of the viewing-line.
+ * No codepoint is read from it here; only proper_advance()
+ * moves it.
+ */
+ herodotus_reader_copy(r, &(p->mid_reader));
+
+ /*
+ * In the initialization, we simply (try to) fill in next_prop.
+ * If we cannot read in more (e.g. due to the buffer ending),
+ * the affected next_prop slots keep the value no_prop.
+ */
+
+ /*
+ * initialize the previous properties to have no property
+ * (given we are at the start of the buffer)
+ */
+ p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
+ p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
+
+ /*
+ * initialize the next properties
+ */
+
+ /* initialize the raw reader */
+ herodotus_reader_copy(r, &(p->raw_reader));
+
+ /*
+ * Preset both next raw properties to no_prop, then overwrite
+ * them with the properties of up to two codepoints read from
+ * the raw reader. The loop stops at the first read that does
+ * not report success, leaving the remaining slots at no_prop.
+ */
+ p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
+ for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
+ HERODOTUS_STATUS_SUCCESS; ) {
+ p->raw.next_prop[i++] = p->get_break_prop(cp);
+ }
+
+ /* initialize the skip reader */
+ herodotus_reader_copy(r, &(p->skip_reader));
+
+ /*
+ * Preset both next skip properties to no_prop, then fill them
+ * the same way as the raw ones, except that codepoints whose
+ * property is reported as skippable are read past without
+ * occupying a slot (i is only incremented for non-skippable
+ * properties).
+ */
+ p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
+ for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
+ HERODOTUS_STATUS_SUCCESS; ) {
+ prop = p->get_break_prop(cp);
+ if (!p->is_skippable_prop(prop)) {
+ p->skip.next_prop[i++] = prop;
+ }
+ }
+}
+
+int
+proper_advance(struct proper *p)
+{
+ uint_least8_t prop;
+ uint_least32_t cp;
+
+ /*
+ * Move the viewing-line one codepoint forward. Returns 1
+ * without shifting anything when raw.next_prop[0] is no_prop
+ * (nothing is left in front of the viewing-line), and 0 after
+ * a completed shift.
+ *
+ * First read in the next "raw" property. This read happens
+ * before the end-of-buffer check below, so the raw reader is
+ * consulted even on a call that ends up returning 1. A read
+ * that does not report success yields no_prop.
+ */
+ if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
+ HERODOTUS_STATUS_SUCCESS) {
+ prop = p->get_break_prop(cp);
+ } else {
+ prop = p->no_prop;
+ }
+
+ /*
+ * do a shift-in, unless we find that the property that is to
+ * be moved past the "raw-viewing-line" (this property is stored
+ * in p->raw.next_prop[0]) is a no_prop, indicating that
+ * we are at the end of the buffer.
+ */
+ if (p->raw.next_prop[0] == p->no_prop) {
+ return 1;
+ }
+
+ /*
+ * shift in the properties: every slot moves one position
+ * towards prev_prop[1], whose old value is dropped, and the
+ * freshly read property enters at next_prop[1]
+ */
+ p->raw.prev_prop[1] = p->raw.prev_prop[0];
+ p->raw.prev_prop[0] = p->raw.next_prop[0];
+ p->raw.next_prop[0] = p->raw.next_prop[1];
+ p->raw.next_prop[1] = prop;
+
+ /*
+ * advance the middle reader viewing-line by reading one
+ * codepoint from it; both the read status and the codepoint
+ * value are unused here
+ */
+ (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
+
+ /* check skippability-property */
+ if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
+ /*
+ * the property that has moved past the "raw-viewing-line"
+ * (this property is now (after the raw-shift) stored in
+ * p->raw.prev_prop[0] and guaranteed not to be a no-prop,
+ * guaranteeing that we won't shift a no-prop past the
+ * "viewing-line" in the skip-properties) is not a skippable
+ * property, thus we need to shift the skip property as well.
+ */
+ p->skip.prev_prop[1] = p->skip.prev_prop[0];
+ p->skip.prev_prop[0] = p->skip.next_prop[0];
+ p->skip.next_prop[0] = p->skip.next_prop[1];
+
+ /*
+ * call the skip-shift-callback on the property that
+ * passed the skip-viewing-line (this property is now
+ * stored in p->skip.prev_prop[0]), handing it the
+ * caller-supplied state pointer.
+ */
+ p->skip_shift_callback(p->skip.prev_prop[0], p->state);
+
+ /*
+ * determine the next skip property: default to no_prop
+ * and read from the skip reader until a non-skippable
+ * property is found or a read does not report success
+ */
+ p->skip.next_prop[1] = p->no_prop;
+ while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
+ HERODOTUS_STATUS_SUCCESS) {
+ prop = p->get_break_prop(cp);
+ if (!p->is_skippable_prop(prop)) {
+ p->skip.next_prop[1] = prop;
+ break;
+ }
+ }
+ }
+
+ return 0;
+}
+
inline size_t
get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp)
{
diff --git a/src/util.h b/src/util.h
@@ -74,6 +74,22 @@ typedef struct herodotus_writer {
size_t first_unwritable_offset;
} HERODOTUS_WRITER;
+struct proper {
+ /*
+ * State of the "proper"-property-reader as set up by
+ * proper_init() and moved forward by proper_advance().
+ *
+ * Both views keep two properties on either side of the
+ * viewing-line ("|"), index 0 being the one closest to it:
+ *
+ * prev_prop[1] prev_prop[0] | next_prop[0] next_prop[1]
+ *
+ * "raw" holds the property of every codepoint, whereas "skip"
+ * only holds properties for which is_skippable_prop returns
+ * false. Slots without a codepoint hold no_prop.
+ *
+ * The three readers below are independent copies of the reader
+ * given to proper_init(). state is the opaque pointer that is
+ * passed on to skip_shift_callback.
+ */
+ struct {
+ uint_least8_t prev_prop[2];
+ uint_least8_t next_prop[2];
+ } raw, skip;
+ HERODOTUS_READER mid_reader, raw_reader, skip_reader;
+ void *state;
+ uint_least8_t no_prop;
+ uint_least8_t (*get_break_prop)(uint_least32_t);
+ bool (*is_skippable_prop)(uint_least8_t);
+ void (*skip_shift_callback)(uint_least8_t, void *);
+};
+
void herodotus_reader_init(HERODOTUS_READER *, enum herodotus_type,
const void *, size_t);
void herodotus_reader_copy(const HERODOTUS_READER *, HERODOTUS_READER *);
@@ -90,6 +106,13 @@ void herodotus_writer_nul_terminate(HERODOTUS_WRITER *);
size_t herodotus_writer_number_written(const HERODOTUS_WRITER *);
void herodotus_write_codepoint(HERODOTUS_WRITER *, uint_least32_t);
+void proper_init(const HERODOTUS_READER *, void *, uint_least8_t,
+ uint_least8_t (*get_break_prop)(uint_least32_t),
+ bool (*is_skippable_prop)(uint_least8_t),
+ void (*skip_shift_callback)(uint_least8_t, void *),
+ struct proper *);
+int proper_advance(struct proper *);
+
size_t get_codepoint(const void *, size_t, size_t, uint_least32_t *);
size_t get_codepoint_utf8(const void *, size_t, size_t, uint_least32_t *);