libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

commit b899fd685c50cbc61999296ce1e0a03a45e74f52
parent a4d42053f13e8471ee3903522f964fc0a1d3161a
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun,  2 Oct 2022 21:09:08 +0200

Add "proper"-property-reader

The word- and sentence-segmentation algorithms make use of a complicated
logic to accomodate "raw" and "skip" properties. The code is barely
readable and doesn't separate abstractions away nicely. Moreover, there
is a high probability that certain edge-cases are not handled properly.

To fix this, this commit adds a "proper"-property-reader, which
basically does the whole dirty details in the background using
well-commented and transparent code that builds on top of the
herodotus-reader instead of doing this by hand. This ensures that we
will (provably) never have buffer overflows unless there is a mistake
in the implementation itself, which can be verified relatively easily
given each function has a limited scope.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
Msrc/case.c | 25++++++++++++-------------
Msrc/util.c | 159+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
Msrc/util.h | 23+++++++++++++++++++++++
3 files changed, 190 insertions(+), 17 deletions(-)

diff --git a/src/case.c b/src/case.c @@ -147,18 +147,14 @@ to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w, static size_t herodotus_next_word_break(const HERODOTUS_READER *r) { - if (r->src == NULL || r->off > r->srclen) { - return 0; - } + HERODOTUS_READER tmp; + + herodotus_reader_copy(r, &tmp); if (r->type == HERODOTUS_TYPE_CODEPOINT) { - return grapheme_next_word_break( - ((const uint_least32_t *)(r->src)) + r->off, - r->srclen - r->off); + return grapheme_next_word_break(tmp.src, tmp.srclen); } else { /* r->type == HERODOTUS_TYPE_UTF8 */ - return grapheme_next_word_break_utf8( - ((const char *)(r->src)) + r->off, - r->srclen - r->off); + return grapheme_next_word_break_utf8(tmp.src, tmp.srclen); } } @@ -168,9 +164,10 @@ to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w) enum case_property prop; enum herodotus_status s; uint_least32_t cp; + size_t nwb; - for (;;) { - herodotus_reader_push_advance_limit(r, herodotus_next_word_break(r)); + for (; (nwb = herodotus_next_word_break(r)) > 0;) { + herodotus_reader_push_advance_limit(r, nwb); for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) { /* check if we have a cased character */ prop = get_case_property(cp); @@ -354,9 +351,10 @@ is_titlecase(HERODOTUS_READER *r, size_t *output) enum herodotus_status s; bool ret = true; uint_least32_t cp; + size_t nwb; - for (;;) { - herodotus_reader_push_advance_limit(r, herodotus_next_word_break(r)); + for (; (nwb = herodotus_next_word_break(r)) > 0;) { + herodotus_reader_push_advance_limit(r, nwb); for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) { /* check if we have a cased character */ prop = get_case_property(cp); @@ -377,6 +375,7 @@ is_titlecase(HERODOTUS_READER *r, size_t *output) * we did not encounter any cased character * up to the word break */ + herodotus_reader_pop_limit(r); continue; } else { /* diff --git a/src/util.c b/src/util.c @@ -30,14 +30,31 @@ herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest) { size_t i; + /* + * we copy such that we have a "fresh" start and build + * on the fact that src->soft_limit[i] for any i and src->srclen + * are always larger or equal to src->off + */ dest->type = src->type; - dest->src = src->src; - dest->srclen = src->srclen; - dest->off = src->off; + if (src->type == HERODOTUS_TYPE_CODEPOINT) { + dest->src = ((const uint_least32_t *)(src->src)) + src->off; + } else { /* src->type == HERODOTUS_TYPE_UTF8 */ + dest->src = ((const char *)(src->src)) + src->off; + } + if (src->srclen == SIZE_MAX) { + dest->srclen = SIZE_MAX; + } else { + dest->srclen = src->srclen - src->off; + } + dest->off = 0; dest->terminated_by_null = src->terminated_by_null; for (i = 0; i < LEN(src->soft_limit); i++) { - dest->soft_limit[i] = src->soft_limit[i]; + if (src->soft_limit[i] == SIZE_MAX) { + dest->soft_limit[i] = src->soft_limit[i]; + } else { + dest->soft_limit[i] = src->soft_limit[i] - src->off; + } } } @@ -258,6 +275,140 @@ herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp) } } +void +proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop, + uint_least8_t (*get_break_prop)(uint_least32_t), + bool (*is_skippable_prop)(uint_least8_t), + void (*skip_shift_callback)(uint_least8_t, void *), + struct proper *p) +{ + uint_least8_t prop; + uint_least32_t cp; + size_t i; + + /* set internal variables */ + p->state = state; + p->no_prop = no_prop; + p->get_break_prop = get_break_prop; + p->is_skippable_prop = is_skippable_prop; + p->skip_shift_callback = skip_shift_callback; + + /* + * Initialize mid-reader, which is basically just there + * to reflect the current position of the viewing-line + */ + herodotus_reader_copy(r, &(p->mid_reader)); + + /* + * In the initialization, we simply (try to) fill in next_prop. + * If we cannot read in more (due to the buffer ending), we + * fill in the prop as invalid + */ + + /* + * initialize the previous properties to have no property + * (given we are at the start of the buffer) + */ + p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop; + p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop; + + /* + * initialize the next properties + */ + + /* initialize the raw reader */ + herodotus_reader_copy(r, &(p->raw_reader)); + + /* fill in the two next raw properties (after no-initialization) */ + p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop; + for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) == + HERODOTUS_STATUS_SUCCESS; ) { + p->raw.next_prop[i++] = p->get_break_prop(cp); + } + + /* initialize the skip reader */ + herodotus_reader_copy(r, &(p->skip_reader)); + + /* fill in the two next skip properties (after no-initialization) */ + p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop; + for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) == + HERODOTUS_STATUS_SUCCESS; ) { + prop = p->get_break_prop(cp); + if (!p->is_skippable_prop(prop)) { + p->skip.next_prop[i++] = prop; + } + } +} + +int +proper_advance(struct proper *p) +{ + uint_least8_t prop; + uint_least32_t cp; + + /* read in next "raw" property */ + if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) == + HERODOTUS_STATUS_SUCCESS) { + prop = p->get_break_prop(cp); + } else { + prop = p->no_prop; + } + + /* + * do a shift-in, unless we find that the property that is to + * be moved past the "raw-viewing-line" (this property is stored + * in p->raw.next_prop[0]) is a no_prop, indicating that + * we are at the end of the buffer. + */ + if (p->raw.next_prop[0] == p->no_prop) { + return 1; + } + + /* shift in the properties */ + p->raw.prev_prop[1] = p->raw.prev_prop[0]; + p->raw.prev_prop[0] = p->raw.next_prop[0]; + p->raw.next_prop[0] = p->raw.next_prop[1]; + p->raw.next_prop[1] = prop; + + /* advance the middle reader viewing-line */ + (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp); + + /* check skippability-property */ + if (!p->is_skippable_prop(p->raw.prev_prop[0])) { + /* + * the property that has moved past the "raw-viewing-line" + * (this property is now (after the raw-shift) stored in + * p->raw.prev_prop[0] and guaranteed not to be a no-prop, + * guaranteeing that we won't shift a no-prop past the + * "viewing-line" in the skip-properties) is not a skippable + * property, thus we need to shift the skip property as well. + */ + p->skip.prev_prop[1] = p->skip.prev_prop[0]; + p->skip.prev_prop[0] = p->skip.next_prop[0]; + p->skip.next_prop[0] = p->skip.next_prop[1]; + + /* + * call the skip-shift-callback on the property that + * passed the skip-viewing-line (this property is now + * stored in p->skip.prev_prop[0]). + */ + p->skip_shift_callback(p->skip.prev_prop[0], p->state); + + /* determine the next shift property */ + p->skip.next_prop[1] = p->no_prop; + while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) == + HERODOTUS_STATUS_SUCCESS) { + prop = p->get_break_prop(cp); + if (!p->is_skippable_prop(prop)) { + p->skip.next_prop[1] = prop; + break; + } + } + } + + return 0; +} + inline size_t get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp) { diff --git a/src/util.h b/src/util.h @@ -74,6 +74,22 @@ typedef struct herodotus_writer { size_t first_unwritable_offset; } HERODOTUS_WRITER; +struct proper { + /* + * prev_prop[1] prev_prop[0] | next_prop[0] next_prop[1] + */ + struct { + uint_least8_t prev_prop[2]; + uint_least8_t next_prop[2]; + } raw, skip; + HERODOTUS_READER mid_reader, raw_reader, skip_reader; + void *state; + uint_least8_t no_prop; + uint_least8_t (*get_break_prop)(uint_least32_t); + bool (*is_skippable_prop)(uint_least8_t); + void (*skip_shift_callback)(uint_least8_t, void *); +}; + void herodotus_reader_init(HERODOTUS_READER *, enum herodotus_type, const void *, size_t); void herodotus_reader_copy(const HERODOTUS_READER *, HERODOTUS_READER *); @@ -90,6 +106,13 @@ void herodotus_writer_nul_terminate(HERODOTUS_WRITER *); size_t herodotus_writer_number_written(const HERODOTUS_WRITER *); void herodotus_write_codepoint(HERODOTUS_WRITER *, uint_least32_t); +void proper_init(const HERODOTUS_READER *, void *, uint_least8_t, + uint_least8_t (*get_break_prop)(uint_least32_t), + bool (*is_skippable_prop)(uint_least8_t), + void (*skip_shift_callback)(uint_least8_t, void *), + struct proper *); +int proper_advance(struct proper *); + size_t get_codepoint(const void *, size_t, size_t, uint_least32_t *); size_t get_codepoint_utf8(const void *, size_t, size_t, uint_least32_t *);