Add "proper"-property-reader - libgrapheme

commit b899fd685c50cbc61999296ce1e0a03a45e74f52
parent a4d42053f13e8471ee3903522f964fc0a1d3161a
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun,  2 Oct 2022 21:09:08 +0200

Add "proper"-property-reader

The word- and sentence-segmentation algorithms make use of a complicated
logic to accomodate "raw" and "skip" properties. The code is barely
readable and doesn't separate abstractions away nicely. Moreover, there
is a high probability that certain edge-cases are not handled properly.

To fix this, this commit adds a "proper"-property-reader, which
basically does the whole dirty details in the background using
well-commented and transparent code that builds on top of the
herodotus-reader instead of doing this by hand. This ensures that we
will (provably) never have buffer overflows unless there is a mistake
in the implementation itself, which can be verified relatively easily
given each function has a limited scope.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M src/case.c  | 25 ++++++++++++-------------
M src/util.c  | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M src/util.h  | 23 +++++++++++++++++++++++

3 files changed, 190 insertions(+), 17 deletions(-)
diff --git a/src/case.c b/src/case.c
@@ -147,18 +147,14 @@ to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
 static size_t
 herodotus_next_word_break(const HERODOTUS_READER *r)
 {
-	if (r->src == NULL || r->off > r->srclen) {
-		return 0;
-	}
+	HERODOTUS_READER tmp;
+
+	herodotus_reader_copy(r, &tmp);
 
 	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
-		return grapheme_next_word_break(
-			((const uint_least32_t *)(r->src)) + r->off,
-			r->srclen - r->off);
+		return grapheme_next_word_break(tmp.src, tmp.srclen);
 	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
-		return grapheme_next_word_break_utf8(
-			((const char *)(r->src)) + r->off,
-			r->srclen - r->off);
+		return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
 	}
 }
 
@@ -168,9 +164,10 @@ to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
 	enum case_property prop;
 	enum herodotus_status s;
 	uint_least32_t cp;
+	size_t nwb;
 
-	for (;;) {
-		herodotus_reader_push_advance_limit(r, herodotus_next_word_break(r));
+	for (; (nwb = herodotus_next_word_break(r)) > 0;) {
+		herodotus_reader_push_advance_limit(r, nwb);
 		for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
 			/* check if we have a cased character */
 			prop = get_case_property(cp);
@@ -354,9 +351,10 @@ is_titlecase(HERODOTUS_READER *r, size_t *output)
 	enum herodotus_status s;
 	bool ret = true;
 	uint_least32_t cp;
+	size_t nwb;
 
-	for (;;) {
-		herodotus_reader_push_advance_limit(r, herodotus_next_word_break(r));
+	for (; (nwb = herodotus_next_word_break(r)) > 0;) {
+		herodotus_reader_push_advance_limit(r, nwb);
 		for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
 			/* check if we have a cased character */
 			prop = get_case_property(cp);
@@ -377,6 +375,7 @@ is_titlecase(HERODOTUS_READER *r, size_t *output)
 			 * we did not encounter any cased character
 			 * up to the word break
 			 */
+			herodotus_reader_pop_limit(r);
 			continue;
 		} else {
 			/*
diff --git a/src/util.c b/src/util.c
@@ -30,14 +30,31 @@ herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest)
 {
 	size_t i;
 
+	/*
+	 * we copy such that we have a "fresh" start and build
+	 * on the fact that src->soft_limit[i] for any i and src->srclen
+	 * are always larger or equal to src->off
+	 */
 	dest->type = src->type;
-	dest->src = src->src;
-	dest->srclen = src->srclen;
-	dest->off = src->off;
+	if (src->type == HERODOTUS_TYPE_CODEPOINT) {
+		dest->src = ((const uint_least32_t *)(src->src)) + src->off;
+	} else { /* src->type == HERODOTUS_TYPE_UTF8 */
+		dest->src = ((const char *)(src->src)) + src->off;
+	}
+	if (src->srclen == SIZE_MAX) {
+		dest->srclen = SIZE_MAX;
+	} else {
+		dest->srclen = src->srclen - src->off;
+	}
+	dest->off = 0;
 	dest->terminated_by_null = src->terminated_by_null;
 
 	for (i = 0; i < LEN(src->soft_limit); i++) {
-		dest->soft_limit[i] = src->soft_limit[i];
+		if (src->soft_limit[i] == SIZE_MAX) {
+			dest->soft_limit[i] = src->soft_limit[i];
+		} else {
+			dest->soft_limit[i] = src->soft_limit[i] - src->off;
+		}
 	}
 }
 
@@ -258,6 +275,140 @@ herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp)
 	}
 }
 
+void
+proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
+            uint_least8_t (*get_break_prop)(uint_least32_t),
+            bool (*is_skippable_prop)(uint_least8_t),
+            void (*skip_shift_callback)(uint_least8_t, void *),
+            struct proper *p)
+{
+	uint_least8_t prop;
+	uint_least32_t cp;
+	size_t i;
+
+	/* set internal variables */
+	p->state = state;
+	p->no_prop = no_prop;
+	p->get_break_prop = get_break_prop;
+	p->is_skippable_prop = is_skippable_prop;
+	p->skip_shift_callback = skip_shift_callback;
+
+	/*
+	 * Initialize mid-reader, which is basically just there
+	 * to reflect the current position of the viewing-line
+	 */
+	herodotus_reader_copy(r, &(p->mid_reader));
+
+	/*
+	 * In the initialization, we simply (try to) fill in next_prop.
+	 * If we cannot read in more (due to the buffer ending), we
+	 * fill in the prop as invalid
+	 */
+
+	/*
+	 * initialize the previous properties to have no property
+	 * (given we are at the start of the buffer)
+	 */
+	p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
+	p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
+
+	/*
+	 * initialize the next properties
+	 */
+
+	/* initialize the raw reader */
+	herodotus_reader_copy(r, &(p->raw_reader));
+
+	/* fill in the two next raw properties (after no-initialization) */
+	p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
+	for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
+	     HERODOTUS_STATUS_SUCCESS; ) {
+		p->raw.next_prop[i++] = p->get_break_prop(cp);
+	}
+
+	/* initialize the skip reader */
+	herodotus_reader_copy(r, &(p->skip_reader));
+
+	/* fill in the two next skip properties (after no-initialization) */
+	p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
+	for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
+	     HERODOTUS_STATUS_SUCCESS; ) {
+		prop = p->get_break_prop(cp);
+		if (!p->is_skippable_prop(prop)) {
+			p->skip.next_prop[i++] = prop;
+		}
+	}
+}
+
+int
+proper_advance(struct proper *p)
+{
+	uint_least8_t prop;
+	uint_least32_t cp;
+
+	/* read in next "raw" property */
+	if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
+	    HERODOTUS_STATUS_SUCCESS) {
+		prop = p->get_break_prop(cp);
+	} else {
+		prop = p->no_prop;
+	}
+
+	/*
+	 * do a shift-in, unless we find that the property that is to
+	 * be moved past the "raw-viewing-line" (this property is stored
+	 * in p->raw.next_prop[0]) is a no_prop, indicating that
+	 * we are at the end of the buffer.
+	 */
+	if (p->raw.next_prop[0] == p->no_prop) {
+		return 1;
+	}
+
+	/* shift in the properties */
+	p->raw.prev_prop[1] = p->raw.prev_prop[0];
+	p->raw.prev_prop[0] = p->raw.next_prop[0];
+	p->raw.next_prop[0] = p->raw.next_prop[1];
+	p->raw.next_prop[1] = prop;
+
+	/* advance the middle reader viewing-line */
+	(void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
+
+	/* check skippability-property */
+	if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
+		/*
+		 * the property that has moved past the "raw-viewing-line"
+		 * (this property is now (after the raw-shift) stored in
+		 * p->raw.prev_prop[0] and guaranteed not to be a no-prop,
+		 * guaranteeing that we won't shift a no-prop past the
+		 * "viewing-line" in the skip-properties) is not a skippable
+		 * property, thus we need to shift the skip property as well.
+		 */
+		p->skip.prev_prop[1] = p->skip.prev_prop[0];
+		p->skip.prev_prop[0] = p->skip.next_prop[0];
+		p->skip.next_prop[0] = p->skip.next_prop[1];
+
+		/*
+		 * call the skip-shift-callback on the property that
+		 * passed the skip-viewing-line (this property is now
+		 * stored in p->skip.prev_prop[0]).
+		 */
+		p->skip_shift_callback(p->skip.prev_prop[0], p->state);
+
+		/* determine the next shift property */
+		p->skip.next_prop[1] = p->no_prop;
+		while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
+		       HERODOTUS_STATUS_SUCCESS) {
+			prop = p->get_break_prop(cp);
+			if (!p->is_skippable_prop(prop)) {
+				p->skip.next_prop[1] = prop;
+				break;
+			}
+		}
+	}
+
+	return 0;
+}
+
 inline size_t
 get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp)
 {
diff --git a/src/util.h b/src/util.h
@@ -74,6 +74,22 @@ typedef struct herodotus_writer {
 	size_t first_unwritable_offset;
 } HERODOTUS_WRITER;
 
+struct proper {
+	/*
+	 * prev_prop[1] prev_prop[0] | next_prop[0] next_prop[1]
+	 */
+	struct {
+		uint_least8_t prev_prop[2];
+		uint_least8_t next_prop[2];
+	} raw, skip;
+	HERODOTUS_READER mid_reader, raw_reader, skip_reader;
+	void *state;
+	uint_least8_t no_prop;
+	uint_least8_t (*get_break_prop)(uint_least32_t);
+	bool (*is_skippable_prop)(uint_least8_t);
+	void (*skip_shift_callback)(uint_least8_t, void *);
+};
+
 void herodotus_reader_init(HERODOTUS_READER *, enum herodotus_type,
                            const void *, size_t);
 void herodotus_reader_copy(const HERODOTUS_READER *, HERODOTUS_READER *);
@@ -90,6 +106,13 @@ void herodotus_writer_nul_terminate(HERODOTUS_WRITER *);
 size_t herodotus_writer_number_written(const HERODOTUS_WRITER *);
 void herodotus_write_codepoint(HERODOTUS_WRITER *, uint_least32_t);
 
+void proper_init(const HERODOTUS_READER *, void *, uint_least8_t,
+                 uint_least8_t (*get_break_prop)(uint_least32_t),
+                 bool (*is_skippable_prop)(uint_least8_t),
+                 void (*skip_shift_callback)(uint_least8_t, void *),
+                 struct proper *);
+int proper_advance(struct proper *);
+
 size_t get_codepoint(const void *, size_t, size_t, uint_least32_t *);
 size_t get_codepoint_utf8(const void *, size_t, size_t, uint_least32_t *);

	libgrapheme unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log \| Files \| Refs \| README \| LICENSE

M	src/case.c	\|	25	++++++++++++-------------
M	src/util.c	\|	159	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M	src/util.h	\|	23	+++++++++++++++++++++++