commit 5332f7ee034081618617c2b0785733ccc9ec8753
parent 563eb65bfbaa4f27c77d73ae81b51882c916993d
Author: Laslo Hunhold <dev@frign.de>
Date: Wed, 21 Sep 2022 20:16:00 +0200
Refactor case-conversion-functions with Herodotus
The readability of the code is greatly improved, and the code is now
much more robust than before.
Signed-off-by: Laslo Hunhold <dev@frign.de>
Diffstat:
M | src/case.c | | | 255 | +++++++++++++++++++++++++++++++++++-------------------------------------------- |
1 file changed, 112 insertions(+), 143 deletions(-)
diff --git a/src/case.c b/src/case.c
@@ -33,22 +33,18 @@ get_case_offset(uint_least32_t cp, const uint_least16_t *major,
}
static inline size_t
-to_case(const void *src, size_t srclen, void *dest, size_t destlen,
- size_t srcnumprocess, uint_least8_t final_sigma_level,
- size_t (*get_codepoint)(const void *, size_t, size_t, uint_least32_t *),
- size_t (*set_codepoint)(uint_least32_t, void *, size_t, size_t),
- const uint_least16_t *major, const int_least32_t *minor,
- const struct special_case *sc)
+to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
+ uint_least8_t final_sigma_level, const uint_least16_t *major,
+ const int_least32_t *minor, const struct special_case *sc)
{
+ HERODOTUS_READER tmp;
enum case_property prop;
- size_t srcoff, destoff, res, tmp, off, i;
+ enum herodotus_status s;
+ size_t off, i;
uint_least32_t cp, tmp_cp;
int_least32_t map;
- for (srcoff = 0, destoff = 0; srcoff < srcnumprocess; srcoff += res) {
- /* read in next source codepoint */
- res = get_codepoint((const char *)src, srclen, srcoff, &cp);
-
+ for (; herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;) {
if (sc == lower_special) {
/*
* For the special Final_Sigma-rule (see SpecialCasing.txt),
@@ -72,8 +68,10 @@ to_case(const void *src, size_t srclen, void *dest, size_t destlen,
* if the succeeding character is cased, invalidating
* the after-condition
*/
- for (tmp = srcoff + res, prop = NUM_CASE_PROPS; tmp < srclen; ) {
- tmp += get_codepoint(src, srclen, tmp, &tmp_cp);
+ herodotus_reader_copy(r, &tmp);
+ for (prop = NUM_CASE_PROPS;
+ (s = herodotus_read_codepoint(&tmp, true, &tmp_cp)) ==
+ HERODOTUS_STATUS_SUCCESS; ) {
prop = get_case_property(tmp_cp);
if (prop != CASE_PROP_CASE_IGNORABLE &&
@@ -83,20 +81,19 @@ to_case(const void *src, size_t srclen, void *dest, size_t destlen,
}
/*
- * Now prop is something other than case-ignorable.
+ * Now prop is something other than case-ignorable or
+ * the source-string ended.
* If it is something other than cased, we know
* that the after-condition holds
*/
- if (prop != CASE_PROP_CASED &&
- prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
+ if (s != HERODOTUS_STATUS_SUCCESS ||
+ (prop != CASE_PROP_CASED &&
+ prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
/*
* write GREEK SMALL LETTER FINAL SIGMA to
* destination
*/
- destoff += set_codepoint(UINT32_C(0x03C2),
- dest,
- destlen,
- destoff);
+ herodotus_write_codepoint(w, UINT32_C(0x03C2));
/* reset Final_Sigma-state and continue */
final_sigma_level = 0;
@@ -132,191 +129,163 @@ to_case(const void *src, size_t srclen, void *dest, size_t destlen,
off = (uint_least32_t)map - UINT32_C(0x110000);
for (i = 0; i < sc[off].cplen; i++) {
- if (likely(destoff < destlen)) {
- /*
- * write special mapping to destination
- */
- destoff += set_codepoint(sc[off].cp[i],
- dest,
- destlen,
- destoff);
- } else {
- /*
- * further increase destoff to indicate
- * how much buffer space we need
- */
- destoff += set_codepoint(sc[off].cp[i],
- NULL, 0, 0);
- }
+ herodotus_write_codepoint(w, sc[off].cp[i]);
}
} else {
/* we have a simple mapping */
- if (likely(destoff < destlen)) {
- destoff += set_codepoint((uint_least32_t)((int_least32_t)cp + map),
- dest, destlen, destoff);
- } else {
- destoff += set_codepoint((uint_least32_t)((int_least32_t)cp + map),
- NULL, 0, 0);
- }
+ herodotus_write_codepoint(w, (uint_least32_t)
+ ((int_least32_t)cp + map));
}
}
- if (set_codepoint == set_codepoint_utf8 && destlen > 0) {
- /*
- * NUL-terminate destination to always ensure NUL-termination,
- * unless in check mode.
- * Just like with snprintf() a return value >= destlen indicates
- * truncation.
- */
- ((char *)dest)[(destoff < destlen) ? destoff : (destlen - 1)] = '\0';
+ herodotus_writer_nul_terminate(w);
+
+ return herodotus_writer_number_written(w);
+}
+
+static size_t
+herodotus_next_word_break(const HERODOTUS_READER *r)
+{
+ if (r->src == NULL || r->off > r->srclen) {
+ return 0;
}
- return destoff;
+ if (r->type == HERODOTUS_TYPE_CODEPOINT) {
+ return grapheme_next_word_break(
+ ((const uint_least32_t *)(r->src)) + r->off,
+ r->srclen - r->off);
+ } else { /* r->type == HERODOTUS_TYPE_UTF8 */
+ return grapheme_next_word_break_utf8(
+ ((const char *)(r->src)) + r->off,
+ r->srclen - r->off);
+ }
}
static inline size_t
-to_titlecase(const void *src, size_t srclen, void *dest, size_t destlen,
- size_t (*get_codepoint)(const void *, size_t, size_t, uint_least32_t *),
- size_t (*set_codepoint)(uint_least32_t, void *, size_t, size_t))
+to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
{
enum case_property prop;
- size_t next_wb, srcoff, destoff, res;
+ enum herodotus_status s;
uint_least32_t cp;
- for (srcoff = destoff = 0; ; ) {
- if (get_codepoint == get_codepoint_utf8) {
- if ((next_wb = grapheme_next_word_break_utf8((const char *)src + srcoff,
- srclen - srcoff)) == 0) {
- /* we consumed all of the string */
- break;
- }
- } else {
- if ((next_wb = grapheme_next_word_break((const uint_least32_t *)src + srcoff,
- srclen - srcoff)) == 0) {
- /* we consumed all of the string */
- break;
- }
- }
-
- for (; next_wb > 0 && srcoff < srclen; next_wb -= res, srcoff += res) {
+ for (;;) {
+ herodotus_reader_push_advance_limit(r, herodotus_next_word_break(r));
+ for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
/* check if we have a cased character */
- res = get_codepoint(src, srclen, srcoff, &cp);
prop = get_case_property(cp);
if (prop == CASE_PROP_CASED ||
prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
break;
} else {
/* write the data to the output verbatim, if it permits */
- destoff += set_codepoint_utf8(cp, dest, destlen, destoff);
- }
- }
+ herodotus_write_codepoint(w, cp);
- if (next_wb > 0) {
- /* get character length */
- res = get_codepoint(src, srclen, srcoff, &cp);
-
- /* we have a cased character at srcoff, map it to titlecase */
- if (get_codepoint == get_codepoint_utf8) {
- destoff += to_case((const char *)src + srcoff,
- srclen - srcoff,
- (char *)dest + destoff,
- (destoff < destlen) ? (destlen - destoff) : 0,
- res, 0,
- get_codepoint_utf8,
- set_codepoint_utf8, title_major,
- title_minor, title_special);
- } else {
- destoff += to_case((const uint_least32_t *)src + srcoff,
- srclen - srcoff,
- (uint_least32_t *)dest + destoff,
- (destoff < destlen) ? (destlen - destoff) : 0,
- res, 0,
- get_codepoint,
- set_codepoint, title_major,
- title_minor, title_special);
+ /* increment reader */
+ herodotus_read_codepoint(r, true, &cp);
}
-
- /* we consumed a character */
- srcoff += res;
- next_wb -= res;
}
- /* cast the rest of the codepoints in the word to lowercase */
- if (get_codepoint == get_codepoint_utf8) {
- destoff += to_case((const char *)src + srcoff,
- srclen - srcoff,
- (char *)dest + destoff,
- (destoff < destlen) ? (destlen - destoff) : 0,
- next_wb, 1,
- get_codepoint_utf8,
- set_codepoint_utf8, lower_major,
- lower_minor, lower_special);
+ if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
+ /* we are done */
+ break;
+ } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
+ /*
+ * we did not encounter any cased character
+ * up to the word break
+ */
+ continue;
} else {
- destoff += to_case((const uint_least32_t *)src + srcoff,
- srclen - srcoff,
- (uint_least32_t *)dest + destoff,
- (destoff < destlen) ? (destlen - destoff) : 0,
- next_wb, 1,
- get_codepoint,
- set_codepoint, lower_major,
- lower_minor, lower_special);
+ /*
+ * we encountered a cased character before the word
+ * break, convert it to titlecase
+ */
+ herodotus_reader_push_advance_limit(r,
+ herodotus_reader_next_codepoint_break(r));
+ to_case(r, w, 0, title_major, title_minor, title_special);
+ herodotus_reader_pop_limit(r);
}
- srcoff += next_wb;
- }
- if (set_codepoint == set_codepoint_utf8) {
- /*
- * NUL-terminate destination to always ensure NUL-termination.
- * Just like with snprintf() a return value >= destlen indicates
- * truncation.
- */
- ((char *)dest)[(destoff < destlen) ? destoff : (destlen - 1)] = '\0';
+ /* cast the rest of the codepoints in the word to lowercase */
+ to_case(r, w, 1, lower_major, lower_minor, lower_special);
+
+ herodotus_reader_pop_limit(r);
}
- return destoff;
+ herodotus_writer_nul_terminate(w);
+
+ return herodotus_writer_number_written(w);
}
size_t
grapheme_to_uppercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
{
- return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint, set_codepoint,
- upper_major, upper_minor, upper_special);
+ HERODOTUS_READER r;
+ HERODOTUS_WRITER w;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
+ herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
+
+ return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
}
size_t
grapheme_to_lowercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
{
- return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint, set_codepoint,
- lower_major, lower_minor, lower_special);
+ HERODOTUS_READER r;
+ HERODOTUS_WRITER w;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
+ herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
+
+ return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
}
size_t
grapheme_to_titlecase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
{
- return to_titlecase(src, srclen, dest, destlen, get_codepoint,
- set_codepoint);
+ HERODOTUS_READER r;
+ HERODOTUS_WRITER w;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
+ herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
+
+ return to_titlecase(&r, &w);
}
size_t
grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
{
- return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint_utf8, set_codepoint_utf8,
- upper_major, upper_minor, upper_special);
+ HERODOTUS_READER r;
+ HERODOTUS_WRITER w;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
+ herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
+
+ return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
}
size_t
grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
{
- return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint_utf8, set_codepoint_utf8,
- lower_major, lower_minor, lower_special);
+ HERODOTUS_READER r;
+ HERODOTUS_WRITER w;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
+ herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
+ return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
}
size_t
grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
{
- return to_titlecase(src, srclen, dest, destlen, get_codepoint_utf8,
- set_codepoint_utf8);
+ HERODOTUS_READER r;
+ HERODOTUS_WRITER w;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
+ herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
+
+ return to_titlecase(&r, &w);
}
static inline bool