libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

commit 21b6f66acc659e8c515d4685a11fa534a289af14
parent 52a25d52f16697e74dfd582217de5d169c3790cb
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun, 31 May 2020 22:44:06 +0200

Add UTF-8-encode-function

For a library that merely detects grapheme clusters, the reasoning
behind adding an encoding function is not immediately apparent. The
main reason is that some decoding scenarios actually change the text
representation (by identifying invalid code points and outputting
them as such).
The user should have the chance to output a "processed" stream.

A minor benefit with very little overhead is that this encoding
function is just useful in general.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M src/codepoint.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M src/codepoint.h | 1 +
2 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/src/codepoint.c b/src/codepoint.c @@ -9,7 +9,8 @@ static const struct { uint8_t lower; /* lower bound of sequence first byte */ uint8_t upper; /* upper bound of sequence first byte */ - uint32_t mincp; /* smallest non-overlong encoded codepoint */ + uint32_t mincp; /* smallest non-overlong encoded code point */ + uint32_t maxcp; /* largest encodable code point */ /* * implicit: table-offset represents the number of following * bytes of the form 10xxxxxx (6 bits capacity each) @@ -20,24 +21,28 @@ static const struct { .lower = 0x00, /* 00000000 */ .upper = 0x7F, /* 01111111 */ .mincp = (uint32_t)0, + .maxcp = ((uint32_t)1 << 7) - 1, /* 7 bits capacity */ }, [1] = { /* 110xxxxx */ .lower = 0xC0, /* 11000000 */ .upper = 0xDF, /* 11011111 */ - .mincp = (uint32_t)1 << 7, /* [0] has 7 bits capacity */ + .mincp = (uint32_t)1 << 7, + .maxcp = ((uint32_t)1 << 11) - 1, /* 5+6=11 bits capacity */ }, [2] = { /* 1110xxxx */ .lower = 0xE0, /* 11100000 */ .upper = 0xEF, /* 11101111 */ - .mincp = (uint32_t)1 << 11, /* [1] has 5+6=11 bits capacity */ + .mincp = (uint32_t)1 << 11, + .maxcp = ((uint32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */ }, [3] = { /* 11110xxx */ .lower = 0xF0, /* 11110000 */ .upper = 0xF7, /* 11110111 */ - .mincp = (uint32_t)1 << 16, /* [2] has 4+6+6=16 bits capacity */ + .mincp = (uint32_t)1 << 16, + .maxcp = ((uint32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */ }, }; @@ -117,3 +122,55 @@ grapheme_cp_decode(uint32_t *cp, const uint8_t *s, size_t n) return 1 + off; } + +size_t +grapheme_cp_encode(uint32_t cp, uint8_t *s, size_t n) +{ + size_t off, i; + + if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) || + cp > UINT32_C(0x10FFFF)) { + /* + * code point is a high or low UTF-16 surrogate half + * (0xD800..0xDFFF) or not representable in UTF-16 + * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8. 
+ */ + cp = CP_INVALID; + } + + /* determine necessary sequence type */ + for (off = 0; off < LEN(lut); off++) { + if (cp <= lut[off].maxcp) { + break; + } + } + if (1 + off > n) { + /* specified buffer is too small to store sequence */ + return 1 + off; + } + + /* build sequence by filling cp-bits into each byte */ + + /* + * lut[off].lower is the bit-format for the first byte and + * the bits to fill into it are determined by shifting the + * cp 6 times the number of following bytes, as each + * following byte stores 6 bits, yielding the wanted bits. + * + * We do not overwrite the mask because we guaranteed earlier + * that there are no bits higher than the mask allows. + */ + s[0] = lut[off].lower | (cp >> (6 * off)); + + for (i = 1; i <= off; i++) { + /* + * the bit-format for following bytes is 10000000 (0x80) + * and it each stores 6 bits in the 6 low bits that we + * extract from the properly-shifted value using the + * mask 00111111 (0x3F) + */ + s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F); + } + + return 1 + off; +} diff --git a/src/codepoint.h b/src/codepoint.h @@ -10,5 +10,6 @@ typedef uint32_t Codepoint; #define CP_INVALID 0xFFFD size_t grapheme_cp_decode(uint32_t *, const uint8_t *, size_t); +size_t grapheme_cp_encode(uint32_t, uint8_t *, size_t); #endif /* CODEPOINT_H */