Add UTF-8-encode-function - libgrapheme

commit 21b6f66acc659e8c515d4685a11fa534a289af14
parent 52a25d52f16697e74dfd582217de5d169c3790cb
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun, 31 May 2020 22:44:06 +0200

Add UTF-8-encode-function

For a library that merely detects grapheme clusters, the reasoning
behind adding an encoding-function is not immediately apparent. The
main reason is that some decoding-scenarios actually change the text
representation (by identifying invalid code points and outputting
them as such).
The user should have the chance to output a "processed" stream.

A minor benefit with very little overhead is that this encoding
function is just useful in general.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M src/codepoint.c  | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M src/codepoint.h  | 1 +

2 files changed, 62 insertions(+), 4 deletions(-)
diff --git a/src/codepoint.c b/src/codepoint.c
@@ -9,7 +9,8 @@
 static const struct {
 	uint8_t  lower; /* lower bound of sequence first byte */
 	uint8_t  upper; /* upper bound of sequence first byte */
-	uint32_t mincp; /* smallest non-overlong encoded codepoint */
+	uint32_t mincp; /* smallest non-overlong encoded code point */
+	uint32_t maxcp; /* largest encodable code point */
 	/*
 	 * implicit: table-offset represents the number of following
 	 * bytes of the form 10xxxxxx (6 bits capacity each)
@@ -20,24 +21,28 @@ static const struct {
 		.lower = 0x00, /* 00000000 */
 		.upper = 0x7F, /* 01111111 */
 		.mincp = (uint32_t)0,
+		.maxcp = ((uint32_t)1 << 7) - 1, /* 7 bits capacity */
 	},
 	[1] = {
 		/* 110xxxxx */
 		.lower = 0xC0, /* 11000000 */
 		.upper = 0xDF, /* 11011111 */
-		.mincp = (uint32_t)1 << 7, /* [0] has 7 bits capacity */
+		.mincp = (uint32_t)1 << 7,
+		.maxcp = ((uint32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
 	},
 	[2] = {
 		/* 1110xxxx */
 		.lower = 0xE0, /* 11100000 */
 		.upper = 0xEF, /* 11101111 */
-		.mincp = (uint32_t)1 << 11, /* [1] has 5+6=11 bits capacity */
+		.mincp = (uint32_t)1 << 11,
+		.maxcp = ((uint32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
 	},
 	[3] = {
 		/* 11110xxx */
 		.lower = 0xF0, /* 11110000 */
 		.upper = 0xF7, /* 11110111 */
-		.mincp = (uint32_t)1 << 16, /* [2] has 4+6+6=16 bits capacity */
+		.mincp = (uint32_t)1 << 16,
+		.maxcp = ((uint32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
 	},
 };
 
@@ -117,3 +122,55 @@ grapheme_cp_decode(uint32_t *cp, const uint8_t *s, size_t n)
 
 	return 1 + off;
 }
+
+size_t
+grapheme_cp_encode(uint32_t cp, uint8_t *s, size_t n)
+{
+	size_t off, i;
+
+	if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
+	    cp > UINT32_C(0x10FFFF)) {
+		/*
+		 * code point is a high or low UTF-16 surrogate half
+		 * (0xD800..0xDFFF) or not representable in UTF-16
+		 * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8.
+		 */
+		cp = CP_INVALID; /* encode the replacement value instead */
+	}
+
+	/* determine sequence type; off is the number of following bytes */
+	for (off = 0; off < LEN(lut); off++) {
+		if (cp <= lut[off].maxcp) {
+			break;
+		}
+	}
+	if (1 + off > n) {
+		/* buffer too small: write nothing, report the needed length */
+		return 1 + off;
+	}
+
+	/* build sequence by filling cp-bits into each byte */
+
+	/*
+	 * lut[off].lower is the bit-format for the first byte and
+	 * the bits to fill into it are determined by shifting the
+	 * cp right by 6 times the number of following bytes, as
+	 * each following byte stores 6 bits, yielding the wanted bits.
+	 *
+	 * We do not overwrite the mask because we guaranteed earlier
+	 * that there are no bits higher than the mask allows.
+	 */
+	s[0] = lut[off].lower | (cp >> (6 * off));
+
+	for (i = 1; i <= off; i++) {
+		/*
+		 * the bit-format for following bytes is 10000000 (0x80)
+		 * and each of them stores 6 bits in its 6 low bits, which
+		 * we extract from the properly-shifted value using the
+		 * mask 00111111 (0x3F)
+		 */
+		s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F);
+	}
+
+	return 1 + off; /* length of the written sequence in bytes */
+}
diff --git a/src/codepoint.h b/src/codepoint.h
@@ -10,5 +10,6 @@ typedef uint32_t Codepoint;
 #define CP_INVALID 0xFFFD
 
 size_t grapheme_cp_decode(uint32_t *, const uint8_t *, size_t);
+size_t grapheme_cp_encode(uint32_t, uint8_t *, size_t);
 
 #endif /* CODEPOINT_H */

	libgrapheme unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log \| Files \| Refs \| README \| LICENSE

M	src/codepoint.c	\|	65	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M	src/codepoint.h	\|	1	+