Add UTF-8-encoder tests - libgrapheme - unicode string library

commit d2b53cb080b8c75b140bb1a3347b409c118e882d
parent 21b6f66acc659e8c515d4685a11fa534a289af14
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun, 31 May 2020 22:49:30 +0200

Add UTF-8-encoder tests

This should cover all the edge cases and provide a regression test
for the encoder.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M src/test_body.c  | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--

1 file changed, 75 insertions(+), 2 deletions(-)
diff --git a/src/test_body.c b/src/test_body.c
@@ -1,14 +1,55 @@
 /* See LICENSE file for copyright and license details. */
 #include <stddef.h>
 #include <stdio.h>
+#include <string.h>
 
 #include "boundary.h"
 #include "codepoint.h"
 
 #define LEN(x) (sizeof(x) / sizeof(*x))
 
-/* all types valid/invalid, overencoded, surrogate, over 10FFFF w/e
- * expected return value and return cp */
+static const struct {
+	uint32_t cp;      /* input code point handed to the encoder */
+	uint8_t *exp_arr; /* expected UTF-8 byte sequence written by the encoder */
+	size_t   exp_len; /* expected return value (length of exp_arr in bytes) */
+} enc_test[] = {
+	{
+		/* invalid code point (UTF-16 surrogate half); expect the bytes of U+FFFD */
+		.cp      = UINT32_C(0xD800),
+		.exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+		.exp_len = 3,
+	},
+	{
+		/* invalid code point (above U+10FFFF, UTF-16-unrepresentable); expect the bytes of U+FFFD */
+		.cp      = UINT32_C(0x110000),
+		.exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+		.exp_len = 3,
+	},
+	{
+		/* code point encoded to a 1-byte sequence (U+0001) */
+		.cp      = 0x01,
+		.exp_arr = (uint8_t[]){ 0x01 },
+		.exp_len = 1,
+	},
+	{
+		/* code point encoded to a 2-byte sequence (U+00FF) */
+		.cp      = 0xFF,
+		.exp_arr = (uint8_t[]){ 0xC3, 0xBF },
+		.exp_len = 2,
+	},
+	{
+		/* code point encoded to a 3-byte sequence (U+0FFF) */
+		.cp      = 0xFFF,
+		.exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+		.exp_len = 3,
+	},
+	{
+		/* code point encoded to a 4-byte sequence (U+FFFFF) */
+		.cp      = UINT32_C(0xFFFFF),
+		.exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+		.exp_len = 4,
+	},
+};
 
 static const struct {
 	uint8_t *arr;     /* byte array */
@@ -253,6 +294,38 @@ int main(void)
 	int state;
 	size_t i, j, k, len, failed;
 
+	/* UTF-8 encoder test */
+	for (i = 0, failed = 0; i < LEN(enc_test); i++) {
+		uint8_t arr[4];
+		size_t len;
+
+		len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr));
+
+		if (len != enc_test[i].exp_len ||
+		    memcmp(arr, enc_test[i].exp_arr, len)) {
+			fprintf(stderr, "Failed UTF-8-encoder test %zu: "
+			        "Expected (", i);
+			for (j = 0; j < enc_test[i].exp_len; j++) {
+				fprintf(stderr, "0x%x",
+				        enc_test[i].exp_arr[j]);
+				if (j != enc_test[i].exp_len - 1) {
+					fprintf(stderr, " ");
+				}
+			}
+			fprintf(stderr, "), but got (");
+			for (j = 0; j < len; j++) {
+				fprintf(stderr, "0x%x", arr[j]);
+				if (j != len - 1) {
+					fprintf(stderr, " ");
+				}
+			}
+			fprintf(stderr, ")\n");
+			failed++;
+		}
+	}
+	printf("UTF-8 encoder test: Passed %zu out of %zu tests.\n",
+	       LEN(enc_test) - failed, LEN(enc_test));
+
 	/* UTF-8 decoder test */
 	for (i = 0, failed = 0; i < LEN(dec_test); i++) {
 		size_t len;

	libgrapheme unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log | Files | Refs | README | LICENSE