libgrapheme

grapheme cluster utility library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | LICENSE

test_body.c (8509B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <stddef.h>
      3 #include <stdint.h>
      4 #include <stdio.h>
      5 #include <string.h>
      6 
      7 #include "../grapheme.h"
      8 
      9 #define LEN(x) (sizeof(x) / sizeof(*x))
     10 
     11 static const struct {
     12 	uint32_t cp;      /* input code point */
     13 	uint8_t *exp_arr; /* expected UTF-8 byte sequence */
     14 	size_t   exp_len; /* expected length of UTF-8 sequence */
     15 } enc_test[] = {
     16 	{
     17 		/* invalid code point (UTF-16 surrogate half) */
     18 		.cp      = UINT32_C(0xD800),
     19 		.exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
     20 		.exp_len = 3,
     21 	},
     22 	{
     23 		/* invalid code point (UTF-16-unrepresentable) */
     24 		.cp      = UINT32_C(0x110000),
     25 		.exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
     26 		.exp_len = 3,
     27 	},
     28 	{
     29 		/* code point encoded to a 1-byte sequence */
     30 		.cp      = 0x01,
     31 		.exp_arr = (uint8_t[]){ 0x01 },
     32 		.exp_len = 1,
     33 	},
     34 	{
     35 		/* code point encoded to a 2-byte sequence */
     36 		.cp      = 0xFF,
     37 		.exp_arr = (uint8_t[]){ 0xC3, 0xBF },
     38 		.exp_len = 2,
     39 	},
     40 	{
     41 		/* code point encoded to a 3-byte sequence */
     42 		.cp      = 0xFFF,
     43 		.exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
     44 		.exp_len = 3,
     45 	},
     46 	{
     47 		/* code point encoded to a 4-byte sequence */
     48 		.cp      = UINT32_C(0xFFFFF),
     49 		.exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
     50 		.exp_len = 4,
     51 	},
     52 };
     53 
/*
 * UTF-8 decoder test vectors. Each entry gives an input byte sequence
 * together with the length and code point that the decoder test in
 * main() expects back from grapheme_cp_decode().
 *
 * Conventions encoded by the entries below (every invalid case expects
 * GRAPHEME_CP_INVALID as code point):
 *  - an empty input and an invalid lead byte expect length 1;
 *  - a sequence that ends before all of its bytes are present expects
 *    the full length announced by its lead byte, which is therefore
 *    larger than len;
 *  - a malformed continuation byte expects the number of bytes that
 *    precede the malformed byte;
 *  - overlong, surrogate and out-of-range encodings expect the full
 *    sequence length.
 *
 * NOTE(review): arr could probably be a pointer to const if
 * grapheme_cp_decode() takes a const pointer -- confirm against
 * grapheme.h before changing it.
 */
static const struct {
	uint8_t *arr;     /* UTF-8 byte sequence (NULL for the empty case) */
	size_t   len;     /* length of UTF-8 byte sequence */
	size_t   exp_len; /* expected length returned */
	uint32_t exp_cp;  /* expected code point returned */
} dec_test[] = {
	{
		/* empty sequence
		 * [ ] ->
		 * INVALID
		 */
		.arr     = NULL,
		.len     = 0,
		.exp_len = 1,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* invalid lead byte
		 * [ 11111101 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xFD },
		.len     = 1,
		.exp_len = 1,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* valid 1-byte sequence
		 * [ 00000001 ] ->
		 * 0000001
		 */
		.arr     = (uint8_t[]){ 0x01 },
		.len     = 1,
		.exp_len = 1,
		.exp_cp  = 0x1,
	},
	{
		/* valid 2-byte sequence
		 * [ 11000011 10111111 ] ->
		 * 00011111111
		 */
		.arr     = (uint8_t[]){ 0xC3, 0xBF },
		.len     = 2,
		.exp_len = 2,
		.exp_cp  = 0xFF,
	},
	{
		/* invalid 2-byte sequence (second byte missing)
		 * [ 11000011 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xC3 },
		.len     = 1,
		.exp_len = 2,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* invalid 2-byte sequence (second byte malformed)
		 * [ 11000011 11111111 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xC3, 0xFF },
		.len     = 2,
		.exp_len = 1,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* invalid 2-byte sequence (overlong encoded)
		 * [ 11000001 10111111 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xC1, 0xBF },
		.len     = 2,
		.exp_len = 2,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* valid 3-byte sequence
		 * [ 11100000 10111111 10111111 ] ->
		 * 0000111111111111
		 */
		.arr     = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
		.len     = 3,
		.exp_len = 3,
		.exp_cp  = 0xFFF,
	},
	{
		/* invalid 3-byte sequence (second byte missing)
		 * [ 11100000 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xE0 },
		.len     = 1,
		.exp_len = 3,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* invalid 3-byte sequence (second byte malformed)
		 * [ 11100000 01111111 10111111 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
		.len     = 3,
		.exp_len = 1,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* invalid 3-byte sequence (third byte missing)
		 * [ 11100000 10111111 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xE0, 0xBF },
		.len     = 2,
		.exp_len = 3,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* invalid 3-byte sequence (third byte malformed)
		 * [ 11100000 10111111 01111111 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
		.len     = 3,
		.exp_len = 2,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* invalid 3-byte sequence (overlong encoded)
		 * [ 11100000 10011111 10111111 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
		.len     = 3,
		.exp_len = 3,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* invalid 3-byte sequence (UTF-16 surrogate half)
		 * [ 11101101 10100000 10000000 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xED, 0xA0, 0x80 },
		.len     = 3,
		.exp_len = 3,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* valid 4-byte sequence
		 * [ 11110011 10111111 10111111 10111111 ] ->
		 * 011111111111111111111
		 */
		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
		.len     = 4,
		.exp_len = 4,
		.exp_cp  = UINT32_C(0xFFFFF),
	},
	{
		/* invalid 4-byte sequence (second byte missing)
		 * [ 11110011 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xF3 },
		.len     = 1,
		.exp_len = 4,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* invalid 4-byte sequence (second byte malformed)
		 * [ 11110011 01111111 10111111 10111111 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
		.len     = 4,
		.exp_len = 1,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* invalid 4-byte sequence (third byte missing)
		 * [ 11110011 10111111 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xF3, 0xBF },
		.len     = 2,
		.exp_len = 4,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* invalid 4-byte sequence (third byte malformed)
		 * [ 11110011 10111111 01111111 10111111 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
		.len     = 4,
		.exp_len = 2,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* invalid 4-byte sequence (fourth byte missing)
		 * [ 11110011 10111111 10111111 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
		.len     = 3,
		.exp_len = 4,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* invalid 4-byte sequence (fourth byte malformed)
		 * [ 11110011 10111111 10111111 01111111 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
		.len     = 4,
		.exp_len = 3,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* invalid 4-byte sequence (overlong encoded)
		 * [ 11110000 10000000 10000001 10111111 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
		.len     = 4,
		.exp_len = 4,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
	{
		/* invalid 4-byte sequence (UTF-16-unrepresentable)
		 * [ 11110100 10010000 10000000 10000000 ] ->
		 * INVALID
		 */
		.arr     = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
		.len     = 4,
		.exp_len = 4,
		.exp_cp  = GRAPHEME_CP_INVALID,
	},
};
    291 
    292 int
    293 main(void)
    294 {
    295 	int state;
    296 	size_t i, j, k, len, failed;
    297 
    298 	/* UTF-8 encoder test */
    299 	for (i = 0, failed = 0; i < LEN(enc_test); i++) {
    300 		uint8_t arr[4];
    301 		size_t len;
    302 
    303 		len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr));
    304 
    305 		if (len != enc_test[i].exp_len ||
    306 		    memcmp(arr, enc_test[i].exp_arr, len)) {
    307 			fprintf(stderr, "Failed UTF-8-encoder test %zu: "
    308 			        "Expected (", i);
    309 			for (j = 0; j < enc_test[i].exp_len; j++) {
    310 				fprintf(stderr, "0x%x",
    311 				        enc_test[i].exp_arr[j]);
    312 				if (j + 1 < enc_test[i].exp_len) {
    313 					fprintf(stderr, " ");
    314 				}
    315 			}
    316 			fprintf(stderr, "), but got (");
    317 			for (j = 0; j < len; j++) {
    318 				fprintf(stderr, "0x%x", arr[j]);
    319 				if (j + 1 < len) {
    320 					fprintf(stderr, " ");
    321 				}
    322 			}
    323 			fprintf(stderr, ")\n");
    324 			failed++;
    325 		}
    326 	}
    327 	printf("UTF-8 encoder test: Passed %zu out of %zu tests.\n",
    328 	       LEN(enc_test) - failed, LEN(enc_test));
    329 
    330 	/* UTF-8 decoder test */
    331 	for (i = 0, failed = 0; i < LEN(dec_test); i++) {
    332 		size_t len;
    333 		uint32_t cp;
    334 
    335 		len = grapheme_cp_decode(&cp, dec_test[i].arr,
    336 		                         dec_test[i].len);
    337 
    338 		if (len != dec_test[i].exp_len ||
    339 		    cp != dec_test[i].exp_cp) {
    340 			fprintf(stderr, "Failed UTF-8-decoder test %zu: "
    341 			        "Expected (%zx,%u), but got (%zx,%u)\n",
    342 			        i, dec_test[i].exp_len,
    343 			        dec_test[i].exp_cp, len, cp);
    344 			failed++;
    345 		}
    346 	}
    347 	printf("UTF-8 decoder test: Passed %zu out of %zu tests.\n",
    348 	       LEN(dec_test) - failed, LEN(dec_test));
    349 
    350 	/* grapheme break test */
    351 	for (i = 0, failed = 0; i < LEN(t); i++) {
    352 		for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) {
    353 			if ((j + 1) == t[i].cplen ||
    354 			    grapheme_boundary(t[i].cp[j], t[i].cp[j + 1],
    355 			                      &state)) {
    356 				/* check if our resulting length matches */
    357 				if (k == t[i].lenlen || len != t[i].len[k++]) {
    358 					fprintf(stderr, "Failed \"%s\"\n",
    359 					        t[i].descr);
    360 					failed++;
    361 					break;
    362 				}
    363 				len = 1;
    364 			} else {
    365 				len++;
    366 			}
    367 		}
    368 	}
    369 	printf("Grapheme break test: Passed %zu out of %zu tests.\n",
    370 	       LEN(t) - failed, LEN(t));
    371 
    372 	return (failed > 0) ? 1 : 0;
    373 }