libgrapheme

grapheme cluster utility library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | LICENSE

test.c (8534B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <stddef.h>
      3 #include <stdint.h>
      4 #include <stdio.h>
      5 #include <string.h>
      6 
      7 #include "../grapheme.h"
      8 #include "../data/gbt.h"
      9 
     10 #define LEN(x) (sizeof(x) / sizeof(*x))
     11 
     12 static const struct {
     13 	uint32_t cp;      /* input code point */
     14 	uint8_t *exp_arr; /* expected UTF-8 byte sequence */
     15 	size_t   exp_len; /* expected length of UTF-8 sequence */
     16 } enc_test[] = {
     17 	{
     18 		/* invalid code point (UTF-16 surrogate half) */
     19 		.cp      = UINT32_C(0xD800),
     20 		.exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
     21 		.exp_len = 3,
     22 	},
     23 	{
     24 		/* invalid code point (UTF-16-unrepresentable) */
     25 		.cp      = UINT32_C(0x110000),
     26 		.exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
     27 		.exp_len = 3,
     28 	},
     29 	{
     30 		/* code point encoded to a 1-byte sequence */
     31 		.cp      = 0x01,
     32 		.exp_arr = (uint8_t[]){ 0x01 },
     33 		.exp_len = 1,
     34 	},
     35 	{
     36 		/* code point encoded to a 2-byte sequence */
     37 		.cp      = 0xFF,
     38 		.exp_arr = (uint8_t[]){ 0xC3, 0xBF },
     39 		.exp_len = 2,
     40 	},
     41 	{
     42 		/* code point encoded to a 3-byte sequence */
     43 		.cp      = 0xFFF,
     44 		.exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
     45 		.exp_len = 3,
     46 	},
     47 	{
     48 		/* code point encoded to a 4-byte sequence */
     49 		.cp      = UINT32_C(0xFFFFF),
     50 		.exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
     51 		.exp_len = 4,
     52 	},
     53 };
     54 
     55 static const struct {
     56 	uint8_t *arr;     /* UTF-8 byte sequence */
     57 	size_t   len;     /* length of UTF-8 byte sequence */
     58 	size_t   exp_len; /* expected length returned */
     59 	uint32_t exp_cp;  /* expected code point returned */
     60 } dec_test[] = {
     61 	{
     62 		/* empty sequence
     63 		 * [ ] ->
     64 		 * INVALID
     65 		 */
     66 		.arr     = NULL,
     67 		.len     = 0,
     68 		.exp_len = 1,
     69 		.exp_cp  = GRAPHEME_CP_INVALID,
     70 	},
     71 	{
     72 		/* invalid lead byte
     73 		 * [ 11111101 ] ->
     74 		 * INVALID
     75 		 */
     76 		.arr     = (uint8_t[]){ 0xFD },
     77 		.len     = 1,
     78 		.exp_len = 1,
     79 		.exp_cp  = GRAPHEME_CP_INVALID,
     80 	},
     81 	{
     82 		/* valid 1-byte sequence
     83 		 * [ 00000001 ] ->
     84 		 * 0000001
     85 		 */
     86 		.arr     = (uint8_t[]){ 0x01 },
     87 		.len     = 1,
     88 		.exp_len = 1,
     89 		.exp_cp  = 0x1,
     90 	},
     91 	{
     92 		/* valid 2-byte sequence
     93 		 * [ 11000011 10111111 ] ->
     94 		 * 00011111111
     95 		 */
     96 		.arr     = (uint8_t[]){ 0xC3, 0xBF },
     97 		.len     = 2,
     98 		.exp_len = 2,
     99 		.exp_cp  = 0xFF,
    100 	},
    101 	{
    102 		/* invalid 2-byte sequence (second byte missing)
    103 		 * [ 11000011 ] ->
    104 		 * INVALID
    105 		 */
    106 		.arr     = (uint8_t[]){ 0xC3 },
    107 		.len     = 1,
    108 		.exp_len = 2,
    109 		.exp_cp  = GRAPHEME_CP_INVALID,
    110 	},
    111 	{
    112 		/* invalid 2-byte sequence (second byte malformed)
    113 		 * [ 11000011 11111111 ] ->
    114 		 * INVALID
    115 		 */
    116 		.arr     = (uint8_t[]){ 0xC3, 0xFF },
    117 		.len     = 2,
    118 		.exp_len = 1,
    119 		.exp_cp  = GRAPHEME_CP_INVALID,
    120 	},
    121 	{
    122 		/* invalid 2-byte sequence (overlong encoded)
    123 		 * [ 11000001 10111111 ] ->
    124 		 * INVALID
    125 		 */
    126 		.arr     = (uint8_t[]){ 0xC1, 0xBF },
    127 		.len     = 2,
    128 		.exp_len = 2,
    129 		.exp_cp  = GRAPHEME_CP_INVALID,
    130 	},
    131 	{
    132 		/* valid 3-byte sequence
    133 		 * [ 11100000 10111111 10111111 ] ->
    134 		 * 0000111111111111
    135 		 */
    136 		.arr     = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
    137 		.len     = 3,
    138 		.exp_len = 3,
    139 		.exp_cp  = 0xFFF,
    140 	},
    141 	{
    142 		/* invalid 3-byte sequence (second byte missing)
    143 		 * [ 11100000 ] ->
    144 		 * INVALID
    145 		 */
    146 		.arr     = (uint8_t[]){ 0xE0 },
    147 		.len     = 1,
    148 		.exp_len = 3,
    149 		.exp_cp  = GRAPHEME_CP_INVALID,
    150 	},
    151 	{
    152 		/* invalid 3-byte sequence (second byte malformed)
    153 		 * [ 11100000 01111111 10111111 ] ->
    154 		 * INVALID
    155 		 */
    156 		.arr     = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
    157 		.len     = 3,
    158 		.exp_len = 1,
    159 		.exp_cp  = GRAPHEME_CP_INVALID,
    160 	},
    161 	{
    162 		/* invalid 3-byte sequence (third byte missing)
    163 		 * [ 11100000 10111111 ] ->
    164 		 * INVALID
    165 		 */
    166 		.arr     = (uint8_t[]){ 0xE0, 0xBF },
    167 		.len     = 2,
    168 		.exp_len = 3,
    169 		.exp_cp  = GRAPHEME_CP_INVALID,
    170 	},
    171 	{
    172 		/* invalid 3-byte sequence (third byte malformed)
    173 		 * [ 11100000 10111111 01111111 ] ->
    174 		 * INVALID
    175 		 */
    176 		.arr     = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
    177 		.len     = 3,
    178 		.exp_len = 2,
    179 		.exp_cp  = GRAPHEME_CP_INVALID,
    180 	},
    181 	{
    182 		/* invalid 3-byte sequence (overlong encoded)
    183 		 * [ 11100000 10011111 10111111 ] ->
    184 		 * INVALID
    185 		 */
    186 		.arr     = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
    187 		.len     = 3,
    188 		.exp_len = 3,
    189 		.exp_cp  = GRAPHEME_CP_INVALID,
    190 	},
    191 	{
    192 		/* invalid 3-byte sequence (UTF-16 surrogate half)
    193 		 * [ 11101101 10100000 10000000 ] ->
    194 		 * INVALID
    195 		 */
    196 		.arr     = (uint8_t[]){ 0xED, 0xA0, 0x80 },
    197 		.len     = 3,
    198 		.exp_len = 3,
    199 		.exp_cp  = GRAPHEME_CP_INVALID,
    200 	},
    201 	{
    202 		/* valid 4-byte sequence
    203 		 * [ 11110011 10111111 10111111 10111111 ] ->
    204 		 * 011111111111111111111
    205 		 */
    206 		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
    207 		.len     = 4,
    208 		.exp_len = 4,
    209 		.exp_cp  = UINT32_C(0xFFFFF),
    210 	},
    211 	{
    212 		/* invalid 4-byte sequence (second byte missing)
    213 		 * [ 11110011 ] ->
    214 		 * INVALID
    215 		 */
    216 		.arr     = (uint8_t[]){ 0xF3 },
    217 		.len     = 1,
    218 		.exp_len = 4,
    219 		.exp_cp  = GRAPHEME_CP_INVALID,
    220 	},
    221 	{
    222 		/* invalid 4-byte sequence (second byte malformed)
    223 		 * [ 11110011 01111111 10111111 10111111 ] ->
    224 		 * INVALID
    225 		 */
    226 		.arr     = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
    227 		.len     = 4,
    228 		.exp_len = 1,
    229 		.exp_cp  = GRAPHEME_CP_INVALID,
    230 	},
    231 	{
    232 		/* invalid 4-byte sequence (third byte missing)
    233 		 * [ 11110011 10111111 ] ->
    234 		 * INVALID
    235 		 */
    236 		.arr     = (uint8_t[]){ 0xF3, 0xBF },
    237 		.len     = 2,
    238 		.exp_len = 4,
    239 		.exp_cp  = GRAPHEME_CP_INVALID,
    240 	},
    241 	{
    242 		/* invalid 4-byte sequence (third byte malformed)
    243 		 * [ 11110011 10111111 01111111 10111111 ] ->
    244 		 * INVALID
    245 		 */
    246 		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
    247 		.len     = 4,
    248 		.exp_len = 2,
    249 		.exp_cp  = GRAPHEME_CP_INVALID,
    250 	},
    251 	{
    252 		/* invalid 4-byte sequence (fourth byte missing)
    253 		 * [ 11110011 10111111 10111111 ] ->
    254 		 * INVALID
    255 		 */
    256 		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
    257 		.len     = 3,
    258 		.exp_len = 4,
    259 		.exp_cp  = GRAPHEME_CP_INVALID,
    260 	},
    261 	{
    262 		/* invalid 4-byte sequence (fourth byte malformed)
    263 		 * [ 11110011 10111111 10111111 01111111 ] ->
    264 		 * INVALID
    265 		 */
    266 		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
    267 		.len     = 4,
    268 		.exp_len = 3,
    269 		.exp_cp  = GRAPHEME_CP_INVALID,
    270 	},
    271 	{
    272 		/* invalid 4-byte sequence (overlong encoded)
    273 		 * [ 11110000 10000000 10000001 10111111 ] ->
    274 		 * INVALID
    275 		 */
    276 		.arr     = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
    277 		.len     = 4,
    278 		.exp_len = 4,
    279 		.exp_cp  = GRAPHEME_CP_INVALID,
    280 	},
    281 	{
    282 		/* invalid 4-byte sequence (UTF-16-unrepresentable)
    283 		 * [ 11110100 10010000 10000000 10000000 ] ->
    284 		 * INVALID
    285 		 */
    286 		.arr     = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
    287 		.len     = 4,
    288 		.exp_len = 4,
    289 		.exp_cp  = GRAPHEME_CP_INVALID,
    290 	},
    291 };
    292 
    293 int
    294 main(void)
    295 {
    296 	int state;
    297 	size_t i, j, k, len, failed;
    298 
    299 	/* UTF-8 encoder test */
    300 	for (i = 0, failed = 0; i < LEN(enc_test); i++) {
    301 		uint8_t arr[4];
    302 		size_t len;
    303 
    304 		len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr));
    305 
    306 		if (len != enc_test[i].exp_len ||
    307 		    memcmp(arr, enc_test[i].exp_arr, len)) {
    308 			fprintf(stderr, "Failed UTF-8-encoder test %zu: "
    309 			        "Expected (", i);
    310 			for (j = 0; j < enc_test[i].exp_len; j++) {
    311 				fprintf(stderr, "0x%x",
    312 				        enc_test[i].exp_arr[j]);
    313 				if (j + 1 < enc_test[i].exp_len) {
    314 					fprintf(stderr, " ");
    315 				}
    316 			}
    317 			fprintf(stderr, "), but got (");
    318 			for (j = 0; j < len; j++) {
    319 				fprintf(stderr, "0x%x", arr[j]);
    320 				if (j + 1 < len) {
    321 					fprintf(stderr, " ");
    322 				}
    323 			}
    324 			fprintf(stderr, ")\n");
    325 			failed++;
    326 		}
    327 	}
    328 	printf("UTF-8 encoder test: Passed %zu out of %zu tests.\n",
    329 	       LEN(enc_test) - failed, LEN(enc_test));
    330 
    331 	/* UTF-8 decoder test */
    332 	for (i = 0, failed = 0; i < LEN(dec_test); i++) {
    333 		size_t len;
    334 		uint32_t cp;
    335 
    336 		len = grapheme_cp_decode(&cp, dec_test[i].arr,
    337 		                         dec_test[i].len);
    338 
    339 		if (len != dec_test[i].exp_len ||
    340 		    cp != dec_test[i].exp_cp) {
    341 			fprintf(stderr, "Failed UTF-8-decoder test %zu: "
    342 			        "Expected (%zx,%u), but got (%zx,%u)\n",
    343 			        i, dec_test[i].exp_len,
    344 			        dec_test[i].exp_cp, len, cp);
    345 			failed++;
    346 		}
    347 	}
    348 	printf("UTF-8 decoder test: Passed %zu out of %zu tests.\n",
    349 	       LEN(dec_test) - failed, LEN(dec_test));
    350 
    351 	/* grapheme break test */
    352 	for (i = 0, failed = 0; i < LEN(t); i++) {
    353 		for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) {
    354 			if ((j + 1) == t[i].cplen ||
    355 			    grapheme_boundary(t[i].cp[j], t[i].cp[j + 1],
    356 			                      &state)) {
    357 				/* check if our resulting length matches */
    358 				if (k == t[i].lenlen || len != t[i].len[k++]) {
    359 					fprintf(stderr, "Failed \"%s\"\n",
    360 					        t[i].descr);
    361 					failed++;
    362 					break;
    363 				}
    364 				len = 1;
    365 			} else {
    366 				len++;
    367 			}
    368 		}
    369 	}
    370 	printf("Grapheme break test: Passed %zu out of %zu tests.\n",
    371 	       LEN(t) - failed, LEN(t));
    372 
    373 	return (failed > 0) ? 1 : 0;
    374 }