test.c (8534B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <stddef.h> 3 #include <stdint.h> 4 #include <stdio.h> 5 #include <string.h> 6 7 #include "../grapheme.h" 8 #include "../data/gbt.h" 9 10 #define LEN(x) (sizeof(x) / sizeof(*x)) 11 12 static const struct { 13 uint32_t cp; /* input code point */ 14 uint8_t *exp_arr; /* expected UTF-8 byte sequence */ 15 size_t exp_len; /* expected length of UTF-8 sequence */ 16 } enc_test[] = { 17 { 18 /* invalid code point (UTF-16 surrogate half) */ 19 .cp = UINT32_C(0xD800), 20 .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, 21 .exp_len = 3, 22 }, 23 { 24 /* invalid code point (UTF-16-unrepresentable) */ 25 .cp = UINT32_C(0x110000), 26 .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, 27 .exp_len = 3, 28 }, 29 { 30 /* code point encoded to a 1-byte sequence */ 31 .cp = 0x01, 32 .exp_arr = (uint8_t[]){ 0x01 }, 33 .exp_len = 1, 34 }, 35 { 36 /* code point encoded to a 2-byte sequence */ 37 .cp = 0xFF, 38 .exp_arr = (uint8_t[]){ 0xC3, 0xBF }, 39 .exp_len = 2, 40 }, 41 { 42 /* code point encoded to a 3-byte sequence */ 43 .cp = 0xFFF, 44 .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, 45 .exp_len = 3, 46 }, 47 { 48 /* code point encoded to a 4-byte sequence */ 49 .cp = UINT32_C(0xFFFFF), 50 .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, 51 .exp_len = 4, 52 }, 53 }; 54 55 static const struct { 56 uint8_t *arr; /* UTF-8 byte sequence */ 57 size_t len; /* length of UTF-8 byte sequence */ 58 size_t exp_len; /* expected length returned */ 59 uint32_t exp_cp; /* expected code point returned */ 60 } dec_test[] = { 61 { 62 /* empty sequence 63 * [ ] -> 64 * INVALID 65 */ 66 .arr = NULL, 67 .len = 0, 68 .exp_len = 1, 69 .exp_cp = GRAPHEME_CP_INVALID, 70 }, 71 { 72 /* invalid lead byte 73 * [ 11111101 ] -> 74 * INVALID 75 */ 76 .arr = (uint8_t[]){ 0xFD }, 77 .len = 1, 78 .exp_len = 1, 79 .exp_cp = GRAPHEME_CP_INVALID, 80 }, 81 { 82 /* valid 1-byte sequence 83 * [ 00000001 ] -> 84 * 0000001 85 */ 86 .arr = (uint8_t[]){ 0x01 }, 87 .len = 1, 88 .exp_len = 1, 89 .exp_cp = 0x1, 90 }, 91 { 92 /* valid 2-byte sequence 93 * [ 11000011 10111111 ] -> 94 * 00011111111 95 */ 96 .arr = (uint8_t[]){ 0xC3, 0xBF }, 97 .len = 2, 98 .exp_len = 2, 99 .exp_cp = 0xFF, 100 }, 101 { 102 /* invalid 2-byte sequence (second byte missing) 103 * [ 11000011 ] -> 104 * INVALID 105 */ 106 .arr = (uint8_t[]){ 0xC3 }, 107 .len = 1, 108 .exp_len = 2, 109 .exp_cp = GRAPHEME_CP_INVALID, 110 }, 111 { 112 /* invalid 2-byte sequence (second byte malformed) 113 * [ 11000011 11111111 ] -> 114 * INVALID 115 */ 116 .arr = (uint8_t[]){ 0xC3, 0xFF }, 117 .len = 2, 118 .exp_len = 1, 119 .exp_cp = GRAPHEME_CP_INVALID, 120 }, 121 { 122 /* invalid 2-byte sequence (overlong encoded) 123 * [ 11000001 10111111 ] -> 124 * INVALID 125 */ 126 .arr = (uint8_t[]){ 0xC1, 0xBF }, 127 .len = 2, 128 .exp_len = 2, 129 .exp_cp = GRAPHEME_CP_INVALID, 130 }, 131 { 132 /* valid 3-byte sequence 133 * [ 11100000 10111111 10111111 ] -> 134 * 0000111111111111 135 */ 136 .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, 137 .len = 3, 138 .exp_len = 3, 139 .exp_cp = 0xFFF, 140 }, 141 { 142 /* invalid 3-byte sequence (second byte missing) 143 * [ 11100000 ] -> 144 * INVALID 145 */ 146 .arr = (uint8_t[]){ 0xE0 }, 147 .len = 1, 148 .exp_len = 3, 149 .exp_cp = GRAPHEME_CP_INVALID, 150 }, 151 { 152 /* invalid 3-byte sequence (second byte malformed) 153 * [ 11100000 01111111 10111111 ] -> 154 * INVALID 155 */ 156 .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF }, 157 .len = 3, 158 .exp_len = 1, 159 .exp_cp = GRAPHEME_CP_INVALID, 160 }, 161 { 162 /* invalid 3-byte sequence (third byte missing) 163 * [ 11100000 10111111 ] -> 164 * INVALID 165 */ 166 .arr = (uint8_t[]){ 0xE0, 0xBF }, 167 .len = 2, 168 .exp_len = 3, 169 .exp_cp = GRAPHEME_CP_INVALID, 170 }, 171 { 172 /* invalid 3-byte sequence (third byte malformed) 173 * [ 11100000 10111111 01111111 ] -> 174 * INVALID 175 */ 176 .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F }, 177 .len = 3, 178 .exp_len = 2, 179 .exp_cp = GRAPHEME_CP_INVALID, 180 }, 181 { 182 /* invalid 3-byte sequence (overlong encoded) 183 * [ 11100000 10011111 10111111 ] -> 184 * INVALID 185 */ 186 .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF }, 187 .len = 3, 188 .exp_len = 3, 189 .exp_cp = GRAPHEME_CP_INVALID, 190 }, 191 { 192 /* invalid 3-byte sequence (UTF-16 surrogate half) 193 * [ 11101101 10100000 10000000 ] -> 194 * INVALID 195 */ 196 .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 }, 197 .len = 3, 198 .exp_len = 3, 199 .exp_cp = GRAPHEME_CP_INVALID, 200 }, 201 { 202 /* valid 4-byte sequence 203 * [ 11110011 10111111 10111111 10111111 ] -> 204 * 011111111111111111111 205 */ 206 .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, 207 .len = 4, 208 .exp_len = 4, 209 .exp_cp = UINT32_C(0xFFFFF), 210 }, 211 { 212 /* invalid 4-byte sequence (second byte missing) 213 * [ 11110011 ] -> 214 * INVALID 215 */ 216 .arr = (uint8_t[]){ 0xF3 }, 217 .len = 1, 218 .exp_len = 4, 219 .exp_cp = GRAPHEME_CP_INVALID, 220 }, 221 { 222 /* invalid 4-byte sequence (second byte malformed) 223 * [ 11110011 01111111 10111111 10111111 ] -> 224 * INVALID 225 */ 226 .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF }, 227 .len = 4, 228 .exp_len = 1, 229 .exp_cp = GRAPHEME_CP_INVALID, 230 }, 231 { 232 /* invalid 4-byte sequence (third byte missing) 233 * [ 11110011 10111111 ] -> 234 * INVALID 235 */ 236 .arr = (uint8_t[]){ 0xF3, 0xBF }, 237 .len = 2, 238 .exp_len = 4, 239 .exp_cp = GRAPHEME_CP_INVALID, 240 }, 241 { 242 /* invalid 4-byte sequence (third byte malformed) 243 * [ 11110011 10111111 01111111 10111111 ] -> 244 * INVALID 245 */ 246 .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF }, 247 .len = 4, 248 .exp_len = 2, 249 .exp_cp = GRAPHEME_CP_INVALID, 250 }, 251 { 252 /* invalid 4-byte sequence (fourth byte missing) 253 * [ 11110011 10111111 10111111 ] -> 254 * INVALID 255 */ 256 .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF }, 257 .len = 3, 258 .exp_len = 4, 259 .exp_cp = GRAPHEME_CP_INVALID, 260 }, 261 { 262 /* invalid 4-byte sequence (fourth byte malformed) 263 * [ 11110011 10111111 10111111 01111111 ] -> 264 * INVALID 265 */ 266 .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F }, 267 .len = 4, 268 .exp_len = 3, 269 .exp_cp = GRAPHEME_CP_INVALID, 270 }, 271 { 272 /* invalid 4-byte sequence (overlong encoded) 273 * [ 11110000 10000000 10000001 10111111 ] -> 274 * INVALID 275 */ 276 .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF }, 277 .len = 4, 278 .exp_len = 4, 279 .exp_cp = GRAPHEME_CP_INVALID, 280 }, 281 { 282 /* invalid 4-byte sequence (UTF-16-unrepresentable) 283 * [ 11110100 10010000 10000000 10000000 ] -> 284 * INVALID 285 */ 286 .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 }, 287 .len = 4, 288 .exp_len = 4, 289 .exp_cp = GRAPHEME_CP_INVALID, 290 }, 291 }; 292 293 int 294 main(void) 295 { 296 int state; 297 size_t i, j, k, len, failed; 298 299 /* UTF-8 encoder test */ 300 for (i = 0, failed = 0; i < LEN(enc_test); i++) { 301 uint8_t arr[4]; 302 size_t len; 303 304 len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr)); 305 306 if (len != enc_test[i].exp_len || 307 memcmp(arr, enc_test[i].exp_arr, len)) { 308 fprintf(stderr, "Failed UTF-8-encoder test %zu: " 309 "Expected (", i); 310 for (j = 0; j < enc_test[i].exp_len; j++) { 311 fprintf(stderr, "0x%x", 312 enc_test[i].exp_arr[j]); 313 if (j + 1 < enc_test[i].exp_len) { 314 fprintf(stderr, " "); 315 } 316 } 317 fprintf(stderr, "), but got ("); 318 for (j = 0; j < len; j++) { 319 fprintf(stderr, "0x%x", arr[j]); 320 if (j + 1 < len) { 321 fprintf(stderr, " "); 322 } 323 } 324 fprintf(stderr, ")\n"); 325 failed++; 326 } 327 } 328 printf("UTF-8 encoder test: Passed %zu out of %zu tests.\n", 329 LEN(enc_test) - failed, LEN(enc_test)); 330 331 /* UTF-8 decoder test */ 332 for (i = 0, failed = 0; i < LEN(dec_test); i++) { 333 size_t len; 334 uint32_t cp; 335 336 len = grapheme_cp_decode(&cp, dec_test[i].arr, 337 dec_test[i].len); 338 339 if (len != dec_test[i].exp_len || 340 cp != dec_test[i].exp_cp) { 341 fprintf(stderr, "Failed UTF-8-decoder test %zu: " 342 "Expected (%zx,%u), but got (%zx,%u)\n", 343 i, dec_test[i].exp_len, 344 dec_test[i].exp_cp, len, cp); 345 failed++; 346 } 347 } 348 printf("UTF-8 decoder test: Passed %zu out of %zu tests.\n", 349 LEN(dec_test) - failed, LEN(dec_test)); 350 351 /* grapheme break test */ 352 for (i = 0, failed = 0; i < LEN(t); i++) { 353 for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) { 354 if ((j + 1) == t[i].cplen || 355 grapheme_boundary(t[i].cp[j], t[i].cp[j + 1], 356 &state)) { 357 /* check if our resulting length matches */ 358 if (k == t[i].lenlen || len != t[i].len[k++]) { 359 fprintf(stderr, "Failed \"%s\"\n", 360 t[i].descr); 361 failed++; 362 break; 363 } 364 len = 1; 365 } else { 366 len++; 367 } 368 } 369 } 370 printf("Grapheme break test: Passed %zu out of %zu tests.\n", 371 LEN(t) - failed, LEN(t)); 372 373 return (failed > 0) ? 1 : 0; 374 }