test_body.c (8509B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <stddef.h> 3 #include <stdint.h> 4 #include <stdio.h> 5 #include <string.h> 6 7 #include "../grapheme.h" 8 9 #define LEN(x) (sizeof(x) / sizeof(*x)) 10 11 static const struct { 12 uint32_t cp; /* input code point */ 13 uint8_t *exp_arr; /* expected UTF-8 byte sequence */ 14 size_t exp_len; /* expected length of UTF-8 sequence */ 15 } enc_test[] = { 16 { 17 /* invalid code point (UTF-16 surrogate half) */ 18 .cp = UINT32_C(0xD800), 19 .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, 20 .exp_len = 3, 21 }, 22 { 23 /* invalid code point (UTF-16-unrepresentable) */ 24 .cp = UINT32_C(0x110000), 25 .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, 26 .exp_len = 3, 27 }, 28 { 29 /* code point encoded to a 1-byte sequence */ 30 .cp = 0x01, 31 .exp_arr = (uint8_t[]){ 0x01 }, 32 .exp_len = 1, 33 }, 34 { 35 /* code point encoded to a 2-byte sequence */ 36 .cp = 0xFF, 37 .exp_arr = (uint8_t[]){ 0xC3, 0xBF }, 38 .exp_len = 2, 39 }, 40 { 41 /* code point encoded to a 3-byte sequence */ 42 .cp = 0xFFF, 43 .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, 44 .exp_len = 3, 45 }, 46 { 47 /* code point encoded to a 4-byte sequence */ 48 .cp = UINT32_C(0xFFFFF), 49 .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, 50 .exp_len = 4, 51 }, 52 }; 53 54 static const struct { 55 uint8_t *arr; /* UTF-8 byte sequence */ 56 size_t len; /* length of UTF-8 byte sequence */ 57 size_t exp_len; /* expected length returned */ 58 uint32_t exp_cp; /* expected code point returned */ 59 } dec_test[] = { 60 { 61 /* empty sequence 62 * [ ] -> 63 * INVALID 64 */ 65 .arr = NULL, 66 .len = 0, 67 .exp_len = 1, 68 .exp_cp = GRAPHEME_CP_INVALID, 69 }, 70 { 71 /* invalid lead byte 72 * [ 11111101 ] -> 73 * INVALID 74 */ 75 .arr = (uint8_t[]){ 0xFD }, 76 .len = 1, 77 .exp_len = 1, 78 .exp_cp = GRAPHEME_CP_INVALID, 79 }, 80 { 81 /* valid 1-byte sequence 82 * [ 00000001 ] -> 83 * 0000001 84 */ 85 .arr = (uint8_t[]){ 0x01 }, 86 .len = 1, 87 .exp_len = 1, 88 .exp_cp = 0x1, 89 }, 90 { 91 /* valid 2-byte sequence 92 * [ 11000011 10111111 ] -> 93 * 00011111111 94 */ 95 .arr = (uint8_t[]){ 0xC3, 0xBF }, 96 .len = 2, 97 .exp_len = 2, 98 .exp_cp = 0xFF, 99 }, 100 { 101 /* invalid 2-byte sequence (second byte missing) 102 * [ 11000011 ] -> 103 * INVALID 104 */ 105 .arr = (uint8_t[]){ 0xC3 }, 106 .len = 1, 107 .exp_len = 2, 108 .exp_cp = GRAPHEME_CP_INVALID, 109 }, 110 { 111 /* invalid 2-byte sequence (second byte malformed) 112 * [ 11000011 11111111 ] -> 113 * INVALID 114 */ 115 .arr = (uint8_t[]){ 0xC3, 0xFF }, 116 .len = 2, 117 .exp_len = 1, 118 .exp_cp = GRAPHEME_CP_INVALID, 119 }, 120 { 121 /* invalid 2-byte sequence (overlong encoded) 122 * [ 11000001 10111111 ] -> 123 * INVALID 124 */ 125 .arr = (uint8_t[]){ 0xC1, 0xBF }, 126 .len = 2, 127 .exp_len = 2, 128 .exp_cp = GRAPHEME_CP_INVALID, 129 }, 130 { 131 /* valid 3-byte sequence 132 * [ 11100000 10111111 10111111 ] -> 133 * 0000111111111111 134 */ 135 .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, 136 .len = 3, 137 .exp_len = 3, 138 .exp_cp = 0xFFF, 139 }, 140 { 141 /* invalid 3-byte sequence (second byte missing) 142 * [ 11100000 ] -> 143 * INVALID 144 */ 145 .arr = (uint8_t[]){ 0xE0 }, 146 .len = 1, 147 .exp_len = 3, 148 .exp_cp = GRAPHEME_CP_INVALID, 149 }, 150 { 151 /* invalid 3-byte sequence (second byte malformed) 152 * [ 11100000 01111111 10111111 ] -> 153 * INVALID 154 */ 155 .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF }, 156 .len = 3, 157 .exp_len = 1, 158 .exp_cp = GRAPHEME_CP_INVALID, 159 }, 160 { 161 /* invalid 3-byte sequence (third byte missing) 162 * [ 11100000 10111111 ] -> 163 * INVALID 164 */ 165 .arr = (uint8_t[]){ 0xE0, 0xBF }, 166 .len = 2, 167 .exp_len = 3, 168 .exp_cp = GRAPHEME_CP_INVALID, 169 }, 170 { 171 /* invalid 3-byte sequence (third byte malformed) 172 * [ 11100000 10111111 01111111 ] -> 173 * INVALID 174 */ 175 .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F }, 176 .len = 3, 177 .exp_len = 2, 178 .exp_cp = GRAPHEME_CP_INVALID, 179 }, 180 { 181 /* invalid 3-byte sequence (overlong encoded) 182 * [ 11100000 10011111 10111111 ] -> 183 * INVALID 184 */ 185 .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF }, 186 .len = 3, 187 .exp_len = 3, 188 .exp_cp = GRAPHEME_CP_INVALID, 189 }, 190 { 191 /* invalid 3-byte sequence (UTF-16 surrogate half) 192 * [ 11101101 10100000 10000000 ] -> 193 * INVALID 194 */ 195 .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 }, 196 .len = 3, 197 .exp_len = 3, 198 .exp_cp = GRAPHEME_CP_INVALID, 199 }, 200 { 201 /* valid 4-byte sequence 202 * [ 11110011 10111111 10111111 10111111 ] -> 203 * 011111111111111111111 204 */ 205 .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, 206 .len = 4, 207 .exp_len = 4, 208 .exp_cp = UINT32_C(0xFFFFF), 209 }, 210 { 211 /* invalid 4-byte sequence (second byte missing) 212 * [ 11110011 ] -> 213 * INVALID 214 */ 215 .arr = (uint8_t[]){ 0xF3 }, 216 .len = 1, 217 .exp_len = 4, 218 .exp_cp = GRAPHEME_CP_INVALID, 219 }, 220 { 221 /* invalid 4-byte sequence (second byte malformed) 222 * [ 11110011 01111111 10111111 10111111 ] -> 223 * INVALID 224 */ 225 .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF }, 226 .len = 4, 227 .exp_len = 1, 228 .exp_cp = GRAPHEME_CP_INVALID, 229 }, 230 { 231 /* invalid 4-byte sequence (third byte missing) 232 * [ 11110011 10111111 ] -> 233 * INVALID 234 */ 235 .arr = (uint8_t[]){ 0xF3, 0xBF }, 236 .len = 2, 237 .exp_len = 4, 238 .exp_cp = GRAPHEME_CP_INVALID, 239 }, 240 { 241 /* invalid 4-byte sequence (third byte malformed) 242 * [ 11110011 10111111 01111111 10111111 ] -> 243 * INVALID 244 */ 245 .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF }, 246 .len = 4, 247 .exp_len = 2, 248 .exp_cp = GRAPHEME_CP_INVALID, 249 }, 250 { 251 /* invalid 4-byte sequence (fourth byte missing) 252 * [ 11110011 10111111 10111111 ] -> 253 * INVALID 254 */ 255 .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF }, 256 .len = 3, 257 .exp_len = 4, 258 .exp_cp = GRAPHEME_CP_INVALID, 259 }, 260 { 261 /* invalid 4-byte sequence (fourth byte malformed) 262 * [ 11110011 10111111 10111111 01111111 ] -> 263 * INVALID 264 */ 265 .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F }, 266 .len = 4, 267 .exp_len = 3, 268 .exp_cp = GRAPHEME_CP_INVALID, 269 }, 270 { 271 /* invalid 4-byte sequence (overlong encoded) 272 * [ 11110000 10000000 10000001 10111111 ] -> 273 * INVALID 274 */ 275 .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF }, 276 .len = 4, 277 .exp_len = 4, 278 .exp_cp = GRAPHEME_CP_INVALID, 279 }, 280 { 281 /* invalid 4-byte sequence (UTF-16-unrepresentable) 282 * [ 11110100 10010000 10000000 10000000 ] -> 283 * INVALID 284 */ 285 .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 }, 286 .len = 4, 287 .exp_len = 4, 288 .exp_cp = GRAPHEME_CP_INVALID, 289 }, 290 }; 291 292 int 293 main(void) 294 { 295 int state; 296 size_t i, j, k, len, failed; 297 298 /* UTF-8 encoder test */ 299 for (i = 0, failed = 0; i < LEN(enc_test); i++) { 300 uint8_t arr[4]; 301 size_t len; 302 303 len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr)); 304 305 if (len != enc_test[i].exp_len || 306 memcmp(arr, enc_test[i].exp_arr, len)) { 307 fprintf(stderr, "Failed UTF-8-encoder test %zu: " 308 "Expected (", i); 309 for (j = 0; j < enc_test[i].exp_len; j++) { 310 fprintf(stderr, "0x%x", 311 enc_test[i].exp_arr[j]); 312 if (j + 1 < enc_test[i].exp_len) { 313 fprintf(stderr, " "); 314 } 315 } 316 fprintf(stderr, "), but got ("); 317 for (j = 0; j < len; j++) { 318 fprintf(stderr, "0x%x", arr[j]); 319 if (j + 1 < len) { 320 fprintf(stderr, " "); 321 } 322 } 323 fprintf(stderr, ")\n"); 324 failed++; 325 } 326 } 327 printf("UTF-8 encoder test: Passed %zu out of %zu tests.\n", 328 LEN(enc_test) - failed, LEN(enc_test)); 329 330 /* UTF-8 decoder test */ 331 for (i = 0, failed = 0; i < LEN(dec_test); i++) { 332 size_t len; 333 uint32_t cp; 334 335 len = grapheme_cp_decode(&cp, dec_test[i].arr, 336 dec_test[i].len); 337 338 if (len != dec_test[i].exp_len || 339 cp != dec_test[i].exp_cp) { 340 fprintf(stderr, "Failed UTF-8-decoder test %zu: " 341 "Expected (%zx,%u), but got (%zx,%u)\n", 342 i, dec_test[i].exp_len, 343 dec_test[i].exp_cp, len, cp); 344 failed++; 345 } 346 } 347 printf("UTF-8 decoder test: Passed %zu out of %zu tests.\n", 348 LEN(dec_test) - failed, LEN(dec_test)); 349 350 /* grapheme break test */ 351 for (i = 0, failed = 0; i < LEN(t); i++) { 352 for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) { 353 if ((j + 1) == t[i].cplen || 354 grapheme_boundary(t[i].cp[j], t[i].cp[j + 1], 355 &state)) { 356 /* check if our resulting length matches */ 357 if (k == t[i].lenlen || len != t[i].len[k++]) { 358 fprintf(stderr, "Failed \"%s\"\n", 359 t[i].descr); 360 failed++; 361 break; 362 } 363 len = 1; 364 } else { 365 len++; 366 } 367 } 368 } 369 printf("Grapheme break test: Passed %zu out of %zu tests.\n", 370 LEN(t) - failed, LEN(t)); 371 372 return (failed > 0) ? 1 : 0; 373 }