utf8-decode.c (7494B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <stddef.h> 3 #include <stdint.h> 4 #include <stdio.h> 5 #include <string.h> 6 7 #include "../grapheme.h" 8 #include "util.h" 9 10 static const struct { 11 char *arr; /* UTF-8 byte sequence */ 12 size_t len; /* length of UTF-8 byte sequence */ 13 size_t exp_len; /* expected length returned */ 14 uint_least32_t exp_cp; /* expected codepoint returned */ 15 } dec_test[] = { 16 { 17 /* empty sequence 18 * [ ] -> 19 * INVALID 20 */ 21 .arr = NULL, 22 .len = 0, 23 .exp_len = 0, 24 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 25 }, 26 { 27 /* invalid lead byte 28 * [ 11111101 ] -> 29 * INVALID 30 */ 31 .arr = (char *)(unsigned char[]){ 0xFD }, 32 .len = 1, 33 .exp_len = 1, 34 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 35 }, 36 { 37 /* valid 1-byte sequence 38 * [ 00000001 ] -> 39 * 0000001 40 */ 41 .arr = (char *)(unsigned char[]){ 0x01 }, 42 .len = 1, 43 .exp_len = 1, 44 .exp_cp = 0x1, 45 }, 46 { 47 /* valid 2-byte sequence 48 * [ 11000011 10111111 ] -> 49 * 00011111111 50 */ 51 .arr = (char *)(unsigned char[]){ 0xC3, 0xBF }, 52 .len = 2, 53 .exp_len = 2, 54 .exp_cp = 0xFF, 55 }, 56 { 57 /* invalid 2-byte sequence (second byte missing) 58 * [ 11000011 ] -> 59 * INVALID 60 */ 61 .arr = (char *)(unsigned char[]){ 0xC3 }, 62 .len = 1, 63 .exp_len = 2, 64 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 65 }, 66 { 67 /* invalid 2-byte sequence (second byte malformed) 68 * [ 11000011 11111111 ] -> 69 * INVALID 70 */ 71 .arr = (char *)(unsigned char[]){ 0xC3, 0xFF }, 72 .len = 2, 73 .exp_len = 1, 74 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 75 }, 76 { 77 /* invalid 2-byte sequence (overlong encoded) 78 * [ 11000001 10111111 ] -> 79 * INVALID 80 */ 81 .arr = (char *)(unsigned char[]){ 0xC1, 0xBF }, 82 .len = 2, 83 .exp_len = 2, 84 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 85 }, 86 { 87 /* valid 3-byte sequence 88 * [ 11100000 10111111 10111111 ] -> 89 * 0000111111111111 90 */ 91 .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF }, 92 .len = 3, 93 .exp_len = 3, 94 .exp_cp = 0xFFF, 95 }, 96 { 97 /* invalid 3-byte sequence (second byte missing) 98 * [ 11100000 ] -> 99 * INVALID 100 */ 101 .arr = (char *)(unsigned char[]){ 0xE0 }, 102 .len = 1, 103 .exp_len = 3, 104 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 105 }, 106 { 107 /* invalid 3-byte sequence (second byte malformed) 108 * [ 11100000 01111111 10111111 ] -> 109 * INVALID 110 */ 111 .arr = (char *)(unsigned char[]){ 0xE0, 0x7F, 0xBF }, 112 .len = 3, 113 .exp_len = 1, 114 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 115 }, 116 { 117 /* invalid 3-byte sequence (short string, second byte malformed) 118 * [ 11100000 01111111 ] -> 119 * INVALID 120 */ 121 .arr = (char *)(unsigned char[]){ 0xE0, 0x7F }, 122 .len = 2, 123 .exp_len = 1, 124 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 125 }, 126 { 127 /* invalid 3-byte sequence (third byte missing) 128 * [ 11100000 10111111 ] -> 129 * INVALID 130 */ 131 .arr = (char *)(unsigned char[]){ 0xE0, 0xBF }, 132 .len = 2, 133 .exp_len = 3, 134 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 135 }, 136 { 137 /* invalid 3-byte sequence (third byte malformed) 138 * [ 11100000 10111111 01111111 ] -> 139 * INVALID 140 */ 141 .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0x7F }, 142 .len = 3, 143 .exp_len = 2, 144 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 145 }, 146 { 147 /* invalid 3-byte sequence (overlong encoded) 148 * [ 11100000 10011111 10111111 ] -> 149 * INVALID 150 */ 151 .arr = (char *)(unsigned char[]){ 0xE0, 0x9F, 0xBF }, 152 .len = 3, 153 .exp_len = 3, 154 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 155 }, 156 { 157 /* invalid 3-byte sequence (UTF-16 surrogate half) 158 * [ 11101101 10100000 10000000 ] -> 159 * INVALID 160 */ 161 .arr = (char *)(unsigned char[]){ 0xED, 0xA0, 0x80 }, 162 .len = 3, 163 .exp_len = 3, 164 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 165 }, 166 { 167 /* valid 4-byte sequence 168 * [ 11110011 10111111 10111111 10111111 ] -> 169 * 011111111111111111111 170 */ 171 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF }, 172 .len = 4, 173 .exp_len = 4, 174 .exp_cp = UINT32_C(0xFFFFF), 175 }, 176 { 177 /* invalid 4-byte sequence (second byte missing) 178 * [ 11110011 ] -> 179 * INVALID 180 */ 181 .arr = (char *)(unsigned char[]){ 0xF3 }, 182 .len = 1, 183 .exp_len = 4, 184 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 185 }, 186 { 187 /* invalid 4-byte sequence (second byte malformed) 188 * [ 11110011 01111111 10111111 10111111 ] -> 189 * INVALID 190 */ 191 .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF, 0xBF }, 192 .len = 4, 193 .exp_len = 1, 194 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 195 }, 196 { 197 /* invalid 4-byte sequence (short string 1, second byte malformed) 198 * [ 11110011 011111111 ] -> 199 * INVALID 200 */ 201 .arr = (char *)(unsigned char[]){ 0xF3, 0x7F }, 202 .len = 2, 203 .exp_len = 1, 204 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 205 }, 206 { 207 /* invalid 4-byte sequence (short string 2, second byte malformed) 208 * [ 11110011 011111111 10111111 ] -> 209 * INVALID 210 */ 211 .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF }, 212 .len = 3, 213 .exp_len = 1, 214 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 215 }, 216 217 { 218 /* invalid 4-byte sequence (third byte missing) 219 * [ 11110011 10111111 ] -> 220 * INVALID 221 */ 222 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF }, 223 .len = 2, 224 .exp_len = 4, 225 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 226 }, 227 { 228 /* invalid 4-byte sequence (third byte malformed) 229 * [ 11110011 10111111 01111111 10111111 ] -> 230 * INVALID 231 */ 232 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F, 0xBF }, 233 .len = 4, 234 .exp_len = 2, 235 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 236 }, 237 { 238 /* invalid 4-byte sequence (short string, third byte malformed) 239 * [ 11110011 10111111 01111111 ] -> 240 * INVALID 241 */ 242 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F }, 243 .len = 3, 244 .exp_len = 2, 245 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 246 }, 247 { 248 /* invalid 4-byte sequence (fourth byte missing) 249 * [ 11110011 10111111 10111111 ] -> 250 * INVALID 251 */ 252 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF }, 253 .len = 3, 254 .exp_len = 4, 255 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 256 }, 257 { 258 /* invalid 4-byte sequence (fourth byte malformed) 259 * [ 11110011 10111111 10111111 01111111 ] -> 260 * INVALID 261 */ 262 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0x7F }, 263 .len = 4, 264 .exp_len = 3, 265 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 266 }, 267 { 268 /* invalid 4-byte sequence (overlong encoded) 269 * [ 11110000 10000000 10000001 10111111 ] -> 270 * INVALID 271 */ 272 .arr = (char *)(unsigned char[]){ 0xF0, 0x80, 0x81, 0xBF }, 273 .len = 4, 274 .exp_len = 4, 275 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 276 }, 277 { 278 /* invalid 4-byte sequence (UTF-16-unrepresentable) 279 * [ 11110100 10010000 10000000 10000000 ] -> 280 * INVALID 281 */ 282 .arr = (char *)(unsigned char[]){ 0xF4, 0x90, 0x80, 0x80 }, 283 .len = 4, 284 .exp_len = 4, 285 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 286 }, 287 }; 288 289 int 290 main(int argc, char *argv[]) 291 { 292 size_t i, failed; 293 294 (void)argc; 295 296 /* UTF-8 decoder test */ 297 for (i = 0, failed = 0; i < LEN(dec_test); i++) { 298 size_t len; 299 uint_least32_t cp; 300 301 len = grapheme_decode_utf8(dec_test[i].arr, 302 dec_test[i].len, &cp); 303 304 if (len != dec_test[i].exp_len || 305 cp != dec_test[i].exp_cp) { 306 fprintf(stderr, "%s: Failed test %zu: " 307 "Expected (%zx,%u), but got (%zx,%u).\n", 308 argv[0], i, dec_test[i].exp_len, 309 dec_test[i].exp_cp, len, cp); 310 failed++; 311 } 312 } 313 printf("%s: %zu/%zu tests passed.\n", argv[0], 314 LEN(dec_test) - failed, LEN(dec_test)); 315 316 return (failed > 0) ? 1 : 0; 317 }