utf8-decode.c (7826B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <stddef.h> 3 #include <stdint.h> 4 #include <stdio.h> 5 #include <string.h> 6 7 #include "../grapheme.h" 8 #include "util.h" 9 10 static const struct { 11 char *arr; /* UTF-8 byte sequence */ 12 size_t len; /* length of UTF-8 byte sequence */ 13 size_t exp_len; /* expected length returned */ 14 uint_least32_t exp_cp; /* expected codepoint returned */ 15 } dec_test[] = { 16 { 17 /* empty sequence 18 * [ ] -> 19 * INVALID 20 */ 21 .arr = NULL, 22 .len = 0, 23 .exp_len = 0, 24 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 25 }, 26 { 27 /* invalid lead byte 28 * [ 11111101 ] -> 29 * INVALID 30 */ 31 .arr = (char *)(unsigned char[]) { 0xFD }, 32 .len = 1, 33 .exp_len = 1, 34 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 35 }, 36 { 37 /* valid 1-byte sequence 38 * [ 00000001 ] -> 39 * 0000001 40 */ 41 .arr = (char *)(unsigned char[]) { 0x01 }, 42 .len = 1, 43 .exp_len = 1, 44 .exp_cp = 0x1, 45 }, 46 { 47 /* valid 2-byte sequence 48 * [ 11000011 10111111 ] -> 49 * 00011111111 50 */ 51 .arr = (char *)(unsigned char[]) { 0xC3, 0xBF }, 52 .len = 2, 53 .exp_len = 2, 54 .exp_cp = 0xFF, 55 }, 56 { 57 /* invalid 2-byte sequence (second byte missing) 58 * [ 11000011 ] -> 59 * INVALID 60 */ 61 .arr = (char *)(unsigned char[]) { 0xC3 }, 62 .len = 1, 63 .exp_len = 2, 64 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 65 }, 66 { 67 /* invalid 2-byte sequence (second byte malformed) 68 * [ 11000011 11111111 ] -> 69 * INVALID 70 */ 71 .arr = (char *)(unsigned char[]) { 0xC3, 0xFF }, 72 .len = 2, 73 .exp_len = 1, 74 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 75 }, 76 { 77 /* invalid 2-byte sequence (overlong encoded) 78 * [ 11000001 10111111 ] -> 79 * INVALID 80 */ 81 .arr = (char *)(unsigned char[]) { 0xC1, 0xBF }, 82 .len = 2, 83 .exp_len = 2, 84 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 85 }, 86 { 87 /* valid 3-byte sequence 88 * [ 11100000 10111111 10111111 ] -> 89 * 0000111111111111 90 */ 91 .arr = (char *)(unsigned char[]) { 0xE0, 0xBF, 0xBF }, 92 .len = 3, 93 .exp_len = 3, 94 .exp_cp = 0xFFF, 95 }, 96 { 97 /* invalid 3-byte sequence (second byte missing) 98 * [ 11100000 ] -> 99 * INVALID 100 */ 101 .arr = (char *)(unsigned char[]) { 0xE0 }, 102 .len = 1, 103 .exp_len = 3, 104 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 105 }, 106 { 107 /* invalid 3-byte sequence (second byte malformed) 108 * [ 11100000 01111111 10111111 ] -> 109 * INVALID 110 */ 111 .arr = (char *)(unsigned char[]) { 0xE0, 0x7F, 0xBF }, 112 .len = 3, 113 .exp_len = 1, 114 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 115 }, 116 { 117 /* invalid 3-byte sequence (short string, second byte malformed) 118 * [ 11100000 01111111 ] -> 119 * INVALID 120 */ 121 .arr = (char *)(unsigned char[]) { 0xE0, 0x7F }, 122 .len = 2, 123 .exp_len = 1, 124 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 125 }, 126 { 127 /* invalid 3-byte sequence (third byte missing) 128 * [ 11100000 10111111 ] -> 129 * INVALID 130 */ 131 .arr = (char *)(unsigned char[]) { 0xE0, 0xBF }, 132 .len = 2, 133 .exp_len = 3, 134 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 135 }, 136 { 137 /* invalid 3-byte sequence (third byte malformed) 138 * [ 11100000 10111111 01111111 ] -> 139 * INVALID 140 */ 141 .arr = (char *)(unsigned char[]) { 0xE0, 0xBF, 0x7F }, 142 .len = 3, 143 .exp_len = 2, 144 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 145 }, 146 { 147 /* invalid 3-byte sequence (overlong encoded) 148 * [ 11100000 10011111 10111111 ] -> 149 * INVALID 150 */ 151 .arr = (char *)(unsigned char[]) { 0xE0, 0x9F, 0xBF }, 152 .len = 3, 153 .exp_len = 3, 154 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 155 }, 156 { 157 /* invalid 3-byte sequence (UTF-16 surrogate half) 158 * [ 11101101 10100000 10000000 ] -> 159 * INVALID 160 */ 161 .arr = (char *)(unsigned char[]) { 0xED, 0xA0, 0x80 }, 162 .len = 3, 163 .exp_len = 3, 164 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 165 }, 166 { 167 /* valid 4-byte sequence 168 * [ 11110011 10111111 10111111 10111111 ] -> 169 * 011111111111111111111 170 */ 171 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF, 0xBF }, 172 .len = 4, 173 .exp_len = 4, 174 .exp_cp = UINT32_C(0xFFFFF), 175 }, 176 { 177 /* invalid 4-byte sequence (second byte missing) 178 * [ 11110011 ] -> 179 * INVALID 180 */ 181 .arr = (char *)(unsigned char[]) { 0xF3 }, 182 .len = 1, 183 .exp_len = 4, 184 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 185 }, 186 { 187 /* invalid 4-byte sequence (second byte malformed) 188 * [ 11110011 01111111 10111111 10111111 ] -> 189 * INVALID 190 */ 191 .arr = (char *)(unsigned char[]) { 0xF3, 0x7F, 0xBF, 0xBF }, 192 .len = 4, 193 .exp_len = 1, 194 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 195 }, 196 { 197 /* invalid 4-byte sequence (short string 1, second byte 198 * malformed) [ 11110011 011111111 ] -> INVALID 199 */ 200 .arr = (char *)(unsigned char[]) { 0xF3, 0x7F }, 201 .len = 2, 202 .exp_len = 1, 203 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 204 }, 205 { 206 /* invalid 4-byte sequence (short string 2, second byte 207 * malformed) [ 11110011 011111111 10111111 ] -> INVALID 208 */ 209 .arr = (char *)(unsigned char[]) { 0xF3, 0x7F, 0xBF }, 210 .len = 3, 211 .exp_len = 1, 212 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 213 }, 214 215 { 216 /* invalid 4-byte sequence (third byte missing) 217 * [ 11110011 10111111 ] -> 218 * INVALID 219 */ 220 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF }, 221 .len = 2, 222 .exp_len = 4, 223 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 224 }, 225 { 226 /* invalid 4-byte sequence (third byte malformed) 227 * [ 11110011 10111111 01111111 10111111 ] -> 228 * INVALID 229 */ 230 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0x7F, 0xBF }, 231 .len = 4, 232 .exp_len = 2, 233 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 234 }, 235 { 236 /* invalid 4-byte sequence (short string, third byte malformed) 237 * [ 11110011 10111111 01111111 ] -> 238 * INVALID 239 */ 240 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0x7F }, 241 .len = 3, 242 .exp_len = 2, 243 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 244 }, 245 { 246 /* invalid 4-byte sequence (fourth byte missing) 247 * [ 11110011 10111111 10111111 ] -> 248 * INVALID 249 */ 250 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF }, 251 .len = 3, 252 .exp_len = 4, 253 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 254 }, 255 { 256 /* invalid 4-byte sequence (fourth byte malformed) 257 * [ 11110011 10111111 10111111 01111111 ] -> 258 * INVALID 259 */ 260 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF, 0x7F }, 261 .len = 4, 262 .exp_len = 3, 263 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 264 }, 265 { 266 /* invalid 4-byte sequence (overlong encoded) 267 * [ 11110000 10000000 10000001 10111111 ] -> 268 * INVALID 269 */ 270 .arr = (char *)(unsigned char[]) { 0xF0, 0x80, 0x81, 0xBF }, 271 .len = 4, 272 .exp_len = 4, 273 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 274 }, 275 { 276 /* invalid 4-byte sequence (UTF-16-unrepresentable) 277 * [ 11110100 10010000 10000000 10000000 ] -> 278 * INVALID 279 */ 280 .arr = (char *)(unsigned char[]) { 0xF4, 0x90, 0x80, 0x80 }, 281 .len = 4, 282 .exp_len = 4, 283 .exp_cp = GRAPHEME_INVALID_CODEPOINT, 284 }, 285 }; 286 287 int 288 main(int argc, char *argv[]) 289 { 290 size_t i, failed; 291 292 (void)argc; 293 294 /* UTF-8 decoder test */ 295 for (i = 0, failed = 0; i < LEN(dec_test); i++) { 296 size_t len; 297 uint_least32_t cp; 298 299 len = grapheme_decode_utf8(dec_test[i].arr, dec_test[i].len, 300 &cp); 301 302 if (len != dec_test[i].exp_len || cp != dec_test[i].exp_cp) { 303 fprintf(stderr, 304 "%s: Failed test %zu: " 305 "Expected (%zx,%u), but got (%zx,%u).\n", 306 argv[0], i, dec_test[i].exp_len, 307 dec_test[i].exp_cp, len, cp); 308 failed++; 309 } 310 } 311 printf("%s: %zu/%zu unit tests passed.\n", argv[0], 312 LEN(dec_test) - failed, LEN(dec_test)); 313 314 return (failed > 0) ? 1 : 0; 315 }