bidirectional.c (12515B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <errno.h> 3 #include <inttypes.h> 4 #include <stddef.h> 5 #include <stdio.h> 6 #include <stdlib.h> 7 #include <string.h> 8 9 #include "util.h" 10 11 #define FILE_BIDI_BRACKETS "data/BidiBrackets.txt" 12 #define FILE_BIDI_CLASS "data/DerivedBidiClass.txt" 13 #define FILE_BIDI_MIRRORING "data/BidiMirroring.txt" 14 #define FILE_UNICODE_DATA "data/UnicodeData.txt" 15 16 #define NUM_BRACKET_ALIASES 20 17 18 static const struct property_spec bidi_property[] = { 19 { 20 /* default */ 21 .enumname = "L", 22 .file = FILE_BIDI_CLASS, 23 .ucdname = "L", 24 }, 25 { 26 .enumname = "AL", 27 .file = FILE_BIDI_CLASS, 28 .ucdname = "AL", 29 }, 30 { 31 .enumname = "AN", 32 .file = FILE_BIDI_CLASS, 33 .ucdname = "AN", 34 }, 35 { 36 .enumname = "B", 37 .file = FILE_BIDI_CLASS, 38 .ucdname = "B", 39 }, 40 { 41 .enumname = "BN", 42 .file = FILE_BIDI_CLASS, 43 .ucdname = "BN", 44 }, 45 { 46 .enumname = "CS", 47 .file = FILE_BIDI_CLASS, 48 .ucdname = "CS", 49 }, 50 { 51 .enumname = "EN", 52 .file = FILE_BIDI_CLASS, 53 .ucdname = "EN", 54 }, 55 { 56 .enumname = "ES", 57 .file = FILE_BIDI_CLASS, 58 .ucdname = "ES", 59 }, 60 { 61 .enumname = "ET", 62 .file = FILE_BIDI_CLASS, 63 .ucdname = "ET", 64 }, 65 { 66 .enumname = "FSI", 67 .file = FILE_BIDI_CLASS, 68 .ucdname = "FSI", 69 }, 70 { 71 .enumname = "LRE", 72 .file = FILE_BIDI_CLASS, 73 .ucdname = "LRE", 74 }, 75 { 76 .enumname = "LRI", 77 .file = FILE_BIDI_CLASS, 78 .ucdname = "LRI", 79 }, 80 { 81 .enumname = "LRO", 82 .file = FILE_BIDI_CLASS, 83 .ucdname = "LRO", 84 }, 85 { 86 .enumname = "NSM", 87 .file = FILE_BIDI_CLASS, 88 .ucdname = "NSM", 89 }, 90 { 91 .enumname = "ON", 92 .file = FILE_BIDI_CLASS, 93 .ucdname = "ON", 94 }, 95 { 96 .enumname = "PDF", 97 .file = FILE_BIDI_CLASS, 98 .ucdname = "PDF", 99 }, 100 { 101 .enumname = "PDI", 102 .file = FILE_BIDI_CLASS, 103 .ucdname = "PDI", 104 }, 105 { 106 .enumname = "R", 107 .file = FILE_BIDI_CLASS, 108 .ucdname = "R", 109 }, 110 { 111 .enumname = "RLE", 112 .file = FILE_BIDI_CLASS, 113 .ucdname = "RLE", 114 }, 115 { 116 .enumname = "RLI", 117 .file = FILE_BIDI_CLASS, 118 .ucdname = "RLI", 119 }, 120 { 121 .enumname = "RLO", 122 .file = FILE_BIDI_CLASS, 123 .ucdname = "RLO", 124 }, 125 { 126 .enumname = "S", 127 .file = FILE_BIDI_CLASS, 128 .ucdname = "S", 129 }, 130 { 131 .enumname = "WS", 132 .file = FILE_BIDI_CLASS, 133 .ucdname = "WS", 134 }, 135 }; 136 137 struct decomposition_payload { 138 uint_least32_t cp; 139 uint_least32_t decomposition; 140 }; 141 142 static int 143 decomposition_callback(const char *file, char **field, size_t nfields, 144 char *comment, void *payload) 145 { 146 char *p; 147 struct decomposition_payload *decomp = 148 (struct decomposition_payload *)payload; 149 uint_least32_t cp; 150 151 (void)file; 152 (void)comment; 153 154 if (nfields < 6) { 155 /* we have fewer than 6 fields, discard the line */ 156 return 0; 157 } 158 159 hextocp(field[0], strlen(field[0]), &cp); 160 161 if (decomp->cp == cp) { 162 /* we hit the line that contains our decomposition target */ 163 if (strlen(field[5]) > 0) { 164 p = field[5]; 165 if (*p == '<') { 166 /* 167 * the decomposition contains some metadata 168 * <...> we skip 169 */ 170 for (; *p != '\0'; p++) { 171 if (*p == '>') { 172 p++; 173 while (*p == ' ') { 174 p++; 175 } 176 break; 177 } 178 } 179 } 180 hextocp(p, strlen(p), &(decomp->decomposition)); 181 } else { 182 decomp->decomposition = decomp->cp; 183 } 184 } 185 186 return 0; 187 } 188 189 static struct { 190 uint_least32_t base[NUM_BRACKET_ALIASES]; 191 size_t baselen; 192 uint_least32_t pair[NUM_BRACKET_ALIASES]; 193 size_t pairlen; 194 uint_least8_t class; 195 char type; 196 } *b = NULL; 197 198 static size_t blen; 199 static uint_least8_t bracket_class_count = 1; 200 201 static int 202 bracket_callback(const char *file, char **field, size_t nfields, char *comment, 203 void *payload) 204 { 205 size_t i, j; 206 struct decomposition_payload decomp_base, decomp_pair; 207 uint_least32_t cp_base, cp_pair; 208 209 (void)file; 210 (void)comment; 211 (void)payload; 212 213 if (nfields < 3) { 214 /* we have fewer than 3 fields, discard the line */ 215 return 0; 216 } 217 218 /* parse field data */ 219 hextocp(field[0], strlen(field[0]), &cp_base); 220 hextocp(field[1], strlen(field[1]), &cp_pair); 221 222 /* determine decomposition of the base and pair codepoints */ 223 decomp_base.cp = cp_base; 224 decomp_pair.cp = cp_pair; 225 parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callback, 226 &decomp_base); 227 parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callback, 228 &decomp_pair); 229 230 /* 231 * check if we already have the canonical form in the bracket array, 232 * per convention the canonical form is the first element of the alias 233 * array 234 */ 235 for (i = 0; i < blen; i++) { 236 if (decomp_base.decomposition == b[i].base[0]) { 237 /* we have a match, check type */ 238 if (strlen(field[2]) != 1 || 239 (field[2][0] != 'o' && field[2][0] != 'c')) { 240 /* malformed line */ 241 return 1; 242 } else if (b[i].type != field[2][0]) { 243 /* mismatching types */ 244 return 1; 245 } 246 247 /* 248 * add our base alias to the base array unless it isn't 249 * already in it 250 */ 251 for (j = 0; j < b[i].baselen; j++) { 252 if (cp_base == b[i].base[j]) { 253 /* already in array, do nothing */ 254 break; 255 } 256 } 257 if (j == b[i].baselen) { 258 /* 259 * the base alias is not already in the array, 260 * add it 261 */ 262 if (b[i].baselen == NUM_BRACKET_ALIASES) { 263 fprintf(stderr, "too many aliases\n"); 264 return 1; 265 } 266 b[i].baselen++; 267 b[i].base[b[i].baselen - 1] = cp_base; 268 } 269 270 /* 271 * also add our pair alias to the pair array unless 272 * it isn't already in it 273 */ 274 for (j = 0; j < b[i].pairlen; j++) { 275 if (cp_pair == b[i].pair[j]) { 276 /* already in array, do nothing */ 277 break; 278 } 279 } 280 if (j == b[i].pairlen) { 281 /* 282 * the pair alias is not already in the array, 283 * add it 284 */ 285 if (b[i].pairlen == NUM_BRACKET_ALIASES) { 286 fprintf(stderr, "too many aliases\n"); 287 return 1; 288 } 289 b[i].pairlen++; 290 b[i].pair[b[i].pairlen - 1] = cp_pair; 291 } 292 293 return 0; 294 } 295 } 296 297 /* extend bracket pair array, as this is a new bracket type */ 298 if (!(b = realloc(b, (++blen) * sizeof(*b)))) { 299 fprintf(stderr, "realloc: %s\n", strerror(errno)); 300 exit(1); 301 } 302 303 /* fill field data by adding the canonical form first */ 304 b[blen - 1].base[0] = decomp_base.decomposition; 305 b[blen - 1].baselen = 1; 306 b[blen - 1].pair[0] = decomp_pair.decomposition; 307 b[blen - 1].pairlen = 1; 308 309 /* add alias if it differs from the canonical form */ 310 if (cp_base != decomp_base.decomposition) { 311 b[blen - 1].base[1] = cp_base; 312 b[blen - 1].baselen = 2; 313 } 314 if (cp_pair != decomp_pair.decomposition) { 315 b[blen - 1].pair[1] = cp_pair; 316 b[blen - 1].pairlen = 2; 317 } 318 319 /* add bracket type */ 320 if (strlen(field[2]) != 1 || 321 (field[2][0] != 'o' && field[2][0] != 'c')) { 322 /* malformed line */ 323 return 1; 324 } else { 325 b[blen - 1].type = field[2][0]; 326 } 327 328 /* 329 * determine bracket class by iterating over the bracket-array 330 * and seeing if our current canonical cp already has a matching pair. 331 * We only need to check the first entry in each bracket alias 332 * list, as this is, per convention, the canonical form. 333 * If not, add a new class. 334 */ 335 for (i = 0; i + 1 < blen; i++) { 336 if (b[i].pair[0] == b[blen - 1].base[0]) { 337 /* matched class */ 338 b[blen - 1].class = b[i].class; 339 break; 340 } 341 } 342 if (i + 1 == blen) { 343 /* no match, assign a new class */ 344 b[blen - 1].class = bracket_class_count++; 345 } 346 347 return 0; 348 } 349 350 static void 351 post_process(struct properties *prop) 352 { 353 size_t i, j; 354 355 for (i = 0; i < blen; i++) { 356 /* 357 * given the base property fits in 5 bits, we simply 358 * store the bracket-offset in the bits above that. 359 * 360 * All those properties that are not set here implicitly 361 * have offset 0, which we prepared to contain a stub 362 * for a character that is not a bracket. 363 */ 364 for (j = 0; j < b[i].baselen; j++) { 365 prop[b[i].base[j]].property |= (i << 5); 366 } 367 } 368 } 369 370 static uint_least8_t 371 fill_missing(uint_least32_t cp) 372 { 373 /* based on the @missing-properties in data/DerivedBidiClass.txt */ 374 if ((cp >= UINT32_C(0x0590) && cp <= UINT32_C(0x05FF)) || 375 (cp >= UINT32_C(0x07C0) && cp <= UINT32_C(0x085F)) || 376 (cp >= UINT32_C(0xFB1D) && cp <= UINT32_C(0xFB4F)) || 377 (cp >= UINT32_C(0x10800) && cp <= UINT32_C(0x10CFF)) || 378 (cp >= UINT32_C(0x10D40) && cp <= UINT32_C(0x10EBF)) || 379 (cp >= UINT32_C(0x10F00) && cp <= UINT32_C(0x10F2F)) || 380 (cp >= UINT32_C(0x10F70) && cp <= UINT32_C(0x10FFF)) || 381 (cp >= UINT32_C(0x1E800) && cp <= UINT32_C(0x1EC6F)) || 382 (cp >= UINT32_C(0x1ECC0) && cp <= UINT32_C(0x1ECFF)) || 383 (cp >= UINT32_C(0x1ED50) && cp <= UINT32_C(0x1EDFF)) || 384 (cp >= UINT32_C(0x1EF00) && cp <= UINT32_C(0x1EFFF))) { 385 return 17; /* class R */ 386 } else if ((cp >= UINT32_C(0x0600) && cp <= UINT32_C(0x07BF)) || 387 (cp >= UINT32_C(0x0860) && cp <= UINT32_C(0x08FF)) || 388 (cp >= UINT32_C(0xFB50) && cp <= UINT32_C(0xFDCF)) || 389 (cp >= UINT32_C(0xFDF0) && cp <= UINT32_C(0xFDFF)) || 390 (cp >= UINT32_C(0xFE70) && cp <= UINT32_C(0xFEFF)) || 391 (cp >= UINT32_C(0x10D00) && cp <= UINT32_C(0x10D3F)) || 392 (cp >= UINT32_C(0x10EC0) && cp <= UINT32_C(0x10EFF)) || 393 (cp >= UINT32_C(0x10F30) && cp <= UINT32_C(0x10F6F)) || 394 (cp >= UINT32_C(0x1EC70) && cp <= UINT32_C(0x1ECBF)) || 395 (cp >= UINT32_C(0x1ED00) && cp <= UINT32_C(0x1ED4F)) || 396 (cp >= UINT32_C(0x1EE00) && cp <= UINT32_C(0x1EEFF))) { 397 return 1; /* class AL */ 398 } else if (cp >= UINT32_C(0x20A0) && cp <= UINT32_C(0x20CF)) { 399 return 8; /* class ET */ 400 } else { 401 return 0; /* class L */ 402 } 403 } 404 405 static struct properties *prop_mirror = NULL; 406 407 static int 408 mirror_callback(const char *file, char **field, size_t nfields, char *comment, 409 void *payload) 410 { 411 uint_least32_t cp, cp_mirror; 412 413 (void)file; 414 (void)comment; 415 (void)payload; 416 417 hextocp(field[0], strlen(field[0]), &cp); 418 419 cp_mirror = cp; 420 421 if (nfields >= 2 && strlen(field[1]) > 0 && 422 hextocp(field[1], strlen(field[1]), &cp_mirror)) { 423 return 1; 424 } 425 426 prop_mirror[cp].property = (int_least32_t)cp_mirror - (int_least32_t)cp; 427 428 return 0; 429 } 430 431 static int_least64_t 432 get_value(const struct properties *prop, size_t offset) 433 { 434 return prop[offset].property; 435 } 436 437 int 438 main(int argc, char *argv[]) 439 { 440 struct properties_compressed comp_mirror; 441 struct properties_major_minor mm_mirror; 442 size_t i; 443 444 (void)argc; 445 446 /* 447 * the first element in the bracket array is initialized to 448 * all-zeros, as we use the implicit 0-offset for all those 449 * codepoints that are not a bracket 450 */ 451 if (!(b = calloc((blen = 1), sizeof(*b)))) { 452 fprintf(stderr, "calloc: %s\n", strerror(errno)); 453 exit(1); 454 } 455 parse_file_with_callback(FILE_BIDI_BRACKETS, bracket_callback, NULL); 456 457 properties_generate_break_property(bidi_property, LEN(bidi_property), 458 fill_missing, NULL, post_process, 459 "bidi", argv[0]); 460 461 printf("\nenum bracket_type {\n\tBIDI_BRACKET_NONE,\n\t" 462 "BIDI_BRACKET_OPEN,\n\tBIDI_BRACKET_CLOSE,\n};\n\n" 463 "static const struct bracket {\n\tenum bracket_type type;\n" 464 "\tuint_least8_t class;\n} bidi_bracket[] = {\n"); 465 for (i = 0; i < blen; i++) { 466 printf("\t{\n\t\t.type = %s,\n\t\t.class = " 467 "%" PRIuLEAST8 ",\n\t},\n", 468 (b[i].type == 'o') ? "BIDI_BRACKET_OPEN" : 469 (b[i].type == 'c') ? "BIDI_BRACKET_CLOSE" : 470 "BIDI_BRACKET_NONE", 471 b[i].class); 472 } 473 printf("};\n"); 474 475 /* 476 * allocate property buffer for all 0x110000 codepoints 477 * 478 * the buffers contain the offset from the "base" character 479 * to the respective mirrored character. By callocing we set all 480 * fields to zero, which is also the Unicode "default" in the sense 481 * that the coe point is its mirror (unless we fill it in) 482 */ 483 if (!(prop_mirror = calloc(UINT32_C(0x110000), sizeof(*prop_mirror)))) { 484 fprintf(stderr, "calloc: %s\n", strerror(errno)); 485 exit(1); 486 } 487 parse_file_with_callback(FILE_BIDI_MIRRORING, mirror_callback, NULL); 488 489 /* compress properties */ 490 properties_compress(prop_mirror, &comp_mirror); 491 492 fprintf(stderr, "%s: mirror-LUT compression-ratio: %.2f%%\n", argv[0], 493 properties_get_major_minor(&comp_mirror, &mm_mirror)); 494 495 /* print tables */ 496 properties_print_lookup_table("mirror_major", mm_mirror.major, 0x1100); 497 printf("\n"); 498 properties_print_derived_lookup_table("mirror_minor", mm_mirror.minor, 499 mm_mirror.minorlen, get_value, 500 comp_mirror.data); 501 502 free(comp_mirror.data); 503 free(comp_mirror.offset); 504 free(mm_mirror.major); 505 free(mm_mirror.minor); 506 507 return 0; 508 }