util.c (19744B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <ctype.h> 3 #include <errno.h> 4 #include <inttypes.h> 5 #include <stdbool.h> 6 #include <stddef.h> 7 #include <stdint.h> 8 #include <stdio.h> 9 #include <stdlib.h> 10 #include <string.h> 11 12 #include "util.h" 13 14 struct range { 15 uint_least32_t lower; 16 uint_least32_t upper; 17 }; 18 19 static int 20 hextocp(const char *str, size_t len, uint_least32_t *cp) 21 { 22 size_t i; 23 int off; 24 char relative; 25 26 /* the maximum valid codepoint is 0x10FFFF */ 27 if (len > 6) { 28 fprintf(stderr, "hextocp: '%.*s' is too long.\n", (int)len, 29 str); 30 return 1; 31 } 32 33 for (i = 0, *cp = 0; i < len; i++) { 34 if (str[i] >= '0' && str[i] <= '9') { 35 relative = '0'; 36 off = 0; 37 } else if (str[i] >= 'a' && str[i] <= 'f') { 38 relative = 'a'; 39 off = 10; 40 } else if (str[i] >= 'A' && str[i] <= 'F') { 41 relative = 'A'; 42 off = 10; 43 } else { 44 fprintf(stderr, "hextocp: '%.*s' is not hexadecimal.\n", 45 (int)len, str); 46 return 1; 47 } 48 49 *cp += ((uint_least32_t)1 << (4 * (len - i - 1))) * 50 (uint_least32_t)(str[i] - relative + off); 51 } 52 53 if (*cp > UINT32_C(0x10FFFF)) { 54 fprintf(stderr, "hextocp: '%.*s' is too large.\n", (int)len, 55 str); 56 return 1; 57 } 58 59 return 0; 60 } 61 62 int 63 parse_range(const char *str, struct range *range) 64 { 65 char *p; 66 67 if ((p = strstr(str, "..")) == NULL) { 68 /* input has the form "XXXXXX" */ 69 if (hextocp(str, strlen(str), &range->lower)) { 70 return 1; 71 } 72 range->upper = range->lower; 73 } else { 74 /* input has the form "XXXXXX..XXXXXX" */ 75 if (hextocp(str, (size_t)(p - str), &range->lower) || 76 hextocp(p + 2, strlen(p + 2), &range->upper)) { 77 return 1; 78 } 79 } 80 81 return 0; 82 } 83 84 static bool 85 get_line(char **buf, size_t *bufsize, FILE *fp, size_t *len) 86 { 87 int ret = EOF; 88 89 for (*len = 0;; (*len)++) { 90 if (*len > 0 && *buf != NULL && (*buf)[*len - 1] == '\n') { 91 /* 92 * if the previously read character was a newline, 93 * we fake an end-of-file so we NUL-terminate and 94 * are done. 95 */ 96 ret = EOF; 97 } else { 98 ret = fgetc(fp); 99 } 100 101 if (*len >= *bufsize) { 102 /* the buffer needs to be expanded */ 103 *bufsize += 512; 104 if ((*buf = realloc(*buf, *bufsize)) == NULL) { 105 fprintf(stderr, "get_line: Out of memory.\n"); 106 exit(1); 107 } 108 } 109 110 if (ret != EOF) { 111 (*buf)[*len] = (char)ret; 112 } else { 113 (*buf)[*len] = '\0'; 114 break; 115 } 116 } 117 118 return *len == 0 && (feof(fp) || ferror(fp)); 119 } 120 121 static char * 122 duplicate_string(const char *src) 123 { 124 size_t len = strlen(src); 125 char *dest; 126 127 if (!(dest = malloc(len + 1))) { 128 fprintf(stderr, "malloc: %s\n", strerror(errno)); 129 exit(1); 130 } 131 132 memcpy(dest, src, len); 133 dest[len] = '\0'; 134 135 return dest; 136 } 137 138 void 139 duplicate_codepoint_property(const struct codepoint_property *src, 140 struct codepoint_property *dest) 141 { 142 size_t i; 143 144 /* copy the field count */ 145 dest->field_count = src->field_count; 146 147 /* allocate the field array */ 148 if (!(dest->fields = calloc(dest->field_count, sizeof(*(dest->fields))))) { 149 fprintf(stderr, "malloc: %s\n", strerror(errno)); 150 exit(1); 151 } 152 153 for (i = 0; i < dest->field_count; i++) { 154 dest->fields[i] = duplicate_string(src->fields[i]); 155 } 156 } 157 158 static void 159 free_codepoint_property(struct codepoint_property *p) 160 { 161 size_t i; 162 163 for (i = 0; i < p->field_count; i++) { 164 free(p->fields[i]); 165 } 166 free(p->fields); 167 } 168 169 static void 170 free_codepoint_property_set(struct codepoint_property_set *p) 171 { 172 size_t i; 173 174 for (i = 0; i < p->property_count; i++) { 175 free_codepoint_property(&(p->properties[i])); 176 } 177 free(p->properties); 178 } 179 180 void 181 free_codepoint_property_set_array(struct codepoint_property_set *p) 182 { 183 uint_least32_t cp; 184 185 for (cp = 0; cp < UINT32_C(0x110000); cp++) { 186 free_codepoint_property_set(&(p[cp])); 187 } 188 free(p); 189 } 190 191 const struct codepoint_property * 192 match_in_codepoint_property_set(const struct codepoint_property_set *p, 193 const char *str, size_t field_offset) 194 { 195 size_t i; 196 197 for (i = 0; i < p->property_count; i++) { 198 /* there must be enough fields to reach the offset */ 199 if (field_offset >= p->properties[i].field_count) { 200 continue; 201 } 202 203 /* check for a match */ 204 if (strcmp(p->properties[i].fields[field_offset], str) == 0) { 205 return &(p->properties[i]); 206 } 207 } 208 209 return NULL; 210 } 211 212 struct codepoint_property_set * 213 parse_property_file(const char *fname) 214 { 215 FILE *fp; 216 struct codepoint_property p; 217 struct codepoint_property_set *cpp, *missing; 218 struct range range; 219 char *line = NULL, **field = NULL, *comment; 220 uint_least32_t cp; 221 size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields, len; 222 bool is_missing; 223 224 /* 225 * allocate cpp buffer of length 0x11000, initialised to zeros 226 * (NULL 'properties' pointer, zero property_count) 227 */ 228 if (!(cpp = calloc((size_t)UINT32_C(0x110000), sizeof(*cpp)))) { 229 fprintf(stderr, "calloc: %s\n", strerror(errno)); 230 exit(1); 231 } 232 233 /* 234 * allocate same-sized array for 'missing' fields. The data files 235 * have this strange notion of defining some properties in bloody 236 * comments, and in a way that says 'yeah, if we did not define 237 * something for some, then replace it with this value'. However, 238 * it complicates things, as multiple, overlapping @missing 239 * can be defined in a single file. One can deduce that subsequent 240 * @missing just overwrite each other, but there's no way to 241 * track which properties have not been set without running 242 * through the file multiple times, which we want to avoid, so 243 * we accumulate the (self-overwriting) @missing set separately 244 * and fill it in at the end. 245 */ 246 if (!(missing = calloc((size_t)UINT32_C(0x110000), sizeof(*missing)))) { 247 fprintf(stderr, "calloc: %s\n", strerror(errno)); 248 exit(1); 249 } 250 251 /* open the property file */ 252 if (!(fp = fopen(fname, "r"))) { 253 fprintf(stderr, "parse_property_file: fopen '%s': %s.\n", 254 fname, strerror(errno)); 255 exit(1); 256 } 257 258 while (!get_line(&line, &linebufsize, fp, &len)) { 259 /* remove trailing newline */ 260 if (len > 0 && line[len - 1] == '\n') { 261 line[len - 1] = '\0'; 262 len--; 263 } 264 265 /* skip empty lines */ 266 if (len == 0) { 267 continue; 268 } 269 270 /* comment, check if it's a @missing */ 271 is_missing = false; 272 if (line[0] == '#') { 273 if (strncmp(line + 1, " @missing: ", LEN(" @missing: ") - 1) == 0) { 274 /* 275 * this comment specifies a 'missing' line. 276 * We shift it to the left and treat it 277 * like any other line, differentiating it 278 * with the 'is_missing' flag. 279 */ 280 size_t offset = 1 + (LEN(" @missing: ") - 1); 281 282 memmove(line, line + offset, len - offset + 1); 283 len -= offset; 284 is_missing = true; 285 } else { 286 /* skip unrelated comment line */ 287 continue; 288 } 289 } 290 291 /* tokenize line into fields */ 292 for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) { 293 /* skip leading whitespace */ 294 while (line[i] == ' ') { 295 i++; 296 } 297 298 /* check if we crashed into the comment */ 299 if (line[i] != '#') { 300 /* extend field buffer, if necessary */ 301 if (++nfields > fieldbufsize) { 302 if ((field = realloc( 303 field, 304 nfields * 305 sizeof(*field))) == 306 NULL) { 307 fprintf(stderr, 308 "parse_property_" 309 "file: realloc: " 310 "%s.\n", 311 strerror(errno)); 312 exit(1); 313 } 314 fieldbufsize = nfields; 315 } 316 317 /* set current position as field start */ 318 field[nfields - 1] = &line[i]; 319 320 /* continue until we reach ';' or '#' or end */ 321 while (line[i] != ';' && line[i] != '#' && 322 line[i] != '\0') { 323 i++; 324 } 325 } 326 327 if (line[i] == '#') { 328 /* set comment-variable for later */ 329 comment = &line[i + 1]; 330 } 331 332 /* go back whitespace and terminate field there */ 333 if (i > 0) { 334 for (j = i - 1; line[j] == ' '; j--) { 335 ; 336 } 337 line[j + 1] = '\0'; 338 } else { 339 line[i] = '\0'; 340 } 341 342 /* if comment is set, we are done */ 343 if (comment != NULL) { 344 break; 345 } 346 } 347 348 /* skip leading whitespace in comment */ 349 while (comment != NULL && comment[0] == ' ') { 350 comment++; 351 } 352 353 /* 354 * We now have an array 'field' with 'nfields'. We can 355 * follow from the file format that, if nfields > 0, 356 * field[0] specifies a codepoint or range of 357 * codepoints, which we parse as the first step. 358 */ 359 if (nfields < 2) { 360 /* 361 * either there is only a range or no range at 362 * all. Report this 363 */ 364 fprintf(stderr, "parse_property_file: malformed " 365 "line with either no range or no entry\n"); 366 exit(1); 367 } 368 369 if (parse_range(field[0], &range)) { 370 fprintf(stderr, "parse_property_file: parse_range of " 371 "'%s' failed.\n", field[0]); 372 exit(1); 373 } 374 375 /* 376 * fill a codepoint_property from the remaining 377 * fields (no allocations on a stack-allocated struct 378 */ 379 p.fields = field + 1; 380 p.field_count = nfields - 1; 381 382 /* 383 * add a duplicate of the filled codepoint_property 384 * to all codepoints in the range (i.e. the cpp or 385 * missing array, depending on is_missing) 386 */ 387 for (cp = range.lower; cp <= range.upper; cp++) { 388 if (is_missing) { 389 /* 390 * we overwrite the set at the codepoint, 391 * as the @missing properties overwrite 392 * each other (bad design) 393 */ 394 if (missing[cp].property_count == 0) { 395 /* 396 * set is still empty, add space 397 * for one pointer to a 398 * codepoint_property 399 */ 400 if (!(missing[cp].properties = 401 malloc(sizeof(*(missing[cp].properties))))) { 402 fprintf(stderr, "malloc: %s\n", 403 strerror(errno)); 404 exit(1); 405 } 406 missing[cp].property_count = 1; 407 } else if (missing[cp].property_count == 1) { 408 /* free the old property */ 409 free_codepoint_property( 410 &(missing[cp].properties[0])); 411 } else { 412 /* this shouldn't happen */ 413 fprintf(stderr, "parse_property_file: " 414 "malformed missing set\n"); 415 exit(1); 416 } 417 418 /* copy in the new one */ 419 duplicate_codepoint_property( 420 &p, &(missing[cp].properties[0])); 421 } else { 422 /* expand the set by one element */ 423 if (!(cpp[cp].properties = realloc( 424 cpp[cp].properties, 425 sizeof(*(cpp[cp].properties)) * 426 (++cpp[cp].property_count)))) { 427 fprintf(stderr, "malloc: %s\n", 428 strerror(errno)); 429 exit(1); 430 } 431 432 duplicate_codepoint_property(&p, 433 &(cpp[cp].properties[cpp[cp].property_count - 1])); 434 } 435 } 436 } 437 438 /* 439 * now we add the missing elements. We purposefully do not 440 * follow the interpretation in DerivedCoreProperties.txt for 441 * the @missing 'InCB; None'. Missing, for us, means that 442 * _no_ properties have been extracted and thus property_count 443 * is zero, not that some first field is not set. Absolute 444 * madness to even publish data like this... 445 */ 446 for (cp = 0; cp < UINT32_C(0x110000); cp++) { 447 if (cpp[cp].property_count == 0) { 448 /* swap the whole lot */ 449 cpp[cp].properties = missing[cp].properties; 450 cpp[cp].property_count = missing[cp].property_count; 451 missing[cp].properties = NULL; 452 missing[cp].property_count = 0; 453 } 454 } 455 456 /* close file */ 457 if (fclose(fp)) { 458 fprintf(stderr, "parse_property_file: fclose '%s': %s.\n", 459 fname, strerror(errno)); 460 exit(1); 461 } 462 463 /* cleanup */ 464 free_codepoint_property_set_array(missing); 465 free(line); 466 free(field); 467 468 /* return codepoint properties array */ 469 return cpp; 470 } 471 472 struct compression_output { 473 uint_least64_t *major; 474 uint_least64_t *minor; 475 size_t block_size_shift; 476 size_t block_size; 477 size_t major_size; 478 size_t minor_size; 479 size_t major_entry_size; 480 size_t minor_entry_size; 481 size_t total_size; 482 }; 483 484 static void 485 compress_array(const uint_least64_t *array, size_t block_size_shift, 486 struct compression_output *co) 487 { 488 size_t i, j, major_maximum, minor_maximum; 489 490 /* set some preliminary data in the output struct */ 491 co->block_size_shift = block_size_shift; 492 co->block_size = ((size_t)1) << block_size_shift; 493 co->major_size = UINT32_C(0x110000) / co->block_size; 494 495 /* allocate/initialise */ 496 if (!(co->major = malloc(co->major_size * sizeof(*(co->major))))) { 497 fprintf(stderr, "malloc: %s\n", strerror(errno)); 498 exit(1); 499 } 500 co->minor = NULL; 501 co->minor_size = 0; 502 503 /* iterate over all blocks in the array */ 504 for (i = 0; i < co->major_size; i++) { 505 size_t block_offset = i * co->block_size; 506 507 /* 508 * iterate over all possible matching block starting 509 * positions in the minor array 510 */ 511 for (j = 0; j + co->block_size < co->minor_size; j++) { 512 /* 513 * check if our current block matches the 514 * candidate block in the minor array 515 */ 516 if (memcmp(array + block_offset, 517 co->minor + j, 518 co->block_size * sizeof(*array)) == 0) { 519 /* 520 * we have a match, so we don't have to 521 * store this block explicitly and just 522 * point to the offset in minor 523 */ 524 co->major[i] = j; 525 break; 526 } 527 } 528 if (j + co->block_size >= co->minor_size) { 529 /* 530 * we found no matching subblock in minor. Add it 531 * to the minor array. The index to the first 532 * element we add now is exactly the size 533 * of the minor array. 534 */ 535 co->major[i] = co->minor_size; 536 co->minor_size += co->block_size; 537 if (!(co->minor = realloc(co->minor, co->minor_size * 538 sizeof(*(co->minor))))) { 539 fprintf(stderr, "malloc: %s\n", 540 strerror(errno)); 541 exit(1); 542 } 543 memcpy(co->minor + co->minor_size - co->block_size, 544 array + block_offset, 545 co->block_size * sizeof(*array)); 546 } 547 } 548 549 /* the compression is done. Now we derive some metadata */ 550 551 /* determine maxima */ 552 for (i = 0, major_maximum = 0; i < co->major_size; i++) { 553 if (co->major[i] > major_maximum) { 554 major_maximum = co->major[i]; 555 } 556 } 557 for (i = 0, minor_maximum = 0; i < co->minor_size; i++) { 558 if (co->minor[i] > minor_maximum) { 559 minor_maximum = co->minor[i]; 560 } 561 } 562 563 /* determine entry sizes */ 564 if (major_maximum < UINT64_C(1) << 8) { 565 co->major_entry_size = 8; 566 } else if (major_maximum < UINT64_C(1) << 16) { 567 co->major_entry_size = 16; 568 } else if (major_maximum < UINT64_C(1) << 32) { 569 co->major_entry_size = 32; 570 } else { 571 co->major_entry_size = 64; 572 } 573 574 if (minor_maximum < UINT64_C(1) << 4) { 575 /* using 4 bit packed nibbles */ 576 co->minor_entry_size = 4; 577 } else if (minor_maximum < UINT64_C(1) << 8) { 578 co->minor_entry_size = 8; 579 } else if (minor_maximum < UINT64_C(1) << 16) { 580 co->minor_entry_size = 16; 581 } else if (minor_maximum < UINT64_C(1) << 32) { 582 co->minor_entry_size = 32; 583 } else { 584 co->minor_entry_size = 64; 585 } 586 587 /* total data size in bytes */ 588 co->total_size = ((co->major_size * co->major_entry_size) + 589 (co->minor_size * co->minor_entry_size)) / 8; 590 } 591 592 void 593 free_compressed_data(struct compression_output *co) 594 { 595 free(co->major); 596 free(co->minor); 597 memset(co, 0, sizeof(*co)); 598 } 599 600 void 601 print_array(const uint_least64_t *array, size_t array_size, 602 size_t array_entry_size, const char *prefix, 603 const char *name) 604 { 605 size_t i; 606 607 if (array_entry_size == 4) { 608 /* we store two 4-bit nibbles in one byte */ 609 if (array_size % 2 != 0) { 610 /* ensure array lenght is even */ 611 fprintf(stderr, "print_array: 4-bit array " 612 "is not implemented for odd length."); 613 exit(1); 614 } 615 616 printf("static const uint_least8_t %s_%s[] = {", 617 prefix, name); 618 for (i = 0; i < array_size; i += 2) { 619 if ((i / 2) % 8 == 0) { 620 printf("\n\t"); 621 } else { 622 printf(" "); 623 } 624 625 /* write high and low nibbles */ 626 printf("%zu", (array[i] << 4) | array[i + 1]); 627 628 if (i + 2 < array_size) { 629 printf(","); 630 } 631 } 632 printf("\n};\n"); 633 } else { 634 printf("static const uint_least%zu_t %s_%s[] = {", 635 array_entry_size, prefix, name); 636 for (i = 0; i < array_size; i++) { 637 if (i % 8 == 0) { 638 printf("\n\t"); 639 } else { 640 printf(" "); 641 } 642 printf("%zu", array[i]); 643 if (i + 1 < array_size) { 644 printf(","); 645 } 646 } 647 printf("\n};\n"); 648 } 649 } 650 651 void 652 compress_and_output(uint_least64_t *array, const char *prefix) 653 { 654 struct compression_output co, co_best; 655 size_t i, j, dictionary_size, dictionary_entry_size; 656 uint_least64_t maximum = 0, *dictionary, *keys; 657 658 /* initialise dictionary */ 659 dictionary = NULL; 660 dictionary_size = 0; 661 662 /* initialise keys */ 663 if (!(keys = calloc(UINT32_C(0x110000), sizeof(*keys)))) { 664 fprintf(stderr, "calloc: %s\n", strerror(errno)); 665 exit(1); 666 } 667 668 for (i = 0; i < UINT32_C(0x110000); i++) { 669 /* maximum determination */ 670 if (array[i] > maximum) { 671 maximum = array[i]; 672 } 673 674 /* check if the array value is already in the dictionary */ 675 for (j = 0; j < dictionary_size; j++) { 676 if (array[i] == dictionary[j]) { 677 break; 678 } 679 } 680 if (j == dictionary_size) { 681 /* not in the dictionary, insert the array value */ 682 if (!(dictionary = realloc(dictionary, (++dictionary_size) * 683 sizeof(*dictionary)))) { 684 fprintf(stderr, "realloc: %s\n", strerror(errno)); 685 exit(1); 686 } 687 dictionary[dictionary_size - 1] = array[i]; 688 } 689 690 /* set the index (key) of the matched dictionary entry */ 691 keys[i] = j; 692 } 693 694 /* check if dictionary size is below a reasonable threshold */ 695 if (dictionary_size > 256) { 696 fprintf(stderr, "compress_and_output: dictionary too large\n"); 697 exit(1); 698 } 699 700 /* 701 * compress keys array with different block size shifts 702 * (block sizes from 16=1<<4 to 32768=1<<15) 703 */ 704 memset(&co_best, 0, sizeof(co_best)); 705 co_best.total_size = SIZE_MAX; 706 for (i = 15; i >= 4; i--) { 707 compress_array(keys, i, &co); 708 709 fprintf(stderr, "compress_and_output: " 710 "block size %zu -> data size %.1f kB (%zu,%zu)\n", 711 ((size_t)1) << i, (double)co.total_size / 1000, co.major_entry_size, co.minor_entry_size); 712 713 if (co.total_size < co_best.total_size) { 714 /* we have a new best compression */ 715 free_compressed_data(&co_best); 716 co_best = co; 717 } else { 718 /* not optimal, discard it */ 719 free_compressed_data(&co); 720 } 721 } 722 fprintf(stderr, "compress_and_output: choosing optimal block size %zu (%zu,%zu)\n", 723 co_best.block_size, co_best.major_entry_size, co_best.minor_entry_size); 724 725 /* output header */ 726 printf("/* Automatically generated by gen2/%s */\n" 727 "#include <stdint.h>\n\n" 728 "#include \"character.h\"\n\n", prefix); 729 730 /* output dictionary */ 731 if (maximum < UINT64_C(1) << 8) { 732 dictionary_entry_size = 8; 733 } else if (maximum < UINT64_C(1) << 16) { 734 dictionary_entry_size = 16; 735 } else if (maximum < UINT64_C(1) << 32) { 736 dictionary_entry_size = 32; 737 } else { 738 dictionary_entry_size = 64; 739 } 740 741 print_array(dictionary, dictionary_size, dictionary_entry_size, 742 prefix, "dictionary"); 743 printf("\n"); 744 745 /* output major array */ 746 print_array(co_best.major, co_best.major_size, 747 co_best.major_entry_size, prefix, "major"); 748 printf("\n"); 749 750 /* output minor array */ 751 print_array(co_best.minor, co_best.minor_size, 752 co_best.minor_entry_size, prefix, "minor"); 753 printf("\n"); 754 755 /* output accessor function (major is never 4-bits per entry) */ 756 printf("static inline uint_least%zu_t\n" 757 "get_%s_property(uint_least32_t cp)\n" 758 "{\n" 759 "\tuint_least%zu_t minor_offset = %s_major[cp >> %zu]\n" 760 "\t\t+ (cp & ((UINT32_C(1) << %zu) - 1));\n", 761 dictionary_entry_size, prefix, co_best.major_entry_size, 762 prefix, co_best.block_size_shift, co_best.block_size_shift); 763 if (co_best.minor_entry_size == 4) { 764 printf("\tuint_least8_t packed_value = %s_minor[minor_offset / 2];\n\n" 765 "\tif (minor_offset & UINT8_C(1) == 0) {\n" 766 "\t\t/* high nibble */\n" 767 "\t\treturn %s_dictionary[packed_value >> 4];\n" 768 "\t} else {\n" 769 "\t\t/* low nibble */\n" 770 "\t\treturn %s_dictionary[packed_value & UINT8_C(0x0f)];\n" 771 "\t}\n", 772 prefix, prefix, prefix); 773 } else { 774 printf("\n\treturn %s_dictionary[%s_minor[minor_offset]];\n", 775 prefix, prefix); 776 } 777 printf("}\n"); 778 779 /* cleanup */ 780 free(dictionary); 781 free(keys); 782 }