libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

util.c (19744B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <ctype.h>
      3 #include <errno.h>
      4 #include <inttypes.h>
      5 #include <stdbool.h>
      6 #include <stddef.h>
      7 #include <stdint.h>
      8 #include <stdio.h>
      9 #include <stdlib.h>
     10 #include <string.h>
     11 
     12 #include "util.h"
     13 
     14 struct range {
     15 	uint_least32_t lower;
     16 	uint_least32_t upper;
     17 };
     18 
     19 static int
     20 hextocp(const char *str, size_t len, uint_least32_t *cp)
     21 {
     22 	size_t i;
     23 	int off;
     24 	char relative;
     25 
     26 	/* the maximum valid codepoint is 0x10FFFF */
     27 	if (len > 6) {
     28 		fprintf(stderr, "hextocp: '%.*s' is too long.\n", (int)len,
     29 		        str);
     30 		return 1;
     31 	}
     32 
     33 	for (i = 0, *cp = 0; i < len; i++) {
     34 		if (str[i] >= '0' && str[i] <= '9') {
     35 			relative = '0';
     36 			off = 0;
     37 		} else if (str[i] >= 'a' && str[i] <= 'f') {
     38 			relative = 'a';
     39 			off = 10;
     40 		} else if (str[i] >= 'A' && str[i] <= 'F') {
     41 			relative = 'A';
     42 			off = 10;
     43 		} else {
     44 			fprintf(stderr, "hextocp: '%.*s' is not hexadecimal.\n",
     45 			        (int)len, str);
     46 			return 1;
     47 		}
     48 
     49 		*cp += ((uint_least32_t)1 << (4 * (len - i - 1))) *
     50 		       (uint_least32_t)(str[i] - relative + off);
     51 	}
     52 
     53 	if (*cp > UINT32_C(0x10FFFF)) {
     54 		fprintf(stderr, "hextocp: '%.*s' is too large.\n", (int)len,
     55 		        str);
     56 		return 1;
     57 	}
     58 
     59 	return 0;
     60 }
     61 
     62 int
     63 parse_range(const char *str, struct range *range)
     64 {
     65 	char *p;
     66 
     67 	if ((p = strstr(str, "..")) == NULL) {
     68 		/* input has the form "XXXXXX" */
     69 		if (hextocp(str, strlen(str), &range->lower)) {
     70 			return 1;
     71 		}
     72 		range->upper = range->lower;
     73 	} else {
     74 		/* input has the form "XXXXXX..XXXXXX" */
     75 		if (hextocp(str, (size_t)(p - str), &range->lower) ||
     76 		    hextocp(p + 2, strlen(p + 2), &range->upper)) {
     77 			return 1;
     78 		}
     79 	}
     80 
     81 	return 0;
     82 }
     83 
     84 static bool
     85 get_line(char **buf, size_t *bufsize, FILE *fp, size_t *len)
     86 {
     87 	int ret = EOF;
     88 
     89 	for (*len = 0;; (*len)++) {
     90 		if (*len > 0 && *buf != NULL && (*buf)[*len - 1] == '\n') {
     91 			/*
     92 			 * if the previously read character was a newline,
     93 			 * we fake an end-of-file so we NUL-terminate and
     94 			 * are done.
     95 			 */
     96 			ret = EOF;
     97 		} else {
     98 			ret = fgetc(fp);
     99 		}
    100 
    101 		if (*len >= *bufsize) {
    102 			/* the buffer needs to be expanded */
    103 			*bufsize += 512;
    104 			if ((*buf = realloc(*buf, *bufsize)) == NULL) {
    105 				fprintf(stderr, "get_line: Out of memory.\n");
    106 				exit(1);
    107 			}
    108 		}
    109 
    110 		if (ret != EOF) {
    111 			(*buf)[*len] = (char)ret;
    112 		} else {
    113 			(*buf)[*len] = '\0';
    114 			break;
    115 		}
    116 	}
    117 
    118 	return *len == 0 && (feof(fp) || ferror(fp));
    119 }
    120 
    121 static char *
    122 duplicate_string(const char *src)
    123 {
    124 	size_t len = strlen(src);
    125 	char *dest;
    126 
    127 	if (!(dest = malloc(len + 1))) {
    128 		fprintf(stderr, "malloc: %s\n", strerror(errno));
    129 		exit(1);
    130 	}
    131 
    132 	memcpy(dest, src, len);
    133 	dest[len] = '\0';
    134 
    135 	return dest;
    136 }
    137 
    138 void
    139 duplicate_codepoint_property(const struct codepoint_property *src,
    140 	struct codepoint_property *dest)
    141 {
    142 	size_t i;
    143 
    144 	/* copy the field count */
    145 	dest->field_count = src->field_count;
    146 
    147 	/* allocate the field array */
    148 	if (!(dest->fields = calloc(dest->field_count, sizeof(*(dest->fields))))) {
    149 		fprintf(stderr, "malloc: %s\n", strerror(errno));
    150 		exit(1);
    151 	}
    152 
    153 	for (i = 0; i < dest->field_count; i++) {
    154 		dest->fields[i] = duplicate_string(src->fields[i]);
    155 	}
    156 }
    157 
    158 static void
    159 free_codepoint_property(struct codepoint_property *p)
    160 {
    161 	size_t i;
    162 
    163 	for (i = 0; i < p->field_count; i++) {
    164 		free(p->fields[i]);
    165 	}
    166 	free(p->fields);
    167 }
    168 
    169 static void
    170 free_codepoint_property_set(struct codepoint_property_set *p)
    171 {
    172 	size_t i;
    173 
    174 	for (i = 0; i < p->property_count; i++) {
    175 		free_codepoint_property(&(p->properties[i]));
    176 	}
    177 	free(p->properties);
    178 }
    179 
    180 void
    181 free_codepoint_property_set_array(struct codepoint_property_set *p)
    182 {
    183 	uint_least32_t cp;
    184 
    185 	for (cp = 0; cp < UINT32_C(0x110000); cp++) {
    186 		free_codepoint_property_set(&(p[cp]));
    187 	}
    188 	free(p);
    189 }
    190 
    191 const struct codepoint_property *
    192 match_in_codepoint_property_set(const struct codepoint_property_set *p,
    193 	const char *str, size_t field_offset)
    194 {
    195 	size_t i;
    196 
    197 	for (i = 0; i < p->property_count; i++) {
    198 		/* there must be enough fields to reach the offset */
    199 		if (field_offset >= p->properties[i].field_count) {
    200 			continue;
    201 		}
    202 
    203 		/* check for a match */
    204 		if (strcmp(p->properties[i].fields[field_offset], str) == 0) {
    205 			return &(p->properties[i]);
    206 		}
    207 	}
    208 	
    209 	return NULL;
    210 }
    211 
    212 struct codepoint_property_set *
    213 parse_property_file(const char *fname)
    214 {
    215 	FILE *fp;
    216 	struct codepoint_property p;
    217 	struct codepoint_property_set *cpp, *missing;
    218 	struct range range;
    219 	char *line = NULL, **field = NULL, *comment;
    220 	uint_least32_t cp;
    221 	size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields, len;
    222 	bool is_missing;
    223 
    224 	/*
    225 	 * allocate cpp buffer of length 0x11000, initialised to zeros
    226 	 * (NULL 'properties' pointer, zero property_count)
    227 	 */
    228 	if (!(cpp = calloc((size_t)UINT32_C(0x110000), sizeof(*cpp)))) {
    229 		fprintf(stderr, "calloc: %s\n", strerror(errno));
    230 		exit(1);
    231 	}
    232 
    233 	/*
    234 	 * allocate same-sized array for 'missing' fields. The data files
    235 	 * have this strange notion of defining some properties in bloody
    236 	 * comments, and in a way that says 'yeah, if we did not define
    237 	 * something for some, then replace it with this value'. However,
    238 	 * it complicates things, as multiple, overlapping @missing
    239 	 * can be defined in a single file. One can deduce that subsequent
    240 	 * @missing just overwrite each other, but there's no way to
    241 	 * track which properties have not been set without running
    242 	 * through the file multiple times, which we want to avoid, so
    243 	 * we accumulate the (self-overwriting) @missing set separately
    244 	 * and fill it in at the end.
    245 	 */
    246 	if (!(missing = calloc((size_t)UINT32_C(0x110000), sizeof(*missing)))) {
    247 		fprintf(stderr, "calloc: %s\n", strerror(errno));
    248 		exit(1);
    249 	}
    250 
    251 	/* open the property file */
    252 	if (!(fp = fopen(fname, "r"))) {
    253 		fprintf(stderr, "parse_property_file: fopen '%s': %s.\n",
    254 		        fname, strerror(errno));
    255 		exit(1);
    256 	}
    257 
    258 	while (!get_line(&line, &linebufsize, fp, &len)) {
    259 		/* remove trailing newline */
    260 		if (len > 0 && line[len - 1] == '\n') {
    261 			line[len - 1] = '\0';
    262 			len--;
    263 		}
    264 
    265 		/* skip empty lines */
    266 		if (len == 0) {
    267 			continue;
    268 		}
    269 
    270 		/* comment, check if it's a @missing */
    271 		is_missing = false;
    272 		if (line[0] == '#') {
    273 			if (strncmp(line + 1, " @missing: ", LEN(" @missing: ") - 1) == 0) {
    274 				/*
    275 				 * this comment specifies a 'missing' line.
    276 				 * We shift it to the left and treat it
    277 				 * like any other line, differentiating it
    278 				 * with the 'is_missing' flag.
    279 				 */
    280 				size_t offset = 1 + (LEN(" @missing: ") - 1);
    281 
    282 				memmove(line, line + offset, len - offset + 1);
    283 				len -= offset;
    284 				is_missing = true;
    285 			} else {
    286 				/* skip unrelated comment line */
    287 				continue;
    288 			}
    289 		}
    290 
    291 		/* tokenize line into fields */
    292 		for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) {
    293 			/* skip leading whitespace */
    294 			while (line[i] == ' ') {
    295 				i++;
    296 			}
    297 
    298 			/* check if we crashed into the comment */
    299 			if (line[i] != '#') {
    300 				/* extend field buffer, if necessary */
    301 				if (++nfields > fieldbufsize) {
    302 					if ((field = realloc(
    303 						     field,
    304 						     nfields *
    305 							     sizeof(*field))) ==
    306 					    NULL) {
    307 						fprintf(stderr,
    308 						        "parse_property_"
    309 						        "file: realloc: "
    310 						        "%s.\n",
    311 						        strerror(errno));
    312 						exit(1);
    313 					}
    314 					fieldbufsize = nfields;
    315 				}
    316 
    317 				/* set current position as field start */
    318 				field[nfields - 1] = &line[i];
    319 
    320 				/* continue until we reach ';' or '#' or end */
    321 				while (line[i] != ';' && line[i] != '#' &&
    322 				       line[i] != '\0') {
    323 					i++;
    324 				}
    325 			}
    326 
    327 			if (line[i] == '#') {
    328 				/* set comment-variable for later */
    329 				comment = &line[i + 1];
    330 			}
    331 
    332 			/* go back whitespace and terminate field there */
    333 			if (i > 0) {
    334 				for (j = i - 1; line[j] == ' '; j--) {
    335 					;
    336 				}
    337 				line[j + 1] = '\0';
    338 			} else {
    339 				line[i] = '\0';
    340 			}
    341 
    342 			/* if comment is set, we are done */
    343 			if (comment != NULL) {
    344 				break;
    345 			}
    346 		}
    347 
    348 		/* skip leading whitespace in comment */
    349 		while (comment != NULL && comment[0] == ' ') {
    350 			comment++;
    351 		}
    352 
    353 		/*
    354 		 * We now have an array 'field' with 'nfields'. We can
    355 		 * follow from the file format that, if nfields > 0,
    356 		 * field[0] specifies a codepoint or range of
    357 		 * codepoints, which we parse as the first step.
    358 		 */
    359 		if (nfields < 2) {
    360 			/*
    361 			 * either there is only a range or no range at
    362 			 * all. Report this
    363 			 */
    364 			fprintf(stderr, "parse_property_file: malformed "
    365 			        "line with either no range or no entry\n");
    366 			exit(1);
    367 		}
    368 
    369 		if (parse_range(field[0], &range)) {
    370 			fprintf(stderr, "parse_property_file: parse_range of "
    371 			        "'%s' failed.\n", field[0]);
    372 			exit(1);
    373 		}
    374 
    375 		/*
    376 		 * fill a codepoint_property from the remaining
    377 		 * fields (no allocations on a stack-allocated struct
    378 		 */
    379 		p.fields = field + 1;
    380 		p.field_count = nfields - 1;
    381 
    382 		/*
    383 		 * add a duplicate of the filled codepoint_property
    384 		 * to all codepoints in the range (i.e. the cpp or
    385 		 * missing array, depending on is_missing)
    386 		 */
    387 		for (cp = range.lower; cp <= range.upper; cp++) {
    388 			if (is_missing) {
    389 				/*
    390 				 * we overwrite the set at the codepoint,
    391 				 * as the @missing properties overwrite
    392 				 * each other (bad design)
    393 				 */
    394 				if (missing[cp].property_count == 0) {
    395 					/*
    396 					 * set is still empty, add space
    397 					 * for one pointer to a
    398 					 * codepoint_property
    399 					 */
    400 					if (!(missing[cp].properties =
    401 					      malloc(sizeof(*(missing[cp].properties))))) {
    402 						fprintf(stderr, "malloc: %s\n",
    403 						        strerror(errno));
    404 						exit(1);
    405 					}
    406 					missing[cp].property_count = 1;
    407 				} else if (missing[cp].property_count == 1) {
    408 					/* free the old property */
    409 					free_codepoint_property(
    410 						&(missing[cp].properties[0]));
    411 				} else {
    412 					/* this shouldn't happen */
    413 					fprintf(stderr, "parse_property_file: "
    414 						"malformed missing set\n");
    415 					exit(1);
    416 				}
    417 
    418 				/* copy in the new one */
    419 				duplicate_codepoint_property(
    420 					&p, &(missing[cp].properties[0]));
    421 			} else {
    422 				/* expand the set by one element */
    423 				if (!(cpp[cp].properties = realloc(
    424 					cpp[cp].properties,
    425 					sizeof(*(cpp[cp].properties)) *
    426 					(++cpp[cp].property_count)))) {
    427 					fprintf(stderr, "malloc: %s\n",
    428 					        strerror(errno));
    429 					exit(1);
    430 				}
    431 
    432 				duplicate_codepoint_property(&p,
    433 					&(cpp[cp].properties[cpp[cp].property_count - 1]));
    434 			}
    435 		}
    436 	}
    437 
    438 	/*
    439 	 * now we add the missing elements. We purposefully do not
    440 	 * follow the interpretation in DerivedCoreProperties.txt for
    441 	 * the @missing 'InCB; None'. Missing, for us, means that
    442 	 * _no_ properties have been extracted and thus property_count
    443 	 * is zero, not that some first field is not set. Absolute
    444 	 * madness to even publish data like this...
    445 	 */
    446 	for (cp = 0; cp < UINT32_C(0x110000); cp++) {
    447 		if (cpp[cp].property_count == 0) {
    448 			/* swap the whole lot */
    449 			cpp[cp].properties = missing[cp].properties;
    450 			cpp[cp].property_count = missing[cp].property_count;
    451 			missing[cp].properties = NULL;
    452 			missing[cp].property_count = 0;
    453 		}
    454 	}
    455 
    456 	/* close file */
    457 	if (fclose(fp)) {
    458 		fprintf(stderr, "parse_property_file: fclose '%s': %s.\n",
    459 		        fname, strerror(errno));
    460 		exit(1);
    461 	}
    462 
    463 	/* cleanup */
    464 	free_codepoint_property_set_array(missing);
    465 	free(line);
    466 	free(field);
    467 
    468 	/* return codepoint properties array */
    469 	return cpp;
    470 }
    471 
    472 struct compression_output {
    473 	uint_least64_t *major;
    474 	uint_least64_t *minor;
    475 	size_t block_size_shift;
    476 	size_t block_size;
    477 	size_t major_size;
    478 	size_t minor_size;
    479 	size_t major_entry_size;
    480 	size_t minor_entry_size;
    481 	size_t total_size;
    482 };
    483 
    484 static void
    485 compress_array(const uint_least64_t *array, size_t block_size_shift,
    486                struct compression_output *co)
    487 {
    488 	size_t i, j, major_maximum, minor_maximum;
    489 
    490 	/* set some preliminary data in the output struct */
    491 	co->block_size_shift = block_size_shift;
    492 	co->block_size = ((size_t)1) << block_size_shift;
    493 	co->major_size = UINT32_C(0x110000) / co->block_size;
    494 
    495 	/* allocate/initialise */
    496 	if (!(co->major = malloc(co->major_size * sizeof(*(co->major))))) {
    497 		fprintf(stderr, "malloc: %s\n", strerror(errno));
    498 		exit(1);
    499 	}
    500 	co->minor = NULL;
    501 	co->minor_size = 0;
    502 
    503 	/* iterate over all blocks in the array */
    504 	for (i = 0; i < co->major_size; i++) {
    505 		size_t block_offset = i * co->block_size;
    506 
    507 		/*
    508 		 * iterate over all possible matching block starting
    509 		 * positions in the minor array
    510 		 */
    511 		for (j = 0; j + co->block_size < co->minor_size; j++) {
    512 			/*
    513 			 * check if our current block matches the
    514 			 * candidate block in the minor array
    515 			 */
    516 			if (memcmp(array + block_offset,
    517 			           co->minor + j,
    518 			           co->block_size * sizeof(*array)) == 0) {
    519 				/*
    520 				 * we have a match, so we don't have to
    521 				 * store this block explicitly and just
    522 				 * point to the offset in minor
    523 				 */
    524 				co->major[i] = j;
    525 				break;
    526 			}
    527 		}
    528 		if (j + co->block_size >= co->minor_size) {
    529 			/*
    530 			 * we found no matching subblock in minor. Add it
    531 			 * to the minor array. The index to the first
    532 			 * element we add now is exactly the size
    533 			 * of the minor array.
    534 			 */
    535 			co->major[i] = co->minor_size;
    536 			co->minor_size += co->block_size;
    537 			if (!(co->minor = realloc(co->minor, co->minor_size *
    538 			                          sizeof(*(co->minor))))) {
    539 				fprintf(stderr, "malloc: %s\n",
    540 				        strerror(errno));
    541 				exit(1);
    542 			}
    543 			memcpy(co->minor + co->minor_size - co->block_size,
    544 			       array + block_offset,
    545 			       co->block_size * sizeof(*array));
    546 		}
    547 	}
    548 
    549 	/* the compression is done. Now we derive some metadata */
    550 
    551 	/* determine maxima */
    552 	for (i = 0, major_maximum = 0; i < co->major_size; i++) {
    553 		if (co->major[i] > major_maximum) {
    554 			major_maximum = co->major[i];
    555 		}
    556 	}
    557 	for (i = 0, minor_maximum = 0; i < co->minor_size; i++) {
    558 		if (co->minor[i] > minor_maximum) {
    559 			minor_maximum = co->minor[i];
    560 		}
    561 	}
    562 
    563 	/* determine entry sizes */	
    564 	if (major_maximum < UINT64_C(1) << 8) {
    565 		co->major_entry_size = 8;
    566 	} else if (major_maximum < UINT64_C(1) << 16) {
    567 		co->major_entry_size = 16;
    568 	} else if (major_maximum < UINT64_C(1) << 32) {
    569 		co->major_entry_size = 32;
    570 	} else {
    571 		co->major_entry_size = 64;
    572 	}
    573 
    574 	if (minor_maximum < UINT64_C(1) << 4) {
    575 		/* using 4 bit packed nibbles */
    576 		co->minor_entry_size = 4;
    577 	} else if (minor_maximum < UINT64_C(1) << 8) {
    578 		co->minor_entry_size = 8;
    579 	} else if (minor_maximum < UINT64_C(1) << 16) {
    580 		co->minor_entry_size = 16;
    581 	} else if (minor_maximum < UINT64_C(1) << 32) {
    582 		co->minor_entry_size = 32;
    583 	} else {
    584 		co->minor_entry_size = 64;
    585 	}
    586 
    587 	/* total data size in bytes */
    588 	co->total_size = ((co->major_size * co->major_entry_size) +
    589 	                  (co->minor_size * co->minor_entry_size)) / 8;
    590 }
    591 
    592 void
    593 free_compressed_data(struct compression_output *co)
    594 {
    595 	free(co->major);
    596 	free(co->minor);
    597 	memset(co, 0, sizeof(*co));
    598 }
    599 
    600 void
    601 print_array(const uint_least64_t *array, size_t array_size,
    602             size_t array_entry_size, const char *prefix,
    603 	    const char *name)
    604 {
    605 	size_t i;
    606 
    607 	if (array_entry_size == 4) {
    608 		/* we store two 4-bit nibbles in one byte */
    609 		if (array_size % 2 != 0) {
    610 			/* ensure array lenght is even */
    611 			fprintf(stderr, "print_array: 4-bit array "
    612 			        "is not implemented for odd length.");
    613 			exit(1);
    614 		}
    615 
    616 		printf("static const uint_least8_t %s_%s[] = {",
    617 		       prefix, name);
    618 		for (i = 0; i < array_size; i += 2) {
    619 			if ((i / 2) % 8 == 0) {
    620 				printf("\n\t");
    621 			} else {
    622 				printf(" ");
    623 			}
    624 
    625 			/* write high and low nibbles */
    626 			printf("%zu", (array[i] << 4) | array[i + 1]);
    627 
    628 			if (i + 2 < array_size) {
    629 				printf(",");
    630 			}
    631 		}
    632 		printf("\n};\n");
    633 	} else {
    634 		printf("static const uint_least%zu_t %s_%s[] = {",
    635 		       array_entry_size, prefix, name);
    636 		for (i = 0; i < array_size; i++) {
    637 			if (i % 8 == 0) {
    638 				printf("\n\t");
    639 			} else {
    640 				printf(" ");
    641 			}
    642 			printf("%zu", array[i]);
    643 			if (i + 1 < array_size) {
    644 				printf(",");
    645 			}
    646 		}
    647 		printf("\n};\n");
    648 	}
    649 }
    650 
    651 void
    652 compress_and_output(uint_least64_t *array, const char *prefix)
    653 {
    654 	struct compression_output co, co_best;
    655 	size_t i, j, dictionary_size, dictionary_entry_size;
    656 	uint_least64_t maximum = 0, *dictionary, *keys;
    657 
    658 	/* initialise dictionary */
    659 	dictionary = NULL;
    660 	dictionary_size = 0;
    661 
    662 	/* initialise keys */
    663 	if (!(keys = calloc(UINT32_C(0x110000), sizeof(*keys)))) {
    664 		fprintf(stderr, "calloc: %s\n", strerror(errno));
    665 		exit(1);	
    666 	}
    667 
    668 	for (i = 0; i < UINT32_C(0x110000); i++) {
    669 		/* maximum determination */
    670 		if (array[i] > maximum) {
    671 			maximum = array[i];
    672 		}
    673 
    674 		/* check if the array value is already in the dictionary */
    675 		for (j = 0; j < dictionary_size; j++) {
    676 			if (array[i] == dictionary[j]) {
    677 				break;
    678 			}
    679 		}
    680 		if (j == dictionary_size) {
    681 			/* not in the dictionary, insert the array value */
    682 			if (!(dictionary = realloc(dictionary, (++dictionary_size) *
    683 				sizeof(*dictionary)))) {
    684 				fprintf(stderr, "realloc: %s\n", strerror(errno));
    685 				exit(1);	
    686 			}
    687 			dictionary[dictionary_size - 1] = array[i];
    688 		}
    689 
    690 		/* set the index (key) of the matched dictionary entry */
    691 		keys[i] = j;
    692 	}
    693 
    694 	/* check if dictionary size is below a reasonable threshold */
    695 	if (dictionary_size > 256) {
    696 		fprintf(stderr, "compress_and_output: dictionary too large\n");
    697 		exit(1);	
    698 	}
    699 
    700 	/*
    701 	 * compress keys array with different block size shifts
    702 	 * (block sizes from 16=1<<4 to 32768=1<<15)
    703 	 */
    704 	memset(&co_best, 0, sizeof(co_best));
    705 	co_best.total_size = SIZE_MAX;
    706 	for (i = 15; i >= 4; i--) {
    707 		compress_array(keys, i, &co);
    708 
    709 		fprintf(stderr, "compress_and_output: "
    710 		        "block size %zu -> data size %.1f kB (%zu,%zu)\n",
    711 		        ((size_t)1) << i, (double)co.total_size / 1000, co.major_entry_size, co.minor_entry_size);
    712 
    713 		if (co.total_size < co_best.total_size) {
    714 			/* we have a new best compression */
    715 			free_compressed_data(&co_best);
    716 			co_best = co;
    717 		} else {
    718 			/* not optimal, discard it */
    719 			free_compressed_data(&co);
    720 		}
    721 	}
    722 	fprintf(stderr, "compress_and_output: choosing optimal block size %zu (%zu,%zu)\n",
    723 	        co_best.block_size, co_best.major_entry_size, co_best.minor_entry_size);
    724 
    725 	/* output header */
    726 	printf("/* Automatically generated by gen2/%s */\n"
    727 	       "#include <stdint.h>\n\n"
    728 	       "#include \"character.h\"\n\n", prefix);
    729 
    730 	/* output dictionary */
    731 	if (maximum < UINT64_C(1) << 8) {
    732 		dictionary_entry_size = 8;
    733 	} else if (maximum < UINT64_C(1) << 16) {
    734 		dictionary_entry_size = 16;
    735 	} else if (maximum < UINT64_C(1) << 32) {
    736 		dictionary_entry_size = 32;
    737 	} else {
    738 		dictionary_entry_size = 64;
    739 	}
    740 
    741 	print_array(dictionary, dictionary_size, dictionary_entry_size,
    742 	            prefix, "dictionary");
    743 	printf("\n");
    744 
    745 	/* output major array */
    746 	print_array(co_best.major, co_best.major_size,
    747 	            co_best.major_entry_size, prefix, "major");
    748 	printf("\n");
    749 
    750 	/* output minor array */
    751 	print_array(co_best.minor, co_best.minor_size,
    752 	            co_best.minor_entry_size, prefix, "minor");
    753 	printf("\n");
    754 
    755 	/* output accessor function (major is never 4-bits per entry) */
    756 	printf("static inline uint_least%zu_t\n"
    757 	       "get_%s_property(uint_least32_t cp)\n"
    758 	       "{\n"
    759 	       "\tuint_least%zu_t minor_offset = %s_major[cp >> %zu]\n"
    760 	       "\t\t+ (cp & ((UINT32_C(1) << %zu) - 1));\n",
    761 	       dictionary_entry_size, prefix, co_best.major_entry_size,
    762 	       prefix, co_best.block_size_shift, co_best.block_size_shift);
    763 	if (co_best.minor_entry_size == 4) {
    764 		printf("\tuint_least8_t packed_value = %s_minor[minor_offset / 2];\n\n"
    765 		       "\tif (minor_offset & UINT8_C(1) == 0) {\n"
    766 		       "\t\t/* high nibble */\n"
    767 		       "\t\treturn %s_dictionary[packed_value >> 4];\n"
    768 		       "\t} else {\n"
    769 		       "\t\t/* low nibble */\n"
    770 		       "\t\treturn %s_dictionary[packed_value & UINT8_C(0x0f)];\n"
    771 		       "\t}\n",
    772 		       prefix, prefix, prefix);
    773 	} else {
    774 		printf("\n\treturn %s_dictionary[%s_minor[minor_offset]];\n",
    775 		       prefix, prefix);
    776 	}
    777 	printf("}\n");
    778 
    779 	/* cleanup */
    780 	free(dictionary);
    781 	free(keys);
    782 }