libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

util.c (21467B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <ctype.h>
      3 #include <errno.h>
      4 #include <inttypes.h>
      5 #include <stdbool.h>
      6 #include <stddef.h>
      7 #include <stdint.h>
      8 #include <stdio.h>
      9 #include <stdlib.h>
     10 #include <string.h>
     11 
     12 #include "util.h"
     13 
     14 struct range {
     15 	uint_least32_t lower;
     16 	uint_least32_t upper;
     17 };
     18 
     19 struct properties_payload {
     20 	struct properties *prop;
     21 	const struct property_spec *spec;
     22 	uint_least8_t speclen;
     23 	int (*set_value)(struct properties_payload *, uint_least32_t,
     24 	                 int_least64_t);
     25 	uint_least8_t (*handle_conflict)(uint_least32_t, uint_least8_t,
     26 	                                 uint_least8_t);
     27 };
     28 
     29 struct break_test_payload {
     30 	struct break_test **test;
     31 	size_t *testlen;
     32 };
     33 
     34 static void *
     35 reallocate_array(void *p, size_t len, size_t size)
     36 {
     37 	if (len > 0 && size > SIZE_MAX / len) {
     38 		errno = ENOMEM;
     39 		return NULL;
     40 	}
     41 
     42 	return realloc(p, len * size);
     43 }
     44 
     45 int
     46 hextocp(const char *str, size_t len, uint_least32_t *cp)
     47 {
     48 	size_t i;
     49 	int off;
     50 	char relative;
     51 
     52 	/* the maximum valid codepoint is 0x10FFFF */
     53 	if (len > 6) {
     54 		fprintf(stderr, "hextocp: '%.*s' is too long.\n", (int)len,
     55 		        str);
     56 		return 1;
     57 	}
     58 
     59 	for (i = 0, *cp = 0; i < len; i++) {
     60 		if (str[i] >= '0' && str[i] <= '9') {
     61 			relative = '0';
     62 			off = 0;
     63 		} else if (str[i] >= 'a' && str[i] <= 'f') {
     64 			relative = 'a';
     65 			off = 10;
     66 		} else if (str[i] >= 'A' && str[i] <= 'F') {
     67 			relative = 'A';
     68 			off = 10;
     69 		} else {
     70 			fprintf(stderr, "hextocp: '%.*s' is not hexadecimal.\n",
     71 			        (int)len, str);
     72 			return 1;
     73 		}
     74 
     75 		*cp += ((uint_least32_t)1 << (4 * (len - i - 1))) *
     76 		       (uint_least32_t)(str[i] - relative + off);
     77 	}
     78 
     79 	if (*cp > UINT32_C(0x10FFFF)) {
     80 		fprintf(stderr, "hextocp: '%.*s' is too large.\n", (int)len,
     81 		        str);
     82 		return 1;
     83 	}
     84 
     85 	return 0;
     86 }
     87 
     88 int
     89 parse_cp_list(const char *str, uint_least32_t **cp, size_t *cplen)
     90 {
     91 	size_t count, i;
     92 	const char *tmp1 = NULL, *tmp2 = NULL;
     93 
     94 	if (strlen(str) == 0) {
     95 		*cp = NULL;
     96 		*cplen = 0;
     97 		return 0;
     98 	}
     99 
    100 	/* count the number of spaces in the string and infer list length */
    101 	for (count = 1, tmp1 = str; (tmp2 = strchr(tmp1, ' ')) != NULL;
    102 	     count++, tmp1 = tmp2 + 1) {
    103 		;
    104 	}
    105 
    106 	/* allocate resources */
    107 	if (!(*cp = calloc((*cplen = count), sizeof(**cp)))) {
    108 		fprintf(stderr, "calloc: %s\n", strerror(errno));
    109 		exit(1);
    110 	}
    111 
    112 	/* go through the string again, parsing the numbers */
    113 	for (i = 0, tmp1 = tmp2 = str; tmp2 != NULL; i++) {
    114 		tmp2 = strchr(tmp1, ' ');
    115 		if (hextocp(tmp1, tmp2 ? (size_t)(tmp2 - tmp1) : strlen(tmp1),
    116 		            &((*cp)[i]))) {
    117 			return 1;
    118 		}
    119 		if (tmp2 != NULL) {
    120 			tmp1 = tmp2 + 1;
    121 		}
    122 	}
    123 
    124 	return 0;
    125 }
    126 
    127 static int
    128 range_parse(const char *str, struct range *range)
    129 {
    130 	char *p;
    131 
    132 	if ((p = strstr(str, "..")) == NULL) {
    133 		/* input has the form "XXXXXX" */
    134 		if (hextocp(str, strlen(str), &range->lower)) {
    135 			return 1;
    136 		}
    137 		range->upper = range->lower;
    138 	} else {
    139 		/* input has the form "XXXXXX..XXXXXX" */
    140 		if (hextocp(str, (size_t)(p - str), &range->lower) ||
    141 		    hextocp(p + 2, strlen(p + 2), &range->upper)) {
    142 			return 1;
    143 		}
    144 	}
    145 
    146 	return 0;
    147 }
    148 
    149 static bool
    150 get_line(char **buf, size_t *bufsize, FILE *fp, size_t *len)
    151 {
    152 	int ret = EOF;
    153 
    154 	for (*len = 0;; (*len)++) {
    155 		if (*len > 0 && *buf != NULL && (*buf)[*len - 1] == '\n') {
    156 			/*
    157 			 * if the previously read character was a newline,
    158 			 * we fake an end-of-file so we NUL-terminate and
    159 			 * are done.
    160 			 */
    161 			ret = EOF;
    162 		} else {
    163 			ret = fgetc(fp);
    164 		}
    165 
    166 		if (*len >= *bufsize) {
    167 			/* the buffer needs to be expanded */
    168 			*bufsize += 512;
    169 			if ((*buf = realloc(*buf, *bufsize)) == NULL) {
    170 				fprintf(stderr, "get_line: Out of memory.\n");
    171 				exit(1);
    172 			}
    173 		}
    174 
    175 		if (ret != EOF) {
    176 			(*buf)[*len] = (char)ret;
    177 		} else {
    178 			(*buf)[*len] = '\0';
    179 			break;
    180 		}
    181 	}
    182 
    183 	return *len == 0 && (feof(fp) || ferror(fp));
    184 }
    185 
    186 void
    187 parse_file_with_callback(const char *fname,
    188                          int (*callback)(const char *, char **, size_t, char *,
    189                                          void *),
    190                          void *payload)
    191 {
    192 	FILE *fp;
    193 	char *line = NULL, **field = NULL, *comment;
    194 	size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields, len;
    195 
    196 	/* open file */
    197 	if (!(fp = fopen(fname, "r"))) {
    198 		fprintf(stderr, "parse_file_with_callback: fopen '%s': %s.\n",
    199 		        fname, strerror(errno));
    200 		exit(1);
    201 	}
    202 
    203 	while (!get_line(&line, &linebufsize, fp, &len)) {
    204 		/* remove trailing newline */
    205 		if (len > 0 && line[len - 1] == '\n') {
    206 			line[len - 1] = '\0';
    207 			len--;
    208 		}
    209 
    210 		/* skip empty lines and comment lines */
    211 		if (len == 0 || line[0] == '#') {
    212 			continue;
    213 		}
    214 
    215 		/* tokenize line into fields */
    216 		for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) {
    217 			/* skip leading whitespace */
    218 			while (line[i] == ' ') {
    219 				i++;
    220 			}
    221 
    222 			/* check if we crashed into the comment */
    223 			if (line[i] != '#') {
    224 				/* extend field buffer, if necessary */
    225 				if (++nfields > fieldbufsize) {
    226 					if ((field = realloc(
    227 						     field,
    228 						     nfields *
    229 							     sizeof(*field))) ==
    230 					    NULL) {
    231 						fprintf(stderr,
    232 						        "parse_file_with_"
    233 						        "callback: realloc: "
    234 						        "%s.\n",
    235 						        strerror(errno));
    236 						exit(1);
    237 					}
    238 					fieldbufsize = nfields;
    239 				}
    240 
    241 				/* set current position as field start */
    242 				field[nfields - 1] = &line[i];
    243 
    244 				/* continue until we reach ';' or '#' or end */
    245 				while (line[i] != ';' && line[i] != '#' &&
    246 				       line[i] != '\0') {
    247 					i++;
    248 				}
    249 			}
    250 
    251 			if (line[i] == '#') {
    252 				/* set comment-variable for later */
    253 				comment = &line[i + 1];
    254 			}
    255 
    256 			/* go back whitespace and terminate field there */
    257 			if (i > 0) {
    258 				for (j = i - 1; line[j] == ' '; j--) {
    259 					;
    260 				}
    261 				line[j + 1] = '\0';
    262 			} else {
    263 				line[i] = '\0';
    264 			}
    265 
    266 			/* if comment is set, we are done */
    267 			if (comment != NULL) {
    268 				break;
    269 			}
    270 		}
    271 
    272 		/* skip leading whitespace in comment */
    273 		while (comment != NULL && comment[0] == ' ') {
    274 			comment++;
    275 		}
    276 
    277 		/* call callback function */
    278 		if (callback(fname, field, nfields, comment, payload)) {
    279 			fprintf(stderr, "parse_file_with_callback: "
    280 			                "Malformed input.\n");
    281 			exit(1);
    282 		}
    283 	}
    284 
    285 	/* close file */
    286 	if (fclose(fp)) {
    287 		fprintf(stderr, "parse_file_with_callback: fclose '%s': %s.\n",
    288 		        fname, strerror(errno));
    289 		exit(1);
    290 	}
    291 
    292 	/* cleanup */
    293 	free(line);
    294 	free(field);
    295 }
    296 
    297 static int
    298 properties_callback(const char *file, char **field, size_t nfields,
    299                     char *comment, void *payload)
    300 {
    301 	/* prop always has the length 0x110000 */
    302 	struct properties_payload *p = (struct properties_payload *)payload;
    303 	struct range r;
    304 	uint_least8_t i;
    305 	uint_least32_t cp;
    306 
    307 	(void)comment;
    308 
    309 	if (nfields < 2) {
    310 		return 1;
    311 	}
    312 
    313 	for (i = 0; i < p->speclen; i++) {
    314 		/* identify fitting file and identifier */
    315 		if (p->spec[i].file && !strcmp(p->spec[i].file, file) &&
    316 		    (!strcmp(p->spec[i].ucdname, field[1]) ||
    317 		     (comment != NULL &&
    318 		      !strncmp(p->spec[i].ucdname, comment,
    319 		               strlen(p->spec[i].ucdname)) &&
    320 		      comment[strlen(p->spec[i].ucdname)] == ' ')) &&
    321 		    (p->spec[i].ucdsubname == NULL ||
    322 		     (nfields >= 3 &&
    323 		      !strcmp(p->spec[i].ucdsubname, field[2])))) {
    324 			/* parse range in first field */
    325 			if (range_parse(field[0], &r)) {
    326 				return 1;
    327 			}
    328 
    329 			/* apply to all codepoints in the range */
    330 			for (cp = r.lower; cp <= r.upper; cp++) {
    331 				if (p->set_value(payload, cp, i)) {
    332 					exit(1);
    333 				}
    334 			}
    335 			break;
    336 		}
    337 	}
    338 
    339 	return 0;
    340 }
    341 
    342 void
    343 properties_compress(const struct properties *prop,
    344                     struct properties_compressed *comp)
    345 {
    346 	uint_least32_t cp, i;
    347 
    348 	/* initialization */
    349 	if (!(comp->offset = malloc((size_t)UINT32_C(0x110000) *
    350 	                            sizeof(*(comp->offset))))) {
    351 		fprintf(stderr, "malloc: %s\n", strerror(errno));
    352 		exit(1);
    353 	}
    354 	comp->data = NULL;
    355 	comp->datalen = 0;
    356 
    357 	for (cp = 0; cp < UINT32_C(0x110000); cp++) {
    358 		for (i = 0; i < comp->datalen; i++) {
    359 			if (!memcmp(&(prop[cp]), &(comp->data[i]),
    360 			            sizeof(*prop))) {
    361 				/* found a match! */
    362 				comp->offset[cp] = i;
    363 				break;
    364 			}
    365 		}
    366 		if (i == comp->datalen) {
    367 			/*
    368 			 * found no matching properties-struct, so
    369 			 * add current properties to data and add the
    370 			 * offset in the offset-table
    371 			 */
    372 			if (!(comp->data = reallocate_array(
    373 				      comp->data, ++(comp->datalen),
    374 				      sizeof(*(comp->data))))) {
    375 				fprintf(stderr, "reallocate_array: %s\n",
    376 				        strerror(errno));
    377 				exit(1);
    378 			}
    379 			memcpy(&(comp->data[comp->datalen - 1]), &(prop[cp]),
    380 			       sizeof(*prop));
    381 			comp->offset[cp] = comp->datalen - 1;
    382 		}
    383 	}
    384 }
    385 
    386 double
    387 properties_get_major_minor(const struct properties_compressed *comp,
    388                            struct properties_major_minor *mm)
    389 {
    390 	size_t i, j, compression_count = 0;
    391 
    392 	/*
    393 	 * we currently have an array comp->offset which maps the
    394 	 * codepoints 0..0x110000 to offsets into comp->data.
    395 	 * To improve cache-locality instead and allow a bit of
    396 	 * compressing, instead of directly mapping a codepoint
    397 	 * 0xAAAABB with comp->offset, we generate two arrays major
    398 	 * and minor such that
    399 	 *    comp->offset(0xAAAABB) == minor[major[0xAAAA] + 0xBB]
    400 	 * This yields a major-array of length 2^16 and a minor array
    401 	 * of variable length depending on how many common subsequences
    402 	 * can be filtered out.
    403 	 */
    404 
    405 	/* initialize */
    406 	if (!(mm->major = malloc((size_t)0x1100 * sizeof(*(mm->major))))) {
    407 		fprintf(stderr, "malloc: %s\n", strerror(errno));
    408 		exit(1);
    409 	}
    410 	mm->minor = NULL;
    411 	mm->minorlen = 0;
    412 
    413 	for (i = 0; i < (size_t)0x1100; i++) {
    414 		/*
    415 		 * we now look at the cp-range (i << 8)..(i << 8 + 0xFF)
    416 		 * and check if its corresponding offset-data already
    417 		 * exists in minor (because then we just point there
    418 		 * and need less storage)
    419 		 */
    420 		for (j = 0; j + 0xFF < mm->minorlen; j++) {
    421 			if (!memcmp(&(comp->offset[i << 8]), &(mm->minor[j]),
    422 			            sizeof(*(comp->offset)) * 0x100)) {
    423 				break;
    424 			}
    425 		}
    426 		if (j + 0xFF < mm->minorlen) {
    427 			/* found an index */
    428 			compression_count++;
    429 			mm->major[i] = j;
    430 		} else {
    431 			/*
    432 			 * add "new" sequence to minor and point to it
    433 			 * in major
    434 			 */
    435 			mm->minorlen += 0x100;
    436 			if (!(mm->minor =
    437 			              reallocate_array(mm->minor, mm->minorlen,
    438 			                               sizeof(*(mm->minor))))) {
    439 				fprintf(stderr, "reallocate_array: %s\n",
    440 				        strerror(errno));
    441 				exit(1);
    442 			}
    443 			memcpy(&(mm->minor[mm->minorlen - 0x100]),
    444 			       &(comp->offset[i << 8]),
    445 			       sizeof(*(mm->minor)) * 0x100);
    446 			mm->major[i] = mm->minorlen - 0x100;
    447 		}
    448 	}
    449 
    450 	/* return compression ratio */
    451 	return (double)compression_count / 0x1100 * 100;
    452 }
    453 
    454 void
    455 properties_print_lookup_table(const char *name, const size_t *data,
    456                               size_t datalen)
    457 {
    458 	const char *type;
    459 	size_t i, maxval;
    460 
    461 	for (i = 0, maxval = 0; i < datalen; i++) {
    462 		if (data[i] > maxval) {
    463 			maxval = data[i];
    464 		}
    465 	}
    466 
    467 	type = (maxval <= UINT_LEAST8_MAX)  ? "uint_least8_t" :
    468 	       (maxval <= UINT_LEAST16_MAX) ? "uint_least16_t" :
    469 	       (maxval <= UINT_LEAST32_MAX) ? "uint_least32_t" :
    470 	                                      "uint_least64_t";
    471 
    472 	printf("static const %s %s[] = {\n\t", type, name);
    473 	for (i = 0; i < datalen; i++) {
    474 		printf("%zu", data[i]);
    475 		if (i + 1 == datalen) {
    476 			printf("\n");
    477 		} else if ((i + 1) % 8 != 0) {
    478 			printf(", ");
    479 		} else {
    480 			printf(",\n\t");
    481 		}
    482 	}
    483 	printf("};\n");
    484 }
    485 
    486 void
    487 properties_print_derived_lookup_table(
    488 	char *name, size_t *offset, size_t offsetlen,
    489 	int_least64_t (*get_value)(const struct properties *, size_t),
    490 	const void *payload)
    491 {
    492 	const char *type;
    493 	size_t i;
    494 	int_least64_t minval, maxval;
    495 
    496 	for (i = 0, minval = INT_LEAST64_MAX, maxval = INT_LEAST64_MIN;
    497 	     i < offsetlen; i++) {
    498 		if (get_value(payload, offset[i]) > maxval) {
    499 			maxval = get_value(payload, offset[i]);
    500 		} else if (get_value(payload, offset[i]) < minval) {
    501 			minval = get_value(payload, offset[i]);
    502 		}
    503 	}
    504 
    505 	if (minval < 0) {
    506 		/* we need a signed type */
    507 		type = (minval >= INT_LEAST8_MIN && maxval <= INT_LEAST8_MAX) ?
    508 		               "int_least8_t" :
    509 		       (minval >= INT_LEAST16_MIN &&
    510 		        maxval <= INT_LEAST16_MAX) ?
    511 		               "int_least16_t" :
    512 		       (minval >= INT_LEAST32_MIN &&
    513 		        maxval <= INT_LEAST32_MAX) ?
    514 		               "int_least32_t" :
    515 		               "int_least64_t";
    516 	} else {
    517 		/* we are fine with an unsigned type */
    518 		type = (maxval <= UINT_LEAST8_MAX)  ? "uint_least8_t" :
    519 		       (maxval <= UINT_LEAST16_MAX) ? "uint_least16_t" :
    520 		       (maxval <= UINT_LEAST32_MAX) ? "uint_least32_t" :
    521 		                                      "uint_least64_t";
    522 	}
    523 
    524 	printf("static const %s %s[] = {\n\t", type, name);
    525 	for (i = 0; i < offsetlen; i++) {
    526 		printf("%" PRIiLEAST64, get_value(payload, offset[i]));
    527 		if (i + 1 == offsetlen) {
    528 			printf("\n");
    529 		} else if ((i + 1) % 8 != 0) {
    530 			printf(", ");
    531 		} else {
    532 			printf(",\n\t");
    533 		}
    534 	}
    535 	printf("};\n");
    536 }
    537 
    538 static void
    539 properties_print_enum(const struct property_spec *spec, size_t speclen,
    540                       const char *enumname, const char *enumprefix)
    541 {
    542 	size_t i;
    543 
    544 	printf("enum %s {\n", enumname);
    545 	for (i = 0; i < speclen; i++) {
    546 		printf("\t%s_%s,\n", enumprefix, spec[i].enumname);
    547 	}
    548 	printf("\tNUM_%sS,\n};\n\n", enumprefix);
    549 }
    550 
    551 static int
    552 set_value_bp(struct properties_payload *payload, uint_least32_t cp,
    553              int_least64_t value)
    554 {
    555 	if (payload->prop[cp].property != payload->speclen) {
    556 		if (payload->handle_conflict == NULL) {
    557 			fprintf(stderr,
    558 			        "set_value_bp: "
    559 			        "Unhandled character break property "
    560 			        "overwrite for 0x%06X (%s <- %s).\n",
    561 			        cp,
    562 			        payload->spec[payload->prop[cp].property]
    563 			                .enumname,
    564 			        payload->spec[value].enumname);
    565 			return 1;
    566 		} else {
    567 			value = payload->handle_conflict(
    568 				cp, (uint_least8_t)payload->prop[cp].property,
    569 				(uint_least8_t)value);
    570 		}
    571 	}
    572 	payload->prop[cp].property = value;
    573 
    574 	return 0;
    575 }
    576 
    577 static int_least64_t
    578 get_value_bp(const struct properties *prop, size_t offset)
    579 {
    580 	return prop[offset].property;
    581 }
    582 
    583 void
    584 properties_generate_break_property(
    585 	const struct property_spec *spec, uint_least8_t speclen,
    586 	uint_least8_t (*fill_missing)(uint_least32_t),
    587 	uint_least8_t (*handle_conflict)(uint_least32_t, uint_least8_t,
    588                                          uint_least8_t),
    589 	void (*post_process)(struct properties *), const char *prefix,
    590 	const char *argv0)
    591 {
    592 	struct properties_compressed comp;
    593 	struct properties_major_minor mm;
    594 	struct properties_payload payload;
    595 	struct properties *prop;
    596 	size_t i, j, prefixlen = strlen(prefix);
    597 	char buf1[64], prefix_uc[64], buf2[64], buf3[64], buf4[64];
    598 
    599 	/*
    600 	 * allocate property buffer for all 0x110000 codepoints and
    601 	 * initialize its entries to the known invalid value "speclen"
    602 	 */
    603 	if (!(prop = calloc(UINT32_C(0x110000), sizeof(*prop)))) {
    604 		fprintf(stderr, "calloc: %s\n", strerror(errno));
    605 		exit(1);
    606 	}
    607 	for (i = 0; i < UINT32_C(0x110000); i++) {
    608 		prop[i].property = speclen;
    609 	}
    610 
    611 	/* generate data */
    612 	payload.prop = prop;
    613 	payload.spec = spec;
    614 	payload.speclen = speclen;
    615 	payload.set_value = set_value_bp;
    616 	payload.handle_conflict = handle_conflict;
    617 
    618 	/* parse each file exactly once and ignore NULL-fields */
    619 	for (i = 0; i < speclen; i++) {
    620 		for (j = 0; j < i; j++) {
    621 			if (spec[i].file && spec[j].file &&
    622 			    !strcmp(spec[i].file, spec[j].file)) {
    623 				/* file has already been parsed */
    624 				break;
    625 			}
    626 		}
    627 		if (i == j && spec[i].file) {
    628 			/* file has not been processed yet */
    629 			parse_file_with_callback(spec[i].file,
    630 			                         properties_callback, &payload);
    631 		}
    632 	}
    633 
    634 	/* fill in the missing properties that weren't explicitly given */
    635 	for (i = 0; i < UINT32_C(0x110000); i++) {
    636 		if (payload.prop[i].property == speclen) {
    637 			if (fill_missing != NULL) {
    638 				payload.prop[i].property =
    639 					fill_missing((uint_least32_t)i);
    640 			} else {
    641 				payload.prop[i].property = 0;
    642 			}
    643 		}
    644 	}
    645 
    646 	/* post-processing */
    647 	if (post_process != NULL) {
    648 		post_process(payload.prop);
    649 	}
    650 
    651 	/* compress data */
    652 	printf("/* Automatically generated by %s */\n#include <stdint.h>\n\n",
    653 	       argv0);
    654 	properties_compress(prop, &comp);
    655 
    656 	fprintf(stderr, "%s: %s-LUT compression-ratio: %.2f%%\n", argv0, prefix,
    657 	        properties_get_major_minor(&comp, &mm));
    658 
    659 	/* prepare names */
    660 	if ((size_t)snprintf(buf1, LEN(buf1), "%s_property", prefix) >=
    661 	    LEN(buf1)) {
    662 		fprintf(stderr, "snprintf: String truncated.\n");
    663 		exit(1);
    664 	}
    665 	if (LEN(prefix_uc) + 1 < prefixlen) {
    666 		fprintf(stderr, "snprintf: Buffer too small.\n");
    667 		exit(1);
    668 	}
    669 	for (i = 0; i < prefixlen; i++) {
    670 		prefix_uc[i] = (char)toupper(prefix[i]);
    671 	}
    672 	prefix_uc[prefixlen] = '\0';
    673 	if ((size_t)snprintf(buf2, LEN(buf2), "%s_PROP", prefix_uc) >=
    674 	            LEN(buf2) ||
    675 	    (size_t)snprintf(buf3, LEN(buf3), "%s_major", prefix) >=
    676 	            LEN(buf3) ||
    677 	    (size_t)snprintf(buf4, LEN(buf4), "%s_minor", prefix) >=
    678 	            LEN(buf4)) {
    679 		fprintf(stderr, "snprintf: String truncated.\n");
    680 		exit(1);
    681 	}
    682 
    683 	/* print data */
    684 	properties_print_enum(spec, speclen, buf1, buf2);
    685 	properties_print_lookup_table(buf3, mm.major, 0x1100);
    686 	printf("\n");
    687 	properties_print_derived_lookup_table(buf4, mm.minor, mm.minorlen,
    688 	                                      get_value_bp, comp.data);
    689 
    690 	/* free data */
    691 	free(prop);
    692 	free(comp.data);
    693 	free(comp.offset);
    694 	free(mm.major);
    695 	free(mm.minor);
    696 }
    697 
    698 static int
    699 break_test_callback(const char *fname, char **field, size_t nfields,
    700                     char *comment, void *payload)
    701 {
    702 	struct break_test *t,
    703 		**test = ((struct break_test_payload *)payload)->test;
    704 	size_t i, *testlen = ((struct break_test_payload *)payload)->testlen,
    705 		  commentlen;
    706 	char *token;
    707 
    708 	(void)fname;
    709 
    710 	if (nfields < 1) {
    711 		return 1;
    712 	}
    713 
    714 	/* append new testcase and initialize with zeroes */
    715 	if ((*test = realloc(*test, ++(*testlen) * sizeof(**test))) == NULL) {
    716 		fprintf(stderr, "break_test_callback: realloc: %s.\n",
    717 		        strerror(errno));
    718 		return 1;
    719 	}
    720 	t = &(*test)[*testlen - 1];
    721 	memset(t, 0, sizeof(*t));
    722 
    723 	/* parse testcase "<÷|×> <cp> <÷|×> ... <cp> <÷|×>" */
    724 	for (token = strtok(field[0], " "), i = 0; token != NULL;
    725 	     i++, token = strtok(NULL, " ")) {
    726 		if (i % 2 == 0) {
    727 			/* delimiter or start of sequence */
    728 			if (i == 0 ||
    729 			    !strncmp(token, "\xC3\xB7", 2)) { /* UTF-8 */
    730 				/*
    731 				 * '÷' indicates a breakpoint,
    732 				 * the current length is done; allocate
    733 				 * a new length field and set it to 0
    734 				 */
    735 				if ((t->len = realloc(
    736 					     t->len,
    737 					     ++t->lenlen * sizeof(*t->len))) ==
    738 				    NULL) {
    739 					fprintf(stderr,
    740 					        "break_test_"
    741 					        "callback: realloc: %s.\n",
    742 					        strerror(errno));
    743 					return 1;
    744 				}
    745 				t->len[t->lenlen - 1] = 0;
    746 			} else if (!strncmp(token, "\xC3\x97", 2)) { /* UTF-8 */
    747 				/* '×' indicates a non-breakpoint, do nothing */
    748 			} else {
    749 				fprintf(stderr,
    750 				        "break_test_callback: "
    751 				        "Malformed delimiter '%s'.\n",
    752 				        token);
    753 				return 1;
    754 			}
    755 		} else {
    756 			/* add codepoint to cp-array */
    757 			if ((t->cp = realloc(t->cp,
    758 			                     ++t->cplen * sizeof(*t->cp))) ==
    759 			    NULL) {
    760 				fprintf(stderr,
    761 				        "break_test_callback: "
    762 				        "realloc: %s.\n",
    763 				        strerror(errno));
    764 				return 1;
    765 			}
    766 			if (hextocp(token, strlen(token),
    767 			            &t->cp[t->cplen - 1])) {
    768 				return 1;
    769 			}
    770 			if (t->lenlen > 0) {
    771 				t->len[t->lenlen - 1]++;
    772 			}
    773 		}
    774 	}
    775 	if (t->lenlen > 0 && t->len[t->lenlen - 1] == 0) {
    776 		/*
    777 		 * we allocated one more length than we needed because
    778 		 * the breakpoint was at the end
    779 		 */
    780 		t->lenlen--;
    781 	}
    782 
    783 	/* store comment */
    784 	if (comment != NULL) {
    785 		commentlen = strlen(comment) + 1;
    786 		if (((*test)[*testlen - 1].descr = malloc(commentlen)) ==
    787 		    NULL) {
    788 			fprintf(stderr, "break_test_callback: malloc: %s.\n",
    789 			        strerror(errno));
    790 			return 1;
    791 		}
    792 		memcpy((*test)[*testlen - 1].descr, comment, commentlen);
    793 	}
    794 
    795 	return 0;
    796 }
    797 
    798 void
    799 break_test_list_parse(char *fname, struct break_test **test, size_t *testlen)
    800 {
    801 	struct break_test_payload pl = {
    802 		.test = test,
    803 		.testlen = testlen,
    804 	};
    805 	*test = NULL;
    806 	*testlen = 0;
    807 
    808 	parse_file_with_callback(fname, break_test_callback, &pl);
    809 }
    810 
    811 void
    812 break_test_list_print(const struct break_test *test, size_t testlen,
    813                       const char *identifier, const char *progname)
    814 {
    815 	size_t i, j;
    816 
    817 	printf("/* Automatically generated by %s */\n"
    818 	       "#include <stdint.h>\n#include <stddef.h>\n\n"
    819 	       "#include \"../gen/types.h\"\n\n",
    820 	       progname);
    821 
    822 	printf("static const struct break_test %s[] = {\n", identifier);
    823 	for (i = 0; i < testlen; i++) {
    824 		printf("\t{\n");
    825 
    826 		printf("\t\t.cp     = (uint_least32_t[]){");
    827 		for (j = 0; j < test[i].cplen; j++) {
    828 			printf(" UINT32_C(0x%06X)", test[i].cp[j]);
    829 			if (j + 1 < test[i].cplen) {
    830 				putchar(',');
    831 			}
    832 		}
    833 		printf(" },\n");
    834 		printf("\t\t.cplen  = %zu,\n", test[i].cplen);
    835 
    836 		printf("\t\t.len    = (size_t[]){");
    837 		for (j = 0; j < test[i].lenlen; j++) {
    838 			printf(" %zu", test[i].len[j]);
    839 			if (j + 1 < test[i].lenlen) {
    840 				putchar(',');
    841 			}
    842 		}
    843 		printf(" },\n");
    844 		printf("\t\t.lenlen = %zu,\n", test[i].lenlen);
    845 
    846 		printf("\t\t.descr  = \"%s\",\n", test[i].descr);
    847 
    848 		printf("\t},\n");
    849 	}
    850 	printf("};\n");
    851 }
    852 
    853 void
    854 break_test_list_free(struct break_test *test, size_t testlen)
    855 {
    856 	size_t i;
    857 
    858 	for (i = 0; i < testlen; i++) {
    859 		free(test[i].cp);
    860 		free(test[i].len);
    861 		free(test[i].descr);
    862 	}
    863 
    864 	free(test);
    865 }