libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

util.c (16588B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <ctype.h>
      3 #include <errno.h>
      4 #include <inttypes.h>
      5 #include <stdbool.h>
      6 #include <stdint.h>
      7 #include <stdlib.h>
      8 #include <stdio.h>
      9 #include <string.h>
     10 
     11 #include "util.h"
     12 
     13 struct range {
     14 	uint_least32_t lower;
     15 	uint_least32_t upper;
     16 };
     17 
     18 struct properties_payload {
     19 	struct properties *prop;
     20 	const struct property_spec *spec;
     21 	uint_least8_t speclen;
     22 	int (*set_value)(struct properties_payload *, uint_least32_t, uint_least8_t);
     23 };
     24 
     25 struct properties_compressed {
     26 	size_t *offset;
     27 	struct properties *data;
     28 	size_t datalen;
     29 };
     30 
     31 struct properties_major_minor {
     32 	size_t *major;
     33 	size_t *minor;
     34 	size_t minorlen;
     35 };
     36 
     37 struct break_test_payload
     38 {
     39 	struct break_test **test;
     40 	size_t *testlen;
     41 };
     42 
     43 static int
     44 hextocp(const char *str, size_t len, uint_least32_t *cp)
     45 {
     46 	size_t i;
     47 	int off;
     48 	char relative;
     49 
     50 	/* the maximum valid codepoint is 0x10FFFF */
     51 	if (len > 6) {
     52 		fprintf(stderr, "hextocp: '%.*s' is too long.\n",
     53 		        (int)len, str);
     54 		return 1;
     55 	}
     56 
     57 	for (i = 0, *cp = 0; i < len; i++) {
     58 		if (str[i] >= '0' && str[i] <= '9') {
     59 			relative = '0';
     60 			off = 0;
     61 		} else if (str[i] >= 'a' && str[i] <= 'f') {
     62 			relative = 'a';
     63 			off = 10;
     64 		} else if (str[i] >= 'A' && str[i] <= 'F') {
     65 			relative = 'A';
     66 			off = 10;
     67 		} else {
     68 			fprintf(stderr, "hextocp: '%.*s' is not hexadecimal.\n",
     69 			        (int)len, str);
     70 			return 1;
     71 		}
     72 
     73 		*cp += ((uint_least32_t)1 << (4 * (len - i - 1))) *
     74 		       (uint_least32_t)(str[i] - relative + off);
     75 	}
     76 
     77 	if (*cp > 0x10ffff) {
     78 		fprintf(stderr, "hextocp: '%.*s' is too large.\n",
     79 		        (int)len, str);
     80 		return 1;
     81 	}
     82 
     83 	return 0;
     84 }
     85 
     86 static int
     87 range_parse(const char *str, struct range *range)
     88 {
     89 	char *p;
     90 
     91 	if ((p = strstr(str, "..")) == NULL) {
     92 		/* input has the form "XXXXXX" */
     93 		if (hextocp(str, strlen(str), &range->lower)) {
     94 			return 1;
     95 		}
     96 		range->upper = range->lower;
     97 	} else {
     98 		/* input has the form "XXXXXX..XXXXXX" */
     99 		if (hextocp(str, (size_t)(p - str), &range->lower) ||
    100 		    hextocp(p + 2, strlen(p + 2), &range->upper)) {
    101 			return 1;
    102 		}
    103 	}
    104 
    105 	return 0;
    106 }
    107 
    108 void
    109 parse_file_with_callback(const char *fname, int (*callback)(const char *,
    110                          char **, size_t, char *, void *), void *payload)
    111 {
    112 	FILE *fp;
    113 	char *line = NULL, **field = NULL, *comment;
    114 	size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields;
    115 	ssize_t len;
    116 
    117 	/* open file */
    118 	if (!(fp = fopen(fname, "r"))) {
    119 		fprintf(stderr, "parse_file_with_callback: fopen '%s': %s.\n",
    120 		        fname, strerror(errno));
    121 		exit(1);
    122 	}
    123 
    124 	while ((len = getline(&line, &linebufsize, fp)) >= 0) {
    125 		/* remove trailing newline */
    126 		if (len > 0 && line[len - 1] == '\n') {
    127 			line[len - 1] = '\0';
    128 			len--;
    129 		}
    130 
    131 		/* skip empty lines and comment lines */
    132 		if (len == 0 || line[0] == '#') {
    133 			continue;
    134 		}
    135 
    136 		/* tokenize line into fields */
    137 		for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) {
    138 			/* extend field buffer, if necessary */
    139 			if (++nfields > fieldbufsize) {
    140 				if ((field = realloc(field, nfields *
    141                                      sizeof(*field))) == NULL) {
    142 					fprintf(stderr, "parse_file_with_"
    143 					        "callback: realloc: %s.\n",
    144 					        strerror(errno));
    145 					exit(1);
    146 				}
    147 				fieldbufsize = nfields;
    148 			}
    149 
    150 			/* skip leading whitespace */
    151 			while (line[i] == ' ') {
    152 				i++;
    153 			}
    154 
    155 			/* set current position as field start */
    156 			field[nfields - 1] = &line[i];
    157 
    158 			/* continue until we reach ';' or '#' or end */
    159 			while (line[i] != ';' && line[i] != '#' &&
    160 			       line[i] != '\0') {
    161 				i++;
    162 			}
    163 			if (line[i] == '#') {
    164 				/* set comment-variable for later */
    165 				comment = &line[i + 1];
    166 			}
    167 
    168 			/* go back whitespace and terminate field there */
    169 			if (i > 0) {
    170 				for (j = i - 1; line[j] == ' '; j--)
    171 					;
    172 				line[j + 1] = '\0';
    173 			} else {
    174 				line[i] = '\0';
    175 			}
    176 
    177 			/* if comment is set, we are done */
    178 			if (comment != NULL) {
    179 				break;
    180 			}
    181 		}
    182 
    183 		/* skip leading whitespace in comment */
    184 		while (comment != NULL && comment[0] == ' ') {
    185 			comment++;
    186 		}
    187 
    188 		/* call callback function */
    189 		if (callback(fname, field, nfields, comment, payload)) {
    190 			fprintf(stderr, "parse_file_with_callback: "
    191 			        "Malformed input.\n");
    192 			exit(1);
    193 		}
    194 	}
    195 
    196 	free(line);
    197 	free(field);
    198 }
    199 
    200 static int
    201 properties_callback(const char *file, char **field, size_t nfields,
    202                     char *comment, void *payload)
    203 {
    204 	/* prop always has the length 0x110000 */
    205 	struct properties_payload *p = (struct properties_payload *)payload;
    206 	struct range r;
    207 	uint_least8_t i;
    208 	uint_least32_t cp;
    209 
    210 	(void)comment;
    211 
    212 	if (nfields < 2) {
    213 		return 1;
    214 	}
    215 
    216 	for (i = 0; i < p->speclen; i++) {
    217 		/* identify fitting file and identifier */
    218 		if (p->spec[i].file &&
    219 		    !strcmp(p->spec[i].file, file) &&
    220 		    !strcmp(p->spec[i].ucdname, field[1])) {
    221 			/* parse range in first field */
    222 			if (range_parse(field[0], &r)) {
    223 				return 1;
    224 			}
    225 
    226 			/* apply to all codepoints in the range */
    227 			for (cp = r.lower; cp <= r.upper; cp++) {
    228 				if (p->set_value(payload, cp, i)) {
    229 					exit(1);
    230 				}
    231 			}
    232 			break;
    233 		}
    234 	}
    235 
    236 	return 0;
    237 }
    238 
    239 static void
    240 properties_compress(const struct properties *prop,
    241                     struct properties_compressed *comp)
    242 {
    243 	uint_least32_t cp, i;
    244 
    245 	/* initialization */
    246 	if (!(comp->offset = malloc((size_t)0x110000 * sizeof(*(comp->offset))))) {
    247 		fprintf(stderr, "malloc: %s\n", strerror(errno));
    248 		exit(1);
    249 	}
    250 	comp->data = NULL;
    251 	comp->datalen = 0;
    252 
    253 	for (cp = 0; cp < 0x110000; cp++) {
    254 		for (i = 0; i < comp->datalen; i++) {
    255 			if (!memcmp(&(prop[cp]), &(comp->data[i]), sizeof(*prop))) {
    256 				/* found a match! */
    257 				comp->offset[cp] = i;
    258 				break;
    259 			}
    260 		}
    261 		if (i == comp->datalen) {
    262 			/*
    263 			 * found no matching properties-struct, so
    264 			 * add current properties to data and add the
    265 			 * offset in the offset-table
    266 			 */
    267 			if (!(comp->data = reallocarray(comp->data,
    268 			                                ++(comp->datalen),
    269 			                                sizeof(*(comp->data))))) {
    270 				fprintf(stderr, "reallocarray: %s\n",
    271 				        strerror(errno));
    272 				exit(1);
    273 			}
    274 			memcpy(&(comp->data[comp->datalen - 1]), &(prop[cp]),
    275 			       sizeof(*prop));
    276 			comp->offset[cp] = comp->datalen - 1;
    277 		}
    278 	}
    279 }
    280 
    281 static double
    282 properties_get_major_minor(const struct properties_compressed *comp,
    283                            struct properties_major_minor *mm)
    284 {
    285 	size_t i, j, compression_count = 0;
    286 
    287 	/*
    288 	 * we currently have an array comp->offset which maps the
    289 	 * codepoints 0..0x110000 to offsets into comp->data.
    290 	 * To improve cache-locality instead and allow a bit of
    291 	 * compressing, instead of directly mapping a codepoint
    292 	 * 0xAAAABB with comp->offset, we generate two arrays major
    293 	 * and minor such that
    294 	 *    comp->offset(0xAAAABB) == minor[major[0xAAAA] + 0xBB]
    295 	 * This yields a major-array of length 2^16 and a minor array
    296 	 * of variable length depending on how many common subsequences
    297 	 * can be filtered out.
    298 	 */
    299 
    300 	/* initialize */
    301 	if (!(mm->major = malloc((size_t)0x1100 * sizeof(*(mm->major))))) {
    302 		fprintf(stderr, "malloc: %s\n", strerror(errno));
    303 		exit(1);
    304 	}
    305 	mm->minor = NULL;
    306 	mm->minorlen = 0;
    307 
    308 	printf("#include <stdint.h>\n\n");
    309 
    310 	for (i = 0; i < (size_t)0x1100; i++) {
    311 		/*
    312 		 * we now look at the cp-range (i << 8)..(i << 8 + 0xFF)
    313 		 * and check if its corresponding offset-data already
    314 		 * exists in minor (because then we just point there
    315 		 * and need less storage)
    316 		 */
    317 		for (j = 0; j + 0xFF < mm->minorlen; j++) {
    318 			if (!memcmp(&(comp->offset[i << 8]),
    319 			            &(mm->minor[j]),
    320 			            sizeof(*(comp->offset)) * 0x100)) {
    321 				break;
    322 			}
    323 		}
    324 		if (j + 0xFF < mm->minorlen) {
    325 			/* found an index */
    326 			compression_count++;
    327 			mm->major[i] = j;
    328 		} else {
    329 			/*
    330 			 * add "new" sequence to minor and point to it
    331 			 * in major
    332 			 */
    333 			mm->minorlen += 0x100;
    334 			if (!(mm->minor = reallocarray(mm->minor,
    335 			                               mm->minorlen,
    336 			                               sizeof(*(mm->minor))))) {
    337 				fprintf(stderr, "reallocarray: %s\n",
    338 				        strerror(errno));
    339 				exit(1);
    340 			}
    341 			memcpy(&(mm->minor[mm->minorlen - 0x100]),
    342 			       &(comp->offset[i << 8]),
    343 			       sizeof(*(mm->minor)) * 0x100);
    344 			mm->major[i] = mm->minorlen - 0x100;
    345 		}
    346 	}
    347 
    348 	/* return compression ratio */
    349 	return (double)compression_count / 0x1100 * 100;
    350 }
    351 
    352 static void
    353 properties_print_lookup_table(char *name, size_t *data, size_t datalen)
    354 {
    355 	char *type;
    356 	size_t i, maxval;
    357 
    358 	for (i = 0, maxval = 0; i < datalen; i++) {
    359 		if (data[i] > maxval) {
    360 			maxval = data[i];
    361 		}
    362 	}
    363 
    364 	type = (maxval <= UINT_LEAST8_MAX)  ? "uint_least8_t"  :
    365 	       (maxval <= UINT_LEAST16_MAX) ? "uint_least16_t" :
    366 	       (maxval <= UINT_LEAST32_MAX) ? "uint_least32_t" :
    367 	                                      "uint_least64_t";
    368 
    369 	printf("static const %s %s[] = {\n\t", type, name);
    370 	for (i = 0; i < datalen; i++) {
    371 		printf("%zu", data[i]);
    372 		if (i + 1 == datalen) {
    373 			printf("\n");
    374 		} else if ((i + 1) % 8 != 0) {
    375 			printf(", ");
    376 		} else {
    377 			printf(",\n\t");
    378 		}
    379 
    380 	}
    381 	printf("};\n");
    382 }
    383 
    384 static void
    385 properties_print_derived_lookup_table(char *name, size_t *offset, size_t offsetlen,
    386                                       uint_least8_t (*get_value)(const struct properties *,
    387                                       size_t), const void *payload)
    388 {
    389 	size_t i;
    390 
    391 	printf("static const uint_least8_t %s[] = {\n\t", name);
    392 	for (i = 0; i < offsetlen; i++) {
    393 		printf("%"PRIuLEAST8, get_value(payload, offset[i]));
    394 		if (i + 1 == offsetlen) {
    395 			printf("\n");
    396 		} else if ((i + 1) % 8 != 0) {
    397 			printf(", ");
    398 		} else {
    399 			printf(",\n\t");
    400 		}
    401 
    402 	}
    403 	printf("};\n");
    404 }
    405 
    406 static void
    407 properties_print_enum(const struct property_spec *spec, size_t speclen,
    408                       const char *enumname, const char *enumprefix)
    409 {
    410 	size_t i;
    411 
    412 	printf("enum %s {\n", enumname);
    413 	for (i = 0; i < speclen; i++) {
    414 		printf("\t%s_%s,\n", enumprefix, spec[i].enumname);
    415 	}
    416 	printf("\tNUM_%sS,\n};\n\n", enumprefix);
    417 }
    418 
    419 static int
    420 set_value_bp(struct properties_payload *payload, uint_least32_t cp,
    421              uint_least8_t value)
    422 {
    423 	if (payload->prop[cp].break_property != 0) {
    424 		fprintf(stderr, "set_value_bp: "
    425 	                "Character break property overwrite (%s <- %s).\n",
    426 		        payload->spec[payload->prop[cp].break_property].enumname,
    427 			payload->spec[value].enumname);
    428 		return 1;
    429 	}
    430 	payload->prop[cp].break_property = value;
    431 
    432 	return 0;
    433 }
    434 
    435 static uint_least8_t
    436 get_value_bp(const struct properties *prop, size_t offset)
    437 {
    438 	return prop[offset].break_property;
    439 }
    440 
    441 void
    442 properties_generate_break_property(const struct property_spec *spec,
    443                                    uint_least8_t speclen, const char *prefix,
    444 				   const char *argv0)
    445 {
    446 	struct properties_compressed comp;
    447 	struct properties_major_minor mm;
    448 	struct properties_payload payload;
    449 	struct properties *prop;
    450 	size_t i, j, prefixlen = strlen(prefix);
    451 	char buf1[64], prefix_uc[64], buf2[64], buf3[64], buf4[64];
    452 
    453 	/* allocate property buffer for all 0x110000 codepoints */
    454 	if (!(prop = calloc(0x110000, sizeof(*prop)))) {
    455 		fprintf(stderr, "calloc: %s\n", strerror(errno));
    456 		exit(1);
    457 	}
    458 
    459 	/* generate data */
    460 	payload.prop = prop;
    461 	payload.spec = spec;
    462 	payload.speclen = speclen;
    463 	payload.set_value = set_value_bp;
    464 
    465 	/* parse each file exactly once and ignore NULL-fields */
    466 	for (i = 0; i < speclen; i++) {
    467 		for (j = 0; j < i; j++) {
    468 			if (spec[i].file && spec[j].file &&
    469 			    !strcmp(spec[i].file, spec[j].file)) {
    470 				/* file has already been parsed */
    471 				break;
    472 			}
    473 		}
    474 		if (i == j && spec[i].file) {
    475 			/* file has not been processed yet */
    476 			parse_file_with_callback(spec[i].file,
    477 			                         properties_callback,
    478 			                         &payload);
    479 		}
    480 	}
    481 
    482 	/* compress data */
    483 	properties_compress(prop, &comp);
    484 
    485 	fprintf(stderr, "%s: compression-ratio: %.2f%%\n", argv0,
    486 	        properties_get_major_minor(&comp, &mm));
    487 
    488 	/* prepare names */
    489 	if ((size_t)snprintf(buf1, LEN(buf1), "%s_break_property", prefix) >= LEN(buf1)) {
    490 		fprintf(stderr, "snprintf: String truncated.\n");
    491 		exit(1);
    492 	}
    493 	if (LEN(prefix_uc) + 1 < prefixlen) {
    494 		fprintf(stderr, "snprintf: Buffer too small.\n");
    495 		exit(1);
    496 	}
    497 	for (i = 0; i < prefixlen; i++) {
    498 		prefix_uc[i] = (char)toupper(prefix[i]);
    499 	}
    500 	prefix_uc[prefixlen] = '\0';
    501 	if ((size_t)snprintf(buf2, LEN(buf2), "%s_BREAK_PROP", prefix_uc) >= LEN(buf2) ||
    502 	    (size_t)snprintf(buf3, LEN(buf3), "%s_break_major", prefix) >= LEN(buf3)   ||
    503 	    (size_t)snprintf(buf4, LEN(buf4), "%s_break_minor", prefix) >= LEN(buf4)) {
    504 		fprintf(stderr, "snprintf: String truncated.\n");
    505 		exit(1);
    506 	}
    507 
    508 	/* print data */
    509 	properties_print_enum(spec, speclen, buf1, buf2);
    510 	properties_print_lookup_table(buf3, mm.major, 0x1100);
    511 	printf("\n");
    512 	properties_print_derived_lookup_table(buf4, mm.minor, mm.minorlen,
    513 	                                      get_value_bp, comp.data);
    514 
    515 	/* free data */
    516 	free(prop);
    517 	free(comp.data);
    518 	free(comp.offset);
    519 	free(mm.major);
    520 	free(mm.minor);
    521 }
    522 
    523 static int
    524 break_test_callback(const char *fname, char **field, size_t nfields,
    525                       char *comment, void *payload)
    526 {
    527 	struct break_test *t,
    528 		**test = ((struct break_test_payload *)payload)->test;
    529 	size_t i, *testlen = ((struct break_test_payload *)payload)->testlen;
    530 	char *token;
    531 
    532 	(void)fname;
    533 
    534 	if (nfields < 1) {
    535 		return 1;
    536 	}
    537 
    538 	/* append new testcase and initialize with zeroes */
    539 	if ((*test = realloc(*test, ++(*testlen) * sizeof(**test))) == NULL) {
    540 		fprintf(stderr, "break_test_callback: realloc: %s.\n",
    541 		        strerror(errno));
    542 		return 1;
    543 	}
    544 	t = &(*test)[*testlen - 1];
    545 	memset(t, 0, sizeof(*t));
    546 
    547 	/* parse testcase "<÷|×> <cp> <÷|×> ... <cp> <÷|×>" */
    548 	for (token = strtok(field[0], " "), i = 0; token != NULL; i++,
    549 	     token = strtok(NULL, " ")) {
    550 		if (i % 2 == 0) {
    551 			/* delimiter */
    552 			if (!strncmp(token, "\xC3\xB7", 2)) { /* UTF-8 */
    553 				/*
    554 				 * '÷' indicates a breakpoint,
    555 				 * the current length is done; allocate
    556 				 * a new length field and set it to 0
    557 				 */
    558 				if ((t->len = realloc(t->len,
    559 				     ++t->lenlen * sizeof(*t->len))) == NULL) {
    560 					fprintf(stderr, "break_test_"
    561 					        "callback: realloc: %s.\n",
    562 					        strerror(errno));
    563 					return 1;
    564 				}
    565 				t->len[t->lenlen - 1] = 0;
    566 			} else if (!strncmp(token, "\xC3\x97", 2)) { /* UTF-8 */
    567 				/*
    568 				 * '×' indicates a non-breakpoint, do nothing
    569 				 */
    570 			} else {
    571 				fprintf(stderr, "break_test_callback: "
    572 				        "Malformed delimiter '%s'.\n", token);
    573 				return 1;
    574 			}
    575 		} else {
    576 			/* add codepoint to cp-array */
    577 			if ((t->cp = realloc(t->cp, ++t->cplen *
    578 			                     sizeof(*t->cp))) == NULL) {
    579 				fprintf(stderr, "break_test_callback: "
    580 				        "realloc: %s.\n", strerror(errno));
    581 				return 1;
    582 			}
    583 			if (hextocp(token, strlen(token), &t->cp[t->cplen - 1])) {
    584 				return 1;
    585 			}
    586 			if (t->lenlen > 0) {
    587 				t->len[t->lenlen - 1]++;
    588 			}
    589 		}
    590 	}
    591 	if (t->len[t->lenlen - 1] == 0) {
    592 		/* we allocated one more length than we needed */
    593 		t->lenlen--;
    594 	}
    595 
    596 	/* store comment */
    597 	if (((*test)[*testlen - 1].descr = strdup(comment)) == NULL) {
    598 		fprintf(stderr, "break_test_callback: strdup: %s.\n",
    599 		        strerror(errno));
    600 		return 1;
    601 	}
    602 
    603 	return 0;
    604 }
    605 
    606 void
    607 break_test_list_parse(char *fname, struct break_test **test,
    608                         size_t *testlen)
    609 {
    610 	struct break_test_payload pl = {
    611 		.test = test,
    612 		.testlen = testlen,
    613 	};
    614 	*test = NULL;
    615 	*testlen = 0;
    616 
    617 	parse_file_with_callback(fname, break_test_callback, &pl);
    618 }
    619 
    620 void
    621 break_test_list_print(const struct break_test *test, size_t testlen,
    622                         const char *identifier, const char *progname)
    623 {
    624 	size_t i, j;
    625 
    626 	printf("/* Automatically generated by %s */\n"
    627 	       "#include <stdint.h>\n#include <stddef.h>\n\n"
    628 	       "#include \"../gen/types.h\"\n\n", progname);
    629 
    630 	printf("static const struct break_test %s[] = {\n", identifier);
    631 	for (i = 0; i < testlen; i++) {
    632 		printf("\t{\n");
    633 
    634 		printf("\t\t.cp     = (uint_least32_t[]){");
    635 		for (j = 0; j < test[i].cplen; j++) {
    636 			printf(" UINT32_C(0x%06X)", test[i].cp[j]);
    637 			if (j + 1 < test[i].cplen) {
    638 				putchar(',');
    639 			}
    640 		}
    641 		printf(" },\n");
    642 		printf("\t\t.cplen  = %zu,\n", test[i].cplen);
    643 
    644 		printf("\t\t.len    = (size_t[]){");
    645 		for (j = 0; j < test[i].lenlen; j++) {
    646 			printf(" %zu", test[i].len[j]);
    647 			if (j + 1 < test[i].lenlen) {
    648 				putchar(',');
    649 			}
    650 		}
    651 		printf(" },\n");
    652 		printf("\t\t.lenlen = %zu,\n", test[i].lenlen);
    653 
    654 		printf("\t\t.descr  = \"%s\",\n", test[i].descr);
    655 
    656 		printf("\t},\n");
    657 	}
    658 	printf("};\n");
    659 }
    660 
    661 void
    662 break_test_list_free(struct break_test *test, size_t testlen)
    663 {
    664 	(void)testlen;
    665 
    666 	free(test);
    667 }