libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

bidirectional.c (12515B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <errno.h>
      3 #include <inttypes.h>
      4 #include <stddef.h>
      5 #include <stdio.h>
      6 #include <stdlib.h>
      7 #include <string.h>
      8 
      9 #include "util.h"
     10 
     11 #define FILE_BIDI_BRACKETS  "data/BidiBrackets.txt"
     12 #define FILE_BIDI_CLASS     "data/DerivedBidiClass.txt"
     13 #define FILE_BIDI_MIRRORING "data/BidiMirroring.txt"
     14 #define FILE_UNICODE_DATA   "data/UnicodeData.txt"
     15 
     16 #define NUM_BRACKET_ALIASES 20
     17 
     18 static const struct property_spec bidi_property[] = {
     19 	{
     20 		/* default */
     21 		.enumname = "L",
     22 		.file = FILE_BIDI_CLASS,
     23 		.ucdname = "L",
     24 	},
     25 	{
     26 		.enumname = "AL",
     27 		.file = FILE_BIDI_CLASS,
     28 		.ucdname = "AL",
     29 	},
     30 	{
     31 		.enumname = "AN",
     32 		.file = FILE_BIDI_CLASS,
     33 		.ucdname = "AN",
     34 	},
     35 	{
     36 		.enumname = "B",
     37 		.file = FILE_BIDI_CLASS,
     38 		.ucdname = "B",
     39 	},
     40 	{
     41 		.enumname = "BN",
     42 		.file = FILE_BIDI_CLASS,
     43 		.ucdname = "BN",
     44 	},
     45 	{
     46 		.enumname = "CS",
     47 		.file = FILE_BIDI_CLASS,
     48 		.ucdname = "CS",
     49 	},
     50 	{
     51 		.enumname = "EN",
     52 		.file = FILE_BIDI_CLASS,
     53 		.ucdname = "EN",
     54 	},
     55 	{
     56 		.enumname = "ES",
     57 		.file = FILE_BIDI_CLASS,
     58 		.ucdname = "ES",
     59 	},
     60 	{
     61 		.enumname = "ET",
     62 		.file = FILE_BIDI_CLASS,
     63 		.ucdname = "ET",
     64 	},
     65 	{
     66 		.enumname = "FSI",
     67 		.file = FILE_BIDI_CLASS,
     68 		.ucdname = "FSI",
     69 	},
     70 	{
     71 		.enumname = "LRE",
     72 		.file = FILE_BIDI_CLASS,
     73 		.ucdname = "LRE",
     74 	},
     75 	{
     76 		.enumname = "LRI",
     77 		.file = FILE_BIDI_CLASS,
     78 		.ucdname = "LRI",
     79 	},
     80 	{
     81 		.enumname = "LRO",
     82 		.file = FILE_BIDI_CLASS,
     83 		.ucdname = "LRO",
     84 	},
     85 	{
     86 		.enumname = "NSM",
     87 		.file = FILE_BIDI_CLASS,
     88 		.ucdname = "NSM",
     89 	},
     90 	{
     91 		.enumname = "ON",
     92 		.file = FILE_BIDI_CLASS,
     93 		.ucdname = "ON",
     94 	},
     95 	{
     96 		.enumname = "PDF",
     97 		.file = FILE_BIDI_CLASS,
     98 		.ucdname = "PDF",
     99 	},
    100 	{
    101 		.enumname = "PDI",
    102 		.file = FILE_BIDI_CLASS,
    103 		.ucdname = "PDI",
    104 	},
    105 	{
    106 		.enumname = "R",
    107 		.file = FILE_BIDI_CLASS,
    108 		.ucdname = "R",
    109 	},
    110 	{
    111 		.enumname = "RLE",
    112 		.file = FILE_BIDI_CLASS,
    113 		.ucdname = "RLE",
    114 	},
    115 	{
    116 		.enumname = "RLI",
    117 		.file = FILE_BIDI_CLASS,
    118 		.ucdname = "RLI",
    119 	},
    120 	{
    121 		.enumname = "RLO",
    122 		.file = FILE_BIDI_CLASS,
    123 		.ucdname = "RLO",
    124 	},
    125 	{
    126 		.enumname = "S",
    127 		.file = FILE_BIDI_CLASS,
    128 		.ucdname = "S",
    129 	},
    130 	{
    131 		.enumname = "WS",
    132 		.file = FILE_BIDI_CLASS,
    133 		.ucdname = "WS",
    134 	},
    135 };
    136 
    137 struct decomposition_payload {
    138 	uint_least32_t cp;
    139 	uint_least32_t decomposition;
    140 };
    141 
    142 static int
    143 decomposition_callback(const char *file, char **field, size_t nfields,
    144                        char *comment, void *payload)
    145 {
    146 	char *p;
    147 	struct decomposition_payload *decomp =
    148 		(struct decomposition_payload *)payload;
    149 	uint_least32_t cp;
    150 
    151 	(void)file;
    152 	(void)comment;
    153 
    154 	if (nfields < 6) {
    155 		/* we have fewer than 6 fields, discard the line */
    156 		return 0;
    157 	}
    158 
    159 	hextocp(field[0], strlen(field[0]), &cp);
    160 
    161 	if (decomp->cp == cp) {
    162 		/* we hit the line that contains our decomposition target */
    163 		if (strlen(field[5]) > 0) {
    164 			p = field[5];
    165 			if (*p == '<') {
    166 				/*
    167 				 * the decomposition contains some metadata
    168 				 * <...> we skip
    169 				 */
    170 				for (; *p != '\0'; p++) {
    171 					if (*p == '>') {
    172 						p++;
    173 						while (*p == ' ') {
    174 							p++;
    175 						}
    176 						break;
    177 					}
    178 				}
    179 			}
    180 			hextocp(p, strlen(p), &(decomp->decomposition));
    181 		} else {
    182 			decomp->decomposition = decomp->cp;
    183 		}
    184 	}
    185 
    186 	return 0;
    187 }
    188 
    189 static struct {
    190 	uint_least32_t base[NUM_BRACKET_ALIASES];
    191 	size_t baselen;
    192 	uint_least32_t pair[NUM_BRACKET_ALIASES];
    193 	size_t pairlen;
    194 	uint_least8_t class;
    195 	char type;
    196 } *b = NULL;
    197 
    198 static size_t blen;
    199 static uint_least8_t bracket_class_count = 1;
    200 
    201 static int
    202 bracket_callback(const char *file, char **field, size_t nfields, char *comment,
    203                  void *payload)
    204 {
    205 	size_t i, j;
    206 	struct decomposition_payload decomp_base, decomp_pair;
    207 	uint_least32_t cp_base, cp_pair;
    208 
    209 	(void)file;
    210 	(void)comment;
    211 	(void)payload;
    212 
    213 	if (nfields < 3) {
    214 		/* we have fewer than 3 fields, discard the line */
    215 		return 0;
    216 	}
    217 
    218 	/* parse field data */
    219 	hextocp(field[0], strlen(field[0]), &cp_base);
    220 	hextocp(field[1], strlen(field[1]), &cp_pair);
    221 
    222 	/* determine decomposition of the base and pair codepoints */
    223 	decomp_base.cp = cp_base;
    224 	decomp_pair.cp = cp_pair;
    225 	parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callback,
    226 	                         &decomp_base);
    227 	parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callback,
    228 	                         &decomp_pair);
    229 
    230 	/*
    231 	 * check if we already have the canonical form in the bracket array,
    232 	 * per convention the canonical form is the first element of the alias
    233 	 * array
    234 	 */
    235 	for (i = 0; i < blen; i++) {
    236 		if (decomp_base.decomposition == b[i].base[0]) {
    237 			/* we have a match, check type */
    238 			if (strlen(field[2]) != 1 ||
    239 			    (field[2][0] != 'o' && field[2][0] != 'c')) {
    240 				/* malformed line */
    241 				return 1;
    242 			} else if (b[i].type != field[2][0]) {
    243 				/* mismatching types */
    244 				return 1;
    245 			}
    246 
    247 			/*
    248 			 * add our base alias to the base array unless it isn't
    249 			 * already in it
    250 			 */
    251 			for (j = 0; j < b[i].baselen; j++) {
    252 				if (cp_base == b[i].base[j]) {
    253 					/* already in array, do nothing */
    254 					break;
    255 				}
    256 			}
    257 			if (j == b[i].baselen) {
    258 				/*
    259 				 * the base alias is not already in the array,
    260 				 * add it
    261 				 */
    262 				if (b[i].baselen == NUM_BRACKET_ALIASES) {
    263 					fprintf(stderr, "too many aliases\n");
    264 					return 1;
    265 				}
    266 				b[i].baselen++;
    267 				b[i].base[b[i].baselen - 1] = cp_base;
    268 			}
    269 
    270 			/*
    271 			 * also add our pair alias to the pair array unless
    272 			 * it isn't already in it
    273 			 */
    274 			for (j = 0; j < b[i].pairlen; j++) {
    275 				if (cp_pair == b[i].pair[j]) {
    276 					/* already in array, do nothing */
    277 					break;
    278 				}
    279 			}
    280 			if (j == b[i].pairlen) {
    281 				/*
    282 				 * the pair alias is not already in the array,
    283 				 * add it
    284 				 */
    285 				if (b[i].pairlen == NUM_BRACKET_ALIASES) {
    286 					fprintf(stderr, "too many aliases\n");
    287 					return 1;
    288 				}
    289 				b[i].pairlen++;
    290 				b[i].pair[b[i].pairlen - 1] = cp_pair;
    291 			}
    292 
    293 			return 0;
    294 		}
    295 	}
    296 
    297 	/* extend bracket pair array, as this is a new bracket type */
    298 	if (!(b = realloc(b, (++blen) * sizeof(*b)))) {
    299 		fprintf(stderr, "realloc: %s\n", strerror(errno));
    300 		exit(1);
    301 	}
    302 
    303 	/* fill field data by adding the canonical form first */
    304 	b[blen - 1].base[0] = decomp_base.decomposition;
    305 	b[blen - 1].baselen = 1;
    306 	b[blen - 1].pair[0] = decomp_pair.decomposition;
    307 	b[blen - 1].pairlen = 1;
    308 
    309 	/* add alias if it differs from the canonical form */
    310 	if (cp_base != decomp_base.decomposition) {
    311 		b[blen - 1].base[1] = cp_base;
    312 		b[blen - 1].baselen = 2;
    313 	}
    314 	if (cp_pair != decomp_pair.decomposition) {
    315 		b[blen - 1].pair[1] = cp_pair;
    316 		b[blen - 1].pairlen = 2;
    317 	}
    318 
    319 	/* add bracket type */
    320 	if (strlen(field[2]) != 1 ||
    321 	    (field[2][0] != 'o' && field[2][0] != 'c')) {
    322 		/* malformed line */
    323 		return 1;
    324 	} else {
    325 		b[blen - 1].type = field[2][0];
    326 	}
    327 
    328 	/*
    329 	 * determine bracket class by iterating over the bracket-array
    330 	 * and seeing if our current canonical cp already has a matching pair.
    331 	 * We only need to check the first entry in each bracket alias
    332 	 * list, as this is, per convention, the canonical form.
    333 	 * If not, add a new class.
    334 	 */
    335 	for (i = 0; i + 1 < blen; i++) {
    336 		if (b[i].pair[0] == b[blen - 1].base[0]) {
    337 			/* matched class */
    338 			b[blen - 1].class = b[i].class;
    339 			break;
    340 		}
    341 	}
    342 	if (i + 1 == blen) {
    343 		/* no match, assign a new class */
    344 		b[blen - 1].class = bracket_class_count++;
    345 	}
    346 
    347 	return 0;
    348 }
    349 
    350 static void
    351 post_process(struct properties *prop)
    352 {
    353 	size_t i, j;
    354 
    355 	for (i = 0; i < blen; i++) {
    356 		/*
    357 		 * given the base property fits in 5 bits, we simply
    358 		 * store the bracket-offset in the bits above that.
    359 		 *
    360 		 * All those properties that are not set here implicitly
    361 		 * have offset 0, which we prepared to contain a stub
    362 		 * for a character that is not a bracket.
    363 		 */
    364 		for (j = 0; j < b[i].baselen; j++) {
    365 			prop[b[i].base[j]].property |= (i << 5);
    366 		}
    367 	}
    368 }
    369 
    370 static uint_least8_t
    371 fill_missing(uint_least32_t cp)
    372 {
    373 	/* based on the @missing-properties in data/DerivedBidiClass.txt */
    374 	if ((cp >= UINT32_C(0x0590) && cp <= UINT32_C(0x05FF)) ||
    375 	    (cp >= UINT32_C(0x07C0) && cp <= UINT32_C(0x085F)) ||
    376 	    (cp >= UINT32_C(0xFB1D) && cp <= UINT32_C(0xFB4F)) ||
    377 	    (cp >= UINT32_C(0x10800) && cp <= UINT32_C(0x10CFF)) ||
    378 	    (cp >= UINT32_C(0x10D40) && cp <= UINT32_C(0x10EBF)) ||
    379 	    (cp >= UINT32_C(0x10F00) && cp <= UINT32_C(0x10F2F)) ||
    380 	    (cp >= UINT32_C(0x10F70) && cp <= UINT32_C(0x10FFF)) ||
    381 	    (cp >= UINT32_C(0x1E800) && cp <= UINT32_C(0x1EC6F)) ||
    382 	    (cp >= UINT32_C(0x1ECC0) && cp <= UINT32_C(0x1ECFF)) ||
    383 	    (cp >= UINT32_C(0x1ED50) && cp <= UINT32_C(0x1EDFF)) ||
    384 	    (cp >= UINT32_C(0x1EF00) && cp <= UINT32_C(0x1EFFF))) {
    385 		return 17; /* class R */
    386 	} else if ((cp >= UINT32_C(0x0600) && cp <= UINT32_C(0x07BF)) ||
    387 	           (cp >= UINT32_C(0x0860) && cp <= UINT32_C(0x08FF)) ||
    388 	           (cp >= UINT32_C(0xFB50) && cp <= UINT32_C(0xFDCF)) ||
    389 	           (cp >= UINT32_C(0xFDF0) && cp <= UINT32_C(0xFDFF)) ||
    390 	           (cp >= UINT32_C(0xFE70) && cp <= UINT32_C(0xFEFF)) ||
    391 	           (cp >= UINT32_C(0x10D00) && cp <= UINT32_C(0x10D3F)) ||
    392 	           (cp >= UINT32_C(0x10EC0) && cp <= UINT32_C(0x10EFF)) ||
    393 	           (cp >= UINT32_C(0x10F30) && cp <= UINT32_C(0x10F6F)) ||
    394 	           (cp >= UINT32_C(0x1EC70) && cp <= UINT32_C(0x1ECBF)) ||
    395 	           (cp >= UINT32_C(0x1ED00) && cp <= UINT32_C(0x1ED4F)) ||
    396 	           (cp >= UINT32_C(0x1EE00) && cp <= UINT32_C(0x1EEFF))) {
    397 		return 1; /* class AL */
    398 	} else if (cp >= UINT32_C(0x20A0) && cp <= UINT32_C(0x20CF)) {
    399 		return 8; /* class ET */
    400 	} else {
    401 		return 0; /* class L */
    402 	}
    403 }
    404 
    405 static struct properties *prop_mirror = NULL;
    406 
    407 static int
    408 mirror_callback(const char *file, char **field, size_t nfields, char *comment,
    409                 void *payload)
    410 {
    411 	uint_least32_t cp, cp_mirror;
    412 
    413 	(void)file;
    414 	(void)comment;
    415 	(void)payload;
    416 
    417 	hextocp(field[0], strlen(field[0]), &cp);
    418 
    419 	cp_mirror = cp;
    420 
    421 	if (nfields >= 2 && strlen(field[1]) > 0 &&
    422 	    hextocp(field[1], strlen(field[1]), &cp_mirror)) {
    423 		return 1;
    424 	}
    425 
    426 	prop_mirror[cp].property = (int_least32_t)cp_mirror - (int_least32_t)cp;
    427 
    428 	return 0;
    429 }
    430 
    431 static int_least64_t
    432 get_value(const struct properties *prop, size_t offset)
    433 {
    434 	return prop[offset].property;
    435 }
    436 
    437 int
    438 main(int argc, char *argv[])
    439 {
    440 	struct properties_compressed comp_mirror;
    441 	struct properties_major_minor mm_mirror;
    442 	size_t i;
    443 
    444 	(void)argc;
    445 
    446 	/*
    447 	 * the first element in the bracket array is initialized to
    448 	 * all-zeros, as we use the implicit 0-offset for all those
    449 	 * codepoints that are not a bracket
    450 	 */
    451 	if (!(b = calloc((blen = 1), sizeof(*b)))) {
    452 		fprintf(stderr, "calloc: %s\n", strerror(errno));
    453 		exit(1);
    454 	}
    455 	parse_file_with_callback(FILE_BIDI_BRACKETS, bracket_callback, NULL);
    456 
    457 	properties_generate_break_property(bidi_property, LEN(bidi_property),
    458 	                                   fill_missing, NULL, post_process,
    459 	                                   "bidi", argv[0]);
    460 
    461 	printf("\nenum bracket_type {\n\tBIDI_BRACKET_NONE,\n\t"
    462 	       "BIDI_BRACKET_OPEN,\n\tBIDI_BRACKET_CLOSE,\n};\n\n"
    463 	       "static const struct bracket {\n\tenum bracket_type type;\n"
    464 	       "\tuint_least8_t class;\n} bidi_bracket[] = {\n");
    465 	for (i = 0; i < blen; i++) {
    466 		printf("\t{\n\t\t.type = %s,\n\t\t.class = "
    467 		       "%" PRIuLEAST8 ",\n\t},\n",
    468 		       (b[i].type == 'o') ? "BIDI_BRACKET_OPEN" :
    469 		       (b[i].type == 'c') ? "BIDI_BRACKET_CLOSE" :
    470 		                            "BIDI_BRACKET_NONE",
    471 		       b[i].class);
    472 	}
    473 	printf("};\n");
    474 
    475 	/*
    476 	 * allocate property buffer for all 0x110000 codepoints
    477 	 *
    478 	 * the buffers contain the offset from the "base" character
    479 	 * to the respective mirrored character. By callocing we set all
    480 	 * fields to zero, which is also the Unicode "default" in the sense
    481 	 * that the coe point is its mirror (unless we fill it in)
    482 	 */
    483 	if (!(prop_mirror = calloc(UINT32_C(0x110000), sizeof(*prop_mirror)))) {
    484 		fprintf(stderr, "calloc: %s\n", strerror(errno));
    485 		exit(1);
    486 	}
    487 	parse_file_with_callback(FILE_BIDI_MIRRORING, mirror_callback, NULL);
    488 
    489 	/* compress properties */
    490 	properties_compress(prop_mirror, &comp_mirror);
    491 
    492 	fprintf(stderr, "%s: mirror-LUT compression-ratio: %.2f%%\n", argv[0],
    493 	        properties_get_major_minor(&comp_mirror, &mm_mirror));
    494 
    495 	/* print tables */
    496 	properties_print_lookup_table("mirror_major", mm_mirror.major, 0x1100);
    497 	printf("\n");
    498 	properties_print_derived_lookup_table("mirror_minor", mm_mirror.minor,
    499 	                                      mm_mirror.minorlen, get_value,
    500 	                                      comp_mirror.data);
    501 
    502 	free(comp_mirror.data);
    503 	free(comp_mirror.offset);
    504 	free(mm_mirror.major);
    505 	free(mm_mirror.minor);
    506 
    507 	return 0;
    508 }