libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

case.c (8442B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <errno.h>
      3 #include <stdint.h>
      4 #include <stdio.h>
      5 #include <stdlib.h>
      6 #include <string.h>
      7 
      8 #include "util.h"
      9 
     10 #define FILE_DCP "data/DerivedCoreProperties.txt"
     11 
     12 static const struct property_spec case_property[] = {
     13 	{
     14 		.enumname = "OTHER",
     15 		.file = NULL,
     16 		.ucdname = NULL,
     17 	},
     18 	{
     19 		.enumname = "BOTH_CASED_CASE_IGNORABLE",
     20 		.file = NULL,
     21 		.ucdname = NULL,
     22 	},
     23 	{
     24 		.enumname = "CASED",
     25 		.file = FILE_DCP,
     26 		.ucdname = "Cased",
     27 	},
     28 	{
     29 		.enumname = "CASE_IGNORABLE",
     30 		.file = FILE_DCP,
     31 		.ucdname = "Case_Ignorable",
     32 	},
     33 	{
     34 		.enumname = "UNCASED",
     35 		.file = FILE_DCP,
     36 		.ucdname = "Uncased",
     37 	},
     38 };
     39 
     40 static uint_least8_t
     41 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
     42 {
     43 	uint_least8_t result;
     44 
     45 	(void)cp;
     46 
     47 	if ((!strcmp(case_property[prop1].enumname, "CASED") &&
     48 	     !strcmp(case_property[prop2].enumname, "CASE_IGNORABLE")) ||
     49 	    (!strcmp(case_property[prop1].enumname, "CASE_IGNORABLE") &&
     50 	     !strcmp(case_property[prop2].enumname, "CASED"))) {
     51 		for (result = 0; result < LEN(case_property); result++) {
     52 			if (!strcmp(case_property[result].enumname,
     53 			            "BOTH_CASED_CASE_IGNORABLE")) {
     54 				break;
     55 			}
     56 		}
     57 		if (result == LEN(case_property)) {
     58 			fprintf(stderr, "handle_conflict: Internal error.\n");
     59 			exit(1);
     60 		}
     61 	} else {
     62 		fprintf(stderr, "handle_conflict: Cannot handle conflict.\n");
     63 		exit(1);
     64 	}
     65 
     66 	return result;
     67 }
     68 
     69 static struct properties *prop_upper = NULL, *prop_lower, *prop_title;
     70 
     71 static struct special_case {
     72 	struct {
     73 		uint_least32_t *cp;
     74 		size_t cplen;
     75 	} upper, lower, title;
     76 } *sc = NULL;
     77 
     78 static size_t sclen = 0;
     79 
     80 static int
     81 unicodedata_callback(const char *file, char **field, size_t nfields,
     82                      char *comment, void *payload)
     83 {
     84 	uint_least32_t cp, upper, lower, title;
     85 
     86 	(void)file;
     87 	(void)comment;
     88 	(void)payload;
     89 
     90 	hextocp(field[0], strlen(field[0]), &cp);
     91 
     92 	upper = lower = title = cp;
     93 
     94 	if ((strlen(field[12]) > 0 &&
     95 	     hextocp(field[12], strlen(field[12]), &upper)) ||
     96 	    (strlen(field[13]) > 0 &&
     97 	     hextocp(field[13], strlen(field[13]), &lower)) ||
     98 	    (nfields >= 15 && strlen(field[14]) > 0 &&
     99 	     hextocp(field[14], strlen(field[14]), &title))) {
    100 		return 1;
    101 	}
    102 
    103 	prop_upper[cp].property = (int_least32_t)upper - (int_least32_t)cp;
    104 	prop_lower[cp].property = (int_least32_t)lower - (int_least32_t)cp;
    105 	prop_title[cp].property = (int_least32_t)title - (int_least32_t)cp;
    106 
    107 	return 0;
    108 }
    109 
    110 static int
    111 specialcasing_callback(const char *file, char **field, size_t nfields,
    112                        char *comment, void *payload)
    113 {
    114 	uint_least32_t cp;
    115 
    116 	(void)file;
    117 	(void)comment;
    118 	(void)payload;
    119 
    120 	if (nfields > 4 && strlen(field[4]) > 0) {
    121 		/*
    122 		 * we have more than 4 fields, i.e. the rule has a
    123 		 * condition (language-sensitive, etc.) and is discarded
    124 		 */
    125 		return 0;
    126 	}
    127 
    128 	/* parse affected codepoint */
    129 	hextocp(field[0], strlen(field[0]), &cp);
    130 
    131 	/* extend special case array */
    132 	if (!(sc = realloc(sc, (++sclen) * sizeof(*sc)))) {
    133 		fprintf(stderr, "realloc: %s\n", strerror(errno));
    134 		exit(1);
    135 	}
    136 
    137 	/* parse field data */
    138 	parse_cp_list(field[3], &(sc[sclen - 1].upper.cp),
    139 	              &(sc[sclen - 1].upper.cplen));
    140 	parse_cp_list(field[1], &(sc[sclen - 1].lower.cp),
    141 	              &(sc[sclen - 1].lower.cplen));
    142 	parse_cp_list(field[2], &(sc[sclen - 1].title.cp),
    143 	              &(sc[sclen - 1].title.cplen));
    144 
    145 	/*
    146 	 * overwrite value in "single mapping" property table by the
    147 	 * special value 0x110000 + (offset in special case array),
    148 	 * even if the special case has length 1
    149 	 */
    150 	prop_upper[cp].property =
    151 		(int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
    152 	prop_lower[cp].property =
    153 		(int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
    154 	prop_title[cp].property =
    155 		(int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
    156 
    157 	return 0;
    158 }
    159 
    160 static int_least64_t
    161 get_value(const struct properties *prop, size_t offset)
    162 {
    163 	return prop[offset].property;
    164 }
    165 
    166 int
    167 main(int argc, char *argv[])
    168 {
    169 	struct properties_compressed comp_upper, comp_lower, comp_title;
    170 	struct properties_major_minor mm_upper, mm_lower, mm_title;
    171 	size_t i, j;
    172 
    173 	(void)argc;
    174 
    175 	/* generate case property table from the specification */
    176 	properties_generate_break_property(case_property, LEN(case_property),
    177 	                                   NULL, handle_conflict, NULL, "case",
    178 	                                   argv[0]);
    179 
    180 	/*
    181 	 * allocate property buffers for all 0x110000 codepoints
    182 	 *
    183 	 * the buffers contain the offset from the "base" character
    184 	 * to the respective case mapping. By callocing we set all fields
    185 	 * to zero, which is also the Unicode "default" in the sense that
    186 	 * there is no case mapping by default (unless we fill it in)
    187 	 */
    188 	if (!(prop_upper = calloc(UINT32_C(0x110000), sizeof(*prop_upper))) ||
    189 	    !(prop_lower = calloc(UINT32_C(0x110000), sizeof(*prop_lower))) ||
    190 	    !(prop_title = calloc(UINT32_C(0x110000), sizeof(*prop_title)))) {
    191 		fprintf(stderr, "calloc: %s\n", strerror(errno));
    192 		exit(1);
    193 	}
    194 	parse_file_with_callback("data/UnicodeData.txt", unicodedata_callback,
    195 	                         NULL);
    196 	parse_file_with_callback("data/SpecialCasing.txt",
    197 	                         specialcasing_callback, NULL);
    198 
    199 	/* compress properties */
    200 	properties_compress(prop_upper, &comp_upper);
    201 	properties_compress(prop_lower, &comp_lower);
    202 	properties_compress(prop_title, &comp_title);
    203 
    204 	fprintf(stderr,
    205 	        "%s: LUT compression-ratios: upper=%.2f%%, lower=%.2f%%, "
    206 	        "title=%.2f%%\n",
    207 	        argv[0], properties_get_major_minor(&comp_upper, &mm_upper),
    208 	        properties_get_major_minor(&comp_lower, &mm_lower),
    209 	        properties_get_major_minor(&comp_title, &mm_title));
    210 
    211 	/* print tables */
    212 	printf("/* Automatically generated by %s */\n#include "
    213 	       "<stdint.h>\n#include <stddef.h>\n\n",
    214 	       argv[0]);
    215 
    216 	printf("struct special_case {\n\tuint_least32_t *cp;\n\tsize_t "
    217 	       "cplen;\n};\n\n");
    218 
    219 	properties_print_lookup_table("upper_major", mm_upper.major, 0x1100);
    220 	printf("\n");
    221 	properties_print_derived_lookup_table("upper_minor", mm_upper.minor,
    222 	                                      mm_upper.minorlen, get_value,
    223 	                                      comp_upper.data);
    224 	printf("\n");
    225 	properties_print_lookup_table("lower_major", mm_lower.major, 0x1100);
    226 	printf("\n");
    227 	properties_print_derived_lookup_table("lower_minor", mm_lower.minor,
    228 	                                      mm_lower.minorlen, get_value,
    229 	                                      comp_lower.data);
    230 	printf("\n");
    231 	properties_print_lookup_table("title_major", mm_title.major, 0x1100);
    232 	printf("\n");
    233 	properties_print_derived_lookup_table("title_minor", mm_title.minor,
    234 	                                      mm_title.minorlen, get_value,
    235 	                                      comp_title.data);
    236 	printf("\n");
    237 
    238 	printf("static const struct special_case upper_special[] = {\n");
    239 	for (i = 0; i < sclen; i++) {
    240 		printf("\t{\n");
    241 
    242 		printf("\t\t.cp     = (uint_least32_t[]){");
    243 		for (j = 0; j < sc[i].upper.cplen; j++) {
    244 			printf(" UINT32_C(0x%06X)", sc[i].upper.cp[j]);
    245 			if (j + 1 < sc[i].upper.cplen) {
    246 				putchar(',');
    247 			}
    248 		}
    249 		printf(" },\n");
    250 		printf("\t\t.cplen  = %zu,\n", sc[i].upper.cplen);
    251 		printf("\t},\n");
    252 	}
    253 	printf("};\n\n");
    254 
    255 	printf("static const struct special_case lower_special[] = {\n");
    256 	for (i = 0; i < sclen; i++) {
    257 		printf("\t{\n");
    258 
    259 		printf("\t\t.cp     = (uint_least32_t[]){");
    260 		for (j = 0; j < sc[i].lower.cplen; j++) {
    261 			printf(" UINT32_C(0x%06X)", sc[i].lower.cp[j]);
    262 			if (j + 1 < sc[i].lower.cplen) {
    263 				putchar(',');
    264 			}
    265 		}
    266 		printf(" },\n");
    267 		printf("\t\t.cplen  = %zu,\n", sc[i].lower.cplen);
    268 		printf("\t},\n");
    269 	}
    270 	printf("};\n\n");
    271 
    272 	printf("static const struct special_case title_special[] = {\n");
    273 	for (i = 0; i < sclen; i++) {
    274 		printf("\t{\n");
    275 
    276 		printf("\t\t.cp     = (uint_least32_t[]){");
    277 		for (j = 0; j < sc[i].title.cplen; j++) {
    278 			printf(" UINT32_C(0x%06X)", sc[i].title.cp[j]);
    279 			if (j + 1 < sc[i].title.cplen) {
    280 				putchar(',');
    281 			}
    282 		}
    283 		printf(" },\n");
    284 		printf("\t\t.cplen  = %zu,\n", sc[i].title.cplen);
    285 		printf("\t},\n");
    286 	}
    287 	printf("};\n\n");
    288 
    289 	free(comp_lower.data);
    290 	free(comp_lower.offset);
    291 	free(comp_title.data);
    292 	free(comp_title.offset);
    293 	free(comp_upper.data);
    294 	free(comp_upper.offset);
    295 	free(mm_lower.major);
    296 	free(mm_lower.minor);
    297 	free(mm_title.major);
    298 	free(mm_title.minor);
    299 	free(mm_upper.major);
    300 	free(mm_upper.minor);
    301 
    302 	return 0;
    303 }