case.c (8442B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <errno.h> 3 #include <stdint.h> 4 #include <stdio.h> 5 #include <stdlib.h> 6 #include <string.h> 7 8 #include "util.h" 9 10 #define FILE_DCP "data/DerivedCoreProperties.txt" 11 12 static const struct property_spec case_property[] = { 13 { 14 .enumname = "OTHER", 15 .file = NULL, 16 .ucdname = NULL, 17 }, 18 { 19 .enumname = "BOTH_CASED_CASE_IGNORABLE", 20 .file = NULL, 21 .ucdname = NULL, 22 }, 23 { 24 .enumname = "CASED", 25 .file = FILE_DCP, 26 .ucdname = "Cased", 27 }, 28 { 29 .enumname = "CASE_IGNORABLE", 30 .file = FILE_DCP, 31 .ucdname = "Case_Ignorable", 32 }, 33 { 34 .enumname = "UNCASED", 35 .file = FILE_DCP, 36 .ucdname = "Uncased", 37 }, 38 }; 39 40 static uint_least8_t 41 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2) 42 { 43 uint_least8_t result; 44 45 (void)cp; 46 47 if ((!strcmp(case_property[prop1].enumname, "CASED") && 48 !strcmp(case_property[prop2].enumname, "CASE_IGNORABLE")) || 49 (!strcmp(case_property[prop1].enumname, "CASE_IGNORABLE") && 50 !strcmp(case_property[prop2].enumname, "CASED"))) { 51 for (result = 0; result < LEN(case_property); result++) { 52 if (!strcmp(case_property[result].enumname, 53 "BOTH_CASED_CASE_IGNORABLE")) { 54 break; 55 } 56 } 57 if (result == LEN(case_property)) { 58 fprintf(stderr, "handle_conflict: Internal error.\n"); 59 exit(1); 60 } 61 } else { 62 fprintf(stderr, "handle_conflict: Cannot handle conflict.\n"); 63 exit(1); 64 } 65 66 return result; 67 } 68 69 static struct properties *prop_upper = NULL, *prop_lower, *prop_title; 70 71 static struct special_case { 72 struct { 73 uint_least32_t *cp; 74 size_t cplen; 75 } upper, lower, title; 76 } *sc = NULL; 77 78 static size_t sclen = 0; 79 80 static int 81 unicodedata_callback(const char *file, char **field, size_t nfields, 82 char *comment, void *payload) 83 { 84 uint_least32_t cp, upper, lower, title; 85 86 (void)file; 87 (void)comment; 88 (void)payload; 89 90 hextocp(field[0], strlen(field[0]), &cp); 91 92 upper = lower = title = cp; 93 94 if ((strlen(field[12]) > 0 && 95 hextocp(field[12], strlen(field[12]), &upper)) || 96 (strlen(field[13]) > 0 && 97 hextocp(field[13], strlen(field[13]), &lower)) || 98 (nfields >= 15 && strlen(field[14]) > 0 && 99 hextocp(field[14], strlen(field[14]), &title))) { 100 return 1; 101 } 102 103 prop_upper[cp].property = (int_least32_t)upper - (int_least32_t)cp; 104 prop_lower[cp].property = (int_least32_t)lower - (int_least32_t)cp; 105 prop_title[cp].property = (int_least32_t)title - (int_least32_t)cp; 106 107 return 0; 108 } 109 110 static int 111 specialcasing_callback(const char *file, char **field, size_t nfields, 112 char *comment, void *payload) 113 { 114 uint_least32_t cp; 115 116 (void)file; 117 (void)comment; 118 (void)payload; 119 120 if (nfields > 4 && strlen(field[4]) > 0) { 121 /* 122 * we have more than 4 fields, i.e. the rule has a 123 * condition (language-sensitive, etc.) and is discarded 124 */ 125 return 0; 126 } 127 128 /* parse affected codepoint */ 129 hextocp(field[0], strlen(field[0]), &cp); 130 131 /* extend special case array */ 132 if (!(sc = realloc(sc, (++sclen) * sizeof(*sc)))) { 133 fprintf(stderr, "realloc: %s\n", strerror(errno)); 134 exit(1); 135 } 136 137 /* parse field data */ 138 parse_cp_list(field[3], &(sc[sclen - 1].upper.cp), 139 &(sc[sclen - 1].upper.cplen)); 140 parse_cp_list(field[1], &(sc[sclen - 1].lower.cp), 141 &(sc[sclen - 1].lower.cplen)); 142 parse_cp_list(field[2], &(sc[sclen - 1].title.cp), 143 &(sc[sclen - 1].title.cplen)); 144 145 /* 146 * overwrite value in "single mapping" property table by the 147 * special value 0x110000 + (offset in special case array), 148 * even if the special case has length 1 149 */ 150 prop_upper[cp].property = 151 (int_least64_t)(UINT32_C(0x110000) + (sclen - 1)); 152 prop_lower[cp].property = 153 (int_least64_t)(UINT32_C(0x110000) + (sclen - 1)); 154 prop_title[cp].property = 155 (int_least64_t)(UINT32_C(0x110000) + (sclen - 1)); 156 157 return 0; 158 } 159 160 static int_least64_t 161 get_value(const struct properties *prop, size_t offset) 162 { 163 return prop[offset].property; 164 } 165 166 int 167 main(int argc, char *argv[]) 168 { 169 struct properties_compressed comp_upper, comp_lower, comp_title; 170 struct properties_major_minor mm_upper, mm_lower, mm_title; 171 size_t i, j; 172 173 (void)argc; 174 175 /* generate case property table from the specification */ 176 properties_generate_break_property(case_property, LEN(case_property), 177 NULL, handle_conflict, NULL, "case", 178 argv[0]); 179 180 /* 181 * allocate property buffers for all 0x110000 codepoints 182 * 183 * the buffers contain the offset from the "base" character 184 * to the respective case mapping. By callocing we set all fields 185 * to zero, which is also the Unicode "default" in the sense that 186 * there is no case mapping by default (unless we fill it in) 187 */ 188 if (!(prop_upper = calloc(UINT32_C(0x110000), sizeof(*prop_upper))) || 189 !(prop_lower = calloc(UINT32_C(0x110000), sizeof(*prop_lower))) || 190 !(prop_title = calloc(UINT32_C(0x110000), sizeof(*prop_title)))) { 191 fprintf(stderr, "calloc: %s\n", strerror(errno)); 192 exit(1); 193 } 194 parse_file_with_callback("data/UnicodeData.txt", unicodedata_callback, 195 NULL); 196 parse_file_with_callback("data/SpecialCasing.txt", 197 specialcasing_callback, NULL); 198 199 /* compress properties */ 200 properties_compress(prop_upper, &comp_upper); 201 properties_compress(prop_lower, &comp_lower); 202 properties_compress(prop_title, &comp_title); 203 204 fprintf(stderr, 205 "%s: LUT compression-ratios: upper=%.2f%%, lower=%.2f%%, " 206 "title=%.2f%%\n", 207 argv[0], properties_get_major_minor(&comp_upper, &mm_upper), 208 properties_get_major_minor(&comp_lower, &mm_lower), 209 properties_get_major_minor(&comp_title, &mm_title)); 210 211 /* print tables */ 212 printf("/* Automatically generated by %s */\n#include " 213 "<stdint.h>\n#include <stddef.h>\n\n", 214 argv[0]); 215 216 printf("struct special_case {\n\tuint_least32_t *cp;\n\tsize_t " 217 "cplen;\n};\n\n"); 218 219 properties_print_lookup_table("upper_major", mm_upper.major, 0x1100); 220 printf("\n"); 221 properties_print_derived_lookup_table("upper_minor", mm_upper.minor, 222 mm_upper.minorlen, get_value, 223 comp_upper.data); 224 printf("\n"); 225 properties_print_lookup_table("lower_major", mm_lower.major, 0x1100); 226 printf("\n"); 227 properties_print_derived_lookup_table("lower_minor", mm_lower.minor, 228 mm_lower.minorlen, get_value, 229 comp_lower.data); 230 printf("\n"); 231 properties_print_lookup_table("title_major", mm_title.major, 0x1100); 232 printf("\n"); 233 properties_print_derived_lookup_table("title_minor", mm_title.minor, 234 mm_title.minorlen, get_value, 235 comp_title.data); 236 printf("\n"); 237 238 printf("static const struct special_case upper_special[] = {\n"); 239 for (i = 0; i < sclen; i++) { 240 printf("\t{\n"); 241 242 printf("\t\t.cp = (uint_least32_t[]){"); 243 for (j = 0; j < sc[i].upper.cplen; j++) { 244 printf(" UINT32_C(0x%06X)", sc[i].upper.cp[j]); 245 if (j + 1 < sc[i].upper.cplen) { 246 putchar(','); 247 } 248 } 249 printf(" },\n"); 250 printf("\t\t.cplen = %zu,\n", sc[i].upper.cplen); 251 printf("\t},\n"); 252 } 253 printf("};\n\n"); 254 255 printf("static const struct special_case lower_special[] = {\n"); 256 for (i = 0; i < sclen; i++) { 257 printf("\t{\n"); 258 259 printf("\t\t.cp = (uint_least32_t[]){"); 260 for (j = 0; j < sc[i].lower.cplen; j++) { 261 printf(" UINT32_C(0x%06X)", sc[i].lower.cp[j]); 262 if (j + 1 < sc[i].lower.cplen) { 263 putchar(','); 264 } 265 } 266 printf(" },\n"); 267 printf("\t\t.cplen = %zu,\n", sc[i].lower.cplen); 268 printf("\t},\n"); 269 } 270 printf("};\n\n"); 271 272 printf("static const struct special_case title_special[] = {\n"); 273 for (i = 0; i < sclen; i++) { 274 printf("\t{\n"); 275 276 printf("\t\t.cp = (uint_least32_t[]){"); 277 for (j = 0; j < sc[i].title.cplen; j++) { 278 printf(" UINT32_C(0x%06X)", sc[i].title.cp[j]); 279 if (j + 1 < sc[i].title.cplen) { 280 putchar(','); 281 } 282 } 283 printf(" },\n"); 284 printf("\t\t.cplen = %zu,\n", sc[i].title.cplen); 285 printf("\t},\n"); 286 } 287 printf("};\n\n"); 288 289 free(comp_lower.data); 290 free(comp_lower.offset); 291 free(comp_title.data); 292 free(comp_title.offset); 293 free(comp_upper.data); 294 free(comp_upper.offset); 295 free(mm_lower.major); 296 free(mm_lower.minor); 297 free(mm_title.major); 298 free(mm_title.minor); 299 free(mm_upper.major); 300 free(mm_upper.minor); 301 302 return 0; 303 }