libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

line.c (11273B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <stdio.h>
      3 #include <stdlib.h>
      4 #include <string.h>
      5 
      6 #include "util.h"
      7 
      8 #define FILE_EAW   "data/EastAsianWidth.txt"
      9 #define FILE_EMOJI "data/emoji-data.txt"
     10 #define FILE_LINE  "data/LineBreak.txt"
     11 
     12 static const struct property_spec line_break_property[] = {
     13 	{
     14 		.enumname = "AL",
     15 		.file = FILE_LINE,
     16 		.ucdname = "AL",
     17 	},
     18 	/*
     19 	 * Both extended pictographic and cn are large classes,
     20 	 * but we are only interested in their intersection for LB30b,
     21 	 * so we have the following two temporary classes. At first
     22 	 * the extpict-class is filled, then the cn-class, which leads
     23 	 * to conflicts (that we handle by putting them in the "proper"
     24 	 * class BOTH_CN_EXTPICT). We make use of the fact that there
     25 	 * is no intersection between AL and Cn.
     26 	 *
     27 	 * Any consecutive conflicts are permitted to overwrite
     28 	 * TMP_EXTENDED_PICTOGRAPHIC and TMP_CN, because we don't need
     29 	 * them, and in the final postprocessing we "reset" all
     30 	 * remaining matches (that then didn't fit any of the other
     31 	 * classes) to the generic class AL.
     32 	 */
     33 	{
     34 		.enumname = "TMP_CN",
     35 		.file = FILE_LINE,
     36 		.ucdname = "Cn",
     37 	},
     38 	{
     39 		.enumname = "TMP_EXTENDED_PICTOGRAPHIC",
     40 		.file = FILE_EMOJI,
     41 		.ucdname = "Extended_Pictographic",
     42 	},
     43 	/* end of special block */
     44 	{
     45 		.enumname = "B2",
     46 		.file = FILE_LINE,
     47 		.ucdname = "B2",
     48 	},
     49 	{
     50 		.enumname = "BA",
     51 		.file = FILE_LINE,
     52 		.ucdname = "BA",
     53 	},
     54 	{
     55 		.enumname = "BB",
     56 		.file = FILE_LINE,
     57 		.ucdname = "BB",
     58 	},
     59 	{
     60 		.enumname = "BK",
     61 		.file = FILE_LINE,
     62 		.ucdname = "BK",
     63 	},
     64 	{
     65 		.enumname = "BOTH_CN_EXTPICT",
     66 		.file = NULL,
     67 		.ucdname = NULL,
     68 	},
     69 	{
     70 		.enumname = "CB",
     71 		.file = FILE_LINE,
     72 		.ucdname = "CB",
     73 	},
     74 	{
     75 		.enumname = "CL",
     76 		.file = FILE_LINE,
     77 		.ucdname = "CL",
     78 	},
     79 	{
     80 		.enumname = "CM",
     81 		.file = FILE_LINE,
     82 		.ucdname = "CM",
     83 	},
     84 	{
     85 		.enumname = "CP_WITHOUT_EAW_HWF",
     86 		.file = FILE_LINE,
     87 		.ucdname = "CP",
     88 	},
     89 	{
     90 		.enumname = "CP_WITH_EAW_HWF",
     91 		.file = NULL,
     92 		.ucdname = NULL,
     93 	},
     94 	{
     95 		.enumname = "CR",
     96 		.file = FILE_LINE,
     97 		.ucdname = "CR",
     98 	},
     99 	{
    100 		.enumname = "EB",
    101 		.file = FILE_LINE,
    102 		.ucdname = "EB",
    103 	},
    104 	{
    105 		.enumname = "EM",
    106 		.file = FILE_LINE,
    107 		.ucdname = "EM",
    108 	},
    109 	{
    110 		.enumname = "EX",
    111 		.file = FILE_LINE,
    112 		.ucdname = "EX",
    113 	},
    114 	{
    115 		.enumname = "GL",
    116 		.file = FILE_LINE,
    117 		.ucdname = "GL",
    118 	},
    119 	{
    120 		.enumname = "H2",
    121 		.file = FILE_LINE,
    122 		.ucdname = "H2",
    123 	},
    124 	{
    125 		.enumname = "H3",
    126 		.file = FILE_LINE,
    127 		.ucdname = "H3",
    128 	},
    129 	{
    130 		.enumname = "HL",
    131 		.file = FILE_LINE,
    132 		.ucdname = "HL",
    133 	},
    134 	{
    135 		.enumname = "HY",
    136 		.file = FILE_LINE,
    137 		.ucdname = "HY",
    138 	},
    139 	{
    140 		.enumname = "ID",
    141 		.file = FILE_LINE,
    142 		.ucdname = "ID",
    143 	},
    144 	{
    145 		.enumname = "IN",
    146 		.file = FILE_LINE,
    147 		.ucdname = "IN",
    148 	},
    149 	{
    150 		.enumname = "IS",
    151 		.file = FILE_LINE,
    152 		.ucdname = "IS",
    153 	},
    154 	{
    155 		.enumname = "JL",
    156 		.file = FILE_LINE,
    157 		.ucdname = "JL",
    158 	},
    159 	{
    160 		.enumname = "JT",
    161 		.file = FILE_LINE,
    162 		.ucdname = "JT",
    163 	},
    164 	{
    165 		.enumname = "JV",
    166 		.file = FILE_LINE,
    167 		.ucdname = "JV",
    168 	},
    169 	{
    170 		.enumname = "LF",
    171 		.file = FILE_LINE,
    172 		.ucdname = "LF",
    173 	},
    174 	{
    175 		.enumname = "NL",
    176 		.file = FILE_LINE,
    177 		.ucdname = "NL",
    178 	},
    179 	{
    180 		.enumname = "NS",
    181 		.file = FILE_LINE,
    182 		.ucdname = "NS",
    183 	},
    184 	{
    185 		.enumname = "NU",
    186 		.file = FILE_LINE,
    187 		.ucdname = "NU",
    188 	},
    189 	{
    190 		.enumname = "OP_WITHOUT_EAW_HWF",
    191 		.file = FILE_LINE,
    192 		.ucdname = "OP",
    193 	},
    194 	{
    195 		.enumname = "OP_WITH_EAW_HWF",
    196 		.file = NULL,
    197 		.ucdname = NULL,
    198 	},
    199 	{
    200 		.enumname = "PO",
    201 		.file = FILE_LINE,
    202 		.ucdname = "PO",
    203 	},
    204 	{
    205 		.enumname = "PR",
    206 		.file = FILE_LINE,
    207 		.ucdname = "PR",
    208 	},
    209 	{
    210 		.enumname = "QU",
    211 		.file = FILE_LINE,
    212 		.ucdname = "QU",
    213 	},
    214 	{
    215 		.enumname = "RI",
    216 		.file = FILE_LINE,
    217 		.ucdname = "RI",
    218 	},
    219 	{
    220 		.enumname = "SP",
    221 		.file = FILE_LINE,
    222 		.ucdname = "SP",
    223 	},
    224 	{
    225 		.enumname = "SY",
    226 		.file = FILE_LINE,
    227 		.ucdname = "SY",
    228 	},
    229 	{
    230 		.enumname = "WJ",
    231 		.file = FILE_LINE,
    232 		.ucdname = "WJ",
    233 	},
    234 	{
    235 		.enumname = "ZW",
    236 		.file = FILE_LINE,
    237 		.ucdname = "ZW",
    238 	},
    239 	{
    240 		.enumname = "ZWJ",
    241 		.file = FILE_LINE,
    242 		.ucdname = "ZWJ",
    243 	},
    244 	{
    245 		.enumname = "TMP_AI",
    246 		.file = FILE_LINE,
    247 		.ucdname = "AI",
    248 	},
    249 	{
    250 		.enumname = "TMP_CJ",
    251 		.file = FILE_LINE,
    252 		.ucdname = "CJ",
    253 	},
    254 	{
    255 		.enumname = "TMP_XX",
    256 		.file = NULL,
    257 		.ucdname = NULL,
    258 	},
    259 	{
    260 		.enumname = "TMP_MN",
    261 		.file = FILE_LINE,
    262 		.ucdname = "Mn",
    263 	},
    264 	{
    265 		.enumname = "TMP_MC",
    266 		.file = FILE_LINE,
    267 		.ucdname = "Mc",
    268 	},
    269 	{
    270 		.enumname = "TMP_SA_WITHOUT_MN_OR_MC",
    271 		.file = FILE_LINE,
    272 		.ucdname = "SA",
    273 	},
    274 	{
    275 		.enumname = "TMP_SA_WITH_MN_OR_MC",
    276 		.file = FILE_LINE,
    277 		.ucdname = "SA",
    278 	},
    279 	{
    280 		.enumname = "TMP_SG",
    281 		.file = FILE_LINE,
    282 		.ucdname = "SG",
    283 	},
    284 	{
    285 		.enumname = "TMP_EAW_H",
    286 		.file = FILE_EAW,
    287 		.ucdname = "H",
    288 	},
    289 	{
    290 		.enumname = "TMP_EAW_W",
    291 		.file = FILE_EAW,
    292 		.ucdname = "W",
    293 	},
    294 	{
    295 		.enumname = "TMP_EAW_F",
    296 		.file = FILE_EAW,
    297 		.ucdname = "F",
    298 	},
    299 };
    300 
    301 static uint_least8_t
    302 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
    303 {
    304 	uint_least8_t result = prop2;
    305 	char *target = NULL;
    306 
    307 	(void)cp;
    308 
    309 	if ((!strcmp(line_break_property[prop1].enumname, "TMP_EAW_H") ||
    310 	     !strcmp(line_break_property[prop1].enumname, "TMP_EAW_W") ||
    311 	     !strcmp(line_break_property[prop1].enumname, "TMP_EAW_F")) ||
    312 	    (!strcmp(line_break_property[prop2].enumname, "TMP_EAW_H") ||
    313 	     !strcmp(line_break_property[prop2].enumname, "TMP_EAW_W") ||
    314 	     !strcmp(line_break_property[prop2].enumname, "TMP_EAW_F"))) {
    315 		if (!strcmp(line_break_property[prop1].enumname,
    316 		            "CP_WITHOUT_EAW_HWF") ||
    317 		    !strcmp(line_break_property[prop2].enumname,
    318 		            "CP_WITHOUT_EAW_HWF")) {
    319 			target = "CP_WITH_EAW_HWF";
    320 		} else if (!strcmp(line_break_property[prop1].enumname,
    321 		                   "OP_WITHOUT_EAW_HWF") ||
    322 		           !strcmp(line_break_property[prop2].enumname,
    323 		                   "OP_WITHOUT_EAW_HWF")) {
    324 			target = "OP_WITH_EAW_HWF";
    325 		} else {
    326 			/* ignore EAW for the rest */
    327 			if ((!strcmp(line_break_property[prop1].enumname,
    328 			             "TMP_EAW_H") ||
    329 			     !strcmp(line_break_property[prop1].enumname,
    330 			             "TMP_EAW_W") ||
    331 			     !strcmp(line_break_property[prop1].enumname,
    332 			             "TMP_EAW_F"))) {
    333 				result = prop2;
    334 			} else {
    335 				result = prop1;
    336 			}
    337 		}
    338 	} else if ((!strcmp(line_break_property[prop1].enumname, "TMP_MN") ||
    339 	            !strcmp(line_break_property[prop1].enumname, "TMP_MC")) ||
    340 	           (!strcmp(line_break_property[prop2].enumname, "TMP_MN") ||
    341 	            !strcmp(line_break_property[prop2].enumname, "TMP_MC"))) {
    342 		if (!strcmp(line_break_property[prop1].enumname,
    343 		            "SA_WITHOUT_MN_OR_MC") ||
    344 		    !strcmp(line_break_property[prop2].enumname,
    345 		            "SA_WITHOUT_MN_OR_MC")) {
    346 			target = "SA_WITH_MN_OR_MC";
    347 		} else {
    348 			/* ignore Mn and Mc for the rest */
    349 			if ((!strcmp(line_break_property[prop1].enumname,
    350 			             "TMP_MN") ||
    351 			     !strcmp(line_break_property[prop1].enumname,
    352 			             "TMP_MC"))) {
    353 				result = prop2;
    354 			} else {
    355 				result = prop1;
    356 			}
    357 		}
    358 	} else if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") ||
    359 	           !strcmp(line_break_property[prop2].enumname, "TMP_CN")) {
    360 		if (!strcmp(line_break_property[prop1].enumname,
    361 		            "TMP_EXTENDED_PICTOGRAPHIC") ||
    362 		    !strcmp(line_break_property[prop2].enumname,
    363 		            "TMP_EXTENDED_PICTOGRAPHIC")) {
    364 			target = "BOTH_CN_EXTPICT";
    365 		} else {
    366 			/* ignore Cn for all the other properties */
    367 			if (!strcmp(line_break_property[prop1].enumname,
    368 			            "TMP_CN")) {
    369 				result = prop2;
    370 			} else {
    371 				result = prop1;
    372 			}
    373 		}
    374 	} else if (!strcmp(line_break_property[prop1].enumname,
    375 	                   "TMP_EXTENDED_PICTOGRAPHIC") ||
    376 	           !strcmp(line_break_property[prop2].enumname,
    377 	                   "TMP_EXTENDED_PICTOGRAPHIC")) {
    378 		if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") ||
    379 		    !strcmp(line_break_property[prop2].enumname, "TMP_CN")) {
    380 			target = "BOTH_CN_EXTPICT";
    381 		} else {
    382 			/* ignore Extended_Pictographic for all the other
    383 			 * properties */
    384 			if (!strcmp(line_break_property[prop1].enumname,
    385 			            "TMP_EXTENDED_PICTOGRAPHIC")) {
    386 				result = prop2;
    387 			} else {
    388 				result = prop1;
    389 			}
    390 		}
    391 	} else {
    392 		fprintf(stderr,
    393 		        "handle_conflict: Cannot handle conflict %s <- %s.\n",
    394 		        line_break_property[prop1].enumname,
    395 		        line_break_property[prop2].enumname);
    396 		exit(1);
    397 	}
    398 
    399 	if (target) {
    400 		for (result = 0; result < LEN(line_break_property); result++) {
    401 			if (!strcmp(line_break_property[result].enumname,
    402 			            target)) {
    403 				break;
    404 			}
    405 		}
    406 		if (result == LEN(line_break_property)) {
    407 			fprintf(stderr, "handle_conflict: Internal error.\n");
    408 			exit(1);
    409 		}
    410 	}
    411 
    412 	return result;
    413 }
    414 
    415 static void
    416 post_process(struct properties *prop)
    417 {
    418 	const char *target;
    419 	uint_least8_t result;
    420 	size_t i;
    421 
    422 	/* post-mapping according to the line breaking algorithm */
    423 	for (i = 0; i < UINT32_C(0x110000); i++) {
    424 		/* LB1 */
    425 		if (!strcmp(line_break_property[prop[i].property].enumname,
    426 		            "TMP_AI") ||
    427 		    !strcmp(line_break_property[prop[i].property].enumname,
    428 		            "TMP_SG") ||
    429 		    !strcmp(line_break_property[prop[i].property].enumname,
    430 		            "TMP_XX")) {
    431 			/* map AI, SG and XX to AL */
    432 			target = "AL";
    433 		} else if (!strcmp(line_break_property[prop[i].property]
    434 		                           .enumname,
    435 		                   "TMP_SA_WITH_MN_OR_MC")) {
    436 			/* map SA (with General_Category Mn or Mc) to CM */
    437 			target = "CM";
    438 		} else if (!strcmp(line_break_property[prop[i].property]
    439 		                           .enumname,
    440 		                   "TMP_SA_WITHOUT_MN_OR_MC")) {
    441 			/* map SA (without General_Category Mn or Mc) to AL */
    442 			target = "AL";
    443 		} else if (!strcmp(line_break_property[prop[i].property]
    444 		                           .enumname,
    445 		                   "TMP_CJ")) {
    446 			/* map CJ to NS */
    447 			target = "NS";
    448 		} else if (
    449 			!strcmp(line_break_property[prop[i].property].enumname,
    450 		                "TMP_CN") ||
    451 			!strcmp(line_break_property[prop[i].property].enumname,
    452 		                "TMP_EXTENDED_PICTOGRAPHIC") ||
    453 			!strcmp(line_break_property[prop[i].property].enumname,
    454 		                "TMP_MN") ||
    455 			!strcmp(line_break_property[prop[i].property].enumname,
    456 		                "TMP_MC") ||
    457 			!strcmp(line_break_property[prop[i].property].enumname,
    458 		                "TMP_EAW_H") ||
    459 			!strcmp(line_break_property[prop[i].property].enumname,
    460 		                "TMP_EAW_W") ||
    461 			!strcmp(line_break_property[prop[i].property].enumname,
    462 		                "TMP_EAW_F")) {
    463 			/* map all the temporary classes "residue" to AL */
    464 			target = "AL";
    465 		} else {
    466 			target = NULL;
    467 		}
    468 
    469 		if (target) {
    470 			for (result = 0; result < LEN(line_break_property);
    471 			     result++) {
    472 				if (!strcmp(line_break_property[result]
    473 				                    .enumname,
    474 				            target)) {
    475 					break;
    476 				}
    477 			}
    478 			if (result == LEN(line_break_property)) {
    479 				fprintf(stderr,
    480 				        "handle_conflict: Internal error.\n");
    481 				exit(1);
    482 			}
    483 
    484 			prop[i].property = result;
    485 		}
    486 	}
    487 }
    488 
    489 int
    490 main(int argc, char *argv[])
    491 {
    492 	(void)argc;
    493 
    494 	properties_generate_break_property(
    495 		line_break_property, LEN(line_break_property), NULL,
    496 		handle_conflict, post_process, "line_break", argv[0]);
    497 
    498 	return 0;
    499 }