libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

line.c (14397B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <stdbool.h>
      3 #include <stddef.h>
      4 
      5 #include "../gen/line.h"
      6 #include "../grapheme.h"
      7 #include "util.h"
      8 
      9 static inline enum line_break_property
     10 get_break_prop(uint_least32_t cp)
     11 {
     12 	if (likely(cp <= UINT32_C(0x10FFFF))) {
     13 		return (enum line_break_property)
     14 			line_break_minor[line_break_major[cp >> 8] +
     15 		                         (cp & 0xff)];
     16 	} else {
     17 		return LINE_BREAK_PROP_AL;
     18 	}
     19 }
     20 
     21 static size_t
     22 next_line_break(HERODOTUS_READER *r)
     23 {
     24 	HERODOTUS_READER tmp;
     25 	enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_prop,
     26 		last_non_sp_prop, last_non_sp_cm_or_zwj_prop;
     27 	uint_least32_t cp;
     28 	uint_least8_t lb25_level = 0;
     29 	bool lb21a_flag = false, ri_even = true;
     30 
     31 	/*
     32 	 * Apply line breaking algorithm (UAX #14), see
     33 	 * https://unicode.org/reports/tr14/#Algorithm and tailoring
     34 	 * https://unicode.org/reports/tr14/#Examples (example 7),
     35 	 * given the automatic test-cases implement this example for
     36 	 * better number handling.
     37 	 *
     38 	 */
     39 
     40 	/*
     41 	 * Initialize the different properties such that we have
     42 	 * a good state after the state-update in the loop
     43 	 */
     44 	last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */
     45 	last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_PROPS;
     46 
     47 	for (herodotus_read_codepoint(r, true, &cp),
     48 	     cp0_prop = get_break_prop(cp);
     49 	     herodotus_read_codepoint(r, false, &cp) ==
     50 	     HERODOTUS_STATUS_SUCCESS;
     51 	     herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop) {
     52 		/* get property of the right codepoint */
     53 		cp1_prop = get_break_prop(cp);
     54 
     55 		/* update retention-states */
     56 
     57 		/*
     58 		 * store the last observed non-CM-or-ZWJ-property for
     59 		 * LB9 and following.
     60 		 */
     61 		if (cp0_prop != LINE_BREAK_PROP_CM &&
     62 		    cp0_prop != LINE_BREAK_PROP_ZWJ) {
     63 			/*
     64 			 * check if the property we are overwriting now is an
     65 			 * HL. If so, we set the LB21a-flag which depends on
     66 			 * this knowledge.
     67 			 */
     68 			lb21a_flag =
     69 				(last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL);
     70 
     71 			/* check regional indicator state */
     72 			if (cp0_prop == LINE_BREAK_PROP_RI) {
     73 				/*
     74 				 * The property we just shifted in is
     75 				 * a regional indicator, increasing the
     76 				 * number of consecutive RIs on the left
     77 				 * side of the breakpoint by one, changing
     78 				 * the oddness.
     79 				 *
     80 				 */
     81 				ri_even = !ri_even;
     82 			} else {
     83 				/*
     84 				 * We saw no regional indicator, so the
     85 				 * number of consecutive RIs on the left
     86 				 * side of the breakpoint is zero, which
     87 				 * is an even number.
     88 				 *
     89 				 */
     90 				ri_even = true;
     91 			}
     92 
     93 			/*
     94 			 * Here comes a bit of magic. The tailored rule
     95 			 * LB25 (using example 7) has a very complicated
     96 			 * left-hand-side-rule of the form
     97 			 *
     98 			 *  NU (NU | SY | IS)* (CL | CP)?
     99 			 *
    100 			 * but instead of backtracking, we keep the state
    101 			 * as some kind of "power level" in the variable
    102 			 *
    103 			 *  lb25_level
    104 			 *
    105 			 * that goes from 0 to 3
    106 			 *
    107 			 *  0: we are not in the sequence
    108 			 *  1: we have one NU to the left of the middle
    109 			 *     spot
    110 			 *  2: we have one NU and one or more (NU | SY | IS)
    111 			 *     to the left of the middle spot
    112 			 *  3: we have one NU, zero or more (NU | SY | IS)
    113 			 *     and one (CL | CP) to the left of the middle
    114 			 *     spot
    115 			 */
    116 			if ((lb25_level == 0 || lb25_level == 1) &&
    117 			    cp0_prop == LINE_BREAK_PROP_NU) {
    118 				/* sequence has begun */
    119 				lb25_level = 1;
    120 			} else if ((lb25_level == 1 || lb25_level == 2) &&
    121 			           (cp0_prop == LINE_BREAK_PROP_NU ||
    122 			            cp0_prop == LINE_BREAK_PROP_SY ||
    123 			            cp0_prop == LINE_BREAK_PROP_IS)) {
    124 				/* (NU | SY | IS) sequence begins or continued
    125 				 */
    126 				lb25_level = 2;
    127 			} else if (
    128 				(lb25_level == 1 || lb25_level == 2) &&
    129 				(cp0_prop == LINE_BREAK_PROP_CL ||
    130 			         cp0_prop ==
    131 			                 LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
    132 			         cp0_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
    133 				/* CL or CP at the end of the sequence */
    134 				lb25_level = 3;
    135 			} else {
    136 				/* sequence broke */
    137 				lb25_level = 0;
    138 			}
    139 
    140 			last_non_cm_or_zwj_prop = cp0_prop;
    141 		}
    142 
    143 		/*
    144 		 * store the last observed non-SP-property for LB8, LB14,
    145 		 * LB15, LB16 and LB17. LB8 gets its own unskipped property,
    146 		 * whereas the others build on top of the CM-ZWJ-skipped
    147 		 * properties as they come after LB9
    148 		 */
    149 		if (cp0_prop != LINE_BREAK_PROP_SP) {
    150 			last_non_sp_prop = cp0_prop;
    151 		}
    152 		if (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP) {
    153 			last_non_sp_cm_or_zwj_prop = last_non_cm_or_zwj_prop;
    154 		}
    155 
    156 		/* apply the algorithm */
    157 
    158 		/* LB4 */
    159 		if (cp0_prop == LINE_BREAK_PROP_BK) {
    160 			break;
    161 		}
    162 
    163 		/* LB5 */
    164 		if (cp0_prop == LINE_BREAK_PROP_CR &&
    165 		    cp1_prop == LINE_BREAK_PROP_LF) {
    166 			continue;
    167 		}
    168 		if (cp0_prop == LINE_BREAK_PROP_CR ||
    169 		    cp0_prop == LINE_BREAK_PROP_LF ||
    170 		    cp0_prop == LINE_BREAK_PROP_NL) {
    171 			break;
    172 		}
    173 
    174 		/* LB6 */
    175 		if (cp1_prop == LINE_BREAK_PROP_BK ||
    176 		    cp1_prop == LINE_BREAK_PROP_CR ||
    177 		    cp1_prop == LINE_BREAK_PROP_LF ||
    178 		    cp1_prop == LINE_BREAK_PROP_NL) {
    179 			continue;
    180 		}
    181 
    182 		/* LB7 */
    183 		if (cp1_prop == LINE_BREAK_PROP_SP ||
    184 		    cp1_prop == LINE_BREAK_PROP_ZW) {
    185 			continue;
    186 		}
    187 
    188 		/* LB8 */
    189 		if (last_non_sp_prop == LINE_BREAK_PROP_ZW) {
    190 			break;
    191 		}
    192 
    193 		/* LB8a */
    194 		if (cp0_prop == LINE_BREAK_PROP_ZWJ) {
    195 			continue;
    196 		}
    197 
    198 		/* LB9 */
    199 		if ((cp0_prop != LINE_BREAK_PROP_BK &&
    200 		     cp0_prop != LINE_BREAK_PROP_CR &&
    201 		     cp0_prop != LINE_BREAK_PROP_LF &&
    202 		     cp0_prop != LINE_BREAK_PROP_NL &&
    203 		     cp0_prop != LINE_BREAK_PROP_SP &&
    204 		     cp0_prop != LINE_BREAK_PROP_ZW) &&
    205 		    (cp1_prop == LINE_BREAK_PROP_CM ||
    206 		     cp1_prop == LINE_BREAK_PROP_ZWJ)) {
    207 			/*
    208 			 * given we skip them, we don't break in such
    209 			 * a sequence
    210 			 */
    211 			continue;
    212 		}
    213 
    214 		/* LB10 is baked into the following rules */
    215 
    216 		/* LB11 */
    217 		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_WJ ||
    218 		    cp1_prop == LINE_BREAK_PROP_WJ) {
    219 			continue;
    220 		}
    221 
    222 		/* LB12 */
    223 		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_GL) {
    224 			continue;
    225 		}
    226 
    227 		/* LB12a */
    228 		if ((last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP &&
    229 		     last_non_cm_or_zwj_prop != LINE_BREAK_PROP_BA &&
    230 		     last_non_cm_or_zwj_prop != LINE_BREAK_PROP_HY) &&
    231 		    cp1_prop == LINE_BREAK_PROP_GL) {
    232 			continue;
    233 		}
    234 
    235 		/* LB13 (affected by tailoring for LB25, see example 7) */
    236 		if (cp1_prop == LINE_BREAK_PROP_EX ||
    237 		    (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_NU &&
    238 		     (cp1_prop == LINE_BREAK_PROP_CL ||
    239 		      cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
    240 		      cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF ||
    241 		      cp1_prop == LINE_BREAK_PROP_IS ||
    242 		      cp1_prop == LINE_BREAK_PROP_SY))) {
    243 			continue;
    244 		}
    245 
    246 		/* LB14 */
    247 		if (last_non_sp_cm_or_zwj_prop ==
    248 		            LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
    249 		    last_non_sp_cm_or_zwj_prop ==
    250 		            LINE_BREAK_PROP_OP_WITH_EAW_HWF) {
    251 			continue;
    252 		}
    253 
    254 		/* LB15 */
    255 		if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_QU &&
    256 		    (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
    257 		     cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF)) {
    258 			continue;
    259 		}
    260 
    261 		/* LB16 */
    262 		if ((last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CL ||
    263 		     last_non_sp_cm_or_zwj_prop ==
    264 		             LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
    265 		     last_non_sp_cm_or_zwj_prop ==
    266 		             LINE_BREAK_PROP_CP_WITH_EAW_HWF) &&
    267 		    cp1_prop == LINE_BREAK_PROP_NS) {
    268 			continue;
    269 		}
    270 
    271 		/* LB17 */
    272 		if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_B2 &&
    273 		    cp1_prop == LINE_BREAK_PROP_B2) {
    274 			continue;
    275 		}
    276 
    277 		/* LB18 */
    278 		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SP) {
    279 			break;
    280 		}
    281 
    282 		/* LB19 */
    283 		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_QU ||
    284 		    cp1_prop == LINE_BREAK_PROP_QU) {
    285 			continue;
    286 		}
    287 
    288 		/* LB20 */
    289 		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CB ||
    290 		    cp1_prop == LINE_BREAK_PROP_CB) {
    291 			break;
    292 		}
    293 
    294 		/* LB21 */
    295 		if (cp1_prop == LINE_BREAK_PROP_BA ||
    296 		    cp1_prop == LINE_BREAK_PROP_HY ||
    297 		    cp1_prop == LINE_BREAK_PROP_NS ||
    298 		    last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BB) {
    299 			continue;
    300 		}
    301 
    302 		/* LB21a */
    303 		if (lb21a_flag &&
    304 		    (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY ||
    305 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BA)) {
    306 			continue;
    307 		}
    308 
    309 		/* LB21b */
    310 		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SY &&
    311 		    cp1_prop == LINE_BREAK_PROP_HL) {
    312 			continue;
    313 		}
    314 
    315 		/* LB22 */
    316 		if (cp1_prop == LINE_BREAK_PROP_IN) {
    317 			continue;
    318 		}
    319 
    320 		/* LB23 */
    321 		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
    322 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
    323 		    cp1_prop == LINE_BREAK_PROP_NU) {
    324 			continue;
    325 		}
    326 		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU &&
    327 		    (cp1_prop == LINE_BREAK_PROP_AL ||
    328 		     cp1_prop == LINE_BREAK_PROP_HL)) {
    329 			continue;
    330 		}
    331 
    332 		/* LB23a */
    333 		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
    334 		    (cp1_prop == LINE_BREAK_PROP_ID ||
    335 		     cp1_prop == LINE_BREAK_PROP_EB ||
    336 		     cp1_prop == LINE_BREAK_PROP_EM)) {
    337 			continue;
    338 		}
    339 		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_ID ||
    340 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB ||
    341 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EM) &&
    342 		    cp1_prop == LINE_BREAK_PROP_PO) {
    343 			continue;
    344 		}
    345 
    346 		/* LB24 */
    347 		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
    348 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO) &&
    349 		    (cp1_prop == LINE_BREAK_PROP_AL ||
    350 		     cp1_prop == LINE_BREAK_PROP_HL)) {
    351 			continue;
    352 		}
    353 		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
    354 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
    355 		    (cp1_prop == LINE_BREAK_PROP_PR ||
    356 		     cp1_prop == LINE_BREAK_PROP_PO)) {
    357 			continue;
    358 		}
    359 
    360 		/* LB25 (tailored with example 7) */
    361 		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
    362 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO)) {
    363 			if (cp1_prop == LINE_BREAK_PROP_NU) {
    364 				continue;
    365 			}
    366 
    367 			/* this stupid rule is the reason why we cannot
    368 			 * simply have a stateful break-detection between
    369 			 * two adjacent codepoints as we have it with
    370 			 * characters.
    371 			 */
    372 			herodotus_reader_copy(r, &tmp);
    373 			herodotus_read_codepoint(&tmp, true, &cp);
    374 			if (herodotus_read_codepoint(&tmp, true, &cp) ==
    375 			            HERODOTUS_STATUS_SUCCESS &&
    376 			    (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
    377 			     cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
    378 			     cp1_prop == LINE_BREAK_PROP_HY)) {
    379 				if (get_break_prop(cp) == LINE_BREAK_PROP_NU) {
    380 					continue;
    381 				}
    382 			}
    383 		}
    384 		if ((last_non_cm_or_zwj_prop ==
    385 		             LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
    386 		     last_non_cm_or_zwj_prop ==
    387 		             LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
    388 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY) &&
    389 		    cp1_prop == LINE_BREAK_PROP_NU) {
    390 			continue;
    391 		}
    392 		if (lb25_level == 1 && (cp1_prop == LINE_BREAK_PROP_NU ||
    393 		                        cp1_prop == LINE_BREAK_PROP_SY ||
    394 		                        cp1_prop == LINE_BREAK_PROP_IS)) {
    395 			continue;
    396 		}
    397 		if ((lb25_level == 1 || lb25_level == 2) &&
    398 		    (cp1_prop == LINE_BREAK_PROP_NU ||
    399 		     cp1_prop == LINE_BREAK_PROP_SY ||
    400 		     cp1_prop == LINE_BREAK_PROP_IS ||
    401 		     cp1_prop == LINE_BREAK_PROP_CL ||
    402 		     cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
    403 		     cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
    404 			continue;
    405 		}
    406 		if ((lb25_level == 1 || lb25_level == 2 || lb25_level == 3) &&
    407 		    (cp1_prop == LINE_BREAK_PROP_PO ||
    408 		     cp1_prop == LINE_BREAK_PROP_PR)) {
    409 			continue;
    410 		}
    411 
    412 		/* LB26 */
    413 		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL &&
    414 		    (cp1_prop == LINE_BREAK_PROP_JL ||
    415 		     cp1_prop == LINE_BREAK_PROP_JV ||
    416 		     cp1_prop == LINE_BREAK_PROP_H2 ||
    417 		     cp1_prop == LINE_BREAK_PROP_H3)) {
    418 			continue;
    419 		}
    420 		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
    421 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2) &&
    422 		    (cp1_prop == LINE_BREAK_PROP_JV ||
    423 		     cp1_prop == LINE_BREAK_PROP_JT)) {
    424 			continue;
    425 		}
    426 		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
    427 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
    428 		    cp1_prop == LINE_BREAK_PROP_JT) {
    429 			continue;
    430 		}
    431 
    432 		/* LB27 */
    433 		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL ||
    434 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
    435 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
    436 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2 ||
    437 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
    438 		    cp1_prop == LINE_BREAK_PROP_PO) {
    439 			continue;
    440 		}
    441 		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
    442 		    (cp1_prop == LINE_BREAK_PROP_JL ||
    443 		     cp1_prop == LINE_BREAK_PROP_JV ||
    444 		     cp1_prop == LINE_BREAK_PROP_JT ||
    445 		     cp1_prop == LINE_BREAK_PROP_H2 ||
    446 		     cp1_prop == LINE_BREAK_PROP_H3)) {
    447 			continue;
    448 		}
    449 
    450 		/* LB28 */
    451 		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
    452 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
    453 		    (cp1_prop == LINE_BREAK_PROP_AL ||
    454 		     cp1_prop == LINE_BREAK_PROP_HL)) {
    455 			continue;
    456 		}
    457 
    458 		/* LB29 */
    459 		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_IS &&
    460 		    (cp1_prop == LINE_BREAK_PROP_AL ||
    461 		     cp1_prop == LINE_BREAK_PROP_HL)) {
    462 			continue;
    463 		}
    464 
    465 		/* LB30 */
    466 		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
    467 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL ||
    468 		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU) &&
    469 		    cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF) {
    470 			continue;
    471 		}
    472 		if (last_non_cm_or_zwj_prop ==
    473 		            LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF &&
    474 		    (cp1_prop == LINE_BREAK_PROP_AL ||
    475 		     cp1_prop == LINE_BREAK_PROP_HL ||
    476 		     cp1_prop == LINE_BREAK_PROP_NU)) {
    477 			continue;
    478 		}
    479 
    480 		/* LB30a */
    481 		if (!ri_even && last_non_cm_or_zwj_prop == LINE_BREAK_PROP_RI &&
    482 		    cp1_prop == LINE_BREAK_PROP_RI) {
    483 			continue;
    484 		}
    485 
    486 		/* LB30b */
    487 		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB &&
    488 		    cp1_prop == LINE_BREAK_PROP_EM) {
    489 			continue;
    490 		}
    491 		if (last_non_cm_or_zwj_prop ==
    492 		            LINE_BREAK_PROP_BOTH_CN_EXTPICT &&
    493 		    cp1_prop == LINE_BREAK_PROP_EM) {
    494 			continue;
    495 		}
    496 
    497 		/* LB31 */
    498 		break;
    499 	}
    500 
    501 	return herodotus_reader_number_read(r);
    502 }
    503 
    504 size_t
    505 grapheme_next_line_break(const uint_least32_t *str, size_t len)
    506 {
    507 	HERODOTUS_READER r;
    508 
    509 	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
    510 
    511 	return next_line_break(&r);
    512 }
    513 
    514 size_t
    515 grapheme_next_line_break_utf8(const char *str, size_t len)
    516 {
    517 	HERODOTUS_READER r;
    518 
    519 	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
    520 
    521 	return next_line_break(&r);
    522 }