line.c (14965B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <stdbool.h> 3 #include <stddef.h> 4 #include <stdlib.h> 5 #include <string.h> 6 7 #include "../gen/line.h" 8 #include "../grapheme.h" 9 #include "util.h" 10 11 static inline enum line_break_property 12 get_break_prop(uint_least32_t cp) 13 { 14 if (likely(cp <= 0x10FFFF)) { 15 return (enum line_break_property) 16 line_break_minor[line_break_major[cp >> 8] + (cp & 0xff)]; 17 } else { 18 return LINE_BREAK_PROP_AL; 19 } 20 } 21 22 static inline size_t 23 get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp) 24 { 25 if (offset < len) { 26 *cp = ((const uint_least32_t *)str)[offset]; 27 return 1; 28 } else { 29 *cp = GRAPHEME_INVALID_CODEPOINT; 30 return 0; 31 } 32 } 33 34 static inline size_t 35 get_codepoint_utf8(const void *str, size_t len, size_t offset, uint_least32_t *cp) 36 { 37 if (offset < len) { 38 return grapheme_decode_utf8((const char *)str + offset, 39 len - offset, cp); 40 } else { 41 *cp = GRAPHEME_INVALID_CODEPOINT; 42 return 0; 43 } 44 } 45 46 static size_t 47 next_line_break(const void *str, size_t len, size_t (*get_codepoint) 48 (const void *, size_t, size_t, uint_least32_t *)) 49 { 50 enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_prop, 51 last_non_sp_prop, last_non_sp_cm_or_zwj_prop; 52 enum line_break_property res; 53 uint_least32_t cp; 54 uint_least8_t lb25_level = 0; 55 size_t off, new_off; 56 bool lb21a_flag = false, ri_even = true; 57 58 /* check degenerate cases */ 59 if (str == NULL || len == 0) { 60 return 0; 61 } 62 63 /* 64 * Apply line breaking algorithm (UAX #14), see 65 * https://unicode.org/reports/tr14/#Algorithm and tailoring 66 * https://unicode.org/reports/tr14/#Examples (example 7), 67 * given the automatic test-cases implement this example for 68 * better number handling. 69 * 70 */ 71 72 /* 73 * Initialize the different properties such that we have 74 * a good state after the state-update in the loop 75 */ 76 cp0_prop = NUM_LINE_BREAK_PROPS; 77 if ((off = get_codepoint(str, len, 0, &cp)) >= len) { 78 return 1; 79 } 80 cp1_prop = get_break_prop(cp); 81 last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */ 82 last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_PROPS; 83 84 for (; off < len; off = new_off) { 85 /* update state */ 86 cp0_prop = cp1_prop; 87 if ((new_off = off + get_codepoint(str, len, off, &cp)) <= len) { 88 get_codepoint(str, len, off, &cp); 89 cp1_prop = get_break_prop(cp); 90 } else { 91 /* LB3 */ 92 break; 93 } 94 95 /* update retention-states */ 96 97 /* 98 * store the last observed non-CM-or-ZWJ-property for 99 * LB9 and following. 100 */ 101 if (cp0_prop != LINE_BREAK_PROP_CM && 102 cp0_prop != LINE_BREAK_PROP_ZWJ) { 103 /* 104 * check if the property we are overwriting now is an 105 * HL. If so, we set the LB21a-flag which depends on this 106 * knowledge. 107 */ 108 lb21a_flag = (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL); 109 110 /* check regional indicator state */ 111 if (cp0_prop == LINE_BREAK_PROP_RI) { 112 /* 113 * The property we just shifted in is 114 * a regional indicator, increasing the 115 * number of consecutive RIs on the left 116 * side of the breakpoint by one, changing 117 * the oddness. 118 * 119 */ 120 ri_even = !ri_even; 121 } else { 122 /* 123 * We saw no regional indicator, so the 124 * number of consecutive RIs on the left 125 * side of the breakpoint is zero, which 126 * is an even number. 127 * 128 */ 129 ri_even = true; 130 } 131 132 /* 133 * Here comes a bit of magic. The tailored rule 134 * LB25 (using example 7) has a very complicated 135 * left-hand-side-rule of the form 136 * 137 * NU (NU | SY | IS)* (CL | CP)? 138 * 139 * but instead of backtracking, we keep the state 140 * as some kind of "power level" in the variable 141 * 142 * lb25_level 143 * 144 * that goes from 0 to 3 145 * 146 * 0: we are not in the sequence 147 * 1: we have one NU to the left of the middle 148 * spot 149 * 2: we have one NU and one or more (NU | SY | IS) 150 * to the left of the middle spot 151 * 3: we have one NU, zero or more (NU | SY | IS) 152 * and one (CL | CP) to the left of the middle 153 * spot 154 */ 155 if (lb25_level == 0 && cp0_prop == LINE_BREAK_PROP_NU) { 156 /* sequence has begun */ 157 lb25_level = 1; 158 } else if ((lb25_level == 1 || lb25_level == 2) && 159 (cp0_prop == LINE_BREAK_PROP_NU || 160 cp0_prop == LINE_BREAK_PROP_SY || 161 cp0_prop == LINE_BREAK_PROP_IS)) { 162 /* (NU | SY | IS) sequence begins or continued */ 163 lb25_level = 2; 164 } else if ((lb25_level == 1 || lb25_level == 2) && 165 (cp0_prop == LINE_BREAK_PROP_CL || 166 cp0_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF || 167 cp0_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) { 168 /* CL or CP at the end of the sequence */ 169 lb25_level = 3; 170 } else { 171 /* sequence broke */ 172 lb25_level = 0; 173 } 174 175 last_non_cm_or_zwj_prop = cp0_prop; 176 } 177 178 /* 179 * store the last observed non-SP-property for LB8, LB14, 180 * LB15, LB16 and LB17. LB8 gets its own unskipped property, 181 * whereas the others build on top of the CM-ZWJ-skipped 182 * properties as they come after LB9 183 */ 184 if (cp0_prop != LINE_BREAK_PROP_SP) { 185 last_non_sp_prop = cp0_prop; 186 } 187 if (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP) { 188 last_non_sp_cm_or_zwj_prop = last_non_cm_or_zwj_prop; 189 } 190 191 /* apply the algorithm */ 192 193 /* LB4 */ 194 if (cp0_prop == LINE_BREAK_PROP_BK) { 195 break; 196 } 197 198 /* LB5 */ 199 if (cp0_prop == LINE_BREAK_PROP_CR && 200 cp1_prop == LINE_BREAK_PROP_LF) { 201 continue; 202 } 203 if (cp0_prop == LINE_BREAK_PROP_CR || 204 cp0_prop == LINE_BREAK_PROP_LF || 205 cp0_prop == LINE_BREAK_PROP_NL) { 206 break; 207 } 208 209 /* LB6 */ 210 if (cp1_prop == LINE_BREAK_PROP_BK || 211 cp1_prop == LINE_BREAK_PROP_CR || 212 cp1_prop == LINE_BREAK_PROP_LF || 213 cp1_prop == LINE_BREAK_PROP_NL) { 214 continue; 215 } 216 217 /* LB7 */ 218 if (cp1_prop == LINE_BREAK_PROP_SP || 219 cp1_prop == LINE_BREAK_PROP_ZW) { 220 continue; 221 } 222 223 /* LB8 */ 224 if (last_non_sp_prop == LINE_BREAK_PROP_ZW) { 225 break; 226 } 227 228 /* LB8a */ 229 if (cp0_prop == LINE_BREAK_PROP_ZWJ) { 230 continue; 231 } 232 233 /* LB9 */ 234 if ((cp0_prop != LINE_BREAK_PROP_BK && 235 cp0_prop != LINE_BREAK_PROP_CR && 236 cp0_prop != LINE_BREAK_PROP_LF && 237 cp0_prop != LINE_BREAK_PROP_NL && 238 cp0_prop != LINE_BREAK_PROP_SP && 239 cp0_prop != LINE_BREAK_PROP_ZW) && 240 (cp1_prop == LINE_BREAK_PROP_CM || 241 cp1_prop == LINE_BREAK_PROP_ZWJ)) { 242 /* 243 * given we skip them, we don't break in such 244 * a sequence 245 */ 246 continue; 247 } 248 249 /* LB10 is baked into the following rules */ 250 251 /* LB11 */ 252 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_WJ || 253 cp1_prop == LINE_BREAK_PROP_WJ) { 254 continue; 255 } 256 257 /* LB12 */ 258 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_GL) { 259 continue; 260 } 261 262 /* LB12a */ 263 if ((last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP && 264 last_non_cm_or_zwj_prop != LINE_BREAK_PROP_BA && 265 last_non_cm_or_zwj_prop != LINE_BREAK_PROP_HY) && 266 cp1_prop == LINE_BREAK_PROP_GL) { 267 continue; 268 } 269 270 /* LB13 (affected by tailoring for LB25, see example 7) */ 271 if (cp1_prop == LINE_BREAK_PROP_EX || 272 (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_NU && 273 (cp1_prop == LINE_BREAK_PROP_CL || 274 cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF || 275 cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF || 276 cp1_prop == LINE_BREAK_PROP_IS || 277 cp1_prop == LINE_BREAK_PROP_SY))) { 278 continue; 279 } 280 281 /* LB14 */ 282 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF || 283 last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF) { 284 continue; 285 } 286 287 /* LB15 */ 288 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_QU && 289 (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF || 290 cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF)) { 291 continue; 292 } 293 294 /* LB16 */ 295 if ((last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CL || 296 last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF || 297 last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF) && 298 cp1_prop == LINE_BREAK_PROP_NS) { 299 continue; 300 } 301 302 /* LB17 */ 303 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_B2 && 304 cp1_prop == LINE_BREAK_PROP_B2) { 305 continue; 306 } 307 308 /* LB18 */ 309 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SP) { 310 break; 311 } 312 313 /* LB19 */ 314 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_QU || 315 cp1_prop == LINE_BREAK_PROP_QU) { 316 continue; 317 } 318 319 /* LB20 */ 320 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CB || 321 cp1_prop == LINE_BREAK_PROP_CB) { 322 break; 323 } 324 325 /* LB21 */ 326 if (cp1_prop == LINE_BREAK_PROP_BA || 327 cp1_prop == LINE_BREAK_PROP_HY || 328 cp1_prop == LINE_BREAK_PROP_NS || 329 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BB) { 330 continue; 331 } 332 333 /* LB21a */ 334 if (lb21a_flag && 335 (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY || 336 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BA)) { 337 continue; 338 } 339 340 /* LB21b */ 341 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SY && 342 cp1_prop == LINE_BREAK_PROP_HL) { 343 continue; 344 } 345 346 /* LB22 */ 347 if (cp1_prop == LINE_BREAK_PROP_IN) { 348 continue; 349 } 350 351 /* LB23 */ 352 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL || 353 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) && 354 cp1_prop == LINE_BREAK_PROP_NU) { 355 continue; 356 } 357 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU && 358 (cp1_prop == LINE_BREAK_PROP_AL || 359 cp1_prop == LINE_BREAK_PROP_HL)) { 360 continue; 361 } 362 363 /* LB23a */ 364 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR && 365 (cp1_prop == LINE_BREAK_PROP_ID || 366 cp1_prop == LINE_BREAK_PROP_EB || 367 cp1_prop == LINE_BREAK_PROP_EM)) { 368 continue; 369 } 370 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_ID || 371 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB || 372 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EM) && 373 cp1_prop == LINE_BREAK_PROP_PO) { 374 continue; 375 } 376 377 /* LB24 */ 378 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR || 379 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO) && 380 (cp1_prop == LINE_BREAK_PROP_AL || 381 cp1_prop == LINE_BREAK_PROP_HL)) { 382 continue; 383 } 384 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL || 385 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) && 386 (cp1_prop == LINE_BREAK_PROP_PR || 387 cp1_prop == LINE_BREAK_PROP_PO)) { 388 continue; 389 } 390 391 /* LB25 (tailored with example 7) */ 392 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR || 393 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO)) { 394 if (cp1_prop == LINE_BREAK_PROP_NU) { 395 continue; 396 } 397 398 /* this stupid rule is the reason why we cannot 399 * simply have a stateful break-detection between 400 * two adjacent codepoints as we have it with 401 * characters. 402 */ 403 if (new_off < len && 404 (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF || 405 cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF || 406 cp1_prop == LINE_BREAK_PROP_HY)) { 407 get_codepoint(str, len, new_off, &cp); 408 res = get_break_prop(cp); 409 410 if (res == LINE_BREAK_PROP_NU) { 411 continue; 412 } 413 } 414 } 415 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF || 416 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF || 417 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY) && 418 cp1_prop == LINE_BREAK_PROP_NU) { 419 continue; 420 } 421 if (lb25_level == 1 && 422 (cp1_prop == LINE_BREAK_PROP_NU || 423 cp1_prop == LINE_BREAK_PROP_SY || 424 cp1_prop == LINE_BREAK_PROP_IS)) { 425 continue; 426 } 427 if ((lb25_level == 1 || lb25_level == 2) && 428 (cp1_prop == LINE_BREAK_PROP_NU || 429 cp1_prop == LINE_BREAK_PROP_SY || 430 cp1_prop == LINE_BREAK_PROP_IS || 431 cp1_prop == LINE_BREAK_PROP_CL || 432 cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF || 433 cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) { 434 continue; 435 } 436 if ((lb25_level == 1 || lb25_level == 2 || lb25_level == 3) && 437 (cp1_prop == LINE_BREAK_PROP_PO || 438 cp1_prop == LINE_BREAK_PROP_PR)) { 439 continue; 440 } 441 442 /* LB26 */ 443 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL && 444 (cp1_prop == LINE_BREAK_PROP_JL || 445 cp1_prop == LINE_BREAK_PROP_JV || 446 cp1_prop == LINE_BREAK_PROP_H2 || 447 cp1_prop == LINE_BREAK_PROP_H3)) { 448 continue; 449 } 450 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV || 451 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2) && 452 (cp1_prop == LINE_BREAK_PROP_JV || 453 cp1_prop == LINE_BREAK_PROP_JT)) { 454 continue; 455 } 456 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT || 457 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) && 458 cp1_prop == LINE_BREAK_PROP_JT) { 459 continue; 460 } 461 462 /* LB27 */ 463 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL || 464 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV || 465 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT || 466 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2 || 467 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) && 468 cp1_prop == LINE_BREAK_PROP_PO) { 469 continue; 470 } 471 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR && 472 (cp1_prop == LINE_BREAK_PROP_JL || 473 cp1_prop == LINE_BREAK_PROP_JV || 474 cp1_prop == LINE_BREAK_PROP_JT || 475 cp1_prop == LINE_BREAK_PROP_H2 || 476 cp1_prop == LINE_BREAK_PROP_H3)) { 477 continue; 478 } 479 480 /* LB28 */ 481 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL || 482 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) && 483 (cp1_prop == LINE_BREAK_PROP_AL || 484 cp1_prop == LINE_BREAK_PROP_HL)) { 485 continue; 486 } 487 488 /* LB29 */ 489 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_IS && 490 (cp1_prop == LINE_BREAK_PROP_AL || 491 cp1_prop == LINE_BREAK_PROP_HL)) { 492 continue; 493 } 494 495 /* LB30 */ 496 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL || 497 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL || 498 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU) && 499 cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF) { 500 continue; 501 } 502 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF && 503 (cp1_prop == LINE_BREAK_PROP_AL || 504 cp1_prop == LINE_BREAK_PROP_HL || 505 cp1_prop == LINE_BREAK_PROP_NU)) { 506 continue; 507 } 508 509 /* LB30a */ 510 if (!ri_even && 511 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_RI && 512 cp1_prop == LINE_BREAK_PROP_RI) { 513 continue; 514 } 515 516 /* LB30b */ 517 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB && 518 cp1_prop == LINE_BREAK_PROP_EM) { 519 continue; 520 } 521 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BOTH_CN_EXTPICT && 522 cp1_prop == LINE_BREAK_PROP_EM) { 523 continue; 524 } 525 526 /* LB31 */ 527 break; 528 } 529 530 return off; 531 } 532 533 size_t 534 grapheme_next_line_break(const uint_least32_t *str, size_t len) 535 { 536 return next_line_break(str, len, get_codepoint); 537 } 538 539 size_t 540 grapheme_next_line_break_utf8(const char *str, size_t len) 541 { 542 return next_line_break(str, len, get_codepoint_utf8); 543 }