bidirectional.c (48331B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <stdbool.h> 3 #include <stddef.h> 4 5 #include "../gen/bidirectional.h" 6 #include "../grapheme.h" 7 #include "util.h" 8 9 #define MAX_DEPTH 125 10 11 enum state_type { 12 STATE_PROP, /* in 0..23, bidi_property */ 13 STATE_PRESERVED_PROP, /* in 0..23, preserved bidi_prop for L1-rule */ 14 STATE_BRACKET_OFF, /* in 0..255, offset in bidi_bracket */ 15 STATE_LEVEL, /* in 0..MAX_DEPTH+1=126, embedding level */ 16 STATE_PARAGRAPH_LEVEL, /* in 0..1, paragraph embedding level */ 17 STATE_VISITED, /* in 0..1, visited within isolating run */ 18 }; 19 20 static struct { 21 uint_least32_t filter_mask; 22 size_t mask_shift; 23 int_least16_t value_offset; 24 } state_lut[] = { 25 [STATE_PROP] = { 26 .filter_mask = 0x000001F, /* 00000000 00000000 00000000 00011111 */ 27 .mask_shift = 0, 28 .value_offset = 0, 29 }, 30 [STATE_PRESERVED_PROP] = { 31 .filter_mask = 0x00003E0, /* 00000000 00000000 00000011 11100000 */ 32 .mask_shift = 5, 33 .value_offset = 0, 34 }, 35 [STATE_BRACKET_OFF] = { 36 .filter_mask = 0x003FC00, /* 00000000 00000011 11111100 00000000 */ 37 .mask_shift = 10, 38 .value_offset = 0, 39 }, 40 [STATE_LEVEL] = { 41 .filter_mask = 0x1FC0000, /* 00000001 11111100 00000000 00000000 */ 42 .mask_shift = 18, 43 .value_offset = -1, 44 }, 45 [STATE_PARAGRAPH_LEVEL] = { 46 .filter_mask = 0x2000000, /* 00000010 00000000 00000000 00000000 */ 47 .mask_shift = 25, 48 .value_offset = 0, 49 }, 50 [STATE_VISITED] = { 51 .filter_mask = 0x4000000, /* 00000100 00000000 00000000 00000000 */ 52 .mask_shift = 26, 53 .value_offset = 0, 54 }, 55 }; 56 57 static inline int_least16_t 58 get_state(enum state_type t, uint_least32_t input) 59 { 60 return (int_least16_t)((input & state_lut[t].filter_mask) >> 61 state_lut[t].mask_shift) + 62 state_lut[t].value_offset; 63 } 64 65 static inline void 66 set_state(enum state_type t, int_least16_t value, uint_least32_t *output) 67 { 68 *output &= ~state_lut[t].filter_mask; 69 *output |= ((uint_least32_t)(value - state_lut[t].value_offset) 70 << state_lut[t].mask_shift) & 71 state_lut[t].filter_mask; 72 } 73 74 struct isolate_runner { 75 uint_least32_t *buf; 76 size_t buflen; 77 size_t start; 78 79 struct { 80 size_t off; 81 } prev, cur, next; 82 83 enum bidi_property sos, eos; 84 85 uint_least8_t paragraph_level; 86 int_least8_t isolating_run_level; 87 }; 88 89 static inline enum bidi_property 90 ir_get_previous_prop(const struct isolate_runner *ir) 91 { 92 return (ir->prev.off == SIZE_MAX) ? 93 ir->sos : 94 (uint_least8_t)get_state(STATE_PROP, 95 ir->buf[ir->prev.off]); 96 } 97 98 static inline enum bidi_property 99 ir_get_current_prop(const struct isolate_runner *ir) 100 { 101 return (uint_least8_t)get_state(STATE_PROP, ir->buf[ir->cur.off]); 102 } 103 104 static inline enum bidi_property 105 ir_get_next_prop(const struct isolate_runner *ir) 106 { 107 return (ir->next.off == SIZE_MAX) ? 108 ir->eos : 109 (uint_least8_t)get_state(STATE_PROP, 110 ir->buf[ir->next.off]); 111 } 112 113 static inline enum bidi_property 114 ir_get_current_preserved_prop(const struct isolate_runner *ir) 115 { 116 return (uint_least8_t)get_state(STATE_PRESERVED_PROP, 117 ir->buf[ir->cur.off]); 118 } 119 120 static inline int_least8_t 121 ir_get_current_level(const struct isolate_runner *ir) 122 { 123 return (int_least8_t)get_state(STATE_LEVEL, ir->buf[ir->cur.off]); 124 } 125 126 static inline const struct bracket * 127 ir_get_current_bracket_prop(const struct isolate_runner *ir) 128 { 129 return bidi_bracket + 130 (int_least8_t)get_state(STATE_BRACKET_OFF, ir->buf[ir->cur.off]); 131 } 132 133 static void 134 ir_set_current_prop(const struct isolate_runner *ir, enum bidi_property prop) 135 { 136 set_state(STATE_PROP, (int_least16_t)prop, &(ir->buf[ir->cur.off])); 137 } 138 139 static void 140 ir_init(uint_least32_t *buf, size_t buflen, size_t off, 141 uint_least8_t paragraph_level, bool within, struct isolate_runner *ir) 142 { 143 size_t i; 144 int_least8_t sos_level; 145 146 /* initialize invariants */ 147 ir->buf = buf; 148 ir->buflen = buflen; 149 ir->paragraph_level = paragraph_level; 150 ir->start = off; 151 152 /* advance off until we are at a non-removed character */ 153 for (; off < buflen; off++) { 154 if (get_state(STATE_LEVEL, buf[off]) != -1) { 155 break; 156 } 157 } 158 if (off == buflen) { 159 /* we encountered no more non-removed character, terminate */ 160 ir->next.off = SIZE_MAX; 161 return; 162 } 163 164 /* set the isolating run level to that of the current offset */ 165 ir->isolating_run_level = 166 (int_least8_t)get_state(STATE_LEVEL, buf[off]); 167 168 /* initialize sos and eos to dummy values */ 169 ir->sos = ir->eos = NUM_BIDI_PROPS; 170 171 /* 172 * we write the information of the "current" state into next, 173 * so that the shift-in at the first advancement moves it in 174 * cur, as desired. 175 */ 176 ir->next.off = off; 177 178 /* 179 * determine the previous state but store its offset in cur.off, 180 * given it's shifted in on the first advancement 181 */ 182 ir->cur.off = SIZE_MAX; 183 for (i = off, sos_level = -1; i >= 1; i--) { 184 if (get_state(STATE_LEVEL, buf[i - 1]) != -1) { 185 /* 186 * we found a character that has not been 187 * removed in X9 188 */ 189 sos_level = (int_least8_t)get_state(STATE_LEVEL, 190 buf[i - 1]); 191 192 if (within) { 193 /* we just take it */ 194 ir->cur.off = i; 195 } 196 197 break; 198 } 199 } 200 if (sos_level == -1) { 201 /* 202 * there were no preceding non-removed characters, set 203 * sos-level to paragraph embedding level 204 */ 205 sos_level = (int_least8_t)paragraph_level; 206 } 207 208 if (!within || ir->cur.off == SIZE_MAX) { 209 /* 210 * we are at the beginning of the sequence; initialize 211 * it faithfully according to the algorithm by looking 212 * at the sos-level 213 */ 214 if (MAX(sos_level, ir->isolating_run_level) % 2 == 0) { 215 /* the higher level is even, set sos to L */ 216 ir->sos = BIDI_PROP_L; 217 } else { 218 /* the higher level is odd, set sos to R */ 219 ir->sos = BIDI_PROP_R; 220 } 221 } 222 } 223 224 static int 225 ir_advance(struct isolate_runner *ir) 226 { 227 enum bidi_property prop; 228 int_least8_t level, isolate_level, last_isolate_level; 229 size_t i; 230 231 if (ir->next.off == SIZE_MAX) { 232 /* the sequence is over */ 233 return 1; 234 } 235 236 /* shift in */ 237 ir->prev.off = ir->cur.off; 238 ir->cur.off = ir->next.off; 239 240 /* mark as visited */ 241 set_state(STATE_VISITED, 1, &(ir->buf[ir->cur.off])); 242 243 /* initialize next state by going to the next character in the sequence 244 */ 245 ir->next.off = SIZE_MAX; 246 247 last_isolate_level = -1; 248 for (i = ir->cur.off, isolate_level = 0; i < ir->buflen; i++) { 249 level = (int_least8_t)get_state(STATE_LEVEL, ir->buf[i]); 250 prop = (uint_least8_t)get_state(STATE_PROP, ir->buf[i]); 251 252 if (level == -1) { 253 /* this is one of the ignored characters, skip */ 254 continue; 255 } else if (level == ir->isolating_run_level) { 256 last_isolate_level = level; 257 } 258 259 /* follow BD8/BD9 and P2 to traverse the current sequence */ 260 if (prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI || 261 prop == BIDI_PROP_FSI) { 262 /* 263 * we encountered an isolate initiator, increment 264 * counter, but go into processing when we 265 * were not isolated before 266 */ 267 if (isolate_level < MAX_DEPTH) { 268 isolate_level++; 269 } 270 if (isolate_level != 1) { 271 continue; 272 } 273 } else if (prop == BIDI_PROP_PDI && isolate_level > 0) { 274 isolate_level--; 275 276 /* 277 * if the current PDI dropped the isolate-level 278 * to zero, it is itself part of the isolating 279 * run sequence; otherwise we simply continue. 280 */ 281 if (isolate_level > 0) { 282 continue; 283 } 284 } else if (isolate_level > 0) { 285 /* we are in an isolating sequence */ 286 continue; 287 } 288 289 /* 290 * now we either still are in our sequence or we hit 291 * the eos-case as we left the sequence and hit the 292 * first non-isolating-sequence character. 293 */ 294 if (i == ir->cur.off) { 295 /* we were in the first initializing round */ 296 continue; 297 } else if (level == ir->isolating_run_level) { 298 /* isolate_level-skips have been handled before, we're 299 * good */ 300 /* still in the sequence */ 301 ir->next.off = i; 302 } else { 303 /* out of sequence or isolated, compare levels via eos 304 */ 305 ir->next.off = SIZE_MAX; 306 if (MAX(last_isolate_level, level) % 2 == 0) { 307 ir->eos = BIDI_PROP_L; 308 } else { 309 ir->eos = BIDI_PROP_R; 310 } 311 } 312 break; 313 } 314 if (i == ir->buflen) { 315 /* 316 * the sequence ended before we could grab an offset. 317 * we need to determine the eos-prop by comparing the 318 * level of the last element in the isolating run sequence 319 * with the paragraph level. 320 */ 321 ir->next.off = SIZE_MAX; 322 if (MAX(last_isolate_level, ir->paragraph_level) % 2 == 0) { 323 /* the higher level is even, set eos to L */ 324 ir->eos = BIDI_PROP_L; 325 } else { 326 /* the higher level is odd, set eos to R */ 327 ir->eos = BIDI_PROP_R; 328 } 329 } 330 331 return 0; 332 } 333 334 static enum bidi_property 335 ir_get_last_strong_prop(const struct isolate_runner *ir) 336 { 337 struct isolate_runner tmp; 338 enum bidi_property last_strong_prop = ir->sos, prop; 339 340 ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, false, 341 &tmp); 342 for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) { 343 prop = ir_get_current_prop(&tmp); 344 345 if (prop == BIDI_PROP_R || prop == BIDI_PROP_L || 346 prop == BIDI_PROP_AL) { 347 last_strong_prop = prop; 348 } 349 } 350 351 return last_strong_prop; 352 } 353 354 static enum bidi_property 355 ir_get_last_strong_or_number_prop(const struct isolate_runner *ir) 356 { 357 struct isolate_runner tmp; 358 enum bidi_property last_strong_or_number_prop = ir->sos, prop; 359 360 ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, false, 361 &tmp); 362 for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) { 363 prop = ir_get_current_prop(&tmp); 364 365 if (prop == BIDI_PROP_R || prop == BIDI_PROP_L || 366 prop == BIDI_PROP_AL || prop == BIDI_PROP_EN || 367 prop == BIDI_PROP_AN) { 368 last_strong_or_number_prop = prop; 369 } 370 } 371 372 return last_strong_or_number_prop; 373 } 374 375 static void 376 preprocess_bracket_pair(const struct isolate_runner *start, 377 const struct isolate_runner *end) 378 { 379 enum bidi_property prop, bracket_prop, last_strong_or_number_prop; 380 struct isolate_runner ir; 381 size_t strong_type_off; 382 383 /* 384 * check if the bracket contains a strong type (L or R|EN|AN) 385 */ 386 for (ir = *start, strong_type_off = SIZE_MAX, 387 bracket_prop = NUM_BIDI_PROPS; 388 !ir_advance(&ir) && ir.cur.off < end->cur.off;) { 389 prop = ir_get_current_prop(&ir); 390 391 if (prop == BIDI_PROP_L) { 392 strong_type_off = ir.cur.off; 393 if (ir.isolating_run_level % 2 == 0) { 394 /* 395 * set the type for both brackets to L (so they 396 * match the strong type they contain) 397 */ 398 bracket_prop = BIDI_PROP_L; 399 } 400 } else if (prop == BIDI_PROP_R || prop == BIDI_PROP_EN || 401 prop == BIDI_PROP_AN) { 402 strong_type_off = ir.cur.off; 403 if (ir.isolating_run_level % 2 != 0) { 404 /* 405 * set the type for both brackets to R (so they 406 * match the strong type they contain) 407 */ 408 bracket_prop = BIDI_PROP_R; 409 } 410 } 411 } 412 if (strong_type_off == SIZE_MAX) { 413 /* 414 * there are no strong types within the brackets and we just 415 * leave the brackets as is 416 */ 417 return; 418 } 419 420 if (bracket_prop == NUM_BIDI_PROPS) { 421 /* 422 * We encountered a strong type, but it was opposite 423 * to the embedding direction. 424 * Check the previous strong type before the opening 425 * bracket 426 */ 427 last_strong_or_number_prop = 428 ir_get_last_strong_or_number_prop(start); 429 if (last_strong_or_number_prop == BIDI_PROP_L && 430 ir.isolating_run_level % 2 != 0) { 431 /* 432 * the previous strong type is also opposite 433 * to the embedding direction, so the context 434 * was established and we set the brackets 435 * accordingly. 436 */ 437 bracket_prop = BIDI_PROP_L; 438 } else if ((last_strong_or_number_prop == BIDI_PROP_R || 439 last_strong_or_number_prop == BIDI_PROP_EN || 440 last_strong_or_number_prop == BIDI_PROP_AN) && 441 ir.isolating_run_level % 2 == 0) { 442 /* 443 * the previous strong type is also opposite 444 * to the embedding direction, so the context 445 * was established and we set the brackets 446 * accordingly. 447 */ 448 bracket_prop = BIDI_PROP_R; 449 } else { 450 /* set brackets to the embedding direction */ 451 if (ir.isolating_run_level % 2 == 0) { 452 bracket_prop = BIDI_PROP_L; 453 } else { 454 bracket_prop = BIDI_PROP_R; 455 } 456 } 457 } 458 459 ir_set_current_prop(start, bracket_prop); 460 ir_set_current_prop(end, bracket_prop); 461 462 /* 463 * any sequence of NSMs after opening or closing brackets get 464 * the same property as the one we set on the brackets 465 */ 466 for (ir = *start; !ir_advance(&ir) && ir_get_current_preserved_prop( 467 &ir) == BIDI_PROP_NSM;) { 468 ir_set_current_prop(&ir, bracket_prop); 469 } 470 for (ir = *end; !ir_advance(&ir) && 471 ir_get_current_preserved_prop(&ir) == BIDI_PROP_NSM;) { 472 ir_set_current_prop(&ir, bracket_prop); 473 } 474 } 475 476 static void 477 preprocess_bracket_pairs(uint_least32_t *buf, size_t buflen, size_t off, 478 uint_least8_t paragraph_level) 479 { 480 /* 481 * The N0-rule deals with bracket pairs that shall be determined 482 * with the rule BD16. This is specified as an algorithm with a 483 * stack of 63 bracket openings that are used to resolve into a 484 * separate list of pairs, which is then to be sorted by opening 485 * position. Thus, even though the bracketing-depth is limited 486 * by 63, the algorithm, as is, requires dynamic memory 487 * management. 488 * 489 * A naive approach (used by Fribidi) would be to screw the 490 * stack-approach and simply directly determine the 491 * corresponding closing bracket offset for a given opening 492 * bracket, leading to O(n²) time complexity in the worst case 493 * with a lot of brackets. While many brackets are not common, 494 * it is still possible to find a middle ground where you obtain 495 * strongly linear time complexity in most common cases: 496 * 497 * Instead of a stack, we use a FIFO data structure which is 498 * filled with bracket openings in the order of appearance (thus 499 * yielding an implicit sorting!) at the top. If the 500 * corresponding closing bracket is encountered, it is added to 501 * the respective entry, making it ready to "move out" at the 502 * bottom (i.e. passed to the bracket processing). Due to the 503 * nature of the specified pair detection algorithm, which only 504 * cares about the bracket type and nothing else (bidi class, 505 * level, etc.), we can mix processing and bracket detection. 506 * 507 * Obviously, if you, for instance, have one big bracket pair at 508 * the bottom that has not been closed yet, it will block the 509 * bracket processing and the FIFO might hit its capacity limit. 510 * At this point, the blockage is manually resolved using the 511 * naive quadratic approach. 512 * 513 * To remain within the specified standard behaviour, which 514 * mandates that processing of brackets should stop when the 515 * bracketing-depth is at 63, we simply check in an "overflow" 516 * scenario if all 63 elements in the LIFO are unfinished, which 517 * corresponds with such a bracketing depth. 518 */ 519 enum bidi_property prop; 520 521 struct { 522 bool complete; 523 size_t bracket_class; 524 struct isolate_runner start; 525 struct isolate_runner end; 526 } fifo[63]; 527 const struct bracket *bracket_prop, *tmp_bracket_prop; 528 struct isolate_runner ir, tmp_ir; 529 size_t fifo_len = 0, i, blevel, j, k; 530 531 ir_init(buf, buflen, off, paragraph_level, false, &ir); 532 while (!ir_advance(&ir)) { 533 prop = ir_get_current_prop(&ir); 534 bracket_prop = ir_get_current_bracket_prop(&ir); 535 if (prop == BIDI_PROP_ON && 536 bracket_prop->type == BIDI_BRACKET_OPEN) { 537 if (fifo_len == LEN(fifo)) { 538 /* 539 * The FIFO is full, check first if it's 540 * completely blocked (i.e. no finished 541 * bracket pairs, triggering the standard 542 * that mandates to abort in such a case 543 */ 544 for (i = 0; i < fifo_len; i++) { 545 if (fifo[i].complete) { 546 break; 547 } 548 } 549 if (i == fifo_len) { 550 /* abort processing */ 551 return; 552 } 553 554 /* 555 * by construction, the bottom entry 556 * in the FIFO is guaranteed to be 557 * unfinished (given we "consume" all 558 * finished bottom entries after each 559 * iteration). 560 * 561 * iterate, starting after the opening 562 * bracket, and find the corresponding 563 * closing bracket. 564 * 565 * if we find none, just drop the FIFO 566 * entry silently 567 */ 568 for (tmp_ir = fifo[0].start, blevel = 0; 569 !ir_advance(&tmp_ir);) { 570 tmp_bracket_prop = 571 ir_get_current_bracket_prop( 572 &tmp_ir); 573 574 if (tmp_bracket_prop->type == 575 BIDI_BRACKET_OPEN && 576 tmp_bracket_prop->class == 577 bracket_prop->class) { 578 /* we encountered another 579 * opening bracket of the 580 * same class */ 581 blevel++; 582 583 } else if (tmp_bracket_prop->type == 584 BIDI_BRACKET_CLOSE && 585 tmp_bracket_prop->class == 586 bracket_prop 587 ->class) { 588 /* we encountered a closing 589 * bracket of the same class 590 */ 591 if (blevel == 0) { 592 /* this is the 593 * corresponding 594 * closing bracket 595 */ 596 fifo[0].complete = true; 597 fifo[0].end = ir; 598 } else { 599 blevel--; 600 } 601 } 602 } 603 if (fifo[0].complete) { 604 /* we found the matching bracket */ 605 preprocess_bracket_pair( 606 &(fifo[i].start), 607 &(fifo[i].end)); 608 } 609 610 /* shift FIFO one to the left */ 611 for (i = 1; i < fifo_len; i++) { 612 fifo[i - 1] = fifo[i]; 613 } 614 fifo_len--; 615 } 616 617 /* add element to the FIFO */ 618 fifo_len++; 619 fifo[fifo_len - 1].complete = false; 620 fifo[fifo_len - 1].bracket_class = bracket_prop->class; 621 fifo[fifo_len - 1].start = ir; 622 } else if (prop == BIDI_PROP_ON && 623 bracket_prop->type == BIDI_BRACKET_CLOSE) { 624 /* 625 * go backwards in the FIFO, skip finished entries 626 * and simply ignore (do nothing) the closing 627 * bracket if we do not match anything 628 */ 629 for (i = fifo_len; i > 0; i--) { 630 if (bracket_prop->class == 631 fifo[i - 1].bracket_class && 632 !fifo[i - 1].complete) { 633 /* we have found a pair */ 634 fifo[i - 1].complete = true; 635 fifo[i - 1].end = ir; 636 637 /* remove all uncompleted FIFO elements 638 * above i - 1 */ 639 for (j = i; j < fifo_len;) { 640 if (fifo[j].complete) { 641 j++; 642 continue; 643 } 644 645 /* shift FIFO one to the left */ 646 for (k = j + 1; k < fifo_len; 647 k++) { 648 fifo[k - 1] = fifo[k]; 649 } 650 fifo_len--; 651 } 652 break; 653 } 654 } 655 } 656 657 /* process all ready bracket pairs from the FIFO bottom */ 658 while (fifo_len > 0 && fifo[0].complete) { 659 preprocess_bracket_pair(&(fifo[0].start), 660 &(fifo[0].end)); 661 662 /* shift FIFO one to the left */ 663 for (j = 0; j + 1 < fifo_len; j++) { 664 fifo[j] = fifo[j + 1]; 665 } 666 fifo_len--; 667 } 668 } 669 670 /* 671 * afterwards, we still might have unfinished bracket pairs 672 * that will remain as such, but the subsequent finished pairs 673 * also need to be taken into account, so we go through the 674 * FIFO once more and process all finished pairs 675 */ 676 for (i = 0; i < fifo_len; i++) { 677 if (fifo[i].complete) { 678 preprocess_bracket_pair(&(fifo[i].start), 679 &(fifo[i].end)); 680 } 681 } 682 } 683 684 static size_t 685 preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen, 686 size_t off, uint_least8_t paragraph_level) 687 { 688 enum bidi_property sequence_prop, prop; 689 struct isolate_runner ir, tmp; 690 size_t runsince, sequence_end; 691 692 /* W1 */ 693 ir_init(buf, buflen, off, paragraph_level, false, &ir); 694 while (!ir_advance(&ir)) { 695 if (ir_get_current_prop(&ir) == BIDI_PROP_NSM) { 696 prop = ir_get_previous_prop(&ir); 697 698 if (prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI || 699 prop == BIDI_PROP_FSI || prop == BIDI_PROP_PDI) { 700 ir_set_current_prop(&ir, BIDI_PROP_ON); 701 } else { 702 ir_set_current_prop(&ir, prop); 703 } 704 } 705 } 706 707 /* W2 */ 708 ir_init(buf, buflen, off, paragraph_level, false, &ir); 709 while (!ir_advance(&ir)) { 710 if (ir_get_current_prop(&ir) == BIDI_PROP_EN && 711 ir_get_last_strong_prop(&ir) == BIDI_PROP_AL) { 712 ir_set_current_prop(&ir, BIDI_PROP_AN); 713 } 714 } 715 716 /* W3 */ 717 ir_init(buf, buflen, off, paragraph_level, false, &ir); 718 while (!ir_advance(&ir)) { 719 if (ir_get_current_prop(&ir) == BIDI_PROP_AL) { 720 ir_set_current_prop(&ir, BIDI_PROP_R); 721 } 722 } 723 724 /* W4 */ 725 ir_init(buf, buflen, off, paragraph_level, false, &ir); 726 while (!ir_advance(&ir)) { 727 if (ir_get_previous_prop(&ir) == BIDI_PROP_EN && 728 (ir_get_current_prop(&ir) == BIDI_PROP_ES || 729 ir_get_current_prop(&ir) == BIDI_PROP_CS) && 730 ir_get_next_prop(&ir) == BIDI_PROP_EN) { 731 ir_set_current_prop(&ir, BIDI_PROP_EN); 732 } 733 734 if (ir_get_previous_prop(&ir) == BIDI_PROP_AN && 735 ir_get_current_prop(&ir) == BIDI_PROP_CS && 736 ir_get_next_prop(&ir) == BIDI_PROP_AN) { 737 ir_set_current_prop(&ir, BIDI_PROP_AN); 738 } 739 } 740 741 /* W5 */ 742 runsince = SIZE_MAX; 743 ir_init(buf, buflen, off, paragraph_level, false, &ir); 744 while (!ir_advance(&ir)) { 745 if (ir_get_current_prop(&ir) == BIDI_PROP_ET) { 746 if (runsince == SIZE_MAX) { 747 /* a new run has begun */ 748 runsince = ir.cur.off; 749 } 750 } else if (ir_get_current_prop(&ir) == BIDI_PROP_EN) { 751 /* set the preceding sequence */ 752 if (runsince != SIZE_MAX) { 753 ir_init(buf, buflen, runsince, paragraph_level, 754 (runsince > off), &tmp); 755 while (!ir_advance(&tmp) && 756 tmp.cur.off < ir.cur.off) { 757 ir_set_current_prop(&tmp, BIDI_PROP_EN); 758 } 759 runsince = SIZE_MAX; 760 } else { 761 ir_init(buf, buflen, ir.cur.off, 762 paragraph_level, (ir.cur.off > off), 763 &tmp); 764 ir_advance(&tmp); 765 } 766 /* follow the succeeding sequence */ 767 while (!ir_advance(&tmp)) { 768 if (ir_get_current_prop(&tmp) != BIDI_PROP_ET) { 769 break; 770 } 771 ir_set_current_prop(&tmp, BIDI_PROP_EN); 772 } 773 } else { 774 /* sequence ended */ 775 runsince = SIZE_MAX; 776 } 777 } 778 779 /* W6 */ 780 ir_init(buf, buflen, off, paragraph_level, false, &ir); 781 while (!ir_advance(&ir)) { 782 prop = ir_get_current_prop(&ir); 783 784 if (prop == BIDI_PROP_ES || prop == BIDI_PROP_ET || 785 prop == BIDI_PROP_CS) { 786 ir_set_current_prop(&ir, BIDI_PROP_ON); 787 } 788 } 789 790 /* W7 */ 791 ir_init(buf, buflen, off, paragraph_level, false, &ir); 792 while (!ir_advance(&ir)) { 793 if (ir_get_current_prop(&ir) == BIDI_PROP_EN && 794 ir_get_last_strong_prop(&ir) == BIDI_PROP_L) { 795 ir_set_current_prop(&ir, BIDI_PROP_L); 796 } 797 } 798 799 /* N0 */ 800 preprocess_bracket_pairs(buf, buflen, off, paragraph_level); 801 802 /* N1 */ 803 sequence_end = SIZE_MAX; 804 sequence_prop = NUM_BIDI_PROPS; 805 ir_init(buf, buflen, off, paragraph_level, false, &ir); 806 while (!ir_advance(&ir)) { 807 if (sequence_end == SIZE_MAX) { 808 prop = ir_get_current_prop(&ir); 809 810 if (prop == BIDI_PROP_B || prop == BIDI_PROP_S || 811 prop == BIDI_PROP_WS || prop == BIDI_PROP_ON || 812 prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI || 813 prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) { 814 /* the current character is an NI (neutral 815 * or isolate) */ 816 817 /* scan ahead to the end of the NI-sequence 818 */ 819 ir_init(buf, buflen, ir.cur.off, 820 paragraph_level, (ir.cur.off > off), 821 &tmp); 822 while (!ir_advance(&tmp)) { 823 prop = ir_get_next_prop(&tmp); 824 825 if (prop != BIDI_PROP_B && 826 prop != BIDI_PROP_S && 827 prop != BIDI_PROP_WS && 828 prop != BIDI_PROP_ON && 829 prop != BIDI_PROP_FSI && 830 prop != BIDI_PROP_LRI && 831 prop != BIDI_PROP_RLI && 832 prop != BIDI_PROP_PDI) { 833 break; 834 } 835 } 836 837 /* 838 * check what follows and see if the text 839 * has the same direction on both sides 840 */ 841 if (ir_get_previous_prop(&ir) == BIDI_PROP_L && 842 ir_get_next_prop(&tmp) == BIDI_PROP_L) { 843 sequence_end = tmp.cur.off; 844 sequence_prop = BIDI_PROP_L; 845 } else if ((ir_get_previous_prop(&ir) == 846 BIDI_PROP_R || 847 ir_get_previous_prop(&ir) == 848 BIDI_PROP_EN || 849 ir_get_previous_prop(&ir) == 850 BIDI_PROP_AN) && 851 (ir_get_next_prop(&tmp) == 852 BIDI_PROP_R || 853 ir_get_next_prop(&tmp) == 854 BIDI_PROP_EN || 855 ir_get_next_prop(&tmp) == 856 BIDI_PROP_AN)) { 857 sequence_end = tmp.cur.off; 858 sequence_prop = BIDI_PROP_R; 859 } 860 } 861 } 862 863 if (sequence_end != SIZE_MAX) { 864 if (ir.cur.off <= sequence_end) { 865 ir_set_current_prop(&ir, sequence_prop); 866 } else { 867 /* end of sequence, reset */ 868 sequence_end = SIZE_MAX; 869 sequence_prop = NUM_BIDI_PROPS; 870 } 871 } 872 } 873 874 /* N2 */ 875 ir_init(buf, buflen, off, paragraph_level, false, &ir); 876 while (!ir_advance(&ir)) { 877 prop = ir_get_current_prop(&ir); 878 879 if (prop == BIDI_PROP_B || prop == BIDI_PROP_S || 880 prop == BIDI_PROP_WS || prop == BIDI_PROP_ON || 881 prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI || 882 prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) { 883 /* N2 */ 884 if (ir_get_current_level(&ir) % 2 == 0) { 885 /* even embedding level */ 886 ir_set_current_prop(&ir, BIDI_PROP_L); 887 } else { 888 /* odd embedding level */ 889 ir_set_current_prop(&ir, BIDI_PROP_R); 890 } 891 } 892 } 893 894 return 0; 895 } 896 897 static uint_least8_t 898 get_isolated_paragraph_level(const uint_least32_t *state, size_t statelen) 899 { 900 enum bidi_property prop; 901 int_least8_t isolate_level; 902 size_t stateoff; 903 904 /* determine paragraph level (rules P1-P3) and terminate on PDI */ 905 for (stateoff = 0, isolate_level = 0; stateoff < statelen; stateoff++) { 906 prop = (uint_least8_t)get_state(STATE_PROP, state[stateoff]); 907 908 if (prop == BIDI_PROP_PDI && isolate_level == 0) { 909 /* 910 * we are in a FSI-subsection of a paragraph and 911 * matched with the terminating PDI 912 */ 913 break; 914 } 915 916 /* BD8/BD9 */ 917 if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI || 918 prop == BIDI_PROP_FSI) && 919 isolate_level < MAX_DEPTH) { 920 /* we hit an isolate initiator, increment counter */ 921 isolate_level++; 922 } else if (prop == BIDI_PROP_PDI && isolate_level > 0) { 923 isolate_level--; 924 } 925 926 /* P2 */ 927 if (isolate_level > 0) { 928 continue; 929 } 930 931 /* P3 */ 932 if (prop == BIDI_PROP_L) { 933 return 0; 934 } else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) { 935 return 1; 936 } 937 } 938 939 return 0; 940 } 941 942 static inline uint_least8_t 943 get_bidi_property(uint_least32_t cp) 944 { 945 if (likely(cp <= 0x10FFFF)) { 946 return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) & 947 0x1F /* 00011111 */; 948 } else { 949 return BIDI_PROP_L; 950 } 951 } 952 953 static uint_least8_t 954 get_paragraph_level(enum grapheme_bidirectional_direction override, 955 const HERODOTUS_READER *r) 956 { 957 HERODOTUS_READER tmp; 958 enum bidi_property prop; 959 int_least8_t isolate_level; 960 uint_least32_t cp; 961 962 /* check overrides first according to rule HL1 */ 963 if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { 964 return 0; 965 } else if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { 966 return 1; 967 } 968 969 /* copy reader into temporary reader */ 970 herodotus_reader_copy(r, &tmp); 971 972 /* determine paragraph level (rules P1-P3) */ 973 for (isolate_level = 0; herodotus_read_codepoint(&tmp, true, &cp) == 974 HERODOTUS_STATUS_SUCCESS;) { 975 prop = get_bidi_property(cp); 976 977 /* BD8/BD9 */ 978 if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI || 979 prop == BIDI_PROP_FSI) && 980 isolate_level < MAX_DEPTH) { 981 /* we hit an isolate initiator, increment counter */ 982 isolate_level++; 983 } else if (prop == BIDI_PROP_PDI && isolate_level > 0) { 984 isolate_level--; 985 } 986 987 /* P2 */ 988 if (isolate_level > 0) { 989 continue; 990 } 991 992 /* P3 */ 993 if (prop == BIDI_PROP_L) { 994 return 0; 995 } else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) { 996 return 1; 997 } 998 } 999 1000 return 0; 1001 } 1002 1003 static void 1004 preprocess_paragraph(uint_least8_t paragraph_level, uint_least32_t *buf, 1005 size_t buflen) 1006 { 1007 enum bidi_property prop; 1008 int_least8_t level; 1009 1010 struct { 1011 int_least8_t level; 1012 enum grapheme_bidirectional_direction override; 1013 bool directional_isolate; 1014 } directional_status[MAX_DEPTH + 2], *dirstat = directional_status; 1015 1016 size_t overflow_isolate_count, overflow_embedding_count, 1017 valid_isolate_count, bufoff, i, runsince; 1018 1019 /* X1 */ 1020 dirstat->level = (int_least8_t)paragraph_level; 1021 dirstat->override = GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; 1022 dirstat->directional_isolate = false; 1023 overflow_isolate_count = overflow_embedding_count = 1024 valid_isolate_count = 0; 1025 1026 for (bufoff = 0; bufoff < buflen; bufoff++) { 1027 prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]); 1028 1029 /* set paragraph level we need for line-level-processing */ 1030 set_state(STATE_PARAGRAPH_LEVEL, paragraph_level, 1031 &(buf[bufoff])); 1032 again: 1033 if (prop == BIDI_PROP_RLE) { 1034 /* X2 */ 1035 if (dirstat->level + (dirstat->level % 2 != 0) + 1 <= 1036 MAX_DEPTH && 1037 overflow_isolate_count == 0 && 1038 overflow_embedding_count == 0) { 1039 /* valid RLE */ 1040 dirstat++; 1041 dirstat->level = 1042 (dirstat - 1)->level + 1043 ((dirstat - 1)->level % 2 != 0) + 1; 1044 dirstat->override = 1045 GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; 1046 dirstat->directional_isolate = false; 1047 } else { 1048 /* overflow RLE */ 1049 overflow_embedding_count += 1050 (overflow_isolate_count == 0); 1051 } 1052 } else if (prop == BIDI_PROP_LRE) { 1053 /* X3 */ 1054 if (dirstat->level + (dirstat->level % 2 == 0) + 1 <= 1055 MAX_DEPTH && 1056 overflow_isolate_count == 0 && 1057 overflow_embedding_count == 0) { 1058 /* valid LRE */ 1059 dirstat++; 1060 dirstat->level = 1061 (dirstat - 1)->level + 1062 ((dirstat - 1)->level % 2 == 0) + 1; 1063 dirstat->override = 1064 GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; 1065 dirstat->directional_isolate = false; 1066 } else { 1067 /* overflow LRE */ 1068 overflow_embedding_count += 1069 (overflow_isolate_count == 0); 1070 } 1071 } else if (prop == BIDI_PROP_RLO) { 1072 /* X4 */ 1073 if (dirstat->level + (dirstat->level % 2 != 0) + 1 <= 1074 MAX_DEPTH && 1075 overflow_isolate_count == 0 && 1076 overflow_embedding_count == 0) { 1077 /* valid RLO */ 1078 dirstat++; 1079 dirstat->level = 1080 (dirstat - 1)->level + 1081 ((dirstat - 1)->level % 2 != 0) + 1; 1082 dirstat->override = 1083 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL; 1084 dirstat->directional_isolate = false; 1085 } else { 1086 /* overflow RLO */ 1087 overflow_embedding_count += 1088 (overflow_isolate_count == 0); 1089 } 1090 } else if (prop == BIDI_PROP_LRO) { 1091 /* X5 */ 1092 if (dirstat->level + (dirstat->level % 2 == 0) + 1 <= 1093 MAX_DEPTH && 1094 overflow_isolate_count == 0 && 1095 overflow_embedding_count == 0) { 1096 /* valid LRE */ 1097 dirstat++; 1098 dirstat->level = 1099 (dirstat - 1)->level + 1100 ((dirstat - 1)->level % 2 == 0) + 1; 1101 dirstat->override = 1102 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR; 1103 dirstat->directional_isolate = false; 1104 } else { 1105 /* overflow LRO */ 1106 overflow_embedding_count += 1107 (overflow_isolate_count == 0); 1108 } 1109 } else if (prop == BIDI_PROP_RLI) { 1110 /* X5a */ 1111 set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff])); 1112 if (dirstat->override == 1113 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { 1114 set_state(STATE_PROP, BIDI_PROP_L, 1115 &(buf[bufoff])); 1116 } else if (dirstat->override == 1117 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { 1118 set_state(STATE_PROP, BIDI_PROP_R, 1119 &(buf[bufoff])); 1120 } 1121 1122 if (dirstat->level + (dirstat->level % 2 != 0) + 1 <= 1123 MAX_DEPTH && 1124 overflow_isolate_count == 0 && 1125 overflow_embedding_count == 0) { 1126 /* valid RLI */ 1127 valid_isolate_count++; 1128 1129 dirstat++; 1130 dirstat->level = 1131 (dirstat - 1)->level + 1132 ((dirstat - 1)->level % 2 != 0) + 1; 1133 dirstat->override = 1134 GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; 1135 dirstat->directional_isolate = true; 1136 } else { 1137 /* overflow RLI */ 1138 overflow_isolate_count++; 1139 } 1140 } else if (prop == BIDI_PROP_LRI) { 1141 /* X5b */ 1142 set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff])); 1143 if (dirstat->override == 1144 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { 1145 set_state(STATE_PROP, BIDI_PROP_L, 1146 &(buf[bufoff])); 1147 } else if (dirstat->override == 1148 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { 1149 set_state(STATE_PROP, BIDI_PROP_R, 1150 &(buf[bufoff])); 1151 } 1152 1153 if (dirstat->level + (dirstat->level % 2 == 0) + 1 <= 1154 MAX_DEPTH && 1155 overflow_isolate_count == 0 && 1156 overflow_embedding_count == 0) { 1157 /* valid LRI */ 1158 valid_isolate_count++; 1159 1160 dirstat++; 1161 dirstat->level = 1162 (dirstat - 1)->level + 1163 ((dirstat - 1)->level % 2 == 0) + 1; 1164 dirstat->override = 1165 GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; 1166 dirstat->directional_isolate = true; 1167 } else { 1168 /* overflow LRI */ 1169 overflow_isolate_count++; 1170 } 1171 } else if (prop == BIDI_PROP_FSI) { 1172 /* X5c */ 1173 if (get_isolated_paragraph_level( 1174 buf + (bufoff + 1), 1175 buflen - (bufoff + 1)) == 1) { 1176 prop = BIDI_PROP_RLI; 1177 goto again; 1178 } else { /* ... == 0 */ 1179 prop = BIDI_PROP_LRI; 1180 goto again; 1181 } 1182 } else if (prop != BIDI_PROP_B && prop != BIDI_PROP_BN && 1183 prop != BIDI_PROP_PDF && prop != BIDI_PROP_PDI) { 1184 /* X6 */ 1185 set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff])); 1186 if (dirstat->override == 1187 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { 1188 set_state(STATE_PROP, BIDI_PROP_L, 1189 &(buf[bufoff])); 1190 } else if (dirstat->override == 1191 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { 1192 set_state(STATE_PROP, BIDI_PROP_R, 1193 &(buf[bufoff])); 1194 } 1195 } else if (prop == BIDI_PROP_PDI) { 1196 /* X6a */ 1197 if (overflow_isolate_count > 0) { 1198 /* PDI matches an overflow isolate initiator 1199 */ 1200 overflow_isolate_count--; 1201 } else if (valid_isolate_count > 0) { 1202 /* PDI matches a normal isolate initiator */ 1203 overflow_embedding_count = 0; 1204 while (dirstat->directional_isolate == false && 1205 dirstat > directional_status) { 1206 /* 1207 * we are safe here as given the 1208 * valid isolate count is positive 1209 * there must be a stack-entry 1210 * with positive directional 1211 * isolate status, but we take 1212 * no chances and include an 1213 * explicit check 1214 * 1215 * POSSIBLE OPTIMIZATION: Whenever 1216 * we push on the stack, check if it 1217 * has the directional isolate 1218 * status true and store a pointer 1219 * to it so we can jump to it very 1220 * quickly. 1221 */ 1222 dirstat--; 1223 } 1224 1225 /* 1226 * as above, the following check is not 1227 * necessary, given we are guaranteed to 1228 * have at least one stack entry left, 1229 * but it's better to be safe 1230 */ 1231 if (dirstat > directional_status) { 1232 dirstat--; 1233 } 1234 valid_isolate_count--; 1235 } 1236 1237 set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff])); 1238 if (dirstat->override == 1239 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { 1240 set_state(STATE_PROP, BIDI_PROP_L, 1241 &(buf[bufoff])); 1242 } else if (dirstat->override == 1243 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { 1244 set_state(STATE_PROP, BIDI_PROP_R, 1245 &(buf[bufoff])); 1246 } 1247 } else if (prop == BIDI_PROP_PDF) { 1248 /* X7 */ 1249 if (overflow_isolate_count > 0) { 1250 /* do nothing */ 1251 } else if (overflow_embedding_count > 0) { 1252 overflow_embedding_count--; 1253 } else if (dirstat->directional_isolate == false && 1254 dirstat > directional_status) { 1255 dirstat--; 1256 } 1257 } else if (prop == BIDI_PROP_B) { 1258 /* X8 */ 1259 set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff])); 1260 } 1261 1262 /* X9 */ 1263 if (prop == BIDI_PROP_RLE || prop == BIDI_PROP_LRE || 1264 prop == BIDI_PROP_RLO || prop == BIDI_PROP_LRO || 1265 prop == BIDI_PROP_PDF || prop == BIDI_PROP_BN) { 1266 set_state(STATE_LEVEL, -1, &(buf[bufoff])); 1267 } 1268 } 1269 1270 /* X10 (W1-W7, N0-N2) */ 1271 for (bufoff = 0; bufoff < buflen; bufoff++) { 1272 if (get_state(STATE_VISITED, buf[bufoff]) == 0 && 1273 get_state(STATE_LEVEL, buf[bufoff]) != -1) { 1274 bufoff += preprocess_isolating_run_sequence( 1275 buf, buflen, bufoff, paragraph_level); 1276 } 1277 } 1278 1279 /* 1280 * I1-I2 (given our sequential approach to processing the 1281 * isolating run sequences, we apply this rule separately) 1282 */ 1283 for (bufoff = 0; bufoff < buflen; bufoff++) { 1284 level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]); 1285 prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]); 1286 1287 if (level % 2 == 0) { 1288 /* even level */ 1289 if (prop == BIDI_PROP_R) { 1290 set_state(STATE_LEVEL, level + 1, 1291 &(buf[bufoff])); 1292 } else if (prop == BIDI_PROP_AN || 1293 prop == BIDI_PROP_EN) { 1294 set_state(STATE_LEVEL, level + 2, 1295 &(buf[bufoff])); 1296 } 1297 } else { 1298 /* odd level */ 1299 if (prop == BIDI_PROP_L || prop == BIDI_PROP_EN || 1300 prop == BIDI_PROP_AN) { 1301 set_state(STATE_LEVEL, level + 1, 1302 &(buf[bufoff])); 1303 } 1304 } 1305 } 1306 1307 /* L1 (rules 1-3) */ 1308 runsince = SIZE_MAX; 1309 for (bufoff = 0; bufoff < buflen; bufoff++) { 1310 level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]); 1311 prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP, 1312 buf[bufoff]); 1313 1314 if (level == -1) { 1315 /* ignored character */ 1316 continue; 1317 } 1318 1319 /* rules 1 and 2 */ 1320 if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) { 1321 set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff])); 1322 } 1323 1324 /* rule 3 */ 1325 if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI || 1326 prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI || 1327 prop == BIDI_PROP_PDI) { 1328 if (runsince == SIZE_MAX) { 1329 /* a new run has begun */ 1330 runsince = bufoff; 1331 } 1332 } else if ((prop == BIDI_PROP_S || prop == BIDI_PROP_B) && 1333 runsince != SIZE_MAX) { 1334 /* 1335 * we hit a segment or paragraph separator in a 1336 * sequence, reset sequence-levels 1337 */ 1338 for (i = runsince; i < bufoff; i++) { 1339 if (get_state(STATE_LEVEL, buf[i]) != -1) { 1340 set_state(STATE_LEVEL, paragraph_level, 1341 &(buf[i])); 1342 } 1343 } 1344 runsince = SIZE_MAX; 1345 } else { 1346 /* sequence ended */ 1347 runsince = SIZE_MAX; 1348 } 1349 } 1350 if (runsince != SIZE_MAX) { 1351 /* 1352 * this is the end of the paragraph and we 1353 * are in a run 1354 */ 1355 for (i = runsince; i < buflen; i++) { 1356 if (get_state(STATE_LEVEL, buf[i]) != -1) { 1357 set_state(STATE_LEVEL, paragraph_level, 1358 &(buf[i])); 1359 } 1360 } 1361 runsince = SIZE_MAX; 1362 } 1363 } 1364 1365 static inline uint_least8_t 1366 get_bidi_bracket_off(uint_least32_t cp) 1367 { 1368 if (likely(cp <= 0x10FFFF)) { 1369 return (uint_least8_t)((bidi_minor[bidi_major[cp >> 8] + 1370 (cp & 0xff)]) >> 1371 5); 1372 } else { 1373 return 0; 1374 } 1375 } 1376 1377 static size_t 1378 preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_direction override, 1379 uint_least32_t *buf, size_t buflen, 1380 enum grapheme_bidirectional_direction *resolved) 1381 { 1382 HERODOTUS_READER tmp; 1383 size_t bufoff, bufsize, paragraph_len; 1384 uint_least32_t cp; 1385 uint_least8_t paragraph_level; 1386 1387 /* determine length and level of the paragraph */ 1388 herodotus_reader_copy(r, &tmp); 1389 for (; herodotus_read_codepoint(&tmp, true, &cp) == 1390 HERODOTUS_STATUS_SUCCESS;) { 1391 /* break on paragraph separator */ 1392 if (get_bidi_property(cp) == BIDI_PROP_B) { 1393 break; 1394 } 1395 } 1396 paragraph_len = herodotus_reader_number_read(&tmp); 1397 paragraph_level = get_paragraph_level(override, r); 1398 1399 if (resolved != NULL) { 1400 /* store resolved paragraph level in output variable */ 1401 /* TODO use enum-type */ 1402 *resolved = (paragraph_level == 0) ? 1403 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR : 1404 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL; 1405 } 1406 1407 if (buf == NULL) { 1408 /* see below for return value reasoning */ 1409 return paragraph_len; 1410 } 1411 1412 /* 1413 * the first step is to determine the bidirectional properties 1414 * and store them in the buffer 1415 */ 1416 for (bufoff = 0; 1417 bufoff < paragraph_len && 1418 herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS; 1419 bufoff++) { 1420 if (bufoff < buflen) { 1421 /* 1422 * actually only do something when we have 1423 * space in the level-buffer. We continue 1424 * the iteration to be able to give a good 1425 * return value 1426 */ 1427 set_state(STATE_PROP, 1428 (uint_least8_t)get_bidi_property(cp), 1429 &(buf[bufoff])); 1430 set_state(STATE_BRACKET_OFF, get_bidi_bracket_off(cp), 1431 &(buf[bufoff])); 1432 set_state(STATE_LEVEL, 0, &(buf[bufoff])); 1433 set_state(STATE_PARAGRAPH_LEVEL, 0, &(buf[bufoff])); 1434 set_state(STATE_VISITED, 0, &(buf[bufoff])); 1435 set_state(STATE_PRESERVED_PROP, 1436 (uint_least8_t)get_bidi_property(cp), 1437 &(buf[bufoff])); 1438 } 1439 } 1440 bufsize = herodotus_reader_number_read(r); 1441 1442 for (bufoff = 0; bufoff < bufsize; bufoff++) { 1443 if (get_state(STATE_PROP, buf[bufoff]) != BIDI_PROP_B && 1444 bufoff != bufsize - 1) { 1445 continue; 1446 } 1447 1448 /* 1449 * we either encountered a paragraph terminator or this 1450 * is the last character in the string. 1451 * Call the paragraph handler on the paragraph, including 1452 * the terminating character or last character of the 1453 * string respectively 1454 */ 1455 preprocess_paragraph(paragraph_level, buf, bufoff + 1); 1456 break; 1457 } 1458 1459 /* 1460 * we return the number of total bytes read, as the function 1461 * should indicate if the given level-buffer is too small 1462 */ 1463 return herodotus_reader_number_read(r); 1464 } 1465 1466 size_t 1467 grapheme_bidirectional_preprocess_paragraph( 1468 const uint_least32_t *src, size_t srclen, 1469 enum grapheme_bidirectional_direction override, uint_least32_t *dest, 1470 size_t destlen, enum grapheme_bidirectional_direction *resolved) 1471 { 1472 HERODOTUS_READER r; 1473 1474 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); 1475 1476 return preprocess(&r, override, dest, destlen, resolved); 1477 } 1478 1479 static inline size_t 1480 get_line_embedding_levels(const uint_least32_t *linedata, size_t linelen, 1481 int_least8_t (*get_level)(const void *, size_t), 1482 void (*set_level)(void *, size_t, int_least8_t), 1483 void *lev, size_t levsize, bool skipignored) 1484 { 1485 enum bidi_property prop; 1486 size_t i, levlen, runsince; 1487 int_least8_t level, runlevel; 1488 1489 /* rule L1.4 */ 1490 runsince = SIZE_MAX; 1491 runlevel = -1; 1492 for (i = 0, levlen = 0; i < linelen; i++) { 1493 level = (int_least8_t)get_state(STATE_LEVEL, linedata[i]); 1494 prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP, 1495 linedata[i]); 1496 1497 /* write level into level array if we still have space */ 1498 if (level != -1 || skipignored == false) { 1499 if (levlen <= levsize) { 1500 set_level(lev, levlen, level); 1501 } 1502 levlen++; 1503 } 1504 1505 if (level == -1) { 1506 /* ignored character */ 1507 continue; 1508 } 1509 1510 if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI || 1511 prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI || 1512 prop == BIDI_PROP_PDI) { 1513 if (runsince == SIZE_MAX) { 1514 /* a new run has begun */ 1515 runsince = levlen - 1; /* levlen > 0 */ 1516 runlevel = (int_least8_t)get_state( 1517 STATE_PARAGRAPH_LEVEL, linedata[i]); 1518 } 1519 } else { 1520 /* sequence ended */ 1521 runsince = SIZE_MAX; 1522 runlevel = -1; 1523 } 1524 } 1525 if (runsince != SIZE_MAX) { 1526 /* 1527 * we hit the end of the line but were in a run; 1528 * reset the line levels to the paragraph level 1529 */ 1530 for (i = runsince; i < MIN(linelen, levlen); i++) { 1531 if (get_level(lev, i) != -1) { 1532 set_level(lev, i, runlevel); 1533 } 1534 } 1535 } 1536 1537 return levlen; 1538 } 1539 1540 static inline int_least8_t 1541 get_level_int8(const void *lev, size_t off) 1542 { 1543 return ((const int_least8_t *)lev)[off]; 1544 } 1545 1546 static inline void 1547 set_level_int8(void *lev, size_t off, int_least8_t value) 1548 { 1549 ((int_least8_t *)lev)[off] = value; 1550 } 1551 1552 size_t 1553 grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedata, 1554 size_t linelen, 1555 int_least8_t *lev, 1556 size_t levlen) 1557 { 1558 return get_line_embedding_levels(linedata, linelen, get_level_int8, 1559 set_level_int8, lev, levlen, false); 1560 } 1561 1562 static inline int_least8_t 1563 get_level_uint32(const void *lev, size_t off) 1564 { 1565 return (int_least8_t)((((const uint_least32_t *)lev)[off] & 1566 UINT32_C(0x1FE00000)) >> 1567 21) - 1568 1; 1569 } 1570 1571 static inline void 1572 set_level_uint32(void *lev, size_t off, int_least8_t value) 1573 { 1574 ((uint_least32_t *)lev)[off] ^= 1575 ((uint_least32_t *)lev)[off] & UINT32_C(0x1FE00000); 1576 ((uint_least32_t *)lev)[off] |= ((uint_least32_t)(value + 1)) << 21; 1577 } 1578 1579 static inline int_least16_t 1580 get_mirror_offset(uint_least32_t cp) 1581 { 1582 if (cp <= UINT32_C(0x10FFFF)) { 1583 return mirror_minor[mirror_major[cp >> 8] + (cp & 0xFF)]; 1584 } else { 1585 return 0; 1586 } 1587 } 1588 1589 size_t 1590 grapheme_bidirectional_reorder_line(const uint_least32_t *line, 1591 const uint_least32_t *linedata, 1592 size_t linelen, uint_least32_t *output, 1593 size_t outputsize) 1594 { 1595 size_t i, outputlen, first, last, j, k, l /*, laststart*/; 1596 int_least8_t level, min_odd_level = MAX_DEPTH + 2, max_level = 0; 1597 uint_least32_t tmp; 1598 1599 /* write output characters (and apply possible mirroring) */ 1600 for (i = 0, outputlen = 0; i < linelen; i++) { 1601 if (get_state(STATE_LEVEL, linedata[i]) != -1) { 1602 if (outputlen < outputsize) { 1603 output[outputlen] = 1604 (uint_least32_t)((int_least32_t) 1605 line[i] + 1606 get_mirror_offset( 1607 line[i])); 1608 } 1609 outputlen++; 1610 } 1611 } 1612 if (outputlen >= outputsize) { 1613 /* clear output buffer */ 1614 for (i = 0; i < outputsize; i++) { 1615 output[i] = GRAPHEME_INVALID_CODEPOINT; 1616 } 1617 1618 /* return required size */ 1619 return outputlen; 1620 } 1621 1622 /* 1623 * write line embedding levels as metadata and codepoints into the 1624 * output 1625 */ 1626 get_line_embedding_levels(linedata, linelen, get_level_uint32, 1627 set_level_uint32, output, outputsize, true); 1628 1629 /* determine level range */ 1630 for (i = 0; i < outputlen; i++) { 1631 level = get_level_uint32(output, i); 1632 1633 if (level == -1) { 1634 /* ignored character */ 1635 continue; 1636 } 1637 1638 if (level % 2 == 1 && level < min_odd_level) { 1639 min_odd_level = level; 1640 } 1641 if (level > max_level) { 1642 max_level = level; 1643 } 1644 } 1645 1646 for (level = max_level; level >= min_odd_level /* > 0 */; level--) { 1647 for (i = 0; i < outputlen; i++) { 1648 if (get_level_uint32(output, i) >= level) { 1649 /* 1650 * the current character has the desired level 1651 */ 1652 first = last = i; 1653 1654 /* find the end of the level-sequence */ 1655 for (i++; i < outputlen; i++) { 1656 if (get_level_uint32(output, i) >= 1657 level) { 1658 /* the sequence continues */ 1659 last = i; 1660 } else { 1661 break; 1662 } 1663 } 1664 1665 /* invert the sequence first..last respecting 1666 * grapheme clusters 1667 * 1668 * The standard only speaks of combining marks 1669 * inversion, but we should in the perfect case 1670 * respect _all_ grapheme clusters, which we do 1671 * here! 1672 */ 1673 1674 /* mark grapheme cluster breaks */ 1675 for (j = first; j <= last; 1676 j += grapheme_next_character_break( 1677 line + j, outputlen - j)) { 1678 /* 1679 * we use a special trick here: The 1680 * first 21 bits of the state are filled 1681 * with the codepoint, the next 8 bits 1682 * are used for the level, so we can use 1683 * the 30th bit to mark the grapheme 1684 * cluster breaks. This allows us to 1685 * reinvert the grapheme clusters into 1686 * the proper direction later. 1687 */ 1688 output[j] |= UINT32_C(1) << 29; 1689 } 1690 1691 /* global inversion */ 1692 for (k = first, l = last; k < l; k++, l--) { 1693 /* swap */ 1694 tmp = output[k]; 1695 output[k] = output[l]; 1696 output[l] = tmp; 1697 } 1698 1699 /* grapheme cluster reinversion */ 1700 #if 0 1701 for (j = first, laststart = first; j <= last; 1702 j++) { 1703 if (output[j] & (UINT32_C(1) << 29)) { 1704 /* we hit a mark! given the 1705 * grapheme cluster is inverted, 1706 * this means that the cluster 1707 * ended and we now reinvert it 1708 * again 1709 */ 1710 for (k = laststart, l = j; 1711 k < l; k++, l--) { 1712 /* swap */ 1713 tmp = output[k]; 1714 output[k] = output[l]; 1715 output[l] = tmp; 1716 } 1717 laststart = j + 1; 1718 } 1719 } 1720 #endif 1721 1722 /* unmark grapheme cluster breaks */ 1723 for (j = first; j <= last; j++) { 1724 output[j] ^= output[j] & 1725 UINT32_C(0x20000000); 1726 } 1727 } 1728 } 1729 } 1730 1731 /* remove embedding level metadata */ 1732 for (i = 0; i < outputlen; i++) { 1733 output[i] ^= output[i] & UINT32_C(0x1FE00000); 1734 } 1735 1736 return outputlen; 1737 }