libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

bidirectional.c (48331B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <stdbool.h>
      3 #include <stddef.h>
      4 
      5 #include "../gen/bidirectional.h"
      6 #include "../grapheme.h"
      7 #include "util.h"
      8 
      9 #define MAX_DEPTH 125
     10 
     11 enum state_type {
     12 	STATE_PROP,            /* in 0..23, bidi_property */
     13 	STATE_PRESERVED_PROP,  /* in 0..23, preserved bidi_prop for L1-rule */
     14 	STATE_BRACKET_OFF,     /* in 0..255, offset in bidi_bracket */
     15 	STATE_LEVEL,           /* in 0..MAX_DEPTH+1=126, embedding level */
     16 	STATE_PARAGRAPH_LEVEL, /* in 0..1, paragraph embedding level */
     17 	STATE_VISITED,         /* in 0..1, visited within isolating run */
     18 };
     19 
     20 static struct {
     21 	uint_least32_t filter_mask;
     22 	size_t mask_shift;
     23 	int_least16_t value_offset;
     24 } state_lut[] = {
     25 	[STATE_PROP] = {
     26 		.filter_mask  = 0x000001F, /* 00000000 00000000 00000000 00011111 */
     27 		.mask_shift   = 0,
     28 		.value_offset = 0,
     29 	},
     30 	[STATE_PRESERVED_PROP] = {
     31 		.filter_mask  = 0x00003E0, /* 00000000 00000000 00000011 11100000 */
     32 		.mask_shift   = 5,
     33 		.value_offset = 0,
     34 	},
     35 	[STATE_BRACKET_OFF] = {
     36 		.filter_mask  = 0x003FC00, /* 00000000 00000011 11111100 00000000 */
     37 		.mask_shift   = 10,
     38 		.value_offset = 0,
     39 	},
     40 	[STATE_LEVEL] = {
     41 		.filter_mask  = 0x1FC0000, /* 00000001 11111100 00000000 00000000 */
     42 		.mask_shift   = 18,
     43 		.value_offset = -1,
     44 	},
     45 	[STATE_PARAGRAPH_LEVEL] = {
     46 		.filter_mask  = 0x2000000, /* 00000010 00000000 00000000 00000000 */
     47 		.mask_shift   = 25,
     48 		.value_offset = 0,
     49 	},
     50 	[STATE_VISITED] = {
     51 		.filter_mask  = 0x4000000, /* 00000100 00000000 00000000 00000000 */
     52 		.mask_shift   = 26,
     53 		.value_offset = 0,
     54 	},
     55 };
     56 
     57 static inline int_least16_t
     58 get_state(enum state_type t, uint_least32_t input)
     59 {
     60 	return (int_least16_t)((input & state_lut[t].filter_mask) >>
     61 	                       state_lut[t].mask_shift) +
     62 	       state_lut[t].value_offset;
     63 }
     64 
     65 static inline void
     66 set_state(enum state_type t, int_least16_t value, uint_least32_t *output)
     67 {
     68 	*output &= ~state_lut[t].filter_mask;
     69 	*output |= ((uint_least32_t)(value - state_lut[t].value_offset)
     70 	            << state_lut[t].mask_shift) &
     71 	           state_lut[t].filter_mask;
     72 }
     73 
     74 struct isolate_runner {
     75 	uint_least32_t *buf;
     76 	size_t buflen;
     77 	size_t start;
     78 
     79 	struct {
     80 		size_t off;
     81 	} prev, cur, next;
     82 
     83 	enum bidi_property sos, eos;
     84 
     85 	uint_least8_t paragraph_level;
     86 	int_least8_t isolating_run_level;
     87 };
     88 
     89 static inline enum bidi_property
     90 ir_get_previous_prop(const struct isolate_runner *ir)
     91 {
     92 	return (ir->prev.off == SIZE_MAX) ?
     93 	               ir->sos :
     94 	               (uint_least8_t)get_state(STATE_PROP,
     95 	                                        ir->buf[ir->prev.off]);
     96 }
     97 
     98 static inline enum bidi_property
     99 ir_get_current_prop(const struct isolate_runner *ir)
    100 {
    101 	return (uint_least8_t)get_state(STATE_PROP, ir->buf[ir->cur.off]);
    102 }
    103 
    104 static inline enum bidi_property
    105 ir_get_next_prop(const struct isolate_runner *ir)
    106 {
    107 	return (ir->next.off == SIZE_MAX) ?
    108 	               ir->eos :
    109 	               (uint_least8_t)get_state(STATE_PROP,
    110 	                                        ir->buf[ir->next.off]);
    111 }
    112 
    113 static inline enum bidi_property
    114 ir_get_current_preserved_prop(const struct isolate_runner *ir)
    115 {
    116 	return (uint_least8_t)get_state(STATE_PRESERVED_PROP,
    117 	                                ir->buf[ir->cur.off]);
    118 }
    119 
    120 static inline int_least8_t
    121 ir_get_current_level(const struct isolate_runner *ir)
    122 {
    123 	return (int_least8_t)get_state(STATE_LEVEL, ir->buf[ir->cur.off]);
    124 }
    125 
    126 static inline const struct bracket *
    127 ir_get_current_bracket_prop(const struct isolate_runner *ir)
    128 {
    129 	return bidi_bracket +
    130 	       (int_least8_t)get_state(STATE_BRACKET_OFF, ir->buf[ir->cur.off]);
    131 }
    132 
    133 static void
    134 ir_set_current_prop(const struct isolate_runner *ir, enum bidi_property prop)
    135 {
    136 	set_state(STATE_PROP, (int_least16_t)prop, &(ir->buf[ir->cur.off]));
    137 }
    138 
    139 static void
    140 ir_init(uint_least32_t *buf, size_t buflen, size_t off,
    141         uint_least8_t paragraph_level, bool within, struct isolate_runner *ir)
    142 {
    143 	size_t i;
    144 	int_least8_t sos_level;
    145 
    146 	/* initialize invariants */
    147 	ir->buf = buf;
    148 	ir->buflen = buflen;
    149 	ir->paragraph_level = paragraph_level;
    150 	ir->start = off;
    151 
    152 	/* advance off until we are at a non-removed character */
    153 	for (; off < buflen; off++) {
    154 		if (get_state(STATE_LEVEL, buf[off]) != -1) {
    155 			break;
    156 		}
    157 	}
    158 	if (off == buflen) {
    159 		/* we encountered no more non-removed character, terminate */
    160 		ir->next.off = SIZE_MAX;
    161 		return;
    162 	}
    163 
    164 	/* set the isolating run level to that of the current offset */
    165 	ir->isolating_run_level =
    166 		(int_least8_t)get_state(STATE_LEVEL, buf[off]);
    167 
    168 	/* initialize sos and eos to dummy values */
    169 	ir->sos = ir->eos = NUM_BIDI_PROPS;
    170 
    171 	/*
    172 	 * we write the information of the "current" state into next,
    173 	 * so that the shift-in at the first advancement moves it in
    174 	 * cur, as desired.
    175 	 */
    176 	ir->next.off = off;
    177 
    178 	/*
    179 	 * determine the previous state but store its offset in cur.off,
    180 	 * given it's shifted in on the first advancement
    181 	 */
    182 	ir->cur.off = SIZE_MAX;
    183 	for (i = off, sos_level = -1; i >= 1; i--) {
    184 		if (get_state(STATE_LEVEL, buf[i - 1]) != -1) {
    185 			/*
    186 			 * we found a character that has not been
    187 			 * removed in X9
    188 			 */
    189 			sos_level = (int_least8_t)get_state(STATE_LEVEL,
    190 			                                    buf[i - 1]);
    191 
    192 			if (within) {
    193 				/* we just take it */
    194 				ir->cur.off = i;
    195 			}
    196 
    197 			break;
    198 		}
    199 	}
    200 	if (sos_level == -1) {
    201 		/*
    202 		 * there were no preceding non-removed characters, set
    203 		 * sos-level to paragraph embedding level
    204 		 */
    205 		sos_level = (int_least8_t)paragraph_level;
    206 	}
    207 
    208 	if (!within || ir->cur.off == SIZE_MAX) {
    209 		/*
    210 		 * we are at the beginning of the sequence; initialize
    211 		 * it faithfully according to the algorithm by looking
    212 		 * at the sos-level
    213 		 */
    214 		if (MAX(sos_level, ir->isolating_run_level) % 2 == 0) {
    215 			/* the higher level is even, set sos to L */
    216 			ir->sos = BIDI_PROP_L;
    217 		} else {
    218 			/* the higher level is odd, set sos to R */
    219 			ir->sos = BIDI_PROP_R;
    220 		}
    221 	}
    222 }
    223 
    224 static int
    225 ir_advance(struct isolate_runner *ir)
    226 {
    227 	enum bidi_property prop;
    228 	int_least8_t level, isolate_level, last_isolate_level;
    229 	size_t i;
    230 
    231 	if (ir->next.off == SIZE_MAX) {
    232 		/* the sequence is over */
    233 		return 1;
    234 	}
    235 
    236 	/* shift in */
    237 	ir->prev.off = ir->cur.off;
    238 	ir->cur.off = ir->next.off;
    239 
    240 	/* mark as visited */
    241 	set_state(STATE_VISITED, 1, &(ir->buf[ir->cur.off]));
    242 
    243 	/* initialize next state by going to the next character in the sequence
    244 	 */
    245 	ir->next.off = SIZE_MAX;
    246 
    247 	last_isolate_level = -1;
    248 	for (i = ir->cur.off, isolate_level = 0; i < ir->buflen; i++) {
    249 		level = (int_least8_t)get_state(STATE_LEVEL, ir->buf[i]);
    250 		prop = (uint_least8_t)get_state(STATE_PROP, ir->buf[i]);
    251 
    252 		if (level == -1) {
    253 			/* this is one of the ignored characters, skip */
    254 			continue;
    255 		} else if (level == ir->isolating_run_level) {
    256 			last_isolate_level = level;
    257 		}
    258 
    259 		/* follow BD8/BD9 and P2 to traverse the current sequence */
    260 		if (prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
    261 		    prop == BIDI_PROP_FSI) {
    262 			/*
    263 			 * we encountered an isolate initiator, increment
    264 			 * counter, but go into processing when we
    265 			 * were not isolated before
    266 			 */
    267 			if (isolate_level < MAX_DEPTH) {
    268 				isolate_level++;
    269 			}
    270 			if (isolate_level != 1) {
    271 				continue;
    272 			}
    273 		} else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
    274 			isolate_level--;
    275 
    276 			/*
    277 			 * if the current PDI dropped the isolate-level
    278 			 * to zero, it is itself part of the isolating
    279 			 * run sequence; otherwise we simply continue.
    280 			 */
    281 			if (isolate_level > 0) {
    282 				continue;
    283 			}
    284 		} else if (isolate_level > 0) {
    285 			/* we are in an isolating sequence */
    286 			continue;
    287 		}
    288 
    289 		/*
    290 		 * now we either still are in our sequence or we hit
    291 		 * the eos-case as we left the sequence and hit the
    292 		 * first non-isolating-sequence character.
    293 		 */
    294 		if (i == ir->cur.off) {
    295 			/* we were in the first initializing round */
    296 			continue;
    297 		} else if (level == ir->isolating_run_level) {
    298 			/* isolate_level-skips have been handled before, we're
    299 			 * good */
    300 			/* still in the sequence */
    301 			ir->next.off = i;
    302 		} else {
    303 			/* out of sequence or isolated, compare levels via eos
    304 			 */
    305 			ir->next.off = SIZE_MAX;
    306 			if (MAX(last_isolate_level, level) % 2 == 0) {
    307 				ir->eos = BIDI_PROP_L;
    308 			} else {
    309 				ir->eos = BIDI_PROP_R;
    310 			}
    311 		}
    312 		break;
    313 	}
    314 	if (i == ir->buflen) {
    315 		/*
    316 		 * the sequence ended before we could grab an offset.
    317 		 * we need to determine the eos-prop by comparing the
    318 		 * level of the last element in the isolating run sequence
    319 		 * with the paragraph level.
    320 		 */
    321 		ir->next.off = SIZE_MAX;
    322 		if (MAX(last_isolate_level, ir->paragraph_level) % 2 == 0) {
    323 			/* the higher level is even, set eos to L */
    324 			ir->eos = BIDI_PROP_L;
    325 		} else {
    326 			/* the higher level is odd, set eos to R */
    327 			ir->eos = BIDI_PROP_R;
    328 		}
    329 	}
    330 
    331 	return 0;
    332 }
    333 
    334 static enum bidi_property
    335 ir_get_last_strong_prop(const struct isolate_runner *ir)
    336 {
    337 	struct isolate_runner tmp;
    338 	enum bidi_property last_strong_prop = ir->sos, prop;
    339 
    340 	ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, false,
    341 	        &tmp);
    342 	for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) {
    343 		prop = ir_get_current_prop(&tmp);
    344 
    345 		if (prop == BIDI_PROP_R || prop == BIDI_PROP_L ||
    346 		    prop == BIDI_PROP_AL) {
    347 			last_strong_prop = prop;
    348 		}
    349 	}
    350 
    351 	return last_strong_prop;
    352 }
    353 
    354 static enum bidi_property
    355 ir_get_last_strong_or_number_prop(const struct isolate_runner *ir)
    356 {
    357 	struct isolate_runner tmp;
    358 	enum bidi_property last_strong_or_number_prop = ir->sos, prop;
    359 
    360 	ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, false,
    361 	        &tmp);
    362 	for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) {
    363 		prop = ir_get_current_prop(&tmp);
    364 
    365 		if (prop == BIDI_PROP_R || prop == BIDI_PROP_L ||
    366 		    prop == BIDI_PROP_AL || prop == BIDI_PROP_EN ||
    367 		    prop == BIDI_PROP_AN) {
    368 			last_strong_or_number_prop = prop;
    369 		}
    370 	}
    371 
    372 	return last_strong_or_number_prop;
    373 }
    374 
    375 static void
    376 preprocess_bracket_pair(const struct isolate_runner *start,
    377                         const struct isolate_runner *end)
    378 {
    379 	enum bidi_property prop, bracket_prop, last_strong_or_number_prop;
    380 	struct isolate_runner ir;
    381 	size_t strong_type_off;
    382 
    383 	/*
    384 	 * check if the bracket contains a strong type (L or R|EN|AN)
    385 	 */
    386 	for (ir = *start, strong_type_off = SIZE_MAX,
    387 	    bracket_prop = NUM_BIDI_PROPS;
    388 	     !ir_advance(&ir) && ir.cur.off < end->cur.off;) {
    389 		prop = ir_get_current_prop(&ir);
    390 
    391 		if (prop == BIDI_PROP_L) {
    392 			strong_type_off = ir.cur.off;
    393 			if (ir.isolating_run_level % 2 == 0) {
    394 				/*
    395 				 * set the type for both brackets to L (so they
    396 				 * match the strong type they contain)
    397 				 */
    398 				bracket_prop = BIDI_PROP_L;
    399 			}
    400 		} else if (prop == BIDI_PROP_R || prop == BIDI_PROP_EN ||
    401 		           prop == BIDI_PROP_AN) {
    402 			strong_type_off = ir.cur.off;
    403 			if (ir.isolating_run_level % 2 != 0) {
    404 				/*
    405 				 * set the type for both brackets to R (so they
    406 				 * match the strong type they contain)
    407 				 */
    408 				bracket_prop = BIDI_PROP_R;
    409 			}
    410 		}
    411 	}
    412 	if (strong_type_off == SIZE_MAX) {
    413 		/*
    414 		 * there are no strong types within the brackets and we just
    415 		 * leave the brackets as is
    416 		 */
    417 		return;
    418 	}
    419 
    420 	if (bracket_prop == NUM_BIDI_PROPS) {
    421 		/*
    422 		 * We encountered a strong type, but it was opposite
    423 		 * to the embedding direction.
    424 		 * Check the previous strong type before the opening
    425 		 * bracket
    426 		 */
    427 		last_strong_or_number_prop =
    428 			ir_get_last_strong_or_number_prop(start);
    429 		if (last_strong_or_number_prop == BIDI_PROP_L &&
    430 		    ir.isolating_run_level % 2 != 0) {
    431 			/*
    432 			 * the previous strong type is also opposite
    433 			 * to the embedding direction, so the context
    434 			 * was established and we set the brackets
    435 			 * accordingly.
    436 			 */
    437 			bracket_prop = BIDI_PROP_L;
    438 		} else if ((last_strong_or_number_prop == BIDI_PROP_R ||
    439 		            last_strong_or_number_prop == BIDI_PROP_EN ||
    440 		            last_strong_or_number_prop == BIDI_PROP_AN) &&
    441 		           ir.isolating_run_level % 2 == 0) {
    442 			/*
    443 			 * the previous strong type is also opposite
    444 			 * to the embedding direction, so the context
    445 			 * was established and we set the brackets
    446 			 * accordingly.
    447 			 */
    448 			bracket_prop = BIDI_PROP_R;
    449 		} else {
    450 			/* set brackets to the embedding direction */
    451 			if (ir.isolating_run_level % 2 == 0) {
    452 				bracket_prop = BIDI_PROP_L;
    453 			} else {
    454 				bracket_prop = BIDI_PROP_R;
    455 			}
    456 		}
    457 	}
    458 
    459 	ir_set_current_prop(start, bracket_prop);
    460 	ir_set_current_prop(end, bracket_prop);
    461 
    462 	/*
    463 	 * any sequence of NSMs after opening or closing brackets get
    464 	 * the same property as the one we set on the brackets
    465 	 */
    466 	for (ir = *start; !ir_advance(&ir) && ir_get_current_preserved_prop(
    467 						      &ir) == BIDI_PROP_NSM;) {
    468 		ir_set_current_prop(&ir, bracket_prop);
    469 	}
    470 	for (ir = *end; !ir_advance(&ir) &&
    471 	                ir_get_current_preserved_prop(&ir) == BIDI_PROP_NSM;) {
    472 		ir_set_current_prop(&ir, bracket_prop);
    473 	}
    474 }
    475 
    476 static void
    477 preprocess_bracket_pairs(uint_least32_t *buf, size_t buflen, size_t off,
    478                          uint_least8_t paragraph_level)
    479 {
    480 	/*
    481 	 * The N0-rule deals with bracket pairs that shall be determined
    482 	 * with the rule BD16. This is specified as an algorithm with a
    483 	 * stack of 63 bracket openings that are used to resolve into a
    484 	 * separate list of pairs, which is then to be sorted by opening
    485 	 * position. Thus, even though the bracketing-depth is limited
    486 	 * by 63, the algorithm, as is, requires dynamic memory
    487 	 * management.
    488 	 *
    489 	 * A naive approach (used by Fribidi) would be to screw the
    490 	 * stack-approach and simply directly determine the
    491 	 * corresponding closing bracket offset for a given opening
    492 	 * bracket, leading to O(n²) time complexity in the worst case
    493 	 * with a lot of brackets. While many brackets are not common,
    494 	 * it is still possible to find a middle ground where you obtain
    495 	 * strongly linear time complexity in most common cases:
    496 	 *
    497 	 * Instead of a stack, we use a FIFO data structure which is
    498 	 * filled with bracket openings in the order of appearance (thus
    499 	 * yielding an implicit sorting!) at the top. If the
    500 	 * corresponding closing bracket is encountered, it is added to
    501 	 * the respective entry, making it ready to "move out" at the
    502 	 * bottom (i.e. passed to the bracket processing). Due to the
    503 	 * nature of the specified pair detection algorithm, which only
    504 	 * cares about the bracket type and nothing else (bidi class,
    505 	 * level, etc.), we can mix processing and bracket detection.
    506 	 *
    507 	 * Obviously, if you, for instance, have one big bracket pair at
    508 	 * the bottom that has not been closed yet, it will block the
    509 	 * bracket processing and the FIFO might hit its capacity limit.
    510 	 * At this point, the blockage is manually resolved using the
    511 	 * naive quadratic approach.
    512 	 *
    513 	 * To remain within the specified standard behaviour, which
    514 	 * mandates that processing of brackets should stop when the
    515 	 * bracketing-depth is at 63, we simply check in an "overflow"
    516 	 * scenario if all 63 elements in the LIFO are unfinished, which
    517 	 * corresponds with such a bracketing depth.
    518 	 */
    519 	enum bidi_property prop;
    520 
    521 	struct {
    522 		bool complete;
    523 		size_t bracket_class;
    524 		struct isolate_runner start;
    525 		struct isolate_runner end;
    526 	} fifo[63];
    527 	const struct bracket *bracket_prop, *tmp_bracket_prop;
    528 	struct isolate_runner ir, tmp_ir;
    529 	size_t fifo_len = 0, i, blevel, j, k;
    530 
    531 	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    532 	while (!ir_advance(&ir)) {
    533 		prop = ir_get_current_prop(&ir);
    534 		bracket_prop = ir_get_current_bracket_prop(&ir);
    535 		if (prop == BIDI_PROP_ON &&
    536 		    bracket_prop->type == BIDI_BRACKET_OPEN) {
    537 			if (fifo_len == LEN(fifo)) {
    538 				/*
    539 				 * The FIFO is full, check first if it's
    540 				 * completely blocked (i.e. no finished
    541 				 * bracket pairs, triggering the standard
    542 				 * that mandates to abort in such a case
    543 				 */
    544 				for (i = 0; i < fifo_len; i++) {
    545 					if (fifo[i].complete) {
    546 						break;
    547 					}
    548 				}
    549 				if (i == fifo_len) {
    550 					/* abort processing */
    551 					return;
    552 				}
    553 
    554 				/*
    555 				 * by construction, the bottom entry
    556 				 * in the FIFO is guaranteed to be
    557 				 * unfinished (given we "consume" all
    558 				 * finished bottom entries after each
    559 				 * iteration).
    560 				 *
    561 				 * iterate, starting after the opening
    562 				 * bracket, and find the corresponding
    563 				 * closing bracket.
    564 				 *
    565 				 * if we find none, just drop the FIFO
    566 				 * entry silently
    567 				 */
    568 				for (tmp_ir = fifo[0].start, blevel = 0;
    569 				     !ir_advance(&tmp_ir);) {
    570 					tmp_bracket_prop =
    571 						ir_get_current_bracket_prop(
    572 							&tmp_ir);
    573 
    574 					if (tmp_bracket_prop->type ==
    575 					            BIDI_BRACKET_OPEN &&
    576 					    tmp_bracket_prop->class ==
    577 					            bracket_prop->class) {
    578 						/* we encountered another
    579 						 * opening bracket of the
    580 						 * same class */
    581 						blevel++;
    582 
    583 					} else if (tmp_bracket_prop->type ==
    584 					                   BIDI_BRACKET_CLOSE &&
    585 					           tmp_bracket_prop->class ==
    586 					                   bracket_prop
    587 					                           ->class) {
    588 						/* we encountered a closing
    589 						 * bracket of the same class
    590 						 */
    591 						if (blevel == 0) {
    592 							/* this is the
    593 							 * corresponding
    594 							 * closing bracket
    595 							 */
    596 							fifo[0].complete = true;
    597 							fifo[0].end = ir;
    598 						} else {
    599 							blevel--;
    600 						}
    601 					}
    602 				}
    603 				if (fifo[0].complete) {
    604 					/* we found the matching bracket */
    605 					preprocess_bracket_pair(
    606 						&(fifo[i].start),
    607 						&(fifo[i].end));
    608 				}
    609 
    610 				/* shift FIFO one to the left */
    611 				for (i = 1; i < fifo_len; i++) {
    612 					fifo[i - 1] = fifo[i];
    613 				}
    614 				fifo_len--;
    615 			}
    616 
    617 			/* add element to the FIFO */
    618 			fifo_len++;
    619 			fifo[fifo_len - 1].complete = false;
    620 			fifo[fifo_len - 1].bracket_class = bracket_prop->class;
    621 			fifo[fifo_len - 1].start = ir;
    622 		} else if (prop == BIDI_PROP_ON &&
    623 		           bracket_prop->type == BIDI_BRACKET_CLOSE) {
    624 			/*
    625 			 * go backwards in the FIFO, skip finished entries
    626 			 * and simply ignore (do nothing) the closing
    627 			 * bracket if we do not match anything
    628 			 */
    629 			for (i = fifo_len; i > 0; i--) {
    630 				if (bracket_prop->class ==
    631 				            fifo[i - 1].bracket_class &&
    632 				    !fifo[i - 1].complete) {
    633 					/* we have found a pair */
    634 					fifo[i - 1].complete = true;
    635 					fifo[i - 1].end = ir;
    636 
    637 					/* remove all uncompleted FIFO elements
    638 					 * above i - 1 */
    639 					for (j = i; j < fifo_len;) {
    640 						if (fifo[j].complete) {
    641 							j++;
    642 							continue;
    643 						}
    644 
    645 						/* shift FIFO one to the left */
    646 						for (k = j + 1; k < fifo_len;
    647 						     k++) {
    648 							fifo[k - 1] = fifo[k];
    649 						}
    650 						fifo_len--;
    651 					}
    652 					break;
    653 				}
    654 			}
    655 		}
    656 
    657 		/* process all ready bracket pairs from the FIFO bottom */
    658 		while (fifo_len > 0 && fifo[0].complete) {
    659 			preprocess_bracket_pair(&(fifo[0].start),
    660 			                        &(fifo[0].end));
    661 
    662 			/* shift FIFO one to the left */
    663 			for (j = 0; j + 1 < fifo_len; j++) {
    664 				fifo[j] = fifo[j + 1];
    665 			}
    666 			fifo_len--;
    667 		}
    668 	}
    669 
    670 	/*
    671 	 * afterwards, we still might have unfinished bracket pairs
    672 	 * that will remain as such, but the subsequent finished pairs
    673 	 * also need to be taken into account, so we go through the
    674 	 * FIFO once more and process all finished pairs
    675 	 */
    676 	for (i = 0; i < fifo_len; i++) {
    677 		if (fifo[i].complete) {
    678 			preprocess_bracket_pair(&(fifo[i].start),
    679 			                        &(fifo[i].end));
    680 		}
    681 	}
    682 }
    683 
    684 static size_t
    685 preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen,
    686                                   size_t off, uint_least8_t paragraph_level)
    687 {
    688 	enum bidi_property sequence_prop, prop;
    689 	struct isolate_runner ir, tmp;
    690 	size_t runsince, sequence_end;
    691 
    692 	/* W1 */
    693 	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    694 	while (!ir_advance(&ir)) {
    695 		if (ir_get_current_prop(&ir) == BIDI_PROP_NSM) {
    696 			prop = ir_get_previous_prop(&ir);
    697 
    698 			if (prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
    699 			    prop == BIDI_PROP_FSI || prop == BIDI_PROP_PDI) {
    700 				ir_set_current_prop(&ir, BIDI_PROP_ON);
    701 			} else {
    702 				ir_set_current_prop(&ir, prop);
    703 			}
    704 		}
    705 	}
    706 
    707 	/* W2 */
    708 	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    709 	while (!ir_advance(&ir)) {
    710 		if (ir_get_current_prop(&ir) == BIDI_PROP_EN &&
    711 		    ir_get_last_strong_prop(&ir) == BIDI_PROP_AL) {
    712 			ir_set_current_prop(&ir, BIDI_PROP_AN);
    713 		}
    714 	}
    715 
    716 	/* W3 */
    717 	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    718 	while (!ir_advance(&ir)) {
    719 		if (ir_get_current_prop(&ir) == BIDI_PROP_AL) {
    720 			ir_set_current_prop(&ir, BIDI_PROP_R);
    721 		}
    722 	}
    723 
    724 	/* W4 */
    725 	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    726 	while (!ir_advance(&ir)) {
    727 		if (ir_get_previous_prop(&ir) == BIDI_PROP_EN &&
    728 		    (ir_get_current_prop(&ir) == BIDI_PROP_ES ||
    729 		     ir_get_current_prop(&ir) == BIDI_PROP_CS) &&
    730 		    ir_get_next_prop(&ir) == BIDI_PROP_EN) {
    731 			ir_set_current_prop(&ir, BIDI_PROP_EN);
    732 		}
    733 
    734 		if (ir_get_previous_prop(&ir) == BIDI_PROP_AN &&
    735 		    ir_get_current_prop(&ir) == BIDI_PROP_CS &&
    736 		    ir_get_next_prop(&ir) == BIDI_PROP_AN) {
    737 			ir_set_current_prop(&ir, BIDI_PROP_AN);
    738 		}
    739 	}
    740 
    741 	/* W5 */
    742 	runsince = SIZE_MAX;
    743 	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    744 	while (!ir_advance(&ir)) {
    745 		if (ir_get_current_prop(&ir) == BIDI_PROP_ET) {
    746 			if (runsince == SIZE_MAX) {
    747 				/* a new run has begun */
    748 				runsince = ir.cur.off;
    749 			}
    750 		} else if (ir_get_current_prop(&ir) == BIDI_PROP_EN) {
    751 			/* set the preceding sequence */
    752 			if (runsince != SIZE_MAX) {
    753 				ir_init(buf, buflen, runsince, paragraph_level,
    754 				        (runsince > off), &tmp);
    755 				while (!ir_advance(&tmp) &&
    756 				       tmp.cur.off < ir.cur.off) {
    757 					ir_set_current_prop(&tmp, BIDI_PROP_EN);
    758 				}
    759 				runsince = SIZE_MAX;
    760 			} else {
    761 				ir_init(buf, buflen, ir.cur.off,
    762 				        paragraph_level, (ir.cur.off > off),
    763 				        &tmp);
    764 				ir_advance(&tmp);
    765 			}
    766 			/* follow the succeeding sequence */
    767 			while (!ir_advance(&tmp)) {
    768 				if (ir_get_current_prop(&tmp) != BIDI_PROP_ET) {
    769 					break;
    770 				}
    771 				ir_set_current_prop(&tmp, BIDI_PROP_EN);
    772 			}
    773 		} else {
    774 			/* sequence ended */
    775 			runsince = SIZE_MAX;
    776 		}
    777 	}
    778 
    779 	/* W6 */
    780 	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    781 	while (!ir_advance(&ir)) {
    782 		prop = ir_get_current_prop(&ir);
    783 
    784 		if (prop == BIDI_PROP_ES || prop == BIDI_PROP_ET ||
    785 		    prop == BIDI_PROP_CS) {
    786 			ir_set_current_prop(&ir, BIDI_PROP_ON);
    787 		}
    788 	}
    789 
    790 	/* W7 */
    791 	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    792 	while (!ir_advance(&ir)) {
    793 		if (ir_get_current_prop(&ir) == BIDI_PROP_EN &&
    794 		    ir_get_last_strong_prop(&ir) == BIDI_PROP_L) {
    795 			ir_set_current_prop(&ir, BIDI_PROP_L);
    796 		}
    797 	}
    798 
    799 	/* N0 */
    800 	preprocess_bracket_pairs(buf, buflen, off, paragraph_level);
    801 
    802 	/* N1 */
    803 	sequence_end = SIZE_MAX;
    804 	sequence_prop = NUM_BIDI_PROPS;
    805 	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    806 	while (!ir_advance(&ir)) {
    807 		if (sequence_end == SIZE_MAX) {
    808 			prop = ir_get_current_prop(&ir);
    809 
    810 			if (prop == BIDI_PROP_B || prop == BIDI_PROP_S ||
    811 			    prop == BIDI_PROP_WS || prop == BIDI_PROP_ON ||
    812 			    prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI ||
    813 			    prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) {
    814 				/* the current character is an NI (neutral
    815 				 * or isolate) */
    816 
    817 				/* scan ahead to the end of the NI-sequence
    818 				 */
    819 				ir_init(buf, buflen, ir.cur.off,
    820 				        paragraph_level, (ir.cur.off > off),
    821 				        &tmp);
    822 				while (!ir_advance(&tmp)) {
    823 					prop = ir_get_next_prop(&tmp);
    824 
    825 					if (prop != BIDI_PROP_B &&
    826 					    prop != BIDI_PROP_S &&
    827 					    prop != BIDI_PROP_WS &&
    828 					    prop != BIDI_PROP_ON &&
    829 					    prop != BIDI_PROP_FSI &&
    830 					    prop != BIDI_PROP_LRI &&
    831 					    prop != BIDI_PROP_RLI &&
    832 					    prop != BIDI_PROP_PDI) {
    833 						break;
    834 					}
    835 				}
    836 
    837 				/*
    838 				 * check what follows and see if the text
    839 				 * has the same direction on both sides
    840 				 */
    841 				if (ir_get_previous_prop(&ir) == BIDI_PROP_L &&
    842 				    ir_get_next_prop(&tmp) == BIDI_PROP_L) {
    843 					sequence_end = tmp.cur.off;
    844 					sequence_prop = BIDI_PROP_L;
    845 				} else if ((ir_get_previous_prop(&ir) ==
    846 				                    BIDI_PROP_R ||
    847 				            ir_get_previous_prop(&ir) ==
    848 				                    BIDI_PROP_EN ||
    849 				            ir_get_previous_prop(&ir) ==
    850 				                    BIDI_PROP_AN) &&
    851 				           (ir_get_next_prop(&tmp) ==
    852 				                    BIDI_PROP_R ||
    853 				            ir_get_next_prop(&tmp) ==
    854 				                    BIDI_PROP_EN ||
    855 				            ir_get_next_prop(&tmp) ==
    856 				                    BIDI_PROP_AN)) {
    857 					sequence_end = tmp.cur.off;
    858 					sequence_prop = BIDI_PROP_R;
    859 				}
    860 			}
    861 		}
    862 
    863 		if (sequence_end != SIZE_MAX) {
    864 			if (ir.cur.off <= sequence_end) {
    865 				ir_set_current_prop(&ir, sequence_prop);
    866 			} else {
    867 				/* end of sequence, reset */
    868 				sequence_end = SIZE_MAX;
    869 				sequence_prop = NUM_BIDI_PROPS;
    870 			}
    871 		}
    872 	}
    873 
    874 	/* N2 */
    875 	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    876 	while (!ir_advance(&ir)) {
    877 		prop = ir_get_current_prop(&ir);
    878 
    879 		if (prop == BIDI_PROP_B || prop == BIDI_PROP_S ||
    880 		    prop == BIDI_PROP_WS || prop == BIDI_PROP_ON ||
    881 		    prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI ||
    882 		    prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) {
    883 			/* N2 */
    884 			if (ir_get_current_level(&ir) % 2 == 0) {
    885 				/* even embedding level */
    886 				ir_set_current_prop(&ir, BIDI_PROP_L);
    887 			} else {
    888 				/* odd embedding level */
    889 				ir_set_current_prop(&ir, BIDI_PROP_R);
    890 			}
    891 		}
    892 	}
    893 
    894 	return 0;
    895 }
    896 
    897 static uint_least8_t
    898 get_isolated_paragraph_level(const uint_least32_t *state, size_t statelen)
    899 {
    900 	enum bidi_property prop;
    901 	int_least8_t isolate_level;
    902 	size_t stateoff;
    903 
    904 	/* determine paragraph level (rules P1-P3) and terminate on PDI */
    905 	for (stateoff = 0, isolate_level = 0; stateoff < statelen; stateoff++) {
    906 		prop = (uint_least8_t)get_state(STATE_PROP, state[stateoff]);
    907 
    908 		if (prop == BIDI_PROP_PDI && isolate_level == 0) {
    909 			/*
    910 			 * we are in a FSI-subsection of a paragraph and
    911 			 * matched with the terminating PDI
    912 			 */
    913 			break;
    914 		}
    915 
    916 		/* BD8/BD9 */
    917 		if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
    918 		     prop == BIDI_PROP_FSI) &&
    919 		    isolate_level < MAX_DEPTH) {
    920 			/* we hit an isolate initiator, increment counter */
    921 			isolate_level++;
    922 		} else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
    923 			isolate_level--;
    924 		}
    925 
    926 		/* P2 */
    927 		if (isolate_level > 0) {
    928 			continue;
    929 		}
    930 
    931 		/* P3 */
    932 		if (prop == BIDI_PROP_L) {
    933 			return 0;
    934 		} else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) {
    935 			return 1;
    936 		}
    937 	}
    938 
    939 	return 0;
    940 }
    941 
    942 static inline uint_least8_t
    943 get_bidi_property(uint_least32_t cp)
    944 {
    945 	if (likely(cp <= 0x10FFFF)) {
    946 		return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) &
    947 		       0x1F /* 00011111 */;
    948 	} else {
    949 		return BIDI_PROP_L;
    950 	}
    951 }
    952 
    953 static uint_least8_t
    954 get_paragraph_level(enum grapheme_bidirectional_direction override,
    955                     const HERODOTUS_READER *r)
    956 {
    957 	HERODOTUS_READER tmp;
    958 	enum bidi_property prop;
    959 	int_least8_t isolate_level;
    960 	uint_least32_t cp;
    961 
    962 	/* check overrides first according to rule HL1 */
    963 	if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
    964 		return 0;
    965 	} else if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
    966 		return 1;
    967 	}
    968 
    969 	/* copy reader into temporary reader */
    970 	herodotus_reader_copy(r, &tmp);
    971 
    972 	/* determine paragraph level (rules P1-P3) */
    973 	for (isolate_level = 0; herodotus_read_codepoint(&tmp, true, &cp) ==
    974 	                        HERODOTUS_STATUS_SUCCESS;) {
    975 		prop = get_bidi_property(cp);
    976 
    977 		/* BD8/BD9 */
    978 		if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
    979 		     prop == BIDI_PROP_FSI) &&
    980 		    isolate_level < MAX_DEPTH) {
    981 			/* we hit an isolate initiator, increment counter */
    982 			isolate_level++;
    983 		} else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
    984 			isolate_level--;
    985 		}
    986 
    987 		/* P2 */
    988 		if (isolate_level > 0) {
    989 			continue;
    990 		}
    991 
    992 		/* P3 */
    993 		if (prop == BIDI_PROP_L) {
    994 			return 0;
    995 		} else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) {
    996 			return 1;
    997 		}
    998 	}
    999 
   1000 	return 0;
   1001 }
   1002 
   1003 static void
   1004 preprocess_paragraph(uint_least8_t paragraph_level, uint_least32_t *buf,
   1005                      size_t buflen)
   1006 {
   1007 	enum bidi_property prop;
   1008 	int_least8_t level;
   1009 
   1010 	struct {
   1011 		int_least8_t level;
   1012 		enum grapheme_bidirectional_direction override;
   1013 		bool directional_isolate;
   1014 	} directional_status[MAX_DEPTH + 2], *dirstat = directional_status;
   1015 
   1016 	size_t overflow_isolate_count, overflow_embedding_count,
   1017 		valid_isolate_count, bufoff, i, runsince;
   1018 
   1019 	/* X1 */
   1020 	dirstat->level = (int_least8_t)paragraph_level;
   1021 	dirstat->override = GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
   1022 	dirstat->directional_isolate = false;
   1023 	overflow_isolate_count = overflow_embedding_count =
   1024 		valid_isolate_count = 0;
   1025 
   1026 	for (bufoff = 0; bufoff < buflen; bufoff++) {
   1027 		prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]);
   1028 
   1029 		/* set paragraph level we need for line-level-processing */
   1030 		set_state(STATE_PARAGRAPH_LEVEL, paragraph_level,
   1031 		          &(buf[bufoff]));
   1032 again:
   1033 		if (prop == BIDI_PROP_RLE) {
   1034 			/* X2 */
   1035 			if (dirstat->level + (dirstat->level % 2 != 0) + 1 <=
   1036 			            MAX_DEPTH &&
   1037 			    overflow_isolate_count == 0 &&
   1038 			    overflow_embedding_count == 0) {
   1039 				/* valid RLE */
   1040 				dirstat++;
   1041 				dirstat->level =
   1042 					(dirstat - 1)->level +
   1043 					((dirstat - 1)->level % 2 != 0) + 1;
   1044 				dirstat->override =
   1045 					GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
   1046 				dirstat->directional_isolate = false;
   1047 			} else {
   1048 				/* overflow RLE */
   1049 				overflow_embedding_count +=
   1050 					(overflow_isolate_count == 0);
   1051 			}
   1052 		} else if (prop == BIDI_PROP_LRE) {
   1053 			/* X3 */
   1054 			if (dirstat->level + (dirstat->level % 2 == 0) + 1 <=
   1055 			            MAX_DEPTH &&
   1056 			    overflow_isolate_count == 0 &&
   1057 			    overflow_embedding_count == 0) {
   1058 				/* valid LRE */
   1059 				dirstat++;
   1060 				dirstat->level =
   1061 					(dirstat - 1)->level +
   1062 					((dirstat - 1)->level % 2 == 0) + 1;
   1063 				dirstat->override =
   1064 					GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
   1065 				dirstat->directional_isolate = false;
   1066 			} else {
   1067 				/* overflow LRE */
   1068 				overflow_embedding_count +=
   1069 					(overflow_isolate_count == 0);
   1070 			}
   1071 		} else if (prop == BIDI_PROP_RLO) {
   1072 			/* X4 */
   1073 			if (dirstat->level + (dirstat->level % 2 != 0) + 1 <=
   1074 			            MAX_DEPTH &&
   1075 			    overflow_isolate_count == 0 &&
   1076 			    overflow_embedding_count == 0) {
   1077 				/* valid RLO */
   1078 				dirstat++;
   1079 				dirstat->level =
   1080 					(dirstat - 1)->level +
   1081 					((dirstat - 1)->level % 2 != 0) + 1;
   1082 				dirstat->override =
   1083 					GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
   1084 				dirstat->directional_isolate = false;
   1085 			} else {
   1086 				/* overflow RLO */
   1087 				overflow_embedding_count +=
   1088 					(overflow_isolate_count == 0);
   1089 			}
   1090 		} else if (prop == BIDI_PROP_LRO) {
   1091 			/* X5 */
   1092 			if (dirstat->level + (dirstat->level % 2 == 0) + 1 <=
   1093 			            MAX_DEPTH &&
   1094 			    overflow_isolate_count == 0 &&
   1095 			    overflow_embedding_count == 0) {
   1096 				/* valid LRE */
   1097 				dirstat++;
   1098 				dirstat->level =
   1099 					(dirstat - 1)->level +
   1100 					((dirstat - 1)->level % 2 == 0) + 1;
   1101 				dirstat->override =
   1102 					GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
   1103 				dirstat->directional_isolate = false;
   1104 			} else {
   1105 				/* overflow LRO */
   1106 				overflow_embedding_count +=
   1107 					(overflow_isolate_count == 0);
   1108 			}
   1109 		} else if (prop == BIDI_PROP_RLI) {
   1110 			/* X5a */
   1111 			set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
   1112 			if (dirstat->override ==
   1113 			    GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
   1114 				set_state(STATE_PROP, BIDI_PROP_L,
   1115 				          &(buf[bufoff]));
   1116 			} else if (dirstat->override ==
   1117 			           GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
   1118 				set_state(STATE_PROP, BIDI_PROP_R,
   1119 				          &(buf[bufoff]));
   1120 			}
   1121 
   1122 			if (dirstat->level + (dirstat->level % 2 != 0) + 1 <=
   1123 			            MAX_DEPTH &&
   1124 			    overflow_isolate_count == 0 &&
   1125 			    overflow_embedding_count == 0) {
   1126 				/* valid RLI */
   1127 				valid_isolate_count++;
   1128 
   1129 				dirstat++;
   1130 				dirstat->level =
   1131 					(dirstat - 1)->level +
   1132 					((dirstat - 1)->level % 2 != 0) + 1;
   1133 				dirstat->override =
   1134 					GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
   1135 				dirstat->directional_isolate = true;
   1136 			} else {
   1137 				/* overflow RLI */
   1138 				overflow_isolate_count++;
   1139 			}
   1140 		} else if (prop == BIDI_PROP_LRI) {
   1141 			/* X5b */
   1142 			set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
   1143 			if (dirstat->override ==
   1144 			    GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
   1145 				set_state(STATE_PROP, BIDI_PROP_L,
   1146 				          &(buf[bufoff]));
   1147 			} else if (dirstat->override ==
   1148 			           GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
   1149 				set_state(STATE_PROP, BIDI_PROP_R,
   1150 				          &(buf[bufoff]));
   1151 			}
   1152 
   1153 			if (dirstat->level + (dirstat->level % 2 == 0) + 1 <=
   1154 			            MAX_DEPTH &&
   1155 			    overflow_isolate_count == 0 &&
   1156 			    overflow_embedding_count == 0) {
   1157 				/* valid LRI */
   1158 				valid_isolate_count++;
   1159 
   1160 				dirstat++;
   1161 				dirstat->level =
   1162 					(dirstat - 1)->level +
   1163 					((dirstat - 1)->level % 2 == 0) + 1;
   1164 				dirstat->override =
   1165 					GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
   1166 				dirstat->directional_isolate = true;
   1167 			} else {
   1168 				/* overflow LRI */
   1169 				overflow_isolate_count++;
   1170 			}
   1171 		} else if (prop == BIDI_PROP_FSI) {
   1172 			/* X5c */
   1173 			if (get_isolated_paragraph_level(
   1174 				    buf + (bufoff + 1),
   1175 				    buflen - (bufoff + 1)) == 1) {
   1176 				prop = BIDI_PROP_RLI;
   1177 				goto again;
   1178 			} else { /* ... == 0 */
   1179 				prop = BIDI_PROP_LRI;
   1180 				goto again;
   1181 			}
   1182 		} else if (prop != BIDI_PROP_B && prop != BIDI_PROP_BN &&
   1183 		           prop != BIDI_PROP_PDF && prop != BIDI_PROP_PDI) {
   1184 			/* X6 */
   1185 			set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
   1186 			if (dirstat->override ==
   1187 			    GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
   1188 				set_state(STATE_PROP, BIDI_PROP_L,
   1189 				          &(buf[bufoff]));
   1190 			} else if (dirstat->override ==
   1191 			           GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
   1192 				set_state(STATE_PROP, BIDI_PROP_R,
   1193 				          &(buf[bufoff]));
   1194 			}
   1195 		} else if (prop == BIDI_PROP_PDI) {
   1196 			/* X6a */
   1197 			if (overflow_isolate_count > 0) {
   1198 				/* PDI matches an overflow isolate initiator
   1199 				 */
   1200 				overflow_isolate_count--;
   1201 			} else if (valid_isolate_count > 0) {
   1202 				/* PDI matches a normal isolate initiator */
   1203 				overflow_embedding_count = 0;
   1204 				while (dirstat->directional_isolate == false &&
   1205 				       dirstat > directional_status) {
   1206 					/*
   1207 					 * we are safe here as given the
   1208 					 * valid isolate count is positive
   1209 					 * there must be a stack-entry
   1210 					 * with positive directional
   1211 					 * isolate status, but we take
   1212 					 * no chances and include an
   1213 					 * explicit check
   1214 					 *
   1215 					 * POSSIBLE OPTIMIZATION: Whenever
   1216 					 * we push on the stack, check if it
   1217 					 * has the directional isolate
   1218 					 * status true and store a pointer
   1219 					 * to it so we can jump to it very
   1220 					 * quickly.
   1221 					 */
   1222 					dirstat--;
   1223 				}
   1224 
   1225 				/*
   1226 				 * as above, the following check is not
   1227 				 * necessary, given we are guaranteed to
   1228 				 * have at least one stack entry left,
   1229 				 * but it's better to be safe
   1230 				 */
   1231 				if (dirstat > directional_status) {
   1232 					dirstat--;
   1233 				}
   1234 				valid_isolate_count--;
   1235 			}
   1236 
   1237 			set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
   1238 			if (dirstat->override ==
   1239 			    GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
   1240 				set_state(STATE_PROP, BIDI_PROP_L,
   1241 				          &(buf[bufoff]));
   1242 			} else if (dirstat->override ==
   1243 			           GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
   1244 				set_state(STATE_PROP, BIDI_PROP_R,
   1245 				          &(buf[bufoff]));
   1246 			}
   1247 		} else if (prop == BIDI_PROP_PDF) {
   1248 			/* X7 */
   1249 			if (overflow_isolate_count > 0) {
   1250 				/* do nothing */
   1251 			} else if (overflow_embedding_count > 0) {
   1252 				overflow_embedding_count--;
   1253 			} else if (dirstat->directional_isolate == false &&
   1254 			           dirstat > directional_status) {
   1255 				dirstat--;
   1256 			}
   1257 		} else if (prop == BIDI_PROP_B) {
   1258 			/* X8 */
   1259 			set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff]));
   1260 		}
   1261 
   1262 		/* X9 */
   1263 		if (prop == BIDI_PROP_RLE || prop == BIDI_PROP_LRE ||
   1264 		    prop == BIDI_PROP_RLO || prop == BIDI_PROP_LRO ||
   1265 		    prop == BIDI_PROP_PDF || prop == BIDI_PROP_BN) {
   1266 			set_state(STATE_LEVEL, -1, &(buf[bufoff]));
   1267 		}
   1268 	}
   1269 
   1270 	/* X10 (W1-W7, N0-N2) */
   1271 	for (bufoff = 0; bufoff < buflen; bufoff++) {
   1272 		if (get_state(STATE_VISITED, buf[bufoff]) == 0 &&
   1273 		    get_state(STATE_LEVEL, buf[bufoff]) != -1) {
   1274 			bufoff += preprocess_isolating_run_sequence(
   1275 				buf, buflen, bufoff, paragraph_level);
   1276 		}
   1277 	}
   1278 
   1279 	/*
   1280 	 * I1-I2 (given our sequential approach to processing the
   1281 	 * isolating run sequences, we apply this rule separately)
   1282 	 */
   1283 	for (bufoff = 0; bufoff < buflen; bufoff++) {
   1284 		level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]);
   1285 		prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]);
   1286 
   1287 		if (level % 2 == 0) {
   1288 			/* even level */
   1289 			if (prop == BIDI_PROP_R) {
   1290 				set_state(STATE_LEVEL, level + 1,
   1291 				          &(buf[bufoff]));
   1292 			} else if (prop == BIDI_PROP_AN ||
   1293 			           prop == BIDI_PROP_EN) {
   1294 				set_state(STATE_LEVEL, level + 2,
   1295 				          &(buf[bufoff]));
   1296 			}
   1297 		} else {
   1298 			/* odd level */
   1299 			if (prop == BIDI_PROP_L || prop == BIDI_PROP_EN ||
   1300 			    prop == BIDI_PROP_AN) {
   1301 				set_state(STATE_LEVEL, level + 1,
   1302 				          &(buf[bufoff]));
   1303 			}
   1304 		}
   1305 	}
   1306 
   1307 	/* L1 (rules 1-3) */
   1308 	runsince = SIZE_MAX;
   1309 	for (bufoff = 0; bufoff < buflen; bufoff++) {
   1310 		level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]);
   1311 		prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP,
   1312 		                                buf[bufoff]);
   1313 
   1314 		if (level == -1) {
   1315 			/* ignored character */
   1316 			continue;
   1317 		}
   1318 
   1319 		/* rules 1 and 2 */
   1320 		if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) {
   1321 			set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff]));
   1322 		}
   1323 
   1324 		/* rule 3 */
   1325 		if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI ||
   1326 		    prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
   1327 		    prop == BIDI_PROP_PDI) {
   1328 			if (runsince == SIZE_MAX) {
   1329 				/* a new run has begun */
   1330 				runsince = bufoff;
   1331 			}
   1332 		} else if ((prop == BIDI_PROP_S || prop == BIDI_PROP_B) &&
   1333 		           runsince != SIZE_MAX) {
   1334 			/*
   1335 			 * we hit a segment or paragraph separator in a
   1336 			 * sequence, reset sequence-levels
   1337 			 */
   1338 			for (i = runsince; i < bufoff; i++) {
   1339 				if (get_state(STATE_LEVEL, buf[i]) != -1) {
   1340 					set_state(STATE_LEVEL, paragraph_level,
   1341 					          &(buf[i]));
   1342 				}
   1343 			}
   1344 			runsince = SIZE_MAX;
   1345 		} else {
   1346 			/* sequence ended */
   1347 			runsince = SIZE_MAX;
   1348 		}
   1349 	}
   1350 	if (runsince != SIZE_MAX) {
   1351 		/*
   1352 		 * this is the end of the paragraph and we
   1353 		 * are in a run
   1354 		 */
   1355 		for (i = runsince; i < buflen; i++) {
   1356 			if (get_state(STATE_LEVEL, buf[i]) != -1) {
   1357 				set_state(STATE_LEVEL, paragraph_level,
   1358 				          &(buf[i]));
   1359 			}
   1360 		}
   1361 		runsince = SIZE_MAX;
   1362 	}
   1363 }
   1364 
   1365 static inline uint_least8_t
   1366 get_bidi_bracket_off(uint_least32_t cp)
   1367 {
   1368 	if (likely(cp <= 0x10FFFF)) {
   1369 		return (uint_least8_t)((bidi_minor[bidi_major[cp >> 8] +
   1370 		                                   (cp & 0xff)]) >>
   1371 		                       5);
   1372 	} else {
   1373 		return 0;
   1374 	}
   1375 }
   1376 
   1377 static size_t
   1378 preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_direction override,
   1379            uint_least32_t *buf, size_t buflen,
   1380            enum grapheme_bidirectional_direction *resolved)
   1381 {
   1382 	HERODOTUS_READER tmp;
   1383 	size_t bufoff, bufsize, paragraph_len;
   1384 	uint_least32_t cp;
   1385 	uint_least8_t paragraph_level;
   1386 
   1387 	/* determine length and level of the paragraph */
   1388 	herodotus_reader_copy(r, &tmp);
   1389 	for (; herodotus_read_codepoint(&tmp, true, &cp) ==
   1390 	       HERODOTUS_STATUS_SUCCESS;) {
   1391 		/* break on paragraph separator */
   1392 		if (get_bidi_property(cp) == BIDI_PROP_B) {
   1393 			break;
   1394 		}
   1395 	}
   1396 	paragraph_len = herodotus_reader_number_read(&tmp);
   1397 	paragraph_level = get_paragraph_level(override, r);
   1398 
   1399 	if (resolved != NULL) {
   1400 		/* store resolved paragraph level in output variable */
   1401 		/* TODO use enum-type */
   1402 		*resolved = (paragraph_level == 0) ?
   1403 		                    GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR :
   1404 		                    GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
   1405 	}
   1406 
   1407 	if (buf == NULL) {
   1408 		/* see below for return value reasoning */
   1409 		return paragraph_len;
   1410 	}
   1411 
   1412 	/*
   1413 	 * the first step is to determine the bidirectional properties
   1414 	 * and store them in the buffer
   1415 	 */
   1416 	for (bufoff = 0;
   1417 	     bufoff < paragraph_len &&
   1418 	     herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;
   1419 	     bufoff++) {
   1420 		if (bufoff < buflen) {
   1421 			/*
   1422 			 * actually only do something when we have
   1423 			 * space in the level-buffer. We continue
   1424 			 * the iteration to be able to give a good
   1425 			 * return value
   1426 			 */
   1427 			set_state(STATE_PROP,
   1428 			          (uint_least8_t)get_bidi_property(cp),
   1429 			          &(buf[bufoff]));
   1430 			set_state(STATE_BRACKET_OFF, get_bidi_bracket_off(cp),
   1431 			          &(buf[bufoff]));
   1432 			set_state(STATE_LEVEL, 0, &(buf[bufoff]));
   1433 			set_state(STATE_PARAGRAPH_LEVEL, 0, &(buf[bufoff]));
   1434 			set_state(STATE_VISITED, 0, &(buf[bufoff]));
   1435 			set_state(STATE_PRESERVED_PROP,
   1436 			          (uint_least8_t)get_bidi_property(cp),
   1437 			          &(buf[bufoff]));
   1438 		}
   1439 	}
   1440 	bufsize = herodotus_reader_number_read(r);
   1441 
   1442 	for (bufoff = 0; bufoff < bufsize; bufoff++) {
   1443 		if (get_state(STATE_PROP, buf[bufoff]) != BIDI_PROP_B &&
   1444 		    bufoff != bufsize - 1) {
   1445 			continue;
   1446 		}
   1447 
   1448 		/*
   1449 		 * we either encountered a paragraph terminator or this
   1450 		 * is the last character in the string.
   1451 		 * Call the paragraph handler on the paragraph, including
   1452 		 * the terminating character or last character of the
   1453 		 * string respectively
   1454 		 */
   1455 		preprocess_paragraph(paragraph_level, buf, bufoff + 1);
   1456 		break;
   1457 	}
   1458 
   1459 	/*
   1460 	 * we return the number of total bytes read, as the function
   1461 	 * should indicate if the given level-buffer is too small
   1462 	 */
   1463 	return herodotus_reader_number_read(r);
   1464 }
   1465 
   1466 size_t
   1467 grapheme_bidirectional_preprocess_paragraph(
   1468 	const uint_least32_t *src, size_t srclen,
   1469 	enum grapheme_bidirectional_direction override, uint_least32_t *dest,
   1470 	size_t destlen, enum grapheme_bidirectional_direction *resolved)
   1471 {
   1472 	HERODOTUS_READER r;
   1473 
   1474 	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
   1475 
   1476 	return preprocess(&r, override, dest, destlen, resolved);
   1477 }
   1478 
   1479 static inline size_t
   1480 get_line_embedding_levels(const uint_least32_t *linedata, size_t linelen,
   1481                           int_least8_t (*get_level)(const void *, size_t),
   1482                           void (*set_level)(void *, size_t, int_least8_t),
   1483                           void *lev, size_t levsize, bool skipignored)
   1484 {
   1485 	enum bidi_property prop;
   1486 	size_t i, levlen, runsince;
   1487 	int_least8_t level, runlevel;
   1488 
   1489 	/* rule L1.4 */
   1490 	runsince = SIZE_MAX;
   1491 	runlevel = -1;
   1492 	for (i = 0, levlen = 0; i < linelen; i++) {
   1493 		level = (int_least8_t)get_state(STATE_LEVEL, linedata[i]);
   1494 		prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP,
   1495 		                                linedata[i]);
   1496 
   1497 		/* write level into level array if we still have space */
   1498 		if (level != -1 || skipignored == false) {
   1499 			if (levlen <= levsize) {
   1500 				set_level(lev, levlen, level);
   1501 			}
   1502 			levlen++;
   1503 		}
   1504 
   1505 		if (level == -1) {
   1506 			/* ignored character */
   1507 			continue;
   1508 		}
   1509 
   1510 		if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI ||
   1511 		    prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
   1512 		    prop == BIDI_PROP_PDI) {
   1513 			if (runsince == SIZE_MAX) {
   1514 				/* a new run has begun */
   1515 				runsince = levlen - 1; /* levlen > 0 */
   1516 				runlevel = (int_least8_t)get_state(
   1517 					STATE_PARAGRAPH_LEVEL, linedata[i]);
   1518 			}
   1519 		} else {
   1520 			/* sequence ended */
   1521 			runsince = SIZE_MAX;
   1522 			runlevel = -1;
   1523 		}
   1524 	}
   1525 	if (runsince != SIZE_MAX) {
   1526 		/*
   1527 		 * we hit the end of the line but were in a run;
   1528 		 * reset the line levels to the paragraph level
   1529 		 */
   1530 		for (i = runsince; i < MIN(linelen, levlen); i++) {
   1531 			if (get_level(lev, i) != -1) {
   1532 				set_level(lev, i, runlevel);
   1533 			}
   1534 		}
   1535 	}
   1536 
   1537 	return levlen;
   1538 }
   1539 
   1540 static inline int_least8_t
   1541 get_level_int8(const void *lev, size_t off)
   1542 {
   1543 	return ((const int_least8_t *)lev)[off];
   1544 }
   1545 
   1546 static inline void
   1547 set_level_int8(void *lev, size_t off, int_least8_t value)
   1548 {
   1549 	((int_least8_t *)lev)[off] = value;
   1550 }
   1551 
   1552 size_t
   1553 grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedata,
   1554                                                  size_t linelen,
   1555                                                  int_least8_t *lev,
   1556                                                  size_t levlen)
   1557 {
   1558 	return get_line_embedding_levels(linedata, linelen, get_level_int8,
   1559 	                                 set_level_int8, lev, levlen, false);
   1560 }
   1561 
   1562 static inline int_least8_t
   1563 get_level_uint32(const void *lev, size_t off)
   1564 {
   1565 	return (int_least8_t)((((const uint_least32_t *)lev)[off] &
   1566 	                       UINT32_C(0x1FE00000)) >>
   1567 	                      21) -
   1568 	       1;
   1569 }
   1570 
   1571 static inline void
   1572 set_level_uint32(void *lev, size_t off, int_least8_t value)
   1573 {
   1574 	((uint_least32_t *)lev)[off] ^=
   1575 		((uint_least32_t *)lev)[off] & UINT32_C(0x1FE00000);
   1576 	((uint_least32_t *)lev)[off] |= ((uint_least32_t)(value + 1)) << 21;
   1577 }
   1578 
   1579 static inline int_least16_t
   1580 get_mirror_offset(uint_least32_t cp)
   1581 {
   1582 	if (cp <= UINT32_C(0x10FFFF)) {
   1583 		return mirror_minor[mirror_major[cp >> 8] + (cp & 0xFF)];
   1584 	} else {
   1585 		return 0;
   1586 	}
   1587 }
   1588 
   1589 size_t
   1590 grapheme_bidirectional_reorder_line(const uint_least32_t *line,
   1591                                     const uint_least32_t *linedata,
   1592                                     size_t linelen, uint_least32_t *output,
   1593                                     size_t outputsize)
   1594 {
   1595 	size_t i, outputlen, first, last, j, k, l /*, laststart*/;
   1596 	int_least8_t level, min_odd_level = MAX_DEPTH + 2, max_level = 0;
   1597 	uint_least32_t tmp;
   1598 
   1599 	/* write output characters (and apply possible mirroring) */
   1600 	for (i = 0, outputlen = 0; i < linelen; i++) {
   1601 		if (get_state(STATE_LEVEL, linedata[i]) != -1) {
   1602 			if (outputlen < outputsize) {
   1603 				output[outputlen] =
   1604 					(uint_least32_t)((int_least32_t)
   1605 				                                 line[i] +
   1606 				                         get_mirror_offset(
   1607 								 line[i]));
   1608 			}
   1609 			outputlen++;
   1610 		}
   1611 	}
   1612 	if (outputlen >= outputsize) {
   1613 		/* clear output buffer */
   1614 		for (i = 0; i < outputsize; i++) {
   1615 			output[i] = GRAPHEME_INVALID_CODEPOINT;
   1616 		}
   1617 
   1618 		/* return required size */
   1619 		return outputlen;
   1620 	}
   1621 
   1622 	/*
   1623 	 * write line embedding levels as metadata and codepoints into the
   1624 	 * output
   1625 	 */
   1626 	get_line_embedding_levels(linedata, linelen, get_level_uint32,
   1627 	                          set_level_uint32, output, outputsize, true);
   1628 
   1629 	/* determine level range */
   1630 	for (i = 0; i < outputlen; i++) {
   1631 		level = get_level_uint32(output, i);
   1632 
   1633 		if (level == -1) {
   1634 			/* ignored character */
   1635 			continue;
   1636 		}
   1637 
   1638 		if (level % 2 == 1 && level < min_odd_level) {
   1639 			min_odd_level = level;
   1640 		}
   1641 		if (level > max_level) {
   1642 			max_level = level;
   1643 		}
   1644 	}
   1645 
   1646 	for (level = max_level; level >= min_odd_level /* > 0 */; level--) {
   1647 		for (i = 0; i < outputlen; i++) {
   1648 			if (get_level_uint32(output, i) >= level) {
   1649 				/*
   1650 				 * the current character has the desired level
   1651 				 */
   1652 				first = last = i;
   1653 
   1654 				/* find the end of the level-sequence */
   1655 				for (i++; i < outputlen; i++) {
   1656 					if (get_level_uint32(output, i) >=
   1657 					    level) {
   1658 						/* the sequence continues */
   1659 						last = i;
   1660 					} else {
   1661 						break;
   1662 					}
   1663 				}
   1664 
   1665 				/* invert the sequence first..last respecting
   1666 				 * grapheme clusters
   1667 				 *
   1668 				 * The standard only speaks of combining marks
   1669 				 * inversion, but we should in the perfect case
   1670 				 * respect _all_ grapheme clusters, which we do
   1671 				 * here!
   1672 				 */
   1673 
   1674 				/* mark grapheme cluster breaks */
   1675 				for (j = first; j <= last;
   1676 				     j += grapheme_next_character_break(
   1677 					     line + j, outputlen - j)) {
   1678 					/*
   1679 					 * we use a special trick here: The
   1680 					 * first 21 bits of the state are filled
   1681 					 * with the codepoint, the next 8 bits
   1682 					 * are used for the level, so we can use
   1683 					 * the 30th bit to mark the grapheme
   1684 					 * cluster breaks. This allows us to
   1685 					 * reinvert the grapheme clusters into
   1686 					 * the proper direction later.
   1687 					 */
   1688 					output[j] |= UINT32_C(1) << 29;
   1689 				}
   1690 
   1691 				/* global inversion */
   1692 				for (k = first, l = last; k < l; k++, l--) {
   1693 					/* swap */
   1694 					tmp = output[k];
   1695 					output[k] = output[l];
   1696 					output[l] = tmp;
   1697 				}
   1698 
   1699 				/* grapheme cluster reinversion */
   1700 #if 0
   1701 				for (j = first, laststart = first; j <= last;
   1702 				     j++) {
   1703 					if (output[j] & (UINT32_C(1) << 29)) {
   1704 						/* we hit a mark! given the
   1705 						 * grapheme cluster is inverted,
   1706 						 * this means that the cluster
   1707 						 * ended and we now reinvert it
   1708 						 * again
   1709 						 */
   1710 						for (k = laststart, l = j;
   1711 						     k < l; k++, l--) {
   1712 							/* swap */
   1713 							tmp = output[k];
   1714 							output[k] = output[l];
   1715 							output[l] = tmp;
   1716 						}
   1717 						laststart = j + 1;
   1718 					}
   1719 				}
   1720 #endif
   1721 
   1722 				/* unmark grapheme cluster breaks */
   1723 				for (j = first; j <= last; j++) {
   1724 					output[j] ^= output[j] &
   1725 					             UINT32_C(0x20000000);
   1726 				}
   1727 			}
   1728 		}
   1729 	}
   1730 
   1731 	/* remove embedding level metadata */
   1732 	for (i = 0; i < outputlen; i++) {
   1733 		output[i] ^= output[i] & UINT32_C(0x1FE00000);
   1734 	}
   1735 
   1736 	return outputlen;
   1737 }