libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

character.c (18706B)


      1 #include <stdio.h>
      2 
      3 /* See LICENSE file for copyright and license details. */
      4 #include <limits.h>
      5 #include <stdbool.h>
      6 #include <stddef.h>
      7 
      8 #include "../gen/character.h"
      9 #include "../grapheme.h"
     10 #include "util.h"
     11 
     12 struct character_break_state {
     13 	uint_least8_t prop;
     14 	bool prop_set;
     15 	bool gb11_flag;
     16 	bool gb12_13_flag;
     17 	uint_least8_t gb9c_level;
     18 };
     19 
     20 static const uint_least32_t dont_break[NUM_CHAR_BREAK_PROPS] = {
     21 	[CHAR_BREAK_PROP_OTHER] =
     22 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
     23 		UINT32_C(1)
     24 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
     25 		UINT32_C(1)
     26 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
     27 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
     28 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
     29 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
     30 	[CHAR_BREAK_PROP_ICB_CONSONANT] =
     31 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
     32 		UINT32_C(1)
     33 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
     34 		UINT32_C(1)
     35 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
     36 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
     37 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
     38 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
     39 	[CHAR_BREAK_PROP_ICB_EXTEND] =
     40 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
     41 		UINT32_C(1)
     42 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
     43 		UINT32_C(1)
     44 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
     45 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
     46 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
     47 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
     48 	[CHAR_BREAK_PROP_ICB_LINKER] =
     49 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
     50 		UINT32_C(1)
     51 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
     52 		UINT32_C(1)
     53 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
     54 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
     55 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
     56 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
     57 	[CHAR_BREAK_PROP_CR] = UINT32_C(1) << CHAR_BREAK_PROP_LF,    /* GB3  */
     58 	[CHAR_BREAK_PROP_EXTEND] =
     59 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
     60 		UINT32_C(1)
     61 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
     62 		UINT32_C(1)
     63 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
     64 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
     65 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
     66 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
     67 	[CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND] =
     68 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
     69 		UINT32_C(1)
     70 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
     71 		UINT32_C(1)
     72 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
     73 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
     74 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
     75 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
     76 	[CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER] =
     77 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
     78 		UINT32_C(1)
     79 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
     80 		UINT32_C(1)
     81 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
     82 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
     83 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
     84 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
     85 	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
     86 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
     87 		UINT32_C(1)
     88 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
     89 		UINT32_C(1)
     90 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
     91 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
     92 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
     93 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
     94 	[CHAR_BREAK_PROP_HANGUL_L] =
     95 		UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_L |   /* GB6  */
     96 		UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V |   /* GB6  */
     97 		UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LV |  /* GB6  */
     98 		UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6  */
     99 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
    100 		UINT32_C(1)
    101 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
    102 		UINT32_C(1)
    103 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
    104 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
    105 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
    106 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
    107 	[CHAR_BREAK_PROP_HANGUL_V] =
    108 		UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7  */
    109 		UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7  */
    110 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |   /* GB9  */
    111 		UINT32_C(1)
    112 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
    113 		UINT32_C(1)
    114 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
    115 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
    116 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
    117 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
    118 	[CHAR_BREAK_PROP_HANGUL_T] =
    119 		UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8  */
    120 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |   /* GB9  */
    121 		UINT32_C(1)
    122 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
    123 		UINT32_C(1)
    124 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
    125 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
    126 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
    127 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
    128 	[CHAR_BREAK_PROP_HANGUL_LV] =
    129 		UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7  */
    130 		UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7  */
    131 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |   /* GB9  */
    132 		UINT32_C(1)
    133 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
    134 		UINT32_C(1)
    135 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
    136 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
    137 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
    138 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
    139 	[CHAR_BREAK_PROP_HANGUL_LVT] =
    140 		UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8  */
    141 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |   /* GB9  */
    142 		UINT32_C(1)
    143 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
    144 		UINT32_C(1)
    145 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
    146 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
    147 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
    148 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
    149 	[CHAR_BREAK_PROP_PREPEND] =
    150 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
    151 		UINT32_C(1)
    152 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
    153 		UINT32_C(1)
    154 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
    155 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
    156 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
    157 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK |         /* GB9a */
    158 		(UINT32_C(0xFFFFFFFF) &
    159 	         ~(UINT32_C(1) << CHAR_BREAK_PROP_CR |
    160 	           UINT32_C(1) << CHAR_BREAK_PROP_LF |
    161 	           UINT32_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */
    162 	[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
    163 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
    164 		UINT32_C(1)
    165 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
    166 		UINT32_C(1)
    167 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
    168 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
    169 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
    170 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
    171 	[CHAR_BREAK_PROP_SPACINGMARK] =
    172 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
    173 		UINT32_C(1)
    174 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
    175 		UINT32_C(1)
    176 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
    177 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
    178 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
    179 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
    180 	[CHAR_BREAK_PROP_ZWJ] =
    181 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
    182 		UINT32_C(1)
    183 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
    184 		UINT32_C(1)
    185 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
    186 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
    187 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
    188 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
    189 	[CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND] =
    190 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
    191 		UINT32_C(1)
    192 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
    193 		UINT32_C(1)
    194 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
    195 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
    196 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
    197 		UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
    198 
    199 };
    200 static const uint_least32_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
    201 	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
    202 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
    203 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
    204 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |              /* GB9  */
    205 		UINT32_C(1)
    206 			<< CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
    207 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER, /* GB9 */
    208 	[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
    209 		UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
    210 	[CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
    211 		UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
    212 	[CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
    213 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
    214 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
    215 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
    216 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
    217 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
    218 	[CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
    219 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
    220 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
    221 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
    222 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
    223 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
    224 	[CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER + NUM_CHAR_BREAK_PROPS] =
    225 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
    226 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
    227 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
    228 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
    229 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
    230 	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
    231 		UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
    232 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND |
    233 		UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
    234 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
    235 		UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER,
    236 };
    237 static const uint_least32_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
    238 	[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
    239 		UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
    240 	[CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
    241 		UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
    242 };
    243 static const uint_least32_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
    244 	[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
    245 		UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
    246 };
    247 static const uint_least32_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
    248 	[CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
    249 		UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
    250 };
    251 
    252 static inline enum char_break_property
    253 get_break_prop(uint_least32_t cp)
    254 {
    255 	if (likely(cp <= UINT32_C(0x10FFFF))) {
    256 		return (enum char_break_property)
    257 			char_break_minor[char_break_major[cp >> 8] +
    258 		                         (cp & 0xFF)];
    259 	} else {
    260 		return CHAR_BREAK_PROP_OTHER;
    261 	}
    262 }
    263 
    264 static inline void
    265 state_serialize(const struct character_break_state *in, uint_least16_t *out)
    266 {
    267 	*out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 bits */
    268 	       (uint_least16_t)(((uint_least16_t)(in->prop_set))
    269 	                        << 8) | /* 9th bit */
    270 	       (uint_least16_t)(((uint_least16_t)(in->gb11_flag))
    271 	                        << 9) | /* 10th bit */
    272 	       (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag))
    273 	                        << 10) | /* 11th bit */
    274 	       (uint_least16_t)(((uint_least16_t)(in->gb9c_level & 0x3))
    275 	                        << 11); /* 12th and 13th bit */
    276 }
    277 
    278 static inline void
    279 state_deserialize(uint_least16_t in, struct character_break_state *out)
    280 {
    281 	out->prop = in & UINT8_C(0xFF);
    282 	out->prop_set = in & (UINT16_C(1) << 8);
    283 	out->gb11_flag = in & (UINT16_C(1) << 9);
    284 	out->gb12_13_flag = in & (UINT16_C(1) << 10);
    285 	out->gb9c_level = (uint_least8_t)(in >> 11) & UINT8_C(0x3);
    286 }
    287 
    288 bool
    289 grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1,
    290                             uint_least16_t *s)
    291 {
    292 	struct character_break_state state;
    293 	enum char_break_property cp0_prop, cp1_prop;
    294 	bool notbreak = false;
    295 
    296 	if (likely(s)) {
    297 		state_deserialize(*s, &state);
    298 
    299 		if (likely(state.prop_set)) {
    300 			cp0_prop = state.prop;
    301 		} else {
    302 			cp0_prop = get_break_prop(cp0);
    303 		}
    304 		cp1_prop = get_break_prop(cp1);
    305 
    306 		/* preserve prop of right codepoint for next iteration */
    307 		state.prop = (uint_least8_t)cp1_prop;
    308 		state.prop_set = true;
    309 
    310 		/* update flags */
    311 		state.gb11_flag =
    312 			flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
    313 		                                            state.gb11_flag] &
    314 			UINT32_C(1) << cp1_prop;
    315 		state.gb12_13_flag =
    316 			flag_update_gb12_13[cp0_prop +
    317 		                            NUM_CHAR_BREAK_PROPS *
    318 		                                    state.gb12_13_flag] &
    319 			UINT32_C(1) << cp1_prop;
    320 
    321 		/*
    322 		 * update GB9c state, which deals with indic conjunct breaks.
    323 		 * We want to detect the following prefix:
    324 		 *
    325 		 *   ICB_CONSONANT
    326 		 *   [ICB_EXTEND ICB_LINKER]*
    327 		 *   ICB_LINKER
    328 		 *   [ICB_EXTEND ICB_LINKER]*
    329 		 *
    330 		 * This representation is not ideal: In reality, what is
    331 		 * meant is that the prefix is a sequence of [ICB_EXTEND
    332 		 * ICB_LINKER]*, following an ICB_CONSONANT, that contains at
    333 		 * least one ICB_LINKER. We thus use the following equivalent
    334 		 * representation that allows us to store the levels 0..3 in 2
    335 		 * bits.
    336 		 *
    337 		 *   ICB_CONSONANT              -- Level 1
    338 		 *   ICB_EXTEND*                -- Level 2
    339 		 *   ICB_LINKER                 -- Level 3
    340 		 *   [ICB_EXTEND ICB_LINKER]*   -- Level 3
    341 		 *
    342 		 * The following chain of if-else-blocks is a bit redundant and
    343 		 * of course could be optimised, but this is kept as is for
    344 		 * best readability.
    345 		 */
    346 		if (state.gb9c_level == 0 &&
    347 		    cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) {
    348 			/* the sequence has begun */
    349 			state.gb9c_level = 1;
    350 		} else if ((state.gb9c_level == 1 || state.gb9c_level == 2) &&
    351 		           (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND ||
    352 		            cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND ||
    353 		            cp0_prop ==
    354 		                    CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND)) {
    355 			/*
    356 			 * either the level is 1 and thus the ICB consonant is
    357 			 * followed by an ICB extend, where we jump
    358 			 * to level 2, or we are at level 2 and just witness
    359 			 * more ICB extends, staying at level 2.
    360 			 */
    361 			state.gb9c_level = 2;
    362 		} else if ((state.gb9c_level == 1 || state.gb9c_level == 2) &&
    363 		           (cp0_prop == CHAR_BREAK_PROP_ICB_LINKER ||
    364 		            cp0_prop ==
    365 		                    CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) {
    366 			/*
    367 			 * witnessing an ICB linker directly lifts us up to
    368 			 * level 3
    369 			 */
    370 			state.gb9c_level = 3;
    371 		} else if (state.gb9c_level == 3 &&
    372 		           (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND ||
    373 		            cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND ||
    374 		            cp0_prop ==
    375 		                    CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND ||
    376 		            cp0_prop == CHAR_BREAK_PROP_ICB_LINKER ||
    377 		            cp0_prop ==
    378 		                    CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) {
    379 			/*
    380 			 * we stay at level 3 when we observe either ICB
    381 			 * extends or linkers
    382 			 */
    383 			state.gb9c_level = 3;
    384 		} else {
    385 			/*
    386 			 * the sequence has collapsed, but it could be
    387 			 * that the left property is ICB consonant, which
    388 			 * means that we jump right back to level 1 instead
    389 			 * of 0
    390 			 */
    391 			if (cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) {
    392 				state.gb9c_level = 1;
    393 			} else {
    394 				state.gb9c_level = 0;
    395 			}
    396 		}
    397 
    398 		/*
    399 		 * Apply grapheme cluster breaking algorithm (UAX #29), see
    400 		 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
    401 		 */
    402 		notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) ||
    403 		           (state.gb9c_level == 3 &&
    404 		            cp1_prop == CHAR_BREAK_PROP_ICB_CONSONANT) ||
    405 		           (dont_break_gb11[cp0_prop +
    406 		                            state.gb11_flag *
    407 		                                    NUM_CHAR_BREAK_PROPS] &
    408 		            (UINT32_C(1) << cp1_prop)) ||
    409 		           (dont_break_gb12_13[cp0_prop +
    410 		                               state.gb12_13_flag *
    411 		                                       NUM_CHAR_BREAK_PROPS] &
    412 		            (UINT32_C(1) << cp1_prop));
    413 
    414 		/* update or reset flags (when we have a break) */
    415 		if (likely(!notbreak)) {
    416 			state.gb11_flag = state.gb12_13_flag = false;
    417 		}
    418 
    419 		state_serialize(&state, s);
    420 	} else {
    421 		cp0_prop = get_break_prop(cp0);
    422 		cp1_prop = get_break_prop(cp1);
    423 
    424 		/*
    425 		 * Apply grapheme cluster breaking algorithm (UAX #29), see
    426 		 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
    427 		 *
    428 		 * Given we have no state, this behaves as if the state-booleans
    429 		 * were all set to false
    430 		 */
    431 		notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) ||
    432 		           (dont_break_gb11[cp0_prop] &
    433 		            (UINT32_C(1) << cp1_prop)) ||
    434 		           (dont_break_gb12_13[cp0_prop] &
    435 		            (UINT32_C(1) << cp1_prop));
    436 	}
    437 
    438 	return !notbreak;
    439 }
    440 
    441 static size_t
    442 next_character_break(HERODOTUS_READER *r)
    443 {
    444 	uint_least16_t state = 0;
    445 	uint_least32_t cp0 = 0, cp1 = 0;
    446 
    447 	for (herodotus_read_codepoint(r, true, &cp0);
    448 	     herodotus_read_codepoint(r, false, &cp1) ==
    449 	     HERODOTUS_STATUS_SUCCESS;
    450 	     herodotus_read_codepoint(r, true, &cp0)) {
    451 		if (grapheme_is_character_break(cp0, cp1, &state)) {
    452 			break;
    453 		}
    454 	}
    455 
    456 	return herodotus_reader_number_read(r);
    457 }
    458 
    459 size_t
    460 grapheme_next_character_break(const uint_least32_t *str, size_t len)
    461 {
    462 	HERODOTUS_READER r;
    463 
    464 	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
    465 
    466 	return next_character_break(&r);
    467 }
    468 
    469 size_t
    470 grapheme_next_character_break_utf8(const char *str, size_t len)
    471 {
    472 	HERODOTUS_READER r;
    473 
    474 	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
    475 
    476 	return next_character_break(&r);
    477 }