/* character.c (18706B) */
1 #include <stdio.h> 2 3 /* See LICENSE file for copyright and license details. */ 4 #include <limits.h> 5 #include <stdbool.h> 6 #include <stddef.h> 7 8 #include "../gen/character.h" 9 #include "../grapheme.h" 10 #include "util.h" 11 12 struct character_break_state { 13 uint_least8_t prop; 14 bool prop_set; 15 bool gb11_flag; 16 bool gb12_13_flag; 17 uint_least8_t gb9c_level; 18 }; 19 20 static const uint_least32_t dont_break[NUM_CHAR_BREAK_PROPS] = { 21 [CHAR_BREAK_PROP_OTHER] = 22 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 23 UINT32_C(1) 24 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 25 UINT32_C(1) 26 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 27 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 28 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 29 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 30 [CHAR_BREAK_PROP_ICB_CONSONANT] = 31 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 32 UINT32_C(1) 33 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 34 UINT32_C(1) 35 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 36 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 37 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 38 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 39 [CHAR_BREAK_PROP_ICB_EXTEND] = 40 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 41 UINT32_C(1) 42 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 43 UINT32_C(1) 44 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 45 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 46 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 47 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 48 [CHAR_BREAK_PROP_ICB_LINKER] = 49 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 50 UINT32_C(1) 51 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 52 UINT32_C(1) 53 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 54 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 55 UINT32_C(1) << 
CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 56 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 57 [CHAR_BREAK_PROP_CR] = UINT32_C(1) << CHAR_BREAK_PROP_LF, /* GB3 */ 58 [CHAR_BREAK_PROP_EXTEND] = 59 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 60 UINT32_C(1) 61 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 62 UINT32_C(1) 63 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 64 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 65 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 66 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 67 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND] = 68 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 69 UINT32_C(1) 70 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 71 UINT32_C(1) 72 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 73 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 74 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 75 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 76 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER] = 77 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 78 UINT32_C(1) 79 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 80 UINT32_C(1) 81 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 82 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 83 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 84 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 85 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = 86 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 87 UINT32_C(1) 88 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 89 UINT32_C(1) 90 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 91 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 92 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 93 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 94 [CHAR_BREAK_PROP_HANGUL_L] = 95 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */ 96 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */ 97 UINT32_C(1) << 
CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */ 98 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */ 99 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 100 UINT32_C(1) 101 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 102 UINT32_C(1) 103 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 104 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 105 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 106 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 107 [CHAR_BREAK_PROP_HANGUL_V] = 108 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ 109 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ 110 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 111 UINT32_C(1) 112 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 113 UINT32_C(1) 114 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 115 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 116 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 117 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 118 [CHAR_BREAK_PROP_HANGUL_T] = 119 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ 120 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 121 UINT32_C(1) 122 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 123 UINT32_C(1) 124 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 125 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 126 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 127 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 128 [CHAR_BREAK_PROP_HANGUL_LV] = 129 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ 130 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ 131 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 132 UINT32_C(1) 133 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 134 UINT32_C(1) 135 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 136 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 137 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 138 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 139 
[CHAR_BREAK_PROP_HANGUL_LVT] = 140 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ 141 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 142 UINT32_C(1) 143 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 144 UINT32_C(1) 145 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 146 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 147 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 148 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 149 [CHAR_BREAK_PROP_PREPEND] = 150 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 151 UINT32_C(1) 152 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 153 UINT32_C(1) 154 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 155 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 156 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 157 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */ 158 (UINT32_C(0xFFFFFFFF) & 159 ~(UINT32_C(1) << CHAR_BREAK_PROP_CR | 160 UINT32_C(1) << CHAR_BREAK_PROP_LF | 161 UINT32_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */ 162 [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = 163 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 164 UINT32_C(1) 165 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 166 UINT32_C(1) 167 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 168 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 169 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 170 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 171 [CHAR_BREAK_PROP_SPACINGMARK] = 172 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 173 UINT32_C(1) 174 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 175 UINT32_C(1) 176 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 177 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 178 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 179 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 180 [CHAR_BREAK_PROP_ZWJ] = 181 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 182 UINT32_C(1) 183 << 
CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 184 UINT32_C(1) 185 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 186 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 187 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 188 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 189 [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND] = 190 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 191 UINT32_C(1) 192 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 193 UINT32_C(1) 194 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ 195 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 196 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 197 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 198 199 }; 200 static const uint_least32_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = { 201 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = 202 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | 203 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ 204 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 205 UINT32_C(1) 206 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ 207 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER, /* GB9 */ 208 [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = 209 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, 210 [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] = 211 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, 212 [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] = 213 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | 214 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | 215 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | 216 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | 217 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND, 218 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] = 219 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | 220 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | 221 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | 222 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | 223 
UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND, 224 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER + NUM_CHAR_BREAK_PROPS] = 225 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | 226 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | 227 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | 228 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | 229 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND, 230 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] = 231 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | 232 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | 233 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | 234 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | 235 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER, 236 }; 237 static const uint_least32_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = { 238 [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = 239 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, 240 [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] = 241 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, 242 }; 243 static const uint_least32_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { 244 [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = 245 UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, 246 }; 247 static const uint_least32_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { 248 [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] = 249 UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, 250 }; 251 252 static inline enum char_break_property 253 get_break_prop(uint_least32_t cp) 254 { 255 if (likely(cp <= UINT32_C(0x10FFFF))) { 256 return (enum char_break_property) 257 char_break_minor[char_break_major[cp >> 8] + 258 (cp & 0xFF)]; 259 } else { 260 return CHAR_BREAK_PROP_OTHER; 261 } 262 } 263 264 static inline void 265 state_serialize(const struct character_break_state *in, uint_least16_t *out) 266 { 267 *out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 bits */ 268 (uint_least16_t)(((uint_least16_t)(in->prop_set)) 269 << 8) | /* 9th bit 
*/ 270 (uint_least16_t)(((uint_least16_t)(in->gb11_flag)) 271 << 9) | /* 10th bit */ 272 (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) 273 << 10) | /* 11th bit */ 274 (uint_least16_t)(((uint_least16_t)(in->gb9c_level & 0x3)) 275 << 11); /* 12th and 13th bit */ 276 } 277 278 static inline void 279 state_deserialize(uint_least16_t in, struct character_break_state *out) 280 { 281 out->prop = in & UINT8_C(0xFF); 282 out->prop_set = in & (UINT16_C(1) << 8); 283 out->gb11_flag = in & (UINT16_C(1) << 9); 284 out->gb12_13_flag = in & (UINT16_C(1) << 10); 285 out->gb9c_level = (uint_least8_t)(in >> 11) & UINT8_C(0x3); 286 } 287 288 bool 289 grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, 290 uint_least16_t *s) 291 { 292 struct character_break_state state; 293 enum char_break_property cp0_prop, cp1_prop; 294 bool notbreak = false; 295 296 if (likely(s)) { 297 state_deserialize(*s, &state); 298 299 if (likely(state.prop_set)) { 300 cp0_prop = state.prop; 301 } else { 302 cp0_prop = get_break_prop(cp0); 303 } 304 cp1_prop = get_break_prop(cp1); 305 306 /* preserve prop of right codepoint for next iteration */ 307 state.prop = (uint_least8_t)cp1_prop; 308 state.prop_set = true; 309 310 /* update flags */ 311 state.gb11_flag = 312 flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS * 313 state.gb11_flag] & 314 UINT32_C(1) << cp1_prop; 315 state.gb12_13_flag = 316 flag_update_gb12_13[cp0_prop + 317 NUM_CHAR_BREAK_PROPS * 318 state.gb12_13_flag] & 319 UINT32_C(1) << cp1_prop; 320 321 /* 322 * update GB9c state, which deals with indic conjunct breaks. 323 * We want to detect the following prefix: 324 * 325 * ICB_CONSONANT 326 * [ICB_EXTEND ICB_LINKER]* 327 * ICB_LINKER 328 * [ICB_EXTEND ICB_LINKER]* 329 * 330 * This representation is not ideal: In reality, what is 331 * meant is that the prefix is a sequence of [ICB_EXTEND 332 * ICB_LINKER]*, following an ICB_CONSONANT, that contains at 333 * least one ICB_LINKER. 
We thus use the following equivalent 334 * representation that allows us to store the levels 0..3 in 2 335 * bits. 336 * 337 * ICB_CONSONANT -- Level 1 338 * ICB_EXTEND* -- Level 2 339 * ICB_LINKER -- Level 3 340 * [ICB_EXTEND ICB_LINKER]* -- Level 3 341 * 342 * The following chain of if-else-blocks is a bit redundant and 343 * of course could be optimised, but this is kept as is for 344 * best readability. 345 */ 346 if (state.gb9c_level == 0 && 347 cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) { 348 /* the sequence has begun */ 349 state.gb9c_level = 1; 350 } else if ((state.gb9c_level == 1 || state.gb9c_level == 2) && 351 (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND || 352 cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND || 353 cp0_prop == 354 CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND)) { 355 /* 356 * either the level is 1 and thus the ICB consonant is 357 * followed by an ICB extend, where we jump 358 * to level 2, or we are at level 2 and just witness 359 * more ICB extends, staying at level 2. 
360 */ 361 state.gb9c_level = 2; 362 } else if ((state.gb9c_level == 1 || state.gb9c_level == 2) && 363 (cp0_prop == CHAR_BREAK_PROP_ICB_LINKER || 364 cp0_prop == 365 CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) { 366 /* 367 * witnessing an ICB linker directly lifts us up to 368 * level 3 369 */ 370 state.gb9c_level = 3; 371 } else if (state.gb9c_level == 3 && 372 (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND || 373 cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND || 374 cp0_prop == 375 CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND || 376 cp0_prop == CHAR_BREAK_PROP_ICB_LINKER || 377 cp0_prop == 378 CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) { 379 /* 380 * we stay at level 3 when we observe either ICB 381 * extends or linkers 382 */ 383 state.gb9c_level = 3; 384 } else { 385 /* 386 * the sequence has collapsed, but it could be 387 * that the left property is ICB consonant, which 388 * means that we jump right back to level 1 instead 389 * of 0 390 */ 391 if (cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) { 392 state.gb9c_level = 1; 393 } else { 394 state.gb9c_level = 0; 395 } 396 } 397 398 /* 399 * Apply grapheme cluster breaking algorithm (UAX #29), see 400 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules 401 */ 402 notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) || 403 (state.gb9c_level == 3 && 404 cp1_prop == CHAR_BREAK_PROP_ICB_CONSONANT) || 405 (dont_break_gb11[cp0_prop + 406 state.gb11_flag * 407 NUM_CHAR_BREAK_PROPS] & 408 (UINT32_C(1) << cp1_prop)) || 409 (dont_break_gb12_13[cp0_prop + 410 state.gb12_13_flag * 411 NUM_CHAR_BREAK_PROPS] & 412 (UINT32_C(1) << cp1_prop)); 413 414 /* update or reset flags (when we have a break) */ 415 if (likely(!notbreak)) { 416 state.gb11_flag = state.gb12_13_flag = false; 417 } 418 419 state_serialize(&state, s); 420 } else { 421 cp0_prop = get_break_prop(cp0); 422 cp1_prop = get_break_prop(cp1); 423 424 /* 425 * Apply grapheme cluster breaking algorithm (UAX #29), see 426 * 
http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules 427 * 428 * Given we have no state, this behaves as if the state-booleans 429 * were all set to false 430 */ 431 notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) || 432 (dont_break_gb11[cp0_prop] & 433 (UINT32_C(1) << cp1_prop)) || 434 (dont_break_gb12_13[cp0_prop] & 435 (UINT32_C(1) << cp1_prop)); 436 } 437 438 return !notbreak; 439 } 440 441 static size_t 442 next_character_break(HERODOTUS_READER *r) 443 { 444 uint_least16_t state = 0; 445 uint_least32_t cp0 = 0, cp1 = 0; 446 447 for (herodotus_read_codepoint(r, true, &cp0); 448 herodotus_read_codepoint(r, false, &cp1) == 449 HERODOTUS_STATUS_SUCCESS; 450 herodotus_read_codepoint(r, true, &cp0)) { 451 if (grapheme_is_character_break(cp0, cp1, &state)) { 452 break; 453 } 454 } 455 456 return herodotus_reader_number_read(r); 457 } 458 459 size_t 460 grapheme_next_character_break(const uint_least32_t *str, size_t len) 461 { 462 HERODOTUS_READER r; 463 464 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); 465 466 return next_character_break(&r); 467 } 468 469 size_t 470 grapheme_next_character_break_utf8(const char *str, size_t len) 471 { 472 HERODOTUS_READER r; 473 474 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); 475 476 return next_character_break(&r); 477 }