word.c (9443B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <stdbool.h> 3 #include <stddef.h> 4 #include <stdlib.h> 5 #include <string.h> 6 7 #include "../gen/word.h" 8 #include "../grapheme.h" 9 #include "util.h" 10 11 static inline enum word_break_property 12 get_break_prop(uint_least32_t cp) 13 { 14 if (likely(cp <= 0x10FFFF)) { 15 return (enum word_break_property) 16 word_break_minor[word_break_major[cp >> 8] + (cp & 0xff)]; 17 } else { 18 return WORD_BREAK_PROP_OTHER; 19 } 20 } 21 22 static size_t 23 next_word_break(const void *str, size_t len, size_t (*get_codepoint) 24 (const void *, size_t, size_t, uint_least32_t *)) 25 { 26 struct { 27 enum word_break_property a, b, c, d; 28 } raw, skip; 29 enum word_break_property res; 30 uint_least32_t cp; 31 size_t off, tmp, new_off; 32 bool ri_even = true; 33 34 /* check degenerate cases */ 35 if (str == NULL || len == 0) { 36 return 0; 37 } 38 39 /* 40 * Apply word breaking algorithm (UAX #29), see 41 * https://unicode.org/reports/tr29/#Word_Boundary_Rules 42 * 43 * There are 4 slots (a, b, c, d) of "break" properties and 44 * we check if there is a break in the middle between b and c. 45 * 46 * The position of this middle spot is determined by off, 47 * which gives the offset of the first element on the right 48 * hand side of said spot, or, in other words, gives the number 49 * of elements on the left hand side. 50 * 51 * It is further complicated by the fact that the algorithm 52 * expects you to skip certain characters for the second 53 * half of the rules (after WB4). Thus, we do not only have 54 * the "raw" properties as described above, but also the "skip" 55 * properties, where the skip.a and skip.b, for instance, 56 * give the two preceding character properties behind the 57 * currently investigated breakpoint. 58 * 59 */ 60 61 /* 62 * Initialize the different properties such that we have 63 * a good state after the state-update in the loop 64 */ 65 raw.b = NUM_WORD_BREAK_PROPS; 66 if ((off = get_codepoint(str, len, 0, &cp)) >= len) { 67 return 1; 68 } 69 raw.c = get_break_prop(cp); 70 (void)get_codepoint(str, len, off, &cp); 71 raw.d = get_break_prop(cp); 72 skip.a = skip.b = NUM_WORD_BREAK_PROPS; 73 74 for (; off < len; off = new_off) { 75 /* 76 * Update left side (a and b) of the skip state by 77 * "shifting in" the raw.c property as long as it is 78 * not one of the "ignored" character properties. 79 * While at it, update the RI-counter. 80 * 81 */ 82 if (raw.c != WORD_BREAK_PROP_EXTEND && 83 raw.c != WORD_BREAK_PROP_FORMAT && 84 raw.c != WORD_BREAK_PROP_ZWJ) { 85 skip.a = skip.b; 86 skip.b = raw.c; 87 88 if (skip.b == WORD_BREAK_PROP_REGIONAL_INDICATOR) { 89 /* 90 * The property we just shifted in is 91 * a regional indicator, increasing the 92 * number of consecutive RIs on the left 93 * side of the breakpoint by one, changing 94 * the oddness. 95 * 96 */ 97 ri_even = !ri_even; 98 } else { 99 /* 100 * We saw no regional indicator, so the 101 * number of consecutive RIs on the left 102 * side of the breakpoint is zero, which 103 * is an even number. 104 * 105 */ 106 ri_even = true; 107 } 108 } 109 110 /* 111 * Update right side (b and c) of the skip state by 112 * starting at the breakpoint and detecting the two 113 * following non-ignored character classes 114 * 115 */ 116 skip.c = NUM_WORD_BREAK_PROPS; 117 for (tmp = off; tmp < len; ) { 118 tmp += get_codepoint(str, len, tmp, &cp); 119 res = get_break_prop(cp); 120 121 if (res != WORD_BREAK_PROP_EXTEND && 122 res != WORD_BREAK_PROP_FORMAT && 123 res != WORD_BREAK_PROP_ZWJ) { 124 skip.c = res; 125 break; 126 } 127 } 128 skip.d = NUM_WORD_BREAK_PROPS; 129 for (; tmp < len; ) { 130 tmp += get_codepoint(str, len, tmp, &cp); 131 res = get_break_prop(cp); 132 133 if (res != WORD_BREAK_PROP_EXTEND && 134 res != WORD_BREAK_PROP_FORMAT && 135 res != WORD_BREAK_PROP_ZWJ) { 136 skip.d = res; 137 break; 138 } 139 } 140 141 /* 142 * Update the raw state by simply shifting everything 143 * in and, if we still have data left, determining 144 * the character class of the next codepoint. 145 * 146 */ 147 raw.a = raw.b; 148 raw.b = raw.c; 149 raw.c = raw.d; 150 if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) { 151 get_codepoint(str, len, new_off, &cp); 152 raw.d = get_break_prop(cp); 153 } else { 154 raw.d = NUM_WORD_BREAK_PROPS; 155 } 156 157 /* WB3 */ 158 if (raw.b == WORD_BREAK_PROP_CR && 159 raw.c == WORD_BREAK_PROP_LF) { 160 continue; 161 } 162 163 /* WB3a */ 164 if (raw.b == WORD_BREAK_PROP_NEWLINE || 165 raw.b == WORD_BREAK_PROP_CR || 166 raw.b == WORD_BREAK_PROP_LF) { 167 break; 168 } 169 170 /* WB3b */ 171 if (raw.c == WORD_BREAK_PROP_NEWLINE || 172 raw.c == WORD_BREAK_PROP_CR || 173 raw.c == WORD_BREAK_PROP_LF) { 174 break; 175 } 176 177 /* WB3c */ 178 if (raw.b == WORD_BREAK_PROP_ZWJ && 179 (raw.c == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC || 180 raw.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) { 181 continue; 182 } 183 184 /* WB3d */ 185 if (raw.b == WORD_BREAK_PROP_WSEGSPACE && 186 raw.c == WORD_BREAK_PROP_WSEGSPACE) { 187 continue; 188 } 189 190 /* WB4 */ 191 if (raw.c == WORD_BREAK_PROP_EXTEND || 192 raw.c == WORD_BREAK_PROP_FORMAT || 193 raw.c == WORD_BREAK_PROP_ZWJ) { 194 continue; 195 } 196 197 /* WB5 */ 198 if ((skip.b == WORD_BREAK_PROP_ALETTER || 199 skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 200 skip.b == WORD_BREAK_PROP_HEBREW_LETTER) && 201 (skip.c == WORD_BREAK_PROP_ALETTER || 202 skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 203 skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) { 204 continue; 205 } 206 207 /* WB6 */ 208 if ((skip.b == WORD_BREAK_PROP_ALETTER || 209 skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 210 skip.b == WORD_BREAK_PROP_HEBREW_LETTER) && 211 (skip.c == WORD_BREAK_PROP_MIDLETTER || 212 skip.c == WORD_BREAK_PROP_MIDNUMLET || 213 skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) && 214 len > 2 && 215 (skip.d == WORD_BREAK_PROP_ALETTER || 216 skip.d == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 217 skip.d == WORD_BREAK_PROP_HEBREW_LETTER)) { 218 continue; 219 } 220 221 /* WB7 */ 222 if ((skip.b == WORD_BREAK_PROP_MIDLETTER || 223 skip.b == WORD_BREAK_PROP_MIDNUMLET || 224 skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) && 225 (skip.c == WORD_BREAK_PROP_ALETTER || 226 skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 227 skip.c == WORD_BREAK_PROP_HEBREW_LETTER) && 228 len > 2 && 229 (skip.a == WORD_BREAK_PROP_ALETTER || 230 skip.a == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 231 skip.a == WORD_BREAK_PROP_HEBREW_LETTER)) { 232 continue; 233 } 234 235 /* WB7a */ 236 if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER && 237 skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) { 238 continue; 239 } 240 241 /* WB7b */ 242 if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER && 243 skip.c == WORD_BREAK_PROP_DOUBLE_QUOTE && 244 len > 2 && 245 skip.d == WORD_BREAK_PROP_HEBREW_LETTER) { 246 continue; 247 } 248 249 /* WB7c */ 250 if (skip.b == WORD_BREAK_PROP_DOUBLE_QUOTE && 251 skip.c == WORD_BREAK_PROP_HEBREW_LETTER && 252 off > 1 && 253 skip.a == WORD_BREAK_PROP_HEBREW_LETTER) { 254 continue; 255 } 256 257 /* WB8 */ 258 if (skip.b == WORD_BREAK_PROP_NUMERIC && 259 skip.c == WORD_BREAK_PROP_NUMERIC) { 260 continue; 261 } 262 263 /* WB9 */ 264 if ((skip.b == WORD_BREAK_PROP_ALETTER || 265 skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 266 skip.b == WORD_BREAK_PROP_HEBREW_LETTER) && 267 skip.c == WORD_BREAK_PROP_NUMERIC) { 268 continue; 269 } 270 271 /* WB10 */ 272 if (skip.b == WORD_BREAK_PROP_NUMERIC && 273 (skip.c == WORD_BREAK_PROP_ALETTER || 274 skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 275 skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) { 276 continue; 277 } 278 279 /* WB11 */ 280 if ((skip.b == WORD_BREAK_PROP_MIDNUM || 281 skip.b == WORD_BREAK_PROP_MIDNUMLET || 282 skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) && 283 skip.c == WORD_BREAK_PROP_NUMERIC && 284 off > 1 && 285 skip.a == WORD_BREAK_PROP_NUMERIC) { 286 continue; 287 } 288 289 /* WB12 */ 290 if (skip.b == WORD_BREAK_PROP_NUMERIC && 291 (skip.c == WORD_BREAK_PROP_MIDNUM || 292 skip.c == WORD_BREAK_PROP_MIDNUMLET || 293 skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) && 294 len > 2 && 295 skip.d == WORD_BREAK_PROP_NUMERIC) { 296 continue; 297 } 298 299 /* WB13 */ 300 if (skip.b == WORD_BREAK_PROP_KATAKANA && 301 skip.c == WORD_BREAK_PROP_KATAKANA) { 302 continue; 303 } 304 305 /* WB13a */ 306 if ((skip.b == WORD_BREAK_PROP_ALETTER || 307 skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 308 skip.b == WORD_BREAK_PROP_HEBREW_LETTER || 309 skip.b == WORD_BREAK_PROP_NUMERIC || 310 skip.b == WORD_BREAK_PROP_KATAKANA || 311 skip.b == WORD_BREAK_PROP_EXTENDNUMLET) && 312 skip.c == WORD_BREAK_PROP_EXTENDNUMLET) { 313 continue; 314 } 315 316 /* WB13b */ 317 if (skip.b == WORD_BREAK_PROP_EXTENDNUMLET && 318 (skip.c == WORD_BREAK_PROP_ALETTER || 319 skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 320 skip.c == WORD_BREAK_PROP_HEBREW_LETTER || 321 skip.c == WORD_BREAK_PROP_NUMERIC || 322 skip.c == WORD_BREAK_PROP_KATAKANA)) { 323 continue; 324 } 325 326 /* WB15 and WB16 */ 327 if (!ri_even && 328 skip.c == WORD_BREAK_PROP_REGIONAL_INDICATOR) { 329 continue; 330 } 331 332 /* WB999 */ 333 break; 334 } 335 336 return off; 337 } 338 339 size_t 340 grapheme_next_word_break(const uint_least32_t *str, size_t len) 341 { 342 return next_word_break(str, len, get_codepoint); 343 } 344 345 size_t 346 grapheme_next_word_break_utf8(const char *str, size_t len) 347 { 348 return next_word_break(str, len, get_codepoint_utf8); 349 }