word.c (8052B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <stdbool.h> 3 #include <stddef.h> 4 5 #include "../gen/word.h" 6 #include "../grapheme.h" 7 #include "util.h" 8 9 struct word_break_state { 10 bool ri_even; 11 }; 12 13 static inline uint_least8_t 14 get_word_break_prop(uint_least32_t cp) 15 { 16 if (likely(cp <= UINT32_C(0x10FFFF))) { 17 return (uint_least8_t) 18 word_break_minor[word_break_major[cp >> 8] + 19 (cp & 0xff)]; 20 } else { 21 return WORD_BREAK_PROP_OTHER; 22 } 23 } 24 25 static bool 26 is_skippable_word_prop(uint_least8_t prop) 27 { 28 return prop == WORD_BREAK_PROP_EXTEND || 29 prop == WORD_BREAK_PROP_FORMAT || prop == WORD_BREAK_PROP_ZWJ; 30 } 31 32 static void 33 word_skip_shift_callback(uint_least8_t prop, void *s) 34 { 35 struct word_break_state *state = (struct word_break_state *)s; 36 37 if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) { 38 /* 39 * The property we just shifted in is 40 * a regional indicator, increasing the 41 * number of consecutive RIs on the left 42 * side of the breakpoint by one, changing 43 * the oddness. 44 * 45 */ 46 state->ri_even = !(state->ri_even); 47 } else { 48 /* 49 * We saw no regional indicator, so the 50 * number of consecutive RIs on the left 51 * side of the breakpoint is zero, which 52 * is an even number. 53 * 54 */ 55 state->ri_even = true; 56 } 57 } 58 59 static size_t 60 next_word_break(HERODOTUS_READER *r) 61 { 62 struct proper p; 63 struct word_break_state state = { .ri_even = true }; 64 65 /* 66 * Apply word breaking algorithm (UAX #29), see 67 * https://unicode.org/reports/tr29/#Word_Boundary_Rules 68 */ 69 proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop, 70 is_skippable_word_prop, word_skip_shift_callback, &p); 71 72 while (!proper_advance(&p)) { 73 /* WB3 */ 74 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR && 75 p.raw.next_prop[0] == WORD_BREAK_PROP_LF) { 76 continue; 77 } 78 79 /* WB3a */ 80 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE || 81 p.raw.prev_prop[0] == WORD_BREAK_PROP_CR || 82 p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) { 83 break; 84 } 85 86 /* WB3b */ 87 if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE || 88 p.raw.next_prop[0] == WORD_BREAK_PROP_CR || 89 p.raw.next_prop[0] == WORD_BREAK_PROP_LF) { 90 break; 91 } 92 93 /* WB3c */ 94 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ && 95 (p.raw.next_prop[0] == 96 WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC || 97 p.raw.next_prop[0] == 98 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) { 99 continue; 100 } 101 102 /* WB3d */ 103 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE && 104 p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) { 105 continue; 106 } 107 108 /* WB4 */ 109 if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND || 110 p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT || 111 p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) { 112 continue; 113 } 114 115 /* WB5 */ 116 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER || 117 p.skip.prev_prop[0] == 118 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 119 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) && 120 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER || 121 p.skip.next_prop[0] == 122 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 123 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) { 124 continue; 125 } 126 127 /* WB6 */ 128 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER || 129 p.skip.prev_prop[0] == 130 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 131 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) && 132 (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER || 133 p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET || 134 p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) && 135 (p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER || 136 p.skip.next_prop[1] == 137 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 138 p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) { 139 continue; 140 } 141 142 /* WB7 */ 143 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER || 144 p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET || 145 p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) && 146 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER || 147 p.skip.next_prop[0] == 148 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 149 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) && 150 (p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER || 151 p.skip.prev_prop[1] == 152 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 153 p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) { 154 continue; 155 } 156 157 /* WB7a */ 158 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER && 159 p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) { 160 continue; 161 } 162 163 /* WB7b */ 164 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER && 165 p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE && 166 p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) { 167 continue; 168 } 169 170 /* WB7c */ 171 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE && 172 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER && 173 p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) { 174 continue; 175 } 176 177 /* WB8 */ 178 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC && 179 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) { 180 continue; 181 } 182 183 /* WB9 */ 184 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER || 185 p.skip.prev_prop[0] == 186 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 187 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) && 188 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) { 189 continue; 190 } 191 192 /* WB10 */ 193 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC && 194 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER || 195 p.skip.next_prop[0] == 196 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 197 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) { 198 continue; 199 } 200 201 /* WB11 */ 202 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM || 203 p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET || 204 p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) && 205 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC && 206 p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) { 207 continue; 208 } 209 210 /* WB12 */ 211 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC && 212 (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM || 213 p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET || 214 p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) && 215 p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) { 216 continue; 217 } 218 219 /* WB13 */ 220 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA && 221 p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) { 222 continue; 223 } 224 225 /* WB13a */ 226 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER || 227 p.skip.prev_prop[0] == 228 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 229 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER || 230 p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC || 231 p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA || 232 p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) && 233 p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) { 234 continue; 235 } 236 237 /* WB13b */ 238 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET && 239 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER || 240 p.skip.next_prop[0] == 241 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || 242 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER || 243 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC || 244 p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) { 245 continue; 246 } 247 248 /* WB15 and WB16 */ 249 if (!state.ri_even && 250 p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDICATOR) { 251 continue; 252 } 253 254 /* WB999 */ 255 break; 256 } 257 258 return herodotus_reader_number_read(&(p.mid_reader)); 259 } 260 261 size_t 262 grapheme_next_word_break(const uint_least32_t *str, size_t len) 263 { 264 HERODOTUS_READER r; 265 266 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); 267 268 return next_word_break(&r); 269 } 270 271 size_t 272 grapheme_next_word_break_utf8(const char *str, size_t len) 273 { 274 HERODOTUS_READER r; 275 276 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); 277 278 return next_word_break(&r); 279 }