sentence.c (8420B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <stdbool.h> 3 #include <stddef.h> 4 5 #include "../gen/sentence.h" 6 #include "../grapheme.h" 7 #include "util.h" 8 9 struct sentence_break_state { 10 uint_least8_t aterm_close_sp_level; 11 uint_least8_t saterm_close_sp_parasep_level; 12 }; 13 14 static inline uint_least8_t 15 get_sentence_break_prop(uint_least32_t cp) 16 { 17 if (likely(cp <= UINT32_C(0x10FFFF))) { 18 return (uint_least8_t) 19 sentence_break_minor[sentence_break_major[cp >> 8] + 20 (cp & 0xff)]; 21 } else { 22 return SENTENCE_BREAK_PROP_OTHER; 23 } 24 } 25 26 static bool 27 is_skippable_sentence_prop(uint_least8_t prop) 28 { 29 return prop == SENTENCE_BREAK_PROP_EXTEND || 30 prop == SENTENCE_BREAK_PROP_FORMAT; 31 } 32 33 static void 34 sentence_skip_shift_callback(uint_least8_t prop, void *s) 35 { 36 struct sentence_break_state *state = (struct sentence_break_state *)s; 37 38 /* 39 * Here comes a bit of magic. The rules 40 * SB8, SB8a, SB9 and SB10 have very complicated 41 * left-hand-side-rules of the form 42 * 43 * ATerm Close* Sp* 44 * SATerm Close* 45 * SATerm Close* Sp* 46 * SATerm Close* Sp* ParaSep? 47 * 48 * but instead of backtracking, we keep the 49 * state as some kind of "power level" in 50 * two state-variables 51 * 52 * aterm_close_sp_level 53 * saterm_close_sp_parasep_level 54 * 55 * that go from 0 to 3/4: 56 * 57 * 0: we are not in the sequence 58 * 1: we have one ATerm/SATerm to the left of 59 * the middle spot 60 * 2: we have one ATerm/SATerm and one or more 61 * Close to the left of the middle spot 62 * 3: we have one ATerm/SATerm, zero or more 63 * Close and one or more Sp to the left of 64 * the middle spot. 65 * 4: we have one SATerm, zero or more Close, 66 * zero or more Sp and one ParaSep to the 67 * left of the middle spot. 68 * 69 */ 70 if ((state->aterm_close_sp_level == 0 || 71 state->aterm_close_sp_level == 1) && 72 prop == SENTENCE_BREAK_PROP_ATERM) { 73 /* sequence has begun */ 74 state->aterm_close_sp_level = 1; 75 } else if ((state->aterm_close_sp_level == 1 || 76 state->aterm_close_sp_level == 2) && 77 prop == SENTENCE_BREAK_PROP_CLOSE) { 78 /* close-sequence begins or continued */ 79 state->aterm_close_sp_level = 2; 80 } else if ((state->aterm_close_sp_level == 1 || 81 state->aterm_close_sp_level == 2 || 82 state->aterm_close_sp_level == 3) && 83 prop == SENTENCE_BREAK_PROP_SP) { 84 /* sp-sequence begins or continued */ 85 state->aterm_close_sp_level = 3; 86 } else { 87 /* sequence broke */ 88 state->aterm_close_sp_level = 0; 89 } 90 91 if ((state->saterm_close_sp_parasep_level == 0 || 92 state->saterm_close_sp_parasep_level == 1) && 93 (prop == SENTENCE_BREAK_PROP_STERM || 94 prop == SENTENCE_BREAK_PROP_ATERM)) { 95 /* sequence has begun */ 96 state->saterm_close_sp_parasep_level = 1; 97 } else if ((state->saterm_close_sp_parasep_level == 1 || 98 state->saterm_close_sp_parasep_level == 2) && 99 prop == SENTENCE_BREAK_PROP_CLOSE) { 100 /* close-sequence begins or continued */ 101 state->saterm_close_sp_parasep_level = 2; 102 } else if ((state->saterm_close_sp_parasep_level == 1 || 103 state->saterm_close_sp_parasep_level == 2 || 104 state->saterm_close_sp_parasep_level == 3) && 105 prop == SENTENCE_BREAK_PROP_SP) { 106 /* sp-sequence begins or continued */ 107 state->saterm_close_sp_parasep_level = 3; 108 } else if ((state->saterm_close_sp_parasep_level == 1 || 109 state->saterm_close_sp_parasep_level == 2 || 110 state->saterm_close_sp_parasep_level == 3) && 111 (prop == SENTENCE_BREAK_PROP_SEP || 112 prop == SENTENCE_BREAK_PROP_CR || 113 prop == SENTENCE_BREAK_PROP_LF)) { 114 /* ParaSep at the end of the sequence */ 115 state->saterm_close_sp_parasep_level = 4; 116 } else { 117 /* sequence broke */ 118 state->saterm_close_sp_parasep_level = 0; 119 } 120 } 121 122 static size_t 123 next_sentence_break(HERODOTUS_READER *r) 124 { 125 HERODOTUS_READER tmp; 126 enum sentence_break_property prop; 127 struct proper p; 128 struct sentence_break_state state = { 0 }; 129 uint_least32_t cp; 130 131 /* 132 * Apply sentence breaking algorithm (UAX #29), see 133 * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules 134 */ 135 proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS, 136 get_sentence_break_prop, is_skippable_sentence_prop, 137 sentence_skip_shift_callback, &p); 138 139 while (!proper_advance(&p)) { 140 /* SB3 */ 141 if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR && 142 p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) { 143 continue; 144 } 145 146 /* SB4 */ 147 if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP || 148 p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR || 149 p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) { 150 break; 151 } 152 153 /* SB5 */ 154 if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND || 155 p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) { 156 continue; 157 } 158 159 /* SB6 */ 160 if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM && 161 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) { 162 continue; 163 } 164 165 /* SB7 */ 166 if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER || 167 p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) && 168 p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM && 169 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) { 170 continue; 171 } 172 173 /* SB8 */ 174 if (state.aterm_close_sp_level == 1 || 175 state.aterm_close_sp_level == 2 || 176 state.aterm_close_sp_level == 3) { 177 /* 178 * This is the most complicated rule, requiring 179 * the right-hand-side to satisfy the regular expression 180 * 181 * ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* 182 * Lower 183 * 184 * which we simply check "manually" given LUT-lookups 185 * are very cheap by starting at the mid_reader. 186 * 187 */ 188 herodotus_reader_copy(&(p.mid_reader), &tmp); 189 190 prop = NUM_SENTENCE_BREAK_PROPS; 191 while (herodotus_read_codepoint(&tmp, true, &cp) == 192 HERODOTUS_STATUS_SUCCESS) { 193 prop = get_sentence_break_prop(cp); 194 195 /* 196 * the skippable properties are ignored 197 * automatically here given they do not 198 * match the following condition 199 */ 200 if (prop == SENTENCE_BREAK_PROP_OLETTER || 201 prop == SENTENCE_BREAK_PROP_UPPER || 202 prop == SENTENCE_BREAK_PROP_LOWER || 203 prop == SENTENCE_BREAK_PROP_SEP || 204 prop == SENTENCE_BREAK_PROP_CR || 205 prop == SENTENCE_BREAK_PROP_LF || 206 prop == SENTENCE_BREAK_PROP_STERM || 207 prop == SENTENCE_BREAK_PROP_ATERM) { 208 break; 209 } 210 } 211 212 if (prop == SENTENCE_BREAK_PROP_LOWER) { 213 continue; 214 } 215 } 216 217 /* SB8a */ 218 if ((state.saterm_close_sp_parasep_level == 1 || 219 state.saterm_close_sp_parasep_level == 2 || 220 state.saterm_close_sp_parasep_level == 3) && 221 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE || 222 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM || 223 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) { 224 continue; 225 } 226 227 /* SB9 */ 228 if ((state.saterm_close_sp_parasep_level == 1 || 229 state.saterm_close_sp_parasep_level == 2) && 230 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE || 231 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP || 232 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP || 233 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR || 234 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) { 235 continue; 236 } 237 238 /* SB10 */ 239 if ((state.saterm_close_sp_parasep_level == 1 || 240 state.saterm_close_sp_parasep_level == 2 || 241 state.saterm_close_sp_parasep_level == 3) && 242 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP || 243 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP || 244 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR || 245 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) { 246 continue; 247 } 248 249 /* SB11 */ 250 if (state.saterm_close_sp_parasep_level == 1 || 251 state.saterm_close_sp_parasep_level == 2 || 252 state.saterm_close_sp_parasep_level == 3 || 253 state.saterm_close_sp_parasep_level == 4) { 254 break; 255 } 256 257 /* SB998 */ 258 continue; 259 } 260 261 return herodotus_reader_number_read(&(p.mid_reader)); 262 } 263 264 size_t 265 grapheme_next_sentence_break(const uint_least32_t *str, size_t len) 266 { 267 HERODOTUS_READER r; 268 269 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); 270 271 return next_sentence_break(&r); 272 } 273 274 size_t 275 grapheme_next_sentence_break_utf8(const char *str, size_t len) 276 { 277 HERODOTUS_READER r; 278 279 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); 280 281 return next_sentence_break(&r); 282 }