sentence.c - libgrapheme - unicode string library

sentence.c (8420B)
      1 /* See LICENSE file for copyright and license details. */
      2 #include <stdbool.h>
      3 #include <stddef.h>
      4 
      5 #include "../gen/sentence.h"
      6 #include "../grapheme.h"
      7 #include "util.h"
      8 
/*
 * Bookkeeping that is updated by sentence_skip_shift_callback() and
 * read by next_sentence_break() to evaluate the left-hand sides of
 * the rules SB8, SB8a, SB9, SB10 and SB11 without backtracking.
 * The meaning of the individual level values is described in
 * sentence_skip_shift_callback().
 */
struct sentence_break_state {
	/* progress within "ATerm Close* Sp*" (0 to 3) */
	uint_least8_t aterm_close_sp_level;
	/* progress within "SATerm Close* Sp* ParaSep?" (0 to 4) */
	uint_least8_t saterm_close_sp_parasep_level;
};
     13 
     14 static inline uint_least8_t
     15 get_sentence_break_prop(uint_least32_t cp)
     16 {
     17 	if (likely(cp <= UINT32_C(0x10FFFF))) {
     18 		return (uint_least8_t)
     19 			sentence_break_minor[sentence_break_major[cp >> 8] +
     20 		                             (cp & 0xff)];
     21 	} else {
     22 		return SENTENCE_BREAK_PROP_OTHER;
     23 	}
     24 }
     25 
     26 static bool
     27 is_skippable_sentence_prop(uint_least8_t prop)
     28 {
     29 	return prop == SENTENCE_BREAK_PROP_EXTEND ||
     30 	       prop == SENTENCE_BREAK_PROP_FORMAT;
     31 }
     32 
     33 static void
     34 sentence_skip_shift_callback(uint_least8_t prop, void *s)
     35 {
     36 	struct sentence_break_state *state = (struct sentence_break_state *)s;
     37 
     38 	/*
     39 	 * Here comes a bit of magic. The rules
     40 	 * SB8, SB8a, SB9 and SB10 have very complicated
     41 	 * left-hand-side-rules of the form
     42 	 *
     43 	 *  ATerm Close* Sp*
     44 	 *  SATerm Close*
     45 	 *  SATerm Close* Sp*
     46 	 *  SATerm Close* Sp* ParaSep?
     47 	 *
     48 	 * but instead of backtracking, we keep the
     49 	 * state as some kind of "power level" in
     50 	 * two state-variables
     51 	 *
     52 	 *  aterm_close_sp_level
     53 	 *  saterm_close_sp_parasep_level
     54 	 *
     55 	 * that go from 0 to 3/4:
     56 	 *
     57 	 *  0: we are not in the sequence
     58 	 *  1: we have one ATerm/SATerm to the left of
     59 	 *     the middle spot
     60 	 *  2: we have one ATerm/SATerm and one or more
     61 	 *     Close to the left of the middle spot
     62 	 *  3: we have one ATerm/SATerm, zero or more
     63 	 *     Close and one or more Sp to the left of
     64 	 *     the middle spot.
     65 	 *  4: we have one SATerm, zero or more Close,
     66 	 *     zero or more Sp and one ParaSep to the
     67 	 *     left of the middle spot.
     68 	 *
     69 	 */
     70 	if ((state->aterm_close_sp_level == 0 ||
     71 	     state->aterm_close_sp_level == 1) &&
     72 	    prop == SENTENCE_BREAK_PROP_ATERM) {
     73 		/* sequence has begun */
     74 		state->aterm_close_sp_level = 1;
     75 	} else if ((state->aterm_close_sp_level == 1 ||
     76 	            state->aterm_close_sp_level == 2) &&
     77 	           prop == SENTENCE_BREAK_PROP_CLOSE) {
     78 		/* close-sequence begins or continued */
     79 		state->aterm_close_sp_level = 2;
     80 	} else if ((state->aterm_close_sp_level == 1 ||
     81 	            state->aterm_close_sp_level == 2 ||
     82 	            state->aterm_close_sp_level == 3) &&
     83 	           prop == SENTENCE_BREAK_PROP_SP) {
     84 		/* sp-sequence begins or continued */
     85 		state->aterm_close_sp_level = 3;
     86 	} else {
     87 		/* sequence broke */
     88 		state->aterm_close_sp_level = 0;
     89 	}
     90 
     91 	if ((state->saterm_close_sp_parasep_level == 0 ||
     92 	     state->saterm_close_sp_parasep_level == 1) &&
     93 	    (prop == SENTENCE_BREAK_PROP_STERM ||
     94 	     prop == SENTENCE_BREAK_PROP_ATERM)) {
     95 		/* sequence has begun */
     96 		state->saterm_close_sp_parasep_level = 1;
     97 	} else if ((state->saterm_close_sp_parasep_level == 1 ||
     98 	            state->saterm_close_sp_parasep_level == 2) &&
     99 	           prop == SENTENCE_BREAK_PROP_CLOSE) {
    100 		/* close-sequence begins or continued */
    101 		state->saterm_close_sp_parasep_level = 2;
    102 	} else if ((state->saterm_close_sp_parasep_level == 1 ||
    103 	            state->saterm_close_sp_parasep_level == 2 ||
    104 	            state->saterm_close_sp_parasep_level == 3) &&
    105 	           prop == SENTENCE_BREAK_PROP_SP) {
    106 		/* sp-sequence begins or continued */
    107 		state->saterm_close_sp_parasep_level = 3;
    108 	} else if ((state->saterm_close_sp_parasep_level == 1 ||
    109 	            state->saterm_close_sp_parasep_level == 2 ||
    110 	            state->saterm_close_sp_parasep_level == 3) &&
    111 	           (prop == SENTENCE_BREAK_PROP_SEP ||
    112 	            prop == SENTENCE_BREAK_PROP_CR ||
    113 	            prop == SENTENCE_BREAK_PROP_LF)) {
    114 		/* ParaSep at the end of the sequence */
    115 		state->saterm_close_sp_parasep_level = 4;
    116 	} else {
    117 		/* sequence broke */
    118 		state->saterm_close_sp_parasep_level = 0;
    119 	}
    120 }
    121 
    122 static size_t
    123 next_sentence_break(HERODOTUS_READER *r)
    124 {
    125 	HERODOTUS_READER tmp;
    126 	enum sentence_break_property prop;
    127 	struct proper p;
    128 	struct sentence_break_state state = { 0 };
    129 	uint_least32_t cp;
    130 
    131 	/*
    132 	 * Apply sentence breaking algorithm (UAX #29), see
    133 	 * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
    134 	 */
    135 	proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
    136 	            get_sentence_break_prop, is_skippable_sentence_prop,
    137 	            sentence_skip_shift_callback, &p);
    138 
    139 	while (!proper_advance(&p)) {
    140 		/* SB3 */
    141 		if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
    142 		    p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
    143 			continue;
    144 		}
    145 
    146 		/* SB4 */
    147 		if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
    148 		    p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR ||
    149 		    p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
    150 			break;
    151 		}
    152 
    153 		/* SB5 */
    154 		if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
    155 		    p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
    156 			continue;
    157 		}
    158 
    159 		/* SB6 */
    160 		if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
    161 		    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
    162 			continue;
    163 		}
    164 
    165 		/* SB7 */
    166 		if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
    167 		     p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
    168 		    p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
    169 		    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
    170 			continue;
    171 		}
    172 
    173 		/* SB8 */
    174 		if (state.aterm_close_sp_level == 1 ||
    175 		    state.aterm_close_sp_level == 2 ||
    176 		    state.aterm_close_sp_level == 3) {
    177 			/*
    178 			 * This is the most complicated rule, requiring
    179 			 * the right-hand-side to satisfy the regular expression
    180 			 *
    181 			 *  ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )*
    182 			 * Lower
    183 			 *
    184 			 * which we simply check "manually" given LUT-lookups
    185 			 * are very cheap by starting at the mid_reader.
    186 			 *
    187 			 */
    188 			herodotus_reader_copy(&(p.mid_reader), &tmp);
    189 
    190 			prop = NUM_SENTENCE_BREAK_PROPS;
    191 			while (herodotus_read_codepoint(&tmp, true, &cp) ==
    192 			       HERODOTUS_STATUS_SUCCESS) {
    193 				prop = get_sentence_break_prop(cp);
    194 
    195 				/*
    196 				 * the skippable properties are ignored
    197 				 * automatically here given they do not
    198 				 * match the following condition
    199 				 */
    200 				if (prop == SENTENCE_BREAK_PROP_OLETTER ||
    201 				    prop == SENTENCE_BREAK_PROP_UPPER ||
    202 				    prop == SENTENCE_BREAK_PROP_LOWER ||
    203 				    prop == SENTENCE_BREAK_PROP_SEP ||
    204 				    prop == SENTENCE_BREAK_PROP_CR ||
    205 				    prop == SENTENCE_BREAK_PROP_LF ||
    206 				    prop == SENTENCE_BREAK_PROP_STERM ||
    207 				    prop == SENTENCE_BREAK_PROP_ATERM) {
    208 					break;
    209 				}
    210 			}
    211 
    212 			if (prop == SENTENCE_BREAK_PROP_LOWER) {
    213 				continue;
    214 			}
    215 		}
    216 
    217 		/* SB8a */
    218 		if ((state.saterm_close_sp_parasep_level == 1 ||
    219 		     state.saterm_close_sp_parasep_level == 2 ||
    220 		     state.saterm_close_sp_parasep_level == 3) &&
    221 		    (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE ||
    222 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM ||
    223 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
    224 			continue;
    225 		}
    226 
    227 		/* SB9 */
    228 		if ((state.saterm_close_sp_parasep_level == 1 ||
    229 		     state.saterm_close_sp_parasep_level == 2) &&
    230 		    (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
    231 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
    232 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
    233 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
    234 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
    235 			continue;
    236 		}
    237 
    238 		/* SB10 */
    239 		if ((state.saterm_close_sp_parasep_level == 1 ||
    240 		     state.saterm_close_sp_parasep_level == 2 ||
    241 		     state.saterm_close_sp_parasep_level == 3) &&
    242 		    (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
    243 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
    244 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
    245 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
    246 			continue;
    247 		}
    248 
    249 		/* SB11 */
    250 		if (state.saterm_close_sp_parasep_level == 1 ||
    251 		    state.saterm_close_sp_parasep_level == 2 ||
    252 		    state.saterm_close_sp_parasep_level == 3 ||
    253 		    state.saterm_close_sp_parasep_level == 4) {
    254 			break;
    255 		}
    256 
    257 		/* SB998 */
    258 		continue;
    259 	}
    260 
    261 	return herodotus_reader_number_read(&(p.mid_reader));
    262 }
    263 
    264 size_t
    265 grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
    266 {
    267 	HERODOTUS_READER r;
    268 
    269 	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
    270 
    271 	return next_sentence_break(&r);
    272 }
    273 
    274 size_t
    275 grapheme_next_sentence_break_utf8(const char *str, size_t len)
    276 {
    277 	HERODOTUS_READER r;
    278 
    279 	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
    280 
    281 	return next_sentence_break(&r);
    282 }
	libgrapheme unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log \| Files \| Refs \| README \| LICENSE