libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

sentence.c (10441B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <stdbool.h>
      3 #include <stddef.h>
      4 #include <stdlib.h>
      5 #include <string.h>
      6 
      7 #include "../gen/sentence.h"
      8 #include "../grapheme.h"
      9 #include "util.h"
     10 
     11 static inline enum sentence_break_property
     12 get_break_prop(uint_least32_t cp)
     13 {
     14 	if (likely(cp <= 0x10FFFF)) {
     15 		return (enum sentence_break_property)
     16 		       sentence_break_minor[sentence_break_major[cp >> 8] +
     17 		       (cp & 0xff)];
     18 	} else {
     19 		return SENTENCE_BREAK_PROP_OTHER;
     20 	}
     21 }
     22 
     23 static inline size_t
     24 get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp)
     25 {
     26 	if (offset < len) {
     27 		*cp = ((const uint_least32_t *)str)[offset];
     28 		return 1;
     29 	} else {
     30 		*cp = GRAPHEME_INVALID_CODEPOINT;
     31 		return 0;
     32 	}
     33 }
     34 
     35 static inline size_t
     36 get_codepoint_utf8(const void *str, size_t len, size_t offset, uint_least32_t *cp)
     37 {
     38 	if (offset < len) {
     39 		return grapheme_decode_utf8((const char *)str + offset,
     40 		                            len - offset, cp);
     41 	} else {
     42 		*cp = GRAPHEME_INVALID_CODEPOINT;
     43 		return 0;
     44 	}
     45 }
     46 
     47 static size_t
     48 next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
     49                     (const void *, size_t, size_t, uint_least32_t *))
     50 {
     51 	struct {
     52 		enum sentence_break_property a, b, c, d;
     53 	} raw, skip;
     54 	enum sentence_break_property res;
     55 	uint_least32_t cp;
     56 	uint_least8_t aterm_close_sp_level = 0,
     57 	              saterm_close_sp_parasep_level = 0;
     58 	size_t off, tmp, new_off;
     59 
     60 	/* check degenerate cases */
     61 	if (str == NULL || len == 0) {
     62 		return 0;
     63 	}
     64 
     65 	/*
     66 	 * Apply sentence breaking algorithm (UAX #29), see
     67 	 * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
     68 	 *
     69 	 * There are 4 slots (a, b, c, d) of "break" properties and
     70 	 * we check if there is a break in the middle between b and c.
     71 	 *
     72 	 * The position of this middle spot is determined by off,
     73 	 * which gives the offset of the first element on the right
     74 	 * hand side of said spot, or, in other words, gives the number
     75 	 * of elements on the left hand side.
     76 	 *
     77 	 * It is further complicated by the fact that the algorithm
     78 	 * expects you to skip certain characters for the second
     79 	 * half of the rules (after SB5). Thus, we do not only have
     80 	 * the "raw" properties as described above, but also the "skip"
     81 	 * properties, where the skip.a and skip.b, for instance,
     82 	 * give the two preceding character properties behind the
     83 	 * currently investigated breakpoint.
     84 	 *
     85 	 */
     86 
     87 	/*
     88 	 * Initialize the different properties such that we have
     89 	 * a good state after the state-update in the loop
     90 	 */
     91 	raw.b = NUM_SENTENCE_BREAK_PROPS;
     92 	if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
     93 		return 1;
     94 	}
     95 	raw.c = get_break_prop(cp);
     96 	(void)get_codepoint(str, len, off, &cp);
     97 	raw.d = get_break_prop(cp);
     98 	skip.a = skip.b = NUM_SENTENCE_BREAK_PROPS;
     99 
    100 	for (; off < len; off = new_off) {
    101 		/*
    102 		 * Update left side (a and b) of the skip state by
    103 		 * "shifting in" the raw.c property as long as it is
    104 		 * not one of the "ignored" character properties.
    105 		 * While at it, update the RI-counter.
    106 		 *
    107 		 */
    108 		if (raw.c != SENTENCE_BREAK_PROP_EXTEND &&
    109 		    raw.c != SENTENCE_BREAK_PROP_FORMAT) {
    110 		    	skip.a = skip.b;
    111 			skip.b = raw.c;
    112 
    113 			/*
    114 			 * Here comes a bit of magic. The rules
    115 			 * SB8, SB8a, SB9 and SB10 have very complicated
    116 			 * left-hand-side-rules of the form
    117 			 *
    118 			 *  ATerm Close* Sp*
    119 			 *  SATerm Close*
    120 			 *  SATerm Close* Sp*
    121 			 *  SATerm Close* Sp* ParaSep?
    122 			 * 
    123 			 * but instead of backtracking, we keep the
    124 			 * state as some kind of "power level" in
    125 			 * two variables
    126 			 *
    127 			 *  aterm_close_sp_level
    128 			 *  saterm_close_sp_parasep_level
    129 			 * 
    130 			 * that go from 0 to 3/4:
    131 			 *
    132 			 *  0: we are not in the sequence
    133 			 *  1: we have one ATerm/SATerm to the left of
    134 			 *     the middle spot
    135 			 *  2: we have one ATerm/SATerm and one or more
    136 			 *     Close to the left of the middle spot
    137 			 *  3: we have one ATerm/SATerm, zero or more
    138 			 *     Close and one or more Sp to the left of
    139 			 *     the middle spot.
    140 			 *  4: we have one SATerm, zero or more Close,
    141 			 *     zero or more Sp and one ParaSep to the
    142 			 *     left of the middle spot.
    143 			 *
    144 			 */
    145 			if (aterm_close_sp_level == 0 &&
    146 			    skip.b == SENTENCE_BREAK_PROP_ATERM) {
    147 			    	/* sequence has begun */
    148 				aterm_close_sp_level = 1;
    149 			} else if ((aterm_close_sp_level == 1 ||
    150 			            aterm_close_sp_level == 2) &&
    151 			           skip.b == SENTENCE_BREAK_PROP_CLOSE) {
    152 				/* close-sequence begins or continued */
    153 				aterm_close_sp_level = 2;
    154 			} else if ((aterm_close_sp_level == 1 ||
    155 			            aterm_close_sp_level == 2 ||
    156 				    aterm_close_sp_level == 3) &&
    157 			           skip.b == SENTENCE_BREAK_PROP_SP) {
    158 				/* sp-sequence begins or continued */
    159 				aterm_close_sp_level = 3;
    160 			} else {
    161 				/* sequence broke */
    162 				aterm_close_sp_level = 0;
    163 			}
    164 
    165 			if (saterm_close_sp_parasep_level == 0 &&
    166 			    (skip.b == SENTENCE_BREAK_PROP_STERM ||
    167 			     skip.b == SENTENCE_BREAK_PROP_ATERM)) {
    168 			    	/* sequence has begun */
    169 				saterm_close_sp_parasep_level = 1;
    170 			} else if ((saterm_close_sp_parasep_level == 1 ||
    171 			            saterm_close_sp_parasep_level == 2) &&
    172 			           skip.b == SENTENCE_BREAK_PROP_CLOSE) {
    173 				/* close-sequence begins or continued */
    174 				saterm_close_sp_parasep_level = 2;
    175 			} else if ((saterm_close_sp_parasep_level == 1 ||
    176 			            saterm_close_sp_parasep_level == 2 ||
    177 				    saterm_close_sp_parasep_level == 3) &&
    178 			           skip.b == SENTENCE_BREAK_PROP_SP) {
    179 				/* sp-sequence begins or continued */
    180 				saterm_close_sp_parasep_level = 3;
    181 			} else if ((saterm_close_sp_parasep_level == 1 ||
    182 			            saterm_close_sp_parasep_level == 2 ||
    183 			            saterm_close_sp_parasep_level == 3) &&
    184 			           (skip.b == SENTENCE_BREAK_PROP_SEP ||
    185 			            skip.b == SENTENCE_BREAK_PROP_CR  ||
    186 			            skip.b == SENTENCE_BREAK_PROP_LF)) {
    187 				/* ParaSep at the end of the sequence */
    188 				saterm_close_sp_parasep_level = 4;
    189 			} else {
    190 				/* sequence broke */
    191 				saterm_close_sp_parasep_level = 0;
    192 			}
    193 		}
    194 
    195 		/*
    196 		 * Update right side (b and c) of the skip state by
    197 		 * starting at the breakpoint and detecting the two
    198 		 * following non-ignored character classes
    199 		 *
    200 		 */
    201 		skip.c = NUM_SENTENCE_BREAK_PROPS;
    202 		for (tmp = off; tmp < len; ) {
    203 			tmp += get_codepoint(str, len, tmp, &cp);
    204 			res = get_break_prop(cp);
    205 
    206 			if (res != SENTENCE_BREAK_PROP_EXTEND &&
    207 			    res != SENTENCE_BREAK_PROP_FORMAT) {
    208 				skip.c = res;
    209 				break;
    210 			}
    211 		}
    212 		skip.d = NUM_SENTENCE_BREAK_PROPS;
    213 		for (; tmp < len; ) {
    214 			tmp += get_codepoint(str, len, tmp, &cp);
    215 			res = get_break_prop(cp);
    216 
    217 			if (res != SENTENCE_BREAK_PROP_EXTEND &&
    218 			    res != SENTENCE_BREAK_PROP_FORMAT) {
    219 				skip.d = res;
    220 				break;
    221 			}
    222 		}
    223 
    224 		/*
    225 		 * Update the raw state by simply shifting everything
    226 		 * in and, if we still have data left, determining
    227 		 * the character class of the next codepoint.
    228 		 *
    229 		 */
    230 		raw.a = raw.b;
    231 		raw.b = raw.c;
    232 		raw.c = raw.d;
    233 		if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) {
    234 			get_codepoint(str, len, new_off, &cp);
    235 			raw.d = get_break_prop(cp);
    236 		} else {
    237 			raw.d = NUM_SENTENCE_BREAK_PROPS;
    238 		}
    239 
    240 		/* SB3 */
    241 		if (raw.b == SENTENCE_BREAK_PROP_CR &&
    242 		    raw.c == SENTENCE_BREAK_PROP_LF) {
    243 			continue;
    244 		}
    245 
    246 		/* SB4 */
    247 		if (raw.b == SENTENCE_BREAK_PROP_SEP ||
    248 		    raw.b == SENTENCE_BREAK_PROP_CR  ||
    249 		    raw.b == SENTENCE_BREAK_PROP_LF) {
    250 			break;
    251 		}
    252 
    253 		/* SB5 */
    254 		if (raw.c == SENTENCE_BREAK_PROP_EXTEND ||
    255 		    raw.c == SENTENCE_BREAK_PROP_FORMAT) {
    256 			continue;
    257 		}
    258 
    259 		/* SB6 */
    260 		if (skip.b == SENTENCE_BREAK_PROP_ATERM &&
    261 		    skip.c == SENTENCE_BREAK_PROP_NUMERIC) {
    262 			continue;
    263 		}
    264 
    265 		/* SB7 */
    266 		if (off > 1 &&
    267 		    (skip.a == SENTENCE_BREAK_PROP_UPPER ||
    268 		     skip.a == SENTENCE_BREAK_PROP_LOWER) &&
    269 		    skip.b == SENTENCE_BREAK_PROP_ATERM &&
    270 		    skip.c == SENTENCE_BREAK_PROP_UPPER) {
    271 			continue;
    272 		}
    273 
    274 		/* SB8 */
    275 		if (aterm_close_sp_level == 1 ||
    276 		    aterm_close_sp_level == 2 ||
    277 		    aterm_close_sp_level == 3) {
    278 			/*
    279 			 * This is the most complicated rule, requiring
    280 			 * the right-hand-side to satisfy the regular expression
    281 			 *
    282 			 *  ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
    283 			 *
    284 			 * which we simply check "manually" given LUT-lookups
    285 			 * are very cheap.
    286 			 *
    287 			 */
    288 			for (tmp = off, res = NUM_SENTENCE_BREAK_PROPS; tmp < len; ) {
    289 				tmp += get_codepoint(str, len, tmp, &cp);
    290 				res = get_break_prop(cp);
    291 
    292 				if (res == SENTENCE_BREAK_PROP_OLETTER ||
    293 				    res == SENTENCE_BREAK_PROP_UPPER   ||
    294 				    res == SENTENCE_BREAK_PROP_LOWER   ||
    295 				    res == SENTENCE_BREAK_PROP_SEP     ||
    296 				    res == SENTENCE_BREAK_PROP_CR      ||
    297 				    res == SENTENCE_BREAK_PROP_LF      ||
    298 				    res == SENTENCE_BREAK_PROP_STERM   ||
    299 				    res == SENTENCE_BREAK_PROP_ATERM) {
    300 					break;
    301 				}
    302 			}
    303 
    304 			if (res == SENTENCE_BREAK_PROP_LOWER) {
    305 				continue;
    306 			}
    307 		}
    308 
    309 		/* SB8a */
    310 		if ((saterm_close_sp_parasep_level == 1 ||
    311 		     saterm_close_sp_parasep_level == 2 ||
    312 		     saterm_close_sp_parasep_level == 3) &&
    313 		    (skip.c == SENTENCE_BREAK_PROP_SCONTINUE ||
    314 		     skip.c == SENTENCE_BREAK_PROP_STERM     ||
    315                      skip.c == SENTENCE_BREAK_PROP_ATERM)) {
    316 			continue;
    317 		}
    318 
    319 		/* SB9 */
    320 		if ((saterm_close_sp_parasep_level == 1 ||
    321 		     saterm_close_sp_parasep_level == 2) &&
    322 		    (skip.c == SENTENCE_BREAK_PROP_CLOSE ||
    323 		     skip.c == SENTENCE_BREAK_PROP_SP    ||
    324 		     skip.c == SENTENCE_BREAK_PROP_SEP   ||
    325 		     skip.c == SENTENCE_BREAK_PROP_CR    ||
    326 		     skip.c == SENTENCE_BREAK_PROP_LF)) {
    327 			continue;
    328 		}
    329 
    330 		/* SB10 */
    331 		if ((saterm_close_sp_parasep_level == 1 ||
    332 		     saterm_close_sp_parasep_level == 2 ||
    333 		     saterm_close_sp_parasep_level == 3) &&
    334 		    (skip.c == SENTENCE_BREAK_PROP_SP  ||
    335 		     skip.c == SENTENCE_BREAK_PROP_SEP ||
    336 		     skip.c == SENTENCE_BREAK_PROP_CR  ||
    337 		     skip.c == SENTENCE_BREAK_PROP_LF)) {
    338 			continue;
    339 		}
    340 
    341 		/* SB11 */
    342 		if (saterm_close_sp_parasep_level == 1 ||
    343 		    saterm_close_sp_parasep_level == 2 ||
    344 		    saterm_close_sp_parasep_level == 3 ||
    345 		    saterm_close_sp_parasep_level == 4) {
    346 			break;
    347 		}
    348 
    349 		/* SB998 */
    350 		continue;
    351 	}
    352 
    353 	return off;
    354 }
    355 
    356 size_t
    357 grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
    358 {
    359 	return next_sentence_break(str, len, get_codepoint);
    360 }
    361 
    362 size_t
    363 grapheme_next_sentence_break_utf8(const char *str, size_t len)
    364 {
    365 	return next_sentence_break(str, len, get_codepoint_utf8);
    366 }