libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

word.c (9443B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <stdbool.h>
      3 #include <stddef.h>
      4 #include <stdlib.h>
      5 #include <string.h>
      6 
      7 #include "../gen/word.h"
      8 #include "../grapheme.h"
      9 #include "util.h"
     10 
     11 static inline enum word_break_property
     12 get_break_prop(uint_least32_t cp)
     13 {
     14 	if (likely(cp <= 0x10FFFF)) {
     15 		return (enum word_break_property)
     16 		       word_break_minor[word_break_major[cp >> 8] + (cp & 0xff)];
     17 	} else {
     18 		return WORD_BREAK_PROP_OTHER;
     19 	}
     20 }
     21 
     22 static size_t
     23 next_word_break(const void *str, size_t len, size_t (*get_codepoint)
     24                 (const void *, size_t, size_t, uint_least32_t *))
     25 {
     26 	struct {
     27 		enum word_break_property a, b, c, d;
     28 	} raw, skip;
     29 	enum word_break_property res;
     30 	uint_least32_t cp;
     31 	size_t off, tmp, new_off;
     32 	bool ri_even = true;
     33 
     34 	/* check degenerate cases */
     35 	if (str == NULL || len == 0) {
     36 		return 0;
     37 	}
     38 
     39 	/*
     40 	 * Apply word breaking algorithm (UAX #29), see
     41 	 * https://unicode.org/reports/tr29/#Word_Boundary_Rules
     42 	 *
     43 	 * There are 4 slots (a, b, c, d) of "break" properties and
     44 	 * we check if there is a break in the middle between b and c.
     45 	 *
     46 	 * The position of this middle spot is determined by off,
     47 	 * which gives the offset of the first element on the right
     48 	 * hand side of said spot, or, in other words, gives the number
     49 	 * of elements on the left hand side.
     50 	 *
     51 	 * It is further complicated by the fact that the algorithm
     52 	 * expects you to skip certain characters for the second
     53 	 * half of the rules (after WB4). Thus, we do not only have
     54 	 * the "raw" properties as described above, but also the "skip"
     55 	 * properties, where the skip.a and skip.b, for instance,
     56 	 * give the two preceding character properties behind the
     57 	 * currently investigated breakpoint.
     58 	 *
     59 	 */
     60 
     61 	/*
     62 	 * Initialize the different properties such that we have
     63 	 * a good state after the state-update in the loop
     64 	 */
     65 	raw.b = NUM_WORD_BREAK_PROPS;
     66 	if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
     67 		return 1;
     68 	}
     69 	raw.c = get_break_prop(cp);
     70 	(void)get_codepoint(str, len, off, &cp);
     71 	raw.d = get_break_prop(cp);
     72 	skip.a = skip.b = NUM_WORD_BREAK_PROPS;
     73 
     74 	for (; off < len; off = new_off) {
     75 		/*
     76 		 * Update left side (a and b) of the skip state by
     77 		 * "shifting in" the raw.c property as long as it is
     78 		 * not one of the "ignored" character properties.
     79 		 * While at it, update the RI-counter.
     80 		 *
     81 		 */
     82 		if (raw.c != WORD_BREAK_PROP_EXTEND &&
     83 		    raw.c != WORD_BREAK_PROP_FORMAT &&
     84 		    raw.c != WORD_BREAK_PROP_ZWJ) {
     85 		    	skip.a = skip.b;
     86 			skip.b = raw.c;
     87 
     88 			if (skip.b == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
     89 				/*
     90 				 * The property we just shifted in is
     91 				 * a regional indicator, increasing the
     92 				 * number of consecutive RIs on the left
     93 				 * side of the breakpoint by one, changing
     94 				 * the oddness.
     95 				 *
     96 				 */
     97 				ri_even = !ri_even;
     98 			} else {
     99 				/*
    100 				 * We saw no regional indicator, so the
    101 				 * number of consecutive RIs on the left
    102 				 * side of the breakpoint is zero, which
    103 				 * is an even number.
    104 				 *
    105 				 */
    106 				ri_even = true;
    107 			}
    108 		}
    109 
    110 		/*
    111 		 * Update right side (b and c) of the skip state by
    112 		 * starting at the breakpoint and detecting the two
    113 		 * following non-ignored character classes
    114 		 *
    115 		 */
    116 		skip.c = NUM_WORD_BREAK_PROPS;
    117 		for (tmp = off; tmp < len; ) {
    118 			tmp += get_codepoint(str, len, tmp, &cp);
    119 			res = get_break_prop(cp);
    120 
    121 			if (res != WORD_BREAK_PROP_EXTEND &&
    122 			    res != WORD_BREAK_PROP_FORMAT &&
    123 			    res != WORD_BREAK_PROP_ZWJ) {
    124 				skip.c = res;
    125 				break;
    126 			}
    127 		}
    128 		skip.d = NUM_WORD_BREAK_PROPS;
    129 		for (; tmp < len; ) {
    130 			tmp += get_codepoint(str, len, tmp, &cp);
    131 			res = get_break_prop(cp);
    132 
    133 			if (res != WORD_BREAK_PROP_EXTEND &&
    134 			    res != WORD_BREAK_PROP_FORMAT &&
    135 			    res != WORD_BREAK_PROP_ZWJ) {
    136 				skip.d = res;
    137 				break;
    138 			}
    139 		}
    140 
    141 		/*
    142 		 * Update the raw state by simply shifting everything
    143 		 * in and, if we still have data left, determining
    144 		 * the character class of the next codepoint.
    145 		 *
    146 		 */
    147 		raw.a = raw.b;
    148 		raw.b = raw.c;
    149 		raw.c = raw.d;
    150 		if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) {
    151 			get_codepoint(str, len, new_off, &cp);
    152 			raw.d = get_break_prop(cp);
    153 		} else {
    154 			raw.d = NUM_WORD_BREAK_PROPS;
    155 		}
    156 
    157 		/* WB3 */
    158 		if (raw.b == WORD_BREAK_PROP_CR &&
    159 		    raw.c == WORD_BREAK_PROP_LF) {
    160 			continue;
    161 		}
    162 
    163 		/* WB3a */
    164 		if (raw.b == WORD_BREAK_PROP_NEWLINE ||
    165 		    raw.b == WORD_BREAK_PROP_CR      ||
    166 		    raw.b == WORD_BREAK_PROP_LF) {
    167 			break;
    168 		}
    169 
    170 		/* WB3b */
    171 		if (raw.c == WORD_BREAK_PROP_NEWLINE ||
    172 		    raw.c == WORD_BREAK_PROP_CR      ||
    173 		    raw.c == WORD_BREAK_PROP_LF) {
    174 			break;
    175 		}
    176 
    177 		/* WB3c */
    178 		if (raw.b == WORD_BREAK_PROP_ZWJ &&
    179 		    (raw.c == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
    180 		     raw.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
    181 			continue;
    182 		}
    183 
    184 		/* WB3d */
    185 		if (raw.b == WORD_BREAK_PROP_WSEGSPACE &&
    186 		    raw.c == WORD_BREAK_PROP_WSEGSPACE) {
    187 			continue;
    188 		}
    189 
    190 		/* WB4 */
    191 		if (raw.c == WORD_BREAK_PROP_EXTEND ||
    192 		    raw.c == WORD_BREAK_PROP_FORMAT ||
    193 		    raw.c == WORD_BREAK_PROP_ZWJ) {
    194 			continue;
    195 		}
    196 
    197 		/* WB5 */
    198 		if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
    199 		     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    200 		     skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
    201 		    (skip.c == WORD_BREAK_PROP_ALETTER              ||
    202 		     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    203 		     skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
    204 			continue;
    205 		}
    206 
    207 		/* WB6 */
    208 		if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
    209 		     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    210 		     skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
    211 		    (skip.c == WORD_BREAK_PROP_MIDLETTER    ||
    212 		     skip.c == WORD_BREAK_PROP_MIDNUMLET    ||
    213 		     skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
    214 		    len > 2 &&
    215 		    (skip.d == WORD_BREAK_PROP_ALETTER              ||
    216 		     skip.d == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    217 		     skip.d == WORD_BREAK_PROP_HEBREW_LETTER)) {
    218 			continue;
    219 		}
    220 
    221 		/* WB7 */
    222 		if ((skip.b == WORD_BREAK_PROP_MIDLETTER    ||
    223 		     skip.b == WORD_BREAK_PROP_MIDNUMLET    ||
    224 		     skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
    225 		    (skip.c == WORD_BREAK_PROP_ALETTER              ||
    226 		     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    227 		     skip.c == WORD_BREAK_PROP_HEBREW_LETTER) &&
    228 		    len > 2 &&
    229 		    (skip.a == WORD_BREAK_PROP_ALETTER              ||
    230 		     skip.a == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    231 		     skip.a == WORD_BREAK_PROP_HEBREW_LETTER)) {
    232 			continue;
    233 		}
    234 
    235 		/* WB7a */
    236 		if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
    237 		    skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) {
    238 			continue;
    239 		}
    240 
    241 		/* WB7b */
    242 		if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
    243 		    skip.c == WORD_BREAK_PROP_DOUBLE_QUOTE &&
    244 		    len > 2 &&
    245 		    skip.d == WORD_BREAK_PROP_HEBREW_LETTER) {
    246 			continue;
    247 		}
    248 
    249 		/* WB7c */
    250 		if (skip.b == WORD_BREAK_PROP_DOUBLE_QUOTE &&
    251 		    skip.c == WORD_BREAK_PROP_HEBREW_LETTER &&
    252 		    off > 1 &&
    253 		    skip.a == WORD_BREAK_PROP_HEBREW_LETTER) {
    254 			continue;
    255 		}
    256 
    257 		/* WB8 */
    258 		if (skip.b == WORD_BREAK_PROP_NUMERIC &&
    259 		    skip.c == WORD_BREAK_PROP_NUMERIC) {
    260 			continue;
    261 		}
    262 
    263 		/* WB9 */
    264 		if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
    265 		     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    266 		     skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
    267 		    skip.c == WORD_BREAK_PROP_NUMERIC) {
    268 			continue;
    269 		}
    270 
    271 		/* WB10 */
    272 		if (skip.b == WORD_BREAK_PROP_NUMERIC &&
    273 		    (skip.c == WORD_BREAK_PROP_ALETTER              ||
    274 		     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    275 		     skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
    276 			continue;
    277 		}
    278 
    279 		/* WB11 */
    280 		if ((skip.b == WORD_BREAK_PROP_MIDNUM       ||
    281 		     skip.b == WORD_BREAK_PROP_MIDNUMLET    ||
    282 		     skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
    283 		    skip.c == WORD_BREAK_PROP_NUMERIC &&
    284 		    off > 1 &&
    285 		    skip.a == WORD_BREAK_PROP_NUMERIC) {
    286 			continue;
    287 		}
    288 
    289 		/* WB12 */
    290 		if (skip.b == WORD_BREAK_PROP_NUMERIC &&
    291 		    (skip.c == WORD_BREAK_PROP_MIDNUM       ||
    292 		     skip.c == WORD_BREAK_PROP_MIDNUMLET    ||
    293 		     skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
    294 		    len > 2 &&
    295 		    skip.d == WORD_BREAK_PROP_NUMERIC) {
    296 			continue;
    297 		}
    298 
    299 		/* WB13 */
    300 		if (skip.b == WORD_BREAK_PROP_KATAKANA &&
    301 		    skip.c == WORD_BREAK_PROP_KATAKANA) {
    302 			continue;
    303 		}
    304 
    305 		/* WB13a */
    306 		if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
    307 		     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    308 		     skip.b == WORD_BREAK_PROP_HEBREW_LETTER        ||
    309 		     skip.b == WORD_BREAK_PROP_NUMERIC              ||
    310 		     skip.b == WORD_BREAK_PROP_KATAKANA             ||
    311 		     skip.b == WORD_BREAK_PROP_EXTENDNUMLET) &&
    312 		    skip.c == WORD_BREAK_PROP_EXTENDNUMLET) {
    313 			continue;
    314 		}
    315 
    316 		/* WB13b */
    317 		if (skip.b == WORD_BREAK_PROP_EXTENDNUMLET &&
    318 		    (skip.c == WORD_BREAK_PROP_ALETTER              ||
    319 		     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    320 		     skip.c == WORD_BREAK_PROP_HEBREW_LETTER        ||
    321 		     skip.c == WORD_BREAK_PROP_NUMERIC              ||
    322 		     skip.c == WORD_BREAK_PROP_KATAKANA)) {
    323 			continue;
    324 		}
    325 
    326 		/* WB15 and WB16 */
    327 		if (!ri_even &&
    328 		    skip.c == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
    329 			continue;
    330 		}
    331 
    332 		/* WB999 */
    333 		break;
    334 	}
    335 
    336 	return off;
    337 }
    338 
    339 size_t
    340 grapheme_next_word_break(const uint_least32_t *str, size_t len)
    341 {
    342 	return next_word_break(str, len, get_codepoint);
    343 }
    344 
    345 size_t
    346 grapheme_next_word_break_utf8(const char *str, size_t len)
    347 {
    348 	return next_word_break(str, len, get_codepoint_utf8);
    349 }