word.c - libgrapheme - unicode string library

word.c (8052B)
      1 /* See LICENSE file for copyright and license details. */
      2 #include <stdbool.h>
      3 #include <stddef.h>
      4 
      5 #include "../gen/word.h"
      6 #include "../grapheme.h"
      7 #include "util.h"
      8 
/*
 * State carried through one word-break search. It is handed to
 * proper_init() by next_word_break() and updated by
 * word_skip_shift_callback() for every property that is shifted in.
 */
struct word_break_state {
	/*
	 * true while the number of consecutive regional indicators
	 * on the left side of the breakpoint is even (zero counts
	 * as even)
	 */
	bool ri_even;
};
     12 
     13 static inline uint_least8_t
     14 get_word_break_prop(uint_least32_t cp)
     15 {
     16 	if (likely(cp <= UINT32_C(0x10FFFF))) {
     17 		return (uint_least8_t)
     18 			word_break_minor[word_break_major[cp >> 8] +
     19 		                         (cp & 0xff)];
     20 	} else {
     21 		return WORD_BREAK_PROP_OTHER;
     22 	}
     23 }
     24 
     25 static bool
     26 is_skippable_word_prop(uint_least8_t prop)
     27 {
     28 	return prop == WORD_BREAK_PROP_EXTEND ||
     29 	       prop == WORD_BREAK_PROP_FORMAT || prop == WORD_BREAK_PROP_ZWJ;
     30 }
     31 
     32 static void
     33 word_skip_shift_callback(uint_least8_t prop, void *s)
     34 {
     35 	struct word_break_state *state = (struct word_break_state *)s;
     36 
     37 	if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
     38 		/*
     39 		 * The property we just shifted in is
     40 		 * a regional indicator, increasing the
     41 		 * number of consecutive RIs on the left
     42 		 * side of the breakpoint by one, changing
     43 		 * the oddness.
     44 		 *
     45 		 */
     46 		state->ri_even = !(state->ri_even);
     47 	} else {
     48 		/*
     49 		 * We saw no regional indicator, so the
     50 		 * number of consecutive RIs on the left
     51 		 * side of the breakpoint is zero, which
     52 		 * is an even number.
     53 		 *
     54 		 */
     55 		state->ri_even = true;
     56 	}
     57 }
     58 
     59 static size_t
     60 next_word_break(HERODOTUS_READER *r)
     61 {
     62 	struct proper p;
     63 	struct word_break_state state = { .ri_even = true };
     64 
     65 	/*
     66 	 * Apply word breaking algorithm (UAX #29), see
     67 	 * https://unicode.org/reports/tr29/#Word_Boundary_Rules
     68 	 */
     69 	proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop,
     70 	            is_skippable_word_prop, word_skip_shift_callback, &p);
     71 
     72 	while (!proper_advance(&p)) {
     73 		/* WB3 */
     74 		if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR &&
     75 		    p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
     76 			continue;
     77 		}
     78 
     79 		/* WB3a */
     80 		if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE ||
     81 		    p.raw.prev_prop[0] == WORD_BREAK_PROP_CR ||
     82 		    p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) {
     83 			break;
     84 		}
     85 
     86 		/* WB3b */
     87 		if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE ||
     88 		    p.raw.next_prop[0] == WORD_BREAK_PROP_CR ||
     89 		    p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
     90 			break;
     91 		}
     92 
     93 		/* WB3c */
     94 		if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ &&
     95 		    (p.raw.next_prop[0] ==
     96 		             WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
     97 		     p.raw.next_prop[0] ==
     98 		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
     99 			continue;
    100 		}
    101 
    102 		/* WB3d */
    103 		if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE &&
    104 		    p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) {
    105 			continue;
    106 		}
    107 
    108 		/* WB4 */
    109 		if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND ||
    110 		    p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT ||
    111 		    p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) {
    112 			continue;
    113 		}
    114 
    115 		/* WB5 */
    116 		if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
    117 		     p.skip.prev_prop[0] ==
    118 		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    119 		     p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
    120 		    (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
    121 		     p.skip.next_prop[0] ==
    122 		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    123 		     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
    124 			continue;
    125 		}
    126 
    127 		/* WB6 */
    128 		if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
    129 		     p.skip.prev_prop[0] ==
    130 		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    131 		     p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
    132 		    (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
    133 		     p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
    134 		     p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
    135 		    (p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER ||
    136 		     p.skip.next_prop[1] ==
    137 		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    138 		     p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
    139 			continue;
    140 		}
    141 
    142 		/* WB7 */
    143 		if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
    144 		     p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
    145 		     p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
    146 		    (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
    147 		     p.skip.next_prop[0] ==
    148 		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    149 		     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
    150 		    (p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER ||
    151 		     p.skip.prev_prop[1] ==
    152 		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    153 		     p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
    154 			continue;
    155 		}
    156 
    157 		/* WB7a */
    158 		if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
    159 		    p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) {
    160 			continue;
    161 		}
    162 
    163 		/* WB7b */
    164 		if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
    165 		    p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
    166 		    p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
    167 			continue;
    168 		}
    169 
    170 		/* WB7c */
    171 		if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
    172 		    p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
    173 		    p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
    174 			continue;
    175 		}
    176 
    177 		/* WB8 */
    178 		if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
    179 		    p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
    180 			continue;
    181 		}
    182 
    183 		/* WB9 */
    184 		if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
    185 		     p.skip.prev_prop[0] ==
    186 		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    187 		     p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
    188 		    p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
    189 			continue;
    190 		}
    191 
    192 		/* WB10 */
    193 		if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
    194 		    (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
    195 		     p.skip.next_prop[0] ==
    196 		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    197 		     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
    198 			continue;
    199 		}
    200 
    201 		/* WB11 */
    202 		if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM ||
    203 		     p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
    204 		     p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
    205 		    p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC &&
    206 		    p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) {
    207 			continue;
    208 		}
    209 
    210 		/* WB12 */
    211 		if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
    212 		    (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM ||
    213 		     p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
    214 		     p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
    215 		    p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) {
    216 			continue;
    217 		}
    218 
    219 		/* WB13 */
    220 		if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA &&
    221 		    p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) {
    222 			continue;
    223 		}
    224 
    225 		/* WB13a */
    226 		if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
    227 		     p.skip.prev_prop[0] ==
    228 		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    229 		     p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
    230 		     p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC ||
    231 		     p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA ||
    232 		     p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) &&
    233 		    p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) {
    234 			continue;
    235 		}
    236 
    237 		/* WB13b */
    238 		if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET &&
    239 		    (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
    240 		     p.skip.next_prop[0] ==
    241 		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    242 		     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
    243 		     p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC ||
    244 		     p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) {
    245 			continue;
    246 		}
    247 
    248 		/* WB15 and WB16 */
    249 		if (!state.ri_even &&
    250 		    p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
    251 			continue;
    252 		}
    253 
    254 		/* WB999 */
    255 		break;
    256 	}
    257 
    258 	return herodotus_reader_number_read(&(p.mid_reader));
    259 }
    260 
    261 size_t
    262 grapheme_next_word_break(const uint_least32_t *str, size_t len)
    263 {
    264 	HERODOTUS_READER r;
    265 
    266 	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
    267 
    268 	return next_word_break(&r);
    269 }
    270 
    271 size_t
    272 grapheme_next_word_break_utf8(const char *str, size_t len)
    273 {
    274 	HERODOTUS_READER r;
    275 
    276 	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
    277 
    278 	return next_word_break(&r);
    279 }
	libgrapheme unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log \| Files \| Refs \| README \| LICENSE