libgrapheme

grapheme cluster utility library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | LICENSE

grapheme.c (6818B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <stdbool.h>
      3 #include <stddef.h>
      4 #include <stdlib.h>
      5 #include <string.h>
      6 
      7 #include "../gen/grapheme.h"
      8 #include "../grapheme.h"
      9 #include "util.h"
     10 
     11 enum {
     12 	GRAPHEME_FLAG_RI_ODD = 1 << 0, /* odd number of RI's before the seam */
     13 	GRAPHEME_FLAG_EMOJI  = 1 << 1, /* within emoji modifier or zwj sequence */
     14 };
     15 
     16 bool
     17 lg_grapheme_isbreak(uint_least32_t a, uint_least32_t b, LG_SEGMENTATION_STATE *state)
     18 {
     19 	struct lg_internal_heisenstate *p[2] = { 0 };
     20 	uint_least16_t flags = 0;
     21 	bool isbreak = true;
     22 
     23 	/* set state depending on state pointer */
     24 	if (state != NULL) {
     25 		p[0] = &(state->a);
     26 		p[1] = &(state->b);
     27 		flags = state->flags;
     28 	}
     29 
     30 	/* skip printable ASCII */
     31 	if ((a >= 0x20 && a <= 0x7E) &&
     32 	    (b >= 0x20 && b <= 0x7E)) {
     33 		goto hasbreak;
     34 	}
     35 
     36 	/*
     37 	 * Apply grapheme cluster breaking algorithm (UAX #29), see
     38 	 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
     39 	 */
     40 
     41 	/*
     42 	 * update flags, if state-pointer given
     43 	 */
     44 	if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) {
     45 		if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) {
     46 			/* one more RI is on the left side of the seam, flip state */
     47 			flags ^= GRAPHEME_FLAG_RI_ODD;
     48 		} else {
     49 			/* an RI appeared on the right side but the left
     50 			   side is not an RI, reset state (number 0 is even) */
     51 			flags &= ~GRAPHEME_FLAG_RI_ODD;
     52 		}
     53 	}
     54 	if (!(flags & GRAPHEME_FLAG_EMOJI) &&
     55 	    ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
     56 	      has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
     57              (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
     58 	      has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) {
     59 		flags |= GRAPHEME_FLAG_EMOJI;
     60 	} else if ((flags & GRAPHEME_FLAG_EMOJI) &&
     61 	           ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_ZWJ) &&
     62 		     has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) ||
     63 	            (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTEND) &&
     64 		     has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)) ||
     65 	            (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTEND) &&
     66 		     has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
     67 	            (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
     68 		     has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
     69 	            (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
     70 		     has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) {
     71 		/* GRAPHEME_FLAG_EMOJI remains */
     72 	} else {
     73 		flags &= ~GRAPHEME_FLAG_EMOJI;
     74 	}
     75 
     76 	/* write updated flags to state, if given */
     77 	if (state != NULL) {
     78 		state->flags = flags;
     79 	}
     80 
     81 	/*
     82 	 * apply rules
     83 	 */
     84 
     85 	/* skip GB1 and GB2, as they are never satisfied here */
     86 
     87 	/* GB3 */
     88 	if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CR) &&
     89 	    has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_LF)) {
     90 		goto nobreak;
     91 	}
     92 
     93 	/* GB4 */
     94 	if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CONTROL) ||
     95 	    has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CR) ||
     96 	    has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_LF)) {
     97 		goto hasbreak;
     98 	}
     99 
    100 	/* GB5 */
    101 	if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_CONTROL) ||
    102 	    has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_CR) ||
    103 	    has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_LF)) {
    104 		goto hasbreak;
    105 	}
    106 
    107 	/* GB6 */
    108 	if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_L) &&
    109 	    (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_L) ||
    110 	     has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) ||
    111 	     has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) ||
    112 
    113 	     has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT))) {
    114 		goto nobreak;
    115 	}
    116 
    117 	/* GB7 */
    118 	if ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) ||
    119 	     has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_V)) &&
    120 	    (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) ||
    121 	     has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T))) {
    122 		goto nobreak;
    123 	}
    124 
    125 	/* GB8 */
    126 	if ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT) ||
    127 	     has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) &&
    128 	    has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) {
    129 		goto nobreak;
    130 	}
    131 
    132 	/* GB9 */
    133 	if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND) ||
    134 	    has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) {
    135 		goto nobreak;
    136 	}
    137 
    138 	/* GB9a */
    139 	if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_SPACINGMARK)) {
    140 		goto nobreak;
    141 	}
    142 
    143 	/* GB9b */
    144 	if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_PREPEND)) {
    145 		goto nobreak;
    146 	}
    147 
    148 	/* GB11 */
    149 	if ((flags & GRAPHEME_FLAG_EMOJI) &&
    150 	    has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_ZWJ) &&
    151 	    has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) {
    152 		goto nobreak;
    153 	}
    154 
    155 	/* GB12/GB13 */
    156 	if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) &&
    157 	    has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) &&
    158 	    (flags & GRAPHEME_FLAG_RI_ODD)) {
    159 		goto nobreak;
    160 	}
    161 
    162 	/* GB999 */
    163 	goto hasbreak;
    164 nobreak:
    165 	isbreak = false;
    166 hasbreak:
    167 	if (state != NULL) {
    168 		/* move b-state to a-state, discard b-state */
    169 		memcpy(&(state->a), &(state->b), sizeof(state->a));
    170 		memset(&(state->b), 0, sizeof(state->b));
    171 
    172 		/* reset flags */
    173 		if (isbreak) {
    174 			state->flags = 0;
    175 		}
    176 	}
    177 
    178 	return isbreak;
    179 }
    180 
    181 size_t
    182 lg_grapheme_nextbreak(const char *str)
    183 {
    184 	uint_least32_t cp0, cp1;
    185 	size_t ret, len = 0;
    186 	LG_SEGMENTATION_STATE state = { 0 };
    187 
    188 	if (str == NULL) {
    189 		return 0;
    190 	}
    191 
    192 	/*
    193 	 * lg_utf8_decode, when it encounters an unexpected byte,
    194 	 * does not count it to the error and instead assumes that the
    195 	 * unexpected byte is the beginning of a new sequence.
    196 	 * This way, when the string ends with a null byte, we never
    197 	 * miss it, even if the previous UTF-8 sequence terminates
    198 	 * unexpectedly, as it would either act as an unexpected byte,
    199 	 * saved for later, or as a null byte itself, that we can catch.
    200 	 * We pass (size_t)-1 to the length, as we will never read beyond
    201 	 * the null byte for the reasons given above.
    202 	 */
    203 
    204 	/* get first code point */
    205 	len += lg_utf8_decode(str, (size_t)-1, &cp0);
    206 	if (cp0 == LG_INVALID_CODE_POINT) {
    207 		return len;
    208 	}
    209 
    210 	while (cp0 != 0) {
    211 		/* get next code point */
    212 		ret = lg_utf8_decode(str + len, (size_t)-1, &cp1);
    213 
    214 		if (cp1 == LG_INVALID_CODE_POINT ||
    215 		    lg_grapheme_isbreak(cp0, cp1, &state)) {
    216 			/* we read an invalid cp or have a breakpoint */
    217 			break;
    218 		} else {
    219 			/* we don't have a breakpoint, continue */
    220 			len += ret;
    221 		}
    222 
    223 		/* prepare next round */
    224 		cp0 = cp1;
    225 	}
    226 
    227 	return len;
    228 }