libgrapheme

grapheme cluster utility library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | LICENSE

boundary_body.c (6474B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <stddef.h>
      3 #include <stdint.h>
      4 #include <stdlib.h>
      5 
      6 #define LEN(x) (sizeof(x) / sizeof(*x))
      7 
      8 enum {
      9 	GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */
     10 	GRAPHEME_STATE_EMOJI  = 1 << 1, /* within emoji modifier or zwj sequence */
     11 };
     12 
     13 enum cp_property {
     14 	PROP_CR,          /* carriage return */
     15 	PROP_LF,          /* line feed */
     16 	PROP_CONTROL,     /* control character */
     17 	PROP_EXTEND,      /* grapheme extender (TODO Emoji_Modifier=Yes) */
     18 	PROP_ZWJ,         /* zero width joiner */
     19 	PROP_RI,          /* regional indicator */
     20 	PROP_PREPEND,     /* prepend character */
     21 	PROP_SPACINGMARK, /* spacing mark */
     22 	PROP_L,           /* hangul syllable type L */
     23 	PROP_V,           /* hangul syllable type V */
     24 	PROP_T,           /* hangul syllable type T */
     25 	PROP_LV,          /* hangul syllable type LV */
     26 	PROP_LVT,         /* hangul syllable type LVT */
     27 	PROP_EXTPICT,     /* extended pictographic */
     28 };
     29 
     30 struct {
     31 	const uint32_t (*table)[2];
     32 	size_t tablelen;
     33 } cp_property_tables[] = {
     34 	[PROP_CR] = {
     35 		.table    = cr_table,
     36 		.tablelen = LEN(cr_table),
     37 	},
     38 	[PROP_LF] = {
     39 		.table    = lf_table,
     40 		.tablelen = LEN(lf_table),
     41 	},
     42 	[PROP_CONTROL] = {
     43 		.table    = control_table,
     44 		.tablelen = LEN(control_table),
     45 	},
     46 	[PROP_EXTEND] = {
     47 		.table    = extend_table,
     48 		.tablelen = LEN(extend_table),
     49 	},
     50 	[PROP_ZWJ] = {
     51 		.table    = zwj_table,
     52 		.tablelen = LEN(zwj_table),
     53 	},
     54 	[PROP_RI] = {
     55 		.table    = ri_table,
     56 		.tablelen = LEN(ri_table),
     57 	},
     58 	[PROP_PREPEND] = {
     59 		.table    = prepend_table,
     60 		.tablelen = LEN(prepend_table),
     61 	},
     62 	[PROP_SPACINGMARK] = {
     63 		.table    = spacingmark_table,
     64 		.tablelen = LEN(spacingmark_table),
     65 	},
     66 	[PROP_L] = {
     67 		.table    = l_table,
     68 		.tablelen = LEN(l_table),
     69 	},
     70 	[PROP_V] = {
     71 		.table    = v_table,
     72 		.tablelen = LEN(v_table),
     73 	},
     74 	[PROP_T] = {
     75 		.table    = t_table,
     76 		.tablelen = LEN(t_table),
     77 	},
     78 	[PROP_LV] = {
     79 		.table    = lv_table,
     80 		.tablelen = LEN(lv_table),
     81 	},
     82 	[PROP_LVT] = {
     83 		.table    = lvt_table,
     84 		.tablelen = LEN(lvt_table),
     85 	},
     86 	[PROP_EXTPICT] = {
     87 		.table    = extpict_table,
     88 		.tablelen = LEN(extpict_table),
     89 	},
     90 };
     91 
     92 struct cp_properties {
     93 	uint32_t cp;
     94 	int_least16_t determined;
     95 	int_least16_t state;
     96 };
     97 
     98 static int
     99 cp_cmp(const void *a, const void *b)
    100 {
    101 	uint32_t cp = *(uint32_t *)a;
    102 	uint32_t *range = (uint32_t *)b;
    103 
    104 	return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]);
    105 }
    106 
    107 static int
    108 has_property(struct cp_properties *props, enum cp_property p)
    109 {
    110 	if (!(props->determined & (1 << p))) {
    111 		/* not determined yet, do a lookup and set the state */
    112 		if (bsearch(&props->cp, cp_property_tables[p].table,
    113 		            cp_property_tables[p].tablelen,
    114 		            sizeof(*cp_property_tables[p].table),
    115 			    cp_cmp)) {
    116 			props->state |= (1 << p);
    117 		} else {
    118 			props->state &= ~(1 << p);
    119 		}
    120 
    121 		/* now it's determined */
    122 		props->determined |= (1 << p);
    123 	}
    124 
    125 	return (props->state & (1 << p));
    126 }
    127 
    128 int
    129 grapheme_boundary(uint32_t a, uint32_t b, int *state)
    130 {
    131 	struct cp_properties props[] = {
    132 		{
    133 			.cp = a,
    134 		},
    135 		{
    136 			.cp = b,
    137 		},
    138 	};
    139 	int s;
    140 
    141 	/* skip printable ASCII */
    142 	if ((a >= 0x20 && a <= 0x7E) &&
    143 	    (b >= 0x20 && b <= 0x7E)) {
    144 		return 1;
    145 	}
    146 
    147 	/* set internal state based on given state-pointer */
    148 	s = (state != NULL) ? *state : 0;
    149 
    150 	/*
    151 	 * Apply grapheme cluster breaking algorithm (UAX #29), see
    152 	 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
    153 	 */
    154 
    155 	/*
    156 	 * update state
    157 	 */
    158 	if (has_property(&props[1], PROP_RI)) {
    159 		if (has_property(&props[0], PROP_RI)) {
    160 			/* one more RI is on the left side of the seam */
    161 			s ^= GRAPHEME_STATE_RI_ODD;
    162 		} else {
    163 			/* an RI appeared on the right side but the left
    164 			   side is not an RI, reset state (0 is even) */
    165 			s &= ~GRAPHEME_STATE_RI_ODD;
    166 		}
    167 	}
    168 	if (!(*state & GRAPHEME_STATE_EMOJI) &&
    169 	    ((has_property(&props[0], PROP_EXTPICT) &&
    170 	      has_property(&props[1], PROP_ZWJ)) ||
    171              (has_property(&props[0], PROP_EXTPICT) &&
    172 	      has_property(&props[1], PROP_EXTEND)))) {
    173 		s |= GRAPHEME_STATE_EMOJI;
    174 	} else if ((*state & GRAPHEME_STATE_EMOJI) &&
    175 	           ((has_property(&props[0], PROP_ZWJ) &&
    176 		     has_property(&props[1], PROP_EXTPICT)) ||
    177 	            (has_property(&props[0], PROP_EXTEND) &&
    178 		     has_property(&props[1], PROP_EXTEND)) ||
    179 	            (has_property(&props[0], PROP_EXTEND) &&
    180 		     has_property(&props[1], PROP_ZWJ)) ||
    181 	            (has_property(&props[0], PROP_EXTPICT) &&
    182 		     has_property(&props[1], PROP_ZWJ)) ||
    183 	            (has_property(&props[0], PROP_EXTPICT) &&
    184 		     has_property(&props[1], PROP_EXTEND)))) {
    185 		/* GRAPHEME_STATE_EMOJI remains */
    186 	} else {
    187 		s &= ~GRAPHEME_STATE_EMOJI;
    188 	}
    189 
    190 	/* write updated state to state-pointer, if given */
    191 	if (state != NULL) {
    192 		*state = s;
    193 	}
    194 
    195 	/*
    196 	 * apply rules
    197 	 */
    198 
    199 	/* skip GB1 and GB2, as they are never satisfied here */
    200 
    201 	/* GB3 */
    202 	if (has_property(&props[0], PROP_CR) &&
    203 	    has_property(&props[1], PROP_LF)) {
    204 		return 0;
    205 	}
    206 
    207 	/* GB4 */
    208 	if (has_property(&props[0], PROP_CONTROL) ||
    209 	    has_property(&props[0], PROP_CR) ||
    210 	    has_property(&props[0], PROP_LF)) {
    211 		return 1;
    212 	}
    213 
    214 	/* GB5 */
    215 	if (has_property(&props[1], PROP_CONTROL) ||
    216 	    has_property(&props[1], PROP_CR) ||
    217 	    has_property(&props[1], PROP_LF)) {
    218 		return 1;
    219 	}
    220 
    221 	/* GB6 */
    222 	if (has_property(&props[0], PROP_L) &&
    223 	    (has_property(&props[1], PROP_L) ||
    224 	     has_property(&props[1], PROP_V) ||
    225 	     has_property(&props[1], PROP_LV) ||
    226 	     has_property(&props[1], PROP_LVT))) {
    227 		return 0;
    228 	}
    229 
    230 	/* GB7 */
    231 	if ((has_property(&props[0], PROP_LV) ||
    232 	     has_property(&props[0], PROP_V)) &&
    233 	    (has_property(&props[1], PROP_V) ||
    234 	     has_property(&props[1], PROP_T))) {
    235 		return 0;
    236 	}
    237 
    238 	/* GB8 */
    239 	if ((has_property(&props[0], PROP_LVT) ||
    240 	     has_property(&props[0], PROP_T)) &&
    241 	    has_property(&props[1], PROP_T)) {
    242 		return 0;
    243 	}
    244 
    245 	/* GB9 */
    246 	if (has_property(&props[1], PROP_EXTEND) ||
    247 	    has_property(&props[1], PROP_ZWJ)) {
    248 		return 0;
    249 	}
    250 
    251 	/* GB9a */
    252 	if (has_property(&props[1], PROP_SPACINGMARK)) {
    253 		return 0;
    254 	}
    255 
    256 	/* GB9b */
    257 	if (has_property(&props[0], PROP_PREPEND)) {
    258 		return 0;
    259 	}
    260 
    261 	/* GB11 */
    262 	if ((s & GRAPHEME_STATE_EMOJI) &&
    263 	    has_property(&props[0], PROP_ZWJ) &&
    264 	    has_property(&props[1], PROP_EXTPICT)) {
    265 		return 0;
    266 	}
    267 
    268 	/* GB12/GB13 */
    269 	if (has_property(&props[0], PROP_RI) &&
    270 	    has_property(&props[1], PROP_RI) &&
    271 	    (s & GRAPHEME_STATE_RI_ODD)) {
    272 		return 0;
    273 	}
    274 
    275 	/* GB999 */
    276 	return 1;
    277 }