libgrapheme

grapheme cluster utility library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | LICENSE

boundary.c (5212B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <stddef.h>
      3 #include <stdint.h>
      4 #include <stdlib.h>
      5 
      6 #include "../data/emoji.h"
      7 #include "../data/grapheme_boundary.h"
      8 
      9 enum {
     10 	GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */
     11 	GRAPHEME_STATE_EMOJI  = 1 << 1, /* within emoji modifier or zwj sequence */
     12 };
     13 
     14 static int
     15 cp_cmp(const void *a, const void *b)
     16 {
     17 	uint32_t cp = *(uint32_t *)a;
     18 	uint32_t *range = (uint32_t *)b;
     19 
     20 	return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]);
     21 }
     22 
     23 static int
     24 has_property(uint32_t cp, struct heisenstate *cpstate,
     25              const struct range_list *proptable, int property)
     26 {
     27 	if (heisenstate_get(cpstate, property) == -1) {
     28 		/* state undetermined, make a lookup and set it */
     29 		heisenstate_set(cpstate, property, bsearch(&cp,
     30 		                proptable[property].data,
     31 		                proptable[property].len,
     32 				sizeof(*proptable[property].data),
     33 		                cp_cmp) ? 1 : 0);
     34 	}
     35 
     36 	return heisenstate_get(cpstate, property);
     37 }
     38 
     39 int
     40 grapheme_boundary(uint32_t a, uint32_t b, int *state)
     41 {
     42 	struct heisenstate gb[2] = { 0 }, emoji[2] = { 0 };
     43 	int s;
     44 
     45 	/* skip printable ASCII */
     46 	if ((a >= 0x20 && a <= 0x7E) &&
     47 	    (b >= 0x20 && b <= 0x7E)) {
     48 		return 1;
     49 	}
     50 
     51 	/* set internal state based on given state-pointer */
     52 	s = (state != NULL) ? *state : 0;
     53 
     54 	/*
     55 	 * Apply grapheme cluster breaking algorithm (UAX #29), see
     56 	 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
     57 	 */
     58 
     59 	/*
     60 	 * update state
     61 	 */
     62 	if (has_property(b, &gb[1], gb_prop, GB_PROP_REGIONAL_INDICATOR)) {
     63 		if (has_property(a, &gb[0], gb_prop, GB_PROP_REGIONAL_INDICATOR)) {
     64 			/* one more RI is on the left side of the seam */
     65 			s ^= GRAPHEME_STATE_RI_ODD;
     66 		} else {
     67 			/* an RI appeared on the right side but the left
     68 			   side is not an RI, reset state (0 is even) */
     69 			s &= ~GRAPHEME_STATE_RI_ODD;
     70 		}
     71 	}
     72 	if (!(*state & GRAPHEME_STATE_EMOJI) &&
     73 	    ((has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
     74 	      has_property(b, &gb[1],    gb_prop,    GB_PROP_ZWJ)) ||
     75              (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
     76 	      has_property(b, &gb[1],    gb_prop,    GB_PROP_EXTEND)))) {
     77 		s |= GRAPHEME_STATE_EMOJI;
     78 	} else if ((*state & GRAPHEME_STATE_EMOJI) &&
     79 	           ((has_property(a, &gb[0],    gb_prop,    GB_PROP_ZWJ) &&
     80 		     has_property(b, &emoji[1], emoji_prop, EMOJI_PROP_EXTPICT)) ||
     81 	            (has_property(a, &gb[0],    gb_prop,    GB_PROP_EXTEND) &&
     82 		     has_property(b, &gb[1],    gb_prop,    GB_PROP_EXTEND)) ||
     83 	            (has_property(a, &gb[0],    gb_prop,    GB_PROP_EXTEND) &&
     84 		     has_property(b, &gb[1],    gb_prop,    GB_PROP_ZWJ)) ||
     85 	            (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
     86 		     has_property(b, &gb[1],    gb_prop,    GB_PROP_ZWJ)) ||
     87 	            (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
     88 		     has_property(b, &gb[1],    gb_prop,    GB_PROP_EXTEND)))) {
     89 		/* GRAPHEME_STATE_EMOJI remains */
     90 	} else {
     91 		s &= ~GRAPHEME_STATE_EMOJI;
     92 	}
     93 
     94 	/* write updated state to state-pointer, if given */
     95 	if (state != NULL) {
     96 		*state = s;
     97 	}
     98 
     99 	/*
    100 	 * apply rules
    101 	 */
    102 
    103 	/* skip GB1 and GB2, as they are never satisfied here */
    104 
    105 	/* GB3 */
    106 	if (has_property(a, &gb[0], gb_prop, GB_PROP_CR) &&
    107 	    has_property(b, &gb[1], gb_prop, GB_PROP_LF)) {
    108 		return 0;
    109 	}
    110 
    111 	/* GB4 */
    112 	if (has_property(a, &gb[0], gb_prop, GB_PROP_CONTROL) ||
    113 	    has_property(a, &gb[0], gb_prop, GB_PROP_CR) ||
    114 	    has_property(a, &gb[0], gb_prop, GB_PROP_LF)) {
    115 		return 1;
    116 	}
    117 
    118 	/* GB5 */
    119 	if (has_property(b, &gb[1], gb_prop, GB_PROP_CONTROL) ||
    120 	    has_property(b, &gb[1], gb_prop, GB_PROP_CR) ||
    121 	    has_property(b, &gb[1], gb_prop, GB_PROP_LF)) {
    122 		return 1;
    123 	}
    124 
    125 	/* GB6 */
    126 	if (has_property(a, &gb[0], gb_prop, GB_PROP_L) &&
    127 	    (has_property(b, &gb[1], gb_prop, GB_PROP_L) ||
    128 	     has_property(b, &gb[1], gb_prop, GB_PROP_V) ||
    129 	     has_property(b, &gb[1], gb_prop, GB_PROP_LV) ||
    130 	     has_property(b, &gb[1], gb_prop, GB_PROP_LVT))) {
    131 		return 0;
    132 	}
    133 
    134 	/* GB7 */
    135 	if ((has_property(a, &gb[0], gb_prop, GB_PROP_LV) ||
    136 	     has_property(a, &gb[0], gb_prop, GB_PROP_V)) &&
    137 	    (has_property(b, &gb[1], gb_prop, GB_PROP_V) ||
    138 	     has_property(b, &gb[1], gb_prop, GB_PROP_T))) {
    139 		return 0;
    140 	}
    141 
    142 	/* GB8 */
    143 	if ((has_property(a, &gb[0], gb_prop, GB_PROP_LVT) ||
    144 	     has_property(a, &gb[0], gb_prop, GB_PROP_T)) &&
    145 	    has_property(b, &gb[1], gb_prop, GB_PROP_T)) {
    146 		return 0;
    147 	}
    148 
    149 	/* GB9 */
    150 	if (has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND) ||
    151 	    has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) {
    152 		return 0;
    153 	}
    154 
    155 	/* GB9a */
    156 	if (has_property(b, &gb[1], gb_prop, GB_PROP_SPACINGMARK)) {
    157 		return 0;
    158 	}
    159 
    160 	/* GB9b */
    161 	if (has_property(a, &gb[0], gb_prop, GB_PROP_PREPEND)) {
    162 		return 0;
    163 	}
    164 
    165 	/* GB11 */
    166 	if ((s & GRAPHEME_STATE_EMOJI) &&
    167 	    has_property(a, &gb[0], gb_prop, GB_PROP_ZWJ) &&
    168 	    has_property(b, &emoji[1], emoji_prop, EMOJI_PROP_EXTPICT)) {
    169 		return 0;
    170 	}
    171 
    172 	/* GB12/GB13 */
    173 	if (has_property(a, &gb[0], gb_prop, GB_PROP_REGIONAL_INDICATOR) &&
    174 	    has_property(b, &gb[1], gb_prop, GB_PROP_REGIONAL_INDICATOR) &&
    175 	    (s & GRAPHEME_STATE_RI_ODD)) {
    176 		return 0;
    177 	}
    178 
    179 	/* GB999 */
    180 	return 1;
    181 }