boundary.c (5212B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <stddef.h> 3 #include <stdint.h> 4 #include <stdlib.h> 5 6 #include "../data/emoji.h" 7 #include "../data/grapheme_boundary.h" 8 9 enum { 10 GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */ 11 GRAPHEME_STATE_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */ 12 }; 13 14 static int 15 cp_cmp(const void *a, const void *b) 16 { 17 uint32_t cp = *(uint32_t *)a; 18 uint32_t *range = (uint32_t *)b; 19 20 return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]); 21 } 22 23 static int 24 has_property(uint32_t cp, struct heisenstate *cpstate, 25 const struct range_list *proptable, int property) 26 { 27 if (heisenstate_get(cpstate, property) == -1) { 28 /* state undetermined, make a lookup and set it */ 29 heisenstate_set(cpstate, property, bsearch(&cp, 30 proptable[property].data, 31 proptable[property].len, 32 sizeof(*proptable[property].data), 33 cp_cmp) ? 1 : 0); 34 } 35 36 return heisenstate_get(cpstate, property); 37 } 38 39 int 40 grapheme_boundary(uint32_t a, uint32_t b, int *state) 41 { 42 struct heisenstate gb[2] = { 0 }, emoji[2] = { 0 }; 43 int s; 44 45 /* skip printable ASCII */ 46 if ((a >= 0x20 && a <= 0x7E) && 47 (b >= 0x20 && b <= 0x7E)) { 48 return 1; 49 } 50 51 /* set internal state based on given state-pointer */ 52 s = (state != NULL) ? *state : 0; 53 54 /* 55 * Apply grapheme cluster breaking algorithm (UAX #29), see 56 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules 57 */ 58 59 /* 60 * update state 61 */ 62 if (has_property(b, &gb[1], gb_prop, GB_PROP_REGIONAL_INDICATOR)) { 63 if (has_property(a, &gb[0], gb_prop, GB_PROP_REGIONAL_INDICATOR)) { 64 /* one more RI is on the left side of the seam */ 65 s ^= GRAPHEME_STATE_RI_ODD; 66 } else { 67 /* an RI appeared on the right side but the left 68 side is not an RI, reset state (0 is even) */ 69 s &= ~GRAPHEME_STATE_RI_ODD; 70 } 71 } 72 if (!(*state & GRAPHEME_STATE_EMOJI) && 73 ((has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) && 74 has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) || 75 (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) && 76 has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND)))) { 77 s |= GRAPHEME_STATE_EMOJI; 78 } else if ((*state & GRAPHEME_STATE_EMOJI) && 79 ((has_property(a, &gb[0], gb_prop, GB_PROP_ZWJ) && 80 has_property(b, &emoji[1], emoji_prop, EMOJI_PROP_EXTPICT)) || 81 (has_property(a, &gb[0], gb_prop, GB_PROP_EXTEND) && 82 has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND)) || 83 (has_property(a, &gb[0], gb_prop, GB_PROP_EXTEND) && 84 has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) || 85 (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) && 86 has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) || 87 (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) && 88 has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND)))) { 89 /* GRAPHEME_STATE_EMOJI remains */ 90 } else { 91 s &= ~GRAPHEME_STATE_EMOJI; 92 } 93 94 /* write updated state to state-pointer, if given */ 95 if (state != NULL) { 96 *state = s; 97 } 98 99 /* 100 * apply rules 101 */ 102 103 /* skip GB1 and GB2, as they are never satisfied here */ 104 105 /* GB3 */ 106 if (has_property(a, &gb[0], gb_prop, GB_PROP_CR) && 107 has_property(b, &gb[1], gb_prop, GB_PROP_LF)) { 108 return 0; 109 } 110 111 /* GB4 */ 112 if (has_property(a, &gb[0], gb_prop, GB_PROP_CONTROL) || 113 has_property(a, &gb[0], gb_prop, GB_PROP_CR) || 114 has_property(a, &gb[0], gb_prop, GB_PROP_LF)) { 115 return 1; 116 } 117 118 /* GB5 */ 119 if (has_property(b, &gb[1], gb_prop, GB_PROP_CONTROL) || 120 has_property(b, &gb[1], gb_prop, GB_PROP_CR) || 121 has_property(b, &gb[1], gb_prop, GB_PROP_LF)) { 122 return 1; 123 } 124 125 /* GB6 */ 126 if (has_property(a, &gb[0], gb_prop, GB_PROP_L) && 127 (has_property(b, &gb[1], gb_prop, GB_PROP_L) || 128 has_property(b, &gb[1], gb_prop, GB_PROP_V) || 129 has_property(b, &gb[1], gb_prop, GB_PROP_LV) || 130 has_property(b, &gb[1], gb_prop, GB_PROP_LVT))) { 131 return 0; 132 } 133 134 /* GB7 */ 135 if ((has_property(a, &gb[0], gb_prop, GB_PROP_LV) || 136 has_property(a, &gb[0], gb_prop, GB_PROP_V)) && 137 (has_property(b, &gb[1], gb_prop, GB_PROP_V) || 138 has_property(b, &gb[1], gb_prop, GB_PROP_T))) { 139 return 0; 140 } 141 142 /* GB8 */ 143 if ((has_property(a, &gb[0], gb_prop, GB_PROP_LVT) || 144 has_property(a, &gb[0], gb_prop, GB_PROP_T)) && 145 has_property(b, &gb[1], gb_prop, GB_PROP_T)) { 146 return 0; 147 } 148 149 /* GB9 */ 150 if (has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND) || 151 has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) { 152 return 0; 153 } 154 155 /* GB9a */ 156 if (has_property(b, &gb[1], gb_prop, GB_PROP_SPACINGMARK)) { 157 return 0; 158 } 159 160 /* GB9b */ 161 if (has_property(a, &gb[0], gb_prop, GB_PROP_PREPEND)) { 162 return 0; 163 } 164 165 /* GB11 */ 166 if ((s & GRAPHEME_STATE_EMOJI) && 167 has_property(a, &gb[0], gb_prop, GB_PROP_ZWJ) && 168 has_property(b, &emoji[1], emoji_prop, EMOJI_PROP_EXTPICT)) { 169 return 0; 170 } 171 172 /* GB12/GB13 */ 173 if (has_property(a, &gb[0], gb_prop, GB_PROP_REGIONAL_INDICATOR) && 174 has_property(b, &gb[1], gb_prop, GB_PROP_REGIONAL_INDICATOR) && 175 (s & GRAPHEME_STATE_RI_ODD)) { 176 return 0; 177 } 178 179 /* GB999 */ 180 return 1; 181 }