boundary_body.c (6474B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <stddef.h> 3 #include <stdint.h> 4 #include <stdlib.h> 5 6 #define LEN(x) (sizeof(x) / sizeof(*x)) 7 8 enum { 9 GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */ 10 GRAPHEME_STATE_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */ 11 }; 12 13 enum cp_property { 14 PROP_CR, /* carriage return */ 15 PROP_LF, /* line feed */ 16 PROP_CONTROL, /* control character */ 17 PROP_EXTEND, /* grapheme extender (TODO Emoji_Modifier=Yes) */ 18 PROP_ZWJ, /* zero width joiner */ 19 PROP_RI, /* regional indicator */ 20 PROP_PREPEND, /* prepend character */ 21 PROP_SPACINGMARK, /* spacing mark */ 22 PROP_L, /* hangul syllable type L */ 23 PROP_V, /* hangul syllable type V */ 24 PROP_T, /* hangul syllable type T */ 25 PROP_LV, /* hangul syllable type LV */ 26 PROP_LVT, /* hangul syllable type LVT */ 27 PROP_EXTPICT, /* extended pictographic */ 28 }; 29 30 struct { 31 const uint32_t (*table)[2]; 32 size_t tablelen; 33 } cp_property_tables[] = { 34 [PROP_CR] = { 35 .table = cr_table, 36 .tablelen = LEN(cr_table), 37 }, 38 [PROP_LF] = { 39 .table = lf_table, 40 .tablelen = LEN(lf_table), 41 }, 42 [PROP_CONTROL] = { 43 .table = control_table, 44 .tablelen = LEN(control_table), 45 }, 46 [PROP_EXTEND] = { 47 .table = extend_table, 48 .tablelen = LEN(extend_table), 49 }, 50 [PROP_ZWJ] = { 51 .table = zwj_table, 52 .tablelen = LEN(zwj_table), 53 }, 54 [PROP_RI] = { 55 .table = ri_table, 56 .tablelen = LEN(ri_table), 57 }, 58 [PROP_PREPEND] = { 59 .table = prepend_table, 60 .tablelen = LEN(prepend_table), 61 }, 62 [PROP_SPACINGMARK] = { 63 .table = spacingmark_table, 64 .tablelen = LEN(spacingmark_table), 65 }, 66 [PROP_L] = { 67 .table = l_table, 68 .tablelen = LEN(l_table), 69 }, 70 [PROP_V] = { 71 .table = v_table, 72 .tablelen = LEN(v_table), 73 }, 74 [PROP_T] = { 75 .table = t_table, 76 .tablelen = LEN(t_table), 77 }, 78 [PROP_LV] = { 79 .table = lv_table, 80 .tablelen = LEN(lv_table), 81 }, 82 [PROP_LVT] = { 83 .table = lvt_table, 84 .tablelen = LEN(lvt_table), 85 }, 86 [PROP_EXTPICT] = { 87 .table = extpict_table, 88 .tablelen = LEN(extpict_table), 89 }, 90 }; 91 92 struct cp_properties { 93 uint32_t cp; 94 int_least16_t determined; 95 int_least16_t state; 96 }; 97 98 static int 99 cp_cmp(const void *a, const void *b) 100 { 101 uint32_t cp = *(uint32_t *)a; 102 uint32_t *range = (uint32_t *)b; 103 104 return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]); 105 } 106 107 static int 108 has_property(struct cp_properties *props, enum cp_property p) 109 { 110 if (!(props->determined & (1 << p))) { 111 /* not determined yet, do a lookup and set the state */ 112 if (bsearch(&props->cp, cp_property_tables[p].table, 113 cp_property_tables[p].tablelen, 114 sizeof(*cp_property_tables[p].table), 115 cp_cmp)) { 116 props->state |= (1 << p); 117 } else { 118 props->state &= ~(1 << p); 119 } 120 121 /* now it's determined */ 122 props->determined |= (1 << p); 123 } 124 125 return (props->state & (1 << p)); 126 } 127 128 int 129 grapheme_boundary(uint32_t a, uint32_t b, int *state) 130 { 131 struct cp_properties props[] = { 132 { 133 .cp = a, 134 }, 135 { 136 .cp = b, 137 }, 138 }; 139 int s; 140 141 /* skip printable ASCII */ 142 if ((a >= 0x20 && a <= 0x7E) && 143 (b >= 0x20 && b <= 0x7E)) { 144 return 1; 145 } 146 147 /* set internal state based on given state-pointer */ 148 s = (state != NULL) ? *state : 0; 149 150 /* 151 * Apply grapheme cluster breaking algorithm (UAX #29), see 152 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules 153 */ 154 155 /* 156 * update state 157 */ 158 if (has_property(&props[1], PROP_RI)) { 159 if (has_property(&props[0], PROP_RI)) { 160 /* one more RI is on the left side of the seam */ 161 s ^= GRAPHEME_STATE_RI_ODD; 162 } else { 163 /* an RI appeared on the right side but the left 164 side is not an RI, reset state (0 is even) */ 165 s &= ~GRAPHEME_STATE_RI_ODD; 166 } 167 } 168 if (!(*state & GRAPHEME_STATE_EMOJI) && 169 ((has_property(&props[0], PROP_EXTPICT) && 170 has_property(&props[1], PROP_ZWJ)) || 171 (has_property(&props[0], PROP_EXTPICT) && 172 has_property(&props[1], PROP_EXTEND)))) { 173 s |= GRAPHEME_STATE_EMOJI; 174 } else if ((*state & GRAPHEME_STATE_EMOJI) && 175 ((has_property(&props[0], PROP_ZWJ) && 176 has_property(&props[1], PROP_EXTPICT)) || 177 (has_property(&props[0], PROP_EXTEND) && 178 has_property(&props[1], PROP_EXTEND)) || 179 (has_property(&props[0], PROP_EXTEND) && 180 has_property(&props[1], PROP_ZWJ)) || 181 (has_property(&props[0], PROP_EXTPICT) && 182 has_property(&props[1], PROP_ZWJ)) || 183 (has_property(&props[0], PROP_EXTPICT) && 184 has_property(&props[1], PROP_EXTEND)))) { 185 /* GRAPHEME_STATE_EMOJI remains */ 186 } else { 187 s &= ~GRAPHEME_STATE_EMOJI; 188 } 189 190 /* write updated state to state-pointer, if given */ 191 if (state != NULL) { 192 *state = s; 193 } 194 195 /* 196 * apply rules 197 */ 198 199 /* skip GB1 and GB2, as they are never satisfied here */ 200 201 /* GB3 */ 202 if (has_property(&props[0], PROP_CR) && 203 has_property(&props[1], PROP_LF)) { 204 return 0; 205 } 206 207 /* GB4 */ 208 if (has_property(&props[0], PROP_CONTROL) || 209 has_property(&props[0], PROP_CR) || 210 has_property(&props[0], PROP_LF)) { 211 return 1; 212 } 213 214 /* GB5 */ 215 if (has_property(&props[1], PROP_CONTROL) || 216 has_property(&props[1], PROP_CR) || 217 has_property(&props[1], PROP_LF)) { 218 return 1; 219 } 220 221 /* GB6 */ 222 if (has_property(&props[0], PROP_L) && 223 (has_property(&props[1], PROP_L) || 224 has_property(&props[1], PROP_V) || 225 has_property(&props[1], PROP_LV) || 226 has_property(&props[1], PROP_LVT))) { 227 return 0; 228 } 229 230 /* GB7 */ 231 if ((has_property(&props[0], PROP_LV) || 232 has_property(&props[0], PROP_V)) && 233 (has_property(&props[1], PROP_V) || 234 has_property(&props[1], PROP_T))) { 235 return 0; 236 } 237 238 /* GB8 */ 239 if ((has_property(&props[0], PROP_LVT) || 240 has_property(&props[0], PROP_T)) && 241 has_property(&props[1], PROP_T)) { 242 return 0; 243 } 244 245 /* GB9 */ 246 if (has_property(&props[1], PROP_EXTEND) || 247 has_property(&props[1], PROP_ZWJ)) { 248 return 0; 249 } 250 251 /* GB9a */ 252 if (has_property(&props[1], PROP_SPACINGMARK)) { 253 return 0; 254 } 255 256 /* GB9b */ 257 if (has_property(&props[0], PROP_PREPEND)) { 258 return 0; 259 } 260 261 /* GB11 */ 262 if ((s & GRAPHEME_STATE_EMOJI) && 263 has_property(&props[0], PROP_ZWJ) && 264 has_property(&props[1], PROP_EXTPICT)) { 265 return 0; 266 } 267 268 /* GB12/GB13 */ 269 if (has_property(&props[0], PROP_RI) && 270 has_property(&props[1], PROP_RI) && 271 (s & GRAPHEME_STATE_RI_ODD)) { 272 return 0; 273 } 274 275 /* GB999 */ 276 return 1; 277 }