grapheme.c (6818B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <stdbool.h> 3 #include <stddef.h> 4 #include <stdlib.h> 5 #include <string.h> 6 7 #include "../gen/grapheme.h" 8 #include "../grapheme.h" 9 #include "util.h" 10 11 enum { 12 GRAPHEME_FLAG_RI_ODD = 1 << 0, /* odd number of RI's before the seam */ 13 GRAPHEME_FLAG_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */ 14 }; 15 16 bool 17 lg_grapheme_isbreak(uint_least32_t a, uint_least32_t b, LG_SEGMENTATION_STATE *state) 18 { 19 struct lg_internal_heisenstate *p[2] = { 0 }; 20 uint_least16_t flags = 0; 21 bool isbreak = true; 22 23 /* set state depending on state pointer */ 24 if (state != NULL) { 25 p[0] = &(state->a); 26 p[1] = &(state->b); 27 flags = state->flags; 28 } 29 30 /* skip printable ASCII */ 31 if ((a >= 0x20 && a <= 0x7E) && 32 (b >= 0x20 && b <= 0x7E)) { 33 goto hasbreak; 34 } 35 36 /* 37 * Apply grapheme cluster breaking algorithm (UAX #29), see 38 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules 39 */ 40 41 /* 42 * update flags, if state-pointer given 43 */ 44 if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) { 45 if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) { 46 /* one more RI is on the left side of the seam, flip state */ 47 flags ^= GRAPHEME_FLAG_RI_ODD; 48 } else { 49 /* an RI appeared on the right side but the left 50 side is not an RI, reset state (number 0 is even) */ 51 flags &= ~GRAPHEME_FLAG_RI_ODD; 52 } 53 } 54 if (!(flags & GRAPHEME_FLAG_EMOJI) && 55 ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && 56 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) || 57 (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && 58 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) { 59 flags |= GRAPHEME_FLAG_EMOJI; 60 } else if ((flags & GRAPHEME_FLAG_EMOJI) && 61 ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_ZWJ) && 62 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) || 63 (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTEND) && 64 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)) || 65 (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTEND) && 66 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) || 67 (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && 68 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) || 69 (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && 70 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) { 71 /* GRAPHEME_FLAG_EMOJI remains */ 72 } else { 73 flags &= ~GRAPHEME_FLAG_EMOJI; 74 } 75 76 /* write updated flags to state, if given */ 77 if (state != NULL) { 78 state->flags = flags; 79 } 80 81 /* 82 * apply rules 83 */ 84 85 /* skip GB1 and GB2, as they are never satisfied here */ 86 87 /* GB3 */ 88 if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CR) && 89 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_LF)) { 90 goto nobreak; 91 } 92 93 /* GB4 */ 94 if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CONTROL) || 95 has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CR) || 96 has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_LF)) { 97 goto hasbreak; 98 } 99 100 /* GB5 */ 101 if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_CONTROL) || 102 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_CR) || 103 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_LF)) { 104 goto hasbreak; 105 } 106 107 /* GB6 */ 108 if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_L) && 109 (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_L) || 110 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) || 111 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) || 112 113 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT))) { 114 goto nobreak; 115 } 116 117 /* GB7 */ 118 if ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) || 119 has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_V)) && 120 (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) || 121 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T))) { 122 goto nobreak; 123 } 124 125 /* GB8 */ 126 if ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT) || 127 has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) && 128 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) { 129 goto nobreak; 130 } 131 132 /* GB9 */ 133 if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND) || 134 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) { 135 goto nobreak; 136 } 137 138 /* GB9a */ 139 if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_SPACINGMARK)) { 140 goto nobreak; 141 } 142 143 /* GB9b */ 144 if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_PREPEND)) { 145 goto nobreak; 146 } 147 148 /* GB11 */ 149 if ((flags & GRAPHEME_FLAG_EMOJI) && 150 has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_ZWJ) && 151 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) { 152 goto nobreak; 153 } 154 155 /* GB12/GB13 */ 156 if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) && 157 has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) && 158 (flags & GRAPHEME_FLAG_RI_ODD)) { 159 goto nobreak; 160 } 161 162 /* GB999 */ 163 goto hasbreak; 164 nobreak: 165 isbreak = false; 166 hasbreak: 167 if (state != NULL) { 168 /* move b-state to a-state, discard b-state */ 169 memcpy(&(state->a), &(state->b), sizeof(state->a)); 170 memset(&(state->b), 0, sizeof(state->b)); 171 172 /* reset flags */ 173 if (isbreak) { 174 state->flags = 0; 175 } 176 } 177 178 return isbreak; 179 } 180 181 size_t 182 lg_grapheme_nextbreak(const char *str) 183 { 184 uint_least32_t cp0, cp1; 185 size_t ret, len = 0; 186 LG_SEGMENTATION_STATE state = { 0 }; 187 188 if (str == NULL) { 189 return 0; 190 } 191 192 /* 193 * lg_utf8_decode, when it encounters an unexpected byte, 194 * does not count it to the error and instead assumes that the 195 * unexpected byte is the beginning of a new sequence. 196 * This way, when the string ends with a null byte, we never 197 * miss it, even if the previous UTF-8 sequence terminates 198 * unexpectedly, as it would either act as an unexpected byte, 199 * saved for later, or as a null byte itself, that we can catch. 200 * We pass (size_t)-1 to the length, as we will never read beyond 201 * the null byte for the reasons given above. 202 */ 203 204 /* get first code point */ 205 len += lg_utf8_decode(str, (size_t)-1, &cp0); 206 if (cp0 == LG_INVALID_CODE_POINT) { 207 return len; 208 } 209 210 while (cp0 != 0) { 211 /* get next code point */ 212 ret = lg_utf8_decode(str + len, (size_t)-1, &cp1); 213 214 if (cp1 == LG_INVALID_CODE_POINT || 215 lg_grapheme_isbreak(cp0, cp1, &state)) { 216 /* we read an invalid cp or have a breakpoint */ 217 break; 218 } else { 219 /* we don't have a breakpoint, continue */ 220 len += ret; 221 } 222 223 /* prepare next round */ 224 cp0 = cp1; 225 } 226 227 return len; 228 }