character.c (8109B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <stdbool.h> 3 #include <stddef.h> 4 #include <stdlib.h> 5 #include <string.h> 6 7 #include "../gen/character.h" 8 #include "../grapheme.h" 9 #include "util.h" 10 11 static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = { 12 [CHAR_BREAK_PROP_OTHER] = 13 UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ 14 UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ 15 UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ 16 [CHAR_BREAK_PROP_CR] = 17 UINT16_C(1 << CHAR_BREAK_PROP_LF), /* GB3 */ 18 [CHAR_BREAK_PROP_EXTEND] = 19 UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ 20 UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ 21 UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ 22 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = 23 UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ 24 UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ 25 UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ 26 [CHAR_BREAK_PROP_HANGUL_L] = 27 UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_L) | /* GB6 */ 28 UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V) | /* GB6 */ 29 UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_LV) | /* GB6 */ 30 UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_LVT) | /* GB6 */ 31 UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ 32 UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ 33 UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ 34 [CHAR_BREAK_PROP_HANGUL_V] = 35 UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V) | /* GB7 */ 36 UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB7 */ 37 UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ 38 UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ 39 UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ 40 [CHAR_BREAK_PROP_HANGUL_T] = 41 UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB8 */ 42 UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ 43 UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ 44 UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ 45 [CHAR_BREAK_PROP_HANGUL_LV] = 46 UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V) | /* GB7 */ 47 UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB7 */ 48 UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ 49 UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ 50 UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ 51 [CHAR_BREAK_PROP_HANGUL_LVT] = 52 UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB8 */ 53 UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ 54 UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ 55 UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ 56 [CHAR_BREAK_PROP_PREPEND] = 57 UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ 58 UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ 59 UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK) | /* GB9a */ 60 (UINT16_C(0xFFFF) & 61 ~(UINT16_C(1 << CHAR_BREAK_PROP_CR) | 62 UINT16_C(1 << CHAR_BREAK_PROP_LF) | 63 UINT16_C(1 << CHAR_BREAK_PROP_CONTROL) 64 ) 65 ), /* GB9b */ 66 [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = 67 UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ 68 UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ 69 UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ 70 [CHAR_BREAK_PROP_SPACINGMARK] = 71 UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ 72 UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ 73 UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ 74 [CHAR_BREAK_PROP_ZWJ] = 75 UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ 76 UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ 77 UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ 78 }; 79 static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = { 80 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = 81 UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | 82 UINT16_C(1 << CHAR_BREAK_PROP_EXTEND), 83 [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = 84 UINT16_C(1 << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC), 85 [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] = 86 UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | 87 UINT16_C(1 << CHAR_BREAK_PROP_ZWJ), 88 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] = 89 UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | 90 UINT16_C(1 << CHAR_BREAK_PROP_EXTEND), 91 }; 92 static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = { 93 [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = 94 UINT16_C(1 << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC), 95 }; 96 static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { 97 [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = 98 UINT16_C(1 << CHAR_BREAK_PROP_REGIONAL_INDICATOR), 99 }; 100 static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { 101 [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] = 102 UINT16_C(1 << CHAR_BREAK_PROP_REGIONAL_INDICATOR), 103 }; 104 105 static enum char_break_property 106 get_break_prop(uint_least32_t cp) 107 { 108 if (likely(cp <= 0x10FFFF)) { 109 return (enum char_break_property) 110 char_break_minor[char_break_major[cp >> 8] + (cp & 0xff)]; 111 } else { 112 return CHAR_BREAK_PROP_OTHER; 113 } 114 } 115 116 bool 117 grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STATE *state) 118 { 119 enum char_break_property cp0_prop, cp1_prop; 120 bool notbreak = false; 121 122 if (likely(state)) { 123 if (likely(state->prop_set)) { 124 cp0_prop = state->prop; 125 } else { 126 cp0_prop = get_break_prop(cp0); 127 } 128 cp1_prop = get_break_prop(cp1); 129 130 /* preserve prop of right codepoint for next iteration */ 131 state->prop = (uint_least8_t)cp1_prop; 132 state->prop_set = true; 133 134 /* update flags */ 135 state->gb11_flag = flag_update_gb11[cp0_prop + 136 NUM_CHAR_BREAK_PROPS * 137 state->gb11_flag] & 138 UINT16_C(1 << cp1_prop); 139 state->gb12_13_flag = flag_update_gb12_13[cp0_prop + 140 NUM_CHAR_BREAK_PROPS * 141 state->gb12_13_flag] & 142 UINT16_C(1 << cp1_prop); 143 144 /* 145 * Apply grapheme cluster breaking algorithm (UAX #29), see 146 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules 147 */ 148 notbreak = (dont_break[cp0_prop] & UINT16_C(1 << cp1_prop)) || 149 (dont_break_gb11[cp0_prop + state->gb11_flag * 150 NUM_CHAR_BREAK_PROPS] & 151 UINT16_C(1 << cp1_prop)) || 152 (dont_break_gb12_13[cp0_prop + state->gb12_13_flag * 153 NUM_CHAR_BREAK_PROPS] & 154 UINT16_C(1 << cp1_prop)); 155 156 /* update or reset flags (when we have a break) */ 157 if (likely(!notbreak)) { 158 state->gb11_flag = state->gb12_13_flag = false; 159 } 160 } else { 161 cp0_prop = get_break_prop(cp0); 162 cp1_prop = get_break_prop(cp1); 163 164 /* 165 * Apply grapheme cluster breaking algorithm (UAX #29), see 166 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules 167 * 168 * Given we have no state, this behaves as if the state-booleans 169 * were all set to false 170 */ 171 notbreak = (dont_break[cp0_prop] & UINT16_C(1 << cp1_prop)) || 172 (dont_break_gb11[cp0_prop] & UINT16_C(1 << cp1_prop)) || 173 (dont_break_gb12_13[cp0_prop] & UINT16_C(1 << cp1_prop)); 174 } 175 176 return !notbreak; 177 } 178 179 size_t 180 grapheme_next_character_break(const char *str, size_t len) 181 { 182 GRAPHEME_STATE state = { 0 }; 183 uint_least32_t cp0 = 0, cp1 = 0; 184 size_t off, ret; 185 186 if (str == NULL || len == 0) { 187 return 0; 188 } 189 190 for (off = 0; (len == SIZE_MAX) || off < len; off += ret) { 191 cp0 = cp1; 192 ret = grapheme_decode_utf8(str + off, (len == SIZE_MAX) ? 193 SIZE_MAX : len - off, &cp1); 194 195 if (len != SIZE_MAX && ret > (len - off)) { 196 /* string ended abruptly, simply accept cropping */ 197 ret = len - off; 198 } 199 200 if (len == SIZE_MAX && cp1 == 0) { 201 /* we hit a NUL-byte and are done */ 202 break; 203 } 204 205 if (off == 0) { 206 /* 207 * we skip the first round, as we need both 208 * cp0 and cp1 to be initialized 209 */ 210 continue; 211 } else if (grapheme_is_character_break(cp0, cp1, &state)) { 212 break; 213 } 214 } 215 216 return off; 217 }