libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

character.c (8109B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <stdbool.h>
      3 #include <stddef.h>
      4 #include <stdlib.h>
      5 #include <string.h>
      6 
      7 #include "../gen/character.h"
      8 #include "../grapheme.h"
      9 #include "util.h"
     10 
     11 static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
     12 	[CHAR_BREAK_PROP_OTHER] =
     13 		UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
     14 		UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
     15 		UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
     16 	[CHAR_BREAK_PROP_CR] =
     17 		UINT16_C(1 << CHAR_BREAK_PROP_LF),            /* GB3  */
     18 	[CHAR_BREAK_PROP_EXTEND] =
     19 		UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
     20 		UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
     21 		UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
     22 	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
     23 		UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
     24 		UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
     25 		UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
     26 	[CHAR_BREAK_PROP_HANGUL_L] =
     27 		UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_L)     | /* GB6  */
     28 		UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V)     | /* GB6  */
     29 		UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_LV)    | /* GB6  */
     30 		UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_LVT)   | /* GB6  */
     31 		UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
     32 		UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
     33 		UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
     34 	[CHAR_BREAK_PROP_HANGUL_V] =
     35 		UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V)     | /* GB7  */
     36 		UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T)     | /* GB7  */
     37 		UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
     38 		UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
     39 		UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
     40 	[CHAR_BREAK_PROP_HANGUL_T] =
     41 		UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T)     | /* GB8  */
     42 		UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
     43 		UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
     44 		UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
     45 	[CHAR_BREAK_PROP_HANGUL_LV] =
     46 		UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V)     | /* GB7  */
     47 		UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T)     | /* GB7  */
     48 		UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
     49 		UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
     50 		UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
     51 	[CHAR_BREAK_PROP_HANGUL_LVT] =
     52 		UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T)     | /* GB8  */
     53 		UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
     54 		UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
     55 		UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
     56 	[CHAR_BREAK_PROP_PREPEND] =
     57 		UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
     58 		UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
     59 		UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK)  | /* GB9a */
     60 		(UINT16_C(0xFFFF) &
     61 		 ~(UINT16_C(1 << CHAR_BREAK_PROP_CR)      |
     62 		   UINT16_C(1 << CHAR_BREAK_PROP_LF)      |
     63 		   UINT16_C(1 << CHAR_BREAK_PROP_CONTROL)
     64 		  )
     65 		),                                           /* GB9b */
     66 	[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
     67 		UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
     68 		UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
     69 		UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
     70 	[CHAR_BREAK_PROP_SPACINGMARK] =
     71 		UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
     72 		UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
     73 		UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
     74 	[CHAR_BREAK_PROP_ZWJ] =
     75 		UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
     76 		UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
     77 		UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
     78 };
     79 static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
     80 	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
     81 		UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)                   |
     82 		UINT16_C(1 << CHAR_BREAK_PROP_EXTEND),
     83 	[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
     84 		UINT16_C(1 << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC),
     85 	[CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
     86 		UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)                |
     87 		UINT16_C(1 << CHAR_BREAK_PROP_ZWJ),
     88 	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
     89 		UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)                   |
     90 		UINT16_C(1 << CHAR_BREAK_PROP_EXTEND),
     91 };
     92 static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
     93 	[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
     94 		UINT16_C(1 << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC),
     95 };
     96 static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
     97 	[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
     98 		UINT16_C(1 << CHAR_BREAK_PROP_REGIONAL_INDICATOR),
     99 };
    100 static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
    101 	[CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
    102 		UINT16_C(1 << CHAR_BREAK_PROP_REGIONAL_INDICATOR),
    103 };
    104 
    105 static enum char_break_property
    106 get_break_prop(uint_least32_t cp)
    107 {
    108 	if (likely(cp <= 0x10FFFF)) {
    109 		return (enum char_break_property)
    110 		       char_break_minor[char_break_major[cp >> 8] + (cp & 0xff)];
    111 	} else {
    112 		return CHAR_BREAK_PROP_OTHER;
    113 	}
    114 }
    115 
    116 bool
    117 grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STATE *state)
    118 {
    119 	enum char_break_property cp0_prop, cp1_prop;
    120 	bool notbreak = false;
    121 
    122 	if (likely(state)) {
    123 		if (likely(state->prop_set)) {
    124 			cp0_prop = state->prop;
    125 		} else {
    126 			cp0_prop = get_break_prop(cp0);
    127 		}
    128 		cp1_prop = get_break_prop(cp1);
    129 
    130 		/* preserve prop of right codepoint for next iteration */
    131 		state->prop = (uint_least8_t)cp1_prop;
    132 		state->prop_set = true;
    133 
    134 		/* update flags */
    135 		state->gb11_flag = flag_update_gb11[cp0_prop +
    136 		                                    NUM_CHAR_BREAK_PROPS *
    137 		                                    state->gb11_flag] &
    138 	                           UINT16_C(1 << cp1_prop);
    139 		state->gb12_13_flag = flag_update_gb12_13[cp0_prop +
    140 		                                          NUM_CHAR_BREAK_PROPS *
    141 		                                          state->gb12_13_flag] &
    142 		                      UINT16_C(1 << cp1_prop);
    143 
    144 		/*
    145 		 * Apply grapheme cluster breaking algorithm (UAX #29), see
    146 		 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
    147 		 */
    148 		notbreak = (dont_break[cp0_prop] & UINT16_C(1 << cp1_prop)) ||
    149 		           (dont_break_gb11[cp0_prop + state->gb11_flag *
    150 		                            NUM_CHAR_BREAK_PROPS] &
    151 		            UINT16_C(1 << cp1_prop)) ||
    152 		           (dont_break_gb12_13[cp0_prop + state->gb12_13_flag *
    153 		                               NUM_CHAR_BREAK_PROPS] &
    154 		            UINT16_C(1 << cp1_prop));
    155 
    156 		/* update or reset flags (when we have a break) */
    157 		if (likely(!notbreak)) {
    158 			state->gb11_flag = state->gb12_13_flag = false;
    159 		}
    160 	} else {
    161 		cp0_prop = get_break_prop(cp0);
    162 		cp1_prop = get_break_prop(cp1);
    163 
    164 		/*
    165 		 * Apply grapheme cluster breaking algorithm (UAX #29), see
    166 		 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
    167 		 *
    168 		 * Given we have no state, this behaves as if the state-booleans
    169 		 * were all set to false
    170 		 */
    171 		notbreak = (dont_break[cp0_prop] & UINT16_C(1 << cp1_prop)) ||
    172 		           (dont_break_gb11[cp0_prop] & UINT16_C(1 << cp1_prop)) ||
    173 		           (dont_break_gb12_13[cp0_prop] & UINT16_C(1 << cp1_prop));
    174 	}
    175 
    176 	return !notbreak;
    177 }
    178 
    179 size_t
    180 grapheme_next_character_break(const char *str, size_t len)
    181 {
    182 	GRAPHEME_STATE state = { 0 };
    183 	uint_least32_t cp0 = 0, cp1 = 0;
    184 	size_t off, ret;
    185 
    186 	if (str == NULL || len == 0) {
    187 		return 0;
    188 	}
    189 
    190 	for (off = 0; (len == SIZE_MAX) || off < len; off += ret) {
    191 		cp0 = cp1;
    192 		ret = grapheme_decode_utf8(str + off, (len == SIZE_MAX) ?
    193 		                           SIZE_MAX : len - off, &cp1);
    194 
    195 		if (len != SIZE_MAX && ret > (len - off)) {
    196 			/* string ended abruptly, simply accept cropping */
    197 			ret = len - off;
    198 		}
    199 
    200 		if (len == SIZE_MAX && cp1 == 0) {
    201 			/* we hit a NUL-byte and are done */
    202 			break;
    203 		}
    204 
    205 		if (off == 0) {
    206 			/*
    207 			 * we skip the first round, as we need both
    208 			 * cp0 and cp1 to be initialized
    209 			 */
    210 			continue;
    211 		} else if (grapheme_is_character_break(cp0, cp1, &state)) {
    212 			break;
    213 		}
    214 	}
    215 
    216 	return off;
    217 }