libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

character.c (8932B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <limits.h>
      3 #include <stdbool.h>
      4 #include <stddef.h>
      5 
      6 #include "../gen/character.h"
      7 #include "../grapheme.h"
      8 #include "util.h"
      9 
     10 struct character_break_state {
     11 	uint_least8_t prop;
     12 	bool prop_set;
     13 	bool gb11_flag;
     14 	bool gb12_13_flag;
     15 };
     16 
     17 static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
     18 	[CHAR_BREAK_PROP_OTHER] =
     19 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |           /* GB9  */
     20 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |              /* GB9  */
     21 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,       /* GB9a */
     22 	[CHAR_BREAK_PROP_CR] = UINT16_C(1) << CHAR_BREAK_PROP_LF, /* GB3  */
     23 	[CHAR_BREAK_PROP_EXTEND] =
     24 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     25 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     26 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     27 	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
     28 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     29 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     30 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     31 	[CHAR_BREAK_PROP_HANGUL_L] =
     32 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_L |   /* GB6  */
     33 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V |   /* GB6  */
     34 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LV |  /* GB6  */
     35 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6  */
     36 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     37 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     38 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     39 	[CHAR_BREAK_PROP_HANGUL_V] =
     40 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V |   /* GB7  */
     41 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T |   /* GB7  */
     42 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     43 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     44 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     45 	[CHAR_BREAK_PROP_HANGUL_T] =
     46 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T |   /* GB8  */
     47 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     48 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     49 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     50 	[CHAR_BREAK_PROP_HANGUL_LV] =
     51 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V |   /* GB7  */
     52 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T |   /* GB7  */
     53 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     54 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     55 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     56 	[CHAR_BREAK_PROP_HANGUL_LVT] =
     57 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T |   /* GB8  */
     58 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     59 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     60 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     61 	[CHAR_BREAK_PROP_PREPEND] =
     62 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |      /* GB9  */
     63 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |         /* GB9  */
     64 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */
     65 		(UINT16_C(0xFFFF) &
     66 	         ~(UINT16_C(1) << CHAR_BREAK_PROP_CR |
     67 	           UINT16_C(1) << CHAR_BREAK_PROP_LF |
     68 	           UINT16_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */
     69 	[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
     70 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     71 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     72 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     73 	[CHAR_BREAK_PROP_SPACINGMARK] =
     74 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     75 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     76 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     77 	[CHAR_BREAK_PROP_ZWJ] =
     78 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     79 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     80 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     81 };
     82 static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
     83 	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
     84 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
     85 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
     86 	[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
     87 		UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
     88 	[CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
     89 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |
     90 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ,
     91 	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
     92 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
     93 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
     94 };
     95 static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
     96 	[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
     97 		UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
     98 };
     99 static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
    100 	[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
    101 		UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
    102 };
    103 static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
    104 	[CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
    105 		UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
    106 };
    107 
    108 static inline enum char_break_property
    109 get_break_prop(uint_least32_t cp)
    110 {
    111 	if (likely(cp <= UINT32_C(0x10FFFF))) {
    112 		return (enum char_break_property)
    113 			char_break_minor[char_break_major[cp >> 8] +
    114 		                         (cp & 0xFF)];
    115 	} else {
    116 		return CHAR_BREAK_PROP_OTHER;
    117 	}
    118 }
    119 
    120 static inline void
    121 state_serialize(const struct character_break_state *in, uint_least16_t *out)
    122 {
    123 	*out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 bits */
    124 	       (uint_least16_t)(((uint_least16_t)(in->prop_set))
    125 	                        << 8) | /* 9th bit */
    126 	       (uint_least16_t)(((uint_least16_t)(in->gb11_flag))
    127 	                        << 9) | /* 10th bit */
    128 	       (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag))
    129 	                        << 10); /* 11th bit */
    130 }
    131 
    132 static inline void
    133 state_deserialize(uint_least16_t in, struct character_break_state *out)
    134 {
    135 	out->prop = in & UINT8_C(0xFF);
    136 	out->prop_set = in & (UINT16_C(1) << 8);
    137 	out->gb11_flag = in & (UINT16_C(1) << 9);
    138 	out->gb12_13_flag = in & (UINT16_C(1) << 10);
    139 }
    140 
    141 bool
    142 grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1,
    143                             uint_least16_t *s)
    144 {
    145 	struct character_break_state state;
    146 	enum char_break_property cp0_prop, cp1_prop;
    147 	bool notbreak = false;
    148 
    149 	if (likely(s)) {
    150 		state_deserialize(*s, &state);
    151 
    152 		if (likely(state.prop_set)) {
    153 			cp0_prop = state.prop;
    154 		} else {
    155 			cp0_prop = get_break_prop(cp0);
    156 		}
    157 		cp1_prop = get_break_prop(cp1);
    158 
    159 		/* preserve prop of right codepoint for next iteration */
    160 		state.prop = (uint_least8_t)cp1_prop;
    161 		state.prop_set = true;
    162 
    163 		/* update flags */
    164 		state.gb11_flag =
    165 			flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
    166 		                                            state.gb11_flag] &
    167 			UINT16_C(1) << cp1_prop;
    168 		state.gb12_13_flag =
    169 			flag_update_gb12_13[cp0_prop +
    170 		                            NUM_CHAR_BREAK_PROPS *
    171 		                                    state.gb12_13_flag] &
    172 			UINT16_C(1) << cp1_prop;
    173 
    174 		/*
    175 		 * Apply grapheme cluster breaking algorithm (UAX #29), see
    176 		 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
    177 		 */
    178 		notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
    179 		           (dont_break_gb11[cp0_prop +
    180 		                            state.gb11_flag *
    181 		                                    NUM_CHAR_BREAK_PROPS] &
    182 		            (UINT16_C(1) << cp1_prop)) ||
    183 		           (dont_break_gb12_13[cp0_prop +
    184 		                               state.gb12_13_flag *
    185 		                                       NUM_CHAR_BREAK_PROPS] &
    186 		            (UINT16_C(1) << cp1_prop));
    187 
    188 		/* update or reset flags (when we have a break) */
    189 		if (likely(!notbreak)) {
    190 			state.gb11_flag = state.gb12_13_flag = false;
    191 		}
    192 
    193 		state_serialize(&state, s);
    194 	} else {
    195 		cp0_prop = get_break_prop(cp0);
    196 		cp1_prop = get_break_prop(cp1);
    197 
    198 		/*
    199 		 * Apply grapheme cluster breaking algorithm (UAX #29), see
    200 		 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
    201 		 *
    202 		 * Given we have no state, this behaves as if the state-booleans
    203 		 * were all set to false
    204 		 */
    205 		notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
    206 		           (dont_break_gb11[cp0_prop] &
    207 		            (UINT16_C(1) << cp1_prop)) ||
    208 		           (dont_break_gb12_13[cp0_prop] &
    209 		            (UINT16_C(1) << cp1_prop));
    210 	}
    211 
    212 	return !notbreak;
    213 }
    214 
    215 static size_t
    216 next_character_break(HERODOTUS_READER *r)
    217 {
    218 	uint_least16_t state = 0;
    219 	uint_least32_t cp0 = 0, cp1 = 0;
    220 
    221 	for (herodotus_read_codepoint(r, true, &cp0);
    222 	     herodotus_read_codepoint(r, false, &cp1) ==
    223 	     HERODOTUS_STATUS_SUCCESS;
    224 	     herodotus_read_codepoint(r, true, &cp0)) {
    225 		if (grapheme_is_character_break(cp0, cp1, &state)) {
    226 			break;
    227 		}
    228 	}
    229 
    230 	return herodotus_reader_number_read(r);
    231 }
    232 
    233 size_t
    234 grapheme_next_character_break(const uint_least32_t *str, size_t len)
    235 {
    236 	HERODOTUS_READER r;
    237 
    238 	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
    239 
    240 	return next_character_break(&r);
    241 }
    242 
    243 size_t
    244 grapheme_next_character_break_utf8(const char *str, size_t len)
    245 {
    246 	HERODOTUS_READER r;
    247 
    248 	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
    249 
    250 	return next_character_break(&r);
    251 }