libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

character.c (7934B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <limits.h>
      3 #include <stdbool.h>
      4 #include <stddef.h>
      5 
      6 #include "../gen/character.h"
      7 #include "../grapheme.h"
      8 #include "util.h"
      9 
     10 static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
     11 	[CHAR_BREAK_PROP_OTHER] =
     12 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
     13 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
     14 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
     15 	[CHAR_BREAK_PROP_CR] =
     16 		UINT16_C(1) << CHAR_BREAK_PROP_LF,            /* GB3  */
     17 	[CHAR_BREAK_PROP_EXTEND] =
     18 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
     19 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
     20 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
     21 	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
     22 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
     23 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
     24 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
     25 	[CHAR_BREAK_PROP_HANGUL_L] =
     26 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_L     | /* GB6  */
     27 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V     | /* GB6  */
     28 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LV    | /* GB6  */
     29 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LVT   | /* GB6  */
     30 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
     31 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
     32 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
     33 	[CHAR_BREAK_PROP_HANGUL_V] =
     34 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V     | /* GB7  */
     35 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T     | /* GB7  */
     36 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
     37 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
     38 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
     39 	[CHAR_BREAK_PROP_HANGUL_T] =
     40 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T     | /* GB8  */
     41 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
     42 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
     43 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
     44 	[CHAR_BREAK_PROP_HANGUL_LV] =
     45 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V     | /* GB7  */
     46 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T     | /* GB7  */
     47 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
     48 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
     49 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
     50 	[CHAR_BREAK_PROP_HANGUL_LVT] =
     51 		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T     | /* GB8  */
     52 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
     53 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
     54 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
     55 	[CHAR_BREAK_PROP_PREPEND] =
     56 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
     57 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
     58 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK  | /* GB9a */
     59 		(UINT16_C(0xFFFF) &
     60 		 ~(UINT16_C(1) << CHAR_BREAK_PROP_CR      |
     61 		   UINT16_C(1) << CHAR_BREAK_PROP_LF      |
     62 		   UINT16_C(1) << CHAR_BREAK_PROP_CONTROL
     63 		  )
     64 		),                                           /* GB9b */
     65 	[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
     66 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
     67 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
     68 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
     69 	[CHAR_BREAK_PROP_SPACINGMARK] =
     70 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
     71 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
     72 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
     73 	[CHAR_BREAK_PROP_ZWJ] =
     74 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
     75 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
     76 		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
     77 };
     78 static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
     79 	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
     80 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ                   |
     81 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
     82 	[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
     83 		UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
     84 	[CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
     85 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND                |
     86 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ,
     87 	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
     88 		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ                   |
     89 		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
     90 };
     91 static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
     92 	[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
     93 		UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
     94 };
     95 static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
     96 	[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
     97 		UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
     98 };
     99 static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
    100 	[CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
    101 		UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
    102 };
    103 
    104 static inline enum char_break_property
    105 get_break_prop(uint_least32_t cp)
    106 {
    107 	if (likely(cp <= 0x10FFFF)) {
    108 		return (enum char_break_property)
    109 		       char_break_minor[char_break_major[cp >> 8] + (cp & 0xff)];
    110 	} else {
    111 		return CHAR_BREAK_PROP_OTHER;
    112 	}
    113 }
    114 
    115 bool
    116 grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STATE *state)
    117 {
    118 	enum char_break_property cp0_prop, cp1_prop;
    119 	bool notbreak = false;
    120 
    121 	if (likely(state)) {
    122 		if (likely(state->prop_set)) {
    123 			cp0_prop = state->prop;
    124 		} else {
    125 			cp0_prop = get_break_prop(cp0);
    126 		}
    127 		cp1_prop = get_break_prop(cp1);
    128 
    129 		/* preserve prop of right codepoint for next iteration */
    130 		state->prop = (uint_least8_t)cp1_prop;
    131 		state->prop_set = true;
    132 
    133 		/* update flags */
    134 		state->gb11_flag =
    135 			flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
    136 			                 state->gb11_flag] &
    137 			UINT16_C(1) << cp1_prop;
    138 		state->gb12_13_flag =
    139 			flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS *
    140 		                            state->gb12_13_flag] &
    141 		        UINT16_C(1) << cp1_prop;
    142 
    143 		/*
    144 		 * Apply grapheme cluster breaking algorithm (UAX #29), see
    145 		 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
    146 		 */
    147 		notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
    148 		           (dont_break_gb11[cp0_prop + state->gb11_flag *
    149 		                            NUM_CHAR_BREAK_PROPS] &
    150 		            (UINT16_C(1) << cp1_prop)) ||
    151 		           (dont_break_gb12_13[cp0_prop + state->gb12_13_flag *
    152 		                               NUM_CHAR_BREAK_PROPS] &
    153 		            (UINT16_C(1) << cp1_prop));
    154 
    155 		/* update or reset flags (when we have a break) */
    156 		if (likely(!notbreak)) {
    157 			state->gb11_flag = state->gb12_13_flag = false;
    158 		}
    159 	} else {
    160 		cp0_prop = get_break_prop(cp0);
    161 		cp1_prop = get_break_prop(cp1);
    162 
    163 		/*
    164 		 * Apply grapheme cluster breaking algorithm (UAX #29), see
    165 		 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
    166 		 *
    167 		 * Given we have no state, this behaves as if the state-booleans
    168 		 * were all set to false
    169 		 */
    170 		notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
    171 		           (dont_break_gb11[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
    172 		           (dont_break_gb12_13[cp0_prop] & (UINT16_C(1) << cp1_prop));
    173 	}
    174 
    175 	return !notbreak;
    176 }
    177 
    178 static size_t
    179 next_character_break(HERODOTUS_READER *r)
    180 {
    181 	GRAPHEME_STATE state = { 0 };
    182 	uint_least32_t cp0 = 0, cp1 = 0;
    183 
    184 	for (herodotus_read_codepoint(r, true, &cp0);
    185 	     herodotus_read_codepoint(r, false, &cp1) == HERODOTUS_STATUS_SUCCESS;
    186 	     herodotus_read_codepoint(r, true, &cp0)) {
    187 		if (grapheme_is_character_break(cp0, cp1, &state)) {
    188 			break;
    189 		}
    190 	}
    191 
    192 	return herodotus_reader_number_read(r);
    193 }
    194 
    195 size_t
    196 grapheme_next_character_break(const uint_least32_t *str, size_t len)
    197 {
    198 	HERODOTUS_READER r;
    199 
    200 	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
    201 
    202 	return next_character_break(&r);
    203 }
    204 
    205 size_t
    206 grapheme_next_character_break_utf8(const char *str, size_t len)
    207 {
    208 	HERODOTUS_READER r;
    209 
    210 	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
    211 
    212 	return next_character_break(&r);
    213 }