libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

case.c (12993B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <stddef.h>
      3 #include <stdint.h>
      4 
      5 #include "../gen/case.h"
      6 #include "../grapheme.h"
      7 #include "util.h"
      8 
      9 static inline enum case_property
     10 get_case_property(uint_least32_t cp)
     11 {
     12 	if (likely(cp <= UINT32_C(0x10FFFF))) {
     13 		return (enum case_property)
     14 			case_minor[case_major[cp >> 8] + (cp & 0xFF)];
     15 	} else {
     16 		return CASE_PROP_OTHER;
     17 	}
     18 }
     19 
     20 static inline int_least32_t
     21 get_case_offset(uint_least32_t cp, const uint_least16_t *major,
     22                 const int_least32_t *minor)
     23 {
     24 	if (likely(cp <= UINT32_C(0x10FFFF))) {
     25 		/*
     26 		 * this value might be larger than or equal to 0x110000
     27 		 * for the special-case-mapping. This needs to be handled
     28 		 * separately
     29 		 */
     30 		return minor[major[cp >> 8] + (cp & 0xFF)];
     31 	} else {
     32 		return 0;
     33 	}
     34 }
     35 
     36 static inline size_t
     37 to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
     38         uint_least8_t final_sigma_level, const uint_least16_t *major,
     39         const int_least32_t *minor, const struct special_case *sc)
     40 {
     41 	HERODOTUS_READER tmp;
     42 	enum case_property prop;
     43 	enum herodotus_status s;
     44 	size_t off, i;
     45 	uint_least32_t cp, tmp_cp;
     46 	int_least32_t map;
     47 
     48 	for (; herodotus_read_codepoint(r, true, &cp) ==
     49 	       HERODOTUS_STATUS_SUCCESS;) {
     50 		if (sc == lower_special) {
     51 			/*
     52 			 * For the special Final_Sigma-rule (see
     53 			 * SpecialCasing.txt), which is the only non-localized
     54 			 * case-dependent rule, we apply a different mapping
     55 			 * when a sigma is at the end of a word.
     56 			 *
     57 			 * Before: cased case-ignorable*
     58 			 * After: not(case-ignorable* cased)
     59 			 *
     60 			 * We check the after-condition on demand, but the
     61 			 * before- condition is best checked using the
     62 			 * "level"-heuristic also used in the sentence and line
     63 			 * breaking-implementations.
     64 			 */
     65 			if (cp == UINT32_C(0x03A3) && /* GREEK CAPITAL LETTER
     66 			                                 SIGMA */
     67 			    (final_sigma_level == 1 ||
     68 			     final_sigma_level == 2)) {
     69 				/*
     70 				 * check succeeding characters by first skipping
     71 				 * all case-ignorable characters and then
     72 				 * checking if the succeeding character is
     73 				 * cased, invalidating the after-condition
     74 				 */
     75 				herodotus_reader_copy(r, &tmp);
     76 				for (prop = NUM_CASE_PROPS;
     77 				     (s = herodotus_read_codepoint(&tmp, true,
     78 				                                   &tmp_cp)) ==
     79 				     HERODOTUS_STATUS_SUCCESS;) {
     80 					prop = get_case_property(tmp_cp);
     81 
     82 					if (prop != CASE_PROP_CASE_IGNORABLE &&
     83 					    prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
     84 						break;
     85 					}
     86 				}
     87 
     88 				/*
     89 				 * Now prop is something other than
     90 				 * case-ignorable or the source-string ended. If
     91 				 * it is something other than cased, we know
     92 				 * that the after-condition holds
     93 				 */
     94 				if (s != HERODOTUS_STATUS_SUCCESS ||
     95 				    (prop != CASE_PROP_CASED &&
     96 				     prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
     97 					/*
     98 					 * write GREEK SMALL LETTER FINAL SIGMA
     99 					 * to destination
    100 					 */
    101 					herodotus_write_codepoint(
    102 						w, UINT32_C(0x03C2));
    103 
    104 					/* reset Final_Sigma-state and continue
    105 					 */
    106 					final_sigma_level = 0;
    107 					continue;
    108 				}
    109 			}
    110 
    111 			/* update state */
    112 			prop = get_case_property(cp);
    113 			if ((final_sigma_level == 0 ||
    114 			     final_sigma_level == 1) &&
    115 			    (prop == CASE_PROP_CASED ||
    116 			     prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
    117 				/* sequence has begun */
    118 				final_sigma_level = 1;
    119 			} else if (
    120 				(final_sigma_level == 1 ||
    121 			         final_sigma_level == 2) &&
    122 				(prop == CASE_PROP_CASE_IGNORABLE ||
    123 			         prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
    124 				/* case-ignorable sequence begins or continued
    125 				 */
    126 				final_sigma_level = 2;
    127 			} else {
    128 				/* sequence broke */
    129 				final_sigma_level = 0;
    130 			}
    131 		}
    132 
    133 		/* get and handle case mapping */
    134 		if (unlikely((map = get_case_offset(cp, major, minor)) >=
    135 		             INT32_C(0x110000))) {
    136 			/* we have a special case and the offset in the sc-array
    137 			 * is the difference to 0x110000*/
    138 			off = (uint_least32_t)map - UINT32_C(0x110000);
    139 
    140 			for (i = 0; i < sc[off].cplen; i++) {
    141 				herodotus_write_codepoint(w, sc[off].cp[i]);
    142 			}
    143 		} else {
    144 			/* we have a simple mapping */
    145 			herodotus_write_codepoint(
    146 				w, (uint_least32_t)((int_least32_t)cp + map));
    147 		}
    148 	}
    149 
    150 	herodotus_writer_nul_terminate(w);
    151 
    152 	return herodotus_writer_number_written(w);
    153 }
    154 
    155 static size_t
    156 herodotus_next_word_break(const HERODOTUS_READER *r)
    157 {
    158 	HERODOTUS_READER tmp;
    159 
    160 	herodotus_reader_copy(r, &tmp);
    161 
    162 	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
    163 		return grapheme_next_word_break(tmp.src, tmp.srclen);
    164 	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
    165 		return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
    166 	}
    167 }
    168 
    169 static inline size_t
    170 to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
    171 {
    172 	enum case_property prop;
    173 	enum herodotus_status s;
    174 	uint_least32_t cp;
    175 	size_t nwb;
    176 
    177 	for (; (nwb = herodotus_next_word_break(r)) > 0;) {
    178 		herodotus_reader_push_advance_limit(r, nwb);
    179 		for (; (s = herodotus_read_codepoint(r, false, &cp)) ==
    180 		       HERODOTUS_STATUS_SUCCESS;) {
    181 			/* check if we have a cased character */
    182 			prop = get_case_property(cp);
    183 			if (prop == CASE_PROP_CASED ||
    184 			    prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
    185 				break;
    186 			} else {
    187 				/* write the data to the output verbatim, it if
    188 				 * permits */
    189 				herodotus_write_codepoint(w, cp);
    190 
    191 				/* increment reader */
    192 				herodotus_read_codepoint(r, true, &cp);
    193 			}
    194 		}
    195 
    196 		if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
    197 			/* we are done */
    198 			herodotus_reader_pop_limit(r);
    199 			break;
    200 		} else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
    201 			/*
    202 			 * we did not encounter any cased character
    203 			 * up to the word break
    204 			 */
    205 			herodotus_reader_pop_limit(r);
    206 			continue;
    207 		} else {
    208 			/*
    209 			 * we encountered a cased character before the word
    210 			 * break, convert it to titlecase
    211 			 */
    212 			herodotus_reader_push_advance_limit(
    213 				r, herodotus_reader_next_codepoint_break(r));
    214 			to_case(r, w, 0, title_major, title_minor,
    215 			        title_special);
    216 			herodotus_reader_pop_limit(r);
    217 		}
    218 
    219 		/* cast the rest of the codepoints in the word to lowercase */
    220 		to_case(r, w, 1, lower_major, lower_minor, lower_special);
    221 
    222 		/* remove the limit on the word before the next iteration */
    223 		herodotus_reader_pop_limit(r);
    224 	}
    225 
    226 	herodotus_writer_nul_terminate(w);
    227 
    228 	return herodotus_writer_number_written(w);
    229 }
    230 
    231 size_t
    232 grapheme_to_uppercase(const uint_least32_t *src, size_t srclen,
    233                       uint_least32_t *dest, size_t destlen)
    234 {
    235 	HERODOTUS_READER r;
    236 	HERODOTUS_WRITER w;
    237 
    238 	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
    239 	herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
    240 
    241 	return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
    242 }
    243 
    244 size_t
    245 grapheme_to_lowercase(const uint_least32_t *src, size_t srclen,
    246                       uint_least32_t *dest, size_t destlen)
    247 {
    248 	HERODOTUS_READER r;
    249 	HERODOTUS_WRITER w;
    250 
    251 	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
    252 	herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
    253 
    254 	return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
    255 }
    256 
    257 size_t
    258 grapheme_to_titlecase(const uint_least32_t *src, size_t srclen,
    259                       uint_least32_t *dest, size_t destlen)
    260 {
    261 	HERODOTUS_READER r;
    262 	HERODOTUS_WRITER w;
    263 
    264 	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
    265 	herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
    266 
    267 	return to_titlecase(&r, &w);
    268 }
    269 
    270 size_t
    271 grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest,
    272                            size_t destlen)
    273 {
    274 	HERODOTUS_READER r;
    275 	HERODOTUS_WRITER w;
    276 
    277 	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
    278 	herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
    279 
    280 	return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
    281 }
    282 
    283 size_t
    284 grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest,
    285                            size_t destlen)
    286 {
    287 	HERODOTUS_READER r;
    288 	HERODOTUS_WRITER w;
    289 
    290 	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
    291 	herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
    292 
    293 	return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
    294 }
    295 
    296 size_t
    297 grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest,
    298                            size_t destlen)
    299 {
    300 	HERODOTUS_READER r;
    301 	HERODOTUS_WRITER w;
    302 
    303 	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
    304 	herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
    305 
    306 	return to_titlecase(&r, &w);
    307 }
    308 
    309 static inline bool
    310 is_case(HERODOTUS_READER *r, const uint_least16_t *major,
    311         const int_least32_t *minor, const struct special_case *sc,
    312         size_t *output)
    313 {
    314 	size_t off, i;
    315 	bool ret = true;
    316 	uint_least32_t cp;
    317 	int_least32_t map;
    318 
    319 	for (; herodotus_read_codepoint(r, false, &cp) ==
    320 	       HERODOTUS_STATUS_SUCCESS;) {
    321 		/* get and handle case mapping */
    322 		if (unlikely((map = get_case_offset(cp, major, minor)) >=
    323 		             INT32_C(0x110000))) {
    324 			/* we have a special case and the offset in the sc-array
    325 			 * is the difference to 0x110000*/
    326 			off = (uint_least32_t)map - UINT32_C(0x110000);
    327 
    328 			for (i = 0; i < sc[off].cplen; i++) {
    329 				if (herodotus_read_codepoint(r, false, &cp) ==
    330 				    HERODOTUS_STATUS_SUCCESS) {
    331 					if (cp != sc[off].cp[i]) {
    332 						ret = false;
    333 						goto done;
    334 					} else {
    335 						/* move forward */
    336 						herodotus_read_codepoint(
    337 							r, true, &cp);
    338 					}
    339 				} else {
    340 					/*
    341 					 * input ended and we didn't see
    342 					 * any difference so far, so this
    343 					 * string is in fact okay
    344 					 */
    345 					ret = true;
    346 					goto done;
    347 				}
    348 			}
    349 		} else {
    350 			/* we have a simple mapping */
    351 			if (cp != (uint_least32_t)((int_least32_t)cp + map)) {
    352 				/* we have a difference */
    353 				ret = false;
    354 				goto done;
    355 			} else {
    356 				/* move forward */
    357 				herodotus_read_codepoint(r, true, &cp);
    358 			}
    359 		}
    360 	}
    361 done:
    362 	if (output) {
    363 		*output = herodotus_reader_number_read(r);
    364 	}
    365 	return ret;
    366 }
    367 
    368 static inline bool
    369 is_titlecase(HERODOTUS_READER *r, size_t *output)
    370 {
    371 	enum case_property prop;
    372 	enum herodotus_status s;
    373 	bool ret = true;
    374 	uint_least32_t cp;
    375 	size_t nwb;
    376 
    377 	for (; (nwb = herodotus_next_word_break(r)) > 0;) {
    378 		herodotus_reader_push_advance_limit(r, nwb);
    379 		for (; (s = herodotus_read_codepoint(r, false, &cp)) ==
    380 		       HERODOTUS_STATUS_SUCCESS;) {
    381 			/* check if we have a cased character */
    382 			prop = get_case_property(cp);
    383 			if (prop == CASE_PROP_CASED ||
    384 			    prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
    385 				break;
    386 			} else {
    387 				/* increment reader */
    388 				herodotus_read_codepoint(r, true, &cp);
    389 			}
    390 		}
    391 
    392 		if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
    393 			/* we are done */
    394 			break;
    395 		} else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
    396 			/*
    397 			 * we did not encounter any cased character
    398 			 * up to the word break
    399 			 */
    400 			herodotus_reader_pop_limit(r);
    401 			continue;
    402 		} else {
    403 			/*
    404 			 * we encountered a cased character before the word
    405 			 * break, check if it's titlecase
    406 			 */
    407 			herodotus_reader_push_advance_limit(
    408 				r, herodotus_reader_next_codepoint_break(r));
    409 			if (!is_case(r, title_major, title_minor, title_special,
    410 			             NULL)) {
    411 				ret = false;
    412 				goto done;
    413 			}
    414 			herodotus_reader_pop_limit(r);
    415 		}
    416 
    417 		/* check if the rest of the codepoints in the word are lowercase
    418 		 */
    419 		if (!is_case(r, lower_major, lower_minor, lower_special,
    420 		             NULL)) {
    421 			ret = false;
    422 			goto done;
    423 		}
    424 
    425 		/* remove the limit on the word before the next iteration */
    426 		herodotus_reader_pop_limit(r);
    427 	}
    428 done:
    429 	if (output) {
    430 		*output = herodotus_reader_number_read(r);
    431 	}
    432 	return ret;
    433 }
    434 
    435 bool
    436 grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
    437 {
    438 	HERODOTUS_READER r;
    439 
    440 	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
    441 
    442 	return is_case(&r, upper_major, upper_minor, upper_special, caselen);
    443 }
    444 
    445 bool
    446 grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
    447 {
    448 	HERODOTUS_READER r;
    449 
    450 	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
    451 
    452 	return is_case(&r, lower_major, lower_minor, lower_special, caselen);
    453 }
    454 
    455 bool
    456 grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *caselen)
    457 {
    458 	HERODOTUS_READER r;
    459 
    460 	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
    461 
    462 	return is_titlecase(&r, caselen);
    463 }
    464 
    465 bool
    466 grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *caselen)
    467 {
    468 	HERODOTUS_READER r;
    469 
    470 	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
    471 
    472 	return is_case(&r, upper_major, upper_minor, upper_special, caselen);
    473 }
    474 
    475 bool
    476 grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *caselen)
    477 {
    478 	HERODOTUS_READER r;
    479 
    480 	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
    481 
    482 	return is_case(&r, lower_major, lower_minor, lower_special, caselen);
    483 }
    484 
    485 bool
    486 grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *caselen)
    487 {
    488 	HERODOTUS_READER r;
    489 
    490 	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
    491 
    492 	return is_titlecase(&r, caselen);
    493 }