case.c (12993B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <stddef.h> 3 #include <stdint.h> 4 5 #include "../gen/case.h" 6 #include "../grapheme.h" 7 #include "util.h" 8 9 static inline enum case_property 10 get_case_property(uint_least32_t cp) 11 { 12 if (likely(cp <= UINT32_C(0x10FFFF))) { 13 return (enum case_property) 14 case_minor[case_major[cp >> 8] + (cp & 0xFF)]; 15 } else { 16 return CASE_PROP_OTHER; 17 } 18 } 19 20 static inline int_least32_t 21 get_case_offset(uint_least32_t cp, const uint_least16_t *major, 22 const int_least32_t *minor) 23 { 24 if (likely(cp <= UINT32_C(0x10FFFF))) { 25 /* 26 * this value might be larger than or equal to 0x110000 27 * for the special-case-mapping. This needs to be handled 28 * separately 29 */ 30 return minor[major[cp >> 8] + (cp & 0xFF)]; 31 } else { 32 return 0; 33 } 34 } 35 36 static inline size_t 37 to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w, 38 uint_least8_t final_sigma_level, const uint_least16_t *major, 39 const int_least32_t *minor, const struct special_case *sc) 40 { 41 HERODOTUS_READER tmp; 42 enum case_property prop; 43 enum herodotus_status s; 44 size_t off, i; 45 uint_least32_t cp, tmp_cp; 46 int_least32_t map; 47 48 for (; herodotus_read_codepoint(r, true, &cp) == 49 HERODOTUS_STATUS_SUCCESS;) { 50 if (sc == lower_special) { 51 /* 52 * For the special Final_Sigma-rule (see 53 * SpecialCasing.txt), which is the only non-localized 54 * case-dependent rule, we apply a different mapping 55 * when a sigma is at the end of a word. 56 * 57 * Before: cased case-ignorable* 58 * After: not(case-ignorable* cased) 59 * 60 * We check the after-condition on demand, but the 61 * before- condition is best checked using the 62 * "level"-heuristic also used in the sentence and line 63 * breaking-implementations. 64 */ 65 if (cp == UINT32_C(0x03A3) && /* GREEK CAPITAL LETTER 66 SIGMA */ 67 (final_sigma_level == 1 || 68 final_sigma_level == 2)) { 69 /* 70 * check succeeding characters by first skipping 71 * all case-ignorable characters and then 72 * checking if the succeeding character is 73 * cased, invalidating the after-condition 74 */ 75 herodotus_reader_copy(r, &tmp); 76 for (prop = NUM_CASE_PROPS; 77 (s = herodotus_read_codepoint(&tmp, true, 78 &tmp_cp)) == 79 HERODOTUS_STATUS_SUCCESS;) { 80 prop = get_case_property(tmp_cp); 81 82 if (prop != CASE_PROP_CASE_IGNORABLE && 83 prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE) { 84 break; 85 } 86 } 87 88 /* 89 * Now prop is something other than 90 * case-ignorable or the source-string ended. If 91 * it is something other than cased, we know 92 * that the after-condition holds 93 */ 94 if (s != HERODOTUS_STATUS_SUCCESS || 95 (prop != CASE_PROP_CASED && 96 prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) { 97 /* 98 * write GREEK SMALL LETTER FINAL SIGMA 99 * to destination 100 */ 101 herodotus_write_codepoint( 102 w, UINT32_C(0x03C2)); 103 104 /* reset Final_Sigma-state and continue 105 */ 106 final_sigma_level = 0; 107 continue; 108 } 109 } 110 111 /* update state */ 112 prop = get_case_property(cp); 113 if ((final_sigma_level == 0 || 114 final_sigma_level == 1) && 115 (prop == CASE_PROP_CASED || 116 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) { 117 /* sequence has begun */ 118 final_sigma_level = 1; 119 } else if ( 120 (final_sigma_level == 1 || 121 final_sigma_level == 2) && 122 (prop == CASE_PROP_CASE_IGNORABLE || 123 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) { 124 /* case-ignorable sequence begins or continued 125 */ 126 final_sigma_level = 2; 127 } else { 128 /* sequence broke */ 129 final_sigma_level = 0; 130 } 131 } 132 133 /* get and handle case mapping */ 134 if (unlikely((map = get_case_offset(cp, major, minor)) >= 135 INT32_C(0x110000))) { 136 /* we have a special case and the offset in the sc-array 137 * is the difference to 0x110000*/ 138 off = (uint_least32_t)map - UINT32_C(0x110000); 139 140 for (i = 0; i < sc[off].cplen; i++) { 141 herodotus_write_codepoint(w, sc[off].cp[i]); 142 } 143 } else { 144 /* we have a simple mapping */ 145 herodotus_write_codepoint( 146 w, (uint_least32_t)((int_least32_t)cp + map)); 147 } 148 } 149 150 herodotus_writer_nul_terminate(w); 151 152 return herodotus_writer_number_written(w); 153 } 154 155 static size_t 156 herodotus_next_word_break(const HERODOTUS_READER *r) 157 { 158 HERODOTUS_READER tmp; 159 160 herodotus_reader_copy(r, &tmp); 161 162 if (r->type == HERODOTUS_TYPE_CODEPOINT) { 163 return grapheme_next_word_break(tmp.src, tmp.srclen); 164 } else { /* r->type == HERODOTUS_TYPE_UTF8 */ 165 return grapheme_next_word_break_utf8(tmp.src, tmp.srclen); 166 } 167 } 168 169 static inline size_t 170 to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w) 171 { 172 enum case_property prop; 173 enum herodotus_status s; 174 uint_least32_t cp; 175 size_t nwb; 176 177 for (; (nwb = herodotus_next_word_break(r)) > 0;) { 178 herodotus_reader_push_advance_limit(r, nwb); 179 for (; (s = herodotus_read_codepoint(r, false, &cp)) == 180 HERODOTUS_STATUS_SUCCESS;) { 181 /* check if we have a cased character */ 182 prop = get_case_property(cp); 183 if (prop == CASE_PROP_CASED || 184 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) { 185 break; 186 } else { 187 /* write the data to the output verbatim, it if 188 * permits */ 189 herodotus_write_codepoint(w, cp); 190 191 /* increment reader */ 192 herodotus_read_codepoint(r, true, &cp); 193 } 194 } 195 196 if (s == HERODOTUS_STATUS_END_OF_BUFFER) { 197 /* we are done */ 198 herodotus_reader_pop_limit(r); 199 break; 200 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) { 201 /* 202 * we did not encounter any cased character 203 * up to the word break 204 */ 205 herodotus_reader_pop_limit(r); 206 continue; 207 } else { 208 /* 209 * we encountered a cased character before the word 210 * break, convert it to titlecase 211 */ 212 herodotus_reader_push_advance_limit( 213 r, herodotus_reader_next_codepoint_break(r)); 214 to_case(r, w, 0, title_major, title_minor, 215 title_special); 216 herodotus_reader_pop_limit(r); 217 } 218 219 /* cast the rest of the codepoints in the word to lowercase */ 220 to_case(r, w, 1, lower_major, lower_minor, lower_special); 221 222 /* remove the limit on the word before the next iteration */ 223 herodotus_reader_pop_limit(r); 224 } 225 226 herodotus_writer_nul_terminate(w); 227 228 return herodotus_writer_number_written(w); 229 } 230 231 size_t 232 grapheme_to_uppercase(const uint_least32_t *src, size_t srclen, 233 uint_least32_t *dest, size_t destlen) 234 { 235 HERODOTUS_READER r; 236 HERODOTUS_WRITER w; 237 238 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); 239 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen); 240 241 return to_case(&r, &w, 0, upper_major, upper_minor, upper_special); 242 } 243 244 size_t 245 grapheme_to_lowercase(const uint_least32_t *src, size_t srclen, 246 uint_least32_t *dest, size_t destlen) 247 { 248 HERODOTUS_READER r; 249 HERODOTUS_WRITER w; 250 251 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); 252 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen); 253 254 return to_case(&r, &w, 0, lower_major, lower_minor, lower_special); 255 } 256 257 size_t 258 grapheme_to_titlecase(const uint_least32_t *src, size_t srclen, 259 uint_least32_t *dest, size_t destlen) 260 { 261 HERODOTUS_READER r; 262 HERODOTUS_WRITER w; 263 264 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); 265 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen); 266 267 return to_titlecase(&r, &w); 268 } 269 270 size_t 271 grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest, 272 size_t destlen) 273 { 274 HERODOTUS_READER r; 275 HERODOTUS_WRITER w; 276 277 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); 278 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen); 279 280 return to_case(&r, &w, 0, upper_major, upper_minor, upper_special); 281 } 282 283 size_t 284 grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest, 285 size_t destlen) 286 { 287 HERODOTUS_READER r; 288 HERODOTUS_WRITER w; 289 290 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); 291 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen); 292 293 return to_case(&r, &w, 0, lower_major, lower_minor, lower_special); 294 } 295 296 size_t 297 grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest, 298 size_t destlen) 299 { 300 HERODOTUS_READER r; 301 HERODOTUS_WRITER w; 302 303 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); 304 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen); 305 306 return to_titlecase(&r, &w); 307 } 308 309 static inline bool 310 is_case(HERODOTUS_READER *r, const uint_least16_t *major, 311 const int_least32_t *minor, const struct special_case *sc, 312 size_t *output) 313 { 314 size_t off, i; 315 bool ret = true; 316 uint_least32_t cp; 317 int_least32_t map; 318 319 for (; herodotus_read_codepoint(r, false, &cp) == 320 HERODOTUS_STATUS_SUCCESS;) { 321 /* get and handle case mapping */ 322 if (unlikely((map = get_case_offset(cp, major, minor)) >= 323 INT32_C(0x110000))) { 324 /* we have a special case and the offset in the sc-array 325 * is the difference to 0x110000*/ 326 off = (uint_least32_t)map - UINT32_C(0x110000); 327 328 for (i = 0; i < sc[off].cplen; i++) { 329 if (herodotus_read_codepoint(r, false, &cp) == 330 HERODOTUS_STATUS_SUCCESS) { 331 if (cp != sc[off].cp[i]) { 332 ret = false; 333 goto done; 334 } else { 335 /* move forward */ 336 herodotus_read_codepoint( 337 r, true, &cp); 338 } 339 } else { 340 /* 341 * input ended and we didn't see 342 * any difference so far, so this 343 * string is in fact okay 344 */ 345 ret = true; 346 goto done; 347 } 348 } 349 } else { 350 /* we have a simple mapping */ 351 if (cp != (uint_least32_t)((int_least32_t)cp + map)) { 352 /* we have a difference */ 353 ret = false; 354 goto done; 355 } else { 356 /* move forward */ 357 herodotus_read_codepoint(r, true, &cp); 358 } 359 } 360 } 361 done: 362 if (output) { 363 *output = herodotus_reader_number_read(r); 364 } 365 return ret; 366 } 367 368 static inline bool 369 is_titlecase(HERODOTUS_READER *r, size_t *output) 370 { 371 enum case_property prop; 372 enum herodotus_status s; 373 bool ret = true; 374 uint_least32_t cp; 375 size_t nwb; 376 377 for (; (nwb = herodotus_next_word_break(r)) > 0;) { 378 herodotus_reader_push_advance_limit(r, nwb); 379 for (; (s = herodotus_read_codepoint(r, false, &cp)) == 380 HERODOTUS_STATUS_SUCCESS;) { 381 /* check if we have a cased character */ 382 prop = get_case_property(cp); 383 if (prop == CASE_PROP_CASED || 384 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) { 385 break; 386 } else { 387 /* increment reader */ 388 herodotus_read_codepoint(r, true, &cp); 389 } 390 } 391 392 if (s == HERODOTUS_STATUS_END_OF_BUFFER) { 393 /* we are done */ 394 break; 395 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) { 396 /* 397 * we did not encounter any cased character 398 * up to the word break 399 */ 400 herodotus_reader_pop_limit(r); 401 continue; 402 } else { 403 /* 404 * we encountered a cased character before the word 405 * break, check if it's titlecase 406 */ 407 herodotus_reader_push_advance_limit( 408 r, herodotus_reader_next_codepoint_break(r)); 409 if (!is_case(r, title_major, title_minor, title_special, 410 NULL)) { 411 ret = false; 412 goto done; 413 } 414 herodotus_reader_pop_limit(r); 415 } 416 417 /* check if the rest of the codepoints in the word are lowercase 418 */ 419 if (!is_case(r, lower_major, lower_minor, lower_special, 420 NULL)) { 421 ret = false; 422 goto done; 423 } 424 425 /* remove the limit on the word before the next iteration */ 426 herodotus_reader_pop_limit(r); 427 } 428 done: 429 if (output) { 430 *output = herodotus_reader_number_read(r); 431 } 432 return ret; 433 } 434 435 bool 436 grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *caselen) 437 { 438 HERODOTUS_READER r; 439 440 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); 441 442 return is_case(&r, upper_major, upper_minor, upper_special, caselen); 443 } 444 445 bool 446 grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *caselen) 447 { 448 HERODOTUS_READER r; 449 450 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); 451 452 return is_case(&r, lower_major, lower_minor, lower_special, caselen); 453 } 454 455 bool 456 grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *caselen) 457 { 458 HERODOTUS_READER r; 459 460 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); 461 462 return is_titlecase(&r, caselen); 463 } 464 465 bool 466 grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *caselen) 467 { 468 HERODOTUS_READER r; 469 470 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); 471 472 return is_case(&r, upper_major, upper_minor, upper_special, caselen); 473 } 474 475 bool 476 grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *caselen) 477 { 478 HERODOTUS_READER r; 479 480 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); 481 482 return is_case(&r, lower_major, lower_minor, lower_special, caselen); 483 } 484 485 bool 486 grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *caselen) 487 { 488 HERODOTUS_READER r; 489 490 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); 491 492 return is_titlecase(&r, caselen); 493 }