util.c (11480B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <limits.h> 3 #include <stdbool.h> 4 #include <stddef.h> 5 #include <stdint.h> 6 7 #include "../gen/types.h" 8 #include "../grapheme.h" 9 #include "util.h" 10 11 void 12 herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type, 13 const void *src, size_t srclen) 14 { 15 size_t i; 16 17 r->type = type; 18 r->src = src; 19 r->srclen = srclen; 20 r->off = 0; 21 r->terminated_by_null = false; 22 23 for (i = 0; i < LEN(r->soft_limit); i++) { 24 r->soft_limit[i] = SIZE_MAX; 25 } 26 } 27 28 void 29 herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest) 30 { 31 size_t i; 32 33 /* 34 * we copy such that we have a "fresh" start and build on the 35 * fact that src->soft_limit[i] for any i and src->srclen are 36 * always larger or equal to src->off 37 */ 38 dest->type = src->type; 39 if (src->type == HERODOTUS_TYPE_CODEPOINT) { 40 dest->src = 41 (src->src == NULL) ? 42 NULL : 43 ((const uint_least32_t *)(src->src)) + src->off; 44 } else { /* src->type == HERODOTUS_TYPE_UTF8 */ 45 dest->src = (src->src == NULL) ? 46 NULL : 47 ((const char *)(src->src)) + src->off; 48 } 49 if (src->srclen == SIZE_MAX) { 50 dest->srclen = SIZE_MAX; 51 } else { 52 dest->srclen = 53 (src->off < src->srclen) ? src->srclen - src->off : 0; 54 } 55 dest->off = 0; 56 dest->terminated_by_null = src->terminated_by_null; 57 58 for (i = 0; i < LEN(src->soft_limit); i++) { 59 if (src->soft_limit[i] == SIZE_MAX) { 60 dest->soft_limit[i] = SIZE_MAX; 61 } else { 62 /* 63 * if we have a degenerate case where the offset is 64 * higher than the soft-limit, we simply clamp the 65 * soft-limit to zero given we can't decide here 66 * to release the limit and, instead, we just 67 * prevent any more reads 68 */ 69 dest->soft_limit[i] = 70 (src->off < src->soft_limit[i]) ? 71 src->soft_limit[i] - src->off : 72 0; 73 } 74 } 75 } 76 77 void 78 herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count) 79 { 80 size_t i; 81 82 for (i = LEN(r->soft_limit) - 1; i >= 1; i--) { 83 r->soft_limit[i] = r->soft_limit[i - 1]; 84 } 85 r->soft_limit[0] = r->off + count; 86 } 87 88 void 89 herodotus_reader_pop_limit(HERODOTUS_READER *r) 90 { 91 size_t i; 92 93 for (i = 0; i < LEN(r->soft_limit) - 1; i++) { 94 r->soft_limit[i] = r->soft_limit[i + 1]; 95 } 96 r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX; 97 } 98 99 size_t 100 herodotus_reader_next_word_break(const HERODOTUS_READER *r) 101 { 102 if (r->type == HERODOTUS_TYPE_CODEPOINT) { 103 return grapheme_next_word_break( 104 (const uint_least32_t *)(r->src) + r->off, 105 MIN(r->srclen, r->soft_limit[0]) - r->off); 106 } else { /* r->type == HERODOTUS_TYPE_UTF8 */ 107 return grapheme_next_word_break_utf8( 108 (const char *)(r->src) + r->off, 109 MIN(r->srclen, r->soft_limit[0]) - r->off); 110 } 111 } 112 113 size_t 114 herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r) 115 { 116 if (r->type == HERODOTUS_TYPE_CODEPOINT) { 117 return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 : 0; 118 } else { /* r->type == HERODOTUS_TYPE_UTF8 */ 119 return grapheme_decode_utf8( 120 (const char *)(r->src) + r->off, 121 MIN(r->srclen, r->soft_limit[0]) - r->off, NULL); 122 } 123 } 124 125 size_t 126 herodotus_reader_number_read(const HERODOTUS_READER *r) 127 { 128 return r->off; 129 } 130 131 enum herodotus_status 132 herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp) 133 { 134 size_t ret; 135 136 if (r->terminated_by_null || r->off >= r->srclen || r->src == NULL) { 137 *cp = GRAPHEME_INVALID_CODEPOINT; 138 return HERODOTUS_STATUS_END_OF_BUFFER; 139 } 140 141 if (r->off >= r->soft_limit[0]) { 142 *cp = GRAPHEME_INVALID_CODEPOINT; 143 return HERODOTUS_STATUS_SOFT_LIMIT_REACHED; 144 } 145 146 if (r->type == HERODOTUS_TYPE_CODEPOINT) { 147 *cp = ((const uint_least32_t *)(r->src))[r->off]; 148 ret = 1; 149 } else { /* r->type == HERODOTUS_TYPE_UTF8 */ 150 ret = grapheme_decode_utf8( 151 (const char *)r->src + r->off, 152 MIN(r->srclen, r->soft_limit[0]) - r->off, cp); 153 } 154 155 if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) { 156 /* 157 * We encountered a null-codepoint. Don't increment 158 * offset and return as if the buffer had ended here all 159 * along 160 */ 161 r->terminated_by_null = true; 162 return HERODOTUS_STATUS_END_OF_BUFFER; 163 } 164 165 if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) { 166 /* 167 * we want more than we have; instead of returning 168 * garbage we terminate here. 169 */ 170 return HERODOTUS_STATUS_END_OF_BUFFER; 171 } 172 173 /* 174 * Increase offset which we now know won't surpass the limits, 175 * unless we got told otherwise 176 */ 177 if (advance) { 178 r->off += ret; 179 } 180 181 return HERODOTUS_STATUS_SUCCESS; 182 } 183 184 void 185 herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type, void *dest, 186 size_t destlen) 187 { 188 w->type = type; 189 w->dest = dest; 190 w->destlen = destlen; 191 w->off = 0; 192 w->first_unwritable_offset = SIZE_MAX; 193 } 194 195 void 196 herodotus_writer_nul_terminate(HERODOTUS_WRITER *w) 197 { 198 if (w->dest == NULL) { 199 return; 200 } 201 202 if (w->off < w->destlen) { 203 /* We still have space in the buffer. Simply use it */ 204 if (w->type == HERODOTUS_TYPE_CODEPOINT) { 205 ((uint_least32_t *)(w->dest))[w->off] = 0; 206 } else { /* w->type == HERODOTUS_TYPE_UTF8 */ 207 ((char *)(w->dest))[w->off] = '\0'; 208 } 209 } else if (w->first_unwritable_offset < w->destlen) { 210 /* 211 * There is no more space in the buffer. However, 212 * we have noted down the first offset we couldn't 213 * use to write into the buffer and it's smaller than 214 * destlen. Thus we bailed writing into the 215 * destination when a multibyte-codepoint couldn't be 216 * written. So the last "real" byte might be at 217 * destlen-4, destlen-3, destlen-2 or destlen-1 218 * (the last case meaning truncation). 219 */ 220 if (w->type == HERODOTUS_TYPE_CODEPOINT) { 221 ((uint_least32_t 222 *)(w->dest))[w->first_unwritable_offset] = 0; 223 } else { /* w->type == HERODOTUS_TYPE_UTF8 */ 224 ((char *)(w->dest))[w->first_unwritable_offset] = '\0'; 225 } 226 } else if (w->destlen > 0) { 227 /* 228 * In this case, there is no more space in the buffer and 229 * the last unwritable offset is larger than 230 * or equal to the destination buffer length. This means 231 * that we are forced to simply write into the last 232 * byte. 233 */ 234 if (w->type == HERODOTUS_TYPE_CODEPOINT) { 235 ((uint_least32_t *)(w->dest))[w->destlen - 1] = 0; 236 } else { /* w->type == HERODOTUS_TYPE_UTF8 */ 237 ((char *)(w->dest))[w->destlen - 1] = '\0'; 238 } 239 } 240 241 /* w->off is not incremented in any case */ 242 } 243 244 size_t 245 herodotus_writer_number_written(const HERODOTUS_WRITER *w) 246 { 247 return w->off; 248 } 249 250 void 251 herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp) 252 { 253 size_t ret; 254 255 /* 256 * This function will always faithfully say how many codepoints 257 * were written, even if the buffer ends. This is used to enable 258 * truncation detection. 259 */ 260 if (w->type == HERODOTUS_TYPE_CODEPOINT) { 261 if (w->dest != NULL && w->off < w->destlen) { 262 ((uint_least32_t *)(w->dest))[w->off] = cp; 263 } 264 265 w->off += 1; 266 } else { /* w->type == HERODOTUS_TYPE_UTF8 */ 267 /* 268 * First determine how many bytes we need to encode the 269 * codepoint 270 */ 271 ret = grapheme_encode_utf8(cp, NULL, 0); 272 273 if (w->dest != NULL && w->off + ret < w->destlen) { 274 /* we still have enough room in the buffer */ 275 grapheme_encode_utf8(cp, (char *)(w->dest) + w->off, 276 w->destlen - w->off); 277 } else if (w->first_unwritable_offset == SIZE_MAX) { 278 /* 279 * the first unwritable offset has not been 280 * noted down, so this is the first time we can't 281 * write (completely) to an offset 282 */ 283 w->first_unwritable_offset = w->off; 284 } 285 286 w->off += ret; 287 } 288 } 289 290 void 291 proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop, 292 uint_least8_t (*get_break_prop)(uint_least32_t), 293 bool (*is_skippable_prop)(uint_least8_t), 294 void (*skip_shift_callback)(uint_least8_t, void *), 295 struct proper *p) 296 { 297 uint_least8_t prop; 298 uint_least32_t cp; 299 size_t i; 300 301 /* set internal variables */ 302 p->state = state; 303 p->no_prop = no_prop; 304 p->get_break_prop = get_break_prop; 305 p->is_skippable_prop = is_skippable_prop; 306 p->skip_shift_callback = skip_shift_callback; 307 308 /* 309 * Initialize mid-reader, which is basically just there 310 * to reflect the current position of the viewing-line 311 */ 312 herodotus_reader_copy(r, &(p->mid_reader)); 313 314 /* 315 * In the initialization, we simply (try to) fill in next_prop. 316 * If we cannot read in more (due to the buffer ending), we 317 * fill in the prop as invalid 318 */ 319 320 /* 321 * initialize the previous properties to have no property 322 * (given we are at the start of the buffer) 323 */ 324 p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop; 325 p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop; 326 327 /* 328 * initialize the next properties 329 */ 330 331 /* initialize the raw reader */ 332 herodotus_reader_copy(r, &(p->raw_reader)); 333 334 /* fill in the two next raw properties (after no-initialization) */ 335 p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop; 336 for (i = 0; 337 i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) == 338 HERODOTUS_STATUS_SUCCESS;) { 339 p->raw.next_prop[i++] = p->get_break_prop(cp); 340 } 341 342 /* initialize the skip reader */ 343 herodotus_reader_copy(r, &(p->skip_reader)); 344 345 /* fill in the two next skip properties (after no-initialization) */ 346 p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop; 347 for (i = 0; 348 i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) == 349 HERODOTUS_STATUS_SUCCESS;) { 350 prop = p->get_break_prop(cp); 351 if (!p->is_skippable_prop(prop)) { 352 p->skip.next_prop[i++] = prop; 353 } 354 } 355 } 356 357 int 358 proper_advance(struct proper *p) 359 { 360 uint_least8_t prop; 361 uint_least32_t cp; 362 363 /* read in next "raw" property */ 364 if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) == 365 HERODOTUS_STATUS_SUCCESS) { 366 prop = p->get_break_prop(cp); 367 } else { 368 prop = p->no_prop; 369 } 370 371 /* 372 * do a shift-in, unless we find that the property that is to 373 * be moved past the "raw-viewing-line" (this property is stored 374 * in p->raw.next_prop[0]) is a no_prop, indicating that 375 * we are at the end of the buffer. 376 */ 377 if (p->raw.next_prop[0] == p->no_prop) { 378 return 1; 379 } 380 381 /* shift in the properties */ 382 p->raw.prev_prop[1] = p->raw.prev_prop[0]; 383 p->raw.prev_prop[0] = p->raw.next_prop[0]; 384 p->raw.next_prop[0] = p->raw.next_prop[1]; 385 p->raw.next_prop[1] = prop; 386 387 /* advance the middle reader viewing-line */ 388 (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp); 389 390 /* check skippability-property */ 391 if (!p->is_skippable_prop(p->raw.prev_prop[0])) { 392 /* 393 * the property that has moved past the "raw-viewing-line" 394 * (this property is now (after the raw-shift) stored in 395 * p->raw.prev_prop[0] and guaranteed not to be a no-prop, 396 * guaranteeing that we won't shift a no-prop past the 397 * "viewing-line" in the skip-properties) is not a skippable 398 * property, thus we need to shift the skip property as well. 399 */ 400 p->skip.prev_prop[1] = p->skip.prev_prop[0]; 401 p->skip.prev_prop[0] = p->skip.next_prop[0]; 402 p->skip.next_prop[0] = p->skip.next_prop[1]; 403 404 /* 405 * call the skip-shift-callback on the property that 406 * passed the skip-viewing-line (this property is now 407 * stored in p->skip.prev_prop[0]). 408 */ 409 p->skip_shift_callback(p->skip.prev_prop[0], p->state); 410 411 /* determine the next shift property */ 412 p->skip.next_prop[1] = p->no_prop; 413 while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) == 414 HERODOTUS_STATUS_SUCCESS) { 415 prop = p->get_break_prop(cp); 416 if (!p->is_skippable_prop(prop)) { 417 p->skip.next_prop[1] = prop; 418 break; 419 } 420 } 421 } 422 423 return 0; 424 }