util.c - libgrapheme - unicode string library

util.c (11480B)
      1 /* See LICENSE file for copyright and license details. */
      2 #include <limits.h>
      3 #include <stdbool.h>
      4 #include <stddef.h>
      5 #include <stdint.h>
      6 
      7 #include "../gen/types.h"
      8 #include "../grapheme.h"
      9 #include "util.h"
     10 
     11 void
     12 herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type,
     13                       const void *src, size_t srclen)
     14 {
     15 	size_t i;
     16 
     17 	r->type = type;
     18 	r->src = src;
     19 	r->srclen = srclen;
     20 	r->off = 0;
     21 	r->terminated_by_null = false;
     22 
     23 	for (i = 0; i < LEN(r->soft_limit); i++) {
     24 		r->soft_limit[i] = SIZE_MAX;
     25 	}
     26 }
     27 
     28 void
     29 herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest)
     30 {
     31 	size_t i;
     32 
     33 	/*
     34 	 * we copy such that we have a "fresh" start and build on the
     35 	 * fact that src->soft_limit[i] for any i and src->srclen are
     36 	 * always larger or equal to src->off
     37 	 */
     38 	dest->type = src->type;
     39 	if (src->type == HERODOTUS_TYPE_CODEPOINT) {
     40 		dest->src =
     41 			(src->src == NULL) ?
     42 				NULL :
     43 				((const uint_least32_t *)(src->src)) + src->off;
     44 	} else { /* src->type == HERODOTUS_TYPE_UTF8 */
     45 		dest->src = (src->src == NULL) ?
     46 		                    NULL :
     47 		                    ((const char *)(src->src)) + src->off;
     48 	}
     49 	if (src->srclen == SIZE_MAX) {
     50 		dest->srclen = SIZE_MAX;
     51 	} else {
     52 		dest->srclen =
     53 			(src->off < src->srclen) ? src->srclen - src->off : 0;
     54 	}
     55 	dest->off = 0;
     56 	dest->terminated_by_null = src->terminated_by_null;
     57 
     58 	for (i = 0; i < LEN(src->soft_limit); i++) {
     59 		if (src->soft_limit[i] == SIZE_MAX) {
     60 			dest->soft_limit[i] = SIZE_MAX;
     61 		} else {
     62 			/*
     63 			 * if we have a degenerate case where the offset is
     64 			 * higher than the soft-limit, we simply clamp the
     65 			 * soft-limit to zero given we can't decide here
     66 			 * to release the limit and, instead, we just
     67 			 * prevent any more reads
     68 			 */
     69 			dest->soft_limit[i] =
     70 				(src->off < src->soft_limit[i]) ?
     71 					src->soft_limit[i] - src->off :
     72 					0;
     73 		}
     74 	}
     75 }
     76 
     77 void
     78 herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count)
     79 {
     80 	size_t i;
     81 
     82 	for (i = LEN(r->soft_limit) - 1; i >= 1; i--) {
     83 		r->soft_limit[i] = r->soft_limit[i - 1];
     84 	}
     85 	r->soft_limit[0] = r->off + count;
     86 }
     87 
     88 void
     89 herodotus_reader_pop_limit(HERODOTUS_READER *r)
     90 {
     91 	size_t i;
     92 
     93 	for (i = 0; i < LEN(r->soft_limit) - 1; i++) {
     94 		r->soft_limit[i] = r->soft_limit[i + 1];
     95 	}
     96 	r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX;
     97 }
     98 
     99 size_t
    100 herodotus_reader_next_word_break(const HERODOTUS_READER *r)
    101 {
    102 	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
    103 		return grapheme_next_word_break(
    104 			(const uint_least32_t *)(r->src) + r->off,
    105 			MIN(r->srclen, r->soft_limit[0]) - r->off);
    106 	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
    107 		return grapheme_next_word_break_utf8(
    108 			(const char *)(r->src) + r->off,
    109 			MIN(r->srclen, r->soft_limit[0]) - r->off);
    110 	}
    111 }
    112 
    113 size_t
    114 herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r)
    115 {
    116 	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
    117 		return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 : 0;
    118 	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
    119 		return grapheme_decode_utf8(
    120 			(const char *)(r->src) + r->off,
    121 			MIN(r->srclen, r->soft_limit[0]) - r->off, NULL);
    122 	}
    123 }
    124 
    125 size_t
    126 herodotus_reader_number_read(const HERODOTUS_READER *r)
    127 {
    128 	return r->off;
    129 }
    130 
    131 enum herodotus_status
    132 herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp)
    133 {
    134 	size_t ret;
    135 
    136 	if (r->terminated_by_null || r->off >= r->srclen || r->src == NULL) {
    137 		*cp = GRAPHEME_INVALID_CODEPOINT;
    138 		return HERODOTUS_STATUS_END_OF_BUFFER;
    139 	}
    140 
    141 	if (r->off >= r->soft_limit[0]) {
    142 		*cp = GRAPHEME_INVALID_CODEPOINT;
    143 		return HERODOTUS_STATUS_SOFT_LIMIT_REACHED;
    144 	}
    145 
    146 	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
    147 		*cp = ((const uint_least32_t *)(r->src))[r->off];
    148 		ret = 1;
    149 	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
    150 		ret = grapheme_decode_utf8(
    151 			(const char *)r->src + r->off,
    152 			MIN(r->srclen, r->soft_limit[0]) - r->off, cp);
    153 	}
    154 
    155 	if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) {
    156 		/*
    157 		 * We encountered a null-codepoint. Don't increment
    158 		 * offset and return as if the buffer had ended here all
    159 		 * along
    160 		 */
    161 		r->terminated_by_null = true;
    162 		return HERODOTUS_STATUS_END_OF_BUFFER;
    163 	}
    164 
    165 	if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) {
    166 		/*
    167 		 * we want more than we have; instead of returning
    168 		 * garbage we terminate here.
    169 		 */
    170 		return HERODOTUS_STATUS_END_OF_BUFFER;
    171 	}
    172 
    173 	/*
    174 	 * Increase offset which we now know won't surpass the limits,
    175 	 * unless we got told otherwise
    176 	 */
    177 	if (advance) {
    178 		r->off += ret;
    179 	}
    180 
    181 	return HERODOTUS_STATUS_SUCCESS;
    182 }
    183 
    184 void
    185 herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type, void *dest,
    186                       size_t destlen)
    187 {
    188 	w->type = type;
    189 	w->dest = dest;
    190 	w->destlen = destlen;
    191 	w->off = 0;
    192 	w->first_unwritable_offset = SIZE_MAX;
    193 }
    194 
    195 void
    196 herodotus_writer_nul_terminate(HERODOTUS_WRITER *w)
    197 {
    198 	if (w->dest == NULL) {
    199 		return;
    200 	}
    201 
    202 	if (w->off < w->destlen) {
    203 		/* We still have space in the buffer. Simply use it */
    204 		if (w->type == HERODOTUS_TYPE_CODEPOINT) {
    205 			((uint_least32_t *)(w->dest))[w->off] = 0;
    206 		} else { /* w->type == HERODOTUS_TYPE_UTF8 */
    207 			((char *)(w->dest))[w->off] = '\0';
    208 		}
    209 	} else if (w->first_unwritable_offset < w->destlen) {
    210 		/*
    211 		 * There is no more space in the buffer. However,
    212 		 * we have noted down the first offset we couldn't
    213 		 * use to write into the buffer and it's smaller than
    214 		 * destlen. Thus we bailed writing into the
    215 		 * destination when a multibyte-codepoint couldn't be
    216 		 * written. So the last "real" byte might be at
    217 		 * destlen-4, destlen-3, destlen-2 or destlen-1
    218 		 * (the last case meaning truncation).
    219 		 */
    220 		if (w->type == HERODOTUS_TYPE_CODEPOINT) {
    221 			((uint_least32_t
    222 			          *)(w->dest))[w->first_unwritable_offset] = 0;
    223 		} else { /* w->type == HERODOTUS_TYPE_UTF8 */
    224 			((char *)(w->dest))[w->first_unwritable_offset] = '\0';
    225 		}
    226 	} else if (w->destlen > 0) {
    227 		/*
    228 		 * In this case, there is no more space in the buffer and
    229 		 * the last unwritable offset is larger than
    230 		 * or equal to the destination buffer length. This means
    231 		 * that we are forced to simply write into the last
    232 		 * byte.
    233 		 */
    234 		if (w->type == HERODOTUS_TYPE_CODEPOINT) {
    235 			((uint_least32_t *)(w->dest))[w->destlen - 1] = 0;
    236 		} else { /* w->type == HERODOTUS_TYPE_UTF8 */
    237 			((char *)(w->dest))[w->destlen - 1] = '\0';
    238 		}
    239 	}
    240 
    241 	/* w->off is not incremented in any case */
    242 }
    243 
    244 size_t
    245 herodotus_writer_number_written(const HERODOTUS_WRITER *w)
    246 {
    247 	return w->off;
    248 }
    249 
    250 void
    251 herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp)
    252 {
    253 	size_t ret;
    254 
    255 	/*
    256 	 * This function will always faithfully say how many codepoints
    257 	 * were written, even if the buffer ends. This is used to enable
    258 	 * truncation detection.
    259 	 */
    260 	if (w->type == HERODOTUS_TYPE_CODEPOINT) {
    261 		if (w->dest != NULL && w->off < w->destlen) {
    262 			((uint_least32_t *)(w->dest))[w->off] = cp;
    263 		}
    264 
    265 		w->off += 1;
    266 	} else { /* w->type == HERODOTUS_TYPE_UTF8 */
    267 		/*
    268 		 * First determine how many bytes we need to encode the
    269 		 * codepoint
    270 		 */
    271 		ret = grapheme_encode_utf8(cp, NULL, 0);
    272 
    273 		if (w->dest != NULL && w->off + ret < w->destlen) {
    274 			/* we still have enough room in the buffer */
    275 			grapheme_encode_utf8(cp, (char *)(w->dest) + w->off,
    276 			                     w->destlen - w->off);
    277 		} else if (w->first_unwritable_offset == SIZE_MAX) {
    278 			/*
    279 			 * the first unwritable offset has not been
    280 			 * noted down, so this is the first time we can't
    281 			 * write (completely) to an offset
    282 			 */
    283 			w->first_unwritable_offset = w->off;
    284 		}
    285 
    286 		w->off += ret;
    287 	}
    288 }
    289 
    290 void
    291 proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
    292             uint_least8_t (*get_break_prop)(uint_least32_t),
    293             bool (*is_skippable_prop)(uint_least8_t),
    294             void (*skip_shift_callback)(uint_least8_t, void *),
    295             struct proper *p)
    296 {
    297 	uint_least8_t prop;
    298 	uint_least32_t cp;
    299 	size_t i;
    300 
    301 	/* set internal variables */
    302 	p->state = state;
    303 	p->no_prop = no_prop;
    304 	p->get_break_prop = get_break_prop;
    305 	p->is_skippable_prop = is_skippable_prop;
    306 	p->skip_shift_callback = skip_shift_callback;
    307 
    308 	/*
    309 	 * Initialize mid-reader, which is basically just there
    310 	 * to reflect the current position of the viewing-line
    311 	 */
    312 	herodotus_reader_copy(r, &(p->mid_reader));
    313 
    314 	/*
    315 	 * In the initialization, we simply (try to) fill in next_prop.
    316 	 * If we cannot read in more (due to the buffer ending), we
    317 	 * fill in the prop as invalid
    318 	 */
    319 
    320 	/*
    321 	 * initialize the previous properties to have no property
    322 	 * (given we are at the start of the buffer)
    323 	 */
    324 	p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
    325 	p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
    326 
    327 	/*
    328 	 * initialize the next properties
    329 	 */
    330 
    331 	/* initialize the raw reader */
    332 	herodotus_reader_copy(r, &(p->raw_reader));
    333 
    334 	/* fill in the two next raw properties (after no-initialization) */
    335 	p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
    336 	for (i = 0;
    337 	     i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
    338 	                      HERODOTUS_STATUS_SUCCESS;) {
    339 		p->raw.next_prop[i++] = p->get_break_prop(cp);
    340 	}
    341 
    342 	/* initialize the skip reader */
    343 	herodotus_reader_copy(r, &(p->skip_reader));
    344 
    345 	/* fill in the two next skip properties (after no-initialization) */
    346 	p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
    347 	for (i = 0;
    348 	     i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
    349 	                      HERODOTUS_STATUS_SUCCESS;) {
    350 		prop = p->get_break_prop(cp);
    351 		if (!p->is_skippable_prop(prop)) {
    352 			p->skip.next_prop[i++] = prop;
    353 		}
    354 	}
    355 }
    356 
    357 int
    358 proper_advance(struct proper *p)
    359 {
    360 	uint_least8_t prop;
    361 	uint_least32_t cp;
    362 
    363 	/* read in next "raw" property */
    364 	if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
    365 	    HERODOTUS_STATUS_SUCCESS) {
    366 		prop = p->get_break_prop(cp);
    367 	} else {
    368 		prop = p->no_prop;
    369 	}
    370 
    371 	/*
    372 	 * do a shift-in, unless we find that the property that is to
    373 	 * be moved past the "raw-viewing-line" (this property is stored
    374 	 * in p->raw.next_prop[0]) is a no_prop, indicating that
    375 	 * we are at the end of the buffer.
    376 	 */
    377 	if (p->raw.next_prop[0] == p->no_prop) {
    378 		return 1;
    379 	}
    380 
    381 	/* shift in the properties */
    382 	p->raw.prev_prop[1] = p->raw.prev_prop[0];
    383 	p->raw.prev_prop[0] = p->raw.next_prop[0];
    384 	p->raw.next_prop[0] = p->raw.next_prop[1];
    385 	p->raw.next_prop[1] = prop;
    386 
    387 	/* advance the middle reader viewing-line */
    388 	(void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
    389 
    390 	/* check skippability-property */
    391 	if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
    392 		/*
    393 		 * the property that has moved past the "raw-viewing-line"
    394 		 * (this property is now (after the raw-shift) stored in
    395 		 * p->raw.prev_prop[0] and guaranteed not to be a no-prop,
    396 		 * guaranteeing that we won't shift a no-prop past the
    397 		 * "viewing-line" in the skip-properties) is not a skippable
    398 		 * property, thus we need to shift the skip property as well.
    399 		 */
    400 		p->skip.prev_prop[1] = p->skip.prev_prop[0];
    401 		p->skip.prev_prop[0] = p->skip.next_prop[0];
    402 		p->skip.next_prop[0] = p->skip.next_prop[1];
    403 
    404 		/*
    405 		 * call the skip-shift-callback on the property that
    406 		 * passed the skip-viewing-line (this property is now
    407 		 * stored in p->skip.prev_prop[0]).
    408 		 */
    409 		p->skip_shift_callback(p->skip.prev_prop[0], p->state);
    410 
    411 		/* determine the next shift property */
    412 		p->skip.next_prop[1] = p->no_prop;
    413 		while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
    414 		       HERODOTUS_STATUS_SUCCESS) {
    415 			prop = p->get_break_prop(cp);
    416 			if (!p->is_skippable_prop(prop)) {
    417 				p->skip.next_prop[1] = prop;
    418 				break;
    419 			}
    420 		}
    421 	}
    422 
    423 	return 0;
    424 }
	libgrapheme unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log \| Files \| Refs \| README \| LICENSE