libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

utf8.c (6079B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <stdio.h>
      3 
      4 #include "../grapheme.h"
      5 #include "util.h"
      6 
      7 #define BETWEEN(c, l, u) (c >= l && c <= u)
      8 
      9 /* lookup-table for the types of sequence first bytes */
     10 static const struct {
     11 	uint_least8_t  lower; /* lower bound of sequence first byte */
     12 	uint_least8_t  upper; /* upper bound of sequence first byte */
     13 	uint_least32_t mincp; /* smallest non-overlong encoded codepoint */
     14 	uint_least32_t maxcp; /* largest encodable codepoint */
     15 	/*
     16 	 * implicit: table-offset represents the number of following
     17 	 * bytes of the form 10xxxxxx (6 bits capacity each)
     18 	 */
     19 } lut[] = {
     20 	[0] = {
     21 		/* 0xxxxxxx */
     22 		.lower = 0x00, /* 00000000 */
     23 		.upper = 0x7F, /* 01111111 */
     24 		.mincp = (uint_least32_t)0,
     25 		.maxcp = ((uint_least32_t)1 << 7) - 1, /* 7 bits capacity */
     26 	},
     27 	[1] = {
     28 		/* 110xxxxx */
     29 		.lower = 0xC0, /* 11000000 */
     30 		.upper = 0xDF, /* 11011111 */
     31 		.mincp = (uint_least32_t)1 << 7,
     32 		.maxcp = ((uint_least32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
     33 	},
     34 	[2] = {
     35 		/* 1110xxxx */
     36 		.lower = 0xE0, /* 11100000 */
     37 		.upper = 0xEF, /* 11101111 */
     38 		.mincp = (uint_least32_t)1 << 11,
     39 		.maxcp = ((uint_least32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
     40 	},
     41 	[3] = {
     42 		/* 11110xxx */
     43 		.lower = 0xF0, /* 11110000 */
     44 		.upper = 0xF7, /* 11110111 */
     45 		.mincp = (uint_least32_t)1 << 16,
     46 		.maxcp = ((uint_least32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
     47 	},
     48 };
     49 
     50 size_t
     51 grapheme_decode_utf8(const char *str, size_t len, uint_least32_t *cp)
     52 {
     53 	size_t off, i;
     54 	uint_least32_t tmp;
     55 
     56 	if (cp == NULL) {
     57 		/*
     58 		 * instead of checking every time if cp is NULL within
     59 		 * the decoder, simply point it at a dummy variable here.
     60 		 */
     61 		cp = &tmp;
     62 	}
     63 
     64 	if (str == NULL || len == 0) {
     65 		/* a sequence must be at least 1 byte long */
     66 		*cp = GRAPHEME_INVALID_CODEPOINT;
     67 		return 0;
     68 	}
     69 
     70 	/* identify sequence type with the first byte */
     71 	for (off = 0; off < LEN(lut); off++) {
     72 		if (BETWEEN(((const unsigned char *)str)[0], lut[off].lower,
     73 		            lut[off].upper)) {
     74 			/*
     75 			 * first byte is within the bounds; fill
     76 			 * p with the the first bits contained in
     77 			 * the first byte (by subtracting the high bits)
     78 			 */
     79 			*cp = ((const unsigned char *)str)[0] - lut[off].lower;
     80 			break;
     81 		}
     82 	}
     83 	if (off == LEN(lut)) {
     84 		/*
     85 		 * first byte does not match a sequence type;
     86 		 * set cp as invalid and return 1 byte processed
     87 		 *
     88 		 * this also includes the cases where bits higher than
     89 		 * the 8th are set on systems with CHAR_BIT > 8
     90 		 */
     91 		*cp = GRAPHEME_INVALID_CODEPOINT;
     92 		return 1;
     93 	}
     94 	if (1 + off > len) {
     95 		/*
     96 		 * input is not long enough, set cp as invalid
     97 		 */
     98 		*cp = GRAPHEME_INVALID_CODEPOINT;
     99 
    100 		/*
    101 		 * count the following continuation bytes, but nothing
    102 		 * else in case we have a "rogue" case where e.g. such a
    103 		 * sequence starter occurs right before a NUL-byte.
    104 		 */
    105 		for (i = 0; 1 + i < len; i++) {
    106 			if(!BETWEEN(((const unsigned char *)str)[1 + i],
    107 			            0x80, 0xBF)) {
    108 				break;
    109 			}
    110 		}
    111 
    112 		/*
    113 		 * if the continuation bytes do not continue until
    114 		 * the end, return the incomplete sequence length.
    115 		 * Otherwise return the number of bytes we actually
    116 		 * expected, which is larger than n.
    117 		 */
    118 		return ((1 + i) < len) ? (1 + i) : (1 + off);
    119 	}
    120 
    121 	/*
    122 	 * process 'off' following bytes, each of the form 10xxxxxx
    123 	 * (i.e. between 0x80 (10000000) and 0xBF (10111111))
    124 	 */
    125 	for (i = 1; i <= off; i++) {
    126 		if(!BETWEEN(((const unsigned char *)str)[i], 0x80, 0xBF)) {
    127 			/*
    128 			 * byte does not match format; return
    129 			 * number of bytes processed excluding the
    130 			 * unexpected character as recommended since
    131 			 * Unicode 6 (chapter 3)
    132 			 *
    133 			 * this also includes the cases where bits
    134 			 * higher than the 8th are set on systems
    135 			 * with CHAR_BIT > 8
    136 			 */
    137 			*cp = GRAPHEME_INVALID_CODEPOINT;
    138 			return 1 + (i - 1);
    139 		}
    140 		/*
    141 		 * shift codepoint by 6 bits and add the 6 stored bits
    142 		 * in s[i] to it using the bitmask 0x3F (00111111)
    143 		 */
    144 		*cp = (*cp << 6) | (((const unsigned char *)str)[i] & 0x3F);
    145 	}
    146 
    147 	if (*cp < lut[off].mincp ||
    148 	    BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
    149 	    *cp > UINT32_C(0x10FFFF)) {
    150 		/*
    151 		 * codepoint is overlong encoded in the sequence, is a
    152 		 * high or low UTF-16 surrogate half (0xD800..0xDFFF) or
    153 		 * not representable in UTF-16 (>0x10FFFF) (RFC-3629
    154 		 * specifies the latter two conditions)
    155 		 */
    156 		*cp = GRAPHEME_INVALID_CODEPOINT;
    157 	}
    158 
    159 	return 1 + off;
    160 }
    161 
    162 size_t
    163 grapheme_encode_utf8(uint_least32_t cp, char *str, size_t len)
    164 {
    165 	size_t off, i;
    166 
    167 	if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
    168 	    cp > UINT32_C(0x10FFFF)) {
    169 		/*
    170 		 * codepoint is a high or low UTF-16 surrogate half
    171 		 * (0xD800..0xDFFF) or not representable in UTF-16
    172 		 * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8.
    173 		 */
    174 		cp = GRAPHEME_INVALID_CODEPOINT;
    175 	}
    176 
    177 	/* determine necessary sequence type */
    178 	for (off = 0; off < LEN(lut); off++) {
    179 		if (cp <= lut[off].maxcp) {
    180 			break;
    181 		}
    182 	}
    183 	if (1 + off > len || str == NULL || len == 0) {
    184 		/*
    185 		 * specified buffer is too small to store sequence or
    186 		 * the caller just wanted to know how many bytes the
    187 		 * codepoint needs by passing a NULL-buffer.
    188 		 */
    189 		return 1 + off;
    190 	}
    191 
    192 	/* build sequence by filling cp-bits into each byte */
    193 
    194 	/*
    195 	 * lut[off].lower is the bit-format for the first byte and
    196 	 * the bits to fill into it are determined by shifting the
    197 	 * cp 6 times the number of following bytes, as each
    198 	 * following byte stores 6 bits, yielding the wanted bits.
    199 	 *
    200 	 * We do not overwrite the mask because we guaranteed earlier
    201 	 * that there are no bits higher than the mask allows.
    202 	 */
    203 	((unsigned char *)str)[0] = lut[off].lower |
    204 	                            (uint_least8_t)(cp >> (6 * off));
    205 
    206 	for (i = 1; i <= off; i++) {
    207 		/*
    208 		 * the bit-format for following bytes is 10000000 (0x80)
    209 		 * and it each stores 6 bits in the 6 low bits that we
    210 		 * extract from the properly-shifted value using the
    211 		 * mask 00111111 (0x3F)
    212 		 */
    213 		((unsigned char *)str)[i] = 0x80 |
    214 		                            ((cp >> (6 * (off - i))) & 0x3F);
    215 	}
    216 
    217 	return 1 + off;
    218 }