utf8.c (6131B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <stddef.h> 3 #include <stdint.h> 4 5 #include "../grapheme.h" 6 #include "util.h" 7 8 #define BETWEEN(c, l, u) ((c) >= (l) && (c) <= (u)) 9 10 /* lookup-table for the types of sequence first bytes */ 11 static const struct { 12 uint_least8_t lower; /* lower bound of sequence first byte */ 13 uint_least8_t upper; /* upper bound of sequence first byte */ 14 uint_least32_t mincp; /* smallest non-overlong encoded codepoint */ 15 uint_least32_t maxcp; /* largest encodable codepoint */ 16 /* 17 * implicit: table-offset represents the number of following 18 * bytes of the form 10xxxxxx (6 bits capacity each) 19 */ 20 } lut[] = { 21 [0] = { 22 /* 0xxxxxxx */ 23 .lower = 0x00, /* 00000000 */ 24 .upper = 0x7F, /* 01111111 */ 25 .mincp = (uint_least32_t)0, 26 .maxcp = ((uint_least32_t)1 << 7) - 1, /* 7 bits capacity */ 27 }, 28 [1] = { 29 /* 110xxxxx */ 30 .lower = 0xC0, /* 11000000 */ 31 .upper = 0xDF, /* 11011111 */ 32 .mincp = (uint_least32_t)1 << 7, 33 .maxcp = ((uint_least32_t)1 << 11) - 1, /* 5+6=11 bits capacity */ 34 }, 35 [2] = { 36 /* 1110xxxx */ 37 .lower = 0xE0, /* 11100000 */ 38 .upper = 0xEF, /* 11101111 */ 39 .mincp = (uint_least32_t)1 << 11, 40 .maxcp = ((uint_least32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */ 41 }, 42 [3] = { 43 /* 11110xxx */ 44 .lower = 0xF0, /* 11110000 */ 45 .upper = 0xF7, /* 11110111 */ 46 .mincp = (uint_least32_t)1 << 16, 47 .maxcp = ((uint_least32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */ 48 }, 49 }; 50 51 size_t 52 grapheme_decode_utf8(const char *str, size_t len, uint_least32_t *cp) 53 { 54 size_t off, i; 55 uint_least32_t tmp; 56 57 if (cp == NULL) { 58 /* 59 * instead of checking every time if cp is NULL within 60 * the decoder, simply point it at a dummy variable here. 61 */ 62 cp = &tmp; 63 } 64 65 if (str == NULL || len == 0) { 66 /* a sequence must be at least 1 byte long */ 67 *cp = GRAPHEME_INVALID_CODEPOINT; 68 return 0; 69 } 70 71 /* identify sequence type with the first byte */ 72 for (off = 0; off < LEN(lut); off++) { 73 if (BETWEEN(((const unsigned char *)str)[0], lut[off].lower, 74 lut[off].upper)) { 75 /* 76 * first byte is within the bounds; fill 77 * p with the the first bits contained in 78 * the first byte (by subtracting the high bits) 79 */ 80 *cp = ((const unsigned char *)str)[0] - lut[off].lower; 81 break; 82 } 83 } 84 if (off == LEN(lut)) { 85 /* 86 * first byte does not match a sequence type; 87 * set cp as invalid and return 1 byte processed 88 * 89 * this also includes the cases where bits higher than 90 * the 8th are set on systems with CHAR_BIT > 8 91 */ 92 *cp = GRAPHEME_INVALID_CODEPOINT; 93 return 1; 94 } 95 if (1 + off > len) { 96 /* 97 * input is not long enough, set cp as invalid 98 */ 99 *cp = GRAPHEME_INVALID_CODEPOINT; 100 101 /* 102 * count the following continuation bytes, but nothing 103 * else in case we have a "rogue" case where e.g. such a 104 * sequence starter occurs right before a NUL-byte. 105 */ 106 for (i = 0; 1 + i < len; i++) { 107 if (!BETWEEN(((const unsigned char *)str)[1 + i], 0x80, 108 0xBF)) { 109 break; 110 } 111 } 112 113 /* 114 * if the continuation bytes do not continue until 115 * the end, return the incomplete sequence length. 116 * Otherwise return the number of bytes we actually 117 * expected, which is larger than n. 118 */ 119 return ((1 + i) < len) ? (1 + i) : (1 + off); 120 } 121 122 /* 123 * process 'off' following bytes, each of the form 10xxxxxx 124 * (i.e. between 0x80 (10000000) and 0xBF (10111111)) 125 */ 126 for (i = 1; i <= off; i++) { 127 if (!BETWEEN(((const unsigned char *)str)[i], 0x80, 0xBF)) { 128 /* 129 * byte does not match format; return 130 * number of bytes processed excluding the 131 * unexpected character as recommended since 132 * Unicode 6 (chapter 3) 133 * 134 * this also includes the cases where bits 135 * higher than the 8th are set on systems 136 * with CHAR_BIT > 8 137 */ 138 *cp = GRAPHEME_INVALID_CODEPOINT; 139 return 1 + (i - 1); 140 } 141 /* 142 * shift codepoint by 6 bits and add the 6 stored bits 143 * in s[i] to it using the bitmask 0x3F (00111111) 144 */ 145 *cp = (*cp << 6) | (((const unsigned char *)str)[i] & 0x3F); 146 } 147 148 if (*cp < lut[off].mincp || 149 BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) || 150 *cp > UINT32_C(0x10FFFF)) { 151 /* 152 * codepoint is overlong encoded in the sequence, is a 153 * high or low UTF-16 surrogate half (0xD800..0xDFFF) or 154 * not representable in UTF-16 (>0x10FFFF) (RFC-3629 155 * specifies the latter two conditions) 156 */ 157 *cp = GRAPHEME_INVALID_CODEPOINT; 158 } 159 160 return 1 + off; 161 } 162 163 size_t 164 grapheme_encode_utf8(uint_least32_t cp, char *str, size_t len) 165 { 166 size_t off, i; 167 168 if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) || 169 cp > UINT32_C(0x10FFFF)) { 170 /* 171 * codepoint is a high or low UTF-16 surrogate half 172 * (0xD800..0xDFFF) or not representable in UTF-16 173 * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8. 174 */ 175 cp = GRAPHEME_INVALID_CODEPOINT; 176 } 177 178 /* determine necessary sequence type */ 179 for (off = 0; off < LEN(lut); off++) { 180 if (cp <= lut[off].maxcp) { 181 break; 182 } 183 } 184 if (1 + off > len || str == NULL || len == 0) { 185 /* 186 * specified buffer is too small to store sequence or 187 * the caller just wanted to know how many bytes the 188 * codepoint needs by passing a NULL-buffer. 189 */ 190 return 1 + off; 191 } 192 193 /* build sequence by filling cp-bits into each byte */ 194 195 /* 196 * lut[off].lower is the bit-format for the first byte and 197 * the bits to fill into it are determined by shifting the 198 * cp 6 times the number of following bytes, as each 199 * following byte stores 6 bits, yielding the wanted bits. 200 * 201 * We do not overwrite the mask because we guaranteed earlier 202 * that there are no bits higher than the mask allows. 203 */ 204 ((unsigned char *)str)[0] = 205 lut[off].lower | (uint_least8_t)(cp >> (6 * off)); 206 207 for (i = 1; i <= off; i++) { 208 /* 209 * the bit-format for following bytes is 10000000 (0x80) 210 * and it each stores 6 bits in the 6 low bits that we 211 * extract from the properly-shifted value using the 212 * mask 00111111 (0x3F) 213 */ 214 ((unsigned char *)str)[i] = 215 0x80 | ((cp >> (6 * (off - i))) & 0x3F); 216 } 217 218 return 1 + off; 219 }