codepoint.c (4636B)
1 /* See LICENSE file for copyright and license details. */ 2 #include "../grapheme.h" 3 #include <stdio.h> 4 5 #define BETWEEN(c, l, u) (c >= l && c <= u) 6 #define LEN(x) (sizeof(x) / sizeof(*x)) 7 8 /* lookup-table for the types of sequence first bytes */ 9 static const struct { 10 uint8_t lower; /* lower bound of sequence first byte */ 11 uint8_t upper; /* upper bound of sequence first byte */ 12 uint32_t mincp; /* smallest non-overlong encoded code point */ 13 uint32_t maxcp; /* largest encodable code point */ 14 /* 15 * implicit: table-offset represents the number of following 16 * bytes of the form 10xxxxxx (6 bits capacity each) 17 */ 18 } lut[] = { 19 [0] = { 20 /* 0xxxxxxx */ 21 .lower = 0x00, /* 00000000 */ 22 .upper = 0x7F, /* 01111111 */ 23 .mincp = (uint32_t)0, 24 .maxcp = ((uint32_t)1 << 7) - 1, /* 7 bits capacity */ 25 }, 26 [1] = { 27 /* 110xxxxx */ 28 .lower = 0xC0, /* 11000000 */ 29 .upper = 0xDF, /* 11011111 */ 30 .mincp = (uint32_t)1 << 7, 31 .maxcp = ((uint32_t)1 << 11) - 1, /* 5+6=11 bits capacity */ 32 }, 33 [2] = { 34 /* 1110xxxx */ 35 .lower = 0xE0, /* 11100000 */ 36 .upper = 0xEF, /* 11101111 */ 37 .mincp = (uint32_t)1 << 11, 38 .maxcp = ((uint32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */ 39 }, 40 [3] = { 41 /* 11110xxx */ 42 .lower = 0xF0, /* 11110000 */ 43 .upper = 0xF7, /* 11110111 */ 44 .mincp = (uint32_t)1 << 16, 45 .maxcp = ((uint32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */ 46 }, 47 }; 48 49 size_t 50 grapheme_cp_decode(uint32_t *cp, const uint8_t *s, size_t n) 51 { 52 size_t off, i; 53 54 if (n == 0) { 55 /* a sequence must be at least 1 byte long */ 56 *cp = GRAPHEME_CP_INVALID; 57 return 1; 58 } 59 60 /* identify sequence type with the first byte */ 61 for (off = 0; off < LEN(lut); off++) { 62 if (BETWEEN(s[0], lut[off].lower, lut[off].upper)) { 63 /* 64 * first byte is within the bounds; fill 65 * p with the the first bits contained in 66 * the first byte (by subtracting the high bits) 67 */ 68 *cp = s[0] - lut[off].lower; 69 break; 70 } 71 } 72 if (off == LEN(lut)) { 73 /* 74 * first byte does not match a sequence type; 75 * set cp as invalid and return 1 byte processed 76 */ 77 *cp = GRAPHEME_CP_INVALID; 78 return 1; 79 } 80 if (1 + off > n) { 81 /* 82 * input is not long enough, set cp as invalid and 83 * return number of bytes needed 84 */ 85 *cp = GRAPHEME_CP_INVALID; 86 return 1 + off; 87 } 88 89 /* 90 * process 'off' following bytes, each of the form 10xxxxxx 91 * (i.e. between 0x80 (10000000) and 0xBF (10111111)) 92 */ 93 for (i = 1; i <= off; i++) { 94 if(!BETWEEN(s[i], 0x80, 0xBF)) { 95 /* 96 * byte does not match format; return 97 * number of bytes processed excluding the 98 * unexpected character as recommended since 99 * Unicode 6 (chapter 3) 100 */ 101 *cp = GRAPHEME_CP_INVALID; 102 return 1 + (i - 1); 103 } 104 /* 105 * shift code point by 6 bits and add the 6 stored bits 106 * in s[i] to it using the bitmask 0x3F (00111111) 107 */ 108 *cp = (*cp << 6) | (s[i] & 0x3F); 109 } 110 111 if (*cp < lut[off].mincp || 112 BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) || 113 *cp > UINT32_C(0x10FFFF)) { 114 /* 115 * code point is overlong encoded in the sequence, is a 116 * high or low UTF-16 surrogate half (0xD800..0xDFFF) or 117 * not representable in UTF-16 (>0x10FFFF) (RFC-3629 118 * specifies the latter two conditions) 119 */ 120 *cp = GRAPHEME_CP_INVALID; 121 } 122 123 return 1 + off; 124 } 125 126 size_t 127 grapheme_cp_encode(uint32_t cp, uint8_t *s, size_t n) 128 { 129 size_t off, i; 130 131 if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) || 132 cp > UINT32_C(0x10FFFF)) { 133 /* 134 * code point is a high or low UTF-16 surrogate half 135 * (0xD800..0xDFFF) or not representable in UTF-16 136 * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8. 137 */ 138 cp = GRAPHEME_CP_INVALID; 139 } 140 141 /* determine necessary sequence type */ 142 for (off = 0; off < LEN(lut); off++) { 143 if (cp <= lut[off].maxcp) { 144 break; 145 } 146 } 147 if (1 + off > n) { 148 /* specified buffer is too small to store sequence */ 149 return 1 + off; 150 } 151 152 /* build sequence by filling cp-bits into each byte */ 153 154 /* 155 * lut[off].lower is the bit-format for the first byte and 156 * the bits to fill into it are determined by shifting the 157 * cp 6 times the number of following bytes, as each 158 * following byte stores 6 bits, yielding the wanted bits. 159 * 160 * We do not overwrite the mask because we guaranteed earlier 161 * that there are no bits higher than the mask allows. 162 */ 163 s[0] = lut[off].lower | (cp >> (6 * off)); 164 165 for (i = 1; i <= off; i++) { 166 /* 167 * the bit-format for following bytes is 10000000 (0x80) 168 * and it each stores 6 bits in the 6 low bits that we 169 * extract from the properly-shifted value using the 170 * mask 00111111 (0x3F) 171 */ 172 s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F); 173 } 174 175 return 1 + off; 176 }