libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

utf8-decode.c (7494B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <stddef.h>
      3 #include <stdint.h>
      4 #include <stdio.h>
      5 #include <string.h>
      6 
      7 #include "../grapheme.h"
      8 #include "util.h"
      9 
     10 static const struct {
     11 	char          *arr;     /* UTF-8 byte sequence */
     12 	size_t         len;     /* length of UTF-8 byte sequence */
     13 	size_t         exp_len; /* expected length returned */
     14 	uint_least32_t exp_cp;  /* expected codepoint returned */
     15 } dec_test[] = {
     16 	{
     17 		/* empty sequence
     18 		 * [ ] ->
     19 		 * INVALID
     20 		 */
     21 		.arr     = NULL,
     22 		.len     = 0,
     23 		.exp_len = 0,
     24 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
     25 	},
     26 	{
     27 		/* invalid lead byte
     28 		 * [ 11111101 ] ->
     29 		 * INVALID
     30 		 */
     31 		.arr     = (char *)(unsigned char[]){ 0xFD },
     32 		.len     = 1,
     33 		.exp_len = 1,
     34 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
     35 	},
     36 	{
     37 		/* valid 1-byte sequence
     38 		 * [ 00000001 ] ->
     39 		 * 0000001
     40 		 */
     41 		.arr     = (char *)(unsigned char[]){ 0x01 },
     42 		.len     = 1,
     43 		.exp_len = 1,
     44 		.exp_cp  = 0x1,
     45 	},
     46 	{
     47 		/* valid 2-byte sequence
     48 		 * [ 11000011 10111111 ] ->
     49 		 * 00011111111
     50 		 */
     51 		.arr     = (char *)(unsigned char[]){ 0xC3, 0xBF },
     52 		.len     = 2,
     53 		.exp_len = 2,
     54 		.exp_cp  = 0xFF,
     55 	},
     56 	{
     57 		/* invalid 2-byte sequence (second byte missing)
     58 		 * [ 11000011 ] ->
     59 		 * INVALID
     60 		 */
     61 		.arr     = (char *)(unsigned char[]){ 0xC3 },
     62 		.len     = 1,
     63 		.exp_len = 2,
     64 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
     65 	},
     66 	{
     67 		/* invalid 2-byte sequence (second byte malformed)
     68 		 * [ 11000011 11111111 ] ->
     69 		 * INVALID
     70 		 */
     71 		.arr     = (char *)(unsigned char[]){ 0xC3, 0xFF },
     72 		.len     = 2,
     73 		.exp_len = 1,
     74 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
     75 	},
     76 	{
     77 		/* invalid 2-byte sequence (overlong encoded)
     78 		 * [ 11000001 10111111 ] ->
     79 		 * INVALID
     80 		 */
     81 		.arr     = (char *)(unsigned char[]){ 0xC1, 0xBF },
     82 		.len     = 2,
     83 		.exp_len = 2,
     84 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
     85 	},
     86 	{
     87 		/* valid 3-byte sequence
     88 		 * [ 11100000 10111111 10111111 ] ->
     89 		 * 0000111111111111
     90 		 */
     91 		.arr     = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF },
     92 		.len     = 3,
     93 		.exp_len = 3,
     94 		.exp_cp  = 0xFFF,
     95 	},
     96 	{
     97 		/* invalid 3-byte sequence (second byte missing)
     98 		 * [ 11100000 ] ->
     99 		 * INVALID
    100 		 */
    101 		.arr     = (char *)(unsigned char[]){ 0xE0 },
    102 		.len     = 1,
    103 		.exp_len = 3,
    104 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    105 	},
    106 	{
    107 		/* invalid 3-byte sequence (second byte malformed)
    108 		 * [ 11100000 01111111 10111111 ] ->
    109 		 * INVALID
    110 		 */
    111 		.arr     = (char *)(unsigned char[]){ 0xE0, 0x7F, 0xBF },
    112 		.len     = 3,
    113 		.exp_len = 1,
    114 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    115 	},
    116 	{
    117 		/* invalid 3-byte sequence (short string, second byte malformed)
    118 		 * [ 11100000 01111111 ] ->
    119 		 * INVALID
    120 		 */
    121 		.arr     = (char *)(unsigned char[]){ 0xE0, 0x7F },
    122 		.len     = 2,
    123 		.exp_len = 1,
    124 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    125 	},
    126 	{
    127 		/* invalid 3-byte sequence (third byte missing)
    128 		 * [ 11100000 10111111 ] ->
    129 		 * INVALID
    130 		 */
    131 		.arr     = (char *)(unsigned char[]){ 0xE0, 0xBF },
    132 		.len     = 2,
    133 		.exp_len = 3,
    134 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    135 	},
    136 	{
    137 		/* invalid 3-byte sequence (third byte malformed)
    138 		 * [ 11100000 10111111 01111111 ] ->
    139 		 * INVALID
    140 		 */
    141 		.arr     = (char *)(unsigned char[]){ 0xE0, 0xBF, 0x7F },
    142 		.len     = 3,
    143 		.exp_len = 2,
    144 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    145 	},
    146 	{
    147 		/* invalid 3-byte sequence (overlong encoded)
    148 		 * [ 11100000 10011111 10111111 ] ->
    149 		 * INVALID
    150 		 */
    151 		.arr     = (char *)(unsigned char[]){ 0xE0, 0x9F, 0xBF },
    152 		.len     = 3,
    153 		.exp_len = 3,
    154 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    155 	},
    156 	{
    157 		/* invalid 3-byte sequence (UTF-16 surrogate half)
    158 		 * [ 11101101 10100000 10000000 ] ->
    159 		 * INVALID
    160 		 */
    161 		.arr     = (char *)(unsigned char[]){ 0xED, 0xA0, 0x80 },
    162 		.len     = 3,
    163 		.exp_len = 3,
    164 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    165 	},
    166 	{
    167 		/* valid 4-byte sequence
    168 		 * [ 11110011 10111111 10111111 10111111 ] ->
    169 		 * 011111111111111111111
    170 		 */
    171 		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF },
    172 		.len     = 4,
    173 		.exp_len = 4,
    174 		.exp_cp  = UINT32_C(0xFFFFF),
    175 	},
    176 	{
    177 		/* invalid 4-byte sequence (second byte missing)
    178 		 * [ 11110011 ] ->
    179 		 * INVALID
    180 		 */
    181 		.arr     = (char *)(unsigned char[]){ 0xF3 },
    182 		.len     = 1,
    183 		.exp_len = 4,
    184 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    185 	},
    186 	{
    187 		/* invalid 4-byte sequence (second byte malformed)
    188 		 * [ 11110011 01111111 10111111 10111111 ] ->
    189 		 * INVALID
    190 		 */
    191 		.arr     = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF, 0xBF },
    192 		.len     = 4,
    193 		.exp_len = 1,
    194 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    195 	},
    196 	{
    197 		/* invalid 4-byte sequence (short string 1, second byte malformed)
    198 		 * [ 11110011 011111111 ] ->
    199 		 * INVALID
    200 		 */
    201 		.arr     = (char *)(unsigned char[]){ 0xF3, 0x7F },
    202 		.len     = 2,
    203 		.exp_len = 1,
    204 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    205 	},
    206 	{
    207 		/* invalid 4-byte sequence (short string 2, second byte malformed)
    208 		 * [ 11110011 011111111 10111111 ] ->
    209 		 * INVALID
    210 		 */
    211 		.arr     = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF },
    212 		.len     = 3,
    213 		.exp_len = 1,
    214 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    215 	},
    216 
    217 	{
    218 		/* invalid 4-byte sequence (third byte missing)
    219 		 * [ 11110011 10111111 ] ->
    220 		 * INVALID
    221 		 */
    222 		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF },
    223 		.len     = 2,
    224 		.exp_len = 4,
    225 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    226 	},
    227 	{
    228 		/* invalid 4-byte sequence (third byte malformed)
    229 		 * [ 11110011 10111111 01111111 10111111 ] ->
    230 		 * INVALID
    231 		 */
    232 		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F, 0xBF },
    233 		.len     = 4,
    234 		.exp_len = 2,
    235 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    236 	},
    237 	{
    238 		/* invalid 4-byte sequence (short string, third byte malformed)
    239 		 * [ 11110011 10111111 01111111 ] ->
    240 		 * INVALID
    241 		 */
    242 		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F },
    243 		.len     = 3,
    244 		.exp_len = 2,
    245 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    246 	},
    247 	{
    248 		/* invalid 4-byte sequence (fourth byte missing)
    249 		 * [ 11110011 10111111 10111111 ] ->
    250 		 * INVALID
    251 		 */
    252 		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF },
    253 		.len     = 3,
    254 		.exp_len = 4,
    255 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    256 	},
    257 	{
    258 		/* invalid 4-byte sequence (fourth byte malformed)
    259 		 * [ 11110011 10111111 10111111 01111111 ] ->
    260 		 * INVALID
    261 		 */
    262 		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0x7F },
    263 		.len     = 4,
    264 		.exp_len = 3,
    265 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    266 	},
    267 	{
    268 		/* invalid 4-byte sequence (overlong encoded)
    269 		 * [ 11110000 10000000 10000001 10111111 ] ->
    270 		 * INVALID
    271 		 */
    272 		.arr     = (char *)(unsigned char[]){ 0xF0, 0x80, 0x81, 0xBF },
    273 		.len     = 4,
    274 		.exp_len = 4,
    275 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    276 	},
    277 	{
    278 		/* invalid 4-byte sequence (UTF-16-unrepresentable)
    279 		 * [ 11110100 10010000 10000000 10000000 ] ->
    280 		 * INVALID
    281 		 */
    282 		.arr     = (char *)(unsigned char[]){ 0xF4, 0x90, 0x80, 0x80 },
    283 		.len     = 4,
    284 		.exp_len = 4,
    285 		.exp_cp  = GRAPHEME_INVALID_CODEPOINT,
    286 	},
    287 };
    288 
    289 int
    290 main(int argc, char *argv[])
    291 {
    292 	size_t i, failed;
    293 
    294 	(void)argc;
    295 
    296 	/* UTF-8 decoder test */
    297 	for (i = 0, failed = 0; i < LEN(dec_test); i++) {
    298 		size_t len;
    299 		uint_least32_t cp;
    300 
    301 		len = grapheme_decode_utf8(dec_test[i].arr,
    302 		                           dec_test[i].len, &cp);
    303 
    304 		if (len != dec_test[i].exp_len ||
    305 		    cp != dec_test[i].exp_cp) {
    306 			fprintf(stderr, "%s: Failed test %zu: "
    307 			        "Expected (%zx,%u), but got (%zx,%u).\n",
    308 			        argv[0], i, dec_test[i].exp_len,
    309 			        dec_test[i].exp_cp, len, cp);
    310 			failed++;
    311 		}
    312 	}
    313 	printf("%s: %zu/%zu tests passed.\n", argv[0],
    314 	       LEN(dec_test) - failed, LEN(dec_test));
    315 
    316 	return (failed > 0) ? 1 : 0;
    317 }