libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

utf8-decode.c (7826B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <stddef.h>
      3 #include <stdint.h>
      4 #include <stdio.h>
      5 #include <string.h>
      6 
      7 #include "../grapheme.h"
      8 #include "util.h"
      9 
     10 static const struct {
     11 	char *arr;             /* UTF-8 byte sequence */
     12 	size_t len;            /* length of UTF-8 byte sequence */
     13 	size_t exp_len;        /* expected length returned */
     14 	uint_least32_t exp_cp; /* expected codepoint returned */
     15 } dec_test[] = {
     16 	{
     17 		/* empty sequence
     18 	         * [ ] ->
     19 	         * INVALID
     20 	         */
     21 		.arr = NULL,
     22 		.len = 0,
     23 		.exp_len = 0,
     24 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
     25 	},
     26 	{
     27 		/* invalid lead byte
     28 	         * [ 11111101 ] ->
     29 	         * INVALID
     30 	         */
     31 		.arr = (char *)(unsigned char[]) { 0xFD },
     32 		.len = 1,
     33 		.exp_len = 1,
     34 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
     35 	},
     36 	{
     37 		/* valid 1-byte sequence
     38 	         * [ 00000001 ] ->
     39 	         * 0000001
     40 	         */
     41 		.arr = (char *)(unsigned char[]) { 0x01 },
     42 		.len = 1,
     43 		.exp_len = 1,
     44 		.exp_cp = 0x1,
     45 	},
     46 	{
     47 		/* valid 2-byte sequence
     48 	         * [ 11000011 10111111 ] ->
     49 	         * 00011111111
     50 	         */
     51 		.arr = (char *)(unsigned char[]) { 0xC3, 0xBF },
     52 		.len = 2,
     53 		.exp_len = 2,
     54 		.exp_cp = 0xFF,
     55 	},
     56 	{
     57 		/* invalid 2-byte sequence (second byte missing)
     58 	         * [ 11000011 ] ->
     59 	         * INVALID
     60 	         */
     61 		.arr = (char *)(unsigned char[]) { 0xC3 },
     62 		.len = 1,
     63 		.exp_len = 2,
     64 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
     65 	},
     66 	{
     67 		/* invalid 2-byte sequence (second byte malformed)
     68 	         * [ 11000011 11111111 ] ->
     69 	         * INVALID
     70 	         */
     71 		.arr = (char *)(unsigned char[]) { 0xC3, 0xFF },
     72 		.len = 2,
     73 		.exp_len = 1,
     74 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
     75 	},
     76 	{
     77 		/* invalid 2-byte sequence (overlong encoded)
     78 	         * [ 11000001 10111111 ] ->
     79 	         * INVALID
     80 	         */
     81 		.arr = (char *)(unsigned char[]) { 0xC1, 0xBF },
     82 		.len = 2,
     83 		.exp_len = 2,
     84 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
     85 	},
     86 	{
     87 		/* valid 3-byte sequence
     88 	         * [ 11100000 10111111 10111111 ] ->
     89 	         * 0000111111111111
     90 	         */
     91 		.arr = (char *)(unsigned char[]) { 0xE0, 0xBF, 0xBF },
     92 		.len = 3,
     93 		.exp_len = 3,
     94 		.exp_cp = 0xFFF,
     95 	},
     96 	{
     97 		/* invalid 3-byte sequence (second byte missing)
     98 	         * [ 11100000 ] ->
     99 	         * INVALID
    100 	         */
    101 		.arr = (char *)(unsigned char[]) { 0xE0 },
    102 		.len = 1,
    103 		.exp_len = 3,
    104 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    105 	},
    106 	{
    107 		/* invalid 3-byte sequence (second byte malformed)
    108 	         * [ 11100000 01111111 10111111 ] ->
    109 	         * INVALID
    110 	         */
    111 		.arr = (char *)(unsigned char[]) { 0xE0, 0x7F, 0xBF },
    112 		.len = 3,
    113 		.exp_len = 1,
    114 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    115 	},
    116 	{
    117 		/* invalid 3-byte sequence (short string, second byte malformed)
    118 	         * [ 11100000 01111111 ] ->
    119 	         * INVALID
    120 	         */
    121 		.arr = (char *)(unsigned char[]) { 0xE0, 0x7F },
    122 		.len = 2,
    123 		.exp_len = 1,
    124 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    125 	},
    126 	{
    127 		/* invalid 3-byte sequence (third byte missing)
    128 	         * [ 11100000 10111111 ] ->
    129 	         * INVALID
    130 	         */
    131 		.arr = (char *)(unsigned char[]) { 0xE0, 0xBF },
    132 		.len = 2,
    133 		.exp_len = 3,
    134 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    135 	},
    136 	{
    137 		/* invalid 3-byte sequence (third byte malformed)
    138 	         * [ 11100000 10111111 01111111 ] ->
    139 	         * INVALID
    140 	         */
    141 		.arr = (char *)(unsigned char[]) { 0xE0, 0xBF, 0x7F },
    142 		.len = 3,
    143 		.exp_len = 2,
    144 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    145 	},
    146 	{
    147 		/* invalid 3-byte sequence (overlong encoded)
    148 	         * [ 11100000 10011111 10111111 ] ->
    149 	         * INVALID
    150 	         */
    151 		.arr = (char *)(unsigned char[]) { 0xE0, 0x9F, 0xBF },
    152 		.len = 3,
    153 		.exp_len = 3,
    154 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    155 	},
    156 	{
    157 		/* invalid 3-byte sequence (UTF-16 surrogate half)
    158 	         * [ 11101101 10100000 10000000 ] ->
    159 	         * INVALID
    160 	         */
    161 		.arr = (char *)(unsigned char[]) { 0xED, 0xA0, 0x80 },
    162 		.len = 3,
    163 		.exp_len = 3,
    164 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    165 	},
    166 	{
    167 		/* valid 4-byte sequence
    168 	         * [ 11110011 10111111 10111111 10111111 ] ->
    169 	         * 011111111111111111111
    170 	         */
    171 		.arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF, 0xBF },
    172 		.len = 4,
    173 		.exp_len = 4,
    174 		.exp_cp = UINT32_C(0xFFFFF),
    175 	},
    176 	{
    177 		/* invalid 4-byte sequence (second byte missing)
    178 	         * [ 11110011 ] ->
    179 	         * INVALID
    180 	         */
    181 		.arr = (char *)(unsigned char[]) { 0xF3 },
    182 		.len = 1,
    183 		.exp_len = 4,
    184 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    185 	},
    186 	{
    187 		/* invalid 4-byte sequence (second byte malformed)
    188 	         * [ 11110011 01111111 10111111 10111111 ] ->
    189 	         * INVALID
    190 	         */
    191 		.arr = (char *)(unsigned char[]) { 0xF3, 0x7F, 0xBF, 0xBF },
    192 		.len = 4,
    193 		.exp_len = 1,
    194 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    195 	},
    196 	{
    197 		/* invalid 4-byte sequence (short string 1, second byte
    198 	         * malformed) [ 11110011 011111111 ] -> INVALID
    199 	         */
    200 		.arr = (char *)(unsigned char[]) { 0xF3, 0x7F },
    201 		.len = 2,
    202 		.exp_len = 1,
    203 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    204 	},
    205 	{
    206 		/* invalid 4-byte sequence (short string 2, second byte
    207 	         * malformed) [ 11110011 011111111 10111111 ] -> INVALID
    208 	         */
    209 		.arr = (char *)(unsigned char[]) { 0xF3, 0x7F, 0xBF },
    210 		.len = 3,
    211 		.exp_len = 1,
    212 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    213 	},
    214 
    215 	{
    216 		/* invalid 4-byte sequence (third byte missing)
    217 	         * [ 11110011 10111111 ] ->
    218 	         * INVALID
    219 	         */
    220 		.arr = (char *)(unsigned char[]) { 0xF3, 0xBF },
    221 		.len = 2,
    222 		.exp_len = 4,
    223 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    224 	},
    225 	{
    226 		/* invalid 4-byte sequence (third byte malformed)
    227 	         * [ 11110011 10111111 01111111 10111111 ] ->
    228 	         * INVALID
    229 	         */
    230 		.arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0x7F, 0xBF },
    231 		.len = 4,
    232 		.exp_len = 2,
    233 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    234 	},
    235 	{
    236 		/* invalid 4-byte sequence (short string, third byte malformed)
    237 	         * [ 11110011 10111111 01111111 ] ->
    238 	         * INVALID
    239 	         */
    240 		.arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0x7F },
    241 		.len = 3,
    242 		.exp_len = 2,
    243 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    244 	},
    245 	{
    246 		/* invalid 4-byte sequence (fourth byte missing)
    247 	         * [ 11110011 10111111 10111111 ] ->
    248 	         * INVALID
    249 	         */
    250 		.arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF },
    251 		.len = 3,
    252 		.exp_len = 4,
    253 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    254 	},
    255 	{
    256 		/* invalid 4-byte sequence (fourth byte malformed)
    257 	         * [ 11110011 10111111 10111111 01111111 ] ->
    258 	         * INVALID
    259 	         */
    260 		.arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF, 0x7F },
    261 		.len = 4,
    262 		.exp_len = 3,
    263 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    264 	},
    265 	{
    266 		/* invalid 4-byte sequence (overlong encoded)
    267 	         * [ 11110000 10000000 10000001 10111111 ] ->
    268 	         * INVALID
    269 	         */
    270 		.arr = (char *)(unsigned char[]) { 0xF0, 0x80, 0x81, 0xBF },
    271 		.len = 4,
    272 		.exp_len = 4,
    273 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    274 	},
    275 	{
    276 		/* invalid 4-byte sequence (UTF-16-unrepresentable)
    277 	         * [ 11110100 10010000 10000000 10000000 ] ->
    278 	         * INVALID
    279 	         */
    280 		.arr = (char *)(unsigned char[]) { 0xF4, 0x90, 0x80, 0x80 },
    281 		.len = 4,
    282 		.exp_len = 4,
    283 		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    284 	},
    285 };
    286 
    287 int
    288 main(int argc, char *argv[])
    289 {
    290 	size_t i, failed;
    291 
    292 	(void)argc;
    293 
    294 	/* UTF-8 decoder test */
    295 	for (i = 0, failed = 0; i < LEN(dec_test); i++) {
    296 		size_t len;
    297 		uint_least32_t cp;
    298 
    299 		len = grapheme_decode_utf8(dec_test[i].arr, dec_test[i].len,
    300 		                           &cp);
    301 
    302 		if (len != dec_test[i].exp_len || cp != dec_test[i].exp_cp) {
    303 			fprintf(stderr,
    304 			        "%s: Failed test %zu: "
    305 			        "Expected (%zx,%u), but got (%zx,%u).\n",
    306 			        argv[0], i, dec_test[i].exp_len,
    307 			        dec_test[i].exp_cp, len, cp);
    308 			failed++;
    309 		}
    310 	}
    311 	printf("%s: %zu/%zu unit tests passed.\n", argv[0],
    312 	       LEN(dec_test) - failed, LEN(dec_test));
    313 
    314 	return (failed > 0) ? 1 : 0;
    315 }