libgrapheme

grapheme cluster utility library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | LICENSE

codepoint.c (4636B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include "../grapheme.h"
      3 #include <stdio.h>
      4 
      5 #define BETWEEN(c, l, u) (c >= l && c <= u)
      6 #define LEN(x) (sizeof(x) / sizeof(*x))
      7 
      8 /* lookup-table for the types of sequence first bytes */
      9 static const struct {
     10 	uint8_t  lower; /* lower bound of sequence first byte */
     11 	uint8_t  upper; /* upper bound of sequence first byte */
     12 	uint32_t mincp; /* smallest non-overlong encoded code point */
     13 	uint32_t maxcp; /* largest encodable code point */
     14 	/*
     15 	 * implicit: table-offset represents the number of following
     16 	 * bytes of the form 10xxxxxx (6 bits capacity each)
     17 	 */
     18 } lut[] = {
     19 	[0] = {
     20 		/* 0xxxxxxx */
     21 		.lower = 0x00, /* 00000000 */
     22 		.upper = 0x7F, /* 01111111 */
     23 		.mincp = (uint32_t)0,
     24 		.maxcp = ((uint32_t)1 << 7) - 1, /* 7 bits capacity */
     25 	},
     26 	[1] = {
     27 		/* 110xxxxx */
     28 		.lower = 0xC0, /* 11000000 */
     29 		.upper = 0xDF, /* 11011111 */
     30 		.mincp = (uint32_t)1 << 7,
     31 		.maxcp = ((uint32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
     32 	},
     33 	[2] = {
     34 		/* 1110xxxx */
     35 		.lower = 0xE0, /* 11100000 */
     36 		.upper = 0xEF, /* 11101111 */
     37 		.mincp = (uint32_t)1 << 11,
     38 		.maxcp = ((uint32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
     39 	},
     40 	[3] = {
     41 		/* 11110xxx */
     42 		.lower = 0xF0, /* 11110000 */
     43 		.upper = 0xF7, /* 11110111 */
     44 		.mincp = (uint32_t)1 << 16,
     45 		.maxcp = ((uint32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
     46 	},
     47 };
     48 
     49 size_t
     50 grapheme_cp_decode(uint32_t *cp, const uint8_t *s, size_t n)
     51 {
     52 	size_t off, i;
     53 
     54 	if (n == 0) {
     55 		/* a sequence must be at least 1 byte long */
     56 		*cp = GRAPHEME_CP_INVALID;
     57 		return 1;
     58 	}
     59 
     60 	/* identify sequence type with the first byte */
     61 	for (off = 0; off < LEN(lut); off++) {
     62 		if (BETWEEN(s[0], lut[off].lower, lut[off].upper)) {
     63 			/*
     64 			 * first byte is within the bounds; fill
     65 			 * p with the the first bits contained in
     66 			 * the first byte (by subtracting the high bits)
     67 			 */
     68 			*cp = s[0] - lut[off].lower;
     69 			break;
     70 		}
     71 	}
     72 	if (off == LEN(lut)) {
     73 		/*
     74 		 * first byte does not match a sequence type;
     75 		 * set cp as invalid and return 1 byte processed
     76 		 */
     77 		*cp = GRAPHEME_CP_INVALID;
     78 		return 1;
     79 	}
     80 	if (1 + off > n) {
     81 		/*
     82 		 * input is not long enough, set cp as invalid and
     83 		 * return number of bytes needed
     84 		 */
     85 		*cp = GRAPHEME_CP_INVALID;
     86 		return 1 + off;
     87 	}
     88 
     89 	/*
     90 	 * process 'off' following bytes, each of the form 10xxxxxx
     91 	 * (i.e. between 0x80 (10000000) and 0xBF (10111111))
     92 	 */
     93 	for (i = 1; i <= off; i++) {
     94 		if(!BETWEEN(s[i], 0x80, 0xBF)) {
     95 			/*
     96 			 * byte does not match format; return
     97 			 * number of bytes processed excluding the
     98 			 * unexpected character as recommended since
     99 			 * Unicode 6 (chapter 3)
    100 			 */
    101 			*cp = GRAPHEME_CP_INVALID;
    102 			return 1 + (i - 1);
    103 		}
    104 		/*
    105 		 * shift code point by 6 bits and add the 6 stored bits
    106 		 * in s[i] to it using the bitmask 0x3F (00111111)
    107 		 */
    108 		*cp = (*cp << 6) | (s[i] & 0x3F);
    109 	}
    110 
    111 	if (*cp < lut[off].mincp ||
    112 	    BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
    113 	    *cp > UINT32_C(0x10FFFF)) {
    114 		/*
    115 		 * code point is overlong encoded in the sequence, is a
    116 		 * high or low UTF-16 surrogate half (0xD800..0xDFFF) or
    117 		 * not representable in UTF-16 (>0x10FFFF) (RFC-3629
    118 		 * specifies the latter two conditions)
    119 		 */
    120 		*cp = GRAPHEME_CP_INVALID;
    121 	}
    122 
    123 	return 1 + off;
    124 }
    125 
    126 size_t
    127 grapheme_cp_encode(uint32_t cp, uint8_t *s, size_t n)
    128 {
    129 	size_t off, i;
    130 
    131 	if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
    132 	    cp > UINT32_C(0x10FFFF)) {
    133 		/*
    134 		 * code point is a high or low UTF-16 surrogate half
    135 		 * (0xD800..0xDFFF) or not representable in UTF-16
    136 		 * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8.
    137 		 */
    138 		cp = GRAPHEME_CP_INVALID;
    139 	}
    140 
    141 	/* determine necessary sequence type */
    142 	for (off = 0; off < LEN(lut); off++) {
    143 		if (cp <= lut[off].maxcp) {
    144 			break;
    145 		}
    146 	}
    147 	if (1 + off > n) {
    148 		/* specified buffer is too small to store sequence */
    149 		return 1 + off;
    150 	}
    151 
    152 	/* build sequence by filling cp-bits into each byte */
    153 
    154 	/*
    155 	 * lut[off].lower is the bit-format for the first byte and
    156 	 * the bits to fill into it are determined by shifting the
    157 	 * cp 6 times the number of following bytes, as each
    158 	 * following byte stores 6 bits, yielding the wanted bits.
    159 	 *
    160 	 * We do not overwrite the mask because we guaranteed earlier
    161 	 * that there are no bits higher than the mask allows.
    162 	 */
    163 	s[0] = lut[off].lower | (cp >> (6 * off));
    164 
    165 	for (i = 1; i <= off; i++) {
    166 		/*
    167 		 * the bit-format for following bytes is 10000000 (0x80)
    168 		 * and it each stores 6 bits in the 6 low bits that we
    169 		 * extract from the properly-shifted value using the
    170 		 * mask 00111111 (0x3F)
    171 		 */
    172 		s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F);
    173 	}
    174 
    175 	return 1 + off;
    176 }