libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

grapheme_decode_utf8.3 (2337B)


      1 .Dd 2021-12-22
      2 .Dt GRAPHEME_DECODE_UTF8 3
      3 .Os suckless.org
      4 .Sh NAME
      5 .Nm grapheme_decode_utf8
      6 .Nd decode first codepoint in UTF-8-encoded string
      7 .Sh SYNOPSIS
      8 .In grapheme.h
      9 .Ft size_t
     10 .Fn grapheme_decode_utf8 "const char *str" "size_t len" "uint_least32_t *cp"
     11 .Sh DESCRIPTION
     12 The
     13 .Fn grapheme_decode_utf8
     14 function decodes the next codepoint in the UTF-8-encoded string
     15 .Va str
     16 of length
     17 .Va len .
     18 If the UTF-8 sequence is invalid (overlong encoding, unexpected byte,
     19 string ends unexpectedly, empty string, etc.) the decoding is stopped
     20 at the last processed byte and the decoded codepoint is set to
     21 .Dv GRAPHEME_INVALID_CODEPOINT .
     22 .Pp
     23 If
     24 .Va cp
     25 is not
     26 .Dv NULL
     27 the decoded codepoint is stored in the memory pointed to by
     28 .Va cp .
     29 .Pp
     30 Given that NUL has a unique 1-byte representation, it is safe to operate on
     31 NUL-terminated strings by setting
     32 .Va len
     33 to
     34 .Dv SIZE_MAX
     35 (stdint.h is already included by grapheme.h) and terminating when
     36 .Va cp
     37 is 0 (see
     38 .Sx EXAMPLES
     39 for an example).
     40 .Sh RETURN VALUES
     41 The
     42 .Fn grapheme_decode_utf8
     43 function returns the number of processed bytes, or 0 if
     44 .Va str
     45 is
     46 .Dv NULL
     47 or
     48 .Va len
     49 is 0.
     50 If the string ends unexpectedly in a multibyte sequence, the length
     51 required to hold the complete sequence (which is larger than
     52 .Va len )
     53 is returned.
     54 .Sh EXAMPLES
     55 .Bd -literal
     56 /* cc (-static) -o example example.c -lgrapheme */
     57 #include <grapheme.h>
     58 #include <inttypes.h>
     59 #include <stdio.h>
     60 
     61 void
     62 print_cps(const char *str, size_t len)
     63 {
     64 	size_t ret, off;
     65 	uint_least32_t cp;
     66 
     67 	for (off = 0; off < len; off += ret) {
     68 		if ((ret = grapheme_decode_utf8(str + off,
     69 		                                len - off, &cp)) > (len - off)) {
     70 			/*
     71 			 * string ended unexpectedly in the middle of a
     72 			 * multibyte sequence and we have the choice
     73 			 * here to possibly expand str by ret - len + off
     74 			 * bytes to get a full sequence, but we just
     75 			 * bail out in this case.
     76 			 */
     77 			break;
     78 		}
     79 		printf("%"PRIxLEAST32"\\n", cp);
     80 	}
     81 }
     82 
     83 void
     84 print_cps_nul_terminated(const char *str)
     85 {
     86 	size_t ret, off;
     87 	uint_least32_t cp;
     88 
     89 	for (off = 0; (ret = grapheme_decode_utf8(str + off,
     90 	                                          SIZE_MAX, &cp)) > 0 &&
     91 	     cp != 0; off += ret) {
     92 		printf("%"PRIxLEAST32"\\n", cp);
     93 	}
     94 }
     95 .Ed
     96 .Sh SEE ALSO
     97 .Xr grapheme_encode_utf8 3 ,
     98 .Xr grapheme_is_character_break 3 ,
     99 .Xr libgrapheme 7
    100 .Sh AUTHORS
    101 .An Laslo Hunhold Aq Mt dev@frign.de