libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

grapheme_decode_utf8.sh (2322B)


      1 cat << EOF
      2 .Dd ${MAN_DATE}
      3 .Dt GRAPHEME_DECODE_UTF8 3
      4 .Os suckless.org
      5 .Sh NAME
      6 .Nm grapheme_decode_utf8
      7 .Nd decode first codepoint in UTF-8-encoded string
      8 .Sh SYNOPSIS
      9 .In grapheme.h
     10 .Ft size_t
     11 .Fn grapheme_decode_utf8 "const char *str" "size_t len" "uint_least32_t *cp"
     12 .Sh DESCRIPTION
     13 The
     14 .Fn grapheme_decode_utf8
     15 function decodes the first codepoint in the UTF-8-encoded string
     16 .Va str
     17 of length
     18 .Va len .
     19 If the UTF-8 sequence is invalid (overlong encoding, unexpected byte,
     20 string ends unexpectedly, empty string, etc.), the decoding is stopped
     21 at the last processed byte and the decoded codepoint is set to
     22 .Dv GRAPHEME_INVALID_CODEPOINT .
     23 .Pp
     24 If
     25 .Va cp
     26 is not
     27 .Dv NULL
     28 the decoded codepoint is stored in the memory pointed to by
     29 .Va cp .
     30 .Pp
     31 Given that NUL has a unique 1-byte representation, it is safe to operate on
     32 NUL-terminated strings by setting
     33 .Va len
     34 to
     35 .Dv SIZE_MAX
     36 (stdint.h is already included by grapheme.h) and terminating when
     37 .Va cp
     38 is 0 (see
     39 .Sx EXAMPLES
     40 for an example).
     41 .Sh RETURN VALUES
     42 The
     43 .Fn grapheme_decode_utf8
     44 function returns the number of processed bytes, or 0 if
     45 .Va str
     46 is
     47 .Dv NULL
     48 or
     49 .Va len
     50 is 0.
     51 If the string ends unexpectedly in a multibyte sequence, the desired
     52 length (that is larger than
     53 .Va len )
     54 is returned.
     55 .Sh EXAMPLES
     56 .Bd -literal
     57 /* cc (-static) -o example example.c -lgrapheme */
     58 #include <grapheme.h>
     59 #include <inttypes.h>
     60 #include <stdio.h>
     61 
     62 void
     63 print_cps(const char *str, size_t len)
     64 {
     65 	size_t ret, off;
     66 	uint_least32_t cp;
     67 
     68 	for (off = 0; off < len; off += ret) {
     69 		if ((ret = grapheme_decode_utf8(str + off,
     70 		                                len - off, &cp)) > (len - off)) {
     71 			/*
     72 			 * string ended unexpectedly in the middle of a
     73 			 * multibyte sequence and we have the choice
     74 			 * here to possibly expand str by ret - len + off
     75 			 * bytes to get a full sequence, but we just
     76 			 * bail out in this case.
     77 			 */
     78 			break;
     79 		}
     80 		printf("%"PRIxLEAST32"\\\\n", cp);
     81 	}
     82 }
     83 
     84 void
     85 print_cps_nul_terminated(const char *str)
     86 {
     87 	size_t ret, off;
     88 	uint_least32_t cp;
     89 
     90 	for (off = 0; (ret = grapheme_decode_utf8(str + off,
     91 	                                          SIZE_MAX, &cp)) > 0 &&
     92 	     cp != 0; off += ret) {
     93 		printf("%"PRIxLEAST32"\\\\n", cp);
     94 	}
     95 }
     96 .Ed
     97 .Sh SEE ALSO
     98 .Xr grapheme_encode_utf8 3 ,
     99 .Xr libgrapheme 7
    100 .Sh AUTHORS
    101 .An Laslo Hunhold Aq Mt dev@frign.de
    102 EOF