libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

next_break_utf8.sh (2809B)


      1 # Print the mdoc(7) source of the grapheme_next_<TYPE>_break_utf8(3)
      2 # manual page to standard output.
      3 #
      4 # Variables expanded inside the here-document (none is assigned in this
      5 # script, so the caller is expected to provide them):
      6 #   MAN_DATE  argument of the .Dd macro
      7 #   TYPE      break type embedded in the function names; the values
      8 #             "character" and "line" also toggle optional paragraphs
      9 #   REALTYPE  name of the break unit used in the running text
     10 #
     11 # The here-document delimiter is unquoted: the shell expands variables
     12 # and command substitutions in the text and turns every "\\" into "\",
     13 # so the quadruple backslashes in the C example are emitted as two.
     14 cat << EOF
     15 .Dd $MAN_DATE
     16 .Dt GRAPHEME_NEXT_$(printf '%s' "$TYPE" | tr '[:lower:]' '[:upper:]')_BREAK_UTF8 3
     17 .Os suckless.org
     18 .Sh NAME
     19 .Nm grapheme_next_${TYPE}_break_utf8
     20 .Nd determine byte-offset to next $REALTYPE break
     21 .Sh SYNOPSIS
     22 .In grapheme.h
     23 .Ft size_t
     24 .Fn grapheme_next_${TYPE}_break_utf8 "const char *str" "size_t len"
     25 .Sh DESCRIPTION
     26 The
     27 .Fn grapheme_next_${TYPE}_break_utf8
     28 function computes the offset (in bytes) to the next $REALTYPE
     29 break (see
     30 .Xr libgrapheme 7 )
     31 in the UTF-8-encoded string
     32 .Va str
     33 of length
     34 .Va len .$(if [ "$TYPE" != "line" ]; then printf "\nIf a %s begins at
     35 .Va str
     36 this offset is equal to the length of said %s." "$REALTYPE" "$REALTYPE"; fi)
     37 .Pp
     38 If
     39 .Va len
     40 is set to
     41 .Dv SIZE_MAX
     42 (stdint.h is already included by grapheme.h) the string
     43 .Va str
     44 is interpreted to be NUL-terminated and processing stops when a
     45 NUL-byte is encountered.
     46 .Pp
     47 For non-UTF-8 input data$(if [ "$TYPE" = "character" ];
     48 then printf "\n.Xr grapheme_is_character_break 3
     49 and"; fi)
     50 .Xr grapheme_next_${TYPE}_break 3
     51 can be used instead.
     52 .Sh RETURN VALUES
     53 The
     54 .Fn grapheme_next_${TYPE}_break_utf8
     55 function returns the offset (in bytes) to the next $REALTYPE
     56 break in
     57 .Va str
     58 or 0 if
     59 .Va str
     60 is
     61 .Dv NULL .
     62 .Sh EXAMPLES
     63 .Bd -literal
     64 /* cc (-static) -o example example.c -lgrapheme */
     65 #include <grapheme.h>
     66 #include <stdint.h>
     67 #include <stdio.h>
     68 
     69 int
     70 main(void)
     71 {
     72 	/* UTF-8 encoded input */
     73 	char *s = "T\\\\xC3\\\\xABst \\\\xF0\\\\x9F\\\\x91\\\\xA8\\\\xE2\\\\x80\\\\x8D\\\\xF0"
     74 	          "\\\\x9F\\\\x91\\\\xA9\\\\xE2\\\\x80\\\\x8D\\\\xF0\\\\x9F\\\\x91\\\\xA6 \\\\xF0"
     75 	          "\\\\x9F\\\\x87\\\\xBA\\\\xF0\\\\x9F\\\\x87\\\\xB8 \\\\xE0\\\\xA4\\\\xA8\\\\xE0"
     76 	          "\\\\xA5\\\\x80 \\\\xE0\\\\xAE\\\\xA8\\\\xE0\\\\xAE\\\\xBF!";
     77 	size_t ret, len, off;
     78 
     79 	printf("Input: \\\\"%s\\\\"\\\\n", s);
     80 
     81 	/* print each $REALTYPE with byte-length */
     82 	printf("${REALTYPE}s in NUL-delimited input:\\\\n");
     83 	for (off = 0; s[off] != '\\\\0'; off += ret) {
     84 		ret = grapheme_next_${TYPE}_break_utf8(s + off, SIZE_MAX);
     85 		printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off);
     86 	}
     87 	printf("\\\\n");
     88 
     89 	/* do the same, but this time string is length-delimited */
     90 	len = 17;
     91 	printf("${REALTYPE}s in input delimited to %zu bytes:\\\\n", len);
     92 	for (off = 0; off < len; off += ret) {
     93 		ret = grapheme_next_${TYPE}_break_utf8(s + off, len - off);
     94 		printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off);
     95 	}
     96 
     97 	return 0;
     98 }
     99 .Ed
    100 .Sh SEE ALSO$(if [ "$TYPE" = "character" ];
    101 then printf "\n.Xr grapheme_is_character_break 3 ,"; fi)
    102 .Xr grapheme_next_${TYPE}_break 3 ,
    103 .Xr libgrapheme 7
    104 .Sh STANDARDS
    105 .Fn grapheme_next_${TYPE}_break_utf8
    106 is compliant with the Unicode 14.0.0 specification.
    107 .Sh AUTHORS
    108 .An Laslo Hunhold Aq Mt dev@frign.de
    109 EOF