libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

grapheme_next_word_break_utf8.3 (2308B)


      1 .Dd 2022-08-26
      2 .Dt GRAPHEME_NEXT_WORD_BREAK_UTF8 3
      3 .Os suckless.org
      4 .Sh NAME
      5 .Nm grapheme_next_word_break_utf8
      6 .Nd determine byte-offset to next word break
      7 .Sh SYNOPSIS
      8 .In grapheme.h
      9 .Ft size_t
     10 .Fn grapheme_next_word_break_utf8 "const char *str" "size_t len"
     11 .Sh DESCRIPTION
     12 The
     13 .Fn grapheme_next_word_break_utf8
     14 function computes the offset (in bytes) to the next word
     15 break (see
     16 .Xr libgrapheme 7 )
     17 in the UTF-8-encoded string
     18 .Va str
     19 of length
     20 .Va len .
     21 If a word begins at
     22 .Va str
     23 this offset is equal to the length of said word.
     24 .Pp
     25 If
     26 .Va len
     27 is set to
     28 .Dv SIZE_MAX
     29 (stdint.h is already included by grapheme.h) the string
     30 .Va str
     31 is interpreted to be NUL-terminated and processing stops when a
     32 NUL-byte is encountered.
     33 .Pp
     34 For non-UTF-8 input data
     35 .Xr grapheme_next_word_break 3
     36 can be used instead.
     37 .Sh RETURN VALUES
     38 The
     39 .Fn grapheme_next_word_break_utf8
     40 function returns the offset (in bytes) to the next word
     41 break in
     42 .Va str
     43 or 0 if
     44 .Va str
     45 is
     46 .Dv NULL .
     47 .Sh EXAMPLES
     48 .Bd -literal
     49 /* cc (-static) -o example example.c -lgrapheme */
     50 #include <grapheme.h>
     51 #include <stdint.h>
     52 #include <stdio.h>
     53 
     54 int
     55 main(void)
     56 {
     57 	/* UTF-8 encoded input */
     58 	char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0"
     59 	          "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
     60 	          "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
     61 	          "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
     62 	size_t ret, len, off;
     63 
     64 	printf("Input: \\"%s\\"\\n", s);
     65 
     66 	/* print each word with byte-length */
     67 	printf("Words in NUL-delimited input:\\n");
     68 	for (off = 0; s[off] != '\\0'; off += ret) {
     69 		ret = grapheme_next_word_break_utf8(s + off, SIZE_MAX);
     70 		printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
     71 	}
     72 	printf("\\n");
     73 
     74 	/* do the same, but this time string is length-delimited */
     75 	len = 17;
     76 	printf("Words in input delimited to %zu bytes:\\n", len);
     77 	for (off = 0; off < len; off += ret) {
     78 		ret = grapheme_next_word_break_utf8(s + off, len - off);
     79 		printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
     80 	}
     81 
     82 	return 0;
     83 }
     84 .Ed
     85 .Sh SEE ALSO
     86 .Xr grapheme_next_word_break 3 ,
     87 .Xr libgrapheme 7
     88 .Sh STANDARDS
     89 .Fn grapheme_next_word_break_utf8
     90 is compliant with the Unicode 14.0.0 specification.
     91 .Sh AUTHORS
     92 .An Laslo Hunhold Aq Mt dev@frign.de