libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

grapheme_next_sentence_break_utf8.3 (2368B)


      1 .Dd 2022-08-26
      2 .Dt GRAPHEME_NEXT_SENTENCE_BREAK_UTF8 3
      3 .Os suckless.org
      4 .Sh NAME
      5 .Nm grapheme_next_sentence_break_utf8
      6 .Nd determine byte-offset to next sentence break
      7 .Sh SYNOPSIS
      8 .In grapheme.h
      9 .Ft size_t
     10 .Fn grapheme_next_sentence_break_utf8 "const char *str" "size_t len"
     11 .Sh DESCRIPTION
     12 The
     13 .Fn grapheme_next_sentence_break_utf8
     14 function computes the offset (in bytes) to the next sentence
     15 break (see
     16 .Xr libgrapheme 7 )
     17 in the UTF-8-encoded string
     18 .Va str
     19 of length
     20 .Va len .
     21 If a sentence begins at
     22 .Va str
     23 this offset is equal to the length of said sentence.
     24 .Pp
     25 If
     26 .Va len
     27 is set to
     28 .Dv SIZE_MAX
     29 (stdint.h is already included by grapheme.h) the string
     30 .Va str
     31 is interpreted to be NUL-terminated and processing stops when a
     32 NUL-byte is encountered.
     33 .Pp
     34 For non-UTF-8 input data
     35 .Xr grapheme_next_sentence_break 3
     36 can be used instead.
     37 .Sh RETURN VALUES
     38 The
     39 .Fn grapheme_next_sentence_break_utf8
     40 function returns the offset (in bytes) to the next sentence
     41 break in
     42 .Va str
     43 or 0 if
     44 .Va str
     45 is
     46 .Dv NULL .
     47 .Sh EXAMPLES
     48 .Bd -literal
     49 /* cc (-static) -o example example.c -lgrapheme */
     50 #include <grapheme.h>
     51 #include <stdint.h>
     52 #include <stdio.h>
     53 
     54 int
     55 main(void)
     56 {
     57 	/* UTF-8 encoded input */
     58 	char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0"
     59 	          "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
     60 	          "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
     61 	          "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
     62 	size_t ret, len, off;
     63 
     64 	printf("Input: \\"%s\\"\\n", s);
     65 
     66 	/* print each sentence with byte-length */
     67 	printf("Sentences in NUL-delimited input:\\n");
     68 	for (off = 0; s[off] != '\\0'; off += ret) {
     69 		ret = grapheme_next_sentence_break_utf8(s + off, SIZE_MAX);
     70 		printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
     71 	}
     72 	printf("\\n");
     73 
     74 	/* do the same, but this time string is length-delimited */
     75 	len = 17;
     76 	printf("Sentences in input delimited to %zu bytes:\\n", len);
     77 	for (off = 0; off < len; off += ret) {
     78 		ret = grapheme_next_sentence_break_utf8(s + off, len - off);
     79 		printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
     80 	}
     81 
     82 	return 0;
     83 }
     84 .Ed
     85 .Sh SEE ALSO
     86 .Xr grapheme_next_sentence_break 3 ,
     87 .Xr libgrapheme 7
     88 .Sh STANDARDS
     89 .Fn grapheme_next_sentence_break_utf8
     90 is compliant with the Unicode 14.0.0 specification.
     91 .Sh AUTHORS
     92 .An Laslo Hunhold Aq Mt dev@frign.de