libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

commit 497b500df21812b49729ff9514dd81dac29ec940
parent 687329edd3c31c2b2f89af79fd2ff5107bf2e31f
Author: Laslo Hunhold <dev@frign.de>
Date:   Wed, 15 Dec 2021 10:59:42 +0100

Refactor manual pages, document lg_grapheme_isbreak()

In particular, simplify the given example in lg_grapheme_nextbreak().

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M Makefile | 2 +-
D man/grapheme_bytelen.3 | 85 -------------------------------------------------------------------------------
A man/lg_grapheme_isbreak.3 | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A man/lg_grapheme_nextbreak.3 | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M man/libgrapheme.7 | 32 ++++++++++----------------------
5 files changed, 160 insertions(+), 108 deletions(-)

diff --git a/Makefile b/Makefile @@ -12,7 +12,7 @@ GEN = gen/grapheme gen/grapheme-test LIB = src/grapheme src/utf8 src/util TEST = test/grapheme test/grapheme-performance test/utf8-decode test/utf8-encode -MAN3 = man/grapheme_bytelen.3 +MAN3 = man/lg_grapheme_isbreak.3 man/lg_grapheme_nextbreak.3 MAN7 = man/libgrapheme.7 all: libgrapheme.a libgrapheme.so diff --git a/man/grapheme_bytelen.3 b/man/grapheme_bytelen.3 @@ -1,85 +0,0 @@ -.Dd 2020-10-12 -.Dt GRAPHEME_BYTELEN 3 -.Os suckless.org -.Sh NAME -.Nm grapheme_bytelen -.Nd compute grapheme cluster byte-length -.Sh SYNOPSIS -.In grapheme.h -.Ft size_t -.Fn grapheme_bytelen "const char *str" -.Sh DESCRIPTION -The -.Fn grapheme_bytelen -function computes the length (in bytes) of the grapheme cluster -(see -.Xr libgrapheme 7 ) -beginning at the UTF-8-encoded NUL-terminated string -.Va str . -.Sh RETURN VALUES -The -.Fn grapheme_bytelen -function returns the length (in bytes) of the grapheme cluster beginning -at -.Va str -or 0 if -.Va str -is -.Dv NULL . 
-.Sh EXAMPLES -.Bd -literal -/* cc (-static) -o example example.c -lgrapheme */ -#include <grapheme.h> -#include <stdio.h> - -int -main(void) -{ - /* UTF-8 encoded input */ - char *s = - "T" - "\\xC3\\xAB" /* U+000EB LATIN SMALL LETTER E - WITH DIAERESIS */ - "s" - "t" - " " - "\\xF0\\x9F\\x91\\xA8" /* U+1F468 MAN */ - "\\xE2\\x80\\x8D" /* U+0200D ZERO WIDTH JOINER */ - "\\xF0\\x9F\\x91\\xA9" /* U+1F469 WOMAN */ - "\\xE2\\x80\\x8D" /* U+0200D ZERO WIDTH JOINER */ - "\\xF0\\x9F\\x91\\xA6" /* U+1F466 BOY */ - " " - "\\xF0\\x9F\\x87\\xBA" /* U+1F1FA REGIONAL INDICATOR - SYMBOL LETTER U */ - "\\xF0\\x9F\\x87\\xB8" /* U+1F1F8 REGIONAL INDICATOR - SYMBOL LETTER S */ - " " - "\\xE0\\xA4\\xA8" /* U+00928 DEVANAGARI LETTER NA */ - "\\xE0\\xA5\\x80" /* U+00940 DEVANAGARI VOWEL - SIGN II */ - " " - "\\xE0\\xAE\\xA8" /* U+00BA8 TAMIL LETTER NA */ - "\\xE0\\xAE\\xBF" /* U+00BBF TAMIL VOWEL SIGN I */ - "!"; - size_t len; - - /* print input string */ - printf("Input: %s\\n", s); - - /* print each grapheme cluster with accompanying byte-length */ - while (*s != '\\0') { - len = grapheme_bytelen(s); - printf("%2zu byte(s) | %.*s\\n", len, (int)len, s, len); - s += len; - } - - return 0; -} -.Ed -.Sh SEE ALSO -.Xr libgrapheme 7 -.Sh STANDARDS -.Fn grapheme_bytelen -is compliant with the Unicode 13.0.0 specification. -.Sh AUTHORS -.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/lg_grapheme_isbreak.3 b/man/lg_grapheme_isbreak.3 @@ -0,0 +1,79 @@ +.Dd 2021-12-15 +.Dt LG_GRAPHEME_ISBREAK 3 +.Os suckless.org +.Sh NAME +.Nm lg_grapheme_isbreak +.Nd test for a grapheme cluster break between two code points +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn lg_grapheme_isbreak "uint_least32_t a, uint_least32_t b, LG_SEGMENTATION_STATE *state" +.Sh DESCRIPTION +The +.Fn lg_grapheme_isbreak +function determines if there is a grapheme cluster break (see +.Xr libgrapheme 7 ) +between the two code points +.Va a +and +.Va b . 
+By specification this decision depends on a +.Va state +that can at most be completely reset after detecting a break and must +be reset every time one deviates from sequential processing. +.Pp +If +.Va state +is +.Dv NULL +.Fn lg_grapheme_isbreak +behaves as if it was called with a fully reset state. +.Sh RETURN VALUES +.Fn lg_grapheme_isbreak +returns +.Va true +if there is a grapheme cluster break between the code points +.Va a +and +.Va b +and +.Va false +if there is not. +.Sh EXAMPLES +.Bd -literal +/* cc (-static) -o example example.c -lgrapheme */ +#include <grapheme.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> + +int +main(void) +{ + LG_SEGMENTATION_STATE state = { 0 }; + uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */ + size_t i; + + for (i = 0; i + 1 < sizeof(s1) / sizeof(*s1); i++) { + if (lg_grapheme_isbreak(s[i], s[i + 1], &state)) { + printf("break in s1 at offset %zu\n", i); + } + } + memset(&state, 0, sizeof(state)); /* reset state */ + for (i = 0; i + 1 < sizeof(s2) / sizeof(*s2); i++) { + if (lg_grapheme_isbreak(s[i], s[i + 1], &state)) { + printf("break in s2 at offset %zu\n", i); + } + } + + return 0; +} +.Ed +.Sh SEE ALSO +.Xr lg_grapheme_nextbreak 3 , +.Xr libgrapheme 7 +.Sh STANDARDS +.Fn lg_grapheme_isbreak +is compliant with the Unicode 14.0.0 specification. +.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/lg_grapheme_nextbreak.3 b/man/lg_grapheme_nextbreak.3 @@ -0,0 +1,70 @@ +.Dd 2021-12-15 +.Dt LG_GRAPHEME_NEXTBREAK 3 +.Os suckless.org +.Sh NAME +.Nm lg_grapheme_nextbreak +.Nd determine byte-offset to next grapheme cluster break +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn lg_grapheme_nextbreak "const uint8_t *str" +.Sh DESCRIPTION +.Fn lg_grapheme_nextbreak +computes the offset (in bytes) to the next grapheme +cluster break (see +.Xr libgrapheme 7 ) +in the UTF-8-encoded NUL-terminated string +.Va str . 
+If a grapheme cluster begins at +.Va str +this offset is equal to the length of said grapheme cluster. +.Pp +For non-UTF-8 input data +.Xr lg_grapheme_isbreak 3 +can be used instead. +.Sh RETURN VALUES +.Fn lg_grapheme_nextbreak +returns the offset (in bytes) to the next grapheme cluster +break in +.Va str +or 0 if +.Va str +is +.Dv NULL . +.Sh EXAMPLES +.Bd -literal +/* cc (-static) -o example example.c -lgrapheme */ +#include <grapheme.h> +#include <stdint.h> +#include <stdio.h> + +int +main(void) +{ + /* UTF-8 encoded input */ + char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0" + "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0" + "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0" + "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!"; + size_t len; + + printf("Input: \\"%s\\"\\n", s); + + /* print each grapheme cluster with byte-length */ + for (; *s != '\\0';) { + len = lg_grapheme_nextbreak((uint8_t *)s); + printf("%2zu bytes | %.*s\\n", len, (int)len, s, len); + s += len; + } + + return 0; +} +.Ed +.Sh SEE ALSO +.Xr lg_grapheme_isbreak 3 , +.Xr libgrapheme 7 +.Sh STANDARDS +.Fn lg_grapheme_nextbreak +is compliant with the Unicode 14.0.0 specification. +.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/libgrapheme.7 b/man/libgrapheme.7 @@ -1,39 +1,27 @@ -.Dd 2020-10-12 +.Dd 2021-12-15 .Dt LIBGRAPHEME 7 .Os suckless.org .Sh NAME .Nm libgrapheme -.Nd grapheme cluster library +.Nd unicode library .Sh SYNOPSIS .In grapheme.h .Sh DESCRIPTION The .Nm -library provides functions to properly separate a string into -user-perceived characters +library provides functions to properly handle user-perceived characters .Dq ( grapheme clusters , see .Sx MOTIVATION ) -using the Unicode grapheme cluster breaking algorithm (UAX #29). 
-.Pp -You can either count the length (in bytes) of the grapheme cluster at -the beginning of an UTF-8-encoded string (see -.Xr grapheme_bytelen 3 ) -or determine if a grapheme cluster breaks between two Unicode code -points (see -.Xr grapheme_boundary 3 ) , -while a safe UTF-8-de/encoder for the latter purpose is provided (see -.Xr grapheme_cp_decode 3 -and -.Xr grapheme_cp_encode 3 ) . +according to the Unicode specification. .Sh SEE ALSO -.Xr grapheme_boundary 3 , -.Xr grapheme_bytelen 3 -.Xr grapheme_cp_decode 3 , -.Xr grapheme_cp_encode 3 , +.Xr lg_grapheme_isbreak 3 , +.Xr lg_grapheme_nextbreak 3 , +.Xr lg_utf8_decode 3 , +.Xr lg_utf8_encode 3 .Sh STANDARDS .Nm -is compliant with the Unicode 13.0.0 specification. +is compliant with the Unicode 14.0.0 specification. .Sh MOTIVATION The idea behind every character encoding scheme like ASCII or Unicode is to express abstract characters (which can be thought of as shapes @@ -118,7 +106,7 @@ is continued between two code points. Libraries like ICU, which also offer this functionality, are often bloated, not correct, difficult to use or not statically linkable. The motivation behind .Nm -is to make grapheme cluster handling suck less and abide by the UNIX +is to make unicode handling suck less and abide by the UNIX philosophy. .Sh AUTHORS .An Laslo Hunhold Aq Mt dev@frign.de