libgrapheme

unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

commit 4aa9cbec9fa8cc9faeddadac5f4108c367d40718
parent 031a47497bd4ef470bd48b8c9455ae4ce9d88121
Author: Laslo Hunhold <dev@frign.de>
Date:   Sat, 18 Dec 2021 01:26:53 +0100

Add manual pages for lg_utf8_*() and refactor lg_grapheme_nextbreak()

Officially document how to treat null-terminated strings and use
(size_t)-1 instead of some magic number 5. Using the maximum allowed
size indicates clearly that len is not used at all within the decoder.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
A man/lg_utf8_decode.3 | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A man/lg_utf8_encode.3 | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/grapheme.c | 6 +++---
3 files changed, 202 insertions(+), 3 deletions(-)

diff --git a/man/lg_utf8_decode.3 b/man/lg_utf8_decode.3 @@ -0,0 +1,101 @@ +.Dd 2021-12-17 +.Dt LG_UTF8_DECODE 3 +.Os suckless.org +.Sh NAME +.Nm lg_utf8_decode +.Nd decode first code point in UTF-8-encoded string +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn lg_utf8_decode "const char *str" "size_t len" "uint_least32_t *cp" +.Sh DESCRIPTION +The +.Fn lg_utf8_decode +function decodes the next code point in the UTF-8-encoded string +.Va str +of length +.Va len . +If the UTF-8-sequence is invalid (overlong encoding, unexpected byte, +string ends unexpectedly, empty string, etc.) the decoding is stopped +at the last processed byte and the decoded code point set to +.Dv LG_INVALID_CODE_POINT . +.Pp +If +.Va cp +is not +.Dv NULL +the decoded code point is stored in the memory pointed to by +.Va cp . +.Pp +Given NUL has a unique 1 byte representation, it is safe to operate on +NUL-terminated strings by setting +.Va len +to +.Dv (size_t)-1 +and terminating when +.Va cp +is 0 (see +.Sx EXAMPLES +for an example). +.Sh RETURN VALUES +The +.Fn lg_utf8_decode +function returns the number of processed bytes and 0 if +.Va str +is +.Dv NULL +or +.Va len +is 0. +If the string ends unexpectedly in a multibyte sequence, the desired +length (that is larger than +.Va len ) +is returned. +.Sh EXAMPLES +.Bd -literal +/* cc (-static) -o example example.c -lgrapheme */ +#include <grapheme.h> +#include <inttypes.h> +#include <stdio.h> + +void +print_cps(const char *str, size_t len) +{ + size_t ret, off; + uint_least32_t cp; + + for (off = 0; off < len; off += ret) { + if ((ret = lg_utf8_decode(str + off, + len - off, &cp)) > (len - off)) { + /* + * string ended unexpectedly in the middle of a + * multibyte sequence and we have the choice + * here to possibly expand str by ret - len + off + * bytes to get a full sequence, but we just + * bail out in this case. 
+ */ + break; + } + printf("%"PRIxLEAST32"\\n", cp); + } +} + +void +print_cps_nul_terminated(const char *str) +{ + size_t ret, off; + uint_least32_t cp; + + for (off = 0; (ret = lg_utf8_decode(str + off, + (size_t)-1, &cp)) > 0 && + cp != 0; off += ret) { + printf("%"PRIxLEAST32"\\n", cp); + } +} +.Ed +.Sh SEE ALSO +.Xr lg_utf8_encode 3 , +.Xr lg_grapheme_isbreak 3 , +.Xr libgrapheme 7 +.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/lg_utf8_encode.3 b/man/lg_utf8_encode.3 @@ -0,0 +1,98 @@ +.Dd 2021-12-17 +.Dt LG_UTF8_ENCODE 3 +.Os suckless.org +.Sh NAME +.Nm lg_utf8_encode +.Nd encode code point into UTF-8 string +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn lg_utf8_encode "uint_least32_t cp" "char *str" "size_t len" +.Sh DESCRIPTION +The +.Fn lg_utf8_encode +function encodes the code point +.Va cp +into a UTF-8-string. +If +.Va str +is not +.Dv NULL +and +.Va len +is large enough it writes the UTF-8-string to the memory pointed to by +.Va str . +.Sh RETURN VALUES +The +.Fn lg_utf8_encode +function returns the length (in bytes) of the UTF-8-string resulting +from encoding +.Va cp . +When the returned value is larger than +.Va len +it is indicated that the output string is too small and no data has been +written. 
+.Sh EXAMPLES +.Bd -literal +/* cc (-static) -o example example.c -lgrapheme */ +#include <grapheme.h> +#include <stddef.h> +#include <stdlib.h> + +size_t +cps_to_utf8(const uint_least32_t *cp, size_t cplen, char *str, size_t len) +{ + size_t i, off, ret; + + for (i = 0, off = 0; i < cplen; i++, off += ret) { + if ((ret = lg_utf8_encode(cp[i], str + off, + len - off)) > (len - off)) { + /* buffer too small */ + break; + } + } + + return off; +} + +size_t +cps_bytelen(const uint_least32_t *cp, size_t cplen) +{ + size_t i, len; + + for (i = 0, len = 0; i < cplen; i++) { + len += lg_utf8_encode(cp[i], NULL, 0); + } + + return len; +} + +char * +cps_to_utf8_alloc(const uint_least32_t *cp, size_t cplen) +{ + char *str; + size_t len, i, ret, off; + + len = cps_bytelen(cp, cplen); + + if (!(str = malloc(len + 1))) { + return NULL; + } + + for (i = 0, off = 0; i < cplen; i++, off += ret) { + if ((ret = lg_utf8_encode(cp[i], str + off, + len - off)) > (len - off)) { + /* buffer too small */ + break; + } + } + str[off] = '\\0'; + + return str; +} +.Ed +.Sh SEE ALSO +.Xr lg_utf8_decode 3 , +.Xr libgrapheme 7 +.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/src/grapheme.c b/src/grapheme.c @@ -197,19 +197,19 @@ lg_grapheme_nextbreak(const char *str) * miss it, even if the previous UTF-8 sequence terminates * unexpectedly, as it would either act as an unexpected byte, * saved for later, or as a null byte itself, that we can catch. - * We pass 5 to the length, as we will never read beyond + * We pass (size_t)-1 to the length, as we will never read beyond * the null byte for the reasons given above. 
*/ /* get first code point */ - len += lg_utf8_decode(str, 5, &cp0); + len += lg_utf8_decode(str, (size_t)-1, &cp0); if (cp0 == LG_INVALID_CODE_POINT) { return len; } while (cp0 != 0) { /* get next code point */ - ret = lg_utf8_decode(str + len, 5, &cp1); + ret = lg_utf8_decode(str + len, (size_t)-1, &cp1); if (cp1 == LG_INVALID_CODE_POINT || lg_grapheme_isbreak(cp0, cp1, &state)) {