Add manual pages for lg_utf8_*() and refactor lg_grapheme_nextbreak() - libgrapheme

commit 4aa9cbec9fa8cc9faeddadac5f4108c367d40718
parent 031a47497bd4ef470bd48b8c9455ae4ce9d88121
Author: Laslo Hunhold <dev@frign.de>
Date:   Sat, 18 Dec 2021 01:26:53 +0100

Add manual pages for lg_utf8_*() and refactor lg_grapheme_nextbreak()

Officially document how to treat null-terminated strings and use
(size_t)-1 instead of some magic number 5. Using the maximum allowed
size indicates clearly that len is not used at all within the decoder.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
A man/lg_utf8_decode.3  | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A man/lg_utf8_encode.3  | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/grapheme.c  | 6 +++---

3 files changed, 202 insertions(+), 3 deletions(-)
diff --git a/man/lg_utf8_decode.3 b/man/lg_utf8_decode.3
@@ -0,0 +1,101 @@
+.Dd 2021-12-17
+.Dt LG_UTF8_DECODE 3
+.Os suckless.org
+.Sh NAME
+.Nm lg_utf8_decode
+.Nd decode first code point in UTF-8-encoded string
+.Sh SYNOPSIS
+.In grapheme.h
+.Ft size_t
+.Fn lg_utf8_decode "const char *str" "size_t len" "uint_least32_t *cp"
+.Sh DESCRIPTION
+The
+.Fn lg_utf8_decode
+function decodes the next code point in the UTF-8-encoded string
+.Va str
+of length
+.Va len .
+If the UTF-8-sequence is invalid (overlong encoding, unexpected byte,
+string ends unexpectedly, empty string, etc.) the decoding is stopped
+at the last processed byte and the decoded code point set to
+.Dv LG_INVALID_CODE_POINT .
+.Pp
+If
+.Va cp
+is not
+.Dv NULL
+the decoded code point is stored in the memory pointed to by
+.Va cp .
+.Pp
+Given NUL has a unique 1 byte representation, it is safe to operate on
+NUL-terminated strings by setting
+.Va len
+to
+.Dv (size_t)-1
+and terminating when
+.Va cp
+is 0 (see
+.Sx EXAMPLES
+for an example).
+.Sh RETURN VALUES
+The
+.Fn lg_utf8_decode
+function returns the number of processed bytes, or 0 if
+.Va str
+is
+.Dv NULL
+or
+.Va len
+is 0.
+If the string ends unexpectedly in a multibyte sequence, the desired
+length (that is larger than
+.Va len )
+is returned.
+.Sh EXAMPLES
+.Bd -literal
+/* cc (-static) -o example example.c -lgrapheme */
+#include <grapheme.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+void
+print_cps(const char *str, size_t len)
+{
+	uint_least32_t cp;
+	size_t pos = 0, used, left;
+
+	/* walk the buffer one decoded code point at a time */
+	while (pos < len) {
+		left = len - pos;
+		used = lg_utf8_decode(str + pos, left, &cp);
+
+		if (used > left) {
+			/*
+			 * the buffer stops in the middle of a multibyte
+			 * sequence; used - left more bytes would be
+			 * needed to complete it, but this example
+			 * simply stops here
+			 */
+			break;
+		}
+
+		printf("%"PRIxLEAST32"\\n", cp);
+		pos += used;
+	}
+}
+
+void
+print_cps_nul_terminated(const char *str)
+{
+	uint_least32_t cp;
+	size_t pos = 0, used;
+
+	/*
+	 * no length is known, so (size_t)-1 is passed and the loop
+	 * ends at the NUL code point (or when nothing was processed)
+	 */
+	for (;;) {
+		used = lg_utf8_decode(str + pos, (size_t)-1, &cp);
+		if (used == 0 || cp == 0) {
+			break;
+		}
+
+		printf("%"PRIxLEAST32"\\n", cp);
+		pos += used;
+	}
+}
+.Ed
+.Sh SEE ALSO
+.Xr lg_utf8_encode 3 ,
+.Xr lg_grapheme_isbreak 3 ,
+.Xr libgrapheme 7
+.Sh AUTHORS
+.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/man/lg_utf8_encode.3 b/man/lg_utf8_encode.3
@@ -0,0 +1,98 @@
+.Dd 2021-12-17
+.Dt LG_UTF8_ENCODE 3
+.Os suckless.org
+.Sh NAME
+.Nm lg_utf8_encode
+.Nd encode code point into UTF-8 string
+.Sh SYNOPSIS
+.In grapheme.h
+.Ft size_t
+.Fn lg_utf8_encode "uint_least32_t cp" "char *str" "size_t len"
+.Sh DESCRIPTION
+The
+.Fn lg_utf8_encode
+function encodes the code point
+.Va cp
+into a UTF-8-string.
+If
+.Va str
+is not
+.Dv NULL
+and
+.Va len
+is large enough it writes the UTF-8-string to the memory pointed to by
+.Va str .
+.Sh RETURN VALUES
+The
+.Fn lg_utf8_encode
+function returns the length (in bytes) of the UTF-8-string resulting
+from encoding
+.Va cp .
+When the returned value is larger than
+.Va len ,
+the output buffer is too small and no data has been
+written.
+.Sh EXAMPLES
+.Bd -literal
+/* cc (-static) -o example example.c -lgrapheme */
+#include <grapheme.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+size_t
+cps_to_utf8(const uint_least32_t *cp, size_t cplen, char *str, size_t len)
+{
+	size_t n, written = 0, need, room;
+
+	for (n = 0; n < cplen; n++) {
+		room = len - written;
+		need = lg_utf8_encode(cp[n], str + written, room);
+
+		if (need > room) {
+			/* remaining buffer cannot hold this code point */
+			break;
+		}
+		written += need;
+	}
+
+	/* number of bytes stored in str */
+	return written;
+}
+
+size_t
+cps_bytelen(const uint_least32_t *cp, size_t cplen)
+{
+	size_t total = 0;
+
+	/* a NULL buffer of length 0 only queries the encoded length */
+	for (; cplen > 0; cplen--, cp++) {
+		total += lg_utf8_encode(*cp, NULL, 0);
+	}
+
+	return total;
+}
+
+/*
+ * encode the cplen code points in cp into a freshly allocated,
+ * NUL-terminated UTF-8 string; returns NULL if the allocation
+ * fails, otherwise the caller has to free() the result
+ */
+char *
+cps_to_utf8_alloc(const uint_least32_t *cp, size_t cplen)
+{
+	char *str;
+	size_t len, i, ret, off;
+
+	/* number of bytes the encoded code points occupy */
+	len = cps_bytelen(cp, cplen);
+
+	/* one extra byte for the terminating NUL written below */
+	if (!(str = malloc(len + 1))) {
+		return NULL;
+	}
+
+	for (i = 0, off = 0; i < cplen; i++, off += ret) {
+		if ((ret = lg_utf8_encode(cp[i], str + off,
+		                          len - off)) > (len - off)) {
+			/* buffer too small */
+			break;
+		}
+	}
+	/* off is at most len, so this stays inside the allocation */
+	str[off] = '\\0';
+
+	return str;
+}
+.Ed
+.Sh SEE ALSO
+.Xr lg_utf8_decode 3 ,
+.Xr libgrapheme 7
+.Sh AUTHORS
+.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/src/grapheme.c b/src/grapheme.c
@@ -197,19 +197,19 @@ lg_grapheme_nextbreak(const char *str)
 	 * miss it, even if the previous UTF-8 sequence terminates
 	 * unexpectedly, as it would either act as an unexpected byte,
 	 * saved for later, or as a null byte itself, that we can catch.
-	 * We pass 5 to the length, as we will never read beyond
+	 * We pass (size_t)-1 to the length, as we will never read beyond
 	 * the null byte for the reasons given above.
 	 */
 
 	/* get first code point */
-	len += lg_utf8_decode(str, 5, &cp0);
+	len += lg_utf8_decode(str, (size_t)-1, &cp0);
 	if (cp0 == LG_INVALID_CODE_POINT) {
 		return len;
 	}
 
 	while (cp0 != 0) {
 		/* get next code point */
-		ret = lg_utf8_decode(str + len, 5, &cp1);
+		ret = lg_utf8_decode(str + len, (size_t)-1, &cp1);
 
 		if (cp1 == LG_INVALID_CODE_POINT ||
 		    lg_grapheme_isbreak(cp0, cp1, &state)) {

	libgrapheme unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log \| Files \| Refs \| README \| LICENSE

A	man/lg_utf8_decode.3	\|	101	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	man/lg_utf8_encode.3	\|	98	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	src/grapheme.c	\|	6	+++---