commit 4aa9cbec9fa8cc9faeddadac5f4108c367d40718
parent 031a47497bd4ef470bd48b8c9455ae4ce9d88121
Author: Laslo Hunhold <dev@frign.de>
Date: Sat, 18 Dec 2021 01:26:53 +0100
Add manual pages for lg_utf8_*() and refactor lg_grapheme_nextbreak()
Officially document how to treat null-terminated strings and use
(size_t)-1 instead of some magic number 5. Using the maximum allowed
size indicates clearly that len is not used at all within the decoder.
Signed-off-by: Laslo Hunhold <dev@frign.de>
Diffstat:
3 files changed, 202 insertions(+), 3 deletions(-)
diff --git a/man/lg_utf8_decode.3 b/man/lg_utf8_decode.3
@@ -0,0 +1,101 @@
+.Dd 2021-12-17
+.Dt LG_UTF8_DECODE 3
+.Os suckless.org
+.Sh NAME
+.Nm lg_utf8_decode
+.Nd decode first code point in UTF-8-encoded string
+.Sh SYNOPSIS
+.In grapheme.h
+.Ft size_t
+.Fn lg_utf8_decode "const char *str" "size_t len" "uint_least32_t *cp"
+.Sh DESCRIPTION
+The
+.Fn lg_utf8_decode
+function decodes the next code point in the UTF-8-encoded string
+.Va str
+of length
+.Va len .
+If the UTF-8-sequence is invalid (overlong encoding, unexpected byte,
+string ends unexpectedly, empty string, etc.) the decoding is stopped
+at the last processed byte and the decoded code point set to
+.Dv LG_INVALID_CODE_POINT .
+.Pp
+If
+.Va cp
+is not
+.Dv NULL
+the decoded code point is stored in the memory pointed to by
+.Va cp .
+.Pp
+Given NUL has a unique 1 byte representation, it is safe to operate on
+NUL-terminated strings by setting
+.Va len
+to
+.Dv (size_t)-1
+and terminating when
+.Va cp
+is 0 (see
+.Sx EXAMPLES
+for an example).
+.Sh RETURN VALUES
+The
+.Fn lg_utf8_decode
+function returns the number of processed bytes, or 0 if
+.Va str
+is
+.Dv NULL
+or
+.Va len
+is 0.
+If the string ends unexpectedly in a multibyte sequence, the desired
+length (that is larger than
+.Va len )
+is returned.
+.Sh EXAMPLES
+.Bd -literal
+/* cc (-static) -o example example.c -lgrapheme */
+#include <grapheme.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+void
+print_cps(const char *str, size_t len)
+{
+ size_t ret, off;
+ uint_least32_t cp;
+
+ for (off = 0; off < len; off += ret) {
+ if ((ret = lg_utf8_decode(str + off,
+ len - off, &cp)) > (len - off)) {
+ /*
+ * string ended unexpectedly in the middle of a
+ * multibyte sequence and we have the choice
+ * here to possibly expand str by ret - len + off
+ * bytes to get a full sequence, but we just
+ * bail out in this case.
+ */
+ break;
+ }
+ printf("%"PRIxLEAST32"\\n", cp);
+ }
+}
+
+void
+print_cps_nul_terminated(const char *str)
+{
+ size_t ret, off;
+ uint_least32_t cp;
+
+ for (off = 0; (ret = lg_utf8_decode(str + off,
+ (size_t)-1, &cp)) > 0 &&
+ cp != 0; off += ret) {
+ printf("%"PRIxLEAST32"\\n", cp);
+ }
+}
+.Ed
+.Sh SEE ALSO
+.Xr lg_utf8_encode 3 ,
+.Xr lg_grapheme_isbreak 3 ,
+.Xr libgrapheme 7
+.Sh AUTHORS
+.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/man/lg_utf8_encode.3 b/man/lg_utf8_encode.3
@@ -0,0 +1,98 @@
+.Dd 2021-12-17
+.Dt LG_UTF8_ENCODE 3
+.Os suckless.org
+.Sh NAME
+.Nm lg_utf8_encode
+.Nd encode code point into UTF-8 string
+.Sh SYNOPSIS
+.In grapheme.h
+.Ft size_t
+.Fn lg_utf8_encode "uint_least32_t cp" "char *str" "size_t len"
+.Sh DESCRIPTION
+The
+.Fn lg_utf8_encode
+function encodes the code point
+.Va cp
+into a UTF-8-string.
+If
+.Va str
+is not
+.Dv NULL
+and
+.Va len
+is large enough it writes the UTF-8-string to the memory pointed to by
+.Va str .
+.Sh RETURN VALUES
+The
+.Fn lg_utf8_encode
+function returns the length (in bytes) of the UTF-8-string resulting
+from encoding
+.Va cp .
+When the returned value is larger than
+.Va len ,
+the output buffer is too small and no data has been
+written.
+.Sh EXAMPLES
+.Bd -literal
+/* cc (-static) -o example example.c -lgrapheme */
+#include <grapheme.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+size_t
+cps_to_utf8(const uint_least32_t *cp, size_t cplen, char *str, size_t len)
+{
+ size_t i, off, ret;
+
+ for (i = 0, off = 0; i < cplen; i++, off += ret) {
+ if ((ret = lg_utf8_encode(cp[i], str + off,
+ len - off)) > (len - off)) {
+ /* buffer too small */
+ break;
+ }
+ }
+
+ return off;
+}
+
+size_t
+cps_bytelen(const uint_least32_t *cp, size_t cplen)
+{
+ size_t i, len;
+
+ for (i = 0, len = 0; i < cplen; i++) {
+ len += lg_utf8_encode(cp[i], NULL, 0);
+ }
+
+ return len;
+}
+
+char *
+cps_to_utf8_alloc(const uint_least32_t *cp, size_t cplen)
+{
+ char *str;
+ size_t len, i, ret, off;
+
+ len = cps_bytelen(cp, cplen);
+
+ if (!(str = malloc(len + 1))) {
+ return NULL;
+ }
+
+ for (i = 0, off = 0; i < cplen; i++, off += ret) {
+ if ((ret = lg_utf8_encode(cp[i], str + off,
+ len - off)) > (len - off)) {
+ /* buffer too small */
+ break;
+ }
+ }
+ str[off] = '\\0';
+
+ return str;
+}
+.Ed
+.Sh SEE ALSO
+.Xr lg_utf8_decode 3 ,
+.Xr libgrapheme 7
+.Sh AUTHORS
+.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/src/grapheme.c b/src/grapheme.c
@@ -197,19 +197,19 @@ lg_grapheme_nextbreak(const char *str)
* miss it, even if the previous UTF-8 sequence terminates
* unexpectedly, as it would either act as an unexpected byte,
* saved for later, or as a null byte itself, that we can catch.
- * We pass 5 to the length, as we will never read beyond
+ * We pass (size_t)-1 to the length, as we will never read beyond
* the null byte for the reasons given above.
*/
/* get first code point */
- len += lg_utf8_decode(str, 5, &cp0);
+ len += lg_utf8_decode(str, (size_t)-1, &cp0);
if (cp0 == LG_INVALID_CODE_POINT) {
return len;
}
while (cp0 != 0) {
/* get next code point */
- ret = lg_utf8_decode(str + len, 5, &cp1);
+ ret = lg_utf8_decode(str + len, (size_t)-1, &cp1);
if (cp1 == LG_INVALID_CODE_POINT ||
lg_grapheme_isbreak(cp0, cp1, &state)) {