Reintroduce the "grapheme_" prefix - libgrapheme

commit 82b85a60b3a334c928aa22de2555a55367bf739d
parent dfda0db8503b0051addc96368840b06c22fa8eeb
Author: Laslo Hunhold <dev@frign.de>
Date:   Sat, 18 Dec 2021 12:48:32 +0100

Reintroduce the "grapheme_" prefix

With the character ambiguity out of the way we can now go back to
prefixing everything with "grapheme_" instead of "lg_". It's always
better to have a prefix matching the library name, as it's otherwise
not immediately obvious where a given symbol or function comes from.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M grapheme.h  | 21 +++++++++++----------
A man/grapheme_character_isbreak.3  | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A man/grapheme_character_nextbreak.3  | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A man/grapheme_utf8_decode.3  | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A man/grapheme_utf8_encode.3  | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D man/lg_grapheme_isbreak.3  | 80 -------------------------------------------------------------------------------
D man/lg_grapheme_nextbreak.3  | 72 ------------------------------------------------------------------------
D man/lg_utf8_decode.3  | 101 -------------------------------------------------------------------------------
D man/lg_utf8_encode.3  | 98 -------------------------------------------------------------------------------
M man/libgrapheme.7  | 8 ++++----
M src/character.c  | 21 +++++++++++----------
M src/utf8.c  | 16 ++++++++--------
M src/util.c  | 6 +++---
M src/util.h  | 6 +++---
M test/character-performance.c  | 4 ++--
M test/character.c  | 8 ++++----
M test/utf8-decode.c  | 50 +++++++++++++++++++++++++-------------------------
M test/utf8-encode.c  | 2 +-

18 files changed, 423 insertions(+), 421 deletions(-)
diff --git a/grapheme.h b/grapheme.h
@@ -6,24 +6,25 @@
 #include <stddef.h>
 #include <stdint.h>
 
-struct lg_internal_heisenstate {
+struct grapheme_internal_heisenstate {
 	uint_least64_t determined;
 	uint_least64_t state;
 };
 
-typedef struct lg_internal_segmentation_state {
-	struct lg_internal_heisenstate a;
-	struct lg_internal_heisenstate b;
+typedef struct grapheme_internal_segmentation_state {
+	struct grapheme_internal_heisenstate a;
+	struct grapheme_internal_heisenstate b;
 	uint_least16_t flags;
-} LG_SEGMENTATION_STATE;
+} GRAPHEME_SEGMENTATION_STATE;
 
-#define LG_INVALID_CODE_POINT UINT32_C(0xFFFD)
+#define GRAPHEME_INVALID_CODE_POINT UINT32_C(0xFFFD)
 
-size_t lg_character_nextbreak(const char *);
+size_t grapheme_character_nextbreak(const char *);
 
-bool lg_character_isbreak(uint_least32_t, uint_least32_t, LG_SEGMENTATION_STATE *);
+bool grapheme_character_isbreak(uint_least32_t, uint_least32_t,
+                                GRAPHEME_SEGMENTATION_STATE *);
 
-size_t lg_utf8_decode(const char *, size_t, uint_least32_t *);
-size_t lg_utf8_encode(uint_least32_t, char *, size_t);
+size_t grapheme_utf8_decode(const char *, size_t, uint_least32_t *);
+size_t grapheme_utf8_encode(uint_least32_t, char *, size_t);
 
 #endif /* GRAPHEME_H */
diff --git a/man/grapheme_character_isbreak.3 b/man/grapheme_character_isbreak.3
@@ -0,0 +1,80 @@
+.Dd 2021-12-18
+.Dt GRAPHEME_CHARACTER_ISBREAK 3
+.Os suckless.org
+.Sh NAME
+.Nm grapheme_character_isbreak
+.Nd test for a grapheme cluster break between two code points
+.Sh SYNOPSIS
+.In grapheme.h
+.Ft bool
+.Fn grapheme_character_isbreak "uint_least32_t cp1" "uint_least32_t cp2" "GRAPHEME_SEGMENTATION_STATE *state"
+.Sh DESCRIPTION
+The
+.Fn grapheme_character_isbreak
+function determines if there is a grapheme cluster break (see
+.Xr libgrapheme 7 )
+between the two code points
+.Va cp1
+and
+.Va cp2 .
+By specification this decision depends on a
+.Va state
+that can at most be completely reset after detecting a break and must
+be reset every time one deviates from sequential processing.
+.Pp
+If
+.Va state
+is
+.Dv NULL
+.Fn grapheme_character_isbreak
+behaves as if it was called with a fully reset state.
+.Sh RETURN VALUES
+The
+.Fn grapheme_character_isbreak
+function returns
+.Va true
+if there is a grapheme cluster break between the code points
+.Va cp1
+and
+.Va cp2
+and
+.Va false
+if there is not.
+.Sh EXAMPLES
+.Bd -literal
+/* cc (-static) -o example example.c -lgrapheme */
+#include <grapheme.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+int
+main(void)
+{
+	GRAPHEME_SEGMENTATION_STATE state = { 0 };
+	uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */
+	size_t i;
+
+	for (i = 0; i + 1 < sizeof(s1) / sizeof(*s1); i++) {
+		if (grapheme_character_isbreak(s1[i], s1[i + 1], &state)) {
+			printf("break in s1 at offset %zu\\n", i);
+		}
+	}
+	memset(&state, 0, sizeof(state)); /* reset state */
+	for (i = 0; i + 1 < sizeof(s2) / sizeof(*s2); i++) {
+		if (grapheme_character_isbreak(s2[i], s2[i + 1], &state)) {
+			printf("break in s2 at offset %zu\\n", i);
+		}
+	}
+
+	return 0;
+}
+.Ed
+.Sh SEE ALSO
+.Xr grapheme_character_nextbreak 3 ,
+.Xr libgrapheme 7
+.Sh STANDARDS
+.Fn grapheme_character_isbreak
+is compliant with the Unicode 14.0.0 specification.
+.Sh AUTHORS
+.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/man/grapheme_character_nextbreak.3 b/man/grapheme_character_nextbreak.3
@@ -0,0 +1,72 @@
+.Dd 2021-12-18
+.Dt GRAPHEME_CHARACTER_NEXTBREAK 3
+.Os suckless.org
+.Sh NAME
+.Nm grapheme_character_nextbreak
+.Nd determine byte-offset to next grapheme cluster break
+.Sh SYNOPSIS
+.In grapheme.h
+.Ft size_t
+.Fn grapheme_character_nextbreak "const char *str"
+.Sh DESCRIPTION
+The
+.Fn grapheme_character_nextbreak
+function computes the offset (in bytes) to the next grapheme
+cluster break (see
+.Xr libgrapheme 7 )
+in the UTF-8-encoded NUL-terminated string
+.Va str .
+If a grapheme cluster begins at
+.Va str
+this offset is equal to the length of said grapheme cluster.
+.Pp
+For non-UTF-8 input data
+.Xr grapheme_character_isbreak 3
+can be used instead.
+.Sh RETURN VALUES
+The
+.Fn grapheme_character_nextbreak
+function returns the offset (in bytes) to the next grapheme cluster
+break in
+.Va str
+or 0 if
+.Va str
+is
+.Dv NULL .
+.Sh EXAMPLES
+.Bd -literal
+/* cc (-static) -o example example.c -lgrapheme */
+#include <grapheme.h>
+#include <stdint.h>
+#include <stdio.h>
+
+int
+main(void)
+{
+	/* UTF-8 encoded input */
+	char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0"
+	          "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
+	          "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
+	          "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
+	size_t len;
+
+	printf("Input: \\"%s\\"\\n", s);
+
+	/* print each grapheme cluster with byte-length */
+	for (; *s != '\\0';) {
+		len = grapheme_character_nextbreak(s);
+		printf("%2zu bytes | %.*s\\n", len, (int)len, s);
+		s += len;
+	}
+
+	return 0;
+}
+.Ed
+.Sh SEE ALSO
+.Xr grapheme_character_isbreak 3 ,
+.Xr libgrapheme 7
+.Sh STANDARDS
+.Fn grapheme_character_nextbreak
+is compliant with the Unicode 14.0.0 specification.
+.Sh AUTHORS
+.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/man/grapheme_utf8_decode.3 b/man/grapheme_utf8_decode.3
@@ -0,0 +1,101 @@
+.Dd 2021-12-17
+.Dt GRAPHEME_UTF8_DECODE 3
+.Os suckless.org
+.Sh NAME
+.Nm grapheme_utf8_decode
+.Nd decode first code point in UTF-8-encoded string
+.Sh SYNOPSIS
+.In grapheme.h
+.Ft size_t
+.Fn grapheme_utf8_decode "const char *str" "size_t len" "uint_least32_t *cp"
+.Sh DESCRIPTION
+The
+.Fn grapheme_utf8_decode
+function decodes the next code point in the UTF-8-encoded string
+.Va str
+of length
+.Va len .
+If the UTF-8-sequence is invalid (overlong encoding, unexpected byte,
+string ends unexpectedly, empty string, etc.) the decoding is stopped
+at the last processed byte and the decoded code point set to
+.Dv GRAPHEME_INVALID_CODE_POINT .
+.Pp
+If
+.Va cp
+is not
+.Dv NULL
+the decoded code point is stored in the memory pointed to by
+.Va cp .
+.Pp
+Given NUL has a unique 1 byte representation, it is safe to operate on
+NUL-terminated strings by setting
+.Va len
+to
+.Dv (size_t)-1
+and terminating when
+.Va cp
+is 0 (see
+.Sx EXAMPLES
+for an example).
+.Sh RETURN VALUES
+The
+.Fn grapheme_utf8_decode
+function returns the number of processed bytes and 0 if
+.Va str
+is
+.Dv NULL
+or
+.Va len
+is 0.
+If the string ends unexpectedly in a multibyte sequence, the desired
+length (that is larger than
+.Va len )
+is returned.
+.Sh EXAMPLES
+.Bd -literal
+/* cc (-static) -o example example.c -lgrapheme */
+#include <grapheme.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+void
+print_cps(const char *str, size_t len)
+{
+	size_t ret, off;
+	uint_least32_t cp;
+
+	for (off = 0; off < len; off += ret) {
+		if ((ret = grapheme_utf8_decode(str + off,
+		                                len - off, &cp)) > (len - off)) {
+			/*
+			 * string ended unexpectedly in the middle of a
+			 * multibyte sequence and we have the choice
+			 * here to possibly expand str by ret - len + off
+			 * bytes to get a full sequence, but we just
+			 * bail out in this case.
+			 */
+			break;
+		}
+		printf("%"PRIxLEAST32"\\n", cp);
+	}
+}
+
+void
+print_cps_nul_terminated(const char *str)
+{
+	size_t ret, off;
+	uint_least32_t cp;
+
+	for (off = 0; (ret = grapheme_utf8_decode(str + off,
+	                                          (size_t)-1, &cp)) > 0 &&
+	     cp != 0; off += ret) {
+		printf("%"PRIxLEAST32"\\n", cp);
+	}
+}
+.Ed
+.Sh SEE ALSO
+.Xr grapheme_utf8_encode 3 ,
+.Xr grapheme_character_isbreak 3 ,
+.Xr libgrapheme 7
+.Sh AUTHORS
+.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/man/grapheme_utf8_encode.3 b/man/grapheme_utf8_encode.3
@@ -0,0 +1,98 @@
+.Dd 2021-12-17
+.Dt GRAPHEME_UTF8_ENCODE 3
+.Os suckless.org
+.Sh NAME
+.Nm grapheme_utf8_encode
+.Nd encode code point into UTF-8 string
+.Sh SYNOPSIS
+.In grapheme.h
+.Ft size_t
+.Fn grapheme_utf8_encode "uint_least32_t cp" "char *str" "size_t len"
+.Sh DESCRIPTION
+The
+.Fn grapheme_utf8_encode
+function encodes the code point
+.Va cp
+into a UTF-8-string.
+If
+.Va str
+is not
+.Dv NULL
+and
+.Va len
+is large enough it writes the UTF-8-string to the memory pointed to by
+.Va str .
+.Sh RETURN VALUES
+The
+.Fn grapheme_utf8_encode
+function returns the length (in bytes) of the UTF-8-string resulting
+from encoding
+.Va cp .
+When the returned value is larger than
+.Va len
+it is indicated that the output string is too small and no data has been
+written.
+.Sh EXAMPLES
+.Bd -literal
+/* cc (-static) -o example example.c -lgrapheme */
+#include <grapheme.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+size_t
+cps_to_utf8(const uint_least32_t *cp, size_t cplen, char *str, size_t len)
+{
+	size_t i, off, ret;
+
+	for (i = 0, off = 0; i < cplen; i++, off += ret) {
+		if ((ret = grapheme_utf8_encode(cp[i], str + off,
+		                                len - off)) > (len - off)) {
+			/* buffer too small */
+			break;
+		}
+	}
+
+	return off;
+}
+
+size_t
+cps_bytelen(const uint_least32_t *cp, size_t cplen)
+{
+	size_t i, len;
+
+	for (i = 0, len = 0; i < cplen; i++) {
+		len += grapheme_utf8_encode(cp[i], NULL, 0);
+	}
+
+	return len;
+}
+
+char *
+cps_to_utf8_alloc(const uint_least32_t *cp, size_t cplen)
+{
+	char *str;
+	size_t len, i, ret, off;
+
+	len = cps_bytelen(cp, cplen);
+
+	if (!(str = malloc(len + 1))) {
+		return NULL;
+	}
+
+	for (i = 0, off = 0; i < cplen; i++, off += ret) {
+		if ((ret = grapheme_utf8_encode(cp[i], str + off,
+		                                len - off)) > (len - off)) {
+			/* buffer too small */
+			break;
+		}
+	}
+	str[off] = '\\0';
+
+	return str;
+}
+.Ed
+.Sh SEE ALSO
+.Xr grapheme_utf8_decode 3 ,
+.Xr libgrapheme 7
+.Sh AUTHORS
+.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/man/lg_grapheme_isbreak.3 b/man/lg_grapheme_isbreak.3
@@ -1,80 +0,0 @@
-.Dd 2021-12-18
-.Dt LG_GRAPHEME_ISBREAK 3
-.Os suckless.org
-.Sh NAME
-.Nm lg_grapheme_isbreak
-.Nd test for a grapheme cluster break between two code points
-.Sh SYNOPSIS
-.In grapheme.h
-.Ft size_t
-.Fn lg_grapheme_isbreak "uint_least32_t cp1" "uint_least32_t cp2" "LG_SEGMENTATION_STATE *state"
-.Sh DESCRIPTION
-The
-.Fn lg_grapheme_isbreak
-function determines if there is a grapheme cluster break (see
-.Xr libgrapheme 7 )
-between the two code points
-.Va cp1
-and
-.Va cp2 .
-By specification this decision depends on a
-.Va state
-that can at most be completely reset after detecting a break and must
-be reset every time one deviates from sequential processing.
-.Pp
-If
-.Va state
-is
-.Dv NULL
-.Fn lg_grapheme_isbreak
-behaves as if it was called with a fully reset state.
-.Sh RETURN VALUES
-The
-.Fn lg_grapheme_isbreak
-function returns
-.Va true
-if there is a grapheme cluster break between the code points
-.Va cp1
-and
-.Va cp2
-and
-.Va false
-if there is not.
-.Sh EXAMPLES
-.Bd -literal
-/* cc (-static) -o example example.c -lgrapheme */
-#include <grapheme.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-int
-main(void)
-{
-	LG_SEGMENTATION_STATE state = { 0 };
-	uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */
-	size_t i;
-
-	for (i = 0; i + 1 < sizeof(s1) / sizeof(*s1); i++) {
-		if (lg_grapheme_isbreak(s[i], s[i + 1], &state)) {
-			printf("break in s1 at offset %zu\n", i);
-		}
-	}
-	memset(&state, 0, sizeof(state)); /* reset state */
-	for (i = 0; i + 1 < sizeof(s2) / sizeof(*s2); i++) {
-		if (lg_grapheme_isbreak(s[i], s[i + 1], &state)) {
-			printf("break in s2 at offset %zu\n", i);
-		}
-	}
-
-	return 0;
-}
-.Ed
-.Sh SEE ALSO
-.Xr lg_grapheme_nextbreak 3 ,
-.Xr libgrapheme 7
-.Sh STANDARDS
-.Fn lg_grapheme_isbreak
-is compliant with the Unicode 14.0.0 specification.
-.Sh AUTHORS
-.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/man/lg_grapheme_nextbreak.3 b/man/lg_grapheme_nextbreak.3
@@ -1,72 +0,0 @@
-.Dd 2021-12-18
-.Dt LG_GRAPHEME_NEXTBREAK 3
-.Os suckless.org
-.Sh NAME
-.Nm lg_grapheme_nextbreak
-.Nd determine byte-offset to next grapheme cluster break
-.Sh SYNOPSIS
-.In grapheme.h
-.Ft size_t
-.Fn lg_grapheme_nextbreak "const char *str"
-.Sh DESCRIPTION
-The
-.Fn lg_grapheme_nextbreak
-function computes the offset (in bytes) to the next grapheme
-cluster break (see
-.Xr libgrapheme 7 )
-in the UTF-8-encoded NUL-terminated string
-.Va str .
-If a grapheme cluster begins at
-.Va str
-this offset is equal to the length of said grapheme cluster.
-.Pp
-For non-UTF-8 input data
-.Xr lg_grapheme_isbreak 3
-can be used instead.
-.Sh RETURN VALUES
-The
-.Fn lg_grapheme_nextbreak
-function returns the offset (in bytes) to the next grapheme cluster
-break in
-.Va str
-or 0 if
-.Va str
-is
-.Dv NULL .
-.Sh EXAMPLES
-.Bd -literal
-/* cc (-static) -o example example.c -lgrapheme */
-#include <grapheme.h>
-#include <stdint.h>
-#include <stdio.h>
-
-int
-main(void)
-{
-	/* UTF-8 encoded input */
-	char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0"
-	          "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
-	          "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
-	          "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
-	size_t len;
-
-	printf("Input: \\"%s\\"\\n", s);
-
-	/* print each grapheme cluster with byte-length */
-	for (; *s != '\\0';) {
-		len = lg_grapheme_nextbreak(s);
-		printf("%2zu bytes | %.*s\\n", len, (int)len, s, len);
-		s += len;
-	}
-
-	return 0;
-}
-.Ed
-.Sh SEE ALSO
-.Xr lg_grapheme_isbreak 3 ,
-.Xr libgrapheme 7
-.Sh STANDARDS
-.Fn lg_grapheme_nextbreak
-is compliant with the Unicode 14.0.0 specification.
-.Sh AUTHORS
-.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/man/lg_utf8_decode.3 b/man/lg_utf8_decode.3
@@ -1,101 +0,0 @@
-.Dd 2021-12-17
-.Dt LG_UTF8_DECODE 3
-.Os suckless.org
-.Sh NAME
-.Nm lg_utf8_decode
-.Nd decode first code point in UTF-8-encoded string
-.Sh SYNOPSIS
-.In grapheme.h
-.Ft size_t
-.Fn lg_utf8_decode "const char *str" "size_t len" "uint_least32_t *cp"
-.Sh DESCRIPTION
-The
-.Fn lg_utf8_decode
-function decodes the next code point in the UTF-8-encoded string
-.Va str
-of length
-.Va len .
-If the UTF-8-sequence is invalid (overlong encoding, unexpected byte,
-string ends unexpectedly, empty string, etc.) the decoding is stopped
-at the last processed byte and the decoded code point set to
-.Dv LG_INVALID_CODE_POINT.
-.Pp
-If
-.Va cp
-is not
-.Dv NULL
-the decoded code point is stored in the memory pointed to by
-.Va cp .
-.Pp
-Given NUL has a unique 1 byte representation, it is safe to operate on
-NUL-terminated strings by setting
-.Va len
-to
-.Dv (size_t)-1
-and terminating when
-.Va cp
-is 0 (see
-.Sx EXAMPLES
-for an example).
-.Sh RETURN VALUES
-The
-.Fn lg_utf8_decode
-function returns the number of processed bytes and 0 if
-.Va str
-is
-.Dv NULL
-or
-.Va len
-is 0.
-If the string ends unexpectedly in a multibyte sequence, the desired
-length (that is larger than
-.Va len )
-is returned.
-.Sh EXAMPLES
-.Bd -literal
-/* cc (-static) -o example example.c -lgrapheme */
-#include <grapheme.h>
-#include <inttypes.h>
-#include <stdio.h>
-
-void
-print_cps(const char *str, size_t len)
-{
-	size_t ret, off;
-	uint_least32_t cp;
-
-	for (off = 0; off < len; off += ret) {
-		if ((ret = lg_utf8_decode(str + off,
-		                          len - off, &cp)) > (len - off)) {	
-			/*
-			 * string ended unexpectedly in the middle of a
-			 * multibyte sequence and we have the choice
-			 * here to possibly expand str by ret - len + off
-			 * bytes to get a full sequence, but we just
-			 * bail out in this case.
-			 */
-			break;
-		}
-		printf("%"PRIxLEAST32"\\n", cp);
-	}
-}
-
-void
-print_cps_nul_terminated(const char *str)
-{
-	size_t ret, off;
-	uint_least32_t cp;
-
-	for (off = 0; (ret = lg_utf8_decode(str + off,
-	                                    (size_t)-1, &cp)) > 0 &&
-	     cp != 0; off += ret) {
-		printf("%"PRIxLEAST32"\\n", cp);
-	}
-}
-.Ed
-.Sh SEE ALSO
-.Xr lg_grapheme_encode 3 ,
-.Xr lg_grapheme_isbreak 3 ,
-.Xr libgrapheme 7
-.Sh AUTHORS
-.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/man/lg_utf8_encode.3 b/man/lg_utf8_encode.3
@@ -1,98 +0,0 @@
-.Dd 2021-12-17
-.Dt LG_UTF8_ENCODE 3
-.Os suckless.org
-.Sh NAME
-.Nm lg_utf8_encode
-.Nd encode code point into UTF-8 string
-.Sh SYNOPSIS
-.In grapheme.h
-.Ft size_t
-.Fn lg_utf8_encode "uint_least32_t cp" "char *" "size_t"
-.Sh DESCRIPTION
-The
-.Fn lg_utf8_encode
-function encodes the code point
-.Va cp
-into a UTF-8-string.
-If
-.Va str
-is not
-.Dv NULL
-and
-.Va len
-is large enough it writes the UTF-8-string to the memory pointed to by
-.Va str .
-.Sh RETURN VALUES
-The
-.Fn lg_utf8_encode
-function returns the length (in bytes) of the UTF-8-string resulting
-from encoding
-.Va cp .
-When the returned value is larger than
-.Va len
-it is indicated that the output string is too small and no data has been
-written.
-.Sh EXAMPLES
-.Bd -literal
-/* cc (-static) -o example example.c -lgrapheme */
-#include <grapheme.h>
-#include <stddef.h>
-#include <stdlib.h>
-
-size_t
-cps_to_utf8(const uint_least32_t *cp, size_t cplen, char *str, size_t len)
-{
-	size_t i, off, ret;
-
-	for (i = 0, off = 0; i < cplen; i++, off += ret) {
-		if ((ret = lg_utf8_encode(cp[i], str + off,
-		                          len - off)) > (len - off)) {
-			/* buffer too small */
-			break;
-		}
-	}
-	
-	return off;
-}
-
-size_t
-cps_bytelen(const uint_least32_t *cp, size_t cplen)
-{
-	size_t i, len;
-
-	for (i = 0, len = 0; i < cplen; i++) {
-		len += lg_utf8_encode(cp[i], NULL, 0);
-	}
-
-	return len;
-}
-
-char *
-cps_to_utf8_alloc(const uint_least32_t *cp, size_t cplen)
-{
-	char *str;
-	size_t len, i, ret, off;
-
-	len = cps_bytelen(cp, cplen);
-
-	if (!(str = malloc(len))) {
-		return NULL;
-	}
-
-	for (i = 0, off = 0; i < cplen; i++, off += ret) {
-		if ((ret = lg_utf8_encode(cp[i], str + off,
-		                          len - off)) > (len - off)) {
-			/* buffer too small */
-			break;
-		}
-	}
-	str[off] = '\\0';
-
-	return str;
-}
-.Ed
-.Sh SEE ALSO
-.Xr lg_grapheme_decode 3 ,
-.Xr libgrapheme 7
-.Sh AUTHORS
-.An Laslo Hunhold Aq Mt dev@frign.de
diff --git a/man/libgrapheme.7 b/man/libgrapheme.7
@@ -15,10 +15,10 @@ see
 .Sx MOTIVATION )
 according to the Unicode specification.
 .Sh SEE ALSO
-.Xr lg_grapheme_isbreak 3 ,
-.Xr lg_grapheme_nextbreak 3 ,
-.Xr lg_utf8_decode 3 ,
-.Xr lg_utf8_encode 3
+.Xr grapheme_character_isbreak 3 ,
+.Xr grapheme_character_nextbreak 3 ,
+.Xr grapheme_utf8_decode 3 ,
+.Xr grapheme_utf8_encode 3
 .Sh STANDARDS
 .Nm
 is compliant with the Unicode 14.0.0 specification.
diff --git a/src/character.c b/src/character.c
@@ -14,9 +14,10 @@ enum {
 };
 
 bool
-lg_character_isbreak(uint_least32_t a, uint_least32_t b, LG_SEGMENTATION_STATE *state)
+grapheme_character_isbreak(uint_least32_t a, uint_least32_t b,
+                           GRAPHEME_SEGMENTATION_STATE *state)
 {
-	struct lg_internal_heisenstate *p[2] = { 0 };
+	struct grapheme_internal_heisenstate *p[2] = { 0 };
 	uint_least16_t flags = 0;
 	bool isbreak = true;
 
@@ -179,18 +180,18 @@ hasbreak:
 }
 
 size_t
-lg_character_nextbreak(const char *str)
+grapheme_character_nextbreak(const char *str)
 {
 	uint_least32_t cp0, cp1;
 	size_t ret, len = 0;
-	LG_SEGMENTATION_STATE state = { 0 };
+	GRAPHEME_SEGMENTATION_STATE state = { 0 };
 
 	if (str == NULL) {
 		return 0;
 	}
 
 	/*
-	 * lg_utf8_decode, when it encounters an unexpected byte,
+	 * grapheme_utf8_decode, when it encounters an unexpected byte,
 	 * does not count it to the error and instead assumes that the
 	 * unexpected byte is the beginning of a new sequence.
 	 * This way, when the string ends with a null byte, we never
@@ -202,17 +203,17 @@ lg_character_nextbreak(const char *str)
 	 */
 
 	/* get first code point */
-	len += lg_utf8_decode(str, (size_t)-1, &cp0);
-	if (cp0 == LG_INVALID_CODE_POINT) {
+	len += grapheme_utf8_decode(str, (size_t)-1, &cp0);
+	if (cp0 == GRAPHEME_INVALID_CODE_POINT) {
 		return len;
 	}
 
 	while (cp0 != 0) {
 		/* get next code point */
-		ret = lg_utf8_decode(str + len, (size_t)-1, &cp1);
+		ret = grapheme_utf8_decode(str + len, (size_t)-1, &cp1);
 
-		if (cp1 == LG_INVALID_CODE_POINT ||
-		    lg_character_isbreak(cp0, cp1, &state)) {
+		if (cp1 == GRAPHEME_INVALID_CODE_POINT ||
+		    grapheme_character_isbreak(cp0, cp1, &state)) {
 			/* we read an invalid cp or have a breakpoint */
 			break;
 		} else {
diff --git a/src/utf8.c b/src/utf8.c
@@ -48,13 +48,13 @@ static const struct {
 };
 
 size_t
-lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp)
+grapheme_utf8_decode(const char *s, size_t n, uint_least32_t *cp)
 {
 	size_t off, i;
 
 	if (s == NULL || n == 0) {
 		/* a sequence must be at least 1 byte long */
-		*cp = LG_INVALID_CODE_POINT;
+		*cp = GRAPHEME_INVALID_CODE_POINT;
 		return 0;
 	}
 
@@ -79,14 +79,14 @@ lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp)
 		 * this also includes the cases where bits higher than
 		 * the 8th are set on systems with CHAR_BIT > 8
 		 */
-		*cp = LG_INVALID_CODE_POINT;
+		*cp = GRAPHEME_INVALID_CODE_POINT;
 		return 1;
 	}
 	if (1 + off > n) {
 		/*
 		 * input is not long enough, set cp as invalid
 		 */
-		*cp = LG_INVALID_CODE_POINT;
+		*cp = GRAPHEME_INVALID_CODE_POINT;
 
 		/*
 		 * count the following continuation bytes, but nothing
@@ -125,7 +125,7 @@ lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp)
 			 * higher than the 8th are set on systems
 			 * with CHAR_BIT > 8
 			 */
-			*cp = LG_INVALID_CODE_POINT;
+			*cp = GRAPHEME_INVALID_CODE_POINT;
 			return 1 + (i - 1);
 		}
 		/*
@@ -144,14 +144,14 @@ lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp)
 		 * not representable in UTF-16 (>0x10FFFF) (RFC-3629
 		 * specifies the latter two conditions)
 		 */
-		*cp = LG_INVALID_CODE_POINT;
+		*cp = GRAPHEME_INVALID_CODE_POINT;
 	}
 
 	return 1 + off;
 }
 
 size_t
-lg_utf8_encode(uint_least32_t cp, char *s, size_t n)
+grapheme_utf8_encode(uint_least32_t cp, char *s, size_t n)
 {
 	size_t off, i;
 
@@ -162,7 +162,7 @@ lg_utf8_encode(uint_least32_t cp, char *s, size_t n)
 		 * (0xD800..0xDFFF) or not representable in UTF-16
 		 * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8.
 		 */
-		cp = LG_INVALID_CODE_POINT;
+		cp = GRAPHEME_INVALID_CODE_POINT;
 	}
 
 	/* determine necessary sequence type */
diff --git a/src/util.c b/src/util.c
@@ -8,7 +8,7 @@
 /* 64-slot (0,...,63) optionally undetermined binary state */
 
 int
-heisenstate_get(struct lg_internal_heisenstate *h, int slot)
+heisenstate_get(struct grapheme_internal_heisenstate *h, int slot)
 {
 	if (h == NULL || slot >= 64 || slot < 0 ||
 	    !(h->determined & (1 << slot))) {
@@ -21,7 +21,7 @@ heisenstate_get(struct lg_internal_heisenstate *h, int slot)
 }
 
 int
-heisenstate_set(struct lg_internal_heisenstate *h, int slot, int state)
+heisenstate_set(struct grapheme_internal_heisenstate *h, int slot, int state)
 {
 	if (h == NULL || slot >= 64 || slot < 0) {
 		/* no state given or slot out of range */
@@ -54,7 +54,7 @@ cp_cmp(const void *a, const void *b)
 }
 
 int
-has_property(uint_least32_t cp, struct lg_internal_heisenstate *cpstate,
+has_property(uint_least32_t cp, struct grapheme_internal_heisenstate *cpstate,
              const struct range_list *proptable, int property)
 {
 	int res;
diff --git a/src/util.h b/src/util.h
@@ -19,10 +19,10 @@ struct range_list {
 	size_t len;
 };
 
-int heisenstate_get(struct lg_internal_heisenstate *, int);
-int heisenstate_set(struct lg_internal_heisenstate *, int, int);
+int heisenstate_get(struct grapheme_internal_heisenstate *, int);
+int heisenstate_set(struct grapheme_internal_heisenstate *, int, int);
 
-int has_property(uint_least32_t, struct lg_internal_heisenstate *,
+int has_property(uint_least32_t, struct grapheme_internal_heisenstate *,
                  const struct range_list *, int);
 
 #endif /* UTIL_H */
diff --git a/test/character-performance.c b/test/character-performance.c
@@ -17,7 +17,7 @@ main(int argc, char *argv[])
 	struct timespec start, end;
 	size_t i, j, bufsiz, off;
 	uint32_t *buf;
-	LG_SEGMENTATION_STATE state;
+	GRAPHEME_SEGMENTATION_STATE state;
 	double cp_per_sec;
 
 	(void)argc;
@@ -45,7 +45,7 @@ main(int argc, char *argv[])
 	for (i = 0; i < NUM_ITERATIONS; i++) {
 		memset(&state, 0, sizeof(state));
 		for (j = 0; j < bufsiz - 1; j++) {
-			(void)lg_character_isbreak(buf[j], buf[j+1], &state);
+			(void)grapheme_character_isbreak(buf[j], buf[j+1], &state);
 		}
 		if (i % (NUM_ITERATIONS / 10) == 0) {
 			printf(".");
diff --git a/test/character.c b/test/character.c
@@ -11,7 +11,7 @@
 int
 main(int argc, char *argv[])
 {
-	LG_SEGMENTATION_STATE state;
+	GRAPHEME_SEGMENTATION_STATE state;
 	size_t i, j, k, len, failed;
 
 	(void)argc;
@@ -21,9 +21,9 @@ main(int argc, char *argv[])
 		memset(&state, 0, sizeof(state));
 		for (j = 0, k = 0, len = 1; j < character_test[i].cplen; j++) {
 			if ((j + 1) == character_test[i].cplen ||
-			    lg_character_isbreak(character_test[i].cp[j],
-			                         character_test[i].cp[j + 1],
-			                         &state)) {
+			    grapheme_character_isbreak(character_test[i].cp[j],
+			                               character_test[i].cp[j + 1],
+			                               &state)) {
 				/* check if our resulting length matches */
 				if (k == character_test[i].lenlen ||
 				    len != character_test[i].len[k++]) {
diff --git a/test/utf8-decode.c b/test/utf8-decode.c
@@ -21,7 +21,7 @@ static const struct {
 		.arr     = NULL,
 		.len     = 0,
 		.exp_len = 0,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid lead byte
@@ -31,7 +31,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xFD },
 		.len     = 1,
 		.exp_len = 1,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* valid 1-byte sequence
@@ -61,7 +61,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xC3 },
 		.len     = 1,
 		.exp_len = 2,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 2-byte sequence (second byte malformed)
@@ -71,7 +71,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xC3, 0xFF },
 		.len     = 2,
 		.exp_len = 1,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 2-byte sequence (overlong encoded)
@@ -81,7 +81,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xC1, 0xBF },
 		.len     = 2,
 		.exp_len = 2,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* valid 3-byte sequence
@@ -101,7 +101,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xE0 },
 		.len     = 1,
 		.exp_len = 3,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 3-byte sequence (second byte malformed)
@@ -111,7 +111,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xE0, 0x7F, 0xBF },
 		.len     = 3,
 		.exp_len = 1,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 3-byte sequence (short string, second byte malformed)
@@ -121,7 +121,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xE0, 0x7F },
 		.len     = 2,
 		.exp_len = 1,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 3-byte sequence (third byte missing)
@@ -131,7 +131,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xE0, 0xBF },
 		.len     = 2,
 		.exp_len = 3,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 3-byte sequence (third byte malformed)
@@ -141,7 +141,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xE0, 0xBF, 0x7F },
 		.len     = 3,
 		.exp_len = 2,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 3-byte sequence (overlong encoded)
@@ -151,7 +151,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xE0, 0x9F, 0xBF },
 		.len     = 3,
 		.exp_len = 3,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 3-byte sequence (UTF-16 surrogate half)
@@ -161,7 +161,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xED, 0xA0, 0x80 },
 		.len     = 3,
 		.exp_len = 3,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* valid 4-byte sequence
@@ -181,7 +181,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xF3 },
 		.len     = 1,
 		.exp_len = 4,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 4-byte sequence (second byte malformed)
@@ -191,7 +191,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF, 0xBF },
 		.len     = 4,
 		.exp_len = 1,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 4-byte sequence (short string 1, second byte malformed)
@@ -201,7 +201,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xF3, 0x7F },
 		.len     = 2,
 		.exp_len = 1,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 4-byte sequence (short string 2, second byte malformed)
@@ -211,7 +211,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF },
 		.len     = 3,
 		.exp_len = 1,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 
 	{
@@ -222,7 +222,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF },
 		.len     = 2,
 		.exp_len = 4,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 4-byte sequence (third byte malformed)
@@ -232,7 +232,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F, 0xBF },
 		.len     = 4,
 		.exp_len = 2,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 4-byte sequence (short string, third byte malformed)
@@ -242,7 +242,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F },
 		.len     = 3,
 		.exp_len = 2,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 4-byte sequence (fourth byte missing)
@@ -252,7 +252,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF },
 		.len     = 3,
 		.exp_len = 4,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 4-byte sequence (fourth byte malformed)
@@ -262,7 +262,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0x7F },
 		.len     = 4,
 		.exp_len = 3,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 4-byte sequence (overlong encoded)
@@ -272,7 +272,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xF0, 0x80, 0x81, 0xBF },
 		.len     = 4,
 		.exp_len = 4,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 	{
 		/* invalid 4-byte sequence (UTF-16-unrepresentable)
@@ -282,7 +282,7 @@ static const struct {
 		.arr     = (char *)(unsigned char[]){ 0xF4, 0x90, 0x80, 0x80 },
 		.len     = 4,
 		.exp_len = 4,
-		.exp_cp  = LG_INVALID_CODE_POINT,
+		.exp_cp  = GRAPHEME_INVALID_CODE_POINT,
 	},
 };
 
@@ -298,8 +298,8 @@ main(int argc, char *argv[])
 		size_t len;
 		uint_least32_t cp;
 
-		len = lg_utf8_decode(dec_test[i].arr,
-		                     dec_test[i].len, &cp);
+		len = grapheme_utf8_decode(dec_test[i].arr,
+		                           dec_test[i].len, &cp);
 
 		if (len != dec_test[i].exp_len ||
 		    cp != dec_test[i].exp_cp) {
diff --git a/test/utf8-encode.c b/test/utf8-encode.c
@@ -62,7 +62,7 @@ main(int argc, char *argv[])
 		char arr[4];
 		size_t len;
 
-		len = lg_utf8_encode(enc_test[i].cp, arr, LEN(arr));
+		len = grapheme_utf8_encode(enc_test[i].cp, arr, LEN(arr));
 
 		if (len != enc_test[i].exp_len ||
 		    memcmp(arr, enc_test[i].exp_arr, len)) {

	libgrapheme unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log \| Files \| Refs \| README \| LICENSE

M	grapheme.h	\|	21	+++++++++++----------
A	man/grapheme_character_isbreak.3	\|	80	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	man/grapheme_character_nextbreak.3	\|	72	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	man/grapheme_utf8_decode.3	\|	101	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	man/grapheme_utf8_encode.3	\|	98	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D	man/lg_grapheme_isbreak.3	\|	80	-------------------------------------------------------------------------------
D	man/lg_grapheme_nextbreak.3	\|	72	------------------------------------------------------------------------
D	man/lg_utf8_decode.3	\|	101	-------------------------------------------------------------------------------
D	man/lg_utf8_encode.3	\|	98	-------------------------------------------------------------------------------
M	man/libgrapheme.7	\|	8	++++----
M	src/character.c	\|	21	+++++++++++----------
M	src/utf8.c	\|	16	++++++++--------
M	src/util.c	\|	6	+++---
M	src/util.h	\|	6	+++---
M	test/character-performance.c	\|	4	++--
M	test/character.c	\|	8	++++----
M	test/utf8-decode.c	\|	50	+++++++++++++++++++++++++-------------------------
M	test/utf8-encode.c	\|	2	+-