libgrapheme

grapheme cluster utility library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | LICENSE

commit a88c9a176dfc9459c3a7b09f6c9aedda6d06732f
parent f8bb18d674283ffd63b87fd903fdc75bee4fc2fd
Author: Laslo Hunhold <dev@frign.de>
Date:   Mon,  1 Jun 2020 12:00:50 +0200

Expose grapheme_cp_{en,de}code() and grapheme_boundary()

After the preparation, we can now expose these three functions in
grapheme.h, as suggested by Mattias.
In this context, we get rid of the Codepoint typedef, as there is no
need for an opaque alias of uint32_t. A codepoint is just a number, so
let's stop with the "Rune", "Codepoint", etc. naming nonsense!

Moving everything into grapheme.h, there is also no need for boundary.h
and codepoint.h, which we reflect in the Makefile.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M Makefile | 17 +++++++++--------
M data/emo.awk | 2 +-
M data/gbp.awk | 2 +-
M data/gbt.awk | 4 ++--
M grapheme.h | 8 ++++++++
D src/boundary.h | 11 -----------
M src/boundary_body.c | 16 +++++++---------
M src/codepoint.c | 2 +-
D src/codepoint.h | 15 ---------------
M src/grapheme.c | 7 +++----
M src/test_body.c | 7 ++++---
11 files changed, 36 insertions(+), 55 deletions(-)

diff --git a/Makefile b/Makefile @@ -5,7 +5,7 @@ include config.mk BIN = src/test -REQ = src/codepoint src/boundary src/grapheme +REQ = src/boundary src/codepoint src/grapheme GBP_URL = https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt EMO_URL = https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt GBT_URL = https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt @@ -19,10 +19,10 @@ all: libgrapheme.a libgrapheme.so $(BIN) src/test: src/test.o $(REQ:=.o) -src/boundary.o: src/boundary.c config.mk src/codepoint.h src/boundary.h -src/codepoint.o: src/codepoint.c config.mk src/codepoint.h -src/grapheme.o: src/grapheme.c config.mk src/codepoint.h src/boundary.h -src/test.o: src/test.c config.mk src/codepoint.h src/boundary.h +src/boundary.o: src/boundary.c config.mk grapheme.h +src/codepoint.o: src/codepoint.c config.mk grapheme.h +src/grapheme.o: src/grapheme.c config.mk grapheme.h +src/test.o: src/test.c config.mk grapheme.h .o: $(CC) -o $@ $(LDFLAGS) $< $(REQ:=.o) @@ -42,7 +42,7 @@ test: src/boundary.c: data/gbt.awk $(GBP) data/emo.awk $(EMO) src/boundary_body.c printf "/* Automatically generated by gbp.awk and emo.awk */\n" > $@ - printf "#include \"codepoint.h\"\n" >> $@ + printf "#include <stdint.h>\n\n" >> $@ awk -f data/gbp.awk $(GBP) >> $@ awk -f data/emo.awk $(EMO) >> $@ printf "\n" >> $@ @@ -50,8 +50,9 @@ src/boundary.c: data/gbt.awk $(GBP) data/emo.awk $(EMO) src/boundary_body.c src/test.c: data/gbt.awk $(GBT) src/test_body.c printf "/* Automatically generated by gbt.awk */\n" > $@ - printf "#include <stddef.h>\n\n" >> $@ - printf "#include \"codepoint.h\"\n\n" >> $@ + printf "#include <stddef.h>\n" >> $@ + printf "#include <stdint.h>\n\n" >> $@ + printf "#include \"../grapheme.h\"\n\n" >> $@ awk -f data/gbt.awk $(GBT) >> $@ printf "\n" >> $@ cat src/test_body.c >> $@ diff --git a/data/emo.awk b/data/emo.awk @@ -34,7 +34,7 @@ function hextonum(str) { } function mktable(name, array, arrlen) { 
- printf("\nstatic const Codepoint "name"_table[][2] = {\n"); + printf("\nstatic const uint32_t "name"_table[][2] = {\n"); for (j = 0; j < arrlen; j++) { if (ind = index(array[j], "..")) { diff --git a/data/gbp.awk b/data/gbp.awk @@ -58,7 +58,7 @@ function hextonum(str) { } function mktable(name, array, arrlen) { - printf("\nstatic const Codepoint "name"_table[][2] = {\n"); + printf("static const uint32_t "name"_table[][2] = {\n"); for (j = 0; j < arrlen; j++) { if (ind = index(array[j], "..")) { diff --git a/data/gbt.awk b/data/gbt.awk @@ -4,7 +4,7 @@ BEGIN { FS = " " - printf("struct test {\n\tCodepoint *cp;\n\tsize_t cplen;\n"); + printf("struct test {\n\tuint32_t *cp;\n\tsize_t cplen;\n"); printf("\tsize_t *len;\n\tsize_t lenlen;\n\tchar *descr;\n};\n\n"); printf("static const struct test t[] = {\n"); } @@ -38,7 +38,7 @@ $0 ~ /^#/ || $0 ~ /^\s*$/ { next } len[nlens++] = curlen; # print code points - printf("\t{\n\t\t.cp = (Codepoint[]){ "); + printf("\t{\n\t\t.cp = (uint32_t[]){ "); for (i = 0; i < ncps; i++) { printf("0x%s", cp[i]); if (i + 1 < ncps) { diff --git a/grapheme.h b/grapheme.h @@ -3,6 +3,14 @@ #define GRAPHEME_H #include <stddef.h> +#include <stdint.h> + +#define CP_INVALID UINT32_C(0xFFFD) + +int grapheme_boundary(uint32_t, uint32_t, int *); + +size_t grapheme_cp_decode(uint32_t *, const uint8_t *, size_t); +size_t grapheme_cp_encode(uint32_t, uint8_t *, size_t); size_t grapheme_len(const char *); diff --git a/src/boundary.h b/src/boundary.h @@ -1,11 +0,0 @@ -/* See LICENSE file for copyright and license details. */ -#ifndef BOUNDARY_H -#define BOUNDARY_H - -#include <stddef.h> - -#include "codepoint.h" - -int boundary(Codepoint, Codepoint, int *); - -#endif /* BOUNDARY_H */ diff --git a/src/boundary_body.c b/src/boundary_body.c @@ -1,10 +1,8 @@ /* See LICENSE file for copyright and license details. 
*/ #include <stddef.h> +#include <stdint.h> #include <stdlib.h> -#include "codepoint.h" -#include "boundary.h" - #define LEN(x) (sizeof(x) / sizeof(*x)) enum { @@ -15,8 +13,8 @@ enum { static int cp_cmp(const void *a, const void *b) { - Codepoint cp = *(Codepoint *)a; - Codepoint *range = (Codepoint *)b; + uint32_t cp = *(uint32_t *)a; + uint32_t *range = (uint32_t *)b; return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]); } @@ -40,7 +38,7 @@ enum property { }; struct { - const Codepoint (*table)[2]; + const uint32_t (*table)[2]; size_t tablelen; } tables[] = { [PROP_CR] = { @@ -102,7 +100,7 @@ struct { }; static int -is(Codepoint cp[2], char (*props)[2], int index, enum property p) +is(uint32_t cp[2], char (*props)[2], int index, enum property p) { if (props[p][index] == 2) { /* need to determine property */ @@ -119,9 +117,9 @@ is(Codepoint cp[2], char (*props)[2], int index, enum property p) #define IS(I, PROP) (is(cp, props, I, PROP)) int -boundary(Codepoint cp0, Codepoint cp1, int *state) +grapheme_boundary(uint32_t cp0, uint32_t cp1, int *state) { - Codepoint cp[2] = { cp0, cp1 }; + uint32_t cp[2] = { cp0, cp1 }; char props[NUM_PROPS][2]; size_t i; diff --git a/src/codepoint.c b/src/codepoint.c @@ -1,5 +1,5 @@ /* See LICENSE file for copyright and license details. */ -#include "codepoint.h" +#include "../grapheme.h" #include <stdio.h> #define BETWEEN(c, l, u) (c >= l && c <= u) diff --git a/src/codepoint.h b/src/codepoint.h @@ -1,15 +0,0 @@ -/* See LICENSE file for copyright and license details. 
*/ -#ifndef CODEPOINT_H -#define CODEPOINT_H - -#include <stddef.h> -#include <stdint.h> - -typedef uint32_t Codepoint; - -#define CP_INVALID 0xFFFD - -size_t grapheme_cp_decode(uint32_t *, const uint8_t *, size_t); -size_t grapheme_cp_encode(uint32_t, uint8_t *, size_t); - -#endif /* CODEPOINT_H */ diff --git a/src/grapheme.c b/src/grapheme.c @@ -2,13 +2,12 @@ #include <stddef.h> #include <stdlib.h> -#include "codepoint.h" -#include "boundary.h" +#include "../grapheme.h" size_t grapheme_len(const char *str) { - Codepoint cp0, cp1; + uint32_t cp0, cp1; size_t ret, len = 0; int state = 0; @@ -38,7 +37,7 @@ grapheme_len(const char *str) /* get next code point */ ret = grapheme_cp_decode(&cp1, (uint8_t *)(str + len), 5); - if (cp1 == CP_INVALID || boundary(cp0, cp1, &state)) { + if (cp1 == CP_INVALID || grapheme_boundary(cp0, cp1, &state)) { /* we read an invalid cp or have a breakpoint */ break; } else { diff --git a/src/test_body.c b/src/test_body.c @@ -1,10 +1,10 @@ /* See LICENSE file for copyright and license details. */ #include <stddef.h> +#include <stdint.h> #include <stdio.h> #include <string.h> -#include "boundary.h" -#include "codepoint.h" +#include "../grapheme.h" #define LEN(x) (sizeof(x) / sizeof(*x)) @@ -350,7 +350,8 @@ int main(void) for (i = 0, failed = 0; i < LEN(t); i++) { for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) { if ((j + 1) == t[i].cplen || - boundary(t[i].cp[j], t[i].cp[j + 1], &state)) { + grapheme_boundary(t[i].cp[j], t[i].cp[j + 1], + &state)) { /* check if our resulting length matches */ if (k == t[i].lenlen || len != t[i].len[k++]) { fprintf(stderr, "Failed \"%s\"\n",