Refactor benchmark code - libgrapheme - unicode string library

commit da46b2648d2846dc23e310b7ac0cc3ddebb7ccd3
parent 0f8eb87382b2953b6c4b62c6f4c42616ce74003c
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun,  9 Jan 2022 17:30:53 +0100

Refactor benchmark code

Rename some variables for more consistent naming, add a function
to explicitly generate a UTF-8-test-buffer and move some things into
benchmark/util.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M benchmark/character.c  | 41 +++++++++++++++++------------------------
M benchmark/utf8-decode.c  | 78 ++++++++++++++++++++++++++----------------------------------------------------
M benchmark/util.c  | 49 +++++++++++++++++++++++++++++++++++++++++++------
M benchmark/util.h  | 13 +++++++++++--

4 files changed, 97 insertions(+), 84 deletions(-)
diff --git a/benchmark/character.c b/benchmark/character.c
@@ -14,27 +14,20 @@
 
 #define NUM_ITERATIONS 1000000
 
-#ifdef __has_attribute
-	#if __has_attribute(optnone)
-		void libgrapheme(const void *) __attribute__((optnone));
-		void libutf8proc(const void *) __attribute__((optnone));
-	#endif
-#endif
-
-struct payload {
+struct break_benchmark_payload {
 	uint_least32_t *buf;
-	utf8proc_int32_t *buf_int32;
-	size_t bufsiz;
+	utf8proc_int32_t *buf_utf8proc;
+	size_t buflen;
 };
 
 void
 libgrapheme(const void *payload)
 {
 	GRAPHEME_STATE state = { 0 };
-	const struct payload *p = payload;
+	const struct break_benchmark_payload *p = payload;
 	size_t i;
 
-	for (i = 0; i + 1 < p->bufsiz; i++) {
+	for (i = 0; i + 1 < p->buflen; i++) {
 		(void)grapheme_is_character_break(p->buf[i], p->buf[i+1],
 		                                  &state);
 	}
@@ -44,12 +37,12 @@ void
 libutf8proc(const void *payload)
 {
 	utf8proc_int32_t state = 0;
-	const struct payload *p = payload;
+	const struct break_benchmark_payload *p = payload;
 	size_t i;
 
-	for (i = 0; i + 1 < p->bufsiz; i++) {
-		(void)utf8proc_grapheme_break_stateful(p->buf_int32[i],
-		                                       p->buf_int32[i+1],
+	for (i = 0; i + 1 < p->buflen; i++) {
+		(void)utf8proc_grapheme_break_stateful(p->buf_utf8proc[i],
+		                                       p->buf_utf8proc[i+1],
 		                                       &state);
 	}
 }
@@ -57,33 +50,33 @@ libutf8proc(const void *payload)
 int
 main(int argc, char *argv[])
 {
-	struct payload p;
+	struct break_benchmark_payload p;
 	double baseline = (double)NAN;
 	size_t i;
 
 	(void)argc;
 
-	if ((p.buf = generate_test_buffer(character_test, LEN(character_test),
-	                                  &(p.bufsiz))) == NULL) {
+	if ((p.buf = generate_cp_test_buffer(character_test, LEN(character_test),
+	                                     &(p.buflen))) == NULL) {
 		return 1;
 	}
-	if ((p.buf_int32 = malloc(p.bufsiz * sizeof(*(p.buf_int32)))) == NULL) {
+	if ((p.buf_utf8proc = malloc(p.buflen * sizeof(*(p.buf_utf8proc)))) == NULL) {
 		fprintf(stderr, "malloc: %s\n", strerror(errno));
 		exit(1);
 	}
-	for (i = 0; i < p.bufsiz; i++) {
+	for (i = 0; i < p.buflen; i++) {
 		/*
 		 * there is no overflow, as we know that the maximum
 		 * codepoint is 0x10FFFF, which is way below 2^31
 		 */
-		p.buf_int32[i] = (utf8proc_int32_t)p.buf[i];
+		p.buf_utf8proc[i] = (utf8proc_int32_t)p.buf[i];
 	}
 
 	printf("%s\n", argv[0]);
 	run_benchmark(libgrapheme, &p, "libgrapheme ", NULL, "comparison",
-	              &baseline, NUM_ITERATIONS, p.bufsiz - 1);
+	              &baseline, NUM_ITERATIONS, p.buflen - 1);
 	run_benchmark(libutf8proc, &p, "libutf8proc ", NULL, "comparison",
-	              &baseline, NUM_ITERATIONS, p.bufsiz - 1);
+	              &baseline, NUM_ITERATIONS, p.buflen - 1);
 
 	free(p.buf);
 
diff --git a/benchmark/utf8-decode.c b/benchmark/utf8-decode.c
@@ -14,30 +14,23 @@
 
 #define NUM_ITERATIONS 100000
 
-#ifdef __has_attribute
-	#if __has_attribute(optnone)
-		void libgrapheme(const void *) __attribute__((optnone));
-		void libutf8proc(const void *) __attribute__((optnone));
-	#endif
-#endif
-
-struct payload {
-	char *buf_char;
-	utf8proc_uint8_t *buf_uint8;
-	size_t bufsiz;
+struct utf8_benchmark_payload {
+	char *buf;
+	utf8proc_uint8_t *buf_utf8proc;
+	size_t buflen;
 };
 
 void
 libgrapheme(const void *payload)
 {
-	const struct payload *p = payload;
+	const struct utf8_benchmark_payload *p = payload;
 	uint_least32_t cp;
 	size_t ret, off;
 
-	for (off = 0; off < p->bufsiz; off += ret) {
-		if ((ret = grapheme_decode_utf8(p->buf_char + off,
-		                                p->bufsiz - off, &cp)) >
-		    (p->bufsiz - off)) {
+	for (off = 0; off < p->buflen; off += ret) {
+		if ((ret = grapheme_decode_utf8(p->buf + off,
+		                                p->buflen - off, &cp)) >
+		    (p->buflen - off)) {
 			break;
 		}
 		(void)cp;
@@ -47,14 +40,14 @@ libgrapheme(const void *payload)
 void
 libutf8proc(const void *payload)
 {
-	const struct payload *p = payload;
+	const struct utf8_benchmark_payload *p = payload;
 	utf8proc_int32_t cp;
 	utf8proc_ssize_t ret;
 	size_t off;
 
-	for (off = 0; off < p->bufsiz; off += (size_t)ret) {
-		if ((ret = utf8proc_iterate(p->buf_uint8 + off,
-		                            (utf8proc_ssize_t)(p->bufsiz - off),
+	for (off = 0; off < p->buflen; off += (size_t)ret) {
+		if ((ret = utf8proc_iterate(p->buf_utf8proc + off,
+		                            (utf8proc_ssize_t)(p->buflen - off),
 				            &cp)) < 0) {
 			break;
 		}
@@ -65,57 +58,38 @@ libutf8proc(const void *payload)
 int
 main(int argc, char *argv[])
 {
-	struct payload p;
-	size_t cpbufsiz, i, off, ret;
-	uint_least32_t *cpbuf;
+	struct utf8_benchmark_payload p;
+	size_t i;
 	double baseline = (double)NAN;
 
 	(void)argc;
 
-	if ((cpbuf = generate_test_buffer(character_test, LEN(character_test),
-	                                  &cpbufsiz)) == NULL) {
-		return 1;
-	}
+	p.buf = generate_utf8_test_buffer(character_test,
+	                                  LEN(character_test),
+	                                  &(p.buflen));
 
-	/* convert cp-buffer to utf8-data (both as char and custom uint8-type) */
-	for (i = 0, p.bufsiz = 0; i < cpbufsiz; i++) {
-		p.bufsiz += grapheme_encode_utf8(cpbuf[i], NULL, 0);
-	}
-	if ((p.buf_char = malloc(p.bufsiz)) == NULL) {
-		fprintf(stderr, "malloc: %s\n", strerror(errno));
-		exit(1);
-	}
-	for (i = 0, off = 0; i < cpbufsiz; i++, off += ret) {
-		if ((ret = grapheme_encode_utf8(cpbuf[i], p.buf_char + off,
-		                                p.bufsiz - off)) >
-		    (p.bufsiz - off)) {
-			/* shouldn't happen */
-			fprintf(stderr, "Error while converting buffer.\n");
-			exit(1);
-		}
-	}
-	if ((p.buf_uint8 = malloc(p.bufsiz)) == NULL) {	
+	/* convert cp-buffer to stupid custom libutf8proc-uint8-type */
+	if ((p.buf_utf8proc = malloc(p.buflen)) == NULL) {
 		fprintf(stderr, "malloc: %s\n", strerror(errno));
 		exit(1);
 	}
-	for (i = 0; i < p.bufsiz; i++) {
+	for (i = 0; i < p.buflen; i++) {
 		/* 
 		 * even if char is larger than 8 bit, it will only have
 		 * any of the first 8 bits set (by construction).
 		 */
-		p.buf_uint8[i] = (utf8proc_uint8_t)p.buf_char[i];
+		p.buf_utf8proc[i] = (utf8proc_uint8_t)p.buf[i];
 	}
 
 	printf("%s\n", argv[0]);
 	run_benchmark(libgrapheme, &p, "libgrapheme ", NULL,
-	              "byte", &baseline, NUM_ITERATIONS, p.bufsiz);
+	              "byte", &baseline, NUM_ITERATIONS, p.buflen);
 	run_benchmark(libutf8proc, &p, "libutf8proc ",
 	              "but unsafe (does not detect overlong encodings)",
-	              "byte", &baseline, NUM_ITERATIONS, p.bufsiz);
+	              "byte", &baseline, NUM_ITERATIONS, p.buflen);
 
-	free(cpbuf);
-	free(p.buf_char);
-	free(p.buf_uint8);
+	free(p.buf);
+	free(p.buf_utf8proc);
 
 	return 0;
 }
diff --git a/benchmark/util.c b/benchmark/util.c
@@ -5,22 +5,23 @@
 #include <time.h>
 
 #include "../gen/types.h"
+#include "../grapheme.h"
 #include "util.h"
 
 uint_least32_t *
-generate_test_buffer(const struct break_test *test, size_t testlen,
-                     size_t *bufsiz)
+generate_cp_test_buffer(const struct break_test *test, size_t testlen,
+                        size_t *buflen)
 {
 	size_t i, j, off;
 	uint_least32_t *buf;
 
 	/* allocate and generate buffer */
-	for (i = 0, *bufsiz = 0; i < testlen; i++) {
-		*bufsiz += test[i].cplen;
+	for (i = 0, *buflen = 0; i < testlen; i++) {
+		*buflen += test[i].cplen;
 	}
-	if (!(buf = calloc(*bufsiz, sizeof(*buf)))) {
+	if (!(buf = calloc(*buflen, sizeof(*buf)))) {
 		fprintf(stderr, "generate_test_buffer: calloc: Out of memory.\n");
-		return NULL;
+		exit(1);
 	}
 	for (i = 0, off = 0; i < testlen; i++) {
 		for (j = 0; j < test[i].cplen; j++) {
@@ -32,6 +33,42 @@ generate_test_buffer(const struct break_test *test, size_t testlen,
 	return buf;
 }
 
+char *
+generate_utf8_test_buffer(const struct break_test *test, size_t testlen,
+                          size_t *buflen)
+{
+	size_t i, j, off, ret;
+	char *buf;
+
+	/* allocate and generate buffer */
+	for (i = 0, *buflen = 0; i < testlen; i++) {
+		for (j = 0; j < test[i].cplen; j++) {
+			*buflen += grapheme_encode_utf8(test[i].cp[j], NULL, 0);
+		}
+	}
+	(*buflen)++; /* terminating NUL-byte */
+	if (!(buf = malloc(*buflen))) {
+		fprintf(stderr, "generate_test_buffer: malloc: Out of memory.\n");
+		exit(1);
+	}
+	for (i = 0, off = 0; i < testlen; i++) {
+		for (j = 0; j < test[i].cplen; j++, off += ret) {
+			if ((ret = grapheme_encode_utf8(test[i].cp[j],
+			                                buf + off,
+			                                *buflen - off)) >
+			    (*buflen - off)) {
+				/* shouldn't happen */
+				fprintf(stderr, "generate_utf8_test_buffer: "
+				        "Buffer too small.\n");
+				exit(1);
+			}
+		}
+	}
+	buf[*buflen - 1] = '\0';
+
+	return buf;
+}
+
 static double
 time_diff(struct timespec *a, struct timespec *b)
 {
diff --git a/benchmark/util.h b/benchmark/util.h
@@ -6,8 +6,17 @@
 
 #define LEN(x) (sizeof(x) / sizeof(*(x)))
 
-uint_least32_t *generate_test_buffer(const struct break_test *, size_t,
-                                     size_t *);
+#ifdef __has_attribute
+	#if __has_attribute(optnone)
+		void libgrapheme(const void *) __attribute__((optnone));
+		void libutf8proc(const void *) __attribute__((optnone));
+	#endif
+#endif
+
+uint_least32_t *generate_cp_test_buffer(const struct break_test *, size_t,
+                                        size_t *);
+char *generate_utf8_test_buffer(const struct break_test *, size_t, size_t *);
+
 void run_benchmark(void (*func)(const void *), const void *, const char *,
                    const char *, const char *, double *, size_t, size_t);

	libgrapheme unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log | Files | Refs | README | LICENSE

M	benchmark/character.c	|	41	+++++++++++++++++------------------------
M	benchmark/utf8-decode.c	|	78	++++++++++++++++++++++++++----------------------------------------------------
M	benchmark/util.c	|	49	+++++++++++++++++++++++++++++++++++++++++++------
M	benchmark/util.h	|	13	+++++++++++--