gbt.awk (1332B)
# See LICENSE file for copyright and license details.

# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
#
# Turns grapheme break test data (see the URL above) into C source text:
# a "struct test" definition followed by a "static const struct test t[]"
# array holding one initializer per input test line.
#
# Each data line is expected to look like
#
#     ÷ 0020 × 0308 ÷ 0020 ÷  #  <description>
#
# i.e. whitespace-separated fields that alternate between a boundary
# marker ("÷" = break, "×" = no break) and a hexadecimal code point,
# followed by a "#" field that starts a free-text description.
BEGIN {
	FS = " ";

	printf("struct test {\n\tuint32_t *cp;\n\tsize_t cplen;\n");
	printf("\tsize_t *len;\n\tsize_t lenlen;\n\tchar *descr;\n};\n\n");
	printf("static const struct test t[] = {\n");
}

# Skip comment lines and blank lines.  The POSIX class [[:space:]] is
# used instead of "\s": the latter is a GNU awk extension that other awk
# implementations may interpret as a literal "s", letting whitespace-only
# lines through to the rule below.
/^#/ || /^[[:space:]]*$/ { next }

{
	ncps = 0;
	nlens = 0;

	# Field 1 is the leading boundary marker and is skipped.  From
	# field 2 on, even fields are code points and odd fields are
	# boundary markers.  curlen counts the code points in the cluster
	# currently being built; it starts at 1 for the first code point.
	curlen = 1;
	for (i = 2; i <= NF; i++) {
		# The field directly before "#" is the trailing boundary
		# marker; stop here and close the last cluster after the
		# loop instead.
		if ($(i + 1) == "#") {
			break;
		}
		if (i % 2 == 0) {
			# code point
			cp[ncps++] = tolower($i);
		} else {
			# break information
			if ($i == "÷") {
				# break: store the finished cluster, start a new one
				len[nlens++] = curlen;
				curlen = 1;
			} else { # $i == "×"
				# no break: the next code point joins the cluster
				curlen++;
			}
		}
	}
	# close the final cluster
	len[nlens++] = curlen;

	# print code points
	printf("\t{\n\t\t.cp = (uint32_t[]){ ");
	for (i = 0; i < ncps; i++) {
		printf("UINT32_C(0x%s)", cp[i]);
		if (i + 1 < ncps) {
			printf(", ");
		}
	}
	printf(" },\n\t\t.cplen = %d,\n", ncps);

	# print grapheme cluster lengths
	printf("\t\t.len = (size_t[]){ ");
	for (i = 0; i < nlens; i++) {
		printf("%s", len[i]);
		if (i + 1 < nlens) {
			printf(", ");
		}
	}
	printf(" },\n\t\t.lenlen = %d,\n", nlens);

	# print testcase description: the text starting three characters
	# after the first "#" on the line
	printf("\t\t.descr = \"%s\",\n", substr($0, index($0, "#") + 3));

	printf("\t},\n");
}

END {
	printf("};\n");
}