libgrapheme

Unicode string library
git clone git://git.suckless.org/libgrapheme
Log | Files | Refs | README | LICENSE

word.c (3038B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <stdio.h>
      3 #include <stdlib.h>
      4 #include <string.h>
      5 
      6 #include "util.h"
      7 
      8 #define FILE_EMOJI "data/emoji-data.txt"
      9 #define FILE_WORD  "data/WordBreakProperty.txt"
     10 
     11 static const struct property_spec word_break_property[] = {
     12 	{
     13 		.enumname = "OTHER",
     14 		.file = NULL,
     15 		.ucdname = NULL,
     16 	},
     17 	{
     18 		.enumname = "ALETTER",
     19 		.file = FILE_WORD,
     20 		.ucdname = "ALetter",
     21 	},
     22 	{
     23 		.enumname = "BOTH_ALETTER_EXTPICT",
     24 		.file = NULL,
     25 		.ucdname = NULL,
     26 	},
     27 	{
     28 		.enumname = "CR",
     29 		.file = FILE_WORD,
     30 		.ucdname = "CR",
     31 	},
     32 	{
     33 		.enumname = "DOUBLE_QUOTE",
     34 		.file = FILE_WORD,
     35 		.ucdname = "Double_Quote",
     36 	},
     37 	{
     38 		.enumname = "EXTEND",
     39 		.file = FILE_WORD,
     40 		.ucdname = "Extend",
     41 	},
     42 	{
     43 		.enumname = "EXTENDED_PICTOGRAPHIC",
     44 		.file = FILE_EMOJI,
     45 		.ucdname = "Extended_Pictographic",
     46 	},
     47 	{
     48 		.enumname = "EXTENDNUMLET",
     49 		.file = FILE_WORD,
     50 		.ucdname = "ExtendNumLet",
     51 	},
     52 	{
     53 		.enumname = "FORMAT",
     54 		.file = FILE_WORD,
     55 		.ucdname = "Format",
     56 	},
     57 	{
     58 		.enumname = "HEBREW_LETTER",
     59 		.file = FILE_WORD,
     60 		.ucdname = "Hebrew_Letter",
     61 	},
     62 	{
     63 		.enumname = "KATAKANA",
     64 		.file = FILE_WORD,
     65 		.ucdname = "Katakana",
     66 	},
     67 	{
     68 		.enumname = "LF",
     69 		.file = FILE_WORD,
     70 		.ucdname = "LF",
     71 	},
     72 	{
     73 		.enumname = "MIDLETTER",
     74 		.file = FILE_WORD,
     75 		.ucdname = "MidLetter",
     76 	},
     77 	{
     78 		.enumname = "MIDNUM",
     79 		.file = FILE_WORD,
     80 		.ucdname = "MidNum",
     81 	},
     82 	{
     83 		.enumname = "MIDNUMLET",
     84 		.file = FILE_WORD,
     85 		.ucdname = "MidNumLet",
     86 	},
     87 	{
     88 		.enumname = "NEWLINE",
     89 		.file = FILE_WORD,
     90 		.ucdname = "Newline",
     91 	},
     92 	{
     93 		.enumname = "NUMERIC",
     94 		.file = FILE_WORD,
     95 		.ucdname = "Numeric",
     96 	},
     97 	{
     98 		.enumname = "REGIONAL_INDICATOR",
     99 		.file = FILE_WORD,
    100 		.ucdname = "Regional_Indicator",
    101 	},
    102 	{
    103 		.enumname = "SINGLE_QUOTE",
    104 		.file = FILE_WORD,
    105 		.ucdname = "Single_Quote",
    106 	},
    107 	{
    108 		.enumname = "WSEGSPACE",
    109 		.file = FILE_WORD,
    110 		.ucdname = "WSegSpace",
    111 	},
    112 	{
    113 		.enumname = "ZWJ",
    114 		.file = FILE_WORD,
    115 		.ucdname = "ZWJ",
    116 	},
    117 };
    118 
    119 static uint_least8_t
    120 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
    121 {
    122 	uint_least8_t result;
    123 
    124 	(void)cp;
    125 
    126 	if ((!strcmp(word_break_property[prop1].enumname, "ALETTER") &&
    127 	     !strcmp(word_break_property[prop2].enumname,
    128 	             "EXTENDED_PICTOGRAPHIC")) ||
    129 	    (!strcmp(word_break_property[prop1].enumname,
    130 	             "EXTENDED_PICTOGRAPHIC") &&
    131 	     !strcmp(word_break_property[prop2].enumname, "ALETTER"))) {
    132 		for (result = 0; result < LEN(word_break_property); result++) {
    133 			if (!strcmp(word_break_property[result].enumname,
    134 			            "BOTH_ALETTER_EXTPICT")) {
    135 				break;
    136 			}
    137 		}
    138 		if (result == LEN(word_break_property)) {
    139 			fprintf(stderr, "handle_conflict: Internal error.\n");
    140 			exit(1);
    141 		}
    142 	} else {
    143 		fprintf(stderr, "handle_conflict: Cannot handle conflict.\n");
    144 		exit(1);
    145 	}
    146 
    147 	return result;
    148 }
    149 
    150 int
    151 main(int argc, char *argv[])
    152 {
    153 	(void)argc;
    154 
    155 	properties_generate_break_property(
    156 		word_break_property, LEN(word_break_property), NULL,
    157 		handle_conflict, NULL, "word_break", argv[0]);
    158 
    159 	return 0;
    160 }