word.c (3038B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <string.h> 5 6 #include "util.h" 7 8 #define FILE_EMOJI "data/emoji-data.txt" 9 #define FILE_WORD "data/WordBreakProperty.txt" 10 11 static const struct property_spec word_break_property[] = { 12 { 13 .enumname = "OTHER", 14 .file = NULL, 15 .ucdname = NULL, 16 }, 17 { 18 .enumname = "ALETTER", 19 .file = FILE_WORD, 20 .ucdname = "ALetter", 21 }, 22 { 23 .enumname = "BOTH_ALETTER_EXTPICT", 24 .file = NULL, 25 .ucdname = NULL, 26 }, 27 { 28 .enumname = "CR", 29 .file = FILE_WORD, 30 .ucdname = "CR", 31 }, 32 { 33 .enumname = "DOUBLE_QUOTE", 34 .file = FILE_WORD, 35 .ucdname = "Double_Quote", 36 }, 37 { 38 .enumname = "EXTEND", 39 .file = FILE_WORD, 40 .ucdname = "Extend", 41 }, 42 { 43 .enumname = "EXTENDED_PICTOGRAPHIC", 44 .file = FILE_EMOJI, 45 .ucdname = "Extended_Pictographic", 46 }, 47 { 48 .enumname = "EXTENDNUMLET", 49 .file = FILE_WORD, 50 .ucdname = "ExtendNumLet", 51 }, 52 { 53 .enumname = "FORMAT", 54 .file = FILE_WORD, 55 .ucdname = "Format", 56 }, 57 { 58 .enumname = "HEBREW_LETTER", 59 .file = FILE_WORD, 60 .ucdname = "Hebrew_Letter", 61 }, 62 { 63 .enumname = "KATAKANA", 64 .file = FILE_WORD, 65 .ucdname = "Katakana", 66 }, 67 { 68 .enumname = "LF", 69 .file = FILE_WORD, 70 .ucdname = "LF", 71 }, 72 { 73 .enumname = "MIDLETTER", 74 .file = FILE_WORD, 75 .ucdname = "MidLetter", 76 }, 77 { 78 .enumname = "MIDNUM", 79 .file = FILE_WORD, 80 .ucdname = "MidNum", 81 }, 82 { 83 .enumname = "MIDNUMLET", 84 .file = FILE_WORD, 85 .ucdname = "MidNumLet", 86 }, 87 { 88 .enumname = "NEWLINE", 89 .file = FILE_WORD, 90 .ucdname = "Newline", 91 }, 92 { 93 .enumname = "NUMERIC", 94 .file = FILE_WORD, 95 .ucdname = "Numeric", 96 }, 97 { 98 .enumname = "REGIONAL_INDICATOR", 99 .file = FILE_WORD, 100 .ucdname = "Regional_Indicator", 101 }, 102 { 103 .enumname = "SINGLE_QUOTE", 104 .file = FILE_WORD, 105 .ucdname = "Single_Quote", 106 }, 107 { 108 .enumname = "WSEGSPACE", 109 .file = FILE_WORD, 110 .ucdname = "WSegSpace", 111 }, 112 { 113 .enumname = "ZWJ", 114 .file = FILE_WORD, 115 .ucdname = "ZWJ", 116 }, 117 }; 118 119 static uint_least8_t 120 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2) 121 { 122 uint_least8_t result; 123 124 (void)cp; 125 126 if ((!strcmp(word_break_property[prop1].enumname, "ALETTER") && 127 !strcmp(word_break_property[prop2].enumname, 128 "EXTENDED_PICTOGRAPHIC")) || 129 (!strcmp(word_break_property[prop1].enumname, 130 "EXTENDED_PICTOGRAPHIC") && 131 !strcmp(word_break_property[prop2].enumname, "ALETTER"))) { 132 for (result = 0; result < LEN(word_break_property); result++) { 133 if (!strcmp(word_break_property[result].enumname, 134 "BOTH_ALETTER_EXTPICT")) { 135 break; 136 } 137 } 138 if (result == LEN(word_break_property)) { 139 fprintf(stderr, "handle_conflict: Internal error.\n"); 140 exit(1); 141 } 142 } else { 143 fprintf(stderr, "handle_conflict: Cannot handle conflict.\n"); 144 exit(1); 145 } 146 147 return result; 148 } 149 150 int 151 main(int argc, char *argv[]) 152 { 153 (void)argc; 154 155 properties_generate_break_property( 156 word_break_property, LEN(word_break_property), NULL, 157 handle_conflict, NULL, "word_break", argv[0]); 158 159 return 0; 160 }