n8.c (10320B)
1 #include <u.h> 2 #include "tdef.h" 3 #include "fns.h" 4 #include "ext.h" 5 6 #define HY_BIT 0200 /* stuff in here only works for 7-bit ascii */ 7 /* this value is used (as a literal) in suftab.c */ 8 /* to encode possible hyphenation points in suffixes. */ 9 /* it could be changed, by widening the tables */ 10 /* to be shorts instead of chars. */ 11 12 /* 13 * troff8.c 14 * 15 * hyphenation 16 */ 17 18 int hexsize = 0; /* hyphenation exception list size */ 19 char *hbufp = NULL; /* base of list */ 20 char *nexth = NULL; /* first free slot in list */ 21 Tchar *hyend; 22 23 #define THRESH 160 /* digram goodness threshold */ 24 int thresh = THRESH; 25 26 int texhyphen(void); 27 static int alpha(Tchar); 28 29 void hyphen(Tchar *wp) 30 { 31 int j; 32 Tchar *i; 33 34 i = wp; 35 while (punct((*i++))) 36 ; 37 if (!alpha(*--i)) 38 return; 39 wdstart = i++; 40 while (alpha(*i++)) 41 ; 42 hyend = wdend = --i - 1; 43 while (punct((*i++))) 44 ; 45 if (*--i) 46 return; 47 if (wdend - wdstart < 4) /* 4 chars is too short to hyphenate */ 48 return; 49 hyp = hyptr; 50 *hyp = 0; 51 hyoff = 2; 52 53 /* for now, try exceptions first, then tex (if hyphalg is non-zero), 54 then suffix and digram if tex didn't hyphenate it at all. 55 */ 56 57 if (!exword() && !texhyphen() && !suffix()) 58 digram(); 59 60 /* this appears to sort hyphenation points into increasing order */ 61 *hyp++ = 0; 62 if (*hyptr) 63 for (j = 1; j; ) { 64 j = 0; 65 for (hyp = hyptr + 1; *hyp != 0; hyp++) { 66 if (*(hyp - 1) > *hyp) { 67 j++; 68 i = *hyp; 69 *hyp = *(hyp - 1); 70 *(hyp - 1) = i; 71 } 72 } 73 } 74 } 75 76 static int alpha(Tchar i) /* non-zero if really alphabetic */ 77 { 78 if (ismot(i)) 79 return 0; 80 else if (cbits(i) >= ALPHABET) /* this isn't very elegant, but there's */ 81 return 0; /* no good way to make sure i is in range for */ 82 else /* the call of isalpha */ 83 return isalpha(cbits(i)); 84 } 85 86 int 87 punct(Tchar i) 88 { 89 if (!i || alpha(i)) 90 return(0); 91 else 92 return(1); 93 } 94 95 96 void caseha(void) /* set hyphenation algorithm */ 97 { 98 hyphalg = HYPHALG; 99 if (skip()) 100 return; 101 noscale++; 102 hyphalg = atoi0(); 103 noscale = 0; 104 } 105 106 107 void caseht(void) /* set hyphenation threshold; not in manual! */ 108 { 109 thresh = THRESH; 110 if (skip()) 111 return; 112 noscale++; 113 thresh = atoi0(); 114 noscale = 0; 115 } 116 117 118 char *growh(char *where) 119 { 120 char *new; 121 122 hexsize += NHEX; 123 if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL) 124 return NULL; 125 if (new == hbufp) { 126 return where; 127 } else { 128 int diff; 129 diff = where - hbufp; 130 hbufp = new; 131 return new + diff; 132 } 133 } 134 135 136 void casehw(void) 137 { 138 int i, k; 139 char *j; 140 Tchar t; 141 142 if (nexth == NULL) { 143 if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) { 144 ERROR "No space for exception word list." WARN; 145 return; 146 } 147 hexsize = NHEX; 148 } 149 k = 0; 150 while (!skip()) { 151 if ((j = nexth) >= hbufp + hexsize - 2) 152 if ((j = nexth = growh(j)) == NULL) 153 goto full; 154 for (;;) { 155 if (ismot(t = getch())) 156 continue; 157 i = cbits(t); 158 if (i == ' ' || i == '\n') { 159 *j++ = 0; 160 nexth = j; 161 *j = 0; 162 if (i == ' ') 163 break; 164 else 165 return; 166 } 167 if (i == '-') { 168 k = HY_BIT; 169 continue; 170 } 171 *j++ = maplow(i) | k; 172 k = 0; 173 if (j >= hbufp + hexsize - 2) 174 if ((j = growh(j)) == NULL) 175 goto full; 176 } 177 } 178 return; 179 full: 180 ERROR "Cannot grow exception word list." WARN; 181 *nexth = 0; 182 } 183 184 185 int exword(void) 186 { 187 Tchar *w; 188 char *e, *save; 189 190 e = hbufp; 191 while (1) { 192 save = e; 193 if (e == NULL || *e == 0) 194 return(0); 195 w = wdstart; 196 while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) { 197 e++; 198 w++; 199 } 200 if (!*e) { 201 if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) { 202 w = wdstart; 203 for (e = save; *e; e++) { 204 if (*e & HY_BIT) 205 *hyp++ = w; 206 if (hyp > hyptr + NHYP - 1) 207 hyp = hyptr + NHYP - 1; 208 w++; 209 } 210 return(1); 211 } else { 212 e++; 213 continue; 214 } 215 } else 216 while (*e++) 217 ; 218 } 219 } 220 221 int 222 suffix(void) 223 { 224 Tchar *w; 225 char *s, *s0; 226 Tchar i; 227 extern char *suftab[]; 228 229 again: 230 i = cbits(*hyend); 231 if (!alpha(i)) 232 return(0); 233 if (i < 'a') 234 i -= 'A' - 'a'; 235 if ((s0 = suftab[i-'a']) == 0) 236 return(0); 237 for (;;) { 238 if ((i = *s0 & 017) == 0) 239 return(0); 240 s = s0 + i - 1; 241 w = hyend - 1; 242 while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) { 243 s--; 244 w--; 245 } 246 if (s == s0) 247 break; 248 s0 += i; 249 } 250 s = s0 + i - 1; 251 w = hyend; 252 if (*s0 & HY_BIT) 253 goto mark; 254 while (s > s0) { 255 w--; 256 if (*s-- & HY_BIT) { 257 mark: 258 hyend = w - 1; 259 if (*s0 & 0100) /* 0100 used in suftab to encode something too */ 260 continue; 261 if (!chkvow(w)) 262 return(0); 263 *hyp++ = w; 264 } 265 } 266 if (*s0 & 040) 267 return(0); 268 if (exword()) 269 return(1); 270 goto again; 271 } 272 273 int 274 maplow(int i) 275 { 276 if (isupper(i)) 277 i = tolower(i); 278 return(i); 279 } 280 281 int 282 vowel(int i) 283 { 284 switch (i) { 285 case 'a': case 'A': 286 case 'e': case 'E': 287 case 'i': case 'I': 288 case 'o': case 'O': 289 case 'u': case 'U': 290 case 'y': case 'Y': 291 return(1); 292 default: 293 return(0); 294 } 295 } 296 297 298 Tchar *chkvow(Tchar *w) 299 { 300 while (--w >= wdstart) 301 if (vowel(cbits(*w))) 302 return(w); 303 return(0); 304 } 305 306 307 void digram(void) 308 { 309 Tchar *w; 310 int val; 311 Tchar *nhyend, *maxw; 312 int maxval; 313 extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13]; 314 maxw = 0; 315 again: 316 if (!(w = chkvow(hyend + 1))) 317 return; 318 hyend = w; 319 if (!(w = chkvow(hyend))) 320 return; 321 nhyend = w; 322 maxval = 0; 323 w--; 324 while (++w < hyend && w < wdend - 1) { 325 val = 1; 326 if (w == wdstart) 327 val *= dilook('a', cbits(*w), bxh); 328 else if (w == wdstart + 1) 329 val *= dilook(cbits(*(w-1)), cbits(*w), bxxh); 330 else 331 val *= dilook(cbits(*(w-1)), cbits(*w), xxh); 332 val *= dilook(cbits(*w), cbits(*(w+1)), xhx); 333 val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx); 334 if (val > maxval) { 335 maxval = val; 336 maxw = w + 1; 337 } 338 } 339 hyend = nhyend; 340 if (maxval > thresh) 341 *hyp++ = maxw; 342 goto again; 343 } 344 345 int 346 dilook(int a, int b, char t[26][13]) 347 { 348 int i, j; 349 350 i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2]; 351 if (!(j & 01)) 352 i >>= 4; 353 return(i & 017); 354 } 355 356 357 /* here beginneth the tex hyphenation code, as interpreted freely */ 358 /* the main difference is that there is no attempt to squeeze space */ 359 /* as tightly at tex does. */ 360 361 static int texit(Tchar *, Tchar *); 362 static int readpats(void); 363 static void install(char *); 364 static void fixup(void); 365 static int trieindex(int, int); 366 367 static char pats[50000]; /* size ought to be computed dynamically */ 368 static char *nextpat = pats; 369 static char *trie[27*27]; /* english-specific sizes */ 370 371 int texhyphen(void) 372 { 373 static int loaded = 0; /* -1: couldn't find tex file */ 374 375 if (hyphalg == 0 || loaded == -1) /* non-zero => tex for now */ 376 return 0; 377 if (loaded == 0) { 378 if (readpats()) 379 loaded = 1; 380 else 381 loaded = -1; 382 } 383 return texit(wdstart, wdend); 384 } 385 386 static int texit(Tchar *start, Tchar *end) /* hyphenate as in tex, return # found */ 387 { 388 int nw, i, k, equal, cnt[500]; 389 char w[500+1], *np, *pp, *wp, *xpp, *xwp; 390 391 w[0] = '.'; 392 for (nw = 1; start <= end && nw < 500-1; nw++, start++) 393 w[nw] = maplow(tolower(cbits(*start))); 394 start -= (nw - 1); 395 w[nw++] = '.'; 396 w[nw] = 0; 397 /* 398 * printf("try %s\n", w); 399 */ 400 for (i = 0; i <= nw; i++) 401 cnt[i] = '0'; 402 403 for (wp = w; wp+1 < w+nw; wp++) { 404 for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) { 405 if (pp == 0 /* no trie entry */ 406 || *pp != *wp /* no match on 1st letter */ 407 || *(pp+1) != *(wp+1)) /* no match on 2nd letter */ 408 break; /* so move to next letter of word */ 409 equal = 1; 410 for (xpp = pp+2, xwp = wp+2; *xpp; ) 411 if (*xpp++ != *xwp++) { 412 equal = 0; 413 break; 414 } 415 if (equal) { 416 np = xpp+1; /* numpat */ 417 for (k = wp-w; *np; k++, np++) 418 if (*np > cnt[k]) 419 cnt[k] = *np; 420 /* 421 * printf("match: %s %s\n", pp, xpp+1); 422 */ 423 } 424 pp += *(pp-1); /* skip over pattern and numbers to next */ 425 } 426 } 427 /* 428 * for (i = 0; i < nw; i++) printf("%c", w[i]); 429 * printf(" "); 430 * for (i = 0; i <= nw; i++) printf("%c", cnt[i]); 431 * printf("\n"); 432 */ 433 /* 434 * for (i = 1; i < nw - 1; i++) { 435 * if (i > 2 && i < nw - 3 && cnt[i] % 2) 436 * printf("-"); 437 * if (cbits(start[i-1]) != '.') 438 * printf("%c", cbits(start[i-1])); 439 * } 440 * printf("\n"); 441 */ 442 for (i = 1; i < nw -1; i++) 443 if (i > 2 && i < nw - 3 && cnt[i] % 2) 444 *hyp++ = start + i - 1; 445 return hyp - hyptr; /* non-zero if a hyphen was found */ 446 } 447 448 /* 449 This code assumes that hyphen.tex looks like 450 % some comments 451 \patterns{ % more comments 452 pat5ter4ns, 1 per line, SORTED, nothing else 453 } 454 more goo 455 \hyphenation{ % more comments 456 ex-cep-tions, one per line; i ignore this part for now 457 } 458 459 this code is NOT robust against variations. unfortunately, 460 it looks like every local language version of this file has 461 a different format. i have also made no provision for weird 462 characters. sigh. 463 */ 464 465 static int readpats(void) 466 { 467 FILE *fp; 468 char buf[200], buf1[200]; 469 470 if ((fp = fopen(unsharp(TEXHYPHENS), "r")) == NULL 471 && (fp = fopen(unsharp(DWBalthyphens), "r")) == NULL) { 472 ERROR "warning: can't find hyphen.tex" WARN; 473 return 0; 474 } 475 476 while (fgets(buf, sizeof buf, fp) != NULL) { 477 sscanf(buf, "%s", buf1); 478 if (strcmp(buf1, "\\patterns{") == 0) 479 break; 480 } 481 while (fgets(buf, sizeof buf, fp) != NULL) { 482 if (buf[0] == '}') 483 break; 484 install(buf); 485 } 486 fclose(fp); 487 fixup(); 488 return 1; 489 } 490 491 static void install(char *s) /* map ab4c5de to: 12 abcde \0 00405 \0 */ 492 { 493 int npat, lastpat; 494 char num[500], *onextpat = nextpat; 495 496 num[0] = '0'; 497 *nextpat++ = ' '; /* fill in with count later */ 498 for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) { 499 if (isdigit((uchar)*s)) { 500 num[npat] = *s; 501 lastpat = npat; 502 } else { 503 *nextpat++ = *s; 504 npat++; 505 num[npat] = '0'; 506 } 507 } 508 *nextpat++ = 0; 509 if (nextpat > pats + sizeof(pats)-20) { 510 ERROR "tex hyphenation table overflow, tail end ignored" WARN; 511 nextpat = onextpat; 512 } 513 num[lastpat+1] = 0; 514 strcat(nextpat, num); 515 nextpat += strlen(nextpat) + 1; 516 } 517 518 static void fixup(void) /* build indexes of where . a b c ... start */ 519 { 520 char *p, *lastc; 521 int n; 522 523 for (lastc = pats, p = pats+1; p < nextpat; p++) 524 if (*p == ' ') { 525 *lastc = p - lastc; 526 lastc = p; 527 } 528 *lastc = p - lastc; 529 for (p = pats+1; p < nextpat; ) { 530 n = trieindex(p[0], p[1]); 531 if (trie[n] == 0) 532 trie[n] = p; 533 p += p[-1]; 534 } 535 /* printf("pats = %d\n", nextpat - pats); */ 536 } 537 538 static int trieindex(int d1, int d2) 539 { 540 int z; 541 542 z = 27 * (d1 == '.' ? 0 : d1 - 'a' + 1) + (d2 == '.' ? 0 : d2 - 'a' + 1); 543 assert(z >= 0 && z < 27*27); 544 return z; 545 }