9base

revived minimalist port of Plan 9 userland to Unix
git clone git://git.suckless.org/9base
Log | Files | Refs | README | LICENSE

n8.c (10320B)


      1 #include <u.h>
      2 #include "tdef.h"
      3 #include "fns.h"
      4 #include "ext.h"
      5 
      6 #define	HY_BIT	0200	/* stuff in here only works for 7-bit ascii */
      7 			/* this value is used (as a literal) in suftab.c */
      8 			/* to encode possible hyphenation points in suffixes. */
      9 			/* it could be changed, by widening the tables */
     10 			/* to be shorts instead of chars. */
     11 
     12 /*
     13  * troff8.c
     14  * 
     15  * hyphenation
     16  */
     17 
     18 int	hexsize = 0;		/* hyphenation exception list size */
     19 char	*hbufp = NULL;		/* base of list */
     20 char	*nexth = NULL;		/* first free slot in list */
     21 Tchar	*hyend;
     22 
     23 #define THRESH 160 		/* digram goodness threshold */
     24 int	thresh = THRESH;
     25 
     26 int	texhyphen(void);
     27 static	int	alpha(Tchar);
     28 
     29 void hyphen(Tchar *wp)
     30 {
     31 	int j;
     32 	Tchar *i;
     33 
     34 	i = wp;
     35 	while (punct((*i++)))
     36 		;
     37 	if (!alpha(*--i))
     38 		return;
     39 	wdstart = i++;
     40 	while (alpha(*i++))
     41 		;
     42 	hyend = wdend = --i - 1;
     43 	while (punct((*i++)))
     44 		;
     45 	if (*--i)
     46 		return;
     47 	if (wdend - wdstart < 4)	/* 4 chars is too short to hyphenate */
     48 		return;
     49 	hyp = hyptr;
     50 	*hyp = 0;
     51 	hyoff = 2;
     52 
     53 	/* for now, try exceptions first, then tex (if hyphalg is non-zero),
     54 	   then suffix and digram if tex didn't hyphenate it at all.
     55 	*/
     56 
     57 	if (!exword() && !texhyphen() && !suffix())
     58 		digram();
     59 
     60 	/* this appears to sort hyphenation points into increasing order */
     61 	*hyp++ = 0;
     62 	if (*hyptr) 
     63 		for (j = 1; j; ) {
     64 			j = 0;
     65 			for (hyp = hyptr + 1; *hyp != 0; hyp++) {
     66 				if (*(hyp - 1) > *hyp) {
     67 					j++;
     68 					i = *hyp;
     69 					*hyp = *(hyp - 1);
     70 					*(hyp - 1) = i;
     71 				}
     72 			}
     73 		}
     74 }
     75 
     76 static int alpha(Tchar i)	/* non-zero if really alphabetic */
     77 {
     78 	if (ismot(i))
     79 		return 0;
     80 	else if (cbits(i) >= ALPHABET)	/* this isn't very elegant, but there's */
     81 		return 0;		/* no good way to make sure i is in range for */
     82 	else				/* the call of isalpha */
     83 		return isalpha(cbits(i));
     84 }
     85 
     86 int
     87 punct(Tchar i)
     88 {
     89 	if (!i || alpha(i))
     90 		return(0);
     91 	else
     92 		return(1);
     93 }
     94 
     95 
     96 void caseha(void)	/* set hyphenation algorithm */
     97 {
     98 	hyphalg = HYPHALG;
     99 	if (skip())
    100 		return;
    101 	noscale++;
    102 	hyphalg = atoi0();
    103 	noscale = 0;
    104 }
    105 
    106 
    107 void caseht(void)	/* set hyphenation threshold;  not in manual! */
    108 {
    109 	thresh = THRESH;
    110 	if (skip())
    111 		return;
    112 	noscale++;
    113 	thresh = atoi0();
    114 	noscale = 0;
    115 }
    116 
    117 
    118 char *growh(char *where)
    119 {
    120 	char *new;
    121 
    122 	hexsize += NHEX;
    123 	if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL)
    124 		return NULL;
    125 	if (new == hbufp) {
    126 		return where;
    127 	} else {
    128 		int diff;
    129 		diff = where - hbufp;
    130 		hbufp = new;
    131 		return new + diff;
    132 	}
    133 }
    134 
    135 
    136 void casehw(void)
    137 {
    138 	int i, k;
    139 	char *j;
    140 	Tchar t;
    141 
    142 	if (nexth == NULL) {
    143 		if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) {
    144 			ERROR "No space for exception word list." WARN;
    145 			return;
    146 		}
    147 		hexsize = NHEX;
    148 	}
    149 	k = 0;
    150 	while (!skip()) {
    151 		if ((j = nexth) >= hbufp + hexsize - 2)
    152 			if ((j = nexth = growh(j)) == NULL)
    153 				goto full;
    154 		for (;;) {
    155 			if (ismot(t = getch()))
    156 				continue;
    157 			i = cbits(t);
    158 			if (i == ' ' || i == '\n') {
    159 				*j++ = 0;
    160 				nexth = j;
    161 				*j = 0;
    162 				if (i == ' ')
    163 					break;
    164 				else
    165 					return;
    166 			}
    167 			if (i == '-') {
    168 				k = HY_BIT;
    169 				continue;
    170 			}
    171 			*j++ = maplow(i) | k;
    172 			k = 0;
    173 			if (j >= hbufp + hexsize - 2)
    174 				if ((j = growh(j)) == NULL)
    175 					goto full;
    176 		}
    177 	}
    178 	return;
    179 full:
    180 	ERROR "Cannot grow exception word list." WARN;
    181 	*nexth = 0;
    182 }
    183 
    184 
    185 int exword(void)
    186 {
    187 	Tchar *w;
    188 	char *e, *save;
    189 
    190 	e = hbufp;
    191 	while (1) {
    192 		save = e;
    193 		if (e == NULL || *e == 0)
    194 			return(0);
    195 		w = wdstart;
    196 		while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) {
    197 			e++; 
    198 			w++;
    199 		}
    200 		if (!*e) {
    201 			if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) {
    202 				w = wdstart;
    203 				for (e = save; *e; e++) {
    204 					if (*e & HY_BIT)
    205 						*hyp++ = w;
    206 					if (hyp > hyptr + NHYP - 1)
    207 						hyp = hyptr + NHYP - 1;
    208 					w++;
    209 				}
    210 				return(1);
    211 			} else {
    212 				e++; 
    213 				continue;
    214 			}
    215 		} else 
    216 			while (*e++)
    217 				;
    218 	}
    219 }
    220 
    221 int
    222 suffix(void)
    223 {
    224 	Tchar *w;
    225 	char *s, *s0;
    226 	Tchar i;
    227 	extern char *suftab[];
    228 
    229 again:
    230 	i = cbits(*hyend);
    231 	if (!alpha(i))
    232 		return(0);
    233 	if (i < 'a')
    234 		i -= 'A' - 'a';
    235 	if ((s0 = suftab[i-'a']) == 0)
    236 		return(0);
    237 	for (;;) {
    238 		if ((i = *s0 & 017) == 0)
    239 			return(0);
    240 		s = s0 + i - 1;
    241 		w = hyend - 1;
    242 		while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) {
    243 			s--;
    244 			w--;
    245 		}
    246 		if (s == s0)
    247 			break;
    248 		s0 += i;
    249 	}
    250 	s = s0 + i - 1;
    251 	w = hyend;
    252 	if (*s0 & HY_BIT) 
    253 		goto mark;
    254 	while (s > s0) {
    255 		w--;
    256 		if (*s-- & HY_BIT) {
    257 mark:
    258 			hyend = w - 1;
    259 			if (*s0 & 0100)	/* 0100 used in suftab to encode something too */
    260 				continue;
    261 			if (!chkvow(w))
    262 				return(0);
    263 			*hyp++ = w;
    264 		}
    265 	}
    266 	if (*s0 & 040)
    267 		return(0);
    268 	if (exword())
    269 		return(1);
    270 	goto again;
    271 }
    272 
    273 int
    274 maplow(int i)
    275 {
    276 	if (isupper(i)) 
    277 		i = tolower(i);
    278 	return(i);
    279 }
    280 
    281 int
    282 vowel(int i)
    283 {
    284 	switch (i) {
    285 	case 'a': case 'A':
    286 	case 'e': case 'E':
    287 	case 'i': case 'I':
    288 	case 'o': case 'O':
    289 	case 'u': case 'U':
    290 	case 'y': case 'Y':
    291 		return(1);
    292 	default:
    293 		return(0);
    294 	}
    295 }
    296 
    297 
    298 Tchar *chkvow(Tchar *w)
    299 {
    300 	while (--w >= wdstart)
    301 		if (vowel(cbits(*w)))
    302 			return(w);
    303 	return(0);
    304 }
    305 
    306 
    307 void digram(void)
    308 {
    309 	Tchar *w;
    310 	int val;
    311 	Tchar *nhyend, *maxw;
    312 	int maxval;
    313 	extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13];
    314         maxw = 0;
    315 again:
    316 	if (!(w = chkvow(hyend + 1)))
    317 		return;
    318 	hyend = w;
    319 	if (!(w = chkvow(hyend)))
    320 		return;
    321 	nhyend = w;
    322 	maxval = 0;
    323 	w--;
    324 	while (++w < hyend && w < wdend - 1) {
    325 		val = 1;
    326 		if (w == wdstart)
    327 			val *= dilook('a', cbits(*w), bxh);
    328 		else if (w == wdstart + 1)
    329 			val *= dilook(cbits(*(w-1)), cbits(*w), bxxh);
    330 		else 
    331 			val *= dilook(cbits(*(w-1)), cbits(*w), xxh);
    332 		val *= dilook(cbits(*w), cbits(*(w+1)), xhx);
    333 		val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx);
    334 		if (val > maxval) {
    335 			maxval = val;
    336 			maxw = w + 1;
    337 		}
    338 	}
    339 	hyend = nhyend;
    340 	if (maxval > thresh)
    341 		*hyp++ = maxw;
    342 	goto again;
    343 }
    344 
    345 int
    346 dilook(int a, int b, char t[26][13])
    347 {
    348 	int i, j;
    349 
    350 	i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2];
    351 	if (!(j & 01))
    352 		i >>= 4;
    353 	return(i & 017);
    354 }
    355 
    356 
    357 /* here beginneth the tex hyphenation code, as interpreted freely */
    358 /* the main difference is that there is no attempt to squeeze space */
    359 /* as tightly at tex does. */
    360 
    361 static int	texit(Tchar *, Tchar *);
    362 static int	readpats(void);
    363 static void	install(char *);
    364 static void	fixup(void);
    365 static int	trieindex(int, int);
    366 
    367 static char	pats[50000];	/* size ought to be computed dynamically */
    368 static char	*nextpat = pats;
    369 static char	*trie[27*27];	/* english-specific sizes */
    370 
    371 int texhyphen(void)
    372 {
    373 	static int loaded = 0;		/* -1: couldn't find tex file */
    374 
    375 	if (hyphalg == 0 || loaded == -1)	/* non-zero => tex for now */
    376 		return 0;
    377 	if (loaded == 0) {
    378 		if (readpats())
    379 			loaded = 1;
    380 		else
    381 			loaded = -1;
    382 	}
    383 	return texit(wdstart, wdend);
    384 }
    385 
    386 static int texit(Tchar *start, Tchar *end)	/* hyphenate as in tex, return # found */
    387 {
    388 	int nw, i, k, equal, cnt[500];
    389 	char w[500+1], *np, *pp, *wp, *xpp, *xwp;
    390 
    391 	w[0] = '.';
    392 	for (nw = 1; start <= end && nw < 500-1; nw++, start++)
    393 		w[nw] = maplow(tolower(cbits(*start)));
    394 	start -= (nw - 1);
    395 	w[nw++] = '.';
    396 	w[nw] = 0;
    397 /*
    398  * printf("try %s\n", w);
    399 */
    400 	for (i = 0; i <= nw; i++)
    401 		cnt[i] = '0';
    402 
    403 	for (wp = w; wp+1 < w+nw; wp++) {
    404 		for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) {
    405 			if (pp == 0		/* no trie entry */
    406 			 || *pp != *wp		/* no match on 1st letter */
    407 			 || *(pp+1) != *(wp+1))	/* no match on 2nd letter */
    408 				break;		/*   so move to next letter of word */
    409 			equal = 1;
    410 			for (xpp = pp+2, xwp = wp+2; *xpp; )
    411 				if (*xpp++ != *xwp++) {
    412 					equal = 0;
    413 					break;
    414 				}
    415 			if (equal) {
    416 				np = xpp+1;	/* numpat */
    417 				for (k = wp-w; *np; k++, np++)
    418 					if (*np > cnt[k])
    419 						cnt[k] = *np;
    420 /*
    421  * printf("match: %s  %s\n", pp, xpp+1);
    422 */
    423 			}
    424 			pp += *(pp-1);	/* skip over pattern and numbers to next */
    425 		}
    426 	}
    427 /*
    428  * for (i = 0; i < nw; i++) printf("%c", w[i]);
    429  * printf("  ");
    430  * for (i = 0; i <= nw; i++) printf("%c", cnt[i]);
    431  * printf("\n");
    432 */
    433 /*
    434  * 	for (i = 1; i < nw - 1; i++) {
    435  * 		if (i > 2 && i < nw - 3 && cnt[i] % 2)
    436  * 			printf("-");
    437  * 		if (cbits(start[i-1]) != '.')
    438  * 			printf("%c", cbits(start[i-1]));
    439  * 	}
    440  * 	printf("\n");
    441 */
    442 	for (i = 1; i < nw -1; i++)
    443 		if (i > 2 && i < nw - 3 && cnt[i] % 2)
    444 			*hyp++ = start + i - 1;
    445 	return hyp - hyptr;	/* non-zero if a hyphen was found */
    446 }
    447 
    448 /*
    449 	This code assumes that hyphen.tex looks like
    450 		% some comments
    451 		\patterns{ % more comments
    452 		pat5ter4ns, 1 per line, SORTED, nothing else
    453 		}
    454 		more goo
    455 		\hyphenation{ % more comments
    456 		ex-cep-tions, one per line; i ignore this part for now
    457 		}
    458 
    459 	this code is NOT robust against variations.  unfortunately,
    460 	it looks like every local language version of this file has
    461 	a different format.  i have also made no provision for weird
    462 	characters.  sigh.
    463 */
    464 
    465 static int readpats(void)
    466 {
    467 	FILE *fp;
    468 	char buf[200], buf1[200];
    469 
    470 	if ((fp = fopen(unsharp(TEXHYPHENS), "r")) == NULL
    471 	 && (fp = fopen(unsharp(DWBalthyphens), "r")) == NULL) {
    472 		ERROR "warning: can't find hyphen.tex" WARN;
    473 		return 0;
    474 	}
    475 
    476 	while (fgets(buf, sizeof buf, fp) != NULL) {
    477 		sscanf(buf, "%s", buf1);
    478 		if (strcmp(buf1, "\\patterns{") == 0)
    479 			break;
    480 	}
    481 	while (fgets(buf, sizeof buf, fp) != NULL) {
    482 		if (buf[0] == '}')
    483 			break;
    484 		install(buf);
    485 	}
    486 	fclose(fp);
    487 	fixup();
    488 	return 1;
    489 }
    490 
    491 static void install(char *s)	/* map ab4c5de to: 12 abcde \0 00405 \0 */
    492 {
    493 	int npat, lastpat;
    494 	char num[500], *onextpat = nextpat;
    495 
    496 	num[0] = '0';
    497 	*nextpat++ = ' ';	/* fill in with count later */
    498 	for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) {
    499 		if (isdigit((uchar)*s)) {
    500 			num[npat] = *s;
    501 			lastpat = npat;
    502 		} else {
    503 			*nextpat++ = *s;
    504 			npat++;
    505 			num[npat] = '0';
    506 		}
    507 	}
    508 	*nextpat++ = 0;
    509 	if (nextpat > pats + sizeof(pats)-20) {
    510 		ERROR "tex hyphenation table overflow, tail end ignored" WARN;
    511 		nextpat = onextpat;
    512 	}
    513 	num[lastpat+1] = 0;
    514 	strcat(nextpat, num);
    515 	nextpat += strlen(nextpat) + 1;
    516 }
    517 
    518 static void fixup(void)	/* build indexes of where . a b c ... start */
    519 {
    520 	char *p, *lastc;
    521 	int n;
    522 
    523 	for (lastc = pats, p = pats+1; p < nextpat; p++)
    524 		if (*p == ' ') {
    525 			*lastc = p - lastc;
    526 			lastc = p;
    527 		}
    528 	*lastc = p - lastc;
    529 	for (p = pats+1; p < nextpat; ) {
    530 		n = trieindex(p[0], p[1]);
    531 		if (trie[n] == 0)
    532 			trie[n] = p;
    533 		p += p[-1];
    534 	}
    535 	/* printf("pats = %d\n", nextpat - pats); */
    536 }
    537 
    538 static int trieindex(int d1, int d2)
    539 {
    540 	int z;
    541 
    542 	z = 27 * (d1 == '.' ? 0 : d1 - 'a' + 1) + (d2 == '.' ? 0 : d2 - 'a' + 1);
    543 	assert(z >= 0 && z < 27*27);
    544 	return z;
    545 }