9base

revived minimalist port of Plan 9 userland to Unix
git clone git://git.suckless.org/9base
Log | Files | Refs | README | LICENSE

re.c (6951B)


      1 /****************************************************************
      2 Copyright (C) Lucent Technologies 1997
      3 All Rights Reserved
      4 
      5 Permission to use, copy, modify, and distribute this software and
      6 its documentation for any purpose and without fee is hereby
      7 granted, provided that the above copyright notice appear in all
      8 copies and that both that the copyright notice and this
      9 permission notice and warranty disclaimer appear in supporting
     10 documentation, and that the name Lucent Technologies or any of
     11 its entities not be used in advertising or publicity pertaining
     12 to distribution of the software without specific, written prior
     13 permission.
     14 
     15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
     16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
     17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
     18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
     20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
     21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
     22 THIS SOFTWARE.
     23 ****************************************************************/
     24 
     25 
     26 #define DEBUG
     27 #include <stdio.h>
     28 #include <u.h>
     29 #include <libc.h>
     30 #include <ctype.h>
     31 #include <bio.h>
     32 #include <regexp.h>
     33 #include "awk.h"
     34 #include "y.tab.h"
     35 
     36 	/* This file provides the interface between the main body of
     37 	 * awk and the pattern matching package.  It preprocesses
     38 	 * patterns prior to compilation to provide awk-like semantics
     39 	 * to character sequences not supported by the pattern package.
     40 	 * The following conversions are performed:
     41 	 *
     42 	 *	"()"		->	"[]"
     43 	 *	"[-"		->	"[\-"
     44 	 *	"[^-"		->	"[^\-"
     45 	 *	"-]"		->	"\-]"
     46 	 *	"[]"		->	"[]*"
     47 	 *	"\xdddd"	->	"\z" where 'z' is the UTF sequence
     48 	 *					for the hex value
     49 	 *	"\ddd"		->	"\o" where 'o' is a char octal value
     50 	 *	"\b"		->	"\B"	where 'B' is backspace
     51 	 *	"\t"		->	"\T"	where 'T' is tab
     52 	 *	"\f"		->	"\F"	where 'F' is form feed
     53 	 *	"\n"		->	"\N"	where 'N' is newline
     54 	 *	"\r"		->	"\r"	where 'C' is cr
     55 	 */
     56 
     57 #define	MAXRE	512
     58 
     59 static char	re[MAXRE];	/* copy buffer */
     60 
     61 char	*patbeg;
     62 int	patlen;			/* number of chars in pattern */
     63 
     64 #define	NPATS	20		/* number of slots in pattern cache */
     65 
     66 static struct pat_list		/* dynamic pattern cache */
     67 {
     68 	char	*re;
     69 	int	use;
     70 	Reprog	*program;
     71 } pattern[NPATS];
     72 
     73 static int npats;		/* cache fill level */
     74 
     75 	/* Compile a pattern */
     76 void
     77 *compre(char *pat)
     78 {
     79 	int i, j, inclass;
     80 	char c, *p, *s;
     81 	Reprog *program;
     82 
     83 	if (!compile_time) {	/* search cache for dynamic pattern */
     84 		for (i = 0; i < npats; i++)
     85 			if (!strcmp(pat, pattern[i].re)) {
     86 				pattern[i].use++;
     87 				return((void *) pattern[i].program);
     88 			}
     89 	}
     90 		/* Preprocess Pattern for compilation */
     91 	p = re;
     92 	s = pat;
     93 	inclass = 0;
     94 	while (c = *s++) {
     95 		if (c == '\\') {
     96 			quoted(&s, &p, re+MAXRE);
     97 			continue;
     98 		}
     99 		else if (!inclass && c == '(' && *s == ')') {
    100 			if (p < re+MAXRE-2) {	/* '()' -> '[]*' */
    101 				*p++ = '[';
    102 				*p++ = ']';
    103 				c = '*';
    104 				s++;
    105 			}
    106 			else overflow();
    107 		}
    108 		else if (c == '['){			/* '[-' -> '[\-' */
    109 			inclass = 1;
    110 			if (*s == '-') {
    111 				if (p < re+MAXRE-2) {
    112 					*p++ = '[';
    113 					*p++ = '\\';
    114 					c = *s++;
    115 				}
    116 				else overflow();
    117 			}				/* '[^-' -> '[^\-'*/
    118 			else if (*s == '^' && s[1] == '-'){
    119 				if (p < re+MAXRE-3) {
    120 					*p++ = '[';
    121 					*p++ = *s++;
    122 					*p++ = '\\';
    123 					c = *s++;
    124 				}
    125 				else overflow();
    126 			}
    127 			else if (*s == '['){		/* skip '[[' */
    128 				if (p < re+MAXRE-1)
    129 					*p++ = c;
    130 				else overflow();
    131 				c = *s++;
    132 			}
    133 			else if (*s == '^' && s[1] == '[') {	/* skip '[^['*/
    134 				if (p < re+MAXRE-2) {
    135 					*p++ = c;
    136 					*p++ = *s++;
    137 					c = *s++;
    138 				}
    139 				else overflow();
    140 			}
    141 			else if (*s == ']') {		/* '[]' -> '[]*' */
    142 				if (p < re+MAXRE-2) {
    143 					*p++ = c;
    144 					*p++ = *s++;
    145 					c = '*';
    146 					inclass = 0;
    147 				}
    148 				else overflow();
    149 			}
    150 		}
    151 		else if (c == '-' && *s == ']') {	/* '-]' -> '\-]' */
    152 			if (p < re+MAXRE-1)
    153 				*p++ = '\\';
    154 			else overflow();
    155 		}
    156 		else if (c == ']')
    157 			inclass = 0;
    158 		if (p < re+MAXRE-1)
    159 			*p++ = c;
    160 		else overflow();
    161 	}
    162 	*p = 0;
    163 	program = regcomp(re);		/* compile pattern */
    164 	if (!compile_time) {
    165 		if (npats < NPATS)	/* Room in cache */
    166 			i = npats++;
    167 		else {			/* Throw out least used */
    168 			int use = pattern[0].use;
    169 			i = 0;
    170 			for (j = 1; j < NPATS; j++) {
    171 				if (pattern[j].use < use) {
    172 					use = pattern[j].use;
    173 					i = j;
    174 				}
    175 			}
    176 			xfree(pattern[i].program);
    177 			xfree(pattern[i].re);
    178 		}
    179 		pattern[i].re = tostring(pat);
    180 		pattern[i].program = program;
    181 		pattern[i].use = 1;
    182 	}
    183 	return((void *) program);
    184 }
    185 
    186 	/* T/F match indication - matched string not exported */
    187 int
    188 match(void *p, char *s, char *start)
    189 {
    190 	return regexec((Reprog *) p, (char *) s, 0, 0);
    191 }
    192 
    193 	/* match and delimit the matched string */
    194 int
    195 pmatch(void *p, char *s, char *start)
    196 {
    197 	Resub m;
    198 
    199 	m.s.sp = start;
    200 	m.e.ep = 0;
    201 	if (regexec((Reprog *) p, (char *) s, &m, 1)) {
    202 		patbeg = m.s.sp;
    203 		patlen = m.e.ep-m.s.sp;
    204 		return 1;
    205 	}
    206 	patlen = -1;
    207 	patbeg = start;
    208 	return 0;
    209 }
    210 
    211 	/* perform a non-empty match */
    212 int
    213 nematch(void *p, char *s, char *start)
    214 {
    215 	if (pmatch(p, s, start) == 1 && patlen > 0)
    216 		return 1;
    217 	patlen = -1;
    218 	patbeg = start; 
    219 	return 0;
    220 }
    221 /* in the parsing of regular expressions, metacharacters like . have */
    222 /* to be seen literally;  \056 is not a metacharacter. */
    223 
    224 int
    225 hexstr(char **pp)	/* find and eval hex string at pp, return new p */
    226 {
    227 	char c;
    228 	int n = 0;
    229 	int i;
    230 
    231 	for (i = 0, c = (*pp)[i]; i < 4 && isxdigit(c); i++, c = (*pp)[i]) {
    232 		if (isdigit(c))
    233 			n = 16 * n + c - '0';
    234 		else if ('a' <= c && c <= 'f')
    235 			n = 16 * n + c - 'a' + 10;
    236 		else if ('A' <= c && c <= 'F')
    237 			n = 16 * n + c - 'A' + 10;
    238 	}
    239 	*pp += i;
    240 	return n;
    241 }
    242 
    243 	/* look for awk-specific escape sequences */
    244 
    245 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
    246 
    247 void
    248 quoted(char **s, char **to, char *end)	/* handle escaped sequence */
    249 {
    250 	char *p = *s;
    251 	char *t = *to;
    252 	wchar_t c;
    253 
    254 	switch(c = *p++) {
    255 	case 't':
    256 		c = '\t';
    257 		break;
    258 	case 'n':
    259 		c = '\n';
    260 		break;
    261 	case 'f':
    262 		c = '\f';
    263 		break;
    264 	case 'r':
    265 		c = '\r';
    266 		break;
    267 	case 'b':
    268 		c = '\b';
    269 		break;
    270 	default:
    271 		if (t < end-1)		/* all else must be escaped */
    272 			*t++ = '\\';
    273 		if (c == 'x') {		/* hexadecimal goo follows */
    274 			c = hexstr(&p);
    275 			if (t < end-MB_CUR_MAX)
    276 				t += wctomb(t, c);
    277 			else overflow();
    278 			*to = t;
    279 			*s = p;
    280 			return;
    281 		} else if (isoctdigit(c)) {	/* \d \dd \ddd */
    282 			c -= '0';
    283 			if (isoctdigit(*p)) {
    284 				c = 8 * c + *p++ - '0';
    285 				if (isoctdigit(*p))
    286 					c = 8 * c + *p++ - '0';
    287 			}
    288 		}
    289 		break;
    290 	}
    291 	if (t < end-1)
    292 		*t++ = c;
    293 	*s = p;
    294 	*to = t;
    295 }
    296 	/* count rune positions */
    297 int
    298 countposn(char *s, int n)
    299 {
    300 	int i, j;
    301 	char *end;
    302 
    303 	for (i = 0, end = s+n; *s && s < end; i++){
    304 		j = mblen(s, n);
    305 		if(j <= 0)
    306 			j = 1;
    307 		s += j;
    308 	}
    309 	return(i);
    310 }
    311 
    312 	/* pattern package error handler */
    313 
    314 void
    315 regerror(char *s)
    316 {
    317 	FATAL("%s", s);
    318 }
    319 
    320 void
    321 overflow(void)
    322 {
    323 	FATAL("%s", "regular expression too big");
    324 }
    325