re.c (6951B)
1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 26 #define DEBUG 27 #include <stdio.h> 28 #include <u.h> 29 #include <libc.h> 30 #include <ctype.h> 31 #include <bio.h> 32 #include <regexp.h> 33 #include "awk.h" 34 #include "y.tab.h" 35 36 /* This file provides the interface between the main body of 37 * awk and the pattern matching package. It preprocesses 38 * patterns prior to compilation to provide awk-like semantics 39 * to character sequences not supported by the pattern package. 40 * The following conversions are performed: 41 * 42 * "()" -> "[]" 43 * "[-" -> "[\-" 44 * "[^-" -> "[^\-" 45 * "-]" -> "\-]" 46 * "[]" -> "[]*" 47 * "\xdddd" -> "\z" where 'z' is the UTF sequence 48 * for the hex value 49 * "\ddd" -> "\o" where 'o' is a char octal value 50 * "\b" -> "\B" where 'B' is backspace 51 * "\t" -> "\T" where 'T' is tab 52 * "\f" -> "\F" where 'F' is form feed 53 * "\n" -> "\N" where 'N' is newline 54 * "\r" -> "\r" where 'C' is cr 55 */ 56 57 #define MAXRE 512 58 59 static char re[MAXRE]; /* copy buffer */ 60 61 char *patbeg; 62 int patlen; /* number of chars in pattern */ 63 64 #define NPATS 20 /* number of slots in pattern cache */ 65 66 static struct pat_list /* dynamic pattern cache */ 67 { 68 char *re; 69 int use; 70 Reprog *program; 71 } pattern[NPATS]; 72 73 static int npats; /* cache fill level */ 74 75 /* Compile a pattern */ 76 void 77 *compre(char *pat) 78 { 79 int i, j, inclass; 80 char c, *p, *s; 81 Reprog *program; 82 83 if (!compile_time) { /* search cache for dynamic pattern */ 84 for (i = 0; i < npats; i++) 85 if (!strcmp(pat, pattern[i].re)) { 86 pattern[i].use++; 87 return((void *) pattern[i].program); 88 } 89 } 90 /* Preprocess Pattern for compilation */ 91 p = re; 92 s = pat; 93 inclass = 0; 94 while (c = *s++) { 95 if (c == '\\') { 96 quoted(&s, &p, re+MAXRE); 97 continue; 98 } 99 else if (!inclass && c == '(' && *s == ')') { 100 if (p < re+MAXRE-2) { /* '()' -> '[]*' */ 101 *p++ = '['; 102 *p++ = ']'; 103 c = '*'; 104 s++; 105 } 106 else overflow(); 107 } 108 else if (c == '['){ /* '[-' -> '[\-' */ 109 inclass = 1; 110 if (*s == '-') { 111 if (p < re+MAXRE-2) { 112 *p++ = '['; 113 *p++ = '\\'; 114 c = *s++; 115 } 116 else overflow(); 117 } /* '[^-' -> '[^\-'*/ 118 else if (*s == '^' && s[1] == '-'){ 119 if (p < re+MAXRE-3) { 120 *p++ = '['; 121 *p++ = *s++; 122 *p++ = '\\'; 123 c = *s++; 124 } 125 else overflow(); 126 } 127 else if (*s == '['){ /* skip '[[' */ 128 if (p < re+MAXRE-1) 129 *p++ = c; 130 else overflow(); 131 c = *s++; 132 } 133 else if (*s == '^' && s[1] == '[') { /* skip '[^['*/ 134 if (p < re+MAXRE-2) { 135 *p++ = c; 136 *p++ = *s++; 137 c = *s++; 138 } 139 else overflow(); 140 } 141 else if (*s == ']') { /* '[]' -> '[]*' */ 142 if (p < re+MAXRE-2) { 143 *p++ = c; 144 *p++ = *s++; 145 c = '*'; 146 inclass = 0; 147 } 148 else overflow(); 149 } 150 } 151 else if (c == '-' && *s == ']') { /* '-]' -> '\-]' */ 152 if (p < re+MAXRE-1) 153 *p++ = '\\'; 154 else overflow(); 155 } 156 else if (c == ']') 157 inclass = 0; 158 if (p < re+MAXRE-1) 159 *p++ = c; 160 else overflow(); 161 } 162 *p = 0; 163 program = regcomp(re); /* compile pattern */ 164 if (!compile_time) { 165 if (npats < NPATS) /* Room in cache */ 166 i = npats++; 167 else { /* Throw out least used */ 168 int use = pattern[0].use; 169 i = 0; 170 for (j = 1; j < NPATS; j++) { 171 if (pattern[j].use < use) { 172 use = pattern[j].use; 173 i = j; 174 } 175 } 176 xfree(pattern[i].program); 177 xfree(pattern[i].re); 178 } 179 pattern[i].re = tostring(pat); 180 pattern[i].program = program; 181 pattern[i].use = 1; 182 } 183 return((void *) program); 184 } 185 186 /* T/F match indication - matched string not exported */ 187 int 188 match(void *p, char *s, char *start) 189 { 190 return regexec((Reprog *) p, (char *) s, 0, 0); 191 } 192 193 /* match and delimit the matched string */ 194 int 195 pmatch(void *p, char *s, char *start) 196 { 197 Resub m; 198 199 m.s.sp = start; 200 m.e.ep = 0; 201 if (regexec((Reprog *) p, (char *) s, &m, 1)) { 202 patbeg = m.s.sp; 203 patlen = m.e.ep-m.s.sp; 204 return 1; 205 } 206 patlen = -1; 207 patbeg = start; 208 return 0; 209 } 210 211 /* perform a non-empty match */ 212 int 213 nematch(void *p, char *s, char *start) 214 { 215 if (pmatch(p, s, start) == 1 && patlen > 0) 216 return 1; 217 patlen = -1; 218 patbeg = start; 219 return 0; 220 } 221 /* in the parsing of regular expressions, metacharacters like . have */ 222 /* to be seen literally; \056 is not a metacharacter. */ 223 224 int 225 hexstr(char **pp) /* find and eval hex string at pp, return new p */ 226 { 227 char c; 228 int n = 0; 229 int i; 230 231 for (i = 0, c = (*pp)[i]; i < 4 && isxdigit(c); i++, c = (*pp)[i]) { 232 if (isdigit(c)) 233 n = 16 * n + c - '0'; 234 else if ('a' <= c && c <= 'f') 235 n = 16 * n + c - 'a' + 10; 236 else if ('A' <= c && c <= 'F') 237 n = 16 * n + c - 'A' + 10; 238 } 239 *pp += i; 240 return n; 241 } 242 243 /* look for awk-specific escape sequences */ 244 245 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 246 247 void 248 quoted(char **s, char **to, char *end) /* handle escaped sequence */ 249 { 250 char *p = *s; 251 char *t = *to; 252 wchar_t c; 253 254 switch(c = *p++) { 255 case 't': 256 c = '\t'; 257 break; 258 case 'n': 259 c = '\n'; 260 break; 261 case 'f': 262 c = '\f'; 263 break; 264 case 'r': 265 c = '\r'; 266 break; 267 case 'b': 268 c = '\b'; 269 break; 270 default: 271 if (t < end-1) /* all else must be escaped */ 272 *t++ = '\\'; 273 if (c == 'x') { /* hexadecimal goo follows */ 274 c = hexstr(&p); 275 if (t < end-MB_CUR_MAX) 276 t += wctomb(t, c); 277 else overflow(); 278 *to = t; 279 *s = p; 280 return; 281 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 282 c -= '0'; 283 if (isoctdigit(*p)) { 284 c = 8 * c + *p++ - '0'; 285 if (isoctdigit(*p)) 286 c = 8 * c + *p++ - '0'; 287 } 288 } 289 break; 290 } 291 if (t < end-1) 292 *t++ = c; 293 *s = p; 294 *to = t; 295 } 296 /* count rune positions */ 297 int 298 countposn(char *s, int n) 299 { 300 int i, j; 301 char *end; 302 303 for (i = 0, end = s+n; *s && s < end; i++){ 304 j = mblen(s, n); 305 if(j <= 0) 306 j = 1; 307 s += j; 308 } 309 return(i); 310 } 311 312 /* pattern package error handler */ 313 314 void 315 regerror(char *s) 316 { 317 FATAL("%s", s); 318 } 319 320 void 321 overflow(void) 322 { 323 FATAL("%s", "regular expression too big"); 324 } 325