/* lex.c (12347B) */
1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 
23 ****************************************************************/ 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <ctype.h> 29 #include "awk.h" 30 #include "y.tab.h" 31 32 extern YYSTYPE yylval; 33 extern int infunc; 34 35 int lineno = 1; 36 int bracecnt = 0; 37 int brackcnt = 0; 38 int parencnt = 0; 39 40 typedef struct Keyword { 41 char *word; 42 int sub; 43 int type; 44 } Keyword; 45 46 Keyword keywords[] ={ /* keep sorted: binary searched */ 47 { "BEGIN", XBEGIN, XBEGIN }, 48 { "END", XEND, XEND }, 49 { "NF", VARNF, VARNF }, 50 { "atan2", FATAN, BLTIN }, 51 { "break", BREAK, BREAK }, 52 { "close", CLOSE, CLOSE }, 53 { "continue", CONTINUE, CONTINUE }, 54 { "cos", FCOS, BLTIN }, 55 { "delete", DELETE, DELETE }, 56 { "do", DO, DO }, 57 { "else", ELSE, ELSE }, 58 { "exit", EXIT, EXIT }, 59 { "exp", FEXP, BLTIN }, 60 { "fflush", FFLUSH, BLTIN }, 61 { "for", FOR, FOR }, 62 { "func", FUNC, FUNC }, 63 { "function", FUNC, FUNC }, 64 { "getline", GETLINE, GETLINE }, 65 { "gsub", GSUB, GSUB }, 66 { "if", IF, IF }, 67 { "in", IN, IN }, 68 { "index", INDEX, INDEX }, 69 { "int", FINT, BLTIN }, 70 { "length", FLENGTH, BLTIN }, 71 { "log", FLOG, BLTIN }, 72 { "match", MATCHFCN, MATCHFCN }, 73 { "next", NEXT, NEXT }, 74 { "nextfile", NEXTFILE, NEXTFILE }, 75 { "print", PRINT, PRINT }, 76 { "printf", PRINTF, PRINTF }, 77 { "rand", FRAND, BLTIN }, 78 { "return", RETURN, RETURN }, 79 { "sin", FSIN, BLTIN }, 80 { "split", SPLIT, SPLIT }, 81 { "sprintf", SPRINTF, SPRINTF }, 82 { "sqrt", FSQRT, BLTIN }, 83 { "srand", FSRAND, BLTIN }, 84 { "sub", SUB, SUB }, 85 { "substr", SUBSTR, SUBSTR }, 86 { "system", FSYSTEM, BLTIN }, 87 { "tolower", FTOLOWER, BLTIN }, 88 { "toupper", FTOUPPER, BLTIN }, 89 { "utf", FUTF, BLTIN }, 90 { "while", WHILE, WHILE }, 91 }; 92 93 #define DEBUG 94 #ifdef DEBUG 95 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 96 #else 97 #define RET(x) return(x) 98 #endif 99 100 int peek(void) 101 { 102 int c = 
input(); 103 unput(c); 104 return c; 105 } 106 107 int gettok(char **pbuf, int *psz) /* get next input token */ 108 { 109 int c; 110 char *buf = *pbuf; 111 int sz = *psz; 112 char *bp = buf; 113 114 c = input(); 115 if (c == 0) 116 return 0; 117 buf[0] = c; 118 buf[1] = 0; 119 if (!isalnum(c) && c != '.' && c != '_') 120 return c; 121 122 *bp++ = c; 123 if (isalpha(c) || c == '_') { /* it's a varname */ 124 for ( ; (c = input()) != 0; ) { 125 if (bp-buf >= sz) 126 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) 127 FATAL( "out of space for name %.10s...", buf ); 128 if (isalnum(c) || c == '_') 129 *bp++ = c; 130 else { 131 *bp = 0; 132 unput(c); 133 break; 134 } 135 } 136 } else { /* it's a number */ 137 char *rem; 138 /* read input until can't be a number */ 139 for ( ; (c = input()) != 0; ) { 140 if (bp-buf >= sz) 141 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) 142 FATAL( "out of space for number %.10s...", buf ); 143 if (isdigit(c) || c == 'e' || c == 'E' 144 || c == '.' || c == '+' || c == '-') 145 *bp++ = c; 146 else { 147 unput(c); 148 break; 149 } 150 } 151 *bp = 0; 152 strtod(buf, &rem); /* parse the number */ 153 unputstr(rem); /* put rest back for later */ 154 rem[0] = 0; 155 } 156 *pbuf = buf; 157 *psz = sz; 158 return buf[0]; 159 } 160 161 int word(char *); 162 int string(void); 163 int regexpr(void); 164 int sc = 0; /* 1 => return a } right now */ 165 int reg = 0; /* 1 => return a REGEXPR now */ 166 167 int yylex(void) 168 { 169 int c; 170 static char *buf = 0; 171 static int bufsize = 500; 172 173 if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL) 174 FATAL( "out of space in yylex" ); 175 if (sc) { 176 sc = 0; 177 RET('}'); 178 } 179 if (reg) { 180 reg = 0; 181 return regexpr(); 182 } 183 for (;;) { 184 c = gettok(&buf, &bufsize); 185 if (c == 0) 186 return 0; 187 if (isalpha(c) || c == '_') 188 return word(buf); 189 if (isdigit(c) || c == '.') { 190 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab); 191 /* should this 
also have STR set? */ 192 RET(NUMBER); 193 } 194 195 yylval.i = c; 196 switch (c) { 197 case '\n': /* {EOL} */ 198 RET(NL); 199 case '\r': /* assume \n is coming */ 200 case ' ': /* {WS}+ */ 201 case '\t': 202 break; 203 case '#': /* #.* strip comments */ 204 while ((c = input()) != '\n' && c != 0) 205 ; 206 unput(c); 207 break; 208 case ';': 209 RET(';'); 210 case '\\': 211 if (peek() == '\n') { 212 input(); 213 } else if (peek() == '\r') { 214 input(); input(); /* \n */ 215 lineno++; 216 } else { 217 RET(c); 218 } 219 break; 220 case '&': 221 if (peek() == '&') { 222 input(); RET(AND); 223 } else 224 RET('&'); 225 case '|': 226 if (peek() == '|') { 227 input(); RET(BOR); 228 } else 229 RET('|'); 230 case '!': 231 if (peek() == '=') { 232 input(); yylval.i = NE; RET(NE); 233 } else if (peek() == '~') { 234 input(); yylval.i = NOTMATCH; RET(MATCHOP); 235 } else 236 RET(NOT); 237 case '~': 238 yylval.i = MATCH; 239 RET(MATCHOP); 240 case '<': 241 if (peek() == '=') { 242 input(); yylval.i = LE; RET(LE); 243 } else { 244 yylval.i = LT; RET(LT); 245 } 246 case '=': 247 if (peek() == '=') { 248 input(); yylval.i = EQ; RET(EQ); 249 } else { 250 yylval.i = ASSIGN; RET(ASGNOP); 251 } 252 case '>': 253 if (peek() == '=') { 254 input(); yylval.i = GE; RET(GE); 255 } else if (peek() == '>') { 256 input(); yylval.i = APPEND; RET(APPEND); 257 } else { 258 yylval.i = GT; RET(GT); 259 } 260 case '+': 261 if (peek() == '+') { 262 input(); yylval.i = INCR; RET(INCR); 263 } else if (peek() == '=') { 264 input(); yylval.i = ADDEQ; RET(ASGNOP); 265 } else 266 RET('+'); 267 case '-': 268 if (peek() == '-') { 269 input(); yylval.i = DECR; RET(DECR); 270 } else if (peek() == '=') { 271 input(); yylval.i = SUBEQ; RET(ASGNOP); 272 } else 273 RET('-'); 274 case '*': 275 if (peek() == '=') { /* *= */ 276 input(); yylval.i = MULTEQ; RET(ASGNOP); 277 } else if (peek() == '*') { /* ** or **= */ 278 input(); /* eat 2nd * */ 279 if (peek() == '=') { 280 input(); yylval.i = POWEQ; RET(ASGNOP); 
281 } else { 282 RET(POWER); 283 } 284 } else 285 RET('*'); 286 case '/': 287 RET('/'); 288 case '%': 289 if (peek() == '=') { 290 input(); yylval.i = MODEQ; RET(ASGNOP); 291 } else 292 RET('%'); 293 case '^': 294 if (peek() == '=') { 295 input(); yylval.i = POWEQ; RET(ASGNOP); 296 } else 297 RET(POWER); 298 299 case '$': 300 /* BUG: awkward, if not wrong */ 301 c = gettok(&buf, &bufsize); 302 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 303 unputstr(buf); 304 RET(INDIRECT); 305 } else if (isalpha(c)) { 306 if (strcmp(buf, "NF") == 0) { /* very special */ 307 unputstr("(NF)"); 308 RET(INDIRECT); 309 } 310 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 311 RET(IVAR); 312 } else { 313 unputstr(buf); 314 RET(INDIRECT); 315 } 316 317 case '}': 318 if (--bracecnt < 0) 319 SYNTAX( "extra }" ); 320 sc = 1; 321 RET(';'); 322 case ']': 323 if (--brackcnt < 0) 324 SYNTAX( "extra ]" ); 325 RET(']'); 326 case ')': 327 if (--parencnt < 0) 328 SYNTAX( "extra )" ); 329 RET(')'); 330 case '{': 331 bracecnt++; 332 RET('{'); 333 case '[': 334 brackcnt++; 335 RET('['); 336 case '(': 337 parencnt++; 338 RET('('); 339 340 case '"': 341 return string(); /* BUG: should be like tran.c ? 
*/ 342 343 default: 344 RET(c); 345 } 346 } 347 } 348 349 int string(void) 350 { 351 int c, n; 352 char *s, *bp; 353 static char *buf = 0; 354 static int bufsz = 500; 355 356 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 357 FATAL("out of space for strings"); 358 for (bp = buf; (c = input()) != '"'; ) { 359 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0)) 360 FATAL("out of space for string %.10s...", buf); 361 switch (c) { 362 case '\n': 363 case '\r': 364 case 0: 365 SYNTAX( "non-terminated string %.10s...", buf ); 366 lineno++; 367 break; 368 case '\\': 369 c = input(); 370 switch (c) { 371 case '"': *bp++ = '"'; break; 372 case 'n': *bp++ = '\n'; break; 373 case 't': *bp++ = '\t'; break; 374 case 'f': *bp++ = '\f'; break; 375 case 'r': *bp++ = '\r'; break; 376 case 'b': *bp++ = '\b'; break; 377 case 'v': *bp++ = '\v'; break; 378 case 'a': *bp++ = '\007'; break; 379 case '\\': *bp++ = '\\'; break; 380 381 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 382 case '3': case '4': case '5': case '6': case '7': 383 n = c - '0'; 384 if ((c = peek()) >= '0' && c < '8') { 385 n = 8 * n + input() - '0'; 386 if ((c = peek()) >= '0' && c < '8') 387 n = 8 * n + input() - '0'; 388 } 389 *bp++ = n; 390 break; 391 392 case 'x': /* hex \x0-9a-fA-F + */ 393 { char xbuf[100], *px; 394 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) { 395 if (isdigit(c) 396 || (c >= 'a' && c <= 'f') 397 || (c >= 'A' && c <= 'F')) 398 *px++ = c; 399 else 400 break; 401 } 402 *px = 0; 403 unput(c); 404 sscanf(xbuf, "%x", &n); 405 *bp++ = n; 406 break; 407 } 408 409 default: 410 *bp++ = c; 411 break; 412 } 413 break; 414 default: 415 *bp++ = c; 416 break; 417 } 418 } 419 *bp = 0; 420 s = tostring(buf); 421 *bp++ = ' '; *bp++ = 0; 422 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 423 RET(STRING); 424 } 425 426 427 int binsearch(char *w, Keyword *kp, int n) 428 { 429 int cond, low, mid, high; 430 431 low = 0; 432 high = n - 1; 433 while (low <= high) { 434 mid = 
(low + high) / 2; 435 if ((cond = strcmp(w, kp[mid].word)) < 0) 436 high = mid - 1; 437 else if (cond > 0) 438 low = mid + 1; 439 else 440 return mid; 441 } 442 return -1; 443 } 444 445 int word(char *w) 446 { 447 Keyword *kp; 448 int c, n; 449 450 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 451 kp = keywords + n; 452 if (n != -1) { /* found in table */ 453 yylval.i = kp->sub; 454 switch (kp->type) { /* special handling */ 455 case FSYSTEM: 456 if (safe) 457 SYNTAX( "system is unsafe" ); 458 RET(kp->type); 459 case FUNC: 460 if (infunc) 461 SYNTAX( "illegal nested function" ); 462 RET(kp->type); 463 case RETURN: 464 if (!infunc) 465 SYNTAX( "return not in function" ); 466 RET(kp->type); 467 case VARNF: 468 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 469 RET(VARNF); 470 default: 471 RET(kp->type); 472 } 473 } 474 c = peek(); /* look for '(' */ 475 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 476 yylval.i = n; 477 RET(ARG); 478 } else { 479 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 480 if (c == '(') { 481 RET(CALL); 482 } else { 483 RET(VAR); 484 } 485 } 486 } 487 488 void startreg(void) /* next call to yyles will return a regular expression */ 489 { 490 reg = 1; 491 } 492 493 int regexpr(void) 494 { 495 int c; 496 static char *buf = 0; 497 static int bufsz = 500; 498 char *bp; 499 500 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 501 FATAL("out of space for rex expr"); 502 bp = buf; 503 for ( ; (c = input()) != '/' && c != 0; ) { 504 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0)) 505 FATAL("out of space for reg expr %.10s...", buf); 506 if (c == '\n') { 507 SYNTAX( "newline in regular expression %.10s...", buf ); 508 unput('\n'); 509 break; 510 } else if (c == '\\') { 511 *bp++ = '\\'; 512 *bp++ = input(); 513 } else { 514 *bp++ = c; 515 } 516 } 517 *bp = 0; 518 yylval.s = tostring(buf); 519 unput('/'); 520 RET(REGEXPR); 521 } 522 523 /* low-level lexical stuff, sort of inherited from lex */ 524 525 char 
ebuf[300]; 526 char *ep = ebuf; 527 char yysbuf[100]; /* pushback buffer */ 528 char *yysptr = yysbuf; 529 FILE *yyin = 0; 530 531 int input(void) /* get next lexical input character */ 532 { 533 int c; 534 extern char *lexprog; 535 536 if (yysptr > yysbuf) 537 c = *--yysptr; 538 else if (lexprog != NULL) { /* awk '...' */ 539 if ((c = *lexprog) != 0) 540 lexprog++; 541 } else /* awk -f ... */ 542 c = pgetc(); 543 if (c == '\n') 544 lineno++; 545 else if (c == EOF) 546 c = 0; 547 if (ep >= ebuf + sizeof ebuf) 548 ep = ebuf; 549 return *ep++ = c; 550 } 551 552 void unput(int c) /* put lexical character back on input */ 553 { 554 if (c == '\n') 555 lineno--; 556 if (yysptr >= yysbuf + sizeof(yysbuf)) 557 FATAL("pushed back too much: %.20s...", yysbuf); 558 *yysptr++ = c; 559 if (--ep < ebuf) 560 ep = ebuf + sizeof(ebuf) - 1; 561 } 562 563 void unputstr(char *s) /* put a string back on input */ 564 { 565 int i; 566 567 for (i = strlen(s)-1; i >= 0; i--) 568 unput(s[i]); 569 } 570