9base

revived minimalist port of Plan 9 userland to Unix
git clone git://git.suckless.org/9base
Log | Files | Refs | README | LICENSE

lex.c (12347B)


      1 /****************************************************************
      2 Copyright (C) Lucent Technologies 1997
      3 All Rights Reserved
      4 
      5 Permission to use, copy, modify, and distribute this software and
      6 its documentation for any purpose and without fee is hereby
      7 granted, provided that the above copyright notice appear in all
      8 copies and that both that the copyright notice and this
      9 permission notice and warranty disclaimer appear in supporting
     10 documentation, and that the name Lucent Technologies or any of
     11 its entities not be used in advertising or publicity pertaining
     12 to distribution of the software without specific, written prior
     13 permission.
     14 
     15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
     16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
     17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
     18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
     20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
     21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
     22 THIS SOFTWARE.
     23 ****************************************************************/
     24 
     25 #include <stdio.h>
     26 #include <stdlib.h>
     27 #include <string.h>
     28 #include <ctype.h>
     29 #include "awk.h"
     30 #include "y.tab.h"
     31 
     32 extern YYSTYPE	yylval;
     33 extern int	infunc;
     34 
     35 int	lineno	= 1;
     36 int	bracecnt = 0;
     37 int	brackcnt  = 0;
     38 int	parencnt = 0;
     39 
     /* One entry of the reserved-word table searched by binsearch(). */
     struct Keyword {
     	char	*word;	/* spelling; the key strcmp() is applied to */
     	int	sub;	/* value word() stores in yylval.i */
     	int	type;	/* token code word() switches on and returns */
     };
     typedef struct Keyword Keyword;
     45 
     46 Keyword keywords[] ={	/* keep sorted: binary searched */
     47 	{ "BEGIN",	XBEGIN,		XBEGIN },
     48 	{ "END",	XEND,		XEND },
     49 	{ "NF",		VARNF,		VARNF },
     50 	{ "atan2",	FATAN,		BLTIN },
     51 	{ "break",	BREAK,		BREAK },
     52 	{ "close",	CLOSE,		CLOSE },
     53 	{ "continue",	CONTINUE,	CONTINUE },
     54 	{ "cos",	FCOS,		BLTIN },
     55 	{ "delete",	DELETE,		DELETE },
     56 	{ "do",		DO,		DO },
     57 	{ "else",	ELSE,		ELSE },
     58 	{ "exit",	EXIT,		EXIT },
     59 	{ "exp",	FEXP,		BLTIN },
     60 	{ "fflush",	FFLUSH,		BLTIN },
     61 	{ "for",	FOR,		FOR },
     62 	{ "func",	FUNC,		FUNC },
     63 	{ "function",	FUNC,		FUNC },
     64 	{ "getline",	GETLINE,	GETLINE },
     65 	{ "gsub",	GSUB,		GSUB },
     66 	{ "if",		IF,		IF },
     67 	{ "in",		IN,		IN },
     68 	{ "index",	INDEX,		INDEX },
     69 	{ "int",	FINT,		BLTIN },
     70 	{ "length",	FLENGTH,	BLTIN },
     71 	{ "log",	FLOG,		BLTIN },
     72 	{ "match",	MATCHFCN,	MATCHFCN },
     73 	{ "next",	NEXT,		NEXT },
     74 	{ "nextfile",	NEXTFILE,	NEXTFILE },
     75 	{ "print",	PRINT,		PRINT },
     76 	{ "printf",	PRINTF,		PRINTF },
     77 	{ "rand",	FRAND,		BLTIN },
     78 	{ "return",	RETURN,		RETURN },
     79 	{ "sin",	FSIN,		BLTIN },
     80 	{ "split",	SPLIT,		SPLIT },
     81 	{ "sprintf",	SPRINTF,	SPRINTF },
     82 	{ "sqrt",	FSQRT,		BLTIN },
     83 	{ "srand",	FSRAND,		BLTIN },
     84 	{ "sub",	SUB,		SUB },
     85 	{ "substr",	SUBSTR,		SUBSTR },
     86 	{ "system",	FSYSTEM,	BLTIN },
     87 	{ "tolower",	FTOLOWER,	BLTIN },
     88 	{ "toupper",	FTOUPPER,	BLTIN },
     89 	{ "utf",	FUTF,		BLTIN },
     90 	{ "while",	WHILE,		WHILE },
     91 };
     92 
     /*
      * RET(x): return token code x from the enclosing lexer function.
      * With DEBUG defined it first prints the token's name when dbg is set
      * (note that x is then evaluated twice).  The debug form is wrapped in
      * do { } while (0) so that "RET(x);" is exactly one statement and stays
      * legal in front of an else.
      */
     #define DEBUG
     #ifdef	DEBUG
     #define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
     #else
     #define	RET(x)	return (x)
     #endif
     99 
    /* peek: report the next input character while leaving it unread. */
    int peek(void)
    {
    	int next;

    	next = input();
    	unput(next);	/* hand it straight back to the pushback buffer */
    	return next;
    }
    106 
    /*
     * gettok: read the next raw token from input() into the buffer *pbuf.
     *
     * pbuf, psz: the caller's buffer and its size; both are updated when the
     * buffer has to be enlarged through adjbuf().
     *
     * Returns 0 at end of input, otherwise the first character of the token:
     *   - a character that is not a letter, digit, '.' or '_' is returned
     *     as itself, with the buffer holding just that character;
     *   - a name (letter or '_' followed by letters, digits, '_') is
     *     collected whole;
     *   - for a digit or '.', the longest run of number-like characters is
     *     read and strtod() decides how much of it is a number; the rest is
     *     pushed back.  If strtod() accepts nothing (e.g. a lone '.'), only
     *     the first character is kept as a one-character token.
     * The buffer is always NUL-terminated on return, including when the
     * token is cut short by end of input.
     */
    int gettok(char **pbuf, int *psz)	/* get next input token */
    {
    	int c;
    	char *buf = *pbuf;
    	int sz = *psz;
    	char *bp = buf;

    	c = input();
    	if (c == 0)
    		return 0;
    	buf[0] = c;
    	buf[1] = 0;
    	/* input() hands back plain char values, so go through unsigned char
    	 * before asking <ctype.h> about them */
    	if (!isalnum((unsigned char)c) && c != '.' && c != '_')
    		return c;

    	*bp++ = c;
    	if (isalpha((unsigned char)c) || c == '_') {	/* it's a varname */
    		for ( ; (c = input()) != 0; ) {
    			if (bp-buf >= sz)
    				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
    					FATAL( "out of space for name %.10s...", buf );
    			if (isalnum((unsigned char)c) || c == '_')
    				*bp++ = c;
    			else {
    				unput(c);
    				break;
    			}
    		}
    		/* the loop may also end on end of input, with neither the
    		 * space check nor the terminator done for the last character */
    		if (bp-buf >= sz)
    			if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
    				FATAL( "out of space for name %.10s...", buf );
    		*bp = 0;
    	} else {	/* maybe a number, maybe just a '.' */
    		char *rem;
    		/* read input until can't be a number */
    		for ( ; (c = input()) != 0; ) {
    			if (bp-buf >= sz)
    				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
    					FATAL( "out of space for number %.10s...", buf );
    			if (isdigit((unsigned char)c) || c == 'e' || c == 'E'
    			  || c == '.' || c == '+' || c == '-')
    				*bp++ = c;
    			else {
    				unput(c);
    				break;
    			}
    		}
    		if (bp-buf >= sz)
    			if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
    				FATAL( "out of space for number %.10s...", buf );
    		*bp = 0;
    		strtod(buf, &rem);	/* parse the number */
    		if (rem == buf) {	/* no number here at all */
    			unputstr(buf+1);	/* keep only the first character */
    			buf[1] = 0;
    		} else {
    			unputstr(rem);		/* put rest back for later */
    			rem[0] = 0;
    		}
    	}
    	*pbuf = buf;
    	*psz = sz;
    	return buf[0];
    }
    160 
    /* Token recognisers defined further down and called from yylex(). */
    int word(char *w);
    int string(void);
    int regexpr(void);

    /* One-shot flags examined at the top of yylex(). */
    int sc = 0;	/* 1 => return a } right now */
    int reg = 0;	/* 1 => return a REGEXPR now */
    166 
    167 int yylex(void)
    168 {
    169 	int c;
    170 	static char *buf = 0;
    171 	static int bufsize = 500;
    172 
    173 	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
    174 		FATAL( "out of space in yylex" );
    175 	if (sc) {
    176 		sc = 0;
    177 		RET('}');
    178 	}
    179 	if (reg) {
    180 		reg = 0;
    181 		return regexpr();
    182 	}
    183 	for (;;) {
    184 		c = gettok(&buf, &bufsize);
    185 		if (c == 0)
    186 			return 0;
    187 		if (isalpha(c) || c == '_')
    188 			return word(buf);
    189 		if (isdigit(c) || c == '.') {
    190 			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
    191 			/* should this also have STR set? */
    192 			RET(NUMBER);
    193 		}
    194 	
    195 		yylval.i = c;
    196 		switch (c) {
    197 		case '\n':	/* {EOL} */
    198 			RET(NL);
    199 		case '\r':	/* assume \n is coming */
    200 		case ' ':	/* {WS}+ */
    201 		case '\t':
    202 			break;
    203 		case '#':	/* #.* strip comments */
    204 			while ((c = input()) != '\n' && c != 0)
    205 				;
    206 			unput(c);
    207 			break;
    208 		case ';':
    209 			RET(';');
    210 		case '\\':
    211 			if (peek() == '\n') {
    212 				input();
    213 			} else if (peek() == '\r') {
    214 				input(); input();	/* \n */
    215 				lineno++;
    216 			} else {
    217 				RET(c);
    218 			}
    219 			break;
    220 		case '&':
    221 			if (peek() == '&') {
    222 				input(); RET(AND);
    223 			} else 
    224 				RET('&');
    225 		case '|':
    226 			if (peek() == '|') {
    227 				input(); RET(BOR);
    228 			} else
    229 				RET('|');
    230 		case '!':
    231 			if (peek() == '=') {
    232 				input(); yylval.i = NE; RET(NE);
    233 			} else if (peek() == '~') {
    234 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
    235 			} else
    236 				RET(NOT);
    237 		case '~':
    238 			yylval.i = MATCH;
    239 			RET(MATCHOP);
    240 		case '<':
    241 			if (peek() == '=') {
    242 				input(); yylval.i = LE; RET(LE);
    243 			} else {
    244 				yylval.i = LT; RET(LT);
    245 			}
    246 		case '=':
    247 			if (peek() == '=') {
    248 				input(); yylval.i = EQ; RET(EQ);
    249 			} else {
    250 				yylval.i = ASSIGN; RET(ASGNOP);
    251 			}
    252 		case '>':
    253 			if (peek() == '=') {
    254 				input(); yylval.i = GE; RET(GE);
    255 			} else if (peek() == '>') {
    256 				input(); yylval.i = APPEND; RET(APPEND);
    257 			} else {
    258 				yylval.i = GT; RET(GT);
    259 			}
    260 		case '+':
    261 			if (peek() == '+') {
    262 				input(); yylval.i = INCR; RET(INCR);
    263 			} else if (peek() == '=') {
    264 				input(); yylval.i = ADDEQ; RET(ASGNOP);
    265 			} else
    266 				RET('+');
    267 		case '-':
    268 			if (peek() == '-') {
    269 				input(); yylval.i = DECR; RET(DECR);
    270 			} else if (peek() == '=') {
    271 				input(); yylval.i = SUBEQ; RET(ASGNOP);
    272 			} else
    273 				RET('-');
    274 		case '*':
    275 			if (peek() == '=') {	/* *= */
    276 				input(); yylval.i = MULTEQ; RET(ASGNOP);
    277 			} else if (peek() == '*') {	/* ** or **= */
    278 				input();	/* eat 2nd * */
    279 				if (peek() == '=') {
    280 					input(); yylval.i = POWEQ; RET(ASGNOP);
    281 				} else {
    282 					RET(POWER);
    283 				}
    284 			} else
    285 				RET('*');
    286 		case '/':
    287 			RET('/');
    288 		case '%':
    289 			if (peek() == '=') {
    290 				input(); yylval.i = MODEQ; RET(ASGNOP);
    291 			} else
    292 				RET('%');
    293 		case '^':
    294 			if (peek() == '=') {
    295 				input(); yylval.i = POWEQ; RET(ASGNOP);
    296 			} else
    297 				RET(POWER);
    298 	
    299 		case '$':
    300 			/* BUG: awkward, if not wrong */
    301 			c = gettok(&buf, &bufsize);
    302 			if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
    303 				unputstr(buf);
    304 				RET(INDIRECT);
    305 			} else if (isalpha(c)) {
    306 				if (strcmp(buf, "NF") == 0) {	/* very special */
    307 					unputstr("(NF)");
    308 					RET(INDIRECT);
    309 				}
    310 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
    311 				RET(IVAR);
    312 			} else {
    313 				unputstr(buf);
    314 				RET(INDIRECT);
    315 			}
    316 	
    317 		case '}':
    318 			if (--bracecnt < 0)
    319 				SYNTAX( "extra }" );
    320 			sc = 1;
    321 			RET(';');
    322 		case ']':
    323 			if (--brackcnt < 0)
    324 				SYNTAX( "extra ]" );
    325 			RET(']');
    326 		case ')':
    327 			if (--parencnt < 0)
    328 				SYNTAX( "extra )" );
    329 			RET(')');
    330 		case '{':
    331 			bracecnt++;
    332 			RET('{');
    333 		case '[':
    334 			brackcnt++;
    335 			RET('[');
    336 		case '(':
    337 			parencnt++;
    338 			RET('(');
    339 	
    340 		case '"':
    341 			return string();	/* BUG: should be like tran.c ? */
    342 	
    343 		default:
    344 			RET(c);
    345 		}
    346 	}
    347 }
    348 
    349 int string(void)
    350 {
    351 	int c, n;
    352 	char *s, *bp;
    353 	static char *buf = 0;
    354 	static int bufsz = 500;
    355 
    356 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
    357 		FATAL("out of space for strings");
    358 	for (bp = buf; (c = input()) != '"'; ) {
    359 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
    360 			FATAL("out of space for string %.10s...", buf);
    361 		switch (c) {
    362 		case '\n':
    363 		case '\r':
    364 		case 0:
    365 			SYNTAX( "non-terminated string %.10s...", buf );
    366 			lineno++;
    367 			break;
    368 		case '\\':
    369 			c = input();
    370 			switch (c) {
    371 			case '"': *bp++ = '"'; break;
    372 			case 'n': *bp++ = '\n'; break;	
    373 			case 't': *bp++ = '\t'; break;
    374 			case 'f': *bp++ = '\f'; break;
    375 			case 'r': *bp++ = '\r'; break;
    376 			case 'b': *bp++ = '\b'; break;
    377 			case 'v': *bp++ = '\v'; break;
    378 			case 'a': *bp++ = '\007'; break;
    379 			case '\\': *bp++ = '\\'; break;
    380 
    381 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
    382 			case '3': case '4': case '5': case '6': case '7':
    383 				n = c - '0';
    384 				if ((c = peek()) >= '0' && c < '8') {
    385 					n = 8 * n + input() - '0';
    386 					if ((c = peek()) >= '0' && c < '8')
    387 						n = 8 * n + input() - '0';
    388 				}
    389 				*bp++ = n;
    390 				break;
    391 
    392 			case 'x':	/* hex  \x0-9a-fA-F + */
    393 			    {	char xbuf[100], *px;
    394 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
    395 					if (isdigit(c)
    396 					 || (c >= 'a' && c <= 'f')
    397 					 || (c >= 'A' && c <= 'F'))
    398 						*px++ = c;
    399 					else
    400 						break;
    401 				}
    402 				*px = 0;
    403 				unput(c);
    404 	  			sscanf(xbuf, "%x", &n);
    405 				*bp++ = n;
    406 				break;
    407 			    }
    408 
    409 			default: 
    410 				*bp++ = c;
    411 				break;
    412 			}
    413 			break;
    414 		default:
    415 			*bp++ = c;
    416 			break;
    417 		}
    418 	}
    419 	*bp = 0; 
    420 	s = tostring(buf);
    421 	*bp++ = ' '; *bp++ = 0;
    422 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
    423 	RET(STRING);
    424 }
    425 
    426 
    427 int binsearch(char *w, Keyword *kp, int n)
    428 {
    429 	int cond, low, mid, high;
    430 
    431 	low = 0;
    432 	high = n - 1;
    433 	while (low <= high) {
    434 		mid = (low + high) / 2;
    435 		if ((cond = strcmp(w, kp[mid].word)) < 0)
    436 			high = mid - 1;
    437 		else if (cond > 0)
    438 			low = mid + 1;
    439 		else
    440 			return mid;
    441 	}
    442 	return -1;
    443 }
    444 
    445 int word(char *w) 
    446 {
    447 	Keyword *kp;
    448 	int c, n;
    449 
    450 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
    451 	kp = keywords + n;
    452 	if (n != -1) {	/* found in table */
    453 		yylval.i = kp->sub;
    454 		switch (kp->type) {	/* special handling */
    455 		case FSYSTEM:
    456 			if (safe)
    457 				SYNTAX( "system is unsafe" );
    458 			RET(kp->type);
    459 		case FUNC:
    460 			if (infunc)
    461 				SYNTAX( "illegal nested function" );
    462 			RET(kp->type);
    463 		case RETURN:
    464 			if (!infunc)
    465 				SYNTAX( "return not in function" );
    466 			RET(kp->type);
    467 		case VARNF:
    468 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
    469 			RET(VARNF);
    470 		default:
    471 			RET(kp->type);
    472 		}
    473 	}
    474 	c = peek();	/* look for '(' */
    475 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
    476 		yylval.i = n;
    477 		RET(ARG);
    478 	} else {
    479 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
    480 		if (c == '(') {
    481 			RET(CALL);
    482 		} else {
    483 			RET(VAR);
    484 		}
    485 	}
    486 }
    487 
    488 void startreg(void)	/* next call to yyles will return a regular expression */
    489 {
    490 	reg = 1;
    491 }
    492 
    493 int regexpr(void)
    494 {
    495 	int c;
    496 	static char *buf = 0;
    497 	static int bufsz = 500;
    498 	char *bp;
    499 
    500 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
    501 		FATAL("out of space for rex expr");
    502 	bp = buf;
    503 	for ( ; (c = input()) != '/' && c != 0; ) {
    504 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
    505 			FATAL("out of space for reg expr %.10s...", buf);
    506 		if (c == '\n') {
    507 			SYNTAX( "newline in regular expression %.10s...", buf ); 
    508 			unput('\n');
    509 			break;
    510 		} else if (c == '\\') {
    511 			*bp++ = '\\'; 
    512 			*bp++ = input();
    513 		} else {
    514 			*bp++ = c;
    515 		}
    516 	}
    517 	*bp = 0;
    518 	yylval.s = tostring(buf);
    519 	unput('/');
    520 	RET(REGEXPR);
    521 }
    522 
    523 /* low-level lexical stuff, sort of inherited from lex */
    524 
    /* Ring of the most recently read characters: input() stores at ep and
     * wraps at the end, unput() steps ep back by one. */
    char ebuf[300];
    char *ep = ebuf;

    /* Pushback stack: unput() pushes at yysptr, input() pops from it first. */
    char yysbuf[100];
    char *yysptr = yysbuf;

    /* Not referenced by the routines in this part of the file. */
    FILE *yyin = NULL;
    530 
    531 int input(void)	/* get next lexical input character */
    532 {
    533 	int c;
    534 	extern char *lexprog;
    535 
    536 	if (yysptr > yysbuf)
    537 		c = *--yysptr;
    538 	else if (lexprog != NULL) {	/* awk '...' */
    539 		if ((c = *lexprog) != 0)
    540 			lexprog++;
    541 	} else				/* awk -f ... */
    542 		c = pgetc();
    543 	if (c == '\n')
    544 		lineno++;
    545 	else if (c == EOF)
    546 		c = 0;
    547 	if (ep >= ebuf + sizeof ebuf)
    548 		ep = ebuf;
    549 	return *ep++ = c;
    550 }
    551 
    552 void unput(int c)	/* put lexical character back on input */
    553 {
    554 	if (c == '\n')
    555 		lineno--;
    556 	if (yysptr >= yysbuf + sizeof(yysbuf))
    557 		FATAL("pushed back too much: %.20s...", yysbuf);
    558 	*yysptr++ = c;
    559 	if (--ep < ebuf)
    560 		ep = ebuf + sizeof(ebuf) - 1;
    561 }
    562 
    /*
     * unputstr: put the string s back on the input.  Characters are pushed
     * last-to-first so that input() yields them again in their original order.
     */
    void unputstr(char *s)
    {
    	char *p = s + strlen(s);

    	while (p > s)
    		unput(*--p);
    }
    570