regexp.c (15453B)
1 #include "sam.h" 2 3 Rangeset sel; 4 String lastregexp; 5 /* 6 * Machine Information 7 */ 8 typedef struct Inst Inst; 9 10 struct Inst 11 { 12 long type; /* < 0x10000 ==> literal, otherwise action */ 13 union { 14 int rsid; 15 int rsubid; 16 int class; 17 struct Inst *rother; 18 struct Inst *rright; 19 } r; 20 union{ 21 struct Inst *lleft; 22 struct Inst *lnext; 23 } l; 24 }; 25 #define sid r.rsid 26 #define subid r.rsubid 27 #define rclass r.class 28 #define other r.rother 29 #define right r.rright 30 #define left l.lleft 31 #define next l.lnext 32 33 #define NPROG 1024 34 Inst program[NPROG]; 35 Inst *progp; 36 Inst *startinst; /* First inst. of program; might not be program[0] */ 37 Inst *bstartinst; /* same for backwards machine */ 38 39 typedef struct Ilist Ilist; 40 struct Ilist 41 { 42 Inst *inst; /* Instruction of the thread */ 43 Rangeset se; 44 Posn startp; /* first char of match */ 45 }; 46 47 #define NLIST 127 48 49 Ilist *tl, *nl; /* This list, next list */ 50 Ilist list[2][NLIST+1]; /* +1 for trailing null */ 51 static Rangeset sempty; 52 53 /* 54 * Actions and Tokens 55 * 56 * 0x100xx are operators, value == precedence 57 * 0x200xx are tokens, i.e. operands for operators 58 */ 59 #define OPERATOR 0x10000 /* Bitmask of all operators */ 60 #define START 0x10000 /* Start, used for marker on stack */ 61 #define RBRA 0x10001 /* Right bracket, ) */ 62 #define LBRA 0x10002 /* Left bracket, ( */ 63 #define OR 0x10003 /* Alternation, | */ 64 #define CAT 0x10004 /* Concatentation, implicit operator */ 65 #define STAR 0x10005 /* Closure, * */ 66 #define PLUS 0x10006 /* a+ == aa* */ 67 #define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */ 68 #define ANY 0x20000 /* Any character but newline, . */ 69 #define NOP 0x20001 /* No operation, internal use only */ 70 #define BOL 0x20002 /* Beginning of line, ^ */ 71 #define EOL 0x20003 /* End of line, $ */ 72 #define CCLASS 0x20004 /* Character class, [] */ 73 #define NCCLASS 0x20005 /* Negated character class, [^] */ 74 #define END 0x20077 /* Terminate: match found */ 75 76 #define ISATOR 0x10000 77 #define ISAND 0x20000 78 79 /* 80 * Parser Information 81 */ 82 typedef struct Node Node; 83 struct Node 84 { 85 Inst *first; 86 Inst *last; 87 }; 88 89 #define NSTACK 20 90 Node andstack[NSTACK]; 91 Node *andp; 92 int atorstack[NSTACK]; 93 int *atorp; 94 int lastwasand; /* Last token was operand */ 95 int cursubid; 96 int subidstack[NSTACK]; 97 int *subidp; 98 int backwards; 99 int nbra; 100 Rune *exprp; /* pointer to next character in source expression */ 101 #define DCLASS 10 /* allocation increment */ 102 int nclass; /* number active */ 103 int Nclass; /* high water mark */ 104 Rune **class; 105 int negateclass; 106 107 int addinst(Ilist *l, Inst *inst, Rangeset *sep); 108 void newmatch(Rangeset*); 109 void bnewmatch(Rangeset*); 110 void pushand(Inst*, Inst*); 111 void pushator(int); 112 Node *popand(int); 113 int popator(void); 114 void startlex(Rune*); 115 int lex(void); 116 void operator(int); 117 void operand(int); 118 void evaluntil(int); 119 void optimize(Inst*); 120 void bldcclass(void); 121 122 void 123 regerror(Err e) 124 { 125 Strzero(&lastregexp); 126 error(e); 127 } 128 129 void 130 regerror_c(Err e, int c) 131 { 132 Strzero(&lastregexp); 133 error_c(e, c); 134 } 135 136 Inst * 137 newinst(int t) 138 { 139 if(progp >= &program[NPROG]) 140 regerror(Etoolong); 141 progp->type = t; 142 progp->left = 0; 143 progp->right = 0; 144 return progp++; 145 } 146 147 Inst * 148 realcompile(Rune *s) 149 { 150 int token; 151 152 startlex(s); 153 atorp = atorstack; 154 andp = andstack; 155 subidp = subidstack; 156 cursubid = 0; 157 lastwasand = FALSE; 158 /* Start with a low priority operator to prime parser */ 159 pushator(START-1); 160 while((token=lex()) != END){ 161 if((token&ISATOR) == OPERATOR) 162 operator(token); 163 else 164 operand(token); 165 } 166 /* Close with a low priority operator */ 167 evaluntil(START); 168 /* Force END */ 169 operand(END); 170 evaluntil(START); 171 if(nbra) 172 regerror(Eleftpar); 173 --andp; /* points to first and only operand */ 174 return andp->first; 175 } 176 177 void 178 compile(String *s) 179 { 180 int i; 181 Inst *oprogp; 182 183 if(Strcmp(s, &lastregexp)==0) 184 return; 185 for(i=0; i<nclass; i++) 186 free(class[i]); 187 nclass = 0; 188 progp = program; 189 backwards = FALSE; 190 startinst = realcompile(s->s); 191 optimize(program); 192 oprogp = progp; 193 backwards = TRUE; 194 bstartinst = realcompile(s->s); 195 optimize(oprogp); 196 Strduplstr(&lastregexp, s); 197 } 198 199 void 200 operand(int t) 201 { 202 Inst *i; 203 if(lastwasand) 204 operator(CAT); /* catenate is implicit */ 205 i = newinst(t); 206 if(t == CCLASS){ 207 if(negateclass) 208 i->type = NCCLASS; /* UGH */ 209 i->rclass = nclass-1; /* UGH */ 210 } 211 pushand(i, i); 212 lastwasand = TRUE; 213 } 214 215 void 216 operator(int t) 217 { 218 if(t==RBRA && --nbra<0) 219 regerror(Erightpar); 220 if(t==LBRA){ 221 /* 222 * if(++cursubid >= NSUBEXP) 223 * regerror(Esubexp); 224 */ 225 cursubid++; /* silently ignored */ 226 nbra++; 227 if(lastwasand) 228 operator(CAT); 229 }else 230 evaluntil(t); 231 if(t!=RBRA) 232 pushator(t); 233 lastwasand = FALSE; 234 if(t==STAR || t==QUEST || t==PLUS || t==RBRA) 235 lastwasand = TRUE; /* these look like operands */ 236 } 237 238 void 239 cant(char *s) 240 { 241 char buf[100]; 242 243 sprint(buf, "regexp: can't happen: %s", s); 244 panic(buf); 245 } 246 247 void 248 pushand(Inst *f, Inst *l) 249 { 250 if(andp >= &andstack[NSTACK]) 251 cant("operand stack overflow"); 252 andp->first = f; 253 andp->last = l; 254 andp++; 255 } 256 257 void 258 pushator(int t) 259 { 260 if(atorp >= &atorstack[NSTACK]) 261 cant("operator stack overflow"); 262 *atorp++=t; 263 if(cursubid >= NSUBEXP) 264 *subidp++= -1; 265 else 266 *subidp++=cursubid; 267 } 268 269 Node * 270 popand(int op) 271 { 272 if(andp <= &andstack[0]) 273 if(op) 274 regerror_c(Emissop, op); 275 else 276 regerror(Ebadregexp); 277 return --andp; 278 } 279 280 int 281 popator(void) 282 { 283 if(atorp <= &atorstack[0]) 284 cant("operator stack underflow"); 285 --subidp; 286 return *--atorp; 287 } 288 289 void 290 evaluntil(int pri) 291 { 292 Node *op1, *op2, *t; 293 Inst *inst1, *inst2; 294 295 while(pri==RBRA || atorp[-1]>=pri){ 296 switch(popator()){ 297 case LBRA: 298 op1 = popand('('); 299 inst2 = newinst(RBRA); 300 inst2->subid = *subidp; 301 op1->last->next = inst2; 302 inst1 = newinst(LBRA); 303 inst1->subid = *subidp; 304 inst1->next = op1->first; 305 pushand(inst1, inst2); 306 return; /* must have been RBRA */ 307 default: 308 panic("unknown regexp operator"); 309 break; 310 case OR: 311 op2 = popand('|'); 312 op1 = popand('|'); 313 inst2 = newinst(NOP); 314 op2->last->next = inst2; 315 op1->last->next = inst2; 316 inst1 = newinst(OR); 317 inst1->right = op1->first; 318 inst1->left = op2->first; 319 pushand(inst1, inst2); 320 break; 321 case CAT: 322 op2 = popand(0); 323 op1 = popand(0); 324 if(backwards && op2->first->type!=END) 325 t = op1, op1 = op2, op2 = t; 326 op1->last->next = op2->first; 327 pushand(op1->first, op2->last); 328 break; 329 case STAR: 330 op2 = popand('*'); 331 inst1 = newinst(OR); 332 op2->last->next = inst1; 333 inst1->right = op2->first; 334 pushand(inst1, inst1); 335 break; 336 case PLUS: 337 op2 = popand('+'); 338 inst1 = newinst(OR); 339 op2->last->next = inst1; 340 inst1->right = op2->first; 341 pushand(op2->first, inst1); 342 break; 343 case QUEST: 344 op2 = popand('?'); 345 inst1 = newinst(OR); 346 inst2 = newinst(NOP); 347 inst1->left = inst2; 348 inst1->right = op2->first; 349 op2->last->next = inst2; 350 pushand(inst1, inst2); 351 break; 352 } 353 } 354 } 355 356 357 void 358 optimize(Inst *start) 359 { 360 Inst *inst, *target; 361 362 for(inst=start; inst->type!=END; inst++){ 363 target = inst->next; 364 while(target->type == NOP) 365 target = target->next; 366 inst->next = target; 367 } 368 } 369 370 #ifdef DEBUG 371 void 372 dumpstack(void){ 373 Node *stk; 374 int *ip; 375 376 dprint("operators\n"); 377 for(ip = atorstack; ip<atorp; ip++) 378 dprint("0%o\n", *ip); 379 dprint("operands\n"); 380 for(stk = andstack; stk<andp; stk++) 381 dprint("0%o\t0%o\n", stk->first->type, stk->last->type); 382 } 383 void 384 dump(void){ 385 Inst *l; 386 387 l = program; 388 do{ 389 dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type, 390 l->left-program, l->right-program); 391 }while(l++->type); 392 } 393 #endif 394 395 void 396 startlex(Rune *s) 397 { 398 exprp = s; 399 nbra = 0; 400 } 401 402 403 int 404 lex(void){ 405 int c= *exprp++; 406 407 switch(c){ 408 case '\\': 409 if(*exprp) 410 if((c= *exprp++)=='n') 411 c='\n'; 412 break; 413 case 0: 414 c = END; 415 --exprp; /* In case we come here again */ 416 break; 417 case '*': 418 c = STAR; 419 break; 420 case '?': 421 c = QUEST; 422 break; 423 case '+': 424 c = PLUS; 425 break; 426 case '|': 427 c = OR; 428 break; 429 case '.': 430 c = ANY; 431 break; 432 case '(': 433 c = LBRA; 434 break; 435 case ')': 436 c = RBRA; 437 break; 438 case '^': 439 c = BOL; 440 break; 441 case '$': 442 c = EOL; 443 break; 444 case '[': 445 c = CCLASS; 446 bldcclass(); 447 break; 448 } 449 return c; 450 } 451 452 long 453 nextrec(void){ 454 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0)) 455 regerror(Ebadclass); 456 if(exprp[0] == '\\'){ 457 exprp++; 458 if(*exprp=='n'){ 459 exprp++; 460 return '\n'; 461 } 462 return *exprp++|0x10000; 463 } 464 return *exprp++; 465 } 466 467 void 468 bldcclass(void) 469 { 470 long c1, c2, n, na; 471 Rune *classp; 472 473 classp = emalloc(DCLASS*RUNESIZE); 474 n = 0; 475 na = DCLASS; 476 /* we have already seen the '[' */ 477 if(*exprp == '^'){ 478 classp[n++] = '\n'; /* don't match newline in negate case */ 479 negateclass = TRUE; 480 exprp++; 481 }else 482 negateclass = FALSE; 483 while((c1 = nextrec()) != ']'){ 484 if(c1 == '-'){ 485 Error: 486 free(classp); 487 regerror(Ebadclass); 488 } 489 if(n+4 >= na){ /* 3 runes plus NUL */ 490 na += DCLASS; 491 classp = erealloc(classp, na*RUNESIZE); 492 } 493 if(*exprp == '-'){ 494 exprp++; /* eat '-' */ 495 if((c2 = nextrec()) == ']') 496 goto Error; 497 classp[n+0] = Runemax; 498 classp[n+1] = c1; 499 classp[n+2] = c2; 500 n += 3; 501 }else 502 classp[n++] = c1; 503 } 504 classp[n] = 0; 505 if(nclass == Nclass){ 506 Nclass += DCLASS; 507 class = erealloc(class, Nclass*sizeof(Rune*)); 508 } 509 class[nclass++] = classp; 510 } 511 512 int 513 classmatch(int classno, int c, int negate) 514 { 515 Rune *p; 516 517 p = class[classno]; 518 while(*p){ 519 if(*p == Runemax){ 520 if(p[1]<=c && c<=p[2]) 521 return !negate; 522 p += 3; 523 }else if(*p++ == c) 524 return !negate; 525 } 526 return negate; 527 } 528 529 /* 530 * Note optimization in addinst: 531 * *l must be pending when addinst called; if *l has been looked 532 * at already, the optimization is a bug. 533 */ 534 int 535 addinst(Ilist *l, Inst *inst, Rangeset *sep) 536 { 537 Ilist *p; 538 539 for(p = l; p->inst; p++){ 540 if(p->inst==inst){ 541 if((sep)->p[0].p1 < p->se.p[0].p1) 542 p->se= *sep; /* this would be bug */ 543 return 0; /* It's already there */ 544 } 545 } 546 p->inst = inst; 547 p->se= *sep; 548 (p+1)->inst = 0; 549 return 1; 550 } 551 552 int 553 execute(File *f, Posn startp, Posn eof) 554 { 555 int flag = 0; 556 Inst *inst; 557 Ilist *tlp; 558 Posn p = startp; 559 int nnl = 0, ntl; 560 int c; 561 int wrapped = 0; 562 int startchar = startinst->type<OPERATOR? startinst->type : 0; 563 564 list[0][0].inst = list[1][0].inst = 0; 565 sel.p[0].p1 = -1; 566 /* Execute machine once for each character */ 567 for(;;p++){ 568 doloop: 569 c = filereadc(f, p); 570 if(p>=eof || c<0){ 571 switch(wrapped++){ 572 case 0: /* let loop run one more click */ 573 case 2: 574 break; 575 case 1: /* expired; wrap to beginning */ 576 if(sel.p[0].p1>=0 || eof!=INFINITY) 577 goto Return; 578 list[0][0].inst = list[1][0].inst = 0; 579 p = 0; 580 goto doloop; 581 default: 582 goto Return; 583 } 584 }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0) 585 break; 586 /* fast check for first char */ 587 if(startchar && nnl==0 && c!=startchar) 588 continue; 589 tl = list[flag]; 590 nl = list[flag^=1]; 591 nl->inst = 0; 592 ntl = nnl; 593 nnl = 0; 594 if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){ 595 /* Add first instruction to this list */ 596 sempty.p[0].p1 = p; 597 if(addinst(tl, startinst, &sempty)) 598 if(++ntl >= NLIST) 599 Overflow: 600 error(Eoverflow); 601 } 602 /* Execute machine until this list is empty */ 603 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ 604 Switchstmt: 605 switch(inst->type){ 606 default: /* regular character */ 607 if(inst->type==c){ 608 Addinst: 609 if(addinst(nl, inst->next, &tlp->se)) 610 if(++nnl >= NLIST) 611 goto Overflow; 612 } 613 break; 614 case LBRA: 615 if(inst->subid>=0) 616 tlp->se.p[inst->subid].p1 = p; 617 inst = inst->next; 618 goto Switchstmt; 619 case RBRA: 620 if(inst->subid>=0) 621 tlp->se.p[inst->subid].p2 = p; 622 inst = inst->next; 623 goto Switchstmt; 624 case ANY: 625 if(c!='\n') 626 goto Addinst; 627 break; 628 case BOL: 629 if(p==0 || filereadc(f, p - 1)=='\n'){ 630 Step: 631 inst = inst->next; 632 goto Switchstmt; 633 } 634 break; 635 case EOL: 636 if(c == '\n') 637 goto Step; 638 break; 639 case CCLASS: 640 if(c>=0 && classmatch(inst->rclass, c, 0)) 641 goto Addinst; 642 break; 643 case NCCLASS: 644 if(c>=0 && classmatch(inst->rclass, c, 1)) 645 goto Addinst; 646 break; 647 case OR: 648 /* evaluate right choice later */ 649 if(addinst(tl, inst->right, &tlp->se)) 650 if(++ntl >= NLIST) 651 goto Overflow; 652 /* efficiency: advance and re-evaluate */ 653 inst = inst->left; 654 goto Switchstmt; 655 case END: /* Match! */ 656 tlp->se.p[0].p2 = p; 657 newmatch(&tlp->se); 658 break; 659 } 660 } 661 } 662 Return: 663 return sel.p[0].p1>=0; 664 } 665 666 void 667 newmatch(Rangeset *sp) 668 { 669 int i; 670 671 if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 || 672 (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2)) 673 for(i = 0; i<NSUBEXP; i++) 674 sel.p[i] = sp->p[i]; 675 } 676 677 int 678 bexecute(File *f, Posn startp) 679 { 680 int flag = 0; 681 Inst *inst; 682 Ilist *tlp; 683 Posn p = startp; 684 int nnl = 0, ntl; 685 int c; 686 int wrapped = 0; 687 int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0; 688 689 list[0][0].inst = list[1][0].inst = 0; 690 sel.p[0].p1= -1; 691 /* Execute machine once for each character, including terminal NUL */ 692 for(;;--p){ 693 doloop: 694 if((c = filereadc(f, p - 1))==-1){ 695 switch(wrapped++){ 696 case 0: /* let loop run one more click */ 697 case 2: 698 break; 699 case 1: /* expired; wrap to end */ 700 if(sel.p[0].p1>=0) 701 case 3: 702 goto Return; 703 list[0][0].inst = list[1][0].inst = 0; 704 p = f->b.nc; 705 goto doloop; 706 default: 707 goto Return; 708 } 709 }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0) 710 break; 711 /* fast check for first char */ 712 if(startchar && nnl==0 && c!=startchar) 713 continue; 714 tl = list[flag]; 715 nl = list[flag^=1]; 716 nl->inst = 0; 717 ntl = nnl; 718 nnl = 0; 719 if(sel.p[0].p1<0 && (!wrapped || p>startp)){ 720 /* Add first instruction to this list */ 721 /* the minus is so the optimizations in addinst work */ 722 sempty.p[0].p1 = -p; 723 if(addinst(tl, bstartinst, &sempty)) 724 if(++ntl >= NLIST) 725 Overflow: 726 error(Eoverflow); 727 } 728 /* Execute machine until this list is empty */ 729 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ 730 Switchstmt: 731 switch(inst->type){ 732 default: /* regular character */ 733 if(inst->type == c){ 734 Addinst: 735 if(addinst(nl, inst->next, &tlp->se)) 736 if(++nnl >= NLIST) 737 goto Overflow; 738 } 739 break; 740 case LBRA: 741 if(inst->subid>=0) 742 tlp->se.p[inst->subid].p1 = p; 743 inst = inst->next; 744 goto Switchstmt; 745 case RBRA: 746 if(inst->subid >= 0) 747 tlp->se.p[inst->subid].p2 = p; 748 inst = inst->next; 749 goto Switchstmt; 750 case ANY: 751 if(c != '\n') 752 goto Addinst; 753 break; 754 case BOL: 755 if(c=='\n' || p==0){ 756 Step: 757 inst = inst->next; 758 goto Switchstmt; 759 } 760 break; 761 case EOL: 762 if(p==f->b.nc || filereadc(f, p)=='\n') 763 goto Step; 764 break; 765 case CCLASS: 766 if(c>=0 && classmatch(inst->rclass, c, 0)) 767 goto Addinst; 768 break; 769 case NCCLASS: 770 if(c>=0 && classmatch(inst->rclass, c, 1)) 771 goto Addinst; 772 break; 773 case OR: 774 /* evaluate right choice later */ 775 if(addinst(tlp, inst->right, &tlp->se)) 776 if(++ntl >= NLIST) 777 goto Overflow; 778 /* efficiency: advance and re-evaluate */ 779 inst = inst->left; 780 goto Switchstmt; 781 case END: /* Match! */ 782 tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */ 783 tlp->se.p[0].p2 = p; 784 bnewmatch(&tlp->se); 785 break; 786 } 787 } 788 } 789 Return: 790 return sel.p[0].p1>=0; 791 } 792 793 void 794 bnewmatch(Rangeset *sp) 795 { 796 int i; 797 if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1)) 798 for(i = 0; i<NSUBEXP; i++){ /* note the reversal; p1<=p2 */ 799 sel.p[i].p1 = sp->p[i].p2; 800 sel.p[i].p2 = sp->p[i].p1; 801 } 802 }