sort.c (29541B)
1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 5 /* 6 bugs: 7 00/ff for end of file can conflict with 00/ff characters 8 */ 9 10 enum 11 { 12 Nline = 500000, /* default max number of lines saved in memory */ 13 Nmerge = 10, /* max number of temporary files merged */ 14 Nfield = 20, /* max number of argument fields */ 15 16 Bflag = 1<<0, /* flags per field */ 17 B1flag = 1<<1, 18 19 Dflag = 1<<2, 20 Fflag = 1<<3, 21 Gflag = 1<<4, 22 Iflag = 1<<5, 23 Mflag = 1<<6, 24 Nflag = 1<<7, 25 Rflag = 1<<8, 26 Wflag = 1<<9, 27 28 NSstart = 0, /* states for number to key decoding */ 29 NSsign, 30 NSzero, 31 NSdigit, 32 NSpoint, 33 NSfract, 34 NSzerofract, 35 NSexp, 36 NSexpsign, 37 NSexpdigit 38 }; 39 40 typedef struct Line Line; 41 typedef struct Key Key; 42 typedef struct Merge Merge; 43 typedef struct Field Field; 44 45 struct Line 46 { 47 Key* key; 48 int llen; /* always >= 1 */ 49 uchar line[1]; /* always ends in '\n' */ 50 }; 51 52 struct Merge 53 { 54 Key* key; /* copy of line->key so (Line*) looks like (Merge*) */ 55 Line* line; /* line at the head of a merged temp file */ 56 int fd; /* file descriptor */ 57 Biobuf b; /* iobuf for reading a temp file */ 58 }; 59 60 struct Key 61 { 62 int klen; 63 uchar key[1]; 64 }; 65 66 struct Field 67 { 68 int beg1; 69 int beg2; 70 int end1; 71 int end2; 72 73 long flags; 74 uchar mapto[256]; 75 76 void (*dokey)(Key*, uchar*, uchar*, Field*); 77 }; 78 79 struct args 80 { 81 char* ofile; 82 char* tname; 83 Rune tabchar; 84 char cflag; 85 char uflag; 86 char vflag; 87 int nfield; 88 int nfile; 89 Field field[Nfield]; 90 91 Line** linep; 92 long nline; /* number of lines in this temp file */ 93 long lineno; /* overall ordinal for -s option */ 94 int ntemp; 95 long mline; /* max lines per file */ 96 } args; 97 98 extern int latinmap[]; 99 extern Rune* month[12]; 100 101 void buildkey(Line*); 102 void doargs(int, char*[]); 103 void dofield(char*, int*, int*, int, int); 104 void dofile(Biobuf*); 105 void dokey_(Key*, uchar*, uchar*, Field*); 106 void dokey_dfi(Key*, uchar*, uchar*, Field*); 107 void dokey_gn(Key*, uchar*, uchar*, Field*); 108 void dokey_m(Key*, uchar*, uchar*, Field*); 109 void dokey_r(Key*, uchar*, uchar*, Field*); 110 void done(char*); 111 int kcmp(Key*, Key*); 112 void makemapd(Field*); 113 void makemapm(Field*); 114 void mergefiles(int, int, Biobuf*); 115 void mergeout(Biobuf*); 116 void newfield(void); 117 Line* newline(Biobuf*); 118 void nomem(void); 119 void notifyf(void*, char*); 120 void printargs(void); 121 void printout(Biobuf*); 122 void setfield(int, int); 123 uchar* skip(uchar*, int, int, int, int); 124 void sort4(void*, ulong); 125 char* tempfile(int); 126 void tempout(void); 127 void lineout(Biobuf*, Line*); 128 129 void 130 main(int argc, char *argv[]) 131 { 132 int i, f; 133 char *s; 134 Biobuf bbuf; 135 136 notify(notifyf); /**/ 137 doargs(argc, argv); 138 if(args.vflag) 139 printargs(); 140 141 for(i=1; i<argc; i++) { 142 s = argv[i]; 143 if(s == 0) 144 continue; 145 if(strcmp(s, "-") == 0) { 146 Binit(&bbuf, 0, OREAD); 147 dofile(&bbuf); 148 Bterm(&bbuf); 149 continue; 150 } 151 f = open(s, OREAD); 152 if(f < 0) { 153 fprint(2, "sort: open %s: %r\n", s); 154 done("open"); 155 } 156 Binit(&bbuf, f, OREAD); 157 dofile(&bbuf); 158 Bterm(&bbuf); 159 close(f); 160 } 161 if(args.nfile == 0) { 162 Binit(&bbuf, 0, OREAD); 163 dofile(&bbuf); 164 Bterm(&bbuf); 165 } 166 if(args.cflag) 167 done(0); 168 if(args.vflag) 169 fprint(2, "=========\n"); 170 171 f = 1; 172 if(args.ofile) { 173 f = create(args.ofile, OWRITE, 0666); 174 if(f < 0) { 175 fprint(2, "sort: create %s: %r\n", args.ofile); 176 done("create"); 177 } 178 } 179 180 Binit(&bbuf, f, OWRITE); 181 if(args.ntemp) { 182 tempout(); 183 mergeout(&bbuf); 184 } else { 185 printout(&bbuf); 186 } 187 Bterm(&bbuf); 188 done(0); 189 } 190 191 void 192 dofile(Biobuf *b) 193 { 194 Line *l, *ol; 195 int n; 196 197 if(args.cflag) { 198 ol = newline(b); 199 if(ol == 0) 200 return; 201 for(;;) { 202 l = newline(b); 203 if(l == 0) 204 break; 205 n = kcmp(ol->key, l->key); 206 if(n > 0 || (n == 0 && args.uflag)) { 207 fprint(2, "sort: -c file not in sort\n"); /**/ 208 done("order"); 209 } 210 free(ol->key); 211 free(ol); 212 ol = l; 213 } 214 return; 215 } 216 217 if(args.linep == 0) { 218 args.linep = malloc(args.mline * sizeof(args.linep)); 219 if(args.linep == 0) 220 nomem(); 221 } 222 for(;;) { 223 l = newline(b); 224 if(l == 0) 225 break; 226 if(args.nline >= args.mline) 227 tempout(); 228 args.linep[args.nline] = l; 229 args.nline++; 230 args.lineno++; 231 } 232 } 233 234 void 235 notifyf(void *a, char *s) 236 { 237 USED(a); 238 if(strcmp(s, "interrupt") == 0) 239 done(0); 240 if(strcmp(s, "hangup") == 0) 241 done(0); 242 if(strcmp(s, "kill") == 0) 243 done(0); 244 if(strncmp(s, "sys: write on closed pipe", 25) == 0) 245 done(0); 246 noted(NDFLT); 247 } 248 249 Line* 250 newline(Biobuf *b) 251 { 252 Line *l; 253 char *p; 254 int n, c; 255 256 p = Brdline(b, '\n'); 257 n = Blinelen(b); 258 if(p == 0) { 259 if(n == 0) 260 return 0; 261 l = 0; 262 for(n=0;;) { 263 if((n & 31) == 0) { 264 l = realloc(l, sizeof(Line) + 265 (n+31)*sizeof(l->line[0])); 266 if(l == 0) 267 nomem(); 268 } 269 c = Bgetc(b); 270 if(c < 0) { 271 fprint(2, "sort: newline added\n"); 272 c = '\n'; 273 } 274 l->line[n++] = c; 275 if(c == '\n') 276 break; 277 } 278 l->llen = n; 279 buildkey(l); 280 return l; 281 } 282 l = malloc(sizeof(Line) + 283 (n-1)*sizeof(l->line[0])); 284 if(l == 0) 285 nomem(); 286 l->llen = n; 287 memmove(l->line, p, n); 288 buildkey(l); 289 return l; 290 } 291 292 void 293 lineout(Biobuf *b, Line *l) 294 { 295 int n, m; 296 297 n = l->llen; 298 m = Bwrite(b, l->line, n); 299 if(n != m) 300 exits("write"); 301 } 302 303 void 304 tempout(void) 305 { 306 long n; 307 Line **lp, *l; 308 char *tf; 309 int f; 310 Biobuf tb; 311 312 sort4(args.linep, args.nline); 313 tf = tempfile(args.ntemp); 314 args.ntemp++; 315 f = create(tf, OWRITE, 0666); 316 if(f < 0) { 317 fprint(2, "sort: create %s: %r\n", tf); 318 done("create"); 319 } 320 321 Binit(&tb, f, OWRITE); 322 lp = args.linep; 323 for(n=args.nline; n>0; n--) { 324 l = *lp++; 325 lineout(&tb, l); 326 free(l->key); 327 free(l); 328 } 329 args.nline = 0; 330 Bterm(&tb); 331 close(f); 332 } 333 334 void 335 done(char *xs) 336 { 337 int i; 338 339 for(i=0; i<args.ntemp; i++) 340 remove(tempfile(i)); 341 exits(xs); 342 } 343 344 void 345 nomem(void) 346 { 347 fprint(2, "sort: out of memory\n"); 348 done("mem"); 349 } 350 351 char* 352 tempfile(int n) 353 { 354 static char file[100]; 355 static uint pid; 356 char *dir; 357 358 dir = "/var/tmp"; 359 if(args.tname) 360 dir = args.tname; 361 if(strlen(dir) >= nelem(file)-20) { 362 fprint(2, "temp file directory name is too long: %s\n", dir); 363 done("tdir"); 364 } 365 366 if(pid == 0) { 367 pid = getpid(); 368 if(pid == 0) { 369 pid = time(0); 370 if(pid == 0) 371 pid = 1; 372 } 373 } 374 375 sprint(file, "%s/sort.%.4d.%.4d", dir, pid%10000, n); 376 return file; 377 } 378 379 void 380 mergeout(Biobuf *b) 381 { 382 int n, i, f; 383 char *tf; 384 Biobuf tb; 385 386 for(i=0; i<args.ntemp; i+=n) { 387 n = args.ntemp - i; 388 if(n > Nmerge) { 389 tf = tempfile(args.ntemp); 390 args.ntemp++; 391 f = create(tf, OWRITE, 0666); 392 if(f < 0) { 393 fprint(2, "sort: create %s: %r\n", tf); 394 done("create"); 395 } 396 Binit(&tb, f, OWRITE); 397 398 n = Nmerge; 399 mergefiles(i, n, &tb); 400 401 Bterm(&tb); 402 close(f); 403 } else 404 mergefiles(i, n, b); 405 } 406 } 407 408 void 409 mergefiles(int t, int n, Biobuf *b) 410 { 411 Merge *m, *mp, **mmp; 412 Key *ok; 413 Line *l; 414 char *tf; 415 int i, f, nn; 416 417 mmp = malloc(n*sizeof(*mmp)); 418 mp = malloc(n*sizeof(*mp)); 419 if(mmp == 0 || mp == 0) 420 nomem(); 421 422 nn = 0; 423 m = mp; 424 for(i=0; i<n; i++,m++) { 425 tf = tempfile(t+i); 426 f = open(tf, OREAD); 427 if(f < 0) { 428 fprint(2, "sort: reopen %s: %r\n", tf); 429 done("open"); 430 } 431 m->fd = f; 432 Binit(&m->b, f, OREAD); 433 mmp[nn] = m; 434 435 l = newline(&m->b); 436 if(l == 0) 437 continue; 438 nn++; 439 m->line = l; 440 m->key = l->key; 441 } 442 443 ok = 0; 444 for(;;) { 445 sort4(mmp, nn); 446 m = *mmp; 447 if(nn == 0) 448 break; 449 for(;;) { 450 l = m->line; 451 if(args.uflag && ok && kcmp(ok, l->key) == 0) { 452 free(l->key); 453 free(l); 454 } else { 455 lineout(b, l); 456 if(ok) 457 free(ok); 458 ok = l->key; 459 free(l); 460 } 461 462 l = newline(&m->b); 463 if(l == 0) { 464 nn--; 465 mmp[0] = mmp[nn]; 466 break; 467 } 468 m->line = l; 469 m->key = l->key; 470 if(nn > 1 && kcmp(mmp[0]->key, mmp[1]->key) > 0) 471 break; 472 } 473 } 474 if(ok) 475 free(ok); 476 477 m = mp; 478 for(i=0; i<n; i++,m++) { 479 Bterm(&m->b); 480 close(m->fd); 481 } 482 483 free(mp); 484 free(mmp); 485 } 486 487 int 488 kcmp(Key *ka, Key *kb) 489 { 490 int n, m; 491 492 /* 493 * set n to length of smaller key 494 */ 495 n = ka->klen; 496 m = kb->klen; 497 if(n > m) 498 n = m; 499 return memcmp(ka->key, kb->key, n); 500 } 501 502 void 503 printout(Biobuf *b) 504 { 505 long n; 506 Line **lp, *l; 507 Key *ok; 508 509 sort4(args.linep, args.nline); 510 lp = args.linep; 511 ok = 0; 512 for(n=args.nline; n>0; n--) { 513 l = *lp++; 514 if(args.uflag && ok && kcmp(ok, l->key) == 0) 515 continue; 516 lineout(b, l); 517 ok = l->key; 518 } 519 } 520 521 void 522 setfield(int n, int c) 523 { 524 Field *f; 525 526 f = &args.field[n]; 527 switch(c) { 528 default: 529 fprint(2, "sort: unknown option: field.%C\n", c); 530 done("option"); 531 case 'b': /* skip blanks */ 532 f->flags |= Bflag; 533 break; 534 case 'd': /* directory order */ 535 f->flags |= Dflag; 536 break; 537 case 'f': /* fold case */ 538 f->flags |= Fflag; 539 break; 540 case 'g': /* floating point -n case */ 541 f->flags |= Gflag; 542 break; 543 case 'i': /* ignore non-ascii */ 544 f->flags |= Iflag; 545 break; 546 case 'M': /* month */ 547 f->flags |= Mflag; 548 break; 549 case 'n': /* numbers */ 550 f->flags |= Nflag; 551 break; 552 case 'r': /* reverse */ 553 f->flags |= Rflag; 554 break; 555 case 'w': /* ignore white */ 556 f->flags |= Wflag; 557 break; 558 } 559 } 560 561 void 562 dofield(char *s, int *n1, int *n2, int off1, int off2) 563 { 564 int c, n; 565 566 c = *s++; 567 if(c >= '0' && c <= '9') { 568 n = 0; 569 while(c >= '0' && c <= '9') { 570 n = n*10 + (c-'0'); 571 c = *s++; 572 } 573 n -= off1; /* posix committee: rot in hell */ 574 if(n < 0) { 575 fprint(2, "sort: field offset must be positive\n"); 576 done("option"); 577 } 578 *n1 = n; 579 } 580 if(c == '.') { 581 c = *s++; 582 if(c >= '0' && c <= '9') { 583 n = 0; 584 while(c >= '0' && c <= '9') { 585 n = n*10 + (c-'0'); 586 c = *s++; 587 } 588 n -= off2; 589 if(n < 0) { 590 fprint(2, "sort: character offset must be positive\n"); 591 done("option"); 592 } 593 *n2 = n; 594 } 595 } 596 while(c != 0) { 597 setfield(args.nfield, c); 598 c = *s++; 599 } 600 } 601 602 void 603 printargs(void) 604 { 605 int i, n; 606 Field *f; 607 char *prefix; 608 609 fprint(2, "sort"); 610 for(i=0; i<=args.nfield; i++) { 611 f = &args.field[i]; 612 prefix = " -"; 613 if(i) { 614 n = f->beg1; 615 if(n >= 0) 616 fprint(2, " +%d", n); 617 else 618 fprint(2, " +*"); 619 n = f->beg2; 620 if(n >= 0) 621 fprint(2, ".%d", n); 622 else 623 fprint(2, ".*"); 624 625 if(f->flags & B1flag) 626 fprint(2, "b"); 627 628 n = f->end1; 629 if(n >= 0) 630 fprint(2, " -%d", n); 631 else 632 fprint(2, " -*"); 633 n = f->end2; 634 if(n >= 0) 635 fprint(2, ".%d", n); 636 else 637 fprint(2, ".*"); 638 prefix = ""; 639 } 640 if(f->flags & Bflag) 641 fprint(2, "%sb", prefix); 642 if(f->flags & Dflag) 643 fprint(2, "%sd", prefix); 644 if(f->flags & Fflag) 645 fprint(2, "%sf", prefix); 646 if(f->flags & Gflag) 647 fprint(2, "%sg", prefix); 648 if(f->flags & Iflag) 649 fprint(2, "%si", prefix); 650 if(f->flags & Mflag) 651 fprint(2, "%sM", prefix); 652 if(f->flags & Nflag) 653 fprint(2, "%sn", prefix); 654 if(f->flags & Rflag) 655 fprint(2, "%sr", prefix); 656 if(f->flags & Wflag) 657 fprint(2, "%sw", prefix); 658 } 659 if(args.cflag) 660 fprint(2, " -c"); 661 if(args.uflag) 662 fprint(2, " -u"); 663 if(args.ofile) 664 fprint(2, " -o %s", args.ofile); 665 if(args.mline != Nline) 666 fprint(2, " -l %ld", args.mline); 667 fprint(2, "\n"); 668 } 669 670 void 671 newfield(void) 672 { 673 int n; 674 Field *f; 675 676 n = args.nfield + 1; 677 if(n >= Nfield) { 678 fprint(2, "sort: too many fields specified\n"); 679 done("option"); 680 } 681 args.nfield = n; 682 f = &args.field[n]; 683 f->beg1 = -1; 684 f->beg2 = -1; 685 f->end1 = -1; 686 f->end2 = -1; 687 } 688 689 void 690 doargs(int argc, char *argv[]) 691 { 692 int i, c, hadplus; 693 char *s, *p, *q; 694 Field *f; 695 696 hadplus = 0; 697 args.mline = Nline; 698 for(i=1; i<argc; i++) { 699 s = argv[i]; 700 c = *s++; 701 if(c == '-') { 702 c = *s; 703 if(c == 0) /* forced end of arg marker */ 704 break; 705 argv[i] = 0; /* clobber args processed */ 706 if(c == '.' || (c >= '0' && c <= '9')) { 707 if(!hadplus) 708 newfield(); 709 f = &args.field[args.nfield]; 710 dofield(s, &f->end1, &f->end2, 0, 0); 711 hadplus = 0; 712 continue; 713 } 714 715 while(c = *s++) 716 switch(c) { 717 case '-': /* end of options */ 718 i = argc; 719 continue; 720 case 'T': /* temp directory */ 721 if(*s == 0) { 722 i++; 723 if(i < argc) { 724 args.tname = argv[i]; 725 argv[i] = 0; 726 } 727 } else 728 args.tname = s; 729 s = strchr(s, 0); 730 break; 731 case 'o': /* output file */ 732 if(*s == 0) { 733 i++; 734 if(i < argc) { 735 args.ofile = argv[i]; 736 argv[i] = 0; 737 } 738 } else 739 args.ofile = s; 740 s = strchr(s, 0); 741 break; 742 case 'k': /* posix key (what were they thinking?) */ 743 p = 0; 744 if(*s == 0) { 745 i++; 746 if(i < argc) { 747 p = argv[i]; 748 argv[i] = 0; 749 } 750 } else 751 p = s; 752 s = strchr(s, 0); 753 if(p == 0) 754 break; 755 756 newfield(); 757 q = strchr(p, ','); 758 if(q) 759 *q++ = 0; 760 f = &args.field[args.nfield]; 761 dofield(p, &f->beg1, &f->beg2, 1, 1); 762 if(f->flags & Bflag) { 763 f->flags |= B1flag; 764 f->flags &= ~Bflag; 765 } 766 if(q) { 767 dofield(q, &f->end1, &f->end2, 1, 0); 768 if(f->end2 <= 0) 769 f->end1++; 770 } 771 hadplus = 0; 772 break; 773 case 't': /* tab character */ 774 if(*s == 0) { 775 i++; 776 if(i < argc) { 777 chartorune(&args.tabchar, argv[i]); 778 argv[i] = 0; 779 } 780 } else 781 s += chartorune(&args.tabchar, s); 782 if(args.tabchar == '\n') { 783 fprint(2, "aw come on, rob\n"); 784 done("rob"); 785 } 786 break; 787 case 'c': /* check order */ 788 args.cflag = 1; 789 break; 790 case 'u': /* unique */ 791 args.uflag = 1; 792 break; 793 case 'v': /* debugging noise */ 794 args.vflag = 1; 795 break; 796 case 'l': 797 if(*s == 0) { 798 i++; 799 if(i < argc) { 800 args.mline = atol(argv[i]); 801 argv[i] = 0; 802 } 803 } else 804 args.mline = atol(s); 805 s = strchr(s, 0); 806 break; 807 808 case 'M': /* month */ 809 case 'b': /* skip blanks */ 810 case 'd': /* directory order */ 811 case 'f': /* fold case */ 812 case 'g': /* floating numbers */ 813 case 'i': /* ignore non-ascii */ 814 case 'n': /* numbers */ 815 case 'r': /* reverse */ 816 case 'w': /* ignore white */ 817 if(args.nfield > 0) 818 fprint(2, "sort: global field set after -k\n"); 819 setfield(0, c); 820 break; 821 case 'm': 822 /* option m silently ignored but required by posix */ 823 break; 824 default: 825 fprint(2, "sort: unknown option: -%C\n", c); 826 done("option"); 827 } 828 continue; 829 } 830 if(c == '+') { 831 argv[i] = 0; /* clobber args processed */ 832 c = *s; 833 if(c == '.' || (c >= '0' && c <= '9')) { 834 newfield(); 835 f = &args.field[args.nfield]; 836 dofield(s, &f->beg1, &f->beg2, 0, 0); 837 if(f->flags & Bflag) { 838 f->flags |= B1flag; 839 f->flags &= ~Bflag; 840 } 841 hadplus = 1; 842 continue; 843 } 844 fprint(2, "sort: unknown option: +%C\n", c); 845 done("option"); 846 } 847 args.nfile++; 848 } 849 850 for(i=0; i<=args.nfield; i++) { 851 f = &args.field[i]; 852 853 /* 854 * global options apply to fields that 855 * specify no options 856 */ 857 if(f->flags == 0) { 858 f->flags = args.field[0].flags; 859 if(args.field[0].flags & Bflag) 860 f->flags |= B1flag; 861 } 862 863 864 /* 865 * build buildkey specification 866 */ 867 switch(f->flags & ~(Bflag|B1flag)) { 868 default: 869 fprint(2, "sort: illegal combination of flags: %lx\n", f->flags); 870 done("option"); 871 case 0: 872 f->dokey = dokey_; 873 break; 874 case Rflag: 875 f->dokey = dokey_r; 876 break; 877 case Gflag: 878 case Nflag: 879 case Gflag|Nflag: 880 case Gflag|Rflag: 881 case Nflag|Rflag: 882 case Gflag|Nflag|Rflag: 883 f->dokey = dokey_gn; 884 break; 885 case Mflag: 886 case Mflag|Rflag: 887 f->dokey = dokey_m; 888 makemapm(f); 889 break; 890 case Dflag: 891 case Dflag|Fflag: 892 case Dflag|Fflag|Iflag: 893 case Dflag|Fflag|Iflag|Rflag: 894 case Dflag|Fflag|Iflag|Rflag|Wflag: 895 case Dflag|Fflag|Iflag|Wflag: 896 case Dflag|Fflag|Rflag: 897 case Dflag|Fflag|Rflag|Wflag: 898 case Dflag|Fflag|Wflag: 899 case Dflag|Iflag: 900 case Dflag|Iflag|Rflag: 901 case Dflag|Iflag|Rflag|Wflag: 902 case Dflag|Iflag|Wflag: 903 case Dflag|Rflag: 904 case Dflag|Rflag|Wflag: 905 case Dflag|Wflag: 906 case Fflag: 907 case Fflag|Iflag: 908 case Fflag|Iflag|Rflag: 909 case Fflag|Iflag|Rflag|Wflag: 910 case Fflag|Iflag|Wflag: 911 case Fflag|Rflag: 912 case Fflag|Rflag|Wflag: 913 case Fflag|Wflag: 914 case Iflag: 915 case Iflag|Rflag: 916 case Iflag|Rflag|Wflag: 917 case Iflag|Wflag: 918 case Wflag: 919 f->dokey = dokey_dfi; 920 makemapd(f); 921 break; 922 } 923 } 924 925 /* 926 * random spot checks 927 */ 928 if(args.nfile > 1 && args.cflag) { 929 fprint(2, "sort: -c can have at most one input file\n"); 930 done("option"); 931 } 932 return; 933 } 934 935 uchar* 936 skip(uchar *l, int n1, int n2, int bflag, int endfield) 937 { 938 int i, c, tc; 939 Rune r; 940 941 if(endfield && n1 < 0) 942 return 0; 943 944 c = *l++; 945 tc = args.tabchar; 946 if(tc) { 947 if(tc < Runeself) { 948 for(i=n1; i>0; i--) { 949 while(c != tc) { 950 if(c == '\n') 951 return 0; 952 c = *l++; 953 } 954 if(!(endfield && i == 1)) 955 c = *l++; 956 } 957 } else { 958 l--; 959 l += chartorune(&r, (char*)l); 960 for(i=n1; i>0; i--) { 961 while(r != tc) { 962 if(r == '\n') 963 return 0; 964 l += chartorune(&r, (char*)l); 965 } 966 if(!(endfield && i == 1)) 967 l += chartorune(&r, (char*)l); 968 } 969 c = r; 970 } 971 } else { 972 for(i=n1; i>0; i--) { 973 while(c == ' ' || c == '\t') 974 c = *l++; 975 while(c != ' ' && c != '\t') { 976 if(c == '\n') 977 return 0; 978 c = *l++; 979 } 980 } 981 } 982 983 if(bflag) 984 while(c == ' ' || c == '\t') 985 c = *l++; 986 987 l--; 988 for(i=n2; i>0; i--) { 989 c = *l; 990 if(c < Runeself) { 991 if(c == '\n') 992 return 0; 993 l++; 994 continue; 995 } 996 l += chartorune(&r, (char*)l); 997 } 998 return l; 999 } 1000 1001 void 1002 dokey_gn(Key *k, uchar *lp, uchar *lpe, Field *f) 1003 { 1004 uchar *kp; 1005 int c, cl, dp; 1006 int state, nzero, exp, expsign, rflag; 1007 1008 cl = k->klen + 3; 1009 kp = k->key + cl; /* skip place for sign, exponent[2] */ 1010 1011 nzero = 0; /* number of trailing zeros */ 1012 exp = 0; /* value of the exponent */ 1013 expsign = 0; /* sign of the exponent */ 1014 dp = 0x4040; /* location of decimal point */ 1015 rflag = f->flags&Rflag; /* xor of rflag and - sign */ 1016 state = NSstart; 1017 1018 for(;; lp++) { 1019 if(lp >= lpe) 1020 break; 1021 c = *lp; 1022 1023 if(c == ' ' || c == '\t') { 1024 switch(state) { 1025 case NSstart: 1026 case NSsign: 1027 continue; 1028 } 1029 break; 1030 } 1031 if(c == '+' || c == '-') { 1032 switch(state) { 1033 case NSstart: 1034 state = NSsign; 1035 if(c == '-') 1036 rflag = !rflag; 1037 continue; 1038 case NSexp: 1039 state = NSexpsign; 1040 if(c == '-') 1041 expsign = 1; 1042 continue; 1043 } 1044 break; 1045 } 1046 if(c == '0') { 1047 switch(state) { 1048 case NSdigit: 1049 if(rflag) 1050 c = ~c; 1051 *kp++ = c; 1052 cl++; 1053 nzero++; 1054 dp++; 1055 state = NSdigit; 1056 continue; 1057 case NSfract: 1058 if(rflag) 1059 c = ~c; 1060 *kp++ = c; 1061 cl++; 1062 nzero++; 1063 state = NSfract; 1064 continue; 1065 case NSstart: 1066 case NSsign: 1067 case NSzero: 1068 state = NSzero; 1069 continue; 1070 case NSzerofract: 1071 case NSpoint: 1072 dp--; 1073 state = NSzerofract; 1074 continue; 1075 case NSexpsign: 1076 case NSexp: 1077 case NSexpdigit: 1078 exp = exp*10 + (c - '0'); 1079 state = NSexpdigit; 1080 continue; 1081 } 1082 break; 1083 } 1084 if(c >= '1' && c <= '9') { 1085 switch(state) { 1086 case NSzero: 1087 case NSstart: 1088 case NSsign: 1089 case NSdigit: 1090 if(rflag) 1091 c = ~c; 1092 *kp++ = c; 1093 cl++; 1094 nzero = 0; 1095 dp++; 1096 state = NSdigit; 1097 continue; 1098 case NSzerofract: 1099 case NSpoint: 1100 case NSfract: 1101 if(rflag) 1102 c = ~c; 1103 *kp++ = c; 1104 cl++; 1105 nzero = 0; 1106 state = NSfract; 1107 continue; 1108 case NSexpsign: 1109 case NSexp: 1110 case NSexpdigit: 1111 exp = exp*10 + (c - '0'); 1112 state = NSexpdigit; 1113 continue; 1114 } 1115 break; 1116 } 1117 if(c == '.') { 1118 switch(state) { 1119 case NSstart: 1120 case NSsign: 1121 state = NSpoint; 1122 continue; 1123 case NSzero: 1124 state = NSzerofract; 1125 continue; 1126 case NSdigit: 1127 state = NSfract; 1128 continue; 1129 } 1130 break; 1131 } 1132 if((f->flags & Gflag) && (c == 'e' || c == 'E')) { 1133 switch(state) { 1134 case NSdigit: 1135 case NSfract: 1136 state = NSexp; 1137 continue; 1138 } 1139 break; 1140 } 1141 break; 1142 } 1143 1144 switch(state) { 1145 /* 1146 * result is zero 1147 */ 1148 case NSstart: 1149 case NSsign: 1150 case NSzero: 1151 case NSzerofract: 1152 case NSpoint: 1153 kp = k->key + k->klen; 1154 k->klen += 2; 1155 kp[0] = 0x20; /* between + and - */ 1156 kp[1] = 0; 1157 return; 1158 /* 1159 * result has exponent 1160 */ 1161 case NSexpsign: 1162 case NSexp: 1163 case NSexpdigit: 1164 if(expsign) 1165 exp = -exp; 1166 dp += exp; 1167 1168 /* 1169 * result is fixed point number 1170 */ 1171 case NSdigit: 1172 case NSfract: 1173 kp -= nzero; 1174 cl -= nzero; 1175 break; 1176 } 1177 1178 /* 1179 * end of number 1180 */ 1181 c = 0; 1182 if(rflag) 1183 c = ~c; 1184 *kp = c; 1185 1186 /* 1187 * sign and exponent 1188 */ 1189 c = 0x30; 1190 if(rflag) { 1191 c = 0x10; 1192 dp = ~dp; 1193 } 1194 kp = k->key + k->klen; 1195 kp[0] = c; 1196 kp[1] = (dp >> 8); 1197 kp[2] = dp; 1198 k->klen = cl+1; 1199 } 1200 1201 void 1202 dokey_m(Key *k, uchar *lp, uchar *lpe, Field *f) 1203 { 1204 uchar *kp; 1205 Rune r, place[3]; 1206 int c, cl, pc; 1207 int rflag; 1208 1209 rflag = f->flags&Rflag; 1210 pc = 0; 1211 1212 cl = k->klen; 1213 kp = k->key + cl; 1214 1215 for(;;) { 1216 /* 1217 * get the character 1218 */ 1219 if(lp >= lpe) 1220 break; 1221 c = *lp; 1222 if(c >= Runeself) { 1223 lp += chartorune(&r, (char*)lp); 1224 c = r; 1225 } else 1226 lp++; 1227 1228 if(c < nelem(f->mapto)) { 1229 c = f->mapto[c]; 1230 if(c == 0) 1231 continue; 1232 } 1233 place[pc++] = c; 1234 if(pc < 3) 1235 continue; 1236 for(c=11; c>=0; c--) 1237 if(memcmp(month[c], place, sizeof(place)) == 0) 1238 break; 1239 c += 10; 1240 if(rflag) 1241 c = ~c; 1242 *kp++ = c; 1243 cl++; 1244 break; 1245 } 1246 1247 c = 0; 1248 if(rflag) 1249 c = ~c; 1250 *kp = c; 1251 k->klen = cl+1; 1252 } 1253 1254 void 1255 dokey_dfi(Key *k, uchar *lp, uchar *lpe, Field *f) 1256 { 1257 uchar *kp; 1258 Rune r; 1259 int c, cl, n, rflag; 1260 1261 cl = k->klen; 1262 kp = k->key + cl; 1263 rflag = f->flags & Rflag; 1264 1265 for(;;) { 1266 /* 1267 * get the character 1268 */ 1269 if(lp >= lpe) 1270 break; 1271 c = *lp; 1272 if(c >= Runeself) { 1273 lp += chartorune(&r, (char*)lp); 1274 c = r; 1275 } else 1276 lp++; 1277 1278 /* 1279 * do the various mappings. 1280 * the common case is handled 1281 * completely by the table. 1282 */ 1283 if(c != 0 && c < Runeself) { 1284 c = f->mapto[c]; 1285 if(c) { 1286 *kp++ = c; 1287 cl++; 1288 } 1289 continue; 1290 } 1291 1292 /* 1293 * for characters out of range, 1294 * the table does not do Rflag. 1295 * ignore is based on mapto[255] 1296 */ 1297 if(c != 0 && c < nelem(f->mapto)) { 1298 c = f->mapto[c]; 1299 if(c == 0) 1300 continue; 1301 } else 1302 if(f->mapto[nelem(f->mapto)-1] == 0) 1303 continue; 1304 1305 /* 1306 * put it in the key 1307 */ 1308 r = c; 1309 n = runetochar((char*)kp, &r); 1310 kp += n; 1311 cl += n; 1312 if(rflag) 1313 while(n > 0) { 1314 kp[-n] = ~kp[-n]; 1315 n--; 1316 } 1317 } 1318 1319 /* 1320 * end of key 1321 */ 1322 k->klen = cl+1; 1323 if(rflag) { 1324 *kp = ~0; 1325 return; 1326 } 1327 *kp = 0; 1328 } 1329 1330 void 1331 dokey_r(Key *k, uchar *lp, uchar *lpe, Field *f) 1332 { 1333 int cl, n; 1334 uchar *kp; 1335 1336 USED(f); 1337 n = lpe - lp; 1338 if(n < 0) 1339 n = 0; 1340 cl = k->klen; 1341 kp = k->key + cl; 1342 k->klen = cl+n+1; 1343 1344 lpe -= 3; 1345 while(lp < lpe) { 1346 kp[0] = ~lp[0]; 1347 kp[1] = ~lp[1]; 1348 kp[2] = ~lp[2]; 1349 kp[3] = ~lp[3]; 1350 kp += 4; 1351 lp += 4; 1352 } 1353 1354 lpe += 3; 1355 while(lp < lpe) 1356 *kp++ = ~*lp++; 1357 *kp = ~0; 1358 } 1359 1360 void 1361 dokey_(Key *k, uchar *lp, uchar *lpe, Field *f) 1362 { 1363 int n, cl; 1364 uchar *kp; 1365 1366 USED(f); 1367 n = lpe - lp; 1368 if(n < 0) 1369 n = 0; 1370 cl = k->klen; 1371 kp = k->key + cl; 1372 k->klen = cl+n+1; 1373 memmove(kp, lp, n); 1374 kp[n] = 0; 1375 } 1376 1377 void 1378 buildkey(Line *l) 1379 { 1380 Key *k; 1381 uchar *lp, *lpe; 1382 int ll, kl, cl, i, n; 1383 Field *f; 1384 1385 ll = l->llen - 1; 1386 kl = 0; /* allocated length */ 1387 cl = 0; /* current length */ 1388 k = 0; 1389 1390 for(i=1; i<=args.nfield; i++) { 1391 f = &args.field[i]; 1392 lp = skip(l->line, f->beg1, f->beg2, f->flags&B1flag, 0); 1393 if(lp == 0) 1394 lp = l->line + ll; 1395 lpe = skip(l->line, f->end1, f->end2, f->flags&Bflag, 1); 1396 if(lpe == 0) 1397 lpe = l->line + ll; 1398 n = (lpe - lp) + 1; 1399 if(n <= 0) 1400 n = 1; 1401 if(cl+(n+4) > kl) { 1402 kl = cl+(n+4); 1403 k = realloc(k, sizeof(Key) + 1404 (kl-1)*sizeof(k->key[0])); 1405 if(k == 0) 1406 nomem(); 1407 } 1408 k->klen = cl; 1409 (*f->dokey)(k, lp, lpe, f); 1410 cl = k->klen; 1411 } 1412 1413 /* 1414 * global comparisons 1415 */ 1416 if(!(args.uflag && cl > 0)) { 1417 f = &args.field[0]; 1418 if(cl+(ll+4) > kl) { 1419 kl = cl+(ll+4); 1420 k = realloc(k, sizeof(Key) + 1421 (kl-1)*sizeof(k->key[0])); 1422 if(k == 0) 1423 nomem(); 1424 } 1425 k->klen = cl; 1426 (*f->dokey)(k, l->line, l->line+ll, f); 1427 cl = k->klen; 1428 } 1429 1430 l->key = k; 1431 k->klen = cl; 1432 1433 if(args.vflag) { 1434 write(2, l->line, l->llen); 1435 for(i=0; i<k->klen; i++) { 1436 fprint(2, " %.2x", k->key[i]); 1437 if(k->key[i] == 0x00 || k->key[i] == 0xff) 1438 fprint(2, "\n"); 1439 } 1440 } 1441 } 1442 1443 void 1444 makemapm(Field *f) 1445 { 1446 int i, c; 1447 1448 for(i=0; i<nelem(f->mapto); i++) { 1449 c = 1; 1450 if(i == ' ' || i == '\t') 1451 c = 0; 1452 if(i >= 'a' && i <= 'z') 1453 c = i + ('A' - 'a'); 1454 if(i >= 'A' && i <= 'Z') 1455 c = i; 1456 f->mapto[i] = c; 1457 if(args.vflag) { 1458 if((i & 15) == 0) 1459 fprint(2, " "); 1460 fprint(2, " %.2x", c); 1461 if((i & 15) == 15) 1462 fprint(2, "\n"); 1463 } 1464 } 1465 } 1466 1467 void 1468 makemapd(Field *f) 1469 { 1470 int i, j, c; 1471 1472 for(i=0; i<nelem(f->mapto); i++) { 1473 c = i; 1474 if(f->flags & Iflag) 1475 if(c < 040 || c > 0176) 1476 c = -1; 1477 if((f->flags & Wflag) && c >= 0) 1478 if(c == ' ' || c == '\t') 1479 c = -1; 1480 if((f->flags & Dflag) && c >= 0) 1481 if(!(c == ' ' || c == '\t' || 1482 (c >= 'a' && c <= 'z') || 1483 (c >= 'A' && c <= 'Z') || 1484 (c >= '0' && c <= '9'))) { 1485 for(j=0; latinmap[j]; j+=3) 1486 if(c == latinmap[j+0] || 1487 c == latinmap[j+1]) 1488 break; 1489 if(latinmap[j] == 0) 1490 c = -1; 1491 } 1492 if((f->flags & Fflag) && c >= 0) { 1493 if(c >= 'a' && c <= 'z') 1494 c += 'A' - 'a'; 1495 for(j=0; latinmap[j]; j+=3) 1496 if(c == latinmap[j+0] || 1497 c == latinmap[j+1]) { 1498 c = latinmap[j+2]; 1499 break; 1500 } 1501 } 1502 if((f->flags & Rflag) && c >= 0 && i > 0 && i < Runeself) 1503 c = ~c & 0xff; 1504 if(c < 0) 1505 c = 0; 1506 f->mapto[i] = c; 1507 if(args.vflag) { 1508 if((i & 15) == 0) 1509 fprint(2, " "); 1510 fprint(2, " %.2x", c); 1511 if((i & 15) == 15) 1512 fprint(2, "\n"); 1513 } 1514 } 1515 } 1516 1517 int latinmap[] = 1518 { 1519 /* lcase ucase fold */ 1520 0xe0, 0xc0, 0x41, /* L'à', L'À', L'A', */ 1521 0xe1, 0xc1, 0x41, /* L'á', L'Á', L'A', */ 1522 0xe2, 0xc2, 0x41, /* L'â', L'Â', L'A', */ 1523 0xe4, 0xc4, 0x41, /* L'ä', L'Ä', L'A', */ 1524 0xe3, 0xc3, 0x41, /* L'ã', L'Ã', L'A', */ 1525 0xe5, 0xc5, 0x41, /* L'å', L'Å', L'A', */ 1526 0xe8, 0xc8, 0x45, /* L'è', L'È', L'E', */ 1527 0xe9, 0xc9, 0x45, /* L'é', L'É', L'E', */ 1528 0xea, 0xca, 0x45, /* L'ê', L'Ê', L'E', */ 1529 0xeb, 0xcb, 0x45, /* L'ë', L'Ë', L'E', */ 1530 0xec, 0xcc, 0x49, /* L'ì', L'Ì', L'I', */ 1531 0xed, 0xcd, 0x49, /* L'í', L'Í', L'I', */ 1532 0xee, 0xce, 0x49, /* L'î', L'Î', L'I', */ 1533 0xef, 0xcf, 0x49, /* L'ï', L'Ï', L'I', */ 1534 0xf2, 0xd2, 0x4f, /* L'ò', L'Ò', L'O', */ 1535 0xf3, 0xd3, 0x4f, /* L'ó', L'Ó', L'O', */ 1536 0xf4, 0xd4, 0x4f, /* L'ô', L'Ô', L'O', */ 1537 0xf6, 0xd6, 0x4f, /* L'ö', L'Ö', L'O', */ 1538 0xf5, 0xd5, 0x4f, /* L'õ', L'Õ', L'O', */ 1539 0xf8, 0xd8, 0x4f, /* L'ø', L'Ø', L'O', */ 1540 0xf9, 0xd9, 0x55, /* L'ù', L'Ù', L'U', */ 1541 0xfa, 0xda, 0x55, /* L'ú', L'Ú', L'U', */ 1542 0xfb, 0xdb, 0x55, /* L'û', L'Û', L'U', */ 1543 0xfc, 0xdc, 0x55, /* L'ü', L'Ü', L'U', */ 1544 0xe6, 0xc6, 0x41, /* L'æ', L'Æ', L'A', */ 1545 0xf0, 0xd0, 0x44, /* L'ð', L'Ð', L'D', */ 1546 0xf1, 0xd1, 0x4e, /* L'ñ', L'Ñ', L'N', */ 1547 0xfd, 0xdd, 0x59, /* L'ý', L'Ý', L'Y', */ 1548 0xe7, 0xc7, 0x43, /* L'ç', L'Ç', L'C', */ 1549 0, 1550 }; 1551 1552 Rune LJAN[] = { 'J', 'A', 'N', 0 }; 1553 Rune LFEB[] = { 'F', 'E', 'B', 0 }; 1554 Rune LMAR[] = { 'M', 'A', 'R', 0 }; 1555 Rune LAPR[] = { 'A', 'P', 'R', 0 }; 1556 Rune LMAY[] = { 'M', 'A', 'Y', 0 }; 1557 Rune LJUN[] = { 'J', 'U', 'N', 0 }; 1558 Rune LJUL[] = { 'J', 'U', 'L', 0 }; 1559 Rune LAUG[] = { 'A', 'U', 'G', 0 }; 1560 Rune LSEP[] = { 'S', 'E', 'P', 0 }; 1561 Rune LOCT[] = { 'O', 'C', 'T', 0 }; 1562 Rune LNOV[] = { 'N', 'O', 'V', 0 }; 1563 Rune LDEC[] = { 'D', 'E', 'C', 0 }; 1564 1565 Rune* month[12] = 1566 { 1567 LJAN, 1568 LFEB, 1569 LMAR, 1570 LAPR, 1571 LMAY, 1572 LJUN, 1573 LJUL, 1574 LAUG, 1575 LSEP, 1576 LOCT, 1577 LNOV, 1578 LDEC, 1579 }; 1580 1581 /************** radix sort ***********/ 1582 1583 enum 1584 { 1585 Threshold = 14 1586 }; 1587 1588 void rsort4(Key***, ulong, int); 1589 void bsort4(Key***, ulong, int); 1590 1591 void 1592 sort4(void *a, ulong n) 1593 { 1594 if(n > Threshold) 1595 rsort4((Key***)a, n, 0); 1596 else 1597 bsort4((Key***)a, n, 0); 1598 } 1599 1600 void 1601 rsort4(Key ***a, ulong n, int b) 1602 { 1603 Key ***ea, ***t, ***u, **t1, **u1, *k; 1604 Key ***part[257]; 1605 static long count[257]; 1606 long clist[257+257], *cp, *cp1; 1607 int c, lowc, higc; 1608 1609 /* 1610 * pass 1 over all keys, 1611 * count the number of each key[b]. 1612 * find low count and high count. 1613 */ 1614 lowc = 256; 1615 higc = 0; 1616 ea = a+n; 1617 for(t=a; t<ea; t++) { 1618 k = **t; 1619 n = k->klen; 1620 if(b >= n) { 1621 count[256]++; 1622 continue; 1623 } 1624 c = k->key[b]; 1625 n = count[c]++; 1626 if(n == 0) { 1627 if(c < lowc) 1628 lowc = c; 1629 if(c > higc) 1630 higc = c; 1631 } 1632 } 1633 1634 /* 1635 * pass 2 over all counts, 1636 * put partition pointers in part[c]. 1637 * save compacted indexes and counts 1638 * in clist[]. 1639 */ 1640 t = a; 1641 n = count[256]; 1642 clist[0] = n; 1643 part[256] = t; 1644 t += n; 1645 1646 cp1 = clist+1; 1647 cp = count+lowc; 1648 for(c=lowc; c<=higc; c++,cp++) { 1649 n = *cp; 1650 if(n) { 1651 cp1[0] = n; 1652 cp1[1] = c; 1653 cp1 += 2; 1654 part[c] = t; 1655 t += n; 1656 } 1657 } 1658 *cp1 = 0; 1659 1660 /* 1661 * pass 3 over all counts. 1662 * chase lowest pointer in each partition 1663 * around a permutation until it comes 1664 * back and is stored where it started. 1665 * static array, count[], should be 1666 * reduced to zero entries except maybe 1667 * count[256]. 1668 */ 1669 for(cp1=clist+1; cp1[0]; cp1+=2) { 1670 c = cp1[1]; 1671 cp = count+c; 1672 while(*cp) { 1673 t1 = *part[c]; 1674 for(;;) { 1675 k = *t1; 1676 n = 256; 1677 if(b < k->klen) 1678 n = k->key[b]; 1679 u = part[n]++; 1680 count[n]--; 1681 u1 = *u; 1682 *u = t1; 1683 if(n == c) 1684 break; 1685 t1 = u1; 1686 } 1687 } 1688 } 1689 1690 /* 1691 * pass 4 over all partitions. 1692 * call recursively. 1693 */ 1694 b++; 1695 t = a + clist[0]; 1696 count[256] = 0; 1697 for(cp1=clist+1; n=cp1[0]; cp1+=2) { 1698 if(n > Threshold) 1699 rsort4(t, n, b); 1700 else 1701 if(n > 1) 1702 bsort4(t, n, b); 1703 t += n; 1704 } 1705 } 1706 1707 /* 1708 * bubble sort to pick up 1709 * the pieces. 1710 */ 1711 void 1712 bsort4(Key ***a, ulong n, int b) 1713 { 1714 Key ***i, ***j, ***k, ***l, **t; 1715 Key *ka, *kb; 1716 int n1, n2; 1717 1718 l = a+n; 1719 j = a; 1720 1721 loop: 1722 i = j; 1723 j++; 1724 if(j >= l) 1725 return; 1726 1727 ka = **i; 1728 kb = **j; 1729 n1 = ka->klen - b; 1730 n2 = kb->klen - b; 1731 if(n1 > n2) 1732 n1 = n2; 1733 if(n1 <= 0) 1734 goto loop; 1735 n2 = ka->key[b] - kb->key[b]; 1736 if(n2 == 0) 1737 n2 = memcmp(ka->key+b, kb->key+b, n1); 1738 if(n2 <= 0) 1739 goto loop; 1740 1741 for(;;) { 1742 k = i+1; 1743 1744 t = *k; 1745 *k = *i; 1746 *i = t; 1747 1748 if(i <= a) 1749 goto loop; 1750 1751 i--; 1752 ka = **i; 1753 kb = *t; 1754 n1 = ka->klen - b; 1755 n2 = kb->klen - b; 1756 if(n1 > n2) 1757 n1 = n2; 1758 if(n1 <= 0) 1759 goto loop; 1760 n2 = ka->key[b] - kb->key[b]; 1761 if(n2 == 0) 1762 n2 = memcmp(ka->key+b, kb->key+b, n1); 1763 if(n2 <= 0) 1764 goto loop; 1765 } 1766 }