sed.c (41896B)
1 /* FIXME: summary 2 * decide whether we enforce valid UTF-8, right now it's enforced in certain 3 * parts of the script, but not the input... 4 * nul bytes cause explosions due to use of libc string functions. thoughts? 5 * lack of newline at end of file, currently we add one. what should we do? 6 * allow "\\t" for "\t" etc. in regex? in replacement text? 7 * POSIX says don't flush on N when out of input, but GNU and busybox do. 8 */ 9 10 #include <ctype.h> 11 #include <errno.h> 12 #include <regex.h> 13 #include <stdlib.h> 14 #include <string.h> 15 16 #include "utf.h" 17 #include "util.h" 18 19 /* Types */ 20 21 /* used as queue for writes and stack for {,:,b,t */ 22 typedef struct { 23 void **data; 24 size_t size; 25 size_t cap; 26 } Vec; 27 28 /* used for arbitrary growth, str is a C string 29 * FIXME: does it make sense to keep track of length? or just rely on libc 30 * string functions? If we want to support nul bytes everything changes 31 */ 32 typedef struct { 33 char *str; 34 size_t cap; 35 } String; 36 37 typedef struct Cmd Cmd; 38 typedef struct { 39 void (*fn)(Cmd *); 40 char *(*getarg)(Cmd *, char *); 41 void (*freearg)(Cmd *); 42 unsigned char naddr; 43 } Fninfo; 44 45 typedef struct { 46 union { 47 size_t lineno; 48 regex_t *re; 49 } u; 50 enum { 51 IGNORE, /* empty address, ignore */ 52 EVERY , /* every line */ 53 LINE , /* line number */ 54 LAST , /* last line ($) */ 55 REGEX , /* use included regex */ 56 LASTRE, /* use most recently used regex */ 57 } type; 58 } Addr; 59 60 /* DISCUSS: naddr is not strictly necessary, but very helpful 61 * naddr == 0 iff beg.type == EVERY && end.type == IGNORE 62 * naddr == 1 iff beg.type != IGNORE && end.type == IGNORE 63 * naddr == 2 iff beg.type != IGNORE && end.type != IGNORE 64 */ 65 typedef struct { 66 Addr beg; 67 Addr end; 68 unsigned char naddr; 69 } Range; 70 71 typedef struct { 72 regex_t *re; /* if NULL use last regex */ 73 String repl; 74 FILE *file; 75 size_t occurrence; /* 0 for all (g flag) */ 76 Rune delim; 77 unsigned int p:1; 78 } Sarg; 79 80 typedef struct { 81 Rune *set1; 82 Rune *set2; 83 } Yarg; 84 85 typedef struct { 86 String str; /* a,c,i text. r file path */ 87 void (*print)(char *, FILE *); /* check_puts for a, write_file for r, unused for c,i */ 88 } ACIRarg; 89 90 struct Cmd { 91 Range range; 92 Fninfo *fninfo; 93 union { 94 Cmd *jump; /* used for b,t when running */ 95 char *label; /* used for :,b,t when building */ 96 ptrdiff_t offset; /* used for { (pointers break during realloc) */ 97 FILE *file; /* used for w */ 98 99 /* FIXME: Should the following be in the union? or pointers and malloc? */ 100 Sarg s; 101 Yarg y; 102 ACIRarg acir; 103 } u; /* I find your lack of anonymous unions disturbing */ 104 unsigned int in_match:1; 105 unsigned int negate :1; 106 }; 107 108 /* Files for w command (and s' w flag) */ 109 typedef struct { 110 char *path; 111 FILE *file; 112 } Wfile; 113 114 /* 115 * Function Declarations 116 */ 117 118 /* Dynamically allocated arrays and strings */ 119 static void resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next); 120 static void *pop(Vec *v); 121 static void push(Vec *v, void *p); 122 static void stracat(String *dst, char *src); 123 static void strnacat(String *dst, char *src, size_t n); 124 static void stracpy(String *dst, char *src); 125 126 /* Cleanup and errors */ 127 static void usage(void); 128 129 /* Parsing functions and related utilities */ 130 static void compile(char *s, int isfile); 131 static int read_line(FILE *f, String *s); 132 static char *make_range(Range *range, char *s); 133 static char *make_addr(Addr *addr, char *s); 134 static char *find_delim(char *s, Rune delim, int do_brackets); 135 static char *chompr(char *s, Rune rune); 136 static char *chomp(char *s); 137 static Rune *strtorunes(char *s, size_t nrunes); 138 static long stol(char *s, char **endp); 139 static size_t escapes(char *beg, char *end, Rune delim, int n_newline); 140 static size_t echarntorune(Rune *r, char *s, size_t n); 141 static void insert_labels(void); 142 143 /* Get and Free arg and related utilities */ 144 static char *get_aci_arg(Cmd *c, char *s); 145 static void aci_append(Cmd *c, char *s); 146 static void free_acir_arg(Cmd *c); 147 static char *get_bt_arg(Cmd *c, char *s); 148 static char *get_r_arg(Cmd *c, char *s); 149 static char *get_s_arg(Cmd *c, char *s); 150 static void free_s_arg(Cmd *c); 151 static char *get_w_arg(Cmd *c, char *s); 152 static char *get_y_arg(Cmd *c, char *s); 153 static void free_y_arg(Cmd *c); 154 static char *get_colon_arg(Cmd *c, char *s); 155 static char *get_lbrace_arg(Cmd *c, char *s); 156 static char *get_rbrace_arg(Cmd *c, char *s); 157 static char *semicolon_arg(char *s); 158 159 /* Running */ 160 static void run(void); 161 static int in_range(Cmd *c); 162 static int match_addr(Addr *a); 163 static int next_file(void); 164 static int is_eof(FILE *f); 165 static void do_writes(void); 166 static void write_file(char *path, FILE *out); 167 static void check_puts(char *s, FILE *f); 168 static void update_ranges(Cmd *beg, Cmd *end); 169 170 /* Sed functions */ 171 static void cmd_y(Cmd *c); 172 static void cmd_x(Cmd *c); 173 static void cmd_w(Cmd *c); 174 static void cmd_t(Cmd *c); 175 static void cmd_s(Cmd *c); 176 static void cmd_r(Cmd *c); 177 static void cmd_q(Cmd *c); 178 static void cmd_P(Cmd *c); 179 static void cmd_p(Cmd *c); 180 static void cmd_N(Cmd *c); 181 static void cmd_n(Cmd *c); 182 static void cmd_l(Cmd *c); 183 static void cmd_i(Cmd *c); 184 static void cmd_H(Cmd *c); 185 static void cmd_h(Cmd *c); 186 static void cmd_G(Cmd *c); 187 static void cmd_g(Cmd *c); 188 static void cmd_D(Cmd *c); 189 static void cmd_d(Cmd *c); 190 static void cmd_c(Cmd *c); 191 static void cmd_b(Cmd *c); 192 static void cmd_a(Cmd *c); 193 static void cmd_colon(Cmd *c); 194 static void cmd_equal(Cmd *c); 195 static void cmd_lbrace(Cmd *c); 196 static void cmd_rbrace(Cmd *c); 197 static void cmd_last(Cmd *c); 198 199 /* Actions */ 200 static void new_line(void); 201 static void app_line(void); 202 static void new_next(void); 203 static void old_next(void); 204 205 /* 206 * Globals 207 */ 208 static Vec braces, labels, branches; /* holds ptrdiff_t. addrs of {, :, bt */ 209 static Vec writes; /* holds cmd*. writes scheduled by a and r commands */ 210 static Vec wfiles; /* holds Wfile*. files for w and s///w commands */ 211 212 static Cmd *prog, *pc; /* Program, program counter */ 213 static size_t pcap; 214 static size_t lineno; 215 216 static regex_t *lastre; /* last used regex for empty regex search */ 217 static char **files; /* list of file names from argv */ 218 static FILE *file; /* current file we are reading */ 219 static int ret; /* exit status */ 220 221 static String patt, hold, genbuf; 222 223 static struct { 224 unsigned int n :1; /* -n (no print) */ 225 unsigned int E :1; /* -E (extended re) */ 226 unsigned int s :1; /* s/// replacement happened */ 227 unsigned int aci_cont:1; /* a,c,i text continuation */ 228 unsigned int s_cont :1; /* s/// replacement text continuation */ 229 unsigned int halt :1; /* halt execution */ 230 } gflags; 231 232 /* FIXME: move character inside Fninfo and only use 26*sizeof(Fninfo) instead of 127*sizeof(Fninfo) bytes */ 233 static Fninfo fns[] = { 234 ['a'] = { cmd_a , get_aci_arg , free_acir_arg , 1 }, /* schedule write of text for later */ 235 ['b'] = { cmd_b , get_bt_arg , NULL , 2 }, /* branch to label char *label when building, Cmd *jump when running */ 236 ['c'] = { cmd_c , get_aci_arg , free_acir_arg , 2 }, /* delete pattern space. at 0 or 1 addr or end of 2 addr, write text */ 237 ['d'] = { cmd_d , NULL , NULL , 2 }, /* delete pattern space */ 238 ['D'] = { cmd_D , NULL , NULL , 2 }, /* delete to first newline and start new cycle without reading (if no newline, d) */ 239 ['g'] = { cmd_g , NULL , NULL , 2 }, /* replace pattern space with hold space */ 240 ['G'] = { cmd_G , NULL , NULL , 2 }, /* append newline and hold space to pattern space */ 241 ['h'] = { cmd_h , NULL , NULL , 2 }, /* replace hold space with pattern space */ 242 ['H'] = { cmd_H , NULL , NULL , 2 }, /* append newline and pattern space to hold space */ 243 ['i'] = { cmd_i , get_aci_arg , free_acir_arg , 1 }, /* write text */ 244 ['l'] = { cmd_l , NULL , NULL , 2 }, /* write pattern space in 'visually unambiguous form' */ 245 ['n'] = { cmd_n , NULL , NULL , 2 }, /* write pattern space (unless -n) read to replace pattern space (if no input, quit) */ 246 ['N'] = { cmd_N , NULL , NULL , 2 }, /* append to pattern space separated by newline, line number changes (if no input, quit) */ 247 ['p'] = { cmd_p , NULL , NULL , 2 }, /* write pattern space */ 248 ['P'] = { cmd_P , NULL , NULL , 2 }, /* write pattern space up to first newline */ 249 ['q'] = { cmd_q , NULL , NULL , 1 }, /* quit */ 250 ['r'] = { cmd_r , get_r_arg , free_acir_arg , 1 }, /* write contents of file (unable to open/read treated as empty file) */ 251 ['s'] = { cmd_s , get_s_arg , free_s_arg , 2 }, /* find/replace/all that crazy s stuff */ 252 ['t'] = { cmd_t , get_bt_arg , NULL , 2 }, /* if s/// succeeded (since input or last t) branch to label (branch to end if no label) */ 253 ['w'] = { cmd_w , get_w_arg , NULL , 2 }, /* append pattern space to file */ 254 ['x'] = { cmd_x , NULL , NULL , 2 }, /* exchange pattern and hold spaces */ 255 ['y'] = { cmd_y , get_y_arg , free_y_arg , 2 }, /* replace runes in set1 with runes in set2 */ 256 [':'] = { cmd_colon , get_colon_arg , NULL , 0 }, /* defines label for later b and t commands */ 257 ['='] = { cmd_equal , NULL , NULL , 1 }, /* printf("%d\n", line_number); */ 258 ['{'] = { cmd_lbrace, get_lbrace_arg, NULL , 2 }, /* if we match, run commands, otherwise jump to close */ 259 ['}'] = { cmd_rbrace, get_rbrace_arg, NULL , 0 }, /* noop, hold onto open for ease of building scripts */ 260 261 [0x7f] = { NULL, NULL, NULL, 0 }, /* index is checked with isascii(3p). fill out rest of array */ 262 }; 263 264 /* 265 * Function Definitions 266 */ 267 268 /* given memory pointed to by *ptr that currently holds *nmemb members of size 269 * size, realloc to hold new_nmemb members, return new_nmemb in *memb and one 270 * past old end in *next. if realloc fails...explode 271 */ 272 static void 273 resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next) 274 { 275 void *n, *tmp; 276 277 if (new_nmemb) { 278 tmp = ereallocarray(*ptr, new_nmemb, size); 279 } else { /* turns out realloc(*ptr, 0) != free(*ptr) */ 280 free(*ptr); 281 tmp = NULL; 282 } 283 n = (char *)tmp + *nmemb * size; 284 *nmemb = new_nmemb; 285 *ptr = tmp; 286 if (next) 287 *next = n; 288 } 289 290 static void * 291 pop(Vec *v) 292 { 293 if (!v->size) 294 return NULL; 295 return v->data[--v->size]; 296 } 297 298 static void 299 push(Vec *v, void *p) 300 { 301 if (v->size == v->cap) 302 resize((void **)&v->data, &v->cap, sizeof(*v->data), v->cap * 2 + 1, NULL); 303 v->data[v->size++] = p; 304 } 305 306 static void 307 stracat(String *dst, char *src) 308 { 309 int new = !dst->cap; 310 size_t len; 311 312 len = (new ? 0 : strlen(dst->str)) + strlen(src) + 1; 313 if (dst->cap < len) 314 resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL); 315 if (new) 316 *dst->str = '\0'; 317 strcat(dst->str, src); 318 } 319 320 static void 321 strnacat(String *dst, char *src, size_t n) 322 { 323 int new = !dst->cap; 324 size_t len; 325 326 len = strlen(src); 327 len = (new ? 0 : strlen(dst->str)) + MIN(n, len) + 1; 328 if (dst->cap < len) 329 resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL); 330 if (new) 331 *dst->str = '\0'; 332 strlcat(dst->str, src, len); 333 } 334 335 static void 336 stracpy(String *dst, char *src) 337 { 338 size_t len; 339 340 len = strlen(src) + 1; 341 if (dst->cap < len) 342 resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL); 343 strcpy(dst->str, src); 344 } 345 346 static void 347 leprintf(char *s) 348 { 349 if (errno) 350 eprintf("%zu: %s: %s\n", lineno, s, strerror(errno)); 351 else 352 eprintf("%zu: %s\n", lineno, s); 353 } 354 355 /* FIXME: write usage message */ 356 static void 357 usage(void) 358 { 359 eprintf("usage: sed [-nrE] script [file ...]\n" 360 " sed [-nrE] -e script [-e script] ... [-f scriptfile] ... [file ...]\n" 361 " sed [-nrE] [-e script] ... -f scriptfile [-f scriptfile] ... [file ...]\n"); 362 } 363 364 /* Differences from POSIX 365 * we allows semicolons and trailing blanks inside {} 366 * we allow spaces after ! (and in between !s) 367 * we allow extended regular expressions (-E) 368 */ 369 static void 370 compile(char *s, int isfile) 371 { 372 FILE *f; 373 374 if (isfile) { 375 f = fopen(s, "r"); 376 if (!f) 377 eprintf("fopen %s:", s); 378 } else { 379 if (!*s) /* empty string script */ 380 return; 381 f = fmemopen(s, strlen(s), "r"); 382 if (!f) 383 eprintf("fmemopen:"); 384 } 385 386 /* NOTE: get arg functions can't use genbuf */ 387 while (read_line(f, &genbuf) != EOF) { 388 s = genbuf.str; 389 390 /* if the first two characters of the script are "#n" default output shall be suppressed */ 391 if (++lineno == 1 && *s == '#' && s[1] == 'n') { 392 gflags.n = 1; 393 continue; 394 } 395 396 if (gflags.aci_cont) { 397 aci_append(pc - 1, s); 398 continue; 399 } 400 if (gflags.s_cont) 401 s = (pc - 1)->fninfo->getarg(pc - 1, s); 402 403 while (*s) { 404 s = chompr(s, ';'); 405 if (!*s || *s == '#') 406 break; 407 408 if ((size_t)(pc - prog) == pcap) 409 resize((void **)&prog, &pcap, sizeof(*prog), pcap * 2 + 1, (void **)&pc); 410 411 pc->range.beg.type = pc->range.end.type = IGNORE; 412 pc->fninfo = NULL; 413 pc->in_match = 0; 414 415 s = make_range(&pc->range, s); 416 s = chomp(s); 417 pc->negate = *s == '!'; 418 s = chompr(s, '!'); 419 420 if (!isascii(*s) || !(pc->fninfo = &fns[(unsigned)*s])->fn) 421 leprintf("bad sed function"); 422 if (pc->range.naddr > pc->fninfo->naddr) 423 leprintf("wrong number of addresses"); 424 s++; 425 426 if (pc->fninfo->getarg) 427 s = pc->fninfo->getarg(pc, s); 428 429 pc++; 430 } 431 } 432 433 fshut(f, s); 434 } 435 436 /* FIXME: if we decide to honor lack of trailing newline, set/clear a global 437 * flag when reading a line 438 */ 439 static int 440 read_line(FILE *f, String *s) 441 { 442 ssize_t len; 443 444 if (!f) 445 return EOF; 446 447 if ((len = getline(&s->str, &s->cap, f)) < 0) { 448 if (ferror(f)) 449 eprintf("getline:"); 450 return EOF; 451 } 452 if (s->str[--len] == '\n') 453 s->str[len] = '\0'; 454 return 0; 455 } 456 457 /* read first range from s, return pointer to one past end of range */ 458 static char * 459 make_range(Range *range, char *s) 460 { 461 s = make_addr(&range->beg, s); 462 463 if (*s == ',') 464 s = make_addr(&range->end, s + 1); 465 else 466 range->end.type = IGNORE; 467 468 if (range->beg.type == EVERY && range->end.type == IGNORE) range->naddr = 0; 469 else if (range->beg.type != IGNORE && range->end.type == IGNORE) range->naddr = 1; 470 else if (range->beg.type != IGNORE && range->end.type != IGNORE) range->naddr = 2; 471 else leprintf("this is impossible..."); 472 473 return s; 474 } 475 476 /* read first addr from s, return pointer to one past end of addr */ 477 static char * 478 make_addr(Addr *addr, char *s) 479 { 480 Rune r; 481 char *p = s + strlen(s); 482 size_t rlen = echarntorune(&r, s, p - s); 483 484 if (r == '$') { 485 addr->type = LAST; 486 s += rlen; 487 } else if (isdigitrune(r)) { 488 addr->type = LINE; 489 addr->u.lineno = stol(s, &s); 490 } else if (r == '/' || r == '\\') { 491 Rune delim; 492 if (r == '\\') { 493 s += rlen; 494 rlen = echarntorune(&r, s, p - s); 495 } 496 if (r == '\\') 497 leprintf("bad delimiter '\\'"); 498 delim = r; 499 s += rlen; 500 rlen = echarntorune(&r, s, p - s); 501 if (r == delim) { 502 addr->type = LASTRE; 503 s += rlen; 504 } else { 505 addr->type = REGEX; 506 p = find_delim(s, delim, 1); 507 if (!*p) 508 leprintf("unclosed regex"); 509 p -= escapes(s, p, delim, 0); 510 *p++ = '\0'; 511 addr->u.re = emalloc(sizeof(*addr->u.re)); 512 eregcomp(addr->u.re, s, gflags.E ? REG_EXTENDED : 0); 513 s = p; 514 } 515 } else { 516 addr->type = EVERY; 517 } 518 519 return s; 520 } 521 522 /* return pointer to first delim in s that is not escaped 523 * and if do_brackets is set, not in [] (note possible [::], [..], [==], inside []) 524 * return pointer to trailing nul byte if no delim found 525 * 526 * any escaped character that is not special is just itself (POSIX undefined) 527 * FIXME: pull out into some util thing, will be useful for ed as well 528 */ 529 static char * 530 find_delim(char *s, Rune delim, int do_brackets) 531 { 532 enum { 533 OUTSIDE , /* not in brackets */ 534 BRACKETS_OPENING, /* last char was first [ or last two were first [^ */ 535 BRACKETS_INSIDE , /* inside [] */ 536 INSIDE_OPENING , /* inside [] and last char was [ */ 537 CLASS_INSIDE , /* inside class [::], or colating element [..] or [==], inside [] */ 538 CLASS_CLOSING , /* inside class [::], or colating element [..] or [==], and last character was the respective : . or = */ 539 } state = OUTSIDE; 540 541 Rune r, c = 0; /* no c won't be used uninitialized, shutup -Wall */ 542 size_t rlen; 543 int escape = 0; 544 char *end = s + strlen(s); 545 546 for (; *s; s += rlen) { 547 rlen = echarntorune(&r, s, end - s); 548 549 if (state == BRACKETS_OPENING && r == '^' ) { continue; } 550 else if (state == BRACKETS_OPENING && r == ']' ) { state = BRACKETS_INSIDE ; continue; } 551 else if (state == BRACKETS_OPENING ) { state = BRACKETS_INSIDE ; } 552 553 if (state == CLASS_CLOSING && r == ']' ) { state = BRACKETS_INSIDE ; } 554 else if (state == CLASS_CLOSING ) { state = CLASS_INSIDE ; } 555 else if (state == CLASS_INSIDE && r == c ) { state = CLASS_CLOSING ; } 556 else if (state == INSIDE_OPENING && (r == ':' || 557 r == '.' || 558 r == '=') ) { state = CLASS_INSIDE ; c = r; } 559 else if (state == INSIDE_OPENING && r == ']' ) { state = OUTSIDE ; } 560 else if (state == INSIDE_OPENING ) { state = BRACKETS_INSIDE ; } 561 else if (state == BRACKETS_INSIDE && r == '[' ) { state = INSIDE_OPENING ; } 562 else if (state == BRACKETS_INSIDE && r == ']' ) { state = OUTSIDE ; } 563 else if (state == OUTSIDE && escape ) { escape = 0 ; } 564 else if (state == OUTSIDE && r == '\\' ) { escape = 1 ; } 565 else if (state == OUTSIDE && r == delim) return s; 566 else if (state == OUTSIDE && do_brackets && r == '[' ) { state = BRACKETS_OPENING; } 567 } 568 return s; 569 } 570 571 static char * 572 chomp(char *s) 573 { 574 return chompr(s, 0); 575 } 576 577 /* eat all leading whitespace and occurrences of rune */ 578 static char * 579 chompr(char *s, Rune rune) 580 { 581 Rune r; 582 size_t rlen; 583 char *end = s + strlen(s); 584 585 while (*s && (rlen = echarntorune(&r, s, end - s)) && (isspacerune(r) || r == rune)) 586 s += rlen; 587 return s; 588 } 589 590 /* convert first nrunes Runes from UTF-8 string s in allocated Rune* 591 * NOTE: sequence must be valid UTF-8, check first */ 592 static Rune * 593 strtorunes(char *s, size_t nrunes) 594 { 595 Rune *rs, *rp; 596 597 rp = rs = ereallocarray(NULL, nrunes + 1, sizeof(*rs)); 598 599 while (nrunes--) 600 s += chartorune(rp++, s); 601 602 *rp = '\0'; 603 return rs; 604 } 605 606 static long 607 stol(char *s, char **endp) 608 { 609 long n; 610 errno = 0; 611 n = strtol(s, endp, 10); 612 613 if (errno) 614 leprintf("strtol:"); 615 if (*endp == s) 616 leprintf("strtol: invalid number"); 617 618 return n; 619 } 620 621 /* from beg to end replace "\\d" with "d" and "\\n" with "\n" (where d is delim) 622 * if delim is 'n' and n_newline is 0 then "\\n" is replaced with "n" (normal) 623 * if delim is 'n' and n_newline is 1 then "\\n" is replaced with "\n" (y command) 624 * if delim is 0 all escaped characters represent themselves (aci text) 625 * memmove rest of string (beyond end) into place 626 * return the number of converted escapes (backslashes removed) 627 * FIXME: this has had too many corner cases slapped on and is ugly. rewrite better 628 */ 629 static size_t 630 escapes(char *beg, char *end, Rune delim, int n_newline) 631 { 632 size_t num = 0; 633 char *src = beg, *dst = beg; 634 635 while (src < end) { 636 /* handle escaped backslash specially so we don't think the second 637 * backslash is escaping something */ 638 if (*src == '\\' && src[1] == '\\') { 639 *dst++ = *src++; 640 if (delim) 641 *dst++ = *src++; 642 else 643 src++; 644 } else if (*src == '\\' && !delim) { 645 src++; 646 } else if (*src == '\\' && src[1]) { 647 Rune r; 648 size_t rlen; 649 num++; 650 src++; 651 rlen = echarntorune(&r, src, end - src); 652 653 if (r == 'n' && delim == 'n') { 654 *src = n_newline ? '\n' : 'n'; /* src so we can still memmove() */ 655 } else if (r == 'n') { 656 *src = '\n'; 657 } else if (r != delim) { 658 *dst++ = '\\'; 659 num--; 660 } 661 662 memmove(dst, src, rlen); 663 dst += rlen; 664 src += rlen; 665 } else { 666 *dst++ = *src++; 667 } 668 } 669 memmove(dst, src, strlen(src) + 1); 670 return num; 671 } 672 673 static size_t 674 echarntorune(Rune *r, char *s, size_t n) 675 { 676 size_t rlen = charntorune(r, s, n); 677 if (!rlen || *r == Runeerror) 678 leprintf("invalid UTF-8"); 679 return rlen; 680 } 681 682 static void 683 insert_labels(void) 684 { 685 size_t i; 686 Cmd *from, *to; 687 688 while (branches.size) { 689 from = prog + (ptrdiff_t)pop(&branches); 690 691 if (!from->u.label) {/* no label branch to end of script */ 692 from->u.jump = pc - 1; 693 } else { 694 for (i = 0; i < labels.size; i++) { 695 to = prog + (ptrdiff_t)labels.data[i]; 696 if (!strcmp(from->u.label, to->u.label)) { 697 from->u.jump = to; 698 break; 699 } 700 } 701 if (i == labels.size) 702 leprintf("bad label"); 703 } 704 } 705 } 706 707 /* 708 * Getargs / Freeargs 709 * Read argument from s, return pointer to one past last character of argument 710 */ 711 712 /* POSIX compliant 713 * i\ 714 * foobar 715 * 716 * also allow the following non POSIX compliant 717 * i # empty line 718 * ifoobar 719 * ifoobar\ 720 * baz 721 * 722 * FIXME: GNU and busybox discard leading spaces 723 * i foobar 724 * i foobar 725 * ifoobar 726 * are equivalent in GNU and busybox. We don't. Should we? 727 */ 728 static char * 729 get_aci_arg(Cmd *c, char *s) 730 { 731 c->u.acir.print = check_puts; 732 c->u.acir.str = (String){ NULL, 0 }; 733 734 gflags.aci_cont = !!*s; /* no continue flag if empty string */ 735 736 /* neither empty string nor POSIX compliant */ 737 if (*s && !(*s == '\\' && !s[1])) 738 aci_append(c, s); 739 740 return s + strlen(s); 741 } 742 743 static void 744 aci_append(Cmd *c, char *s) 745 { 746 char *end = s + strlen(s), *p = end; 747 748 gflags.aci_cont = 0; 749 while (--p >= s && *p == '\\') 750 gflags.aci_cont = !gflags.aci_cont; 751 752 if (gflags.aci_cont) 753 *--end = '\n'; 754 755 escapes(s, end, 0, 0); 756 stracat(&c->u.acir.str, s); 757 } 758 759 static void 760 free_acir_arg(Cmd *c) 761 { 762 free(c->u.acir.str.str); 763 } 764 765 /* POSIX dictates that label is rest of line, including semicolons, trailing 766 * whitespace, closing braces, etc. and can be limited to 8 bytes 767 * 768 * I allow a semicolon or closing brace to terminate a label name, it's not 769 * POSIX compliant, but it's useful and every sed version I've tried to date 770 * does the same. 771 * 772 * FIXME: POSIX dictates that leading whitespace is ignored but trailing 773 * whitespace is not. This is annoying and we should probably get rid of it. 774 */ 775 static char * 776 get_bt_arg(Cmd *c, char *s) 777 { 778 char *p = semicolon_arg(s = chomp(s)); 779 780 if (p != s) { 781 c->u.label = estrndup(s, p - s); 782 } else { 783 c->u.label = NULL; 784 } 785 786 push(&branches, (void *)(c - prog)); 787 788 return p; 789 } 790 791 /* POSIX dictates file name is rest of line including semicolons, trailing 792 * whitespace, closing braces, etc. and file name must be preceded by a space 793 * 794 * I allow a semicolon or closing brace to terminate a file name and don't 795 * enforce leading space. 796 * 797 * FIXME: decide whether trailing whitespace should be included and fix 798 * accordingly 799 */ 800 static char * 801 get_r_arg(Cmd *c, char *s) 802 { 803 char *p = semicolon_arg(s = chomp(s)); 804 805 if (p == s) 806 leprintf("no file name"); 807 808 c->u.acir.str.str = estrndup(s, p - s); 809 c->u.acir.print = write_file; 810 811 return p; 812 } 813 814 /* we allow "\\n" in replacement text to mean "\n" (undefined in POSIX) 815 * 816 * FIXME: allow other escapes in regex and replacement? if so change escapes() 817 */ 818 static char * 819 get_s_arg(Cmd *c, char *s) 820 { 821 Rune delim, r; 822 Cmd buf; 823 char *p; 824 int esc, lastre; 825 826 /* s/Find/Replace/Flags */ 827 828 /* Find */ 829 if (!gflags.s_cont) { /* NOT continuing from literal newline in replacement text */ 830 lastre = 0; 831 c->u.s.repl = (String){ NULL, 0 }; 832 c->u.s.occurrence = 1; 833 c->u.s.file = NULL; 834 c->u.s.p = 0; 835 836 if (!*s || *s == '\\') 837 leprintf("bad delimiter"); 838 839 p = s + strlen(s); 840 s += echarntorune(&delim, s, p - s); 841 c->u.s.delim = delim; 842 843 echarntorune(&r, s, p - s); 844 if (r == delim) /* empty regex */ 845 lastre = 1; 846 847 p = find_delim(s, delim, 1); 848 if (!*p) 849 leprintf("missing second delimiter"); 850 p -= escapes(s, p, delim, 0); 851 *p = '\0'; 852 853 if (lastre) { 854 c->u.s.re = NULL; 855 } else { 856 c->u.s.re = emalloc(sizeof(*c->u.s.re)); 857 /* FIXME: different eregcomp that calls fatal */ 858 eregcomp(c->u.s.re, s, gflags.E ? REG_EXTENDED : 0); 859 } 860 s = p + runelen(delim); 861 } 862 863 /* Replace */ 864 delim = c->u.s.delim; 865 866 p = find_delim(s, delim, 0); 867 p -= escapes(s, p, delim, 0); 868 if (!*p) { /* no third delimiter */ 869 /* FIXME: same backslash counting as aci_append() */ 870 if (p[-1] != '\\') 871 leprintf("missing third delimiter or <backslash><newline>"); 872 p[-1] = '\n'; 873 gflags.s_cont = 1; 874 } else { 875 gflags.s_cont = 0; 876 } 877 878 /* check for bad references in replacement text */ 879 *p = '\0'; 880 for (esc = 0, p = s; *p; p++) { 881 if (esc) { 882 esc = 0; 883 if (isdigit(*p) && c->u.s.re && (size_t)(*p - '0') > c->u.s.re->re_nsub) 884 leprintf("back reference number greater than number of groups"); 885 } else if (*p == '\\') { 886 esc = 1; 887 } 888 } 889 stracat(&c->u.s.repl, s); 890 891 if (gflags.s_cont) 892 return p; 893 894 s = p + runelen(delim); 895 896 /* Flags */ 897 p = semicolon_arg(s = chomp(s)); 898 899 /* FIXME: currently for simplicity take last of g or occurrence flags and 900 * ignore multiple p flags. need to fix that */ 901 for (; s < p; s++) { 902 if (isdigit(*s)) { 903 c->u.s.occurrence = stol(s, &s); 904 s--; /* for loop will advance pointer */ 905 } else { 906 switch (*s) { 907 case 'g': c->u.s.occurrence = 0; break; 908 case 'p': c->u.s.p = 1; break; 909 case 'w': 910 /* must be last flag, take everything up to newline/semicolon 911 * s == p after this */ 912 s = get_w_arg(&buf, chomp(s+1)); 913 c->u.s.file = buf.u.file; 914 break; 915 } 916 } 917 } 918 return p; 919 } 920 921 static void 922 free_s_arg(Cmd *c) 923 { 924 if (c->u.s.re) 925 regfree(c->u.s.re); 926 free(c->u.s.re); 927 free(c->u.s.repl.str); 928 } 929 930 /* see get_r_arg notes */ 931 static char * 932 get_w_arg(Cmd *c, char *s) 933 { 934 char *p = semicolon_arg(s = chomp(s)); 935 Wfile *w, **wp; 936 937 if (p == s) 938 leprintf("no file name"); 939 940 for (wp = (Wfile **)wfiles.data; (size_t)(wp - (Wfile **)wfiles.data) < wfiles.size; wp++) { 941 if (strlen((*wp)->path) == (size_t)(p - s) && !strncmp(s, (*wp)->path, p - s)) { 942 c->u.file = (*wp)->file; 943 return p; 944 } 945 } 946 947 w = emalloc(sizeof(*w)); 948 w->path = estrndup(s, p - s); 949 950 if (!(w->file = fopen(w->path, "w"))) 951 leprintf("fopen failed"); 952 953 c->u.file = w->file; 954 955 push(&wfiles, w); 956 return p; 957 } 958 959 static char * 960 get_y_arg(Cmd *c, char *s) 961 { 962 Rune delim; 963 char *p = s + strlen(s); 964 size_t rlen = echarntorune(&delim, s, p - s); 965 size_t nrunes1, nrunes2; 966 967 c->u.y.set1 = c->u.y.set2 = NULL; 968 969 s += rlen; 970 p = find_delim(s, delim, 0); 971 p -= escapes(s, p, delim, 1); 972 nrunes1 = utfnlen(s, p - s); 973 c->u.y.set1 = strtorunes(s, nrunes1); 974 975 s = p + rlen; 976 p = find_delim(s, delim, 0); 977 p -= escapes(s, p, delim, 1); 978 nrunes2 = utfnlen(s, p - s); 979 980 if (nrunes1 != nrunes2) 981 leprintf("different set lengths"); 982 983 c->u.y.set2 = strtorunes(s, utfnlen(s, p - s)); 984 985 return p + rlen; 986 } 987 988 static void 989 free_y_arg(Cmd *c) 990 { 991 free(c->u.y.set1); 992 free(c->u.y.set2); 993 } 994 995 /* see get_bt_arg notes */ 996 static char * 997 get_colon_arg(Cmd *c, char *s) 998 { 999 char *p = semicolon_arg(s = chomp(s)); 1000 1001 if (p == s) 1002 leprintf("no label name"); 1003 1004 c->u.label = estrndup(s, p - s); 1005 push(&labels, (void *)(c - prog)); 1006 return p; 1007 } 1008 1009 static char * 1010 get_lbrace_arg(Cmd *c, char *s) 1011 { 1012 push(&braces, (void *)(c - prog)); 1013 return s; 1014 } 1015 1016 static char * 1017 get_rbrace_arg(Cmd *c, char *s) 1018 { 1019 Cmd *lbrace; 1020 1021 if (!braces.size) 1022 leprintf("extra }"); 1023 1024 lbrace = prog + (ptrdiff_t)pop(&braces); 1025 lbrace->u.offset = c - prog; 1026 return s; 1027 } 1028 1029 /* s points to beginning of an argument that may be semicolon terminated 1030 * return pointer to semicolon or nul byte after string 1031 * or closing brace as to not force ; before } 1032 * FIXME: decide whether or not to eat trailing whitespace for arguments that 1033 * we allow semicolon/brace termination that POSIX doesn't 1034 * b, r, t, w, : 1035 * POSIX says trailing whitespace is part of label name, file name, etc. 1036 * we should probably eat it 1037 */ 1038 static char * 1039 semicolon_arg(char *s) 1040 { 1041 char *p = strpbrk(s, ";}"); 1042 if (!p) 1043 p = s + strlen(s); 1044 return p; 1045 } 1046 1047 static void 1048 run(void) 1049 { 1050 lineno = 0; 1051 if (braces.size) 1052 leprintf("extra {"); 1053 1054 /* genbuf has already been initialized, patt will be in new_line 1055 * (or we'll halt) */ 1056 stracpy(&hold, ""); 1057 1058 insert_labels(); 1059 next_file(); 1060 new_line(); 1061 1062 for (pc = prog; !gflags.halt; pc++) 1063 pc->fninfo->fn(pc); 1064 } 1065 1066 /* return true if we are in range for c, set c->in_match appropriately */ 1067 static int 1068 in_range(Cmd *c) 1069 { 1070 if (match_addr(&c->range.beg)) { 1071 if (c->range.naddr == 2) { 1072 if (c->range.end.type == LINE && c->range.end.u.lineno <= lineno) 1073 c->in_match = 0; 1074 else 1075 c->in_match = 1; 1076 } 1077 return !c->negate; 1078 } 1079 if (c->in_match && match_addr(&c->range.end)) { 1080 c->in_match = 0; 1081 return !c->negate; 1082 } 1083 return c->in_match ^ c->negate; 1084 } 1085 1086 /* return true if addr matches current line */ 1087 static int 1088 match_addr(Addr *a) 1089 { 1090 switch (a->type) { 1091 default: 1092 case IGNORE: return 0; 1093 case EVERY: return 1; 1094 case LINE: return lineno == a->u.lineno; 1095 case LAST: 1096 while (is_eof(file) && !next_file()) 1097 ; 1098 return !file; 1099 case REGEX: 1100 lastre = a->u.re; 1101 return !regexec(a->u.re, patt.str, 0, NULL, 0); 1102 case LASTRE: 1103 if (!lastre) 1104 leprintf("no previous regex"); 1105 return !regexec(lastre, patt.str, 0, NULL, 0); 1106 } 1107 } 1108 1109 /* move to next input file 1110 * stdin if first call and no files 1111 * return 0 for success and 1 for no more files 1112 */ 1113 static int 1114 next_file(void) 1115 { 1116 static unsigned char first = 1; 1117 1118 if (file == stdin) 1119 clearerr(file); 1120 else if (file) 1121 fshut(file, "<file>"); 1122 /* given no files, default to stdin */ 1123 file = first && !*files ? stdin : NULL; 1124 first = 0; 1125 1126 while (!file && *files) { 1127 if (!strcmp(*files, "-")) { 1128 file = stdin; 1129 } else if (!(file = fopen(*files, "r"))) { 1130 /* warn this file didn't open, but move on to next */ 1131 weprintf("fopen %s:", *files); 1132 ret = 1; 1133 } 1134 files++; 1135 } 1136 1137 return !file; 1138 } 1139 1140 /* test if stream is at EOF */ 1141 static int 1142 is_eof(FILE *f) 1143 { 1144 int c; 1145 1146 if (!f || feof(f)) 1147 return 1; 1148 1149 c = fgetc(f); 1150 if (c == EOF && ferror(f)) 1151 eprintf("fgetc:"); 1152 if (c != EOF && ungetc(c, f) == EOF) 1153 eprintf("ungetc EOF\n"); 1154 1155 return c == EOF; 1156 } 1157 1158 /* perform writes that were scheduled 1159 * for aci this is check_puts(string, stdout) 1160 * for r this is write_file(path, stdout) 1161 */ 1162 static void 1163 do_writes(void) 1164 { 1165 Cmd *c; 1166 size_t i; 1167 1168 for (i = 0; i < writes.size; i++) { 1169 c = writes.data[i]; 1170 c->u.acir.print(c->u.acir.str.str, stdout); 1171 } 1172 writes.size = 0; 1173 } 1174 1175 /* used for r's u.acir.print() 1176 * FIXME: something like util's concat() would be better 1177 */ 1178 static void 1179 write_file(char *path, FILE *out) 1180 { 1181 FILE *in = fopen(path, "r"); 1182 if (!in) /* no file is treated as empty file */ 1183 return; 1184 1185 while (read_line(in, &genbuf) != EOF) 1186 check_puts(genbuf.str, out); 1187 1188 fshut(in, path); 1189 } 1190 1191 static void 1192 check_puts(char *s, FILE *f) 1193 { 1194 if (s && fputs(s, f) == EOF) 1195 eprintf("fputs:"); 1196 if (fputs("\n", f) == EOF) 1197 eprintf("fputs:"); 1198 } 1199 1200 /* iterate from beg to end updating ranges so we don't miss any commands 1201 * e.g. sed -n '1d;1,3p' should still print lines 2 and 3 1202 */ 1203 static void 1204 update_ranges(Cmd *beg, Cmd *end) 1205 { 1206 while (beg < end) 1207 in_range(beg++); 1208 } 1209 1210 /* 1211 * Sed functions 1212 */ 1213 static void 1214 cmd_a(Cmd *c) 1215 { 1216 if (in_range(c)) 1217 push(&writes, c); 1218 } 1219 1220 static void 1221 cmd_b(Cmd *c) 1222 { 1223 if (!in_range(c)) 1224 return; 1225 1226 /* if we jump backwards update to end, otherwise update to destination */ 1227 update_ranges(c + 1, c->u.jump > c ? c->u.jump : prog + pcap); 1228 pc = c->u.jump; 1229 } 1230 1231 static void 1232 cmd_c(Cmd *c) 1233 { 1234 if (!in_range(c)) 1235 return; 1236 1237 /* write the text on the last line of the match */ 1238 if (!c->in_match) 1239 check_puts(c->u.acir.str.str, stdout); 1240 /* otherwise start the next cycle without printing pattern space 1241 * effectively deleting the text */ 1242 new_next(); 1243 } 1244 1245 static void 1246 cmd_d(Cmd *c) 1247 { 1248 if (!in_range(c)) 1249 return; 1250 1251 new_next(); 1252 } 1253 1254 static void 1255 cmd_D(Cmd *c) 1256 { 1257 char *p; 1258 1259 if (!in_range(c)) 1260 return; 1261 1262 if ((p = strchr(patt.str, '\n'))) { 1263 p++; 1264 memmove(patt.str, p, strlen(p) + 1); 1265 old_next(); 1266 } else { 1267 new_next(); 1268 } 1269 } 1270 1271 static void 1272 cmd_g(Cmd *c) 1273 { 1274 if (in_range(c)) 1275 stracpy(&patt, hold.str); 1276 } 1277 1278 static void 1279 cmd_G(Cmd *c) 1280 { 1281 if (!in_range(c)) 1282 return; 1283 1284 stracat(&patt, "\n"); 1285 stracat(&patt, hold.str); 1286 } 1287 1288 static void 1289 cmd_h(Cmd *c) 1290 { 1291 if (in_range(c)) 1292 stracpy(&hold, patt.str); 1293 } 1294 1295 static void 1296 cmd_H(Cmd *c) 1297 { 1298 if (!in_range(c)) 1299 return; 1300 1301 stracat(&hold, "\n"); 1302 stracat(&hold, patt.str); 1303 } 1304 1305 static void 1306 cmd_i(Cmd *c) 1307 { 1308 if (in_range(c)) 1309 check_puts(c->u.acir.str.str, stdout); 1310 } 1311 1312 /* I think it makes sense to print invalid UTF-8 sequences in octal to satisfy 1313 * the "visually unambiguous form" sed(1p) 1314 */ 1315 static void 1316 cmd_l(Cmd *c) 1317 { 1318 Rune r; 1319 char *p, *end; 1320 size_t rlen; 1321 1322 char *escapes[] = { /* FIXME: 7 entries and search instead of 127 */ 1323 ['\\'] = "\\\\", ['\a'] = "\\a", ['\b'] = "\\b", 1324 ['\f'] = "\\f" , ['\r'] = "\\r", ['\t'] = "\\t", 1325 ['\v'] = "\\v" , [0x7f] = NULL, /* fill out the table */ 1326 }; 1327 1328 if (!in_range(c)) 1329 return; 1330 1331 /* FIXME: line wrapping. sed(1p) says "length at which folding occurs is 1332 * unspecified, but should be appropraite for the output device" 1333 * just wrap at 80 Runes? 1334 */ 1335 for (p = patt.str, end = p + strlen(p); p < end; p += rlen) { 1336 if (isascii(*p) && escapes[(unsigned int)*p]) { 1337 fputs(escapes[(unsigned int)*p], stdout); 1338 rlen = 1; 1339 } else if (!(rlen = charntorune(&r, p, end - p))) { 1340 /* ran out of chars, print the bytes of the short sequence */ 1341 for (; p < end; p++) 1342 printf("\\%03hho", (unsigned char)*p); 1343 break; 1344 } else if (r == Runeerror) { 1345 for (; rlen; rlen--, p++) 1346 printf("\\%03hho", (unsigned char)*p); 1347 } else { 1348 while (fwrite(p, rlen, 1, stdout) < 1 && errno == EINTR) 1349 ; 1350 if (ferror(stdout)) 1351 eprintf("fwrite:"); 1352 } 1353 } 1354 check_puts("$", stdout); 1355 } 1356 1357 static void 1358 cmd_n(Cmd *c) 1359 { 1360 if (!in_range(c)) 1361 return; 1362 1363 if (!gflags.n) 1364 check_puts(patt.str, stdout); 1365 do_writes(); 1366 new_line(); 1367 } 1368 1369 static void 1370 cmd_N(Cmd *c) 1371 { 1372 if (!in_range(c)) 1373 return; 1374 do_writes(); 1375 app_line(); 1376 } 1377 1378 static void 1379 cmd_p(Cmd *c) 1380 { 1381 if (in_range(c)) 1382 check_puts(patt.str, stdout); 1383 } 1384 1385 static void 1386 cmd_P(Cmd *c) 1387 { 1388 char *p; 1389 1390 if (!in_range(c)) 1391 return; 1392 1393 if ((p = strchr(patt.str, '\n'))) 1394 *p = '\0'; 1395 1396 check_puts(patt.str, stdout); 1397 1398 if (p) 1399 *p = '\n'; 1400 } 1401 1402 static void 1403 cmd_q(Cmd *c) 1404 { 1405 if (!in_range(c)) 1406 return; 1407 1408 if (!gflags.n) 1409 check_puts(patt.str, stdout); 1410 do_writes(); 1411 gflags.halt = 1; 1412 } 1413 1414 static void 1415 cmd_r(Cmd *c) 1416 { 1417 if (in_range(c)) 1418 push(&writes, c); 1419 } 1420 1421 static void 1422 cmd_s(Cmd *c) 1423 { 1424 String tmp; 1425 Rune r; 1426 size_t plen, rlen, len; 1427 char *p, *s, *end; 1428 unsigned int matches = 0, last_empty = 1, qflag = 0, cflags = 0; 1429 regex_t *re; 1430 regmatch_t *rm, *pmatch = NULL; 1431 1432 if (!in_range(c)) 1433 return; 1434 1435 if (!c->u.s.re && !lastre) 1436 leprintf("no previous regex"); 1437 1438 re = c->u.s.re ? c->u.s.re : lastre; 1439 lastre = re; 1440 1441 plen = re->re_nsub + 1; 1442 pmatch = ereallocarray(NULL, plen, sizeof(regmatch_t)); 1443 1444 *genbuf.str = '\0'; 1445 s = patt.str; 1446 1447 while (!qflag && !regexec(re, s, plen, pmatch, cflags)) { 1448 cflags = REG_NOTBOL; /* match against beginning of line first time, but not again */ 1449 if (!*s) /* match against empty string first time, but not again */ 1450 qflag = 1; 1451 1452 /* don't substitute if last match was not empty but this one is. 1453 * s_a*_._g 1454 * foobar -> .f.o.o.b.r. 1455 */ 1456 if ((last_empty || pmatch[0].rm_eo) && 1457 (++matches == c->u.s.occurrence || !c->u.s.occurrence)) { 1458 /* copy over everything before the match */ 1459 strnacat(&genbuf, s, pmatch[0].rm_so); 1460 1461 /* copy over replacement text, taking into account &, backreferences, and \ escapes */ 1462 for (p = c->u.s.repl.str, len = strcspn(p, "\\&"); *p; len = strcspn(++p, "\\&")) { 1463 strnacat(&genbuf, p, len); 1464 p += len; 1465 switch (*p) { 1466 default: leprintf("this shouldn't be possible"); 1467 case '\0': 1468 /* we're at the end, back up one so the ++p will put us on 1469 * the null byte to break out of the loop */ 1470 --p; 1471 break; 1472 case '&': 1473 strnacat(&genbuf, s + pmatch[0].rm_so, pmatch[0].rm_eo - pmatch[0].rm_so); 1474 break; 1475 case '\\': 1476 if (isdigit(*++p)) { /* backreference */ 1477 /* only need to check here if using lastre, otherwise we checked when building */ 1478 if (!c->u.s.re && (size_t)(*p - '0') > re->re_nsub) 1479 leprintf("back reference number greater than number of groups"); 1480 rm = &pmatch[*p - '0']; 1481 strnacat(&genbuf, s + rm->rm_so, rm->rm_eo - rm->rm_so); 1482 } else { /* character after backslash taken literally (well one byte, but it works) */ 1483 strnacat(&genbuf, p, 1); 1484 } 1485 break; 1486 } 1487 } 1488 } else { 1489 /* not replacing, copy over everything up to and including the match */ 1490 strnacat(&genbuf, s, pmatch[0].rm_eo); 1491 } 1492 1493 if (!pmatch[0].rm_eo) { /* empty match, advance one rune and add it to output */ 1494 end = s + strlen(s); 1495 rlen = charntorune(&r, s, end - s); 1496 1497 if (!rlen) { /* ran out of bytes, copy short sequence */ 1498 stracat(&genbuf, s); 1499 s = end; 1500 } else { /* copy whether or not it's a good rune */ 1501 strnacat(&genbuf, s, rlen); 1502 s += rlen; 1503 } 1504 } 1505 last_empty = !pmatch[0].rm_eo; 1506 s += pmatch[0].rm_eo; 1507 } 1508 free(pmatch); 1509 1510 if (!(matches && matches >= c->u.s.occurrence)) /* no replacement */ 1511 return; 1512 1513 gflags.s = 1; 1514 1515 stracat(&genbuf, s); 1516 1517 tmp = patt; 1518 patt = genbuf; 1519 genbuf = tmp; 1520 1521 if (c->u.s.p) 1522 check_puts(patt.str, stdout); 1523 if (c->u.s.file) 1524 check_puts(patt.str, c->u.s.file); 1525 } 1526 1527 static void 1528 cmd_t(Cmd *c) 1529 { 1530 if (!in_range(c) || !gflags.s) 1531 return; 1532 1533 /* if we jump backwards update to end, otherwise update to destination */ 1534 update_ranges(c + 1, c->u.jump > c ? c->u.jump : prog + pcap); 1535 pc = c->u.jump; 1536 gflags.s = 0; 1537 } 1538 1539 static void 1540 cmd_w(Cmd *c) 1541 { 1542 if (in_range(c)) 1543 check_puts(patt.str, c->u.file); 1544 } 1545 1546 static void 1547 cmd_x(Cmd *c) 1548 { 1549 String tmp; 1550 1551 if (!in_range(c)) 1552 return; 1553 1554 tmp = patt; 1555 patt = hold; 1556 hold = tmp; 1557 } 1558 1559 static void 1560 cmd_y(Cmd *c) 1561 { 1562 String tmp; 1563 Rune r, *rp; 1564 size_t n, rlen; 1565 char *s, *end, buf[UTFmax]; 1566 1567 if (!in_range(c)) 1568 return; 1569 1570 *genbuf.str = '\0'; 1571 for (s = patt.str, end = s + strlen(s); *s; s += rlen) { 1572 if (!(rlen = charntorune(&r, s, end - s))) { /* ran out of chars, copy rest */ 1573 stracat(&genbuf, s); 1574 break; 1575 } else if (r == Runeerror) { /* bad UTF-8 sequence, copy bytes */ 1576 strnacat(&genbuf, s, rlen); 1577 } else { 1578 for (rp = c->u.y.set1; *rp; rp++) 1579 if (*rp == r) 1580 break; 1581 if (*rp) { /* found r in set1, replace with Rune from set2 */ 1582 n = runetochar(buf, c->u.y.set2 + (rp - c->u.y.set1)); 1583 strnacat(&genbuf, buf, n); 1584 } else { 1585 strnacat(&genbuf, s, rlen); 1586 } 1587 } 1588 } 1589 tmp = patt; 1590 patt = genbuf; 1591 genbuf = tmp; 1592 } 1593 1594 static void 1595 cmd_colon(Cmd *c) 1596 { 1597 } 1598 1599 static void 1600 cmd_equal(Cmd *c) 1601 { 1602 if (in_range(c)) 1603 printf("%zu\n", lineno); 1604 } 1605 1606 static void 1607 cmd_lbrace(Cmd *c) 1608 { 1609 Cmd *jump; 1610 1611 if (in_range(c)) 1612 return; 1613 1614 /* update ranges on all commands we skip */ 1615 jump = prog + c->u.offset; 1616 update_ranges(c + 1, jump); 1617 pc = jump; 1618 } 1619 1620 static void 1621 cmd_rbrace(Cmd *c) 1622 { 1623 } 1624 1625 /* not actually a sed function, but acts like one, put in last spot of script */ 1626 static void 1627 cmd_last(Cmd *c) 1628 { 1629 if (!gflags.n) 1630 check_puts(patt.str, stdout); 1631 do_writes(); 1632 new_next(); 1633 } 1634 1635 /* 1636 * Actions 1637 */ 1638 1639 /* read new line, continue current cycle */ 1640 static void 1641 new_line(void) 1642 { 1643 while (read_line(file, &patt) == EOF) { 1644 if (next_file()) { 1645 gflags.halt = 1; 1646 return; 1647 } 1648 } 1649 gflags.s = 0; 1650 lineno++; 1651 } 1652 1653 /* append new line, continue current cycle 1654 * FIXME: used for N, POSIX specifies do not print pattern space when out of 1655 * input, but GNU does so busybox does as well. Currently we don't. 1656 * Should we? 1657 */ 1658 static void 1659 app_line(void) 1660 { 1661 while (read_line(file, &genbuf) == EOF) { 1662 if (next_file()) { 1663 gflags.halt = 1; 1664 return; 1665 } 1666 } 1667 1668 stracat(&patt, "\n"); 1669 stracat(&patt, genbuf.str); 1670 gflags.s = 0; 1671 lineno++; 1672 } 1673 1674 /* read new line, start new cycle */ 1675 static void 1676 new_next(void) 1677 { 1678 *patt.str = '\0'; 1679 update_ranges(pc + 1, prog + pcap); 1680 new_line(); 1681 pc = prog - 1; 1682 } 1683 1684 /* keep old pattern space, start new cycle */ 1685 static void 1686 old_next(void) 1687 { 1688 update_ranges(pc + 1, prog + pcap); 1689 pc = prog - 1; 1690 } 1691 1692 int 1693 main(int argc, char *argv[]) 1694 { 1695 char *arg; 1696 int script = 0; 1697 1698 ARGBEGIN { 1699 case 'n': 1700 gflags.n = 1; 1701 break; 1702 case 'r': 1703 case 'E': 1704 gflags.E = 1; 1705 break; 1706 case 'e': 1707 arg = EARGF(usage()); 1708 compile(arg, 0); 1709 script = 1; 1710 break; 1711 case 'f': 1712 arg = EARGF(usage()); 1713 compile(arg, 1); 1714 script = 1; 1715 break; 1716 default : usage(); 1717 } ARGEND 1718 1719 /* no script to run */ 1720 if (!script && !argc) 1721 usage(); 1722 1723 /* no script yet, next argument is script */ 1724 if (!script) 1725 compile(*argv++, 0); 1726 1727 /* shrink/grow memory to fit and add our last instruction */ 1728 resize((void **)&prog, &pcap, sizeof(*prog), pc - prog + 1, NULL); 1729 pc = prog + pcap - 1; 1730 pc->fninfo = &(Fninfo){ cmd_last, NULL, NULL, 0 }; 1731 1732 files = argv; 1733 run(); 1734 1735 ret |= fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>"); 1736 1737 return ret; 1738 }