sort.c (9340B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <ctype.h> 3 #include <stdio.h> 4 #include <stdlib.h> 5 #include <string.h> 6 7 #include "queue.h" 8 #include "text.h" 9 #include "utf.h" 10 #include "util.h" 11 12 struct keydef { 13 int start_column; 14 int end_column; 15 int start_char; 16 int end_char; 17 int flags; 18 TAILQ_ENTRY(keydef) entry; 19 }; 20 21 struct column { 22 struct line line; 23 size_t cap; 24 }; 25 26 enum { 27 MOD_N = 1 << 0, 28 MOD_STARTB = 1 << 1, 29 MOD_ENDB = 1 << 2, 30 MOD_R = 1 << 3, 31 MOD_D = 1 << 4, 32 MOD_F = 1 << 5, 33 MOD_I = 1 << 6, 34 }; 35 36 static TAILQ_HEAD(kdhead, keydef) kdhead = TAILQ_HEAD_INITIALIZER(kdhead); 37 38 static int Cflag = 0, cflag = 0, uflag = 0; 39 static char *fieldsep = NULL; 40 static size_t fieldseplen = 0; 41 static struct column col1, col2; 42 43 static void 44 skipblank(struct line *a) 45 { 46 while (a->len && (*(a->data) == ' ' || *(a->data) == '\t')) { 47 a->data++; 48 a->len--; 49 } 50 } 51 52 static void 53 skipnonblank(struct line *a) 54 { 55 while (a->len && (*(a->data) != '\n' && *(a->data) != ' ' && 56 *(a->data) != '\t')) { 57 a->data++; 58 a->len--; 59 } 60 } 61 62 static void 63 skipcolumn(struct line *a, int skip_to_next_col) 64 { 65 char *s; 66 67 if (fieldsep) { 68 if ((s = memmem(a->data, a->len, fieldsep, fieldseplen))) { 69 if (skip_to_next_col) 70 s += fieldseplen; 71 a->len -= s - a->data; 72 a->data = s; 73 } else { 74 a->data += a->len - 1; 75 a->len = 1; 76 } 77 } else { 78 skipblank(a); 79 skipnonblank(a); 80 } 81 } 82 83 static void 84 columns(struct line *line, const struct keydef *kd, struct column *col) 85 { 86 Rune r; 87 struct line start, end; 88 size_t utflen, rlen; 89 int i; 90 91 start.data = line->data; 92 start.len = line->len; 93 for (i = 1; i < kd->start_column; i++) 94 skipcolumn(&start, 1); 95 if (kd->flags & MOD_STARTB) 96 skipblank(&start); 97 for (utflen = 0; start.len > 1 && utflen < kd->start_char - 1;) { 98 rlen = chartorune(&r, start.data); 99 start.data += rlen; 100 start.len -= rlen; 101 utflen++; 102 } 103 104 end.data = line->data; 105 end.len = line->len; 106 if (kd->end_column) { 107 for (i = 1; i < kd->end_column; i++) 108 skipcolumn(&end, 1); 109 if (kd->flags & MOD_ENDB) 110 skipblank(&end); 111 if (kd->end_char) { 112 for (utflen = 0; end.len > 1 && utflen < kd->end_char;) { 113 rlen = chartorune(&r, end.data); 114 end.data += rlen; 115 end.len -= rlen; 116 utflen++; 117 } 118 } else { 119 skipcolumn(&end, 0); 120 } 121 } else { 122 end.data += end.len - 1; 123 end.len = 1; 124 } 125 col->line.len = MAX(0, end.data - start.data); 126 col->line.data = start.data; 127 col->line.data[col->line.len] = '\0'; 128 } 129 130 static int 131 skipmodcmp(struct line *a, struct line *b, int flags) 132 { 133 Rune r1, r2; 134 size_t offa = 0, offb = 0; 135 136 do { 137 offa += chartorune(&r1, a->data + offa); 138 offb += chartorune(&r2, b->data + offb); 139 140 if (flags & MOD_D && flags & MOD_I) { 141 while (offa < a->len && ((!isblankrune(r1) && 142 !isalnumrune(r1)) || (!isprintrune(r1)))) 143 offa += chartorune(&r1, a->data + offa); 144 while (offb < b->len && ((!isblankrune(r2) && 145 !isalnumrune(r2)) || (!isprintrune(r2)))) 146 offb += chartorune(&r2, b->data + offb); 147 } 148 else if (flags & MOD_D) { 149 while (offa < a->len && !isblankrune(r1) && 150 !isalnumrune(r1)) 151 offa += chartorune(&r1, a->data + offa); 152 while (offb < b->len && !isblankrune(r2) && 153 !isalnumrune(r2)) 154 offb += chartorune(&r2, b->data + offb); 155 } 156 else if (flags & MOD_I) { 157 while (offa < a->len && !isprintrune(r1)) 158 offa += chartorune(&r1, a->data + offa); 159 while (offb < b->len && !isprintrune(r2)) 160 offb += chartorune(&r2, b->data + offb); 161 } 162 if (flags & MOD_F) { 163 r1 = toupperrune(r1); 164 r2 = toupperrune(r2); 165 } 166 } while (r1 && r1 == r2); 167 168 return r1 - r2; 169 } 170 171 static int 172 slinecmp(struct line *a, struct line *b) 173 { 174 int res = 0; 175 double x, y; 176 struct keydef *kd; 177 178 TAILQ_FOREACH(kd, &kdhead, entry) { 179 columns(a, kd, &col1); 180 columns(b, kd, &col2); 181 182 /* if -u is given, don't use default key definition 183 * unless it is the only one */ 184 if (uflag && kd == TAILQ_LAST(&kdhead, kdhead) && 185 TAILQ_LAST(&kdhead, kdhead) != TAILQ_FIRST(&kdhead)) { 186 res = 0; 187 } else if (kd->flags & MOD_N) { 188 x = strtod(col1.line.data, NULL); 189 y = strtod(col2.line.data, NULL); 190 res = (x < y) ? -1 : (x > y); 191 } else if (kd->flags & (MOD_D | MOD_F | MOD_I)) { 192 res = skipmodcmp(&col1.line, &col2.line, kd->flags); 193 } else { 194 res = linecmp(&col1.line, &col2.line); 195 } 196 197 if (kd->flags & MOD_R) 198 res = -res; 199 if (res) 200 break; 201 } 202 203 return res; 204 } 205 206 static int 207 check(FILE *fp, const char *fname) 208 { 209 static struct line prev, cur, tmp; 210 static size_t prevsize, cursize, tmpsize; 211 ssize_t len; 212 213 if (!prev.data) { 214 if ((len = getline(&prev.data, &prevsize, fp)) < 0) 215 eprintf("getline:"); 216 prev.len = len; 217 } 218 while ((len = getline(&cur.data, &cursize, fp)) > 0) { 219 cur.len = len; 220 if (uflag > slinecmp(&cur, &prev)) { 221 if (!Cflag) { 222 weprintf("disorder %s: ", fname); 223 fwrite(cur.data, 1, cur.len, stderr); 224 } 225 return 1; 226 } 227 tmp = cur; 228 tmpsize = cursize; 229 cur = prev; 230 cursize = prevsize; 231 prev = tmp; 232 prevsize = tmpsize; 233 } 234 235 return 0; 236 } 237 238 static int 239 parse_flags(char **s, int *flags, int bflag) 240 { 241 while (isalpha((int)**s)) { 242 switch (*((*s)++)) { 243 case 'b': 244 *flags |= bflag; 245 break; 246 case 'd': 247 *flags |= MOD_D; 248 break; 249 case 'f': 250 *flags |= MOD_F; 251 break; 252 case 'i': 253 *flags |= MOD_I; 254 break; 255 case 'n': 256 *flags |= MOD_N; 257 break; 258 case 'r': 259 *flags |= MOD_R; 260 break; 261 default: 262 return -1; 263 } 264 } 265 266 return 0; 267 } 268 269 static void 270 addkeydef(char *kdstr, int flags) 271 { 272 struct keydef *kd; 273 274 kd = enmalloc(2, sizeof(*kd)); 275 276 /* parse key definition kdstr with format 277 * start_column[.start_char][flags][,end_column[.end_char][flags]] 278 */ 279 kd->start_column = 1; 280 kd->start_char = 1; 281 kd->end_column = 0; /* 0 means end of line */ 282 kd->end_char = 0; /* 0 means end of column */ 283 kd->flags = flags; 284 285 if ((kd->start_column = strtol(kdstr, &kdstr, 10)) < 1) 286 enprintf(2, "invalid start column in key definition\n"); 287 288 if (*kdstr == '.') { 289 if ((kd->start_char = strtol(kdstr + 1, &kdstr, 10)) < 1) 290 enprintf(2, "invalid start character in key " 291 "definition\n"); 292 } 293 if (parse_flags(&kdstr, &kd->flags, MOD_STARTB) < 0) 294 enprintf(2, "invalid start flags in key definition\n"); 295 296 if (*kdstr == ',') { 297 if ((kd->end_column = strtol(kdstr + 1, &kdstr, 10)) < 0) 298 enprintf(2, "invalid end column in key definition\n"); 299 if (*kdstr == '.') { 300 if ((kd->end_char = strtol(kdstr + 1, &kdstr, 10)) < 0) 301 enprintf(2, "invalid end character in key " 302 "definition\n"); 303 } 304 if (parse_flags(&kdstr, &kd->flags, MOD_ENDB) < 0) 305 enprintf(2, "invalid end flags in key definition\n"); 306 } 307 308 if (*kdstr != '\0') 309 enprintf(2, "invalid key definition\n"); 310 311 TAILQ_INSERT_TAIL(&kdhead, kd, entry); 312 } 313 314 static void 315 usage(void) 316 { 317 enprintf(2, "usage: %s [-Cbcdfimnru] [-o outfile] [-t delim] " 318 "[-k def]... [file ...]\n", argv0); 319 } 320 321 int 322 main(int argc, char *argv[]) 323 { 324 FILE *fp, *ofp = stdout; 325 struct linebuf linebuf = EMPTY_LINEBUF; 326 size_t i; 327 int global_flags = 0, ret = 0; 328 char *outfile = NULL; 329 330 ARGBEGIN { 331 case 'C': 332 Cflag = 1; 333 break; 334 case 'b': 335 global_flags |= MOD_STARTB | MOD_ENDB; 336 break; 337 case 'c': 338 cflag = 1; 339 break; 340 case 'd': 341 global_flags |= MOD_D; 342 break; 343 case 'f': 344 global_flags |= MOD_F; 345 break; 346 case 'i': 347 global_flags |= MOD_I; 348 break; 349 case 'k': 350 addkeydef(EARGF(usage()), global_flags); 351 break; 352 case 'm': 353 /* more or less for free, but for performance-reasons, 354 * we should keep this flag in mind and maybe some later 355 * day implement it properly so we don't run out of memory 356 * while merging large sorted files. 357 */ 358 break; 359 case 'n': 360 global_flags |= MOD_N; 361 break; 362 case 'o': 363 outfile = EARGF(usage()); 364 break; 365 case 'r': 366 global_flags |= MOD_R; 367 break; 368 case 't': 369 fieldsep = EARGF(usage()); 370 if (!*fieldsep) 371 eprintf("empty delimiter\n"); 372 fieldseplen = unescape(fieldsep); 373 break; 374 case 'u': 375 uflag = 1; 376 break; 377 default: 378 usage(); 379 } ARGEND 380 381 /* -b shall only apply to custom key definitions */ 382 if (TAILQ_EMPTY(&kdhead) && global_flags) 383 addkeydef("1", global_flags & ~(MOD_STARTB | MOD_ENDB)); 384 if (TAILQ_EMPTY(&kdhead) || (!Cflag && !cflag)) 385 addkeydef("1", global_flags & MOD_R); 386 387 if (!argc) { 388 if (Cflag || cflag) { 389 if (check(stdin, "<stdin>") && !ret) 390 ret = 1; 391 } else { 392 getlines(stdin, &linebuf); 393 } 394 } else for (; *argv; argc--, argv++) { 395 if (!strcmp(*argv, "-")) { 396 *argv = "<stdin>"; 397 fp = stdin; 398 } else if (!(fp = fopen(*argv, "r"))) { 399 enprintf(2, "fopen %s:", *argv); 400 continue; 401 } 402 if (Cflag || cflag) { 403 if (check(fp, *argv) && !ret) 404 ret = 1; 405 } else { 406 getlines(fp, &linebuf); 407 } 408 if (fp != stdin && fshut(fp, *argv)) 409 ret = 2; 410 } 411 412 if (!Cflag && !cflag) { 413 if (outfile && !(ofp = fopen(outfile, "w"))) 414 eprintf("fopen %s:", outfile); 415 416 qsort(linebuf.lines, linebuf.nlines, sizeof(*linebuf.lines), 417 (int (*)(const void *, const void *))slinecmp); 418 419 for (i = 0; i < linebuf.nlines; i++) { 420 if (!uflag || i == 0 || 421 slinecmp(&linebuf.lines[i], &linebuf.lines[i - 1])) { 422 linebuf.lines[i].data[linebuf.lines[i].len-1] = '\n'; 423 fwrite(linebuf.lines[i].data, 1, 424 linebuf.lines[i].len, ofp); 425 } 426 } 427 } 428 429 if (fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>") | 430 fshut(stderr, "<stderr>")) 431 ret = 2; 432 433 return ret; 434 }