sort.c (9436B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <ctype.h> 3 #include <stdio.h> 4 #include <stdlib.h> 5 #include <string.h> 6 7 #include "queue.h" 8 #include "text.h" 9 #include "utf.h" 10 #include "util.h" 11 12 struct keydef { 13 int start_column; 14 int end_column; 15 int start_char; 16 int end_char; 17 int flags; 18 TAILQ_ENTRY(keydef) entry; 19 }; 20 21 struct column { 22 struct line line; 23 size_t cap; 24 }; 25 26 enum { 27 MOD_N = 1 << 0, 28 MOD_STARTB = 1 << 1, 29 MOD_ENDB = 1 << 2, 30 MOD_R = 1 << 3, 31 MOD_D = 1 << 4, 32 MOD_F = 1 << 5, 33 MOD_I = 1 << 6, 34 }; 35 36 static TAILQ_HEAD(kdhead, keydef) kdhead = TAILQ_HEAD_INITIALIZER(kdhead); 37 38 static int Cflag = 0, cflag = 0, uflag = 0; 39 static char *fieldsep = NULL; 40 static size_t fieldseplen = 0; 41 static struct column col1, col2; 42 43 static void 44 skipblank(struct line *a) 45 { 46 while (a->len && (*(a->data) == ' ' || *(a->data) == '\t')) { 47 a->data++; 48 a->len--; 49 } 50 } 51 52 static void 53 skipnonblank(struct line *a) 54 { 55 while (a->len && (*(a->data) != '\n' && *(a->data) != ' ' && 56 *(a->data) != '\t')) { 57 a->data++; 58 a->len--; 59 } 60 } 61 62 static void 63 skipcolumn(struct line *a, int skip_to_next_col) 64 { 65 char *s; 66 67 if (fieldsep) { 68 if ((s = memmem(a->data, a->len, fieldsep, fieldseplen))) { 69 if (skip_to_next_col) 70 s += fieldseplen; 71 a->len -= s - a->data; 72 a->data = s; 73 } else { 74 a->data += a->len - 1; 75 a->len = 1; 76 } 77 } else { 78 skipblank(a); 79 skipnonblank(a); 80 } 81 } 82 83 static void 84 columns(struct line *line, const struct keydef *kd, struct column *col) 85 { 86 Rune r; 87 struct line start, end; 88 size_t utflen, rlen; 89 int i; 90 91 start.data = line->data; 92 start.len = line->len; 93 for (i = 1; i < kd->start_column; i++) 94 skipcolumn(&start, 1); 95 if (kd->flags & MOD_STARTB) 96 skipblank(&start); 97 for (utflen = 0; start.len > 1 && utflen < kd->start_char - 1;) { 98 rlen = chartorune(&r, start.data); 99 start.data += rlen; 100 start.len -= rlen; 101 utflen++; 102 } 103 104 end.data = line->data; 105 end.len = line->len; 106 if (kd->end_column) { 107 for (i = 1; i < kd->end_column; i++) 108 skipcolumn(&end, 1); 109 if (kd->flags & MOD_ENDB) 110 skipblank(&end); 111 if (kd->end_char) { 112 for (utflen = 0; end.len > 1 && utflen < kd->end_char;) { 113 rlen = chartorune(&r, end.data); 114 end.data += rlen; 115 end.len -= rlen; 116 utflen++; 117 } 118 } else { 119 skipcolumn(&end, 0); 120 } 121 } else { 122 end.data += end.len - 1; 123 end.len = 1; 124 } 125 col->line.len = MAX(0, end.data - start.data); 126 if (!(col->line.data) || col->cap < col->line.len + 1) { 127 free(col->line.data); 128 col->line.data = emalloc(col->line.len + 1); 129 } 130 memcpy(col->line.data, start.data, col->line.len); 131 col->line.data[col->line.len] = '\0'; 132 } 133 134 static int 135 skipmodcmp(struct line *a, struct line *b, int flags) 136 { 137 Rune r1, r2; 138 size_t offa = 0, offb = 0; 139 140 do { 141 offa += chartorune(&r1, a->data + offa); 142 offb += chartorune(&r2, b->data + offb); 143 144 if (flags & MOD_D && flags & MOD_I) { 145 while (offa < a->len && ((!isblankrune(r1) && 146 !isalnumrune(r1)) || (!isprintrune(r1)))) 147 offa += chartorune(&r1, a->data + offa); 148 while (offb < b->len && ((!isblankrune(r2) && 149 !isalnumrune(r2)) || (!isprintrune(r2)))) 150 offb += chartorune(&r2, b->data + offb); 151 } 152 else if (flags & MOD_D) { 153 while (offa < a->len && !isblankrune(r1) && 154 !isalnumrune(r1)) 155 offa += chartorune(&r1, a->data + offa); 156 while (offb < b->len && !isblankrune(r2) && 157 !isalnumrune(r2)) 158 offb += chartorune(&r2, b->data + offb); 159 } 160 else if (flags & MOD_I) { 161 while (offa < a->len && !isprintrune(r1)) 162 offa += chartorune(&r1, a->data + offa); 163 while (offb < b->len && !isprintrune(r2)) 164 offb += chartorune(&r2, b->data + offb); 165 } 166 if (flags & MOD_F) { 167 r1 = toupperrune(r1); 168 r2 = toupperrune(r2); 169 } 170 } while (r1 && r1 == r2); 171 172 return r1 - r2; 173 } 174 175 static int 176 slinecmp(struct line *a, struct line *b) 177 { 178 int res = 0; 179 double x, y; 180 struct keydef *kd; 181 182 TAILQ_FOREACH(kd, &kdhead, entry) { 183 columns(a, kd, &col1); 184 columns(b, kd, &col2); 185 186 /* if -u is given, don't use default key definition 187 * unless it is the only one */ 188 if (uflag && kd == TAILQ_LAST(&kdhead, kdhead) && 189 TAILQ_LAST(&kdhead, kdhead) != TAILQ_FIRST(&kdhead)) { 190 res = 0; 191 } else if (kd->flags & MOD_N) { 192 x = strtod(col1.line.data, NULL); 193 y = strtod(col2.line.data, NULL); 194 res = (x < y) ? -1 : (x > y); 195 } else if (kd->flags & (MOD_D | MOD_F | MOD_I)) { 196 res = skipmodcmp(&col1.line, &col2.line, kd->flags); 197 } else { 198 res = linecmp(&col1.line, &col2.line); 199 } 200 201 if (kd->flags & MOD_R) 202 res = -res; 203 if (res) 204 break; 205 } 206 207 return res; 208 } 209 210 static int 211 check(FILE *fp, const char *fname) 212 { 213 static struct line prev, cur, tmp; 214 static size_t prevsize, cursize, tmpsize; 215 ssize_t len; 216 217 if (!prev.data) { 218 if ((len = getline(&prev.data, &prevsize, fp)) < 0) 219 eprintf("getline:"); 220 prev.len = len; 221 } 222 while ((len = getline(&cur.data, &cursize, fp)) > 0) { 223 cur.len = len; 224 if (uflag > slinecmp(&cur, &prev)) { 225 if (!Cflag) { 226 weprintf("disorder %s: ", fname); 227 fwrite(cur.data, 1, cur.len, stderr); 228 } 229 return 1; 230 } 231 tmp = cur; 232 tmpsize = cursize; 233 cur = prev; 234 cursize = prevsize; 235 prev = tmp; 236 prevsize = tmpsize; 237 } 238 239 return 0; 240 } 241 242 static int 243 parse_flags(char **s, int *flags, int bflag) 244 { 245 while (isalpha((int)**s)) { 246 switch (*((*s)++)) { 247 case 'b': 248 *flags |= bflag; 249 break; 250 case 'd': 251 *flags |= MOD_D; 252 break; 253 case 'f': 254 *flags |= MOD_F; 255 break; 256 case 'i': 257 *flags |= MOD_I; 258 break; 259 case 'n': 260 *flags |= MOD_N; 261 break; 262 case 'r': 263 *flags |= MOD_R; 264 break; 265 default: 266 return -1; 267 } 268 } 269 270 return 0; 271 } 272 273 static void 274 addkeydef(char *kdstr, int flags) 275 { 276 struct keydef *kd; 277 278 kd = enmalloc(2, sizeof(*kd)); 279 280 /* parse key definition kdstr with format 281 * start_column[.start_char][flags][,end_column[.end_char][flags]] 282 */ 283 kd->start_column = 1; 284 kd->start_char = 1; 285 kd->end_column = 0; /* 0 means end of line */ 286 kd->end_char = 0; /* 0 means end of column */ 287 kd->flags = flags; 288 289 if ((kd->start_column = strtol(kdstr, &kdstr, 10)) < 1) 290 enprintf(2, "invalid start column in key definition\n"); 291 292 if (*kdstr == '.') { 293 if ((kd->start_char = strtol(kdstr + 1, &kdstr, 10)) < 1) 294 enprintf(2, "invalid start character in key " 295 "definition\n"); 296 } 297 if (parse_flags(&kdstr, &kd->flags, MOD_STARTB) < 0) 298 enprintf(2, "invalid start flags in key definition\n"); 299 300 if (*kdstr == ',') { 301 if ((kd->end_column = strtol(kdstr + 1, &kdstr, 10)) < 0) 302 enprintf(2, "invalid end column in key definition\n"); 303 if (*kdstr == '.') { 304 if ((kd->end_char = strtol(kdstr + 1, &kdstr, 10)) < 0) 305 enprintf(2, "invalid end character in key " 306 "definition\n"); 307 } 308 if (parse_flags(&kdstr, &kd->flags, MOD_ENDB) < 0) 309 enprintf(2, "invalid end flags in key definition\n"); 310 } 311 312 if (*kdstr != '\0') 313 enprintf(2, "invalid key definition\n"); 314 315 TAILQ_INSERT_TAIL(&kdhead, kd, entry); 316 } 317 318 static void 319 usage(void) 320 { 321 enprintf(2, "usage: %s [-Cbcdfimnru] [-o outfile] [-t delim] " 322 "[-k def]... [file ...]\n", argv0); 323 } 324 325 int 326 main(int argc, char *argv[]) 327 { 328 FILE *fp, *ofp = stdout; 329 struct linebuf linebuf = EMPTY_LINEBUF; 330 size_t i; 331 int global_flags = 0, ret = 0; 332 char *outfile = NULL; 333 334 ARGBEGIN { 335 case 'C': 336 Cflag = 1; 337 break; 338 case 'b': 339 global_flags |= MOD_STARTB | MOD_ENDB; 340 break; 341 case 'c': 342 cflag = 1; 343 break; 344 case 'd': 345 global_flags |= MOD_D; 346 break; 347 case 'f': 348 global_flags |= MOD_F; 349 break; 350 case 'i': 351 global_flags |= MOD_I; 352 break; 353 case 'k': 354 addkeydef(EARGF(usage()), global_flags); 355 break; 356 case 'm': 357 /* more or less for free, but for performance-reasons, 358 * we should keep this flag in mind and maybe some later 359 * day implement it properly so we don't run out of memory 360 * while merging large sorted files. 361 */ 362 break; 363 case 'n': 364 global_flags |= MOD_N; 365 break; 366 case 'o': 367 outfile = EARGF(usage()); 368 break; 369 case 'r': 370 global_flags |= MOD_R; 371 break; 372 case 't': 373 fieldsep = EARGF(usage()); 374 if (!*fieldsep) 375 eprintf("empty delimiter\n"); 376 fieldseplen = unescape(fieldsep); 377 break; 378 case 'u': 379 uflag = 1; 380 break; 381 default: 382 usage(); 383 } ARGEND 384 385 /* -b shall only apply to custom key definitions */ 386 if (TAILQ_EMPTY(&kdhead) && global_flags) 387 addkeydef("1", global_flags & ~(MOD_STARTB | MOD_ENDB)); 388 if (TAILQ_EMPTY(&kdhead) || (!Cflag && !cflag)) 389 addkeydef("1", global_flags & MOD_R); 390 391 if (!argc) { 392 if (Cflag || cflag) { 393 if (check(stdin, "<stdin>") && !ret) 394 ret = 1; 395 } else { 396 getlines(stdin, &linebuf); 397 } 398 } else for (; *argv; argc--, argv++) { 399 if (!strcmp(*argv, "-")) { 400 *argv = "<stdin>"; 401 fp = stdin; 402 } else if (!(fp = fopen(*argv, "r"))) { 403 enprintf(2, "fopen %s:", *argv); 404 continue; 405 } 406 if (Cflag || cflag) { 407 if (check(fp, *argv) && !ret) 408 ret = 1; 409 } else { 410 getlines(fp, &linebuf); 411 } 412 if (fp != stdin && fshut(fp, *argv)) 413 ret = 2; 414 } 415 416 if (!Cflag && !cflag) { 417 if (outfile && !(ofp = fopen(outfile, "w"))) 418 eprintf("fopen %s:", outfile); 419 420 qsort(linebuf.lines, linebuf.nlines, sizeof(*linebuf.lines), 421 (int (*)(const void *, const void *))slinecmp); 422 423 for (i = 0; i < linebuf.nlines; i++) { 424 if (!uflag || i == 0 || 425 slinecmp(&linebuf.lines[i], &linebuf.lines[i - 1])) { 426 fwrite(linebuf.lines[i].data, 1, 427 linebuf.lines[i].len, ofp); 428 } 429 } 430 } 431 432 if (fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>") | 433 fshut(stderr, "<stderr>")) 434 ret = 2; 435 436 return ret; 437 }