diffreg.c (8845B)
1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include "diff.h" 5 6 /* diff - differential file comparison 7 * 8 * Uses an algorithm due to Harold Stone, which finds 9 * a pair of longest identical subsequences in the two 10 * files. 11 * 12 * The major goal is to generate the match vector J. 13 * J[i] is the index of the line in file1 corresponding 14 * to line i file0. J[i] = 0 if there is no 15 * such line in file1. 16 * 17 * Lines are hashed so as to work in core. All potential 18 * matches are located by sorting the lines of each file 19 * on the hash (called value). In particular, this 20 * collects the equivalence classes in file1 together. 21 * Subroutine equiv replaces the value of each line in 22 * file0 by the index of the first element of its 23 * matching equivalence in (the reordered) file1. 24 * To save space equiv squeezes file1 into a single 25 * array member in which the equivalence classes 26 * are simply concatenated, except that their first 27 * members are flagged by changing sign. 28 * 29 * Next the indices that point into member are unsorted into 30 * array class according to the original order of file0. 31 * 32 * The cleverness lies in routine stone. This marches 33 * through the lines of file0, developing a vector klist 34 * of "k-candidates". At step i a k-candidate is a matched 35 * pair of lines x,y (x in file0 y in file1) such that 36 * there is a common subsequence of lenght k 37 * between the first i lines of file0 and the first y 38 * lines of file1, but there is no such subsequence for 39 * any smaller y. x is the earliest possible mate to y 40 * that occurs in such a subsequence. 41 * 42 * Whenever any of the members of the equivalence class of 43 * lines in file1 matable to a line in file0 has serial number 44 * less than the y of some k-candidate, that k-candidate 45 * with the smallest such y is replaced. The new 46 * k-candidate is chained (via pred) to the current 47 * k-1 candidate so that the actual subsequence can 48 * be recovered. When a member has serial number greater 49 * that the y of all k-candidates, the klist is extended. 50 * At the end, the longest subsequence is pulled out 51 * and placed in the array J by unravel. 52 * 53 * With J in hand, the matches there recorded are 54 * check'ed against reality to assure that no spurious 55 * matches have crept in due to hashing. If they have, 56 * they are broken, and "jackpot " is recorded--a harmless 57 * matter except that a true match for a spuriously 58 * mated line may now be unnecessarily reported as a change. 59 * 60 * Much of the complexity of the program comes simply 61 * from trying to minimize core utilization and 62 * maximize the range of doable problems by dynamically 63 * allocating what is needed and reusing what is not. 64 * The core requirements for problems larger than somewhat 65 * are (in words) 2*length(file0) + length(file1) + 66 * 3*(number of k-candidates installed), typically about 67 * 6n words for files of length n. 68 */ 69 /* TIDY THIS UP */ 70 struct cand { 71 int x; 72 int y; 73 int pred; 74 } cand; 75 struct line { 76 int serial; 77 int value; 78 } *file[2], line; 79 int len[2]; 80 int binary; 81 struct line *sfile[2]; /*shortened by pruning common prefix and suffix*/ 82 int slen[2]; 83 int pref, suff; /*length of prefix and suffix*/ 84 int *class; /*will be overlaid on file[0]*/ 85 int *member; /*will be overlaid on file[1]*/ 86 int *klist; /*will be overlaid on file[0] after class*/ 87 struct cand *clist; /* merely a free storage pot for candidates */ 88 int clen; 89 int *J; /*will be overlaid on class*/ 90 long *ixold; /*will be overlaid on klist*/ 91 long *ixnew; /*will be overlaid on file[1]*/ 92 /* END OF SOME TIDYING */ 93 94 static void 95 sort(struct line *a, int n) /*shellsort CACM #201*/ 96 { 97 int m; 98 struct line *ai, *aim, *j, *k; 99 struct line w; 100 int i; 101 102 m = 0; 103 for (i = 1; i <= n; i *= 2) 104 m = 2*i - 1; 105 for (m /= 2; m != 0; m /= 2) { 106 k = a+(n-m); 107 for (j = a+1; j <= k; j++) { 108 ai = j; 109 aim = ai+m; 110 do { 111 if (aim->value > ai->value || 112 aim->value == ai->value && 113 aim->serial > ai->serial) 114 break; 115 w = *ai; 116 *ai = *aim; 117 *aim = w; 118 119 aim = ai; 120 ai -= m; 121 } while (ai > a && aim >= ai); 122 } 123 } 124 } 125 126 static void 127 unsort(struct line *f, int l, int *b) 128 { 129 int *a; 130 int i; 131 132 a = MALLOC(int, (l+1)); 133 for(i=1;i<=l;i++) 134 a[f[i].serial] = f[i].value; 135 for(i=1;i<=l;i++) 136 b[i] = a[i]; 137 FREE(a); 138 } 139 140 static void 141 prune(void) 142 { 143 int i,j; 144 145 for(pref=0;pref<len[0]&&pref<len[1]&& 146 file[0][pref+1].value==file[1][pref+1].value; 147 pref++ ) ; 148 for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&& 149 file[0][len[0]-suff].value==file[1][len[1]-suff].value; 150 suff++) ; 151 for(j=0;j<2;j++) { 152 sfile[j] = file[j]+pref; 153 slen[j] = len[j]-pref-suff; 154 for(i=0;i<=slen[j];i++) 155 sfile[j][i].serial = i; 156 } 157 } 158 159 static void 160 equiv(struct line *a, int n, struct line *b, int m, int *c) 161 { 162 int i, j; 163 164 i = j = 1; 165 while(i<=n && j<=m) { 166 if(a[i].value < b[j].value) 167 a[i++].value = 0; 168 else if(a[i].value == b[j].value) 169 a[i++].value = j; 170 else 171 j++; 172 } 173 while(i <= n) 174 a[i++].value = 0; 175 b[m+1].value = 0; 176 j = 0; 177 while(++j <= m) { 178 c[j] = -b[j].serial; 179 while(b[j+1].value == b[j].value) { 180 j++; 181 c[j] = b[j].serial; 182 } 183 } 184 c[j] = -1; 185 } 186 187 static int 188 newcand(int x, int y, int pred) 189 { 190 struct cand *q; 191 192 clist = REALLOC(clist, struct cand, (clen+1)); 193 q = clist + clen; 194 q->x = x; 195 q->y = y; 196 q->pred = pred; 197 return clen++; 198 } 199 200 static int 201 search(int *c, int k, int y) 202 { 203 int i, j, l; 204 int t; 205 206 if(clist[c[k]].y < y) /*quick look for typical case*/ 207 return k+1; 208 i = 0; 209 j = k+1; 210 while((l=(i+j)/2) > i) { 211 t = clist[c[l]].y; 212 if(t > y) 213 j = l; 214 else if(t < y) 215 i = l; 216 else 217 return l; 218 } 219 return l+1; 220 } 221 222 static int 223 stone(int *a, int n, int *b, int *c) 224 { 225 int i, k,y; 226 int j, l; 227 int oldc, tc; 228 int oldl; 229 230 k = 0; 231 c[0] = newcand(0,0,0); 232 for(i=1; i<=n; i++) { 233 j = a[i]; 234 if(j==0) 235 continue; 236 y = -b[j]; 237 oldl = 0; 238 oldc = c[0]; 239 do { 240 if(y <= clist[oldc].y) 241 continue; 242 l = search(c, k, y); 243 if(l!=oldl+1) 244 oldc = c[l-1]; 245 if(l<=k) { 246 if(clist[c[l]].y <= y) 247 continue; 248 tc = c[l]; 249 c[l] = newcand(i,y,oldc); 250 oldc = tc; 251 oldl = l; 252 } else { 253 c[l] = newcand(i,y,oldc); 254 k++; 255 break; 256 } 257 } while((y=b[++j]) > 0); 258 } 259 return k; 260 } 261 262 static void 263 unravel(int p) 264 { 265 int i; 266 struct cand *q; 267 268 for(i=0; i<=len[0]; i++) { 269 if (i <= pref) 270 J[i] = i; 271 else if (i > len[0]-suff) 272 J[i] = i+len[1]-len[0]; 273 else 274 J[i] = 0; 275 } 276 for(q=clist+p;q->y!=0;q=clist+q->pred) 277 J[q->x+pref] = q->y+pref; 278 } 279 280 static void 281 output(void) 282 { 283 int m, i0, i1, j0, j1; 284 285 m = len[0]; 286 J[0] = 0; 287 J[m+1] = len[1]+1; 288 if (mode != 'e') { 289 for (i0 = 1; i0 <= m; i0 = i1+1) { 290 while (i0 <= m && J[i0] == J[i0-1]+1) 291 i0++; 292 j0 = J[i0-1]+1; 293 i1 = i0-1; 294 while (i1 < m && J[i1+1] == 0) 295 i1++; 296 j1 = J[i1+1]-1; 297 J[i1] = j1; 298 change(i0, i1, j0, j1); 299 } 300 } 301 else { 302 for (i0 = m; i0 >= 1; i0 = i1-1) { 303 while (i0 >= 1 && J[i0] == J[i0+1]-1 && J[i0]) 304 i0--; 305 j0 = J[i0+1]-1; 306 i1 = i0+1; 307 while (i1 > 1 && J[i1-1] == 0) 308 i1--; 309 j1 = J[i1-1]+1; 310 J[i1] = j1; 311 change(i1 , i0, j1, j0); 312 } 313 } 314 if (m == 0) 315 change(1, 0, 1, len[1]); 316 flushchanges(); 317 } 318 319 #define BUF 4096 320 static int 321 cmp(Biobuf* b1, Biobuf* b2) 322 { 323 int n; 324 uchar buf1[BUF], buf2[BUF]; 325 int f1, f2; 326 vlong nc = 1; 327 uchar *b1s, *b1e, *b2s, *b2e; 328 329 f1 = Bfildes(b1); 330 f2 = Bfildes(b2); 331 seek(f1, 0, 0); 332 seek(f2, 0, 0); 333 b1s = b1e = buf1; 334 b2s = b2e = buf2; 335 for(;;){ 336 if(b1s >= b1e){ 337 if(b1s >= &buf1[BUF]) 338 b1s = buf1; 339 n = read(f1, b1s, &buf1[BUF] - b1s); 340 b1e = b1s + n; 341 } 342 if(b2s >= b2e){ 343 if(b2s >= &buf2[BUF]) 344 b2s = buf2; 345 n = read(f2, b2s, &buf2[BUF] - b2s); 346 b2e = b2s + n; 347 } 348 n = b2e - b2s; 349 if(n > b1e - b1s) 350 n = b1e - b1s; 351 if(n <= 0) 352 break; 353 if(memcmp((void *)b1s, (void *)b2s, n) != 0){ 354 return 1; 355 } 356 nc += n; 357 b1s += n; 358 b2s += n; 359 } 360 if(b1e - b1s == b2e - b2s) 361 return 0; 362 return 1; 363 } 364 365 void 366 diffreg(char *f, char *t) 367 { 368 Biobuf *b0, *b1; 369 int k; 370 371 binary = 0; 372 b0 = prepare(0, f); 373 if (!b0) 374 return; 375 b1 = prepare(1, t); 376 if (!b1) { 377 FREE(file[0]); 378 Bterm(b0); 379 return; 380 } 381 if (binary){ 382 /* could use b0 and b1 but this is simpler. */ 383 if (cmp(b0, b1)) 384 print("binary files %s %s differ\n", f, t); 385 Bterm(b0); 386 Bterm(b1); 387 return; 388 } 389 clen = 0; 390 prune(); 391 sort(sfile[0], slen[0]); 392 sort(sfile[1], slen[1]); 393 394 member = (int *)file[1]; 395 equiv(sfile[0], slen[0], sfile[1], slen[1], member); 396 member = REALLOC(member, int, slen[1]+2); 397 398 class = (int *)file[0]; 399 unsort(sfile[0], slen[0], class); 400 class = REALLOC(class, int, slen[0]+2); 401 402 klist = MALLOC(int, slen[0]+2); 403 clist = MALLOC(struct cand, 1); 404 k = stone(class, slen[0], member, klist); 405 FREE(member); 406 FREE(class); 407 408 J = MALLOC(int, len[0]+2); 409 unravel(klist[k]); 410 FREE(clist); 411 FREE(klist); 412 413 ixold = MALLOC(long, len[0]+2); 414 ixnew = MALLOC(long, len[1]+2); 415 Bseek(b0, 0, 0); Bseek(b1, 0, 0); 416 check(b0, b1); 417 output(); 418 FREE(J); FREE(ixold); FREE(ixnew); 419 Bterm(b0); Bterm(b1); /* ++++ */ 420 }