/* ** Copyright (c) 2007 D. Richard Hipp ** ** This program is free software; you can redistribute it and/or ** modify it under the terms of the GNU General Public ** License version 2 as published by the Free Software Foundation. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ** General Public License for more details. ** ** You should have received a copy of the GNU General Public ** License along with this library; if not, write to the ** Free Software Foundation, Inc., 59 Temple Place - Suite 330, ** Boston, MA 02111-1307, USA. ** ** Author contact information: ** drh@hwaci.com ** http://www.hwaci.com/drh/ ** ******************************************************************************* ** ** This file contains code used to compute a "diff" between two ** text files. */ #include "config.h" #include "diff.h" #include <assert.h> #if 0 #define DEBUG(X) X #else #define DEBUG(X) #endif /* ** Information about each line of a file being diffed. ** ** The lower 20 bits of the hash are the length of the ** line. If any line is longer than 1048575 characters, ** the file is considered binary. */ typedef struct DLine DLine; struct DLine { const char *z; /* The text of the line */ unsigned int h; /* Hash of the line */ }; #define LENGTH_MASK 0x000fffff /* ** Return an array of DLine objects containing a pointer to the ** start of each line and a hash of that line. The lower ** bits of the hash store the length of each line. ** ** Trailing whitespace is removed from each line. ** ** Return 0 if the file is binary or contains a line that is ** longer than 1048575 bytes. */ static DLine *break_into_lines(char *z, int *pnLine){ int nLine, i, j, k, x; unsigned int h; DLine *a; /* Count the number of lines. Allocate space to hold ** the returned array. */ for(i=j=0, nLine=1; z[i]; i++, j++){ int c = z[i]; if( c==0 ){ return 0; } if( c=='\n' && z[i+1]!=0 ){ nLine++; if( j>1048575 ){ return 0; } j = 0; } } if( j>1048575 ){ return 0; } a = malloc( nLine*sizeof(a[0]) ); if( a==0 ) fossil_panic("out of memory"); /* Fill in the array */ for(i=0; i<nLine; i++){ a[i].z = z; for(j=0; z[j] && z[j]!='\n'; j++){} for(k=j; k>0 && isspace(z[k-1]); k--){} for(h=0, x=0; x<k; x++){ h = h ^ (h<<2) ^ z[x]; } a[i].h = (h<<20) | k;; z += j+1; } /* Return results */ *pnLine = nLine; return a; } /* ** Return true if two DLine elements are identical. */ static int same_dline(DLine *pA, DLine *pB){ return pA->h==pB->h && memcmp(pA->z,pB->z,pA->h & LENGTH_MASK)==0; } /* ** Append a single line of "diff" output to pOut. */ static void appendDiffLine(Blob *pOut, char *zPrefix, DLine *pLine){ blob_append(pOut, zPrefix, 1); blob_append(pOut, pLine->z, pLine->h & LENGTH_MASK); blob_append(pOut, "\n", 1); } /* ** Generate a report of the differences between files pA and pB. ** If pOut is not NULL then a unified diff is appended there. It ** is assumed that pOut has already been initialized. If pOut is ** NULL, then a pointer to an array of integers is returned. ** The integers come in triples. For each triple, ** the elements are the number of lines copied, the number of ** lines deleted, and the number of lines inserted. The vector ** is terminated by a triple of all zeros. ** ** This diff utility does not work on binary files. If a binary ** file is encountered, 0 is returned and pOut is written with ** text "cannot compute difference between binary files". ** **************************************************************************** ** ** The core algorithm is a variation on the classic Wagner ** minimum edit distance with enhancements to reduce the runtime ** to be almost linear in the common case where the two files ** have a lot in common. For additional information see ** Eugene W. Myers, "An O(ND) Difference Algorithm And Its ** Variations" ** ** Consider comparing strings A and B. A=abcabba and B=cbabac ** We construct a "Wagner" matrix W with A along the X axis and ** B along the Y axis: ** ** c 6 * ** a 5 * ** b 4 * * ** a 3 * ** b 2 * ** B c 1 * ** 0 * * * ** 0 1 2 3 4 5 6 7 ** a b c a b b a ** A ** ** (Note: we draw this Wagner matrix with the origin at the lower ** left whereas Myers uses the origin at the upper left. Otherwise, ** they are the same.) ** ** Let Y be the maximum y value or the number of characters in B. ** 6 in this example. X is the maximum x value or the number of ** elements in A. Here 7. ** ** Our goal is to find a path from (0,0) to (X,Y). The path can ** use horizontal, vertical, or diagonal steps. A diagonal from ** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y]. A vertical ** steps corresponds to an insertion. A horizontal step corresponds ** to a deletion. We want to find the path with the fewest ** horizontal and vertical steps. ** ** Diagonal k consists of all points such that x-y==k. Diagonal ** zero begins at the origin. Diagonal 1 begins at (1,0). ** Diagonal -1 begins at (0,1). All diagonals move up and to the ** right at 45 degrees. Diagonal number increase from upper left ** to lower right. ** ** Myers matrix M is a lower right triangular matrix with indices d ** along the bottom and i vertically: ** ** ** i=4 | +4 \ ** 3 | +3 +2 | ** 2 | +2 +1 0 |- k values. k = 2*i-d ** 1 | +1 0 -1 -2 | ** 0 | 0 -1 -2 -3 -4 / ** --------------- ** 0 1 2 3 4 = d ** ** Each element of the Myers matrix corresponds to a diagonal. ** The diagonal is k=2*i-d. The diagonal values are shown ** in the template above. ** ** Each entry in M represents the end-point on a path from (0,0). ** The end-point is on diagonal k. The value stored in M is ** q=x+y where (x,y) is the terminus of the path. A path ** to M[d][i] will come through either M[d-1][i-1] or ** though M[d-1][i], whichever holds the largest value of q. ** If both elements hold the same value, the path comes ** through M[d-1][i-1]. ** ** The value of d is the number of insertions and deletions ** made so far on the path. M grows progressively. So the ** size of the M matrix is proportional to d*d. For the ** common case where A and B are similar, d will be small ** compared to X and Y so little memory is required. The ** original Wagner algorithm requires X*Y memory, which for ** larger files (100K lines) is more memory than we have at ** hand. */ int *text_diff( Blob *pA_Blob, /* FROM file */ Blob *pB_Blob, /* TO file */ Blob *pOut, /* Write unified diff here if not NULL */ int nContext /* Amount of context to unified diff */ ){ DLine *A, *B; /* Files being compared */ int X, Y; /* Number of elements in A and B */ int x, y; /* Indices: A[x] and B[y] */ int szM = 0; /* Number of rows and columns in M */ int **M = 0; /* Myers matrix */ int i, d; /* Indices on M. M[d][i] */ int k, q; /* Diagonal number and distinct from (0,0) */ int K, D; /* The diagonal and d for the final solution */ int *R = 0; /* Result vector */ int r; /* Loop variables */ int go = 1; /* Outer loop control */ int MAX; /* Largest of X and Y */ /* Break the two files being diffed into individual lines. ** Compute hashes on each line for fast comparison. */ A = break_into_lines(blob_str(pA_Blob), &X); B = break_into_lines(blob_str(pB_Blob), &Y); if( A==0 || B==0 ){ free(A); free(B); if( pOut ){ blob_appendf(pOut, "cannot compute difference between binary files\n"); } return 0; } szM = 0; MAX = X>Y ? X : Y; if( MAX>2000 ) MAX = 2000; for(d=0; go && d<=MAX; d++){ if( szM<d+1 ){ szM += szM + 10; M = realloc(M, sizeof(M[0])*szM); if( M==0 ){ fossil_panic("out of memory"); } } M[d] = malloc( sizeof(M[d][0])*(d+1) ); if( M[d]==0 ){ fossil_panic("out of memory"); } for(i=0; i<=d; i++){ k = 2*i - d; if( d==0 ){ q = 0; }else if( i==0 ){ q = M[d-1][0]; }else if( i<d-1 && M[d-1][i-1] < M[d-1][i] ){ q = M[d-1][i]; }else{ q = M[d-1][i-1]; } x = (k + q + 1)/2; y = x - k; if( x<0 || x>X || y<0 || y>Y ){ x = y = 0; }else{ while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; } } M[d][i] = x + y; DEBUG( printf("M[%d][%i] = %d k=%d (%d,%d)\n", d, i, x+y, k, x, y); ) if( x==X && y==Y ){ go = 0; break; } } } if( d>MAX ){ R = malloc( sizeof(R[0])*7 ); R[0] = 0; R[1] = X; R[2] = Y; R[3] = 0; R[4] = 0; R[5] = 0; R[6] = 0; }else{ /* Reuse M[] as follows: ** ** M[d][1] = 1 if a line is inserted or 0 if a line is deleted. ** M[d][0] = number of lines copied after the ins or del above. ** */ D = d - 1; K = X - Y; for(d=D, i=(K+D)/2; d>0; d--){ DEBUG( printf("d=%d i=%d\n", d, i); ) if( i==d || (i>0 && M[d-1][i-1] > M[d-1][i]) ){ M[d][0] = M[d][i] - M[d-1][i-1] - 1; M[d][1] = 0; i--; }else{ M[d][0] = M[d][i] - M[d-1][i] - 1; M[d][1] = 1; } } DEBUG( printf("---------------\nM[0][0] = %5d\n", M[0][0]); for(d=1; d<=D; d++){ printf("M[%d][0] = %5d M[%d][1] = %d\n",d,M[d][0],d,M[d][1]); } ) /* Allocate the output vector */ R = malloc( sizeof(R[0])*(D+2)*3 ); if( R==0 ){ fossil_fatal("out of memory"); } /* Populate the output vector */ d = r = 0; while( d<=D ){ int n; R[r++] = M[d++][0]/2; /* COPY */ if( d>D ){ R[r++] = 0; R[r++] = 0; break; } if( M[d][1]==0 ){ n = 1; while( M[d][0]==0 && d<D && M[d+1][1]==0 ){ d++; n++; } R[r++] = n; /* DELETE */ if( d==D || M[d][0]>0 ){ R[r++] = 0; /* INSERT */ continue; } d++; }else{ R[r++] = 0; /* DELETE */ } assert( M[d][1]==1 ); n = 1; while( M[d][0]==0 && d<D && M[d+1][1]==1 ){ d++; n++; } R[r++] = n; /* INSERT */ } R[r++] = 0; R[r++] = 0; R[r++] = 0; } /* Free the Myers matrix */ for(d=0; d<=D; d++){ free(M[d]); } free(M); /* If pOut is defined, construct a unified diff into pOut and ** delete R */ if( pOut ){ int a = 0; /* Index of next line in A[] */ int b = 0; /* Index of next line in B[] */ int nr; /* Number of COPY/DELETE/INSERT triples to process */ int mxr; /* Maximum value for r */ int na, nb; /* Number of lines shown from A and B */ int i, j; /* Loop counters */ int m; /* Number of lines to output */ int skip; /* Number of lines to skip */ for(mxr=0; R[mxr+1] || R[mxr+2] || R[mxr+3]; mxr += 3){} for(r=0; r<mxr; r += 3*nr){ /* Figure out how many triples to show in a single block */ for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){} DEBUG( printf("r=%d nr=%d\n", r, nr); ) /* For the current block comprising nr triples, figure out ** how many lines of A and B are to be displayed */ if( R[r]>nContext ){ na = nb = nContext; skip = R[r] - nContext; }else{ na = nb = R[r]; skip = 0; } for(i=0; i<nr; i++){ na += R[r+i*3+1]; nb += R[r+i*3+2]; } if( R[r+nr*3]>nContext ){ na += nContext; nb += nContext; }else{ na += R[r+nr*3]; nb += R[r+nr*3]; } for(i=1; i<nr; i++){ na += R[r+i*3]; nb += R[r+i*3]; } blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb); /* Show the initial common area */ a += skip; b += skip; m = R[r] - skip; for(j=0; j<m; j++){ appendDiffLine(pOut, " ", &A[a+j]); } a += m; b += m; /* Show the differences */ for(i=0; i<nr; i++){ m = R[r+i*3+1]; for(j=0; j<m; j++){ appendDiffLine(pOut, "-", &A[a+j]); } a += m; m = R[r+i*3+2]; for(j=0; j<m; j++){ appendDiffLine(pOut, "+", &B[b+j]); } b += m; if( i<nr-1 ){ m = R[r+i*3+3]; for(j=0; j<m; j++){ appendDiffLine(pOut, " ", &B[b+j]); } b += m; a += m; } } /* Show the final common area */ assert( nr==i ); m = R[r+nr*3]; if( m>nContext ) m = nContext; for(j=0; j<m; j++){ appendDiffLine(pOut, " ", &B[b+j]); } } free(R); R = 0; } /* We no longer need the A[] and B[] vectors */ free(A); free(B); /* Return the result */ return R; } /* ** COMMAND: test-rawdiff */ void test_rawdiff_cmd(void){ Blob a, b; int r; int i; int *R; if( g.argc<4 ) usage("FILE1 FILE2 ..."); blob_read_from_file(&a, g.argv[2]); for(i=3; i<g.argc; i++){ if( i>3 ) printf("-------------------------------\n"); blob_read_from_file(&b, g.argv[i]); R = text_diff(&a, &b, 0, 0); for(r=0; R[r] || R[r+1] || R[r+2]; r += 3){ printf(" copy %4d delete %4d insert %4d\n", R[r], R[r+1], R[r+2]); } /* free(R); */ blob_reset(&b); } } /* ** COMMAND: test-udiff */ void test_udiff_cmd(void){ Blob a, b, out; if( g.argc!=4 ) usage("FILE1 FILE2"); blob_read_from_file(&a, g.argv[2]); blob_read_from_file(&b, g.argv[3]); blob_zero(&out); text_diff(&a, &b, &out, 3); blob_write_to_file(&out, "-"); }