/* ** Copyright (c) 2007 D. Richard Hipp ** ** This program is free software; you can redistribute it and/or ** modify it under the terms of the GNU General Public ** License version 2 as published by the Free Software Foundation. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ** General Public License for more details. ** ** You should have received a copy of the GNU General Public ** License along with this library; if not, write to the ** Free Software Foundation, Inc., 59 Temple Place - Suite 330, ** Boston, MA 02111-1307, USA. ** ** Author contact information: ** drh@hwaci.com ** http://www.hwaci.com/drh/ ** ******************************************************************************* ** ** This file contains code used to compute a "diff" between two ** text files. */ #include "config.h" #include "diff.h" #include #if 0 #define DEBUG(X) X #else #define DEBUG(X) #endif /* ** Information about each line of a file being diffed. ** ** The lower 20 bits of the hash are the length of the ** line. If any line is longer than 1048575 characters, ** the file is considered binary. */ typedef struct DLine DLine; struct DLine { const char *z; /* The text of the line */ unsigned int h; /* Hash of the line */ }; #define LENGTH_MASK 0x000fffff /* ** Return an array of DLine objects containing a pointer to the ** start of each line and a hash of that line. The lower ** bits of the hash store the length of each line. ** ** Trailing whitespace is removed from each line. ** ** Return 0 if the file is binary or contains a line that is ** longer than 1048575 bytes. */ static DLine *break_into_lines(char *z, int *pnLine){ int nLine, i, j, k, x; unsigned int h; DLine *a; /* Count the number of lines. Allocate space to hold ** the returned array. */ for(i=j=0, nLine=1; z[i]; i++, j++){ int c = z[i]; if( c==0 ){ return 0; } if( c=='\n' && z[i+1]!=0 ){ nLine++; if( j>1048575 ){ return 0; } j = 0; } } if( j>1048575 ){ return 0; } a = malloc( nLine*sizeof(a[0]) ); if( a==0 ) fossil_panic("out of memory"); /* Fill in the array */ for(i=0; i0 && isspace(z[k-1]); k--){} for(h=0, x=0; xh==pB->h && memcmp(pA->z,pB->z,pA->h & LENGTH_MASK)==0; } /* ** Append a single line of "diff" output to pOut. */ static void appendDiffLine(Blob *pOut, char *zPrefix, DLine *pLine){ blob_append(pOut, zPrefix, 1); blob_append(pOut, pLine->z, pLine->h & LENGTH_MASK); blob_append(pOut, "\n", 1); } /* ** Generate a report of the differences between files pA and pB. ** If pOut is not NULL then a unified diff is appended there. It ** is assumed that pOut has already been initialized. If pOut is ** NULL, then a pointer to an array of integers is returned. ** The integers come in triples. For each triple, ** the elements are the number of lines copied, the number of ** lines deleted, and the number of lines inserted. The vector ** is terminated by a triple of all zeros. ** ** This diff utility does not work on binary files. If a binary ** file is encountered, 0 is returned and pOut is written with ** text "cannot compute difference between binary files". ** **************************************************************************** ** ** The core algorithm is a variation on the classic Wagner ** minimum edit distance with enhancements to reduce the runtime ** to be almost linear in the common case where the two files ** have a lot in common. For additional information see ** Eugene W. Myers, "An O(ND) Difference Algorithm And Its ** Variations" ** ** Consider comparing strings A and B. A=abcabba and B=cbabac ** We construct a "Wagner" matrix W with A along the X axis and ** B along the Y axis: ** ** c 6 * ** a 5 * ** b 4 * * ** a 3 * ** b 2 * ** B c 1 * ** 0 * * * ** 0 1 2 3 4 5 6 7 ** a b c a b b a ** A ** ** (Note: we draw this Wagner matrix with the origin at the lower ** left whereas Myers uses the origin at the upper left. Otherwise, ** they are the same.) ** ** Let Y be the maximum y value or the number of characters in B. ** 6 in this example. X is the maximum x value or the number of ** elements in A. Here 7. ** ** Our goal is to find a path from (0,0) to (X,Y). The path can ** use horizontal, vertical, or diagonal steps. A diagonal from ** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y]. A vertical ** steps corresponds to an insertion. A horizontal step corresponds ** to a deletion. We want to find the path with the fewest ** horizontal and vertical steps. ** ** Diagonal k consists of all points such that x-y==k. Diagonal ** zero begins at the origin. Diagonal 1 begins at (1,0). ** Diagonal -1 begins at (0,1). All diagonals move up and to the ** right at 45 degrees. Diagonal number increase from upper left ** to lower right. ** ** Myers matrix M is a lower right triangular matrix with indices d ** along the bottom and i vertically: ** ** ** i=4 | +4 \ ** 3 | +3 +2 | ** 2 | +2 +1 0 |- k values. k = 2*i-d ** 1 | +1 0 -1 -2 | ** 0 | 0 -1 -2 -3 -4 / ** --------------- ** 0 1 2 3 4 = d ** ** Each element of the Myers matrix corresponds to a diagonal. ** The diagonal is k=2*i-d. The diagonal values are shown ** in the template above. ** ** Each entry in M represents the end-point on a path from (0,0). ** The end-point is on diagonal k. The value stored in M is ** q=x+y where (x,y) is the terminus of the path. A path ** to M[d][i] will come through either M[d-1][i-1] or ** though M[d-1][i], whichever holds the largest value of q. ** If both elements hold the same value, the path comes ** through M[d-1][i-1]. ** ** The value of d is the number of insertions and deletions ** made so far on the path. M grows progressively. So the ** size of the M matrix is proportional to d*d. For the ** common case where A and B are similar, d will be small ** compared to X and Y so little memory is required. The ** original Wagner algorithm requires X*Y memory, which for ** larger files (100K lines) is more memory than we have at ** hand. */ int *text_diff( Blob *pA_Blob, /* FROM file */ Blob *pB_Blob, /* TO file */ Blob *pOut, /* Write unified diff here if not NULL */ int nContext /* Amount of context to unified diff */ ){ DLine *A, *B; /* Files being compared */ int X, Y; /* Number of elements in A and B */ int x, y; /* Indices: A[x] and B[y] */ int szM = 0; /* Number of rows and columns in M */ int **M = 0; /* Myers matrix */ int i, d; /* Indices on M. M[d][i] */ int k, q; /* Diagonal number and distinct from (0,0) */ int K, D; /* The diagonal and d for the final solution */ int *R = 0; /* Result vector */ int r; /* Loop variables */ int go = 1; /* Outer loop control */ int MAX; /* Largest of X and Y */ /* Break the two files being diffed into individual lines. ** Compute hashes on each line for fast comparison. */ A = break_into_lines(blob_str(pA_Blob), &X); B = break_into_lines(blob_str(pB_Blob), &Y); if( A==0 || B==0 ){ free(A); free(B); if( pOut ){ blob_appendf(pOut, "cannot compute difference between binary files\n"); } return 0; } szM = 0; MAX = X>Y ? X : Y; if( MAX>2000 ) MAX = 2000; for(d=0; go && d<=MAX; d++){ if( szMX || y<0 || y>Y ){ x = y = 0; }else{ while( xMAX ){ R = malloc( sizeof(R[0])*7 ); R[0] = 0; R[1] = X; R[2] = Y; R[3] = 0; R[4] = 0; R[5] = 0; R[6] = 0; }else{ /* Reuse M[] as follows: ** ** M[d][1] = 1 if a line is inserted or 0 if a line is deleted. ** M[d][0] = number of lines copied after the ins or del above. ** */ D = d - 1; K = X - Y; for(d=D, i=(K+D)/2; d>0; d--){ DEBUG( printf("d=%d i=%d\n", d, i); ) if( i==d || (i>0 && M[d-1][i-1] > M[d-1][i]) ){ M[d][0] = M[d][i] - M[d-1][i-1] - 1; M[d][1] = 0; i--; }else{ M[d][0] = M[d][i] - M[d-1][i] - 1; M[d][1] = 1; } } DEBUG( printf("---------------\nM[0][0] = %5d\n", M[0][0]); for(d=1; d<=D; d++){ printf("M[%d][0] = %5d M[%d][1] = %d\n",d,M[d][0],d,M[d][1]); } ) /* Allocate the output vector */ R = malloc( sizeof(R[0])*(D+2)*3 ); if( R==0 ){ fossil_fatal("out of memory"); } /* Populate the output vector */ d = r = 0; while( d<=D ){ int n; R[r++] = M[d++][0]/2; /* COPY */ if( d>D ){ R[r++] = 0; R[r++] = 0; break; } if( M[d][1]==0 ){ n = 1; while( M[d][0]==0 && d0 ){ R[r++] = 0; /* INSERT */ continue; } d++; }else{ R[r++] = 0; /* DELETE */ } assert( M[d][1]==1 ); n = 1; while( M[d][0]==0 && d0 && R[r+nr*3]nContext ){ na = nb = nContext; skip = R[r] - nContext; }else{ na = nb = R[r]; skip = 0; } for(i=0; inContext ){ na += nContext; nb += nContext; }else{ na += R[r+nr*3]; nb += R[r+nr*3]; } for(i=1; inContext ) m = nContext; for(j=0; j3 ) printf("-------------------------------\n"); blob_read_from_file(&b, g.argv[i]); R = text_diff(&a, &b, 0, 0); for(r=0; R[r] || R[r+1] || R[r+2]; r += 3){ printf(" copy %4d delete %4d insert %4d\n", R[r], R[r+1], R[r+2]); } /* free(R); */ blob_reset(&b); } } /* ** COMMAND: test-udiff */ void test_udiff_cmd(void){ Blob a, b, out; if( g.argc!=4 ) usage("FILE1 FILE2"); blob_read_from_file(&a, g.argv[2]); blob_read_from_file(&b, g.argv[3]); blob_zero(&out); text_diff(&a, &b, &out, 3); blob_write_to_file(&out, "-"); }