Fossil

Check-in [1cb182ac]
Login

Check-in [1cb182ac]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Diff algorithm is slightly faster and does a better job of dealing with indentation changes in code. See forum thread 7631656a2823338a.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 1cb182ac18de0bb78c060c9641ba87b08b56a532ee71f04b2718747123942260
User & Date: drh 2022-01-23 20:11:50
Context
2022-01-24
06:54
Replaced the "manual" TLS EOF tracking with BIO_eof(), analog to how is done in althttpd. ... (check-in: 06e300e5 user: stephan tags: trunk)
2022-01-23
20:11
Diff algorithm is slightly faster and does a better job of dealing with indentation changes in code. See forum thread 7631656a2823338a. ... (check-in: 1cb182ac user: drh tags: trunk)
19:57
Add a heuristic to the diff generator that helps it do a better job of identifying differences in C code that result from a change in indentation level. ... (Closed-Leaf check-in: 8cd73dda user: drh tags: diff-improvement)
12:52
Fix bullets in wsl_caveats.wiki ... (check-in: ea6b2d3e user: larrybr tags: trunk)
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/diff.c.

119
120
121
122
123
124
125
126
127

128
129
130
131
132
133
134
** of the line.  If any line is longer than LENGTH_MASK characters,
** the file is considered binary.
*/
typedef struct DLine DLine;
struct DLine {
  const char *z;          /* The text of the line */
  u64 h;                  /* Hash of the line */
  unsigned short indent;  /* Indent of the line. Only !=0 with -w/-Z option */
  unsigned short n;       /* number of bytes */

  unsigned int iNext;     /* 1+(Index of next line with same the same hash) */

  /* an array of DLine elements serves two purposes.  The fields
  ** above are one per line of input text.  But each entry is also
  ** a bucket in a hash table, as follows: */
  unsigned int iHash;     /* 1+(first entry in the hash chain) */
};







|

>







119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
** of the line.  If any line is longer than LENGTH_MASK characters,
** the file is considered binary.
*/
typedef struct DLine DLine;
struct DLine {
  const char *z;          /* The text of the line */
  u64 h;                  /* Hash of the line */
  unsigned short indent;  /* Index of first non-space */
  unsigned short n;       /* number of bytes */
  unsigned short nw;      /* number of bytes without leading/trailing space */
  unsigned int iNext;     /* 1+(Index of next line with same the same hash) */

  /* an array of DLine elements serves two purposes.  The fields
  ** above are one per line of input text.  But each entry is also
  ** a bucket in a hash table, as follows: */
  unsigned int iHash;     /* 1+(first entry in the hash chain) */
};
161
162
163
164
165
166
167
168
169






















170
171
172
173
174
175
176
  int *aEdit;        /* Array of copy/delete/insert triples */
  int nEdit;         /* Number of integers (3x num of triples) in aEdit[] */
  int nEditAlloc;    /* Space allocated for aEdit[] */
  DLine *aFrom;      /* File on left side of the diff */
  int nFrom;         /* Number of lines in aFrom[] */
  DLine *aTo;        /* File on right side of the diff */
  int nTo;           /* Number of lines in aTo[] */
  int (*xDiffer)(const DLine*,const DLine*); /* comparison function */
};























/*
** Count the number of lines in the input string.  Include the last line
** in the count even if it lacks the \n terminator.  If an empty string
** is specified, the number of lines is zero.  For the purposes of this
** function, a string is considered empty if it contains no characters
** -OR- it contains only NUL characters.







|

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
  int *aEdit;        /* Array of copy/delete/insert triples */
  int nEdit;         /* Number of integers (3x num of triples) in aEdit[] */
  int nEditAlloc;    /* Space allocated for aEdit[] */
  DLine *aFrom;      /* File on left side of the diff */
  int nFrom;         /* Number of lines in aFrom[] */
  DLine *aTo;        /* File on right side of the diff */
  int nTo;           /* Number of lines in aTo[] */
  int (*xDiffer)(const DLine *,const DLine *); /* comparison function */
};

/* Fast isspace for use by diff */
static const char diffIsSpace[] = {
  0, 0, 0, 0, 0, 0, 0, 0,   1, 1, 1, 0, 1, 1, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
};
#define diff_isspace(X)  (diffIsSpace[(unsigned char)(X)])

/*
** Count the number of lines in the input string.  Include the last line
** in the count even if it lacks the \n terminator.  If an empty string
** is specified, the number of lines is zero.  For the purposes of this
** function, a string is considered empty if it contains no characters
** -OR- it contains only NUL characters.
243
244
245
246
247
248
249
250
251
252
253
254
255
256


257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
    }
    a[i].z = z;
    k = nn;
    if( diffFlags & DIFF_STRIP_EOLCR ){
      if( k>0 && z[k-1]=='\r' ){ k--; }
    }
    a[i].n = k;
    s = 0;
    if( diffFlags & DIFF_IGNORE_EOLWS ){
      while( k>0 && fossil_isspace(z[k-1]) ){ k--; }
    }
    if( (diffFlags & DIFF_IGNORE_ALLWS)==DIFF_IGNORE_ALLWS ){
      int numws = 0;
      while( s<k && fossil_isspace(z[s]) ){ s++; }


      for(h=0, x=s; x<k; x++){
        char c = z[x];
        if( fossil_isspace(c) ){
          ++numws;
        }else{
          h = (h^c)*9000000000000000041LL;
        }
      }
      k -= numws;
    }else{
      int k2 = k & ~0x7;
      u64 m;
      for(h=0, x=s; x<k2; x += 8){
        memcpy(&m, z+x, 8);
        h = (h^m)*9000000000000000041LL;
      }
      m = 0;
      memcpy(&m, z+x, k-k2);
      h ^= m;
    }
    a[i].indent = s;
    a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | (k-s);
    h2 = h % nLine;
    a[i].iNext = a[h2].iHash;
    a[h2].iHash = i+1;
    z += nn+1; n -= nn+1;
    i++;
  }while( zNL[0]!='\0' && zNL[1]!='\0' );







<

|



|
>
>


|









|







<







266
267
268
269
270
271
272

273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300

301
302
303
304
305
306
307
    }
    a[i].z = z;
    k = nn;
    if( diffFlags & DIFF_STRIP_EOLCR ){
      if( k>0 && z[k-1]=='\r' ){ k--; }
    }
    a[i].n = k;

    if( diffFlags & DIFF_IGNORE_EOLWS ){
      while( k>0 && diff_isspace(z[k-1]) ){ k--; }
    }
    if( (diffFlags & DIFF_IGNORE_ALLWS)==DIFF_IGNORE_ALLWS ){
      int numws = 0;
      for(s=0; s<k && z[s]<=' '; s++){}
      a[i].indent = s;
      a[i].nw = k - s;
      for(h=0, x=s; x<k; x++){
        char c = z[x];
        if( diff_isspace(c) ){
          ++numws;
        }else{
          h = (h^c)*9000000000000000041LL;
        }
      }
      k -= numws;
    }else{
      int k2 = k & ~0x7;
      u64 m;
      for(h=x=s=0; x<k2; x += 8){
        memcpy(&m, z+x, 8);
        h = (h^m)*9000000000000000041LL;
      }
      m = 0;
      memcpy(&m, z+x, k-k2);
      h ^= m;
    }

    a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | (k-s);
    h2 = h % nLine;
    a[i].iNext = a[h2].iHash;
    a[h2].iHash = i+1;
    z += nn+1; n -= nn+1;
    i++;
  }while( zNL[0]!='\0' && zNL[1]!='\0' );
298
299
300
301
302
303
304
305
306
307
308




309
310
311
312
313
314
315
316
317
318
319
}

/*
** Return zero if two DLine elements are identical, ignoring
** all whitespace. The indent field of pA/pB already points
** to the first non-space character in the string.
*/

static int compare_dline_ignore_allws(const DLine *pA, const DLine *pB){
  int a = pA->indent, b = pB->indent;
  if( pA->h==pB->h ){




    while( a<pA->n || b<pB->n ){
      if( a<pA->n && b<pB->n && pA->z[a++] != pB->z[b++] ) return 1;
      while( a<pA->n && fossil_isspace(pA->z[a])) ++a;
      while( b<pB->n && fossil_isspace(pB->z[b])) ++b;
    }
    return pA->n-a != pB->n-b;
  }
  return 1;
}

/*







<

<

>
>
>
>


|
|







321
322
323
324
325
326
327

328

329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
}

/*
** Return zero if two DLine elements are identical, ignoring
** all whitespace. The indent field of pA/pB already points
** to the first non-space character in the string.
*/

static int compare_dline_ignore_allws(const DLine *pA, const DLine *pB){

  if( pA->h==pB->h ){
    int a, b;
    if( memcmp(pA->z, pB->z, pA->h&LENGTH_MASK)==0 ) return 0;
    a = pA->indent;
    b = pB->indent;
    while( a<pA->n || b<pB->n ){
      if( a<pA->n && b<pB->n && pA->z[a++] != pB->z[b++] ) return 1;
      while( a<pA->n && diff_isspace(pA->z[a])) ++a;
      while( b<pB->n && diff_isspace(pB->z[b])) ++b;
    }
    return pA->n-a != pB->n-b;
  }
  return 1;
}

/*
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350

/*
** Append a single line of context-diff output to pOut.
*/
static void appendDiffLine(
  Blob *pOut,         /* Where to write the line of output */
  char cPrefix,       /* One of " ", "+",  or "-" */
  DLine *pLine        /* The line to be output */
){
  blob_append_char(pOut, cPrefix);
  blob_append(pOut, pLine->z, pLine->n);
  blob_append_char(pOut, '\n');
}

/*







|







361
362
363
364
365
366
367
368
369
370
371
372
373
374
375

/*
** Append a single line of context-diff output to pOut.
*/
static void appendDiffLine(
  Blob *pOut,         /* Where to write the line of output */
  char cPrefix,       /* One of " ", "+",  or "-" */
  const DLine *pLine  /* The line to be output */
){
  blob_append_char(pOut, cPrefix);
  blob_append(pOut, pLine->z, pLine->n);
  blob_append_char(pOut, '\n');
}

/*
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
** Output a patch-style text diff.
*/
static void contextDiff(
  DContext *p,      /* The difference */
  Blob *pOut,       /* Output a context diff to here */
  DiffConfig *pCfg  /* Configuration options */
){
  DLine *A;     /* Left side of the diff */
  DLine *B;     /* Right side of the diff */
  int a = 0;    /* Index of next line in A[] */
  int b = 0;    /* Index of next line in B[] */
  int *R;       /* Array of COPY/DELETE/INSERT triples */
  int r;        /* Index into R[] */
  int nr;       /* Number of COPY/DELETE/INSERT triples to process */
  int mxr;      /* Maximum value for r */
  int na, nb;   /* Number of lines shown from A and B */
  int i, j;     /* Loop counters */
  int m;        /* Number of lines to output */







|
|
|
|







394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
** Output a patch-style text diff.
*/
static void contextDiff(
  DContext *p,      /* The difference */
  Blob *pOut,       /* Output a context diff to here */
  DiffConfig *pCfg  /* Configuration options */
){
  const DLine *A;   /* Left side of the diff */
  const DLine *B;   /* Right side of the diff */
  int a = 0;        /* Index of next line in A[] */
  int b = 0;        /* Index of next line in B[] */
  int *R;       /* Array of COPY/DELETE/INSERT triples */
  int r;        /* Index into R[] */
  int nr;       /* Number of COPY/DELETE/INSERT triples to process */
  int mxr;      /* Maximum value for r */
  int na, nb;   /* Number of lines shown from A and B */
  int i, j;     /* Loop counters */
  int m;        /* Number of lines to output */
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
}

/*
** Return true if the string starts with n spaces
*/
static int allSpaces(const char *z, int n){
  int i;
  for(i=0; i<n && fossil_isspace(z[i]); i++){}
  return i==n;
}

/*
** Try to improve the human-readability of the LineChange p.
**
** (1)  If the first change span shows a change of indentation, try to







|







642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
}

/*
** Return true if the string starts with n spaces
*/
static int allSpaces(const char *z, int n){
  int i;
  for(i=0; i<n && diff_isspace(z[i]); i++){}
  return i==n;
}

/*
** Try to improve the human-readability of the LineChange p.
**
** (1)  If the first change span shows a change of indentation, try to
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
    int iBestVal = -1;
    int i;
    int nLong = nLeft<nRight ? nRight : nLeft;
    int nGap = nLong - nShort;
    for(i=nShort-nSuffix; i<=nPrefix; i++){
       int iVal = 0;
       char c = zLeft[i];
       if( fossil_isspace(c) ){
         iVal += 5;
       }else if( !fossil_isalnum(c) ){
         iVal += 2;
       }
       c = zLeft[i+nGap-1];
       if( fossil_isspace(c) ){
         iVal += 5;
       }else if( !fossil_isalnum(c) ){
         iVal += 2;
       }
       if( iVal>iBestVal ){
         iBestVal = iVal;
         iBest = i;







|





|







768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
    int iBestVal = -1;
    int i;
    int nLong = nLeft<nRight ? nRight : nLeft;
    int nGap = nLong - nShort;
    for(i=nShort-nSuffix; i<=nPrefix; i++){
       int iVal = 0;
       char c = zLeft[i];
       if( diff_isspace(c) ){
         iVal += 5;
       }else if( !fossil_isalnum(c) ){
         iVal += 2;
       }
       c = zLeft[i+nGap-1];
       if( diff_isspace(c) ){
         iVal += 5;
       }else if( !fossil_isalnum(c) ){
         iVal += 2;
       }
       if( iVal>iBestVal ){
         iBestVal = iVal;
         iBest = i;
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
*/ 
typedef struct DiffBuilder DiffBuilder;
struct DiffBuilder {
  void (*xSkip)(DiffBuilder*, unsigned int, int);
  void (*xCommon)(DiffBuilder*,const DLine*);
  void (*xInsert)(DiffBuilder*,const DLine*);
  void (*xDelete)(DiffBuilder*,const DLine*);
  void (*xReplace)(DiffBuilder*,const DLine*, const DLine*);
  void (*xEdit)(DiffBuilder*,const DLine*,const DLine*);
  void (*xEnd)(DiffBuilder*);
  unsigned int lnLeft;              /* Lines seen on the left (delete) side */
  unsigned int lnRight;             /* Lines seen on the right (insert) side */
  unsigned int nPending;            /* Number of pending lines */
  int eState;                       /* State of the output */
  int width;                        /* Display width */







|







912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
*/ 
typedef struct DiffBuilder DiffBuilder;
struct DiffBuilder {
  void (*xSkip)(DiffBuilder*, unsigned int, int);
  void (*xCommon)(DiffBuilder*,const DLine*);
  void (*xInsert)(DiffBuilder*,const DLine*);
  void (*xDelete)(DiffBuilder*,const DLine*);
  void (*xReplace)(DiffBuilder*,const DLine*,const DLine*);
  void (*xEdit)(DiffBuilder*,const DLine*,const DLine*);
  void (*xEnd)(DiffBuilder*);
  unsigned int lnLeft;              /* Lines seen on the left (delete) side */
  unsigned int lnRight;             /* Lines seen on the right (insert) side */
  unsigned int nPending;            /* Number of pending lines */
  int eState;                       /* State of the output */
  int width;                        /* Display width */
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753









1754


1755




1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
** (1) Remove leading and trailing whitespace.
** (2) Truncate both strings to at most 250 characters
** (3) If the two strings have a common prefix, measure that prefix
** (4) Find the length of the longest common subsequence that is
**     at least 150% longer than the common prefix.
** (5) Longer common subsequences yield lower scores.
*/
static int match_dline(const DLine *pA, const DLine *pB){
  const char *zA;            /* Left string */
  const char *zB;            /* right string */
  int nA;                    /* Bytes in zA[] */
  int nB;                    /* Bytes in zB[] */
  int nMin;
  int nPrefix;
  int avg;                   /* Average length of A and B */
  int i, j, k;               /* Loop counters */
  int best = 0;              /* Longest match found so far */
  int score;                 /* Final score.  0..100 */
  unsigned char c;           /* Character being examined */
  unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */
  unsigned char aNext[252];  /* aNext[i] = index in zB[] of next zB[i] char */

  zA = pA->z;









  zB = pB->z;


  nA = pA->n;




  nB = pB->n;
  while( nA>0 && (unsigned char)zA[0]<=' ' ){ nA--; zA++; }
  while( nA>0 && (unsigned char)zA[nA-1]<=' ' ){ nA--; }
  while( nB>0 && (unsigned char)zB[0]<=' ' ){ nB--; zB++; }
  while( nB>0 && (unsigned char)zB[nB-1]<=' ' ){ nB--; }
  if( nA>250 ) nA = 250;
  if( nB>250 ) nB = 250;
  avg = (nA+nB)/2;
  if( avg==0 ) return 0;
  nMin = nA;
  if( nB<nMin ) nMin = nB;
  if( nMin==0 ) return 68;







|















>
>
>
>
>
>
>
>
>

>
>
|
>
>
>
>
|
|
<
<
<







1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797



1798
1799
1800
1801
1802
1803
1804
** (1) Remove leading and trailing whitespace.
** (2) Truncate both strings to at most 250 characters
** (3) If the two strings have a common prefix, measure that prefix
** (4) Find the length of the longest common subsequence that is
**     at least 150% longer than the common prefix.
** (5) Longer common subsequences yield lower scores.
*/
static int match_dline(DLine *pA, DLine *pB){
  const char *zA;            /* Left string */
  const char *zB;            /* right string */
  int nA;                    /* Bytes in zA[] */
  int nB;                    /* Bytes in zB[] */
  int nMin;
  int nPrefix;
  int avg;                   /* Average length of A and B */
  int i, j, k;               /* Loop counters */
  int best = 0;              /* Longest match found so far */
  int score;                 /* Final score.  0..100 */
  unsigned char c;           /* Character being examined */
  unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */
  unsigned char aNext[252];  /* aNext[i] = index in zB[] of next zB[i] char */

  zA = pA->z;
  if( pA->nw==0 && pA->n ){
    for(i=0; i<pA->n && diff_isspace(zA[i]); i++){}
    pA->indent = i;
    for(j=pA->n-1; j>i && diff_isspace(zA[j]); j--){}
    pA->nw = j - i + 1;
  }
  zA += pA->indent;
  nA = pA->nw;

  zB = pB->z;
  if( pB->nw==0 && pB->n ){
    for(i=0; i<pB->n && diff_isspace(zB[i]); i++){}
    pB->indent = i;
    for(j=pB->n-1; j>i && diff_isspace(zB[j]); j--){}
    pB->nw = j - i + 1;
  }
  zB += pB->indent;
  nB = pB->nw;




  if( nA>250 ) nA = 250;
  if( nB>250 ) nB = 250;
  avg = (nA+nB)/2;
  if( avg==0 ) return 0;
  nMin = nA;
  if( nB<nMin ) nMin = nB;
  if( nMin==0 ) return 68;
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
    c = (unsigned char)zA[i];
    for(j=aFirst[c]; j<nB-best && memcmp(&zA[i],&zB[j],best)==0; j = aNext[j]){
      int limit = minInt(nA-i, nB-j);
      for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){}
      if( k>best ) best = k;
    }
  }
  score = (best>=avg) ? 0 : (avg - best)*100/avg;

#if 0
  fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n",
  nA, zA+1, nB, zB+1, best, avg, score);
#endif

  /* Return the result */







|







1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
    c = (unsigned char)zA[i];
    for(j=aFirst[c]; j<nB-best && memcmp(&zA[i],&zB[j],best)==0; j = aNext[j]){
      int limit = minInt(nA-i, nB-j);
      for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){}
      if( k>best ) best = k;
    }
  }
  score = 5 + ((best>=avg) ? 0 : (avg - best)*95/avg);

#if 0
  fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n",
  nA, zA+1, nB, zB+1, best, avg, score);
#endif

  /* Return the result */
1815
1816
1817
1818
1819
1820
1821













































































































































































1822
1823
1824
1825
1826
1827
1828
  a.z = g.argv[2];
  a.n = (int)strlen(a.z);
  b.z = g.argv[3];
  b.n = (int)strlen(b.z);
  x = match_dline(&a, &b);
  fossil_print("%d\n", x);
}














































































































































































/*
** The threshold at which diffBlockAlignment transitions from the
** O(N*N) Wagner minimum-edit-distance algorithm to a less process
** O(NlogN) divide-and-conquer approach.
*/
#define DIFF_ALIGN_MX  1225







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
  a.z = g.argv[2];
  a.n = (int)strlen(a.z);
  b.z = g.argv[3];
  b.n = (int)strlen(b.z);
  x = match_dline(&a, &b);
  fossil_print("%d\n", x);
}

/* Forward declarations for recursion */
static unsigned char *diffBlockAlignment(
  DLine *aLeft, int nLeft,     /* Text on the left */
  DLine *aRight, int nRight,   /* Text on the right */
  DiffConfig *pCfg,            /* Configuration options */
  int *pNResult                /* OUTPUT: Bytes of result */
);
static void longestCommonSequence(
  DContext *p,               /* Two files being compared */
  int iS1, int iE1,          /* Range of lines in p->aFrom[] */
  int iS2, int iE2,          /* Range of lines in p->aTo[] */
  int *piSX, int *piEX,      /* Write p->aFrom[] common segment here */
  int *piSY, int *piEY       /* Write p->aTo[] common segment here */
);

/*
** Make a copy of a list of nLine DLine objects from one array to
** another.  Hash the new array to ignore whitespace.
*/
static void diffDLineXfer(
  DLine *aTo,
  const DLine *aFrom,
  int nLine
){
  int i, j, k;
  u64 h, h2;
  for(i=0; i<nLine; i++) aTo[i].iHash = 0;
  for(i=0; i<nLine; i++){
    const char *z = aFrom[i].z;
    int n = aFrom[i].n;
    for(j=0; j<n && diff_isspace(z[j]); j++){}
    aTo[i].z = &z[j];
    for(k=aFrom[i].n; k>j && diff_isspace(z[k-1]); k--){}
    aTo[i].n = n = k-j;
    aTo[i].indent = 0;
    aTo[i].nw = 0;
    for(h=0; j<k; j++){
      char c = z[j];
      if( !diff_isspace(c) ){
        h = (h^c)*9000000000000000041LL;
      }
    }
    aTo[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | n;
    h2 = h % nLine;
    aTo[i].iNext = aTo[h2].iHash;
    aTo[h2].iHash = i+1;
  }
}


/*
** For a difficult diff-block alignment that was originally for
** the default consider-all-whitespace algorithm, try to find the
** longest common subsequence between the two blocks that involves
** only whitespace changes.
*/
static unsigned char *diffBlockAlignmentIgnoreSpace(
  DLine *aLeft, int nLeft,     /* Text on the left */
  DLine *aRight, int nRight,   /* Text on the right */
  DiffConfig *pCfg,            /* Configuration options */
  int *pNResult                /* OUTPUT: Bytes of result */
){
  DContext dc;
  int iSX, iEX;                /* Start and end of LCS on the left */
  int iSY, iEY;                /* Start and end of the LCS on the right */
  unsigned char *a1, *a2;
  int n1, n2, nLCS;

  dc.aEdit = 0;
  dc.nEdit = 0;
  dc.nEditAlloc = 0;
  dc.nFrom = nLeft;
  dc.nTo = nRight;
  dc.xDiffer = compare_dline_ignore_allws;
  dc.aFrom = fossil_malloc( sizeof(DLine)*(nLeft+nRight) );
  dc.aTo = &dc.aFrom[dc.nFrom];
  diffDLineXfer(dc.aFrom, aLeft, nLeft);
  diffDLineXfer(dc.aTo, aRight, nRight);
  longestCommonSequence(&dc,0,nLeft,0,nRight,&iSX,&iEX,&iSY,&iEY);
  fossil_free(dc.aFrom);
  nLCS = iEX - iSX;
  if( nLCS<5 ) return 0;   /* No good LCS was found */

  if( pCfg->diffFlags & DIFF_DEBUG ){
     fossil_print("   LCS size=%d\n"
                  "     [%.*s]\n"
                  "     [%.*s]\n",
                  nLCS, aLeft[iSX].n, aLeft[iSX].z,
                  aLeft[iEX-1].n, aLeft[iEX-1].z);
  }

  a1 = diffBlockAlignment(aLeft,iSX,aRight,iSY,pCfg,&n1);
  a2 = diffBlockAlignment(aLeft+iEX, nLeft-iEX,
                          aRight+iEY, nRight-iEY,
                          pCfg, &n2);
  a1 = fossil_realloc(a1, n1+nLCS+n2);
  memcpy(a1+n1+nLCS,a2,n2);
  memset(a1+n1,3,nLCS);
  fossil_free(a2);
  *pNResult = n1+n2+nLCS;
  return a1;
}


/*
** This is a helper route for diffBlockAlignment().  In this case,
** a very large block is encountered that might be too expensive to
** use the O(N*N) Wagner edit distance algorithm.  So instead, this
** block implements a less-precise but faster O(N*logN) divide-and-conquer
** approach.
*/
static unsigned char *diffBlockAlignmentDivideAndConquer(
  DLine *aLeft, int nLeft,     /* Text on the left */
  DLine *aRight, int nRight,   /* Text on the right */
  DiffConfig *pCfg,            /* Configuration options */
  int *pNResult                /* OUTPUT: Bytes of result */
){
  DLine *aSmall;               /* The smaller of aLeft and aRight */
  DLine *aBig;                 /* The larger of aLeft and aRight */
  int nSmall, nBig;            /* Size of aSmall and aBig.  nSmall<=nBig */
  int iDivSmall, iDivBig;      /* Divider point for aSmall and aBig */
  int iDivLeft, iDivRight;     /* Divider point for aLeft and aRight */
  unsigned char *a1, *a2;      /* Results of the alignments on two halves */
  int n1, n2;                  /* Number of entries in a1 and a2 */
  int score, bestScore;        /* Score and best score seen so far */
  int i;                       /* Loop counter */

  if( nLeft>nRight ){
    aSmall = aRight;
    nSmall = nRight;
    aBig = aLeft;
    nBig = nLeft;
  }else{
    aSmall = aLeft;
    nSmall = nLeft;
    aBig = aRight;
    nBig = nRight;
  }
  iDivBig = nBig/2;
  iDivSmall = nSmall/2;

  if( pCfg->diffFlags & DIFF_DEBUG ){
    fossil_print("  Divide at [%.*s]\n", 
                 aBig[iDivBig].n, aBig[iDivBig].z);
  }

  bestScore = 10000;
  for(i=0; i<nSmall; i++){
    score = match_dline(aBig+iDivBig, aSmall+i) + abs(i-nSmall/2)*2;
    if( score<bestScore ){
      bestScore = score;
      iDivSmall = i;
    }
  }
  if( aSmall==aRight ){
    iDivRight = iDivSmall;
    iDivLeft = iDivBig;
  }else{
    iDivRight = iDivBig;
    iDivLeft = iDivSmall;
  }
  a1 = diffBlockAlignment(aLeft,iDivLeft,aRight,iDivRight,pCfg,&n1);
  a2 = diffBlockAlignment(aLeft+iDivLeft, nLeft-iDivLeft,
                          aRight+iDivRight, nRight-iDivRight,
                          pCfg, &n2);
  a1 = fossil_realloc(a1, n1+n2 );
  memcpy(a1+n1,a2,n2);
  fossil_free(a2);
  *pNResult = n1+n2;
  return a1;
}


/*
** The threshold at which diffBlockAlignment transitions from the
** O(N*N) Wagner minimum-edit-distance algorithm to a less process
** O(NlogN) divide-and-conquer approach.
*/
#define DIFF_ALIGN_MX  1225
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
** Algorithm:  Wagner's minimum edit-distance algorithm, modified by
** adding a cost to each match based on how well the two rows match
** each other.  Insertion and deletion costs are 50.  Match costs
** are between 0 and 100 where 0 is a perfect match 100 is a complete
** mismatch.
*/
static unsigned char *diffBlockAlignment(
  const DLine *aLeft, int nLeft,     /* Text on the left */
  const DLine *aRight, int nRight,   /* Text on the right */
  DiffConfig *pCfg,                  /* Configuration options */
  int *pNResult                      /* OUTPUT: Bytes of result */
){
  int i, j, k;                 /* Loop counters */
  int *a;                      /* One row of the Wagner matrix */
  int *pToFree;                /* Space that needs to be freed */
  unsigned char *aM;           /* Wagner result matrix */
  int nMatch, iMatch;          /* Number of matching lines and match score */
  int aBuf[100];               /* Stack space for a[] if nRight not to big */







|
|
|
|







2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
** Algorithm:  Wagner's minimum edit-distance algorithm, modified by
** adding a cost to each match based on how well the two rows match
** each other.  Insertion and deletion costs are 50.  Match costs
** are between 0 and 100 where 0 is a perfect match 100 is a complete
** mismatch.
*/
static unsigned char *diffBlockAlignment(
  DLine *aLeft, int nLeft,     /* Text on the left */
  DLine *aRight, int nRight,   /* Text on the right */
  DiffConfig *pCfg,            /* Configuration options */
  int *pNResult                /* OUTPUT: Bytes of result */
){
  int i, j, k;                 /* Loop counters */
  int *a;                      /* One row of the Wagner matrix */
  int *pToFree;                /* Space that needs to be freed */
  unsigned char *aM;           /* Wagner result matrix */
  int nMatch, iMatch;          /* Number of matching lines and match score */
  int aBuf[100];               /* Stack space for a[] if nRight not to big */
1873
1874
1875
1876
1877
1878
1879

1880



1881

1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
  if( nRight==0 ){
    aM = fossil_malloc( nLeft + 2 );
    memset(aM, 1, nLeft);
    *pNResult = nLeft;
    return aM;
  }


  /* For large alignments, use a divide and conquer algorithm that is



  ** O(NlogN).  The result is not as precise, but this whole thing is an

  ** approximation anyhow, and the faster response time is an acceptable
  ** trade-off for reduced precision.
  */
  if( nLeft*nRight>DIFF_ALIGN_MX && (pCfg->diffFlags & DIFF_SLOW_SBS)==0 ){
    const DLine *aSmall;   /* The smaller of aLeft and aRight */
    const DLine *aBig;     /* The larger of aLeft and aRight */
    int nSmall, nBig;      /* Size of aSmall and aBig.  nSmall<=nBig */
    int iDivSmall, iDivBig;  /* Divider point for aSmall and aBig */
    int iDivLeft, iDivRight; /* Divider point for aLeft and aRight */
    unsigned char *a1, *a2;  /* Results of the alignments on two halves */
    int n1, n2;              /* Number of entries in a1 and a2 */
    int score, bestScore;    /* Score and best score seen so far */
    if( nLeft>nRight ){
      aSmall = aRight;
      nSmall = nRight;
      aBig = aLeft;
      nBig = nLeft;
    }else{
      aSmall = aLeft;
      nSmall = nLeft;
      aBig = aRight;
      nBig = nRight;
    }
    iDivBig = nBig/2;
    iDivSmall = nSmall/2;
    bestScore = 10000;
    for(i=0; i<nSmall; i++){
      score = match_dline(aBig+iDivBig, aSmall+i) + abs(i-nSmall/2)*2;
      if( score<bestScore ){
        bestScore = score;
        iDivSmall = i;
      }
    }
    if( aSmall==aRight ){
      iDivRight = iDivSmall;
      iDivLeft = iDivBig;
    }else{
      iDivRight = iDivBig;
      iDivLeft = iDivSmall;
    }
    a1 = diffBlockAlignment(aLeft,iDivLeft,aRight,iDivRight,pCfg,&n1);
    a2 = diffBlockAlignment(aLeft+iDivLeft, nLeft-iDivLeft,
                            aRight+iDivRight, nRight-iDivRight,
                            pCfg, &n2);
    a1 = fossil_realloc(a1, n1+n2 );
    memcpy(a1+n1,a2,n2);
    fossil_free(a2);
    *pNResult = n1+n2;
    return a1;
  }

  /* If we reach this point, we will be doing an O(N*N) Wagner minimum
  ** edit distance to compute the alignment.
  */
  if( nRight < count(aBuf)-1 ){
    pToFree = 0;







>
|
>
>
>
|
>
|
<


<
<
<
<
|
|
<
<
<
<
<
|
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<

<
|
|
<
<
<
<
<
<







2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097

2098
2099




2100
2101





2102





2103











2104





2105

2106
2107






2108
2109
2110
2111
2112
2113
2114
  if( nRight==0 ){
    aM = fossil_malloc( nLeft + 2 );
    memset(aM, 1, nLeft);
    *pNResult = nLeft;
    return aM;
  }

  if( pCfg->diffFlags & DIFF_DEBUG ){
    fossil_print("BlockAlignment:\n   [%.*s] + %d\n   [%.*s] + %d\n",
                 aLeft[0].n, aLeft[0].z, nLeft,
                 aRight[0].n, aRight[0].z, nRight);
  }

  /* For large alignments, try to use alternative algorithms that are
  ** faster than the O(N*N) Wagner edit distance.

  */
  if( nLeft*nRight>DIFF_ALIGN_MX && (pCfg->diffFlags & DIFF_SLOW_SBS)==0 ){




    if( (pCfg->diffFlags & DIFF_IGNORE_ALLWS)==0 ){
      unsigned char *aRes;





      aRes = diffBlockAlignmentIgnoreSpace(





                 aLeft, nLeft,aRight, nRight,pCfg,pNResult);











      if( aRes ) return aRes;





    }

    return diffBlockAlignmentDivideAndConquer(
                 aLeft, nLeft,aRight, nRight,pCfg,pNResult);






  }

  /* If we reach this point, we will be doing an O(N*N) Wagner minimum
  ** edit distance to compute the alignment.
  */
  if( nRight < count(aBuf)-1 ){
    pToFree = 0;
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
** Format a diff using a DiffBuilder object
*/
static void formatDiff(
  DContext *p,           /* The computed diff */
  DiffConfig *pCfg,      /* Configuration options */
  DiffBuilder *pBuilder  /* The formatter object */
){
  const DLine *A;        /* Left side of the diff */
  const DLine *B;        /* Right side of the diff */
  unsigned int a = 0;    /* Index of next line in A[] */
  unsigned int b = 0;    /* Index of next line in B[] */
  const int *R;          /* Array of COPY/DELETE/INSERT triples */
  unsigned int r;        /* Index into R[] */
  unsigned int nr;       /* Number of COPY/DELETE/INSERT triples to process */
  unsigned int mxr;      /* Maximum value for r */
  unsigned int na, nb;   /* Number of lines shown from A and B */







|
|







2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
** Format a diff using a DiffBuilder object
*/
static void formatDiff(
  DContext *p,           /* The computed diff */
  DiffConfig *pCfg,      /* Configuration options */
  DiffBuilder *pBuilder  /* The formatter object */
){
  DLine *A;              /* Left side of the diff */
  DLine *B;              /* Right side of the diff */
  unsigned int a = 0;    /* Index of next line in A[] */
  unsigned int b = 0;    /* Index of next line in B[] */
  const int *R;          /* Array of COPY/DELETE/INSERT triples */
  unsigned int r;        /* Index into R[] */
  unsigned int nr;       /* Number of COPY/DELETE/INSERT triples to process */
  unsigned int mxr;      /* Maximum value for r */
  unsigned int na, nb;   /* Number of lines shown from A and B */
2408
2409
2410
2411
2412
2413
2414
























































2415
2416
2417
2418
2419
2420
2421
    expandEdit(p, p->nEdit*2 + 15);
    if( p->aEdit==0 ) return;
  }
  p->aEdit[p->nEdit++] = nCopy;
  p->aEdit[p->nEdit++] = nDel;
  p->aEdit[p->nEdit++] = nIns;
}

























































/*
** Do a single step in the difference.  Compute a sequence of
** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
** the input into lines iS2 through iE2-1 of the output and write
** that sequence into the difference context.
**







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
    expandEdit(p, p->nEdit*2 + 15);
    if( p->aEdit==0 ) return;
  }
  p->aEdit[p->nEdit++] = nCopy;
  p->aEdit[p->nEdit++] = nDel;
  p->aEdit[p->nEdit++] = nIns;
}

/*
** A common subsequene between p->aFrom and p->aTo has been found.
** This routine tries to judge if the subsequence really is a valid
** match or rather is just an artifact of an indentation change.
**
** Return non-zero if the subsequence is valid.  Return zero if the
** subsequence seems likely to be an editing artifact and should be
** ignored.
**
** This routine is a heuristic optimization intended to give more
** intuitive diff results following an indentation change it code that
** is formatted similarly to C/C++, Javascript, Go, TCL, and similar
** languages that use {...} for nesting.  A correct diff is computed
** even if this routine always returns true (non-zero).  But sometimes
** a more intuitive diff can result if this routine returns false.
**
** The subsequences consists of the rows iSX through iEX-1 (inclusive)
** in p->aFrom[].  The total sequences is iS1 through iE1-1 (inclusive)
** of p->aFrom[].
**
** Example where this heuristic is useful, see the diff at
** https://www.sqlite.org/src/fdiff?v1=0e79dd15cbdb4f48&v2=33955a6fd874dd97
**
** See also discussion at https://fossil-scm.org/forum/forumpost/9ba3284295
**
** ALGORITHM (subject to change and refinement):
**
**    1.  If the subsequence is larger than 1/7th of the original span,
**        then consider it valid.  --> return 1
**
**    2.  If the subsequence contains any charaters other than '}', '{",
**        or whitespace, then consider it valid. --> return 1
**
**    3.  Otherwise, it is potentially an artifact of an indentation
**        change. --> return 0
*/
static int likelyNotIndentChngArtifact(
  DContext *p,     /* The complete diff context */
  int iS1,         /* Start of the main segment */
  int iSX,         /* Start of the subsequence */
  int iEX,         /* First row past the end of the subsequence */
  int iE1          /* First row past the end of the main segment */
){
  int i, j;
  if( (iEX-iSX)*7 >= (iE1-iS1) ) return 1;
  for(i=iSX; i<iEX; i++){
    const char *z = p->aFrom[i].z;
    for(j=p->aFrom[i].n-1; j>=0; j--){
      char c = z[j];
      if( c!='}' && c!='{' && !diff_isspace(c) ) return 1;
    }
  }
  return 0;
}


/*
** Do a single step in the difference.  Compute a sequence of
** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
** the input into lines iS2 through iE2-1 of the output and write
** that sequence into the difference context.
**
2440
2441
2442
2443
2444
2445
2446
2447


2448
2449
2450
2451
2452
2453
2454
    appendTriple(p, 0, iE1-iS1, 0);
    return;
  }

  /* Find the longest matching segment between the two sequences */
  longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);

  if( iEX>iSX ){


    /* A common segment has been found.
    ** Recursively diff either side of the matching segment */
    diff_step(p, iS1, iSX, iS2, iSY);
    if( iEX>iSX ){
      appendTriple(p, iEX - iSX, 0, 0);
    }
    diff_step(p, iEX, iE1, iEY, iE2);







|
>
>







2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
    appendTriple(p, 0, iE1-iS1, 0);
    return;
  }

  /* Find the longest matching segment between the two sequences */
  longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);

  if( iEX>iSX+5
   || (iEX>iSX && likelyNotIndentChngArtifact(p,iS1,iSX,iEX,iE1) )
  ){
    /* A common segment has been found.
    ** Recursively diff either side of the matching segment */
    diff_step(p, iS1, iSX, iS2, iSY);
    if( iEX>iSX ){
      appendTriple(p, iEX - iSX, 0, 0);
    }
    diff_step(p, iEX, iE1, iEY, iE2);