Login
Artifact [e082991406]
Login

Artifact e0829914066c2a55bb1ab80fdd339c3b44a13e68:


/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 
/* vim: set ts=2 et sw=2 tw=80: */
/*
  Copyright 2013-2021 The Libfossil Authors, see LICENSES/BSD-2-Clause.txt

  SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  SPDX-FileCopyrightText: 2021 The Libfossil Authors
  SPDX-ArtifactOfProjectName: Libfossil
  SPDX-FileType: Code

  Heavily indebted to the Fossil SCM project (https://fossil-scm.org).
*/
/**
   This file houses the "2nd generation" diff-generation routines
   APIs. This code is a straight port of those algorithms from the
   Fossil SCM project.
*/
#include "fossil-scm/fossil.h"
#include <assert.h>
#include <memory.h>
#include <stdlib.h>
#include <string.h> /* for memmove()/strlen() */

const fsl_diff_cfg fsl_diff_cfg_empty = fsl_diff_cfg_empty_m;
const fsl_diff_builder fsl_diff_builder_empty = fsl_diff_builder_empty_m;
const fsl_dline fsl_dline_empty = fsl_dline_empty_m;
const fsl_dline_change fsl_dline_change_empty = fsl_dline_change_empty_m;

#define DLn fsl_dline

/* Porting crutch for to-be-static functions */
#define TO_BE_STATIC

/*
** Maximum length of a line in a text file, in bytes.  (2**15 = 32768 bytes)
*/
#define LENGTH_MASK_SZ  15
#define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)

/*
** These error messages are shared in multiple locations.  They are defined
** here for consistency.
*/
#define DIFF_CANNOT_COMPUTE_BINARY \
    "cannot compute difference between binary files"

#define DIFF_CANNOT_COMPUTE_SYMLINK \
    "cannot compute difference between symlink and regular file"

#define DIFF_TOO_MANY_CHANGES \
    "more than 10,000 changes"

#define DIFF_WHITESPACE_ONLY \
    "whitespace changes only"

/**
   A context for running a raw diff.
  
   The aEdit[] array describes the raw diff.  Each triple of integers in
   aEdit[] means:
  
     (1) COPY:   Number of lines aFrom and aTo have in common
     (2) DELETE: Number of lines found only in aFrom
     (3) INSERT: Number of lines found only in aTo
  
   The triples repeat until all lines of both aFrom and aTo are accounted
   for.
*/
typedef struct fsl_diff_cx fsl_diff_cx;
struct fsl_diff_cx {
  int *aEdit;        /* Array of copy/delete/insert triples */
  int nEdit;         /* Number of integers (3x num of triples) in aEdit[] */
  int nEditAlloc;    /* Space allocated for aEdit[] */
  DLn *aFrom;      /* File on left side of the diff */
  int nFrom;         /* Number of lines in aFrom[] */
  DLn *aTo;        /* File on right side of the diff */
  int nTo;           /* Number of lines in aTo[] */
  int (*same_fn)(const DLn *, const DLn *); /* Function to be used for comparing */
};
#define DCx fsl_diff_cx

/**
   Counts the number of lines in the first n bytes of the given 
   string. If n<0 then fsl_strlen() is used to count it. 

   It includes the last line in the count even if it lacks the \n
   terminator. If an empty string is passed in, the number of lines
   is zero.

   For the purposes of this function, a string is considered empty if
   it contains no characters OR contains only NUL characters.

   If the input appears to be plain text it returns true and, if nOut
   is not NULL, writes the number of lines there. If the input appears
   to be binary, returns false and does not modify nOut.
*/
FSL_EXPORT bool fsl_count_lines(const char *z, fsl_int_t n, uint32_t * nOut );

bool fsl_count_lines(const char *z, fsl_int_t n, uint32_t * nOut ){
  uint32_t nLine;
  const char *zNL, *z2;
  if(n<0) n = (fsl_int_t)fsl_strlen(z);
  for(nLine=0, z2=z; (zNL = strchr(z2,'\n'))!=0; z2=zNL+1, nLine++){}
  if( z2[0]!='\0' ){
    nLine++;
    do{ z2++; }while( z2[0]!='\0' );
  }
  if( n!=(fsl_int_t)(z2-z) ) return false;
  if( nOut ) *nOut = nLine;
  return true;
}

/**
   Breaks n bytes of n into an array of fsl_dline objects.
   On success returns 0, assigns *pnLine to the number of lines,
   *pOut to the array of lines (caller must fsl_free() it).

   If it returns 0 and *pOut is set to NULL, the input is empty
   (per fsl_count_lines()'s definition).
   
   One error, returns one of:

   - FSL_RC_OOM on OOM.

   - FSL_RC_TYPE if input appears to be binary.
*/
TO_BE_STATIC int break_into_dlines(const char *z, fsl_int_t n,
                                   uint32_t *pnLine,
                                   DLn **pOut, uint64_t diffFlags){
  uint32_t nLine, i, k, nn, s, x;
  uint64_t h, h2;
  DLn *a = 0;
  const char *zNL;

  if( !fsl_count_lines(z, n, &nLine) ){
    *pOut = 0;
    return FSL_RC_TYPE;
  }
  assert( nLine>0 || z[0]=='\0' );
  if(nLine>0){
    a = fsl_malloc( sizeof(a[0])*nLine );
    if(!a) return FSL_RC_OOM;
    memset(a, 0, sizeof(a[0])*nLine);
  }else{
    *pnLine = 0;
    *pOut = a;
    return 0;
  }
  assert( a );
  i = 0;
  do{
    zNL = strchr(z,'\n');
    if( zNL==0 ) zNL = z+n;
    nn = (uint32_t)(zNL - z);
    if( nn>LENGTH_MASK ){
      fsl_free(a);
      *pOut = 0;
      *pnLine = 0;
      return FSL_RC_TYPE;
    }
    a[i].z = z;
    k = nn;
    if( diffFlags & FSL_DIFF2_STRIP_EOLCR ){
      if( k>0 && z[k-1]=='\r' ){ k--; }
    }
    a[i].n = k;
    s = 0;
    if( diffFlags & FSL_DIFF2_IGNORE_EOLWS ){
      while( k>0 && fsl_isspace(z[k-1]) ){ k--; }
    }
    if( (diffFlags & FSL_DIFF2_IGNORE_ALLWS)
        ==FSL_DIFF2_IGNORE_ALLWS ){
      uint32_t numws = 0;
      while( s<k && fsl_isspace(z[s]) ){ ++s; }
      for(h=0, x=s; x<k; ++x){
        char c = z[x];
        if( fsl_isspace(c) ){
          ++numws;
        }else{
          h = (h^c)*9000000000000000041LL;
        }
      }
      k -= numws;
    }else{
      uint32_t k2 = k & ~0x7;
      uint64_t m;
      for(h=0, x=s; x<k2; x += 8){
        memcpy(&m, z+x, 8);
        h = (h^m)*9000000000000000041LL;
      }
      m = 0;
      memcpy(&m, z+x, k-k2);
      h ^= m;
    }
    a[i].indent = s;
    a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | (k-s);
    h2 = h % nLine;
    a[i].iNext = a[h2].iHash;
    a[h2].iHash = i+1;
    z += nn+1; n -= nn+1;
    i++;
  }while( zNL[0]!='\0' && zNL[1]!='\0' );
  assert( i==nLine );

  *pnLine = nLine;
  *pOut = a;
  return 0;
}

/*
** Return zero if two DLn elements are identical.
*/
TO_BE_STATIC int compare_dline(const DLn *pA, const DLn *pB){
  if( pA->h!=pB->h ) return 1;
  return memcmp(pA->z,pB->z, pA->h&LENGTH_MASK);
}

/*
** Return zero if two DLine elements are identical, ignoring
** all whitespace. The indent field of pA/pB already points
** to the first non-space character in the string.
*/
TO_BE_STATIC int compare_dline_ignore_allws(const DLn *pA, const DLn *pB){
  unsigned short a = pA->indent, b = pB->indent;
  if( pA->h==pB->h ){
    while( a<pA->n || b<pB->n ){
      if( a<pA->n && b<pB->n && pA->z[a++] != pB->z[b++] ) return 1;
      while( a<pA->n && fsl_isspace(pA->z[a])) ++a;
      while( b<pB->n && fsl_isspace(pB->z[b])) ++b;
    }
    return pA->n-a != pB->n-b;
  }
  return 1;
}

/*
** The two text segments zLeft and zRight are known to be different on
** both ends, but they might have  a common segment in the middle.  If
** they do not have a common segment, return 0.  If they do have a large
** common segment, return non-0 and before doing so set:
**
**   aLCS[0] = start of the common segment in zLeft
**   aLCS[1] = end of the common segment in zLeft
**   aLCS[2] = start of the common segment in zLeft
**   aLCS[3] = end of the common segment in zLeft
**
** This computation is for display purposes only and does not have to be
** optimal or exact.
*/
TO_BE_STATIC int textLCS2(
  const char *zLeft,  uint32_t nA, /* String on the left */
  const char *zRight, uint32_t nB, /* String on the right */
  uint32_t *aLCS                   /* Identify bounds of LCS here */
){
  const unsigned char *zA = (const unsigned char*)zLeft;    /* left string */
  const unsigned char *zB = (const unsigned char*)zRight;   /* right string */
  uint32_t i, j, k;               /* Loop counters */
  uint32_t lenBest = 0;           /* Match length to beat */

  for(i=0; i<nA-lenBest; i++){
    unsigned char cA = zA[i];
    if( (cA&0xc0)==0x80 ) continue;
    for(j=0; j<nB-lenBest; j++ ){
      if( zB[j]==cA ){
        for(k=1; j+k<nB && i+k<nA && zB[j+k]==zA[i+k]; k++){}
        while( (zB[j+k]&0xc0)==0x80 ){ k--; }
        if( k>lenBest ){
          lenBest = k;
          aLCS[0] = i;
          aLCS[1] = i+k;
          aLCS[2] = j;
          aLCS[3] = j+k;
        }
      }
    }
  }
  return lenBest>0;
}

/*
** Find the smallest spans that are different between two text strings
** that are known to be different on both ends. Returns the number
** of entries in p->a which get populated.
*/
TO_BE_STATIC unsigned short textLineChanges(
  const char *zLeft,  uint32_t nA, /* String on the left */
  const char *zRight, uint32_t nB, /* String on the right */
  fsl_dline_change * const p             /* Write results here */
){
  p->n = 1;
  p->a[0].iStart1 = 0;
  p->a[0].iLen1 = nA;
  p->a[0].iStart2 = 0;
  p->a[0].iLen2 = nB;
  p->a[0].isMin = 0;
  while( p->n<fsl_dline_change_max_spans-1 ){
    int mxi = -1;
    int mxLen = -1;
    int x, i;
    uint32_t aLCS[4];
    struct fsl_dline_change_span *a, *b;
    for(i=0; i<p->n; i++){
      if( p->a[i].isMin ) continue;
      x = p->a[i].iLen1;
      if( p->a[i].iLen2<x ) x = p->a[i].iLen2;
      if( x>mxLen ){
        mxLen = x;
        mxi = i;
      }
    }
    if( mxLen<6 ) break;
    x = textLCS2(zLeft + p->a[mxi].iStart1, p->a[mxi].iLen1,
                 zRight + p->a[mxi].iStart2, p->a[mxi].iLen2, aLCS);
    if( x==0 ){
      p->a[mxi].isMin = 1;
      continue;
    }
    a = p->a+mxi;
    b = a+1;
    if( mxi<p->n-1 ){
      memmove(b+1, b, sizeof(*b)*(p->n-mxi-1));
    }
    p->n++;
    b->iStart1 = a->iStart1 + aLCS[1];
    b->iLen1 = a->iLen1 - aLCS[1];
    a->iLen1 = aLCS[0];
    b->iStart2 = a->iStart2 + aLCS[3];
    b->iLen2 = a->iLen2 - aLCS[3];
    a->iLen2 = aLCS[2];
    b->isMin = 0;
  }
  return p->n;
}

/*
** Return true if the string starts with n spaces
*/
TO_BE_STATIC int allSpaces(const char *z, int n){
  int i;
  for(i=0; i<n && fsl_isspace(z[i]); ++i){}
  return i==n;
}

/*
** Try to improve the human-readability of the fsl_dline_change p.
**
** (1)  If the first change span shows a change of indentation, try to
**      move that indentation change to the left margin.
**
** (2)  Try to shift changes so that they begin or end with a space.
*/
TO_BE_STATIC void improveReadability(
  const char *zA,  /* Left line of the change */
  const char *zB,  /* Right line of the change */
  fsl_dline_change * const p /* The fsl_dline_change to be adjusted */
){
  int j, n, len;
  if( p->n<1 ) return;

  /* (1) Attempt to move indentation changes to the left margin */
  if( p->a[0].iLen1==0
   && (len = p->a[0].iLen2)>0
   && (j = p->a[0].iStart2)>0
   && zB[0]==zB[j]
   && allSpaces(zB, j)
  ){
    for(n=1; n<len && n<j && zB[j]==zB[j+n]; n++){}
    if( n<len ){
      memmove(&p->a[1], &p->a[0], sizeof(p->a[0])*p->n);
      p->n++;
      p->a[0] = p->a[1];
      p->a[1].iStart2 += n;
      p->a[1].iLen2 -= n;
      p->a[0].iLen2 = n;
    }
    p->a[0].iStart1 = 0;
    p->a[0].iStart2 = 0;
  }else
  if( p->a[0].iLen2==0
   && (len = p->a[0].iLen1)>0
   && (j = p->a[0].iStart1)>0
   && zA[0]==zA[j]
   && allSpaces(zA, j)
  ){
    for(n=1; n<len && n<j && zA[j]==zA[j+n]; n++){}
    if( n<len ){
      memmove(&p->a[1], &p->a[0], sizeof(p->a[0])*p->n);
      p->n++;
      p->a[0] = p->a[1];
      p->a[1].iStart1 += n;
      p->a[1].iLen1 -= n;
      p->a[0].iLen1 = n;
    }
    p->a[0].iStart1 = 0;
    p->a[0].iStart2 = 0;
  }

  /* (2) Try to shift changes so that they begin or end with a
  ** space.  (TBD) */
}

/*
** Given two lines of text, pFrom and pTo, compute a set of changes
** between those two lines, for enhanced display purposes.
**
** The result is written into the fsl_dline_change object given by the
** third parameter.
*/
void fsl_dline_change_spans(
  const DLn *pLeft,  /* Left line of the change */
  const DLn *pRight, /* Right line of the change */
  fsl_dline_change * const p /* OUTPUT: Write the results here */
){
  int nLeft;           /* Length of left line in bytes */
  int nRight;          /* Length of right line in bytes */
  int nShort;          /* Shortest of left and right */
  int nPrefix;         /* Length of common prefix */
  int nSuffix;         /* Length of common suffix */
  int nCommon;         /* Total byte length of suffix and prefix */
  const char *zLeft;   /* Text of the left line */
  const char *zRight;  /* Text of the right line */
  int nLeftDiff;       /* nLeft - nPrefix - nSuffix */
  int nRightDiff;      /* nRight - nPrefix - nSuffix */

  nLeft = pLeft->n;
  zLeft = pLeft->z;
  nRight = pRight->n;
  zRight = pRight->z;
  nShort = nLeft<nRight ? nLeft : nRight;

  nPrefix = 0;
  while( nPrefix<nShort && zLeft[nPrefix]==zRight[nPrefix] ){
    nPrefix++;
  }
  if( nPrefix<nShort ){
    while( nPrefix>0 && (zLeft[nPrefix]&0xc0)==0x80 ) nPrefix--;
  }
  nSuffix = 0;
  if( nPrefix<nShort ){
    while( nSuffix<nShort
           && zLeft[nLeft-nSuffix-1]==zRight[nRight-nSuffix-1] ){
      nSuffix++;
    }
    if( nSuffix<nShort ){
      while( nSuffix>0 && (zLeft[nLeft-nSuffix]&0xc0)==0x80 ) nSuffix--;
    }
    if( nSuffix==nLeft || nSuffix==nRight ) nPrefix = 0;
  }
  nCommon = nPrefix + nSuffix;

  /* If the prefix and suffix overlap, that means that we are dealing with
  ** a pure insertion or deletion of text that can have multiple alignments.
  ** Try to find an alignment to begins and ends on whitespace, or on
  ** punctuation, rather than in the middle of a name or number.
  */
  if( nCommon > nShort ){
    int iBest = -1;
    int iBestVal = -1;
    int i;
    int nLong = nLeft<nRight ? nRight : nLeft;
    int nGap = nLong - nShort;
    for(i=nShort-nSuffix; i<=nPrefix; i++){
       int iVal = 0;
       char c = zLeft[i];
       if( fsl_isspace(c) ){
         iVal += 5;
       }else if( !fsl_isalnum(c) ){
         iVal += 2;
       }
       c = zLeft[i+nGap-1];
       if( fsl_isspace(c) ){
         iVal += 5;
       }else if( !fsl_isalnum(c) ){
         iVal += 2;
       }
       if( iVal>iBestVal ){
         iBestVal = iVal;
         iBest = i;
       }
    }
    nPrefix = iBest;
    nSuffix = nShort - nPrefix;
    nCommon = nPrefix + nSuffix;
  }

  /* A single chunk of text inserted */
  if( nCommon==nLeft ){
    p->n = 1;
    p->a[0].iStart1 = nPrefix;
    p->a[0].iLen1 = 0;
    p->a[0].iStart2 = nPrefix;
    p->a[0].iLen2 = nRight - nCommon;
    improveReadability(zLeft, zRight, p);
    return;
  }

  /* A single chunk of text deleted */
  if( nCommon==nRight ){
    p->n = 1;
    p->a[0].iStart1 = nPrefix;
    p->a[0].iLen1 = nLeft - nCommon;
    p->a[0].iStart2 = nPrefix;
    p->a[0].iLen2 = 0;
    improveReadability(zLeft, zRight, p);
    return;
  }

  /* At this point we know that there is a chunk of text that has
  ** changed between the left and the right.  Check to see if there
  ** is a large unchanged section in the middle of that changed block.
  */
  nLeftDiff = nLeft - nCommon;
  nRightDiff = nRight - nCommon;
  if( nLeftDiff >= 4
   && nRightDiff >= 4
   && textLineChanges(&zLeft[nPrefix], nLeftDiff,
                      &zRight[nPrefix], nRightDiff, p)>1
  ){
    int i;
    for(i=0; i<p->n; i++){
      p->a[i].iStart1 += nPrefix;
      p->a[i].iStart2 += nPrefix;
    }
    improveReadability(zLeft, zRight, p);
    return;
  }

  /* If all else fails, show a single big change between left and right */
  p->n = 1;
  p->a[0].iStart1 = nPrefix;
  p->a[0].iLen1 = nLeft - nCommon;
  p->a[0].iStart2 = nPrefix;
  p->a[0].iLen2 = nRight - nCommon;
  improveReadability(zLeft, zRight, p);
}


#undef MX_CSN
#undef DCx
#undef DLn
#undef DIFF_CANNOT_COMPUTE_BINARY
#undef DIFF_CANNOT_COMPUTE_SYMLINK
#undef DIFF_TOO_MANY_CHANGES
#undef DIFF_WHITESPACE_ONLY
#undef LENGTH_MASK_SZ
#undef LENGTH_MASK
#undef fsl_dline_empty_m
#undef TO_BE_STATIC