/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim: set ts=2 et sw=2 tw=80: */ /* ** Copyright (c) 2013 D. Richard Hipp ** ** This program is free software; you can redistribute it and/or ** modify it under the terms of the Simplified BSD License (also ** known as the "2-Clause License" or "FreeBSD License".) ** ** This program is distributed in the hope that it will be useful, ** but without any warranty; without even the implied warranty of ** merchantability or fitness for a particular purpose. ** ** Author contact information: ** drh@hwaci.com ** http://www.hwaci.com/drh/ ** ******************************************************************************* ** This file houses Fossil's delta generation and application ** routines. This code is functionally independent of the rest of the ** library, relying only on fsl_malloc(), fsl_free(), and the integer ** typedefs defined by the configuration process. i.e. it can easily ** be pulled out and used in arbitrary projects. */ #include "fossil-scm/fossil.h" #include <memory.h> #include <stdlib.h> /* ** Macros for turning debugging printfs on and off */ #if 0 # define DEBUG1(X) X #else # define DEBUG1(X) #endif #if 0 #define DEBUG2(X) X /* ** For debugging: ** Print 16 characters of text from zBuf */ static const char *print16(const char *z){ int i; static char zBuf[20]; for(i=0; i<16; i++){ if( z[i]>=0x20 && z[i]<=0x7e ){ zBuf[i] = z[i]; }else{ zBuf[i] = '.'; } } zBuf[i] = 0; return zBuf; } #else # define DEBUG2(X) #endif /* ** The width of a hash window in bytes. The algorithm only works if this ** is a power of 2. */ #define NHASH 16 /* ** The current state of the rolling hash. ** ** z[] holds the values that have been hashed. z[] is a circular buffer. ** z[i] is the first entry and z[(i+NHASH-1)%NHASH] is the last entry of ** the window. ** ** Hash.a is the sum of all elements of hash.z[]. Hash.b is a weighted ** sum. Hash.b is z[i]*NHASH + z[i+1]*(NHASH-1) + ... + z[i+NHASH-1]*1. ** (Each index for z[] should be module NHASH, of course. The %NHASH operator ** is omitted in the prior expression for brevity.) */ typedef struct fsl_delta_hash fsl_delta_hash; struct fsl_delta_hash { fsl_uint16_t a, b; /* Hash values */ fsl_uint16_t i; /* Start of the hash window */ unsigned char z[NHASH]; /* The values that have been hashed */ }; /* ** Initialize the rolling hash using the first NHASH characters of z[] */ static void fsl_delta_hash_init(fsl_delta_hash *pHash, unsigned char const *z){ fsl_uint16_t a, b, i; a = b = 0; for(i=0; i<NHASH; i++){ a += z[i]; b += (NHASH-i)*z[i]; pHash->z[i] = z[i]; } pHash->a = a & 0xffff; pHash->b = b & 0xffff; pHash->i = 0; } /* ** Advance the rolling hash by a single character "c" */ static void fsl_delta_hash_next(fsl_delta_hash *pHash, int c){ fsl_uint16_t old = pHash->z[pHash->i]; pHash->z[pHash->i] = c; pHash->i = (pHash->i+1)&(NHASH-1); pHash->a = pHash->a - old + c; pHash->b = pHash->b - NHASH*old + pHash->a; } /* ** Return a 32-bit hash value */ static fsl_uint32_t fsl_delta_hash_32bit(fsl_delta_hash *pHash){ return (pHash->a & 0xffff) | (((fsl_uint32_t)(pHash->b & 0xffff))<<16); } /* ** Write an base-64 integer into the given buffer. */ static void fsl_delta_int_put(fsl_uint32_t v, unsigned char **pz){ static const char zDigits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"; /* 123456789 123456789 123456789 123456789 123456789 123456789 123 */ int i, j; unsigned char zBuf[20]; if( v==0 ){ *(*pz)++ = '0'; return; } for(i=0; v>0; i++, v>>=6){ zBuf[i] = zDigits[v&0x3f]; } for(j=i-1; j>=0; j--){ *(*pz)++ = zBuf[j]; } } /* ** Read bytes from *pz and convert them into a positive integer. When ** finished, leave *pz pointing to the first character past the end of ** the integer. The *pLen parameter holds the length of the string ** in *pz and is decremented once for each character in the integer. */ static fsl_size_t fsl_delta_int_get(unsigned char const **pz, fsl_int_t *pLen){ static const signed char zValue[] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, 36, -1, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, -1, -1, -1, 63, -1, }; fsl_size_t v = 0; fsl_int_t c; unsigned char const *z = (unsigned char const*)*pz; unsigned char const *zStart = z; while( (c = zValue[0x7f&*(z++)])>=0 ){ v = (v<<6) + c; } z--; *pLen -= z - zStart; *pz = z; return v; } /* ** Return the number digits in the base-64 representation of a positive integer */ static int fsl_delta_digit_count(int v){ unsigned int x; int i; for(i=1, x=64; v>=x; i++, x <<= 6){} return i; } /* ** Compute a 32-bit checksum on the N-byte buffer. Return the result. */ static unsigned int fsl_delta_checksum(void const *zIn, fsl_size_t N){ const unsigned char *z = (const unsigned char *)zIn; unsigned sum0 = 0; unsigned sum1 = 0; unsigned sum2 = 0; unsigned sum3 = 0; while(N >= 16){ sum0 += ((unsigned)z[0] + z[4] + z[8] + z[12]); sum1 += ((unsigned)z[1] + z[5] + z[9] + z[13]); sum2 += ((unsigned)z[2] + z[6] + z[10]+ z[14]); sum3 += ((unsigned)z[3] + z[7] + z[11]+ z[15]); z += 16; N -= 16; } while(N >= 4){ sum0 += z[0]; sum1 += z[1]; sum2 += z[2]; sum3 += z[3]; z += 4; N -= 4; } sum3 += (sum2 << 8) + (sum1 << 16) + (sum0 << 24); switch(N){ case 3: sum3 += (z[2] << 8); case 2: sum3 += (z[1] << 16); case 1: sum3 += (z[0] << 24); default: ; } return sum3; } int fsl_delta_create( unsigned char const *zSrc, /* The source or pattern file */ fsl_size_t lenSrc_, /* Length of the source file */ unsigned char const *zOut, /* The target file */ fsl_size_t lenOut_, /* Length of the target file */ unsigned char *zDelta /* Write the delta into this buffer */, fsl_size_t * deltaSize ){ int i, base; unsigned char *zOrigDelta = zDelta; fsl_delta_hash h; int nHash; /* Number of hash table entries */ int *landmark; /* Primary hash table */ int *collide; /* Collision chain */ int lastRead = -1; /* Last byte of zSrc read by a COPY command */ /* lenSrc/lenOut are cast to ints to avoid any potential side-effects caused by changing the function signature from signed to unsigned int types when porting from v1. */ fsl_int_t lenSrc = (fsl_int_t)lenSrc_; fsl_int_t lenOut = (fsl_int_t)lenOut_; if(!zSrc || !zOut || !zDelta) return FSL_RC_MISUSE; else if(lenSrc<0 || lenOut<0) return FSL_RC_RANGE; /* Add the target file size to the beginning of the delta */ fsl_delta_int_put((fsl_uint32_t)lenOut, &zDelta); *(zDelta++) = '\n'; /* If the source file is very small, it means that we have no ** chance of ever doing a copy command. Just output a single ** literal segment for the entire target and exit. */ if( lenSrc<=NHASH ){ fsl_delta_int_put((fsl_uint32_t)lenOut, &zDelta); *(zDelta++) = ':'; memcpy(zDelta, zOut, lenOut); zDelta += lenOut; fsl_delta_int_put(fsl_delta_checksum(zOut, lenOut), &zDelta); *(zDelta++) = ';'; *deltaSize = (fsl_size_t)(zDelta - zOrigDelta); return 0; } /* Compute the hash table used to locate matching sections in the ** source file. */ nHash = lenSrc/NHASH; collide = (int*)fsl_malloc( nHash*2*sizeof(int) ); if(!collide) return FSL_RC_OOM; landmark = &collide[nHash]; memset(landmark, -1, nHash*sizeof(int)); memset(collide, -1, nHash*sizeof(int)); for(i=0; i<lenSrc-NHASH; i+=NHASH){ int hv; fsl_delta_hash_init(&h, &zSrc[i]); hv = fsl_delta_hash_32bit(&h) % nHash; collide[i/NHASH] = landmark[hv]; landmark[hv] = i/NHASH; } /* Begin scanning the target file and generating copy commands and ** literal sections of the delta. */ base = 0; /* We have already generated everything before zOut[base] */ while( base+NHASH<lenOut ){ int iSrc, iBlock; unsigned int bestCnt, bestOfst=0, bestLitsz=0; fsl_delta_hash_init(&h, &zOut[base]); i = 0; /* Trying to match a landmark against zOut[base+i] */ bestCnt = 0; while( 1 ){ int hv; int limit = 250; hv = fsl_delta_hash_32bit(&h) % nHash; DEBUG2( printf("LOOKING: %4d [%s]\n", base+i, print16(&zOut[base+i])); ) iBlock = landmark[hv]; while( iBlock>=0 && (limit--)>0 ){ /* ** The hash window has identified a potential match against ** landmark block iBlock. But we need to investigate further. ** ** Look for a region in zOut that matches zSrc. Anchor the search ** at zSrc[iSrc] and zOut[base+i]. Do not include anything prior to ** zOut[base] or after zOut[outLen] nor anything after zSrc[srcLen]. ** ** Set cnt equal to the length of the match and set ofst so that ** zSrc[ofst] is the first element of the match. litsz is the number ** of characters between zOut[base] and the beginning of the match. ** sz will be the overhead (in bytes) needed to encode the copy ** command. Only generate copy command if the overhead of the ** copy command is less than the amount of literal text to be copied. */ int cnt, ofst, litsz; int j, k, x, y; int sz; /* Beginning at iSrc, match forwards as far as we can. j counts ** the number of characters that match */ iSrc = iBlock*NHASH; for(j=0, x=iSrc, y=base+i; x<lenSrc && y<lenOut; j++, x++, y++){ if( zSrc[x]!=zOut[y] ) break; } j--; /* Beginning at iSrc-1, match backwards as far as we can. k counts ** the number of characters that match */ for(k=1; k<iSrc && k<=i; k++){ if( zSrc[iSrc-k]!=zOut[base+i-k] ) break; } k--; /* Compute the offset and size of the matching region */ ofst = iSrc-k; cnt = j+k+1; litsz = i-k; /* Number of bytes of literal text before the copy */ DEBUG2( printf("MATCH %d bytes at %d: [%s] litsz=%d\n", cnt, ofst, print16(&zSrc[ofst]), litsz); ) /* sz will hold the number of bytes needed to encode the "insert" ** command and the copy command, not counting the "insert" text */ sz = fsl_delta_digit_count(i-k) +fsl_delta_digit_count(cnt) +fsl_delta_digit_count(ofst) +3; if( cnt>=sz && cnt>bestCnt ){ /* Remember this match only if it is the best so far and it ** does not increase the file size */ bestCnt = cnt; bestOfst = iSrc-k; bestLitsz = litsz; DEBUG2( printf("... BEST SO FAR\n"); ) } /* Check the next matching block */ iBlock = collide[iBlock]; } /* We have a copy command that does not cause the delta to be larger ** than a literal insert. So add the copy command to the delta. */ if( bestCnt>0 ){ if( bestLitsz>0 ){ /* Add an insert command before the copy */ fsl_delta_int_put(bestLitsz,&zDelta); *(zDelta++) = ':'; memcpy(zDelta, &zOut[base], bestLitsz); zDelta += bestLitsz; base += bestLitsz; DEBUG2( printf("insert %d\n", bestLitsz); ) } base += bestCnt; fsl_delta_int_put(bestCnt, &zDelta); *(zDelta++) = '@'; fsl_delta_int_put(bestOfst, &zDelta); DEBUG2( printf("copy %d bytes from %d\n", bestCnt, bestOfst); ) *(zDelta++) = ','; if( bestOfst + bestCnt -1 > lastRead ){ lastRead = bestOfst + bestCnt - 1; DEBUG2( printf("lastRead becomes %d\n", lastRead); ) } bestCnt = 0; break; } /* If we reach this point, it means no match is found so far */ if( base+i+NHASH>=lenOut ){ /* We have reached the end of the file and have not found any ** matches. Do an "insert" for everything that does not match */ fsl_delta_int_put(lenOut-base, &zDelta); *(zDelta++) = ':'; memcpy(zDelta, &zOut[base], lenOut-base); zDelta += lenOut-base; base = lenOut; break; } /* Advance the hash by one character. Keep looking for a match */ fsl_delta_hash_next(&h, zOut[base+i+NHASH]); i++; } } /* Output a final "insert" record to get all the text at the end of ** the file that does not match anything in the source file. */ if( base<lenOut ){ fsl_delta_int_put(lenOut-base, &zDelta); *(zDelta++) = ':'; memcpy(zDelta, &zOut[base], lenOut-base); zDelta += lenOut-base; } /* Output the final checksum record. */ fsl_delta_int_put(fsl_delta_checksum(zOut, lenOut), &zDelta); *(zDelta++) = ';'; fsl_free(collide); *deltaSize = (fsl_size_t)(zDelta - zOrigDelta); return 0; } /* ** Calculates the size (in bytes) of the output from applying a ** delta. On success 0 is returned and *deltaSize will be updated with ** the amount of memory required for applying the delta. ** ** This routine is provided so that an procedure that is able ** to call fsl_delta_apply() can learn how much space is required ** for the output and hence allocate nor more space that is really ** needed. */ int fsl_delta_applied_size(unsigned char const *zDelta, fsl_size_t lenDelta_, fsl_size_t * deltaSize){ if(!zDelta || (lenDelta_<2) || !deltaSize) return FSL_RC_MISUSE; else{ fsl_size_t size; fsl_int_t lenDelta = (fsl_int_t)lenDelta_; size = fsl_delta_int_get(&zDelta, &lenDelta); if( *zDelta!='\n' ){ /* ERROR: size integer not terminated by "\n" */ return FSL_RC_DELTA_INVALID_TERMINATOR; } *deltaSize = size; return 0; } } int fsl_delta_apply2( unsigned char const *zSrc, /* The source or pattern file */ fsl_size_t lenSrc_, /* Length of the source file */ unsigned char const *zDelta, /* Delta to apply to the pattern */ fsl_size_t lenDelta_, /* Length of the delta */ unsigned char *zOut, /* Write the output into this preallocated buffer */ fsl_error * pErr ){ unsigned int limit; unsigned int total = 0; #ifndef FSL_OMIT_DELTA_CKSUM_TEST unsigned char *zOrigOut = zOut; #endif /* lenSrc/lenDelta are cast to ints to avoid any potential side-effects caused by changing the function signature from signed to unsigned int types when porting from v1. */ fsl_int_t lenSrc = (fsl_int_t)lenSrc_; fsl_int_t lenDelta = (fsl_int_t)lenDelta_; if(!zSrc || !zDelta || !zOut) return FSL_RC_MISUSE; else if(lenSrc<0 || lenDelta<0) return FSL_RC_RANGE; limit = fsl_delta_int_get(&zDelta, &lenDelta); if( *zDelta!='\n' ){ if(pErr){ fsl_error_set(pErr, FSL_RC_DELTA_INVALID_TERMINATOR, "Delta: size integer not terminated by \\n"); } return FSL_RC_DELTA_INVALID_TERMINATOR; } zDelta++; lenDelta--; while( *zDelta && lenDelta>0 ){ fsl_size_t cnt, ofst; cnt = fsl_delta_int_get(&zDelta, &lenDelta); switch( zDelta[0] ){ case '@': { zDelta++; lenDelta--; ofst = fsl_delta_int_get(&zDelta, &lenDelta); if( lenDelta>0 && zDelta[0]!=',' ){ /* ERROR: copy command not terminated by ',' */ return FSL_RC_DELTA_INVALID_TERMINATOR; } zDelta++; lenDelta--; DEBUG1( printf("COPY %d from %d\n", cnt, ofst); ) total += cnt; if( total>limit ){ if(pErr){ fsl_error_set(pErr, FSL_RC_RANGE, "Delta: copy exceeds output file size"); } return FSL_RC_RANGE; } if( ofst+cnt > lenSrc ){ if(pErr){ fsl_error_set(pErr, FSL_RC_RANGE, "Delta: copy extends past end of input"); } return FSL_RC_RANGE; } memcpy(zOut, &zSrc[ofst], cnt); zOut += cnt; break; } case ':': { zDelta++; lenDelta--; total += cnt; if( total>limit ){ if(pErr){ fsl_error_set(pErr, FSL_RC_RANGE, "Delta: insert command gives an output " "larger than predicted"); } return FSL_RC_RANGE; } DEBUG1( printf("INSERT %d\n", cnt); ) if( cnt>lenDelta ){ if(pErr){ fsl_error_set(pErr, FSL_RC_RANGE, "Delta: insert count exceeds size of delta"); } return FSL_RC_RANGE; } memcpy(zOut, zDelta, cnt); zOut += cnt; zDelta += cnt; lenDelta -= cnt; break; } case ';': { zDelta++; lenDelta--; zOut[0] = 0; #ifndef FSL_OMIT_DELTA_CKSUM_TEST if( cnt!=fsl_delta_checksum(zOrigOut, total) ){ if(pErr){ fsl_error_set(pErr, FSL_RC_CHECKSUM_MISMATCH, "Delta: bad checksum"); } return FSL_RC_CHECKSUM_MISMATCH; } #endif if( total!=limit ){ if(pErr){ fsl_error_set(pErr, FSL_RC_SIZE_MISMATCH, "Delta: generated size does not match " "predicted size"); } return FSL_RC_SIZE_MISMATCH; } return 0; } default: { if(pErr){ fsl_error_set(pErr, FSL_RC_SIZE_MISMATCH, "Delta: unknown delta operator"); } return FSL_RC_DELTA_INVALID_OPERATOR; } } } /* ERROR: unterminated delta */ return FSL_RC_DELTA_INVALID_TERMINATOR; } int fsl_delta_apply( unsigned char const *zSrc, /* The source or pattern file */ fsl_size_t lenSrc_, /* Length of the source file */ unsigned char const *zDelta, /* Delta to apply to the pattern */ fsl_size_t lenDelta_, /* Length of the delta */ unsigned char *zOut /* Write the output into this preallocated buffer */ ){ return fsl_delta_apply2(zSrc, lenSrc_, zDelta, lenDelta_, zOut, NULL); } #undef NHASH #undef DEBUG1 #undef DEBUG2