/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim: set ts=2 et sw=2 tw=80: */ /* Copyright (c) 2013 D. Richard Hipp This program is free software; you can redistribute it and/or modify it under the terms of the Simplified BSD License (also known as the "2-Clause License" or "FreeBSD License".) This program is distributed in the hope that it will be useful, but without any warranty; without even the implied warranty of merchantability or fitness for a particular purpose. Author contact information: drh@hwaci.com http://www.hwaci.com/drh/ ***************************************************************************** This file houses Fossil's delta generation and application routines. This code is functionally independent of the rest of the library, relying only on fsl_malloc(), fsl_free(), and the integer typedefs defined by the configuration process. i.e. it can easily be pulled out and used in arbitrary projects. */ #include "fossil-scm/fossil.h" #include #include /* Macros for turning debugging printfs on and off */ #if 0 # define DEBUG1(X) X #else # define DEBUG1(X) #endif #if 0 #define DEBUG2(X) X /* For debugging: Print 16 characters of text from zBuf */ static const char *print16(const char *z){ int i; static char zBuf[20]; for(i=0; i<16; i++){ if( z[i]>=0x20 && z[i]<=0x7e ){ zBuf[i] = z[i]; }else{ zBuf[i] = '.'; } } zBuf[i] = 0; return zBuf; } #else # define DEBUG2(X) #endif /* The width of a hash window in bytes. The algorithm only works if this is a power of 2. */ #define NHASH 16 /* The current state of the rolling hash. z[] holds the values that have been hashed. z[] is a circular buffer. z[i] is the first entry and z[(i+NHASH-1)%NHASH] is the last entry of the window. Hash.a is the sum of all elements of hash.z[]. Hash.b is a weighted sum. Hash.b is z[i]*NHASH + z[i+1]*(NHASH-1) + ... + z[i+NHASH-1]*1. (Each index for z[] should be module NHASH, of course. The %NHASH operator is omitted in the prior expression for brevity.) */ typedef struct fsl_delta_hash fsl_delta_hash; struct fsl_delta_hash { fsl_uint16_t a, b; /* Hash values */ fsl_uint16_t i; /* Start of the hash window */ unsigned char z[NHASH]; /* The values that have been hashed */ }; /* Initialize the rolling hash using the first NHASH characters of z[] */ static void fsl_delta_hash_init(fsl_delta_hash *pHash, unsigned char const *z){ fsl_uint16_t a, b, i; a = b = 0; for(i=0; iz[i] = z[i]; } pHash->a = a & 0xffff; pHash->b = b & 0xffff; pHash->i = 0; } /* Advance the rolling hash by a single character "c" */ static void fsl_delta_hash_next(fsl_delta_hash *pHash, int c){ fsl_uint16_t old = pHash->z[pHash->i]; pHash->z[pHash->i] = c; pHash->i = (pHash->i+1)&(NHASH-1); pHash->a = pHash->a - old + c; pHash->b = pHash->b - NHASH*old + pHash->a; } /* Return a 32-bit hash value */ static fsl_uint32_t fsl_delta_hash_32bit(fsl_delta_hash *pHash){ return (pHash->a & 0xffff) | (((fsl_uint32_t)(pHash->b & 0xffff))<<16); } /* Write an base-64 integer into the given buffer. Return its length. */ static unsigned int fsl_delta_int_put(fsl_uint32_t v, unsigned char **pz){ static const char zDigits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"; /* 123456789 123456789 123456789 123456789 123456789 123456789 123 */ int i, j; unsigned char zBuf[20]; fsl_size_t rc = 0; if( v==0 ){ *(*pz)++ = '0'; rc = 1; }else{ for(i=0; v>0; ++rc, ++i, v>>=6){ zBuf[i] = zDigits[v&0x3f]; } zBuf[i]=0; for(j=i-1; j>=0; j--){ *(*pz)++ = zBuf[j]; } } return rc; } /* Read bytes from *pz and convert them into a positive integer. When finished, leave *pz pointing to the first character past the end of the integer. The *pLen parameter holds the length of the string in *pz and is decremented once for each character in the integer. */ static fsl_size_t fsl_delta_int_get(unsigned char const **pz, fsl_int_t *pLen){ static const signed char zValue[] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, 36, -1, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, -1, -1, -1, 63, -1, }; fsl_size_t v = 0; fsl_int_t c; unsigned char const *z = (unsigned char const*)*pz; unsigned char const *zStart = z; while( (c = zValue[0x7f&*(z++)])>=0 ){ v = (v<<6) + c; } z--; *pLen -= z - zStart; *pz = z; return v; } /* Return the number digits in the base-64 representation of a positive integer */ static int fsl_delta_digit_count(fsl_int_t v){ unsigned int x; int i; for(i=1, x=64; v>=(fsl_int_t)x; i++, x <<= 6){} return i; } /* Compute a 32-bit checksum on the N-byte buffer. Return the result. */ static unsigned int fsl_delta_checksum(void const *zIn, fsl_size_t N){ const unsigned char *z = (const unsigned char *)zIn; unsigned sum0 = 0; unsigned sum1 = 0; unsigned sum2 = 0; unsigned sum3 = 0; while(N >= 16){ sum0 += ((unsigned)z[0] + z[4] + z[8] + z[12]); sum1 += ((unsigned)z[1] + z[5] + z[9] + z[13]); sum2 += ((unsigned)z[2] + z[6] + z[10]+ z[14]); sum3 += ((unsigned)z[3] + z[7] + z[11]+ z[15]); z += 16; N -= 16; } while(N >= 4){ sum0 += z[0]; sum1 += z[1]; sum2 += z[2]; sum3 += z[3]; z += 4; N -= 4; } sum3 += (sum2 << 8) + (sum1 << 16) + (sum0 << 24); switch(N){ case 3: sum3 += (z[2] << 8); case 2: sum3 += (z[1] << 16); case 1: sum3 += (z[0] << 24); default: ; } return sum3; } int fsl_delta_create2( unsigned char const *zSrc, fsl_size_t lenSrc, unsigned char const *zOut, fsl_size_t lenOut, fsl_output_f out, void * outState){ enum { IntegerBufSize = 50 /* buffer size for integer conversions. */}; unsigned int i, base; unsigned int nHash; /* Number of hash table entries */ unsigned int *landmark; /* Primary hash table */ unsigned int *collide = NULL; /* Collision chain */ int lastRead = -1; /* Last byte of zSrc read by a COPY command */ int rc; /* generic return code checker. */ unsigned int olen = 0; /* current output length. */ unsigned int total = 0; /* total byte count. */ fsl_delta_hash h; unsigned char theBuf[IntegerBufSize] = {0,}; unsigned char * intBuf = theBuf; if(!zSrc || !zOut || !out) return FSL_RC_MISUSE; /* Add the target file size to the beginning of the delta */ #ifdef OUT #undef OUT #endif #define OUT(BLOB,LEN) rc=out(outState, BLOB, LEN); if(0 != rc) {fsl_free(collide); return rc;} else total += LEN #define OUTCH(CHAR) OUT(CHAR,1) #define PINT(I) intBuf = theBuf; olen=fsl_delta_int_put(I, &intBuf); OUT(theBuf,olen) PINT(lenOut); OUTCH("\n"); /* If the source file is very small, it means that we have no chance of ever doing a copy command. Just output a single literal segment for the entire target and exit. */ if( lenSrc<=NHASH ){ PINT(lenOut); OUTCH(":"); OUT(zOut,lenOut); PINT((fsl_delta_checksum(zOut, lenOut))); OUTCH(";"); return 0; } /* Compute the hash table used to locate matching sections in the source file. */ nHash = lenSrc/NHASH; collide = (unsigned int *)malloc( nHash*2*sizeof(int) ); if(!collide){ return FSL_RC_OOM; } landmark = &collide[nHash]; memset(landmark, -1, nHash*sizeof(int)); memset(collide, -1, nHash*sizeof(int)); for(i=0; i=0 && (limit--)>0 ){ /* The hash window has identified a potential match against landmark block iBlock. But we need to investigate further. Look for a region in zOut that matches zSrc. Anchor the search at zSrc[iSrc] and zOut[base+i]. Do not include anything prior to zOut[base] or after zOut[outLen] nor anything after zSrc[srcLen]. Set cnt equal to the length of the match and set ofst so that zSrc[ofst] is the first element of the match. litsz is the number of characters between zOut[base] and the beginning of the match. sz will be the overhead (in bytes) needed to encode the copy command. Only generate copy command if the overhead of the copy command is less than the amount of literal text to be copied. */ fsl_int_t cnt, ofst, litsz; fsl_int_t j, k, x, y; fsl_int_t sz; /* Beginning at iSrc, match forwards as far as we can. j counts the number of characters that match */ iSrc = iBlock*NHASH; for(j=0, x=iSrc, y=base+i; x<(fsl_int_t)lenSrc && y<(fsl_int_t)lenOut; j++, x++, y++){ if( zSrc[x]!=zOut[y] ) break; } j--; /* Beginning at iSrc-1, match backwards as far as we can. k counts the number of characters that match */ for(k=1; k=sz && cnt>bestCnt ){ /* Remember this match only if it is the best so far and it does not increase the file size */ bestCnt = cnt; bestOfst = iSrc-k; bestLitsz = litsz; DEBUG2( printf("... BEST SO FAR\n"); ) } /* Check the next matching block */ iBlock = collide[iBlock]; } /* We have a copy command that does not cause the delta to be larger than a literal insert. So add the copy command to the delta. */ if( bestCnt>0 ){ if( bestLitsz>0 ){ /* Add an insert command before the copy */ PINT(bestLitsz); OUTCH(":"); OUT(zOut+base, bestLitsz); base += bestLitsz; DEBUG2( printf("insert %d\n", bestLitsz); ) } base += bestCnt; PINT(bestCnt); OUTCH("@"); PINT(bestOfst); DEBUG2( printf("copy %d bytes from %d\n", bestCnt, bestOfst); ) OUTCH(","); if( bestOfst + bestCnt -1 > lastRead ){ lastRead = bestOfst + bestCnt - 1; DEBUG2( printf("lastRead becomes %d\n", lastRead); ) } bestCnt = 0; break; } /* If we reach this point, it means no match is found so far */ if( base+i+NHASH>=lenOut ){ /* We have reached the end of the input and have not found any matches. Do an "insert" for everything that does not match */ PINT(lenOut-base); OUTCH(":"); OUT(zOut+base, lenOut-base); base = lenOut; break; } /* Advance the hash by one character. Keep looking for a match */ fsl_delta_hash_next(&h, zOut[base+i+NHASH]); i++; } } fsl_free(collide); /* Output a final "insert" record to get all the text at the end of the file that does not match anything in the source file. */ if( basemem, increments state->cursor by n, and returns 0. */ static int fsl_output_f_ostring( void * state, void const * src, fsl_size_t n ){ DeltaOutputString * os = (DeltaOutputString*)state; memcpy( os->mem + os->cursor, src, n ); os->cursor += n; return 0; } int fsl_delta_create( unsigned char const *zSrc, fsl_size_t lenSrc, unsigned char const *zOut, fsl_size_t lenOut, unsigned char *zDelta, fsl_size_t * deltaSize){ int rc; DeltaOutputString os; if(!zSrc || !zOut || !zDelta || !deltaSize) return FSL_RC_MISUSE; os.mem = (unsigned char *)zDelta; os.cursor = 0; rc = fsl_delta_create2( zSrc, lenSrc, zOut, lenOut, fsl_output_f_ostring, &os ); if(!rc){ os.mem[os.cursor] = 0; if(deltaSize) *deltaSize = os.cursor; } return rc; } /* Calculates the size (in bytes) of the output from applying a delta. On success 0 is returned and *deltaSize will be updated with the amount of memory required for applying the delta. This routine is provided so that an procedure that is able to call fsl_delta_apply() can learn how much space is required for the output and hence allocate nor more space that is really needed. */ int fsl_delta_applied_size(unsigned char const *zDelta, fsl_size_t lenDelta_, fsl_size_t * deltaSize){ if(!zDelta || (lenDelta_<2) || !deltaSize) return FSL_RC_MISUSE; else{ fsl_size_t size; fsl_int_t lenDelta = (fsl_int_t)lenDelta_; size = fsl_delta_int_get(&zDelta, &lenDelta); if( *zDelta!='\n' ){ /* ERROR: size integer not terminated by "\n" */ return FSL_RC_DELTA_INVALID_TERMINATOR; } *deltaSize = size; return 0; } } int fsl_delta_apply2( unsigned char const *zSrc, /* The source or pattern file */ fsl_size_t lenSrc_, /* Length of the source file */ unsigned char const *zDelta, /* Delta to apply to the pattern */ fsl_size_t lenDelta_, /* Length of the delta */ unsigned char *zOut, /* Write the output into this preallocated buffer */ fsl_error * pErr ){ fsl_size_t limit; fsl_size_t total = 0; #ifndef FSL_OMIT_DELTA_CKSUM_TEST unsigned char *zOrigOut = zOut; #endif /* lenSrc/lenDelta are cast to ints to avoid any potential side-effects caused by changing the function signature from signed to unsigned int types when porting from v1. */ fsl_int_t lenSrc = (fsl_int_t)lenSrc_; fsl_int_t lenDelta = (fsl_int_t)lenDelta_; if(!zSrc || !zDelta || !zOut) return FSL_RC_MISUSE; else if(lenSrc<0 || lenDelta<0) return FSL_RC_RANGE; limit = fsl_delta_int_get(&zDelta, &lenDelta); if( *zDelta!='\n' ){ if(pErr){ fsl_error_set(pErr, FSL_RC_DELTA_INVALID_TERMINATOR, "Delta: size integer not terminated by \\n"); } return FSL_RC_DELTA_INVALID_TERMINATOR; } zDelta++; lenDelta--; while( *zDelta && lenDelta>0 ){ fsl_int_t cnt, ofst; cnt = fsl_delta_int_get(&zDelta, &lenDelta); switch( zDelta[0] ){ case '@': { zDelta++; lenDelta--; ofst = fsl_delta_int_get(&zDelta, &lenDelta); if( lenDelta>0 && zDelta[0]!=',' ){ /* ERROR: copy command not terminated by ',' */ if(pErr){ fsl_error_set(pErr, FSL_RC_DELTA_INVALID_TERMINATOR, "Delta: copy command not terminated by ','"); } return FSL_RC_DELTA_INVALID_TERMINATOR; } zDelta++; lenDelta--; DEBUG1( printf("COPY %d from %d\n", cnt, ofst); ) total += cnt; if( total>limit ){ if(pErr){ fsl_error_set(pErr, FSL_RC_RANGE, "Delta: copy exceeds output file size"); } return FSL_RC_RANGE; } if( ofst+cnt > lenSrc ){ if(pErr){ fsl_error_set(pErr, FSL_RC_RANGE, "Delta: copy extends past end of input"); } return FSL_RC_RANGE; } memcpy(zOut, &zSrc[ofst], cnt); zOut += cnt; break; } case ':': { zDelta++; lenDelta--; total += cnt; if( total>limit ){ if(pErr){ fsl_error_set(pErr, FSL_RC_RANGE, "Delta: insert command gives an output " "larger than predicted"); } return FSL_RC_RANGE; } DEBUG1( printf("INSERT %d\n", cnt); ) if( cnt>lenDelta ){ if(pErr){ fsl_error_set(pErr, FSL_RC_RANGE, "Delta: insert count exceeds size of delta"); } return FSL_RC_RANGE; } memcpy(zOut, zDelta, cnt); zOut += cnt; zDelta += cnt; lenDelta -= cnt; break; } case ';': { zDelta++; lenDelta--; zOut[0] = 0; #ifndef FSL_OMIT_DELTA_CKSUM_TEST if( cnt!=fsl_delta_checksum(zOrigOut, total) ){ if(pErr){ fsl_error_set(pErr, FSL_RC_CHECKSUM_MISMATCH, "Delta: bad checksum"); } return FSL_RC_CHECKSUM_MISMATCH; } #endif if( total!=limit ){ if(pErr){ fsl_error_set(pErr, FSL_RC_SIZE_MISMATCH, "Delta: generated size does not match " "predicted size"); } return FSL_RC_SIZE_MISMATCH; } return 0; } default: { if(pErr){ fsl_error_set(pErr, FSL_RC_DELTA_INVALID_OPERATOR, "Delta: unknown delta operator"); } return FSL_RC_DELTA_INVALID_OPERATOR; } } } /* ERROR: unterminated delta */ if(pErr){ fsl_error_set(pErr, FSL_RC_DELTA_INVALID_TERMINATOR, "Delta: unterminated delta"); } return FSL_RC_DELTA_INVALID_TERMINATOR; } int fsl_delta_apply( unsigned char const *zSrc, /* The source or pattern file */ fsl_size_t lenSrc_, /* Length of the source file */ unsigned char const *zDelta, /* Delta to apply to the pattern */ fsl_size_t lenDelta_, /* Length of the delta */ unsigned char *zOut /* Write the output into this preallocated buffer */ ){ return fsl_delta_apply2(zSrc, lenSrc_, zDelta, lenDelta_, zOut, NULL); } #undef NHASH #undef DEBUG1 #undef DEBUG2