Index: src/checkin.c
==================================================================
--- src/checkin.c
+++ src/checkin.c
@@ -882,46 +882,51 @@
   if( pnFBcard ) *pnFBcard = nFBcard;
 }
 
 /*
 ** Issue a warning and give the user an opportunity to abandon out
-** if unicode or a \r\n line ending is seen in a text file.
+** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending
+** is seen in a text file.
 */
-static void encoding_warning(const Blob *p, int crnlOk, const char *zFilename){
-  int looksLike;          /* return value of looks_like_text() */
+static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){
+  int eType;              /* return value of looks_like_text() */
+  int fUnicode;           /* return value of starts_with_utf16_bom() */
   char *zMsg;             /* Warning message */
   Blob fname;             /* Relative pathname of the file */
   static int allOk = 0;   /* Set to true to disable this routine */
 
   if( allOk ) return;
-  looksLike = looks_like_text(p);
-  if( looksLike<0 ){
-    const char *type;
+  eType = looks_like_text(p);
+  fUnicode = starts_with_utf16_bom(p);
+  if( eType==-1 || fUnicode ){
+    const char *zWarning;
     Blob ans;
     char cReply;
 
-    if( looksLike&1 ){
+    if( eType==-1 && fUnicode ){
+      zWarning = "Unicode and CR/NL line endings";
+    }else if( eType==-1 ){
       if( crnlOk ){
-        return; /* We don't want CrLf warnings for this file. */
+        return; /* We don't want CR/NL warnings for this file. */
       }
-      type = "CR/NL line endings";
+      zWarning = "CR/NL line endings";
     }else{
-      type = "unicode";
+      zWarning = "Unicode";
     }
     file_relative_name(zFilename, &fname, 0);
     blob_zero(&ans);
     zMsg = mprintf(
          "%s contains %s; commit anyhow (a=all/y/N)?",
-         blob_str(&fname), type);
+         blob_str(&fname), zWarning);
     prompt_user(zMsg, &ans);
     fossil_free(zMsg);
     cReply = blob_str(&ans)[0];
     if( cReply=='a' || cReply=='A' ){
       allOk = 1;
     }else if( cReply!='y' && cReply!='Y' ){
       fossil_fatal("Abandoning commit due to %s in %s",
-                   type, blob_str(&fname));
+                   zWarning, blob_str(&fname));
     }
     blob_reset(&ans);
     blob_reset(&fname);
   }
 }
@@ -1232,11 +1237,11 @@
       /* Instead of file content, put link destination path */
       blob_read_link(&content, zFullname);
     }else{
       blob_read_from_file(&content, zFullname);
     }
-    encoding_warning(&content, crnlOk, zFullname);
+    commit_warning(&content, crnlOk, zFullname);
     if( chnged==1 && contains_merge_marker(&content) ){
       Blob fname; /* Relative pathname of the file */
 
       nConflict++;
       file_relative_name(zFullname, &fname, 0);

Index: src/diff.c
==================================================================
--- src/diff.c
+++ src/diff.c
@@ -48,11 +48,11 @@
     "cannot compute difference between binary files\n"
 
 #define DIFF_CANNOT_COMPUTE_SYMLINK \
     "cannot compute difference between symlink and regular file\n"
 
-#define looks_like_binary(blob) ((looks_like_text(blob)&1) == 0)
+#define looks_like_binary(blob) (looks_like_text((blob)) == 0)
 #endif /* INTERFACE */
 
 /*
 ** Maximum length of a line in a text file.  (8192)
 */
@@ -170,41 +170,46 @@
   *pnLine = nLine;
   return a;
 }
 
 /*
-** Returns 1, if everything OK
-** Returns 0 if the specified content appears to be binary or
-**   contains a line that is too long
-** Returns -1, if the file appears text, but it contains CrLf
-** Returns -2, if the file starts with an UTF-16 BOM (le or be)
+** This function attempts to scan each logical line within the blob to
+** determine the type of content it appears to contain.  Possible return
+** values are:
+**
+**  (1) -- The content appears to consist entirely of text, with lines
+**         delimited by line-feed characters; however, the encoding may
+**         not be UTF-8.
+**
+**  (0) -- The content appears to be binary because it contains embedded
+**         NUL (\000) characters or an extremely long line.  Since this
+**         function does not understand UTF-16, it may falsely consider
+**         UTF-16 text to be binary.
+**
+** (-1) -- The content appears to consist entirely of text, with lines
+**         delimited by carriage-return, line-feed pairs; however, the
+**         encoding may not be UTF-8.
+**
 */
 int looks_like_text(const Blob *pContent){
   const char *z = blob_buffer(pContent);
   unsigned int n = blob_size(pContent);
   int j, c;
-  int result = 1;  /* Assume text with no CrLf */
+  int result = 1;  /* Assume text with no CR/NL */
 
   /* Check individual lines.
   */
   if( n==0 ) return result;  /* Empty file -> text */
   c = *z;
   if( c==0 ) return 0;  /* \000 byte in a file -> binary */
-  if ( n > 1 ){
-    if ( (c==(char)0xff) && (z[1]==(char)0xfe) ){
-      return -2;
-    } else if ( (c==(char)0xfe) && (z[1]==(char)0xff) ){
-      return -2;
-    }
-  }
   j = (c!='\n');
   while( --n>0 ){
     c = *++z; ++j;
     if( c==0 ) return 0;  /* \000 byte in a file -> binary */
     if( c=='\n' ){
       if( z[-1]=='\r' ){
-        result = -1;  /* Contains CrLf, continue */
+        result = -1;  /* Contains CR/NL, continue */
       }
       if( j>LENGTH_MASK ){
         return 0;  /* Very long line -> binary */
       }
       j = 0;
@@ -213,10 +218,28 @@
   if( j>LENGTH_MASK ){
     return 0;  /* Very long line -> binary */
   }
   return result;  /* No problems seen -> not binary */
 }
+
+/*
+** This function returns non-zero if the blob starts with a UTF-16le or
+** UTF-16be byte-order-mark (BOM).
+*/
+int starts_with_utf16_bom(const Blob *pContent){
+  const char *z = blob_buffer(pContent);
+  int c1, c2;
+
+  if( blob_size(pContent)<2 ) return 0;  /* Too short to hold a BOM */
+  c1 = z[0]; c2 = z[1];
+  if( (c1==(char)0xff) && (c2==(char)0xfe) ){
+    return 1;  /* 0xff 0xfe -> UTF-16le BOM */
+  }else if( (c1==(char)0xfe) && (c2==(char)0xff) ){
+    return 1;  /* 0xfe 0xff -> UTF-16be BOM */
+  }
+  return 0;
+}
 
 /*
 ** Return true if two DLine elements are identical.
 */
 static int same_dline(DLine *pA, DLine *pB){