/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=2 et sw=2 tw=80: */
/*
Copyright 2021 The Libfossil Authors, see LICENSES/BSD-2-Clause.txt
SPDX-License-Identifier: BSD-2-Clause-FreeBSD
SPDX-FileCopyrightText: 2021 The Libfossil Authors
SPDX-ArtifactOfProjectName: Libfossil
SPDX-FileType: Code
Heavily indebted to the Fossil SCM project (https://fossil-scm.org).
*/
#include "libfossil.h"
#include "fossil-scm/internal.h"
#include <string.h> /* memcmp() */
/*
 * Definitions for the continuation-byte ranges that may follow each UTF-8
 * lead byte. Every definition expands to a (start, size) pair: the byte
 * following the lead byte is accepted only if it lies in the half-open
 * range [start, start+size).
 */
#define US2A 0x80, 0x01 /* for lead byte 0xC0 */
#define US2B 0x80, 0x40 /* for lead bytes 0xC2-0xDF */
#define US3A 0xA0, 0x20 /* for lead byte 0xE0 */
#define US3B 0x80, 0x40 /* for lead bytes 0xE1-0xEF */
#define US4A 0x90, 0x30 /* for lead byte 0xF0 */
#define US4B 0x80, 0x40 /* for lead bytes 0xF1-0xF3 */
#define US4C 0x80, 0x10 /* for lead byte 0xF4 */
#define US0A 0x00, 0x00 /* for any other lead byte */
/*
 * Lookup table mapping a lead byte in the range 0x80-0xFF to its
 * (start, size) pair. It holds 128 pairs (256 bytes). fsl_invalid_utf8()
 * indexes it with the lead byte shifted left by one bit and truncated to
 * 8 bits, i.e. with (lead & 0x7F) * 2, then rejects the following byte
 * when (unsigned)(next - start) >= size. A size of 0 (US0A) therefore
 * rejects every possible following byte.
 */
static const unsigned char lb_tab[] = {
US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* lead bytes 0x80-0x87 */
US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* lead bytes 0x88-0x8F */
US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* lead bytes 0x90-0x97 */
US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* lead bytes 0x98-0x9F */
US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* lead bytes 0xA0-0xA7 */
US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* lead bytes 0xA8-0xAF */
US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* lead bytes 0xB0-0xB7 */
US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* lead bytes 0xB8-0xBF */
US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, /* lead bytes 0xC0-0xC7 */
US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, /* lead bytes 0xC8-0xCF */
US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, /* lead bytes 0xD0-0xD7 */
US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, /* lead bytes 0xD8-0xDF */
US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, /* lead bytes 0xE0-0xE7 */
US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, /* lead bytes 0xE8-0xEF */
US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A, /* lead bytes 0xF0-0xF7 */
US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A  /* lead bytes 0xF8-0xFF */
};
/* The range macros are only needed to build the table above. */
#undef US2A
#undef US2B
#undef US3A
#undef US3B
#undef US4A
#undef US4B
#undef US4C
#undef US0A
/*
 * Returns true if scanning b with fsl_looks_like_utf8() raises any of the
 * flags in the FSL_LOOKSLIKE_BINARY mask, else false. The same mask is
 * passed as the stop-flags so that the scan ends as soon as one of those
 * flags is seen.
 */
bool fsl_looks_like_binary(fsl_buffer const * const b){
  int const seen = fsl_looks_like_utf8(b, FSL_LOOKSLIKE_BINARY);
  return FSL_LOOKSLIKE_NONE != (seen & FSL_LOOKSLIKE_BINARY);
}
/*
 * Scans the bytes of b and returns a bitmask of FSL_LOOKSLIKE_xxx flags
 * describing what was seen: NUL bytes, CR, LF, CR/LF pairs, lone CR or LF
 * and lines longer than FSL__LINE_LENGTH_MASK bytes. Empty content yields
 * FSL_LOOKSLIKE_NONE.
 *
 * The scan stops as soon as any flag listed in stopFlags has been raised.
 * FSL_LOOKSLIKE_SHORT is added whenever the scan ends that way; the byte
 * which triggered the stop is still counted as unexamined at that point.
 */
int fsl_looks_like_utf8(fsl_buffer const * const b, int stopFlags){
  fsl_size_t nLeft = 0;       /* bytes left, including the current one */
  const char *pos = fsl_buffer_cstr2(b, &nLeft);
  int rc = FSL_LOOKSLIKE_NONE; /* Assume UTF-8 text, prove otherwise */
  int lineLen = 1;            /* bytes seen since the last LF */
  int ch;                     /* byte currently being classified */

  if( 0==nLeft ){
    return rc;                /* Empty file -> text */
  }

  /* Classify the first byte, which has no predecessor. */
  ch = *pos;
  switch( ch ){
    case 0:
      rc |= FSL_LOOKSLIKE_NUL; /* NUL character in a file -> binary */
      break;
    case '\r':
      rc |= FSL_LOOKSLIKE_CR;
      if( nLeft<=1 || pos[1]!='\n' ){
        /* Not enough chars or next char not LF */
        rc |= FSL_LOOKSLIKE_LONE_CR;
      }
      break;
    case '\n':
      /* Found LF as first char */
      rc |= (FSL_LOOKSLIKE_LF | FSL_LOOKSLIKE_LONE_LF);
      lineLen = 0;
      break;
    default:
      break;
  }

  /* Classify every following byte, remembering its predecessor. */
  for(;;){
    int prev;
    if( rc & stopFlags ){
      break;                  /* caller asked to stop on one of these */
    }
    if( !(--nLeft>0) ){
      break;                  /* input exhausted */
    }
    prev = ch;
    ch = *++pos;
    ++lineLen;
    switch( ch ){
      case 0:
        rc |= FSL_LOOKSLIKE_NUL; /* NUL character in a file -> binary */
        break;
      case '\n':
        rc |= FSL_LOOKSLIKE_LF;
        if( '\r'==prev ){
          /* Found LF preceded by CR */
          rc |= (FSL_LOOKSLIKE_CR | FSL_LOOKSLIKE_CRLF);
        }else{
          rc |= FSL_LOOKSLIKE_LONE_LF;
        }
        if( lineLen>FSL__LINE_LENGTH_MASK ){
          rc |= FSL_LOOKSLIKE_LONG; /* Very long line -> binary */
        }
        lineLen = 0;
        break;
      case '\r':
        rc |= FSL_LOOKSLIKE_CR;
        if( nLeft<=1 || pos[1]!='\n' ){
          /* Not enough chars or next char not LF */
          rc |= FSL_LOOKSLIKE_LONE_CR;
        }
        break;
      default:
        break;
    }
  }

  if( nLeft ){
    rc |= FSL_LOOKSLIKE_SHORT; /* The whole blob was not examined */
  }
  if( lineLen>FSL__LINE_LENGTH_MASK ){
    rc |= FSL_LOOKSLIKE_LONG;  /* Very long (unterminated) line -> binary */
  }
  return rc;
}
/*
 * Returns a pointer to a static copy of the UTF-8 byte-order mark
 * (0xEF 0xBB 0xBF), which is followed in memory by three zero bytes.
 * If pnByte is not NULL, *pnByte is set to the length of the mark in
 * bytes (3), not counting the trailing zeros.
 */
unsigned char const *fsl_utf8_bom(unsigned int *pnByte){
  enum { kBomLength = 3 };
  static const unsigned char utf8Bom[6] = {
    0xef, 0xbb, 0xbf, 0x00, 0x00, 0x00
  };
  if( pnByte ){
    *pnByte = kBomLength;
  }
  return &utf8Bom[0];
}
/*
 * Returns true if the content of b begins with the UTF-8 byte-order mark
 * reported by fsl_utf8_bom(), else false. If pBomSize is not NULL,
 * *pBomSize is set to the length of that mark regardless of the result.
 */
bool fsl_starts_with_bom_utf8(fsl_buffer const * const b,
                              unsigned int *pBomSize){
  unsigned int markLen;
  const char * const content = fsl_buffer_cstr(b);
  const unsigned char * const mark = fsl_utf8_bom(&markLen);
  if( pBomSize ){
    *pBomSize = markLen;
  }
  if( fsl_buffer_size(b)<markLen ){
    /* Too short to hold the mark; do not look at the bytes at all. */
    return false;
  }
  return 0==memcmp(content, mark, markLen);
}
/*
 * Returns true if the content of b is NOT acceptable as UTF-8, else false.
 * Empty content is treated as acceptable.
 *
 * Each byte >= 0x80 is treated as a lead byte and the byte following it
 * is checked against the (start, size) range stored for that lead byte in
 * lb_tab. Lead bytes of 3- and 4-byte sequences are then replaced by a
 * synthetic lead byte of the next-shorter sequence so that the remaining
 * continuation bytes get checked the same way. Per lb_tab, the two-byte
 * form 0xC0 0x80 is accepted.
 */
bool fsl_invalid_utf8(fsl_buffer const * const b){
  fsl_size_t n = 0;
  const unsigned char *z = (const unsigned char *) fsl_buffer_cstr2(b, &n);
  unsigned char c; /* lead byte to be handled. */
  if( n==0 ) return false;  /* Empty file -> OK */
  c = *z;
  while( --n>0 ){
    if( c>=0x80 ){
      const unsigned char *def; /* pointer to range table */
      /* Multiply by 2 and get rid of the highest bit: every lead byte
      ** owns two consecutive bytes in lb_tab. */
      c = (unsigned char)(c << 1);
      def = &lb_tab[c]; /* look up the valid range for this lead byte */
      if( (unsigned int)(*++z-def[0])>=def[1] ){
        /* The following byte lies outside the permitted range. */
        return true;
      }
      /* Determine next lead byte. After the shift, c>=0xC0 means the
      ** original lead byte was >=0xE0, i.e. more continuation bytes are
      ** pending: (c|3) is a lead byte of the next-shorter sequence.
      ** Otherwise the sequence is complete and the byte just consumed
      ** is stood in for by an ASCII placeholder. */
      c = (c>=0xC0) ? (c|3) : ' ';
    } else {
      c = *++z;
    }
  }
  /* The final pending lead byte must be ASCII; anything else is a lone
  ** high byte or a truncated multi-byte sequence. */
  return c>=0x80;
}