Login
lookslike.c at [6ecdbab284]
Login

File src/lookslike.c artifact e92d49b638 part of check-in 6ecdbab284


/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 
/* vim: set ts=2 et sw=2 tw=80: */
/*
  Copyright 2021 The Libfossil Authors, see LICENSES/BSD-2-Clause.txt

  SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  SPDX-FileCopyrightText: 2021 The Libfossil Authors
  SPDX-ArtifactOfProjectName: Libfossil
  SPDX-FileType: Code

  Heavily indebted to the Fossil SCM project (https://fossil-scm.org).
*/

#include "libfossil.h"
#include "fossil-scm/internal.h"
#include <string.h> /* memcmp() */


/* Definitions for the UTF-8 lead-byte classes. Each macro expands to
 * TWO table bytes: the lowest value permitted for the byte which
 * follows the lead byte, and the number of consecutive values
 * permitted starting from there. A size of 0x00 means that no
 * following byte is acceptable, i.e. the lead byte itself is invalid. */
#define US2A  0x80, 0x01 /* for lead byte 0xC0: only 0x80 may follow (two-byte form of NUL) */
#define US2B  0x80, 0x40 /* for lead bytes 0xC2-0xDF: 0x80-0xBF may follow */
#define US3A  0xA0, 0x20 /* for lead byte 0xE0: 0xA0-0xBF may follow */
#define US3B  0x80, 0x40 /* for lead bytes 0xE1-0xEF: 0x80-0xBF may follow */
#define US4A  0x90, 0x30 /* for lead byte 0xF0: 0x90-0xBF may follow */
#define US4B  0x80, 0x40 /* for lead bytes 0xF1-0xF3: 0x80-0xBF may follow */
#define US4C  0x80, 0x10 /* for lead byte 0xF4: 0x80-0x8F may follow */
#define US0A  0x00, 0x00 /* for any other lead byte: nothing may follow */

/* A table used for quick lookup of the definition that goes with a
 * particular lead byte. It holds one 2-byte pair for each byte value
 * 0x80..0xFF (128 pairs, 256 bytes). fsl_invalid_utf8() indexes it
 * with the lead byte shifted left by one bit and truncated to 8 bits,
 * which is the same as 2*(lead-0x80), so the pair for lead byte L
 * starts at lb_tab[2*(L-0x80)]. Each row below covers 8 lead bytes. */
static const unsigned char lb_tab[] = {
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* 0x80-0x87 */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* 0x88-0x8F */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* 0x90-0x97 */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* 0x98-0x9F */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* 0xA0-0xA7 */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* 0xA8-0xAF */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* 0xB0-0xB7 */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* 0xB8-0xBF */
  US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, /* 0xC0-0xC7 */
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, /* 0xC8-0xCF */
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, /* 0xD0-0xD7 */
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, /* 0xD8-0xDF */
  US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, /* 0xE0-0xE7 */
  US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, /* 0xE8-0xEF */
  US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A, /* 0xF0-0xF7 */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A  /* 0xF8-0xFF */
};

/* The range macros are only needed to build lb_tab. */
#undef US2A
#undef US2B
#undef US3A
#undef US3B
#undef US4A
#undef US4B
#undef US4C
#undef US0A

/*
  Returns true if scanning b with fsl_looks_like_utf8(), using
  FSL_LOOKSLIKE_BINARY as the stop mask, reports any of the flags
  contained in FSL_LOOKSLIKE_BINARY. Otherwise returns false.
*/
bool fsl_looks_like_binary(fsl_buffer const * const b){
  int const scanFlags = fsl_looks_like_utf8(b, FSL_LOOKSLIKE_BINARY);
  int const binaryBits = scanFlags & FSL_LOOKSLIKE_BINARY;
  return FSL_LOOKSLIKE_NONE != binaryBits;
}

/*
  Scans the content of b byte by byte and returns a bitmask of
  FSL_LOOKSLIKE_xxx flags describing what was seen:

  - FSL_LOOKSLIKE_NUL: a NUL byte was found.
  - FSL_LOOKSLIKE_CR / FSL_LOOKSLIKE_LONE_CR: a CR was found / a CR
    which is not immediately followed by LF was found.
  - FSL_LOOKSLIKE_LF / FSL_LOOKSLIKE_LONE_LF: an LF was found / an LF
    which is not immediately preceded by CR was found.
  - FSL_LOOKSLIKE_CRLF: a CR LF pair was found.
  - FSL_LOOKSLIKE_LONG: some line is longer than
    FSL__LINE_LENGTH_MASK bytes.
  - FSL_LOOKSLIKE_SHORT: the scan was ended by stopFlags rather than
    by running out of input.

  The scan stops as soon as any flag contained in stopFlags has been
  set. An empty buffer yields FSL_LOOKSLIKE_NONE.

  Note that this routine only inspects NUL bytes, line endings and
  line lengths. It does not validate multi-byte UTF-8 sequences.
*/
int fsl_looks_like_utf8(fsl_buffer const * const b, int stopFlags){
  fsl_size_t n = 0;
  const char *z = fsl_buffer_cstr2(b, &n);
  int j, c, flags = FSL_LOOKSLIKE_NONE;  /* Assume UTF-8 text, prove otherwise */

  if( n==0 ) return flags;  /* Empty file -> text */
  c = *z;
  if( c==0 ){
    flags |= FSL_LOOKSLIKE_NUL;  /* NUL character in a file -> binary */
  }else if( c=='\r' ){
    flags |= FSL_LOOKSLIKE_CR;
    if( n<=1 || z[1]!='\n' ){
      flags |= FSL_LOOKSLIKE_LONE_CR;  /* Not enough chars or next char not LF */
    }
  }
  /* j counts the bytes of the current line. It starts at 1 unless the
  ** very first byte already terminates a line. */
  j = (c!='\n');
  if( !j ) flags |= (FSL_LOOKSLIKE_LF | FSL_LOOKSLIKE_LONE_LF);  /* Found LF as first char */
  while( !(flags&stopFlags) && --n>0 ){
    int c2 = c; /* previous byte, needed to tell CRLF from a lone LF */
    c = *++z;
    /* Saturate the line-length counter one past the limit. It is only
    ** ever compared against FSL__LINE_LENGTH_MASK, so results do not
    ** change, but a multi-gigabyte line can no longer overflow the
    ** signed int (which would be undefined behaviour). */
    if( j<=FSL__LINE_LENGTH_MASK ) ++j;
    if( c==0 ){
      flags |= FSL_LOOKSLIKE_NUL;  /* NUL character in a file -> binary */
    }else if( c=='\n' ){
      flags |= FSL_LOOKSLIKE_LF;
      if( c2=='\r' ){
        flags |= (FSL_LOOKSLIKE_CR | FSL_LOOKSLIKE_CRLF);  /* Found LF preceded by CR */
      }else{
        flags |= FSL_LOOKSLIKE_LONE_LF;
      }
      if( j>FSL__LINE_LENGTH_MASK ){
        flags |= FSL_LOOKSLIKE_LONG;  /* Very long line -> binary */
      }
      j = 0;
    }else if( c=='\r' ){
      flags |= FSL_LOOKSLIKE_CR;
      /* Here n counts the current byte plus everything after it, so
      ** n<=1 means that this CR is the last byte of the buffer. */
      if( n<=1 || z[1]!='\n' ){
        flags |= FSL_LOOKSLIKE_LONE_CR;  /* Not enough chars or next char not LF */
      }
    }
  }
  if( n ){
    /* n only remains non-zero when the loop was left because of
    ** stopFlags. */
    flags |= FSL_LOOKSLIKE_SHORT;  /* The whole blob was not examined */
  }
  if( j>FSL__LINE_LENGTH_MASK ){
    /* Covers a trailing line which has no terminating LF. */
    flags |= FSL_LOOKSLIKE_LONG;  /* Very long line -> binary */
  }
  return flags;
}

/*
  Returns a pointer to a static, read-only buffer which starts with
  the 3-byte UTF-8 byte-order mark (0xEF 0xBB 0xBF) and is padded with
  three NUL bytes. If pnByte is not a null pointer then *pnByte is set
  to the length of the mark, in bytes (3).
*/
unsigned char const *fsl_utf8_bom(unsigned int *pnByte){
  enum { BomLength = 3, PadLength = 3 };
  static const unsigned char marker[BomLength + PadLength] = {
    0xef, 0xbb, 0xbf /* the padding bytes are implicitly zero */
  };
  if( pnByte ){
    *pnByte = BomLength;
  }
  return marker;
}

/*
  Returns true if the content of b begins with the UTF-8 byte-order
  mark as reported by fsl_utf8_bom(), else false. A buffer which is
  shorter than the mark never matches. If pBomSize is not a null
  pointer then *pBomSize is set to the mark's length in bytes,
  regardless of the result.
*/
bool fsl_starts_with_bom_utf8(fsl_buffer const * const b,
                              unsigned int *pBomSize){
  unsigned int nMark;
  char const * const zContent = fsl_buffer_cstr(b);
  unsigned char const * const zMark = fsl_utf8_bom(&nMark);
  if( pBomSize ){
    *pBomSize = nMark;
  }
  if( fsl_buffer_size(b)<nMark ){
    /* Too short to hold the mark: do not touch zContent at all. */
    return false;
  }
  return 0==memcmp(zContent, zMark, nMark);
}

/*
  Returns true if the content of b is NOT well-formed UTF-8 according
  to the lead-byte table lb_tab, else returns false. An empty buffer
  is considered valid, so it yields false.

  The check walks the buffer one byte at a time. For every byte >=0x80
  it looks up, in lb_tab, the range of values permitted for the byte
  which follows and fails if the next byte is outside of that range.
  The buffer must also not end in the middle of a multi-byte sequence.

  As encoded in lb_tab: the two-byte sequence 0xC0 0x80 is accepted,
  and lead byte 0xED accepts the full 0x80-0xBF range for its first
  trail byte (so encoded surrogates are not rejected).

  NOTE(review): earlier revisions of this function returned false on a
  malformed sequence and true for well-formed non-empty input, which
  contradicted both the function's name and its empty-buffer result.
  Callers written against that inverted result need to be re-checked.
*/
bool fsl_invalid_utf8(fsl_buffer const * const b){
  fsl_size_t n = 0;
  const unsigned char *z = (const unsigned char *) fsl_buffer_cstr2(b, &n);
  unsigned char c; /* lead byte to be handled. */
  if( n==0 ) return false;  /* Empty file -> OK */
  c = *z;
  while( --n>0 ){
    if( c>=0x80 ){
      const unsigned char *def; /* pointer to range table*/
      c <<= 1; /* multiply by 2 and get rid of highest bit */
      def = &lb_tab[c]; /* look up the valid range of the next byte */
      /* The unsigned cast turns "below the range start" into a huge
      ** value, so a single comparison checks both range ends. */
      if( (unsigned int)(*++z-def[0])>=def[1] ){
        return true; /* Trail byte out of range -> invalid UTF-8 */
      }
      /* c holds the shifted lead byte here. If it is still >=0xC0
      ** then more trail bytes are expected: c|3 is a pseudo lead byte
      ** for a sequence which is one byte shorter. Otherwise the
      ** sequence is complete and an ASCII placeholder makes the next
      ** iteration fetch a fresh byte. */
      c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */
    } else {
      c = *++z;
    }
  }
  /* Final lead byte must be ASCII: anything else means the input ended
  ** with a stray trail byte or in the middle of a sequence. */
  return c>=0x80;
}