Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Further coding style improvements for the new invalid_utf8() function. Also fixes several C99-isms.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | invalid_utf8_table
Files: files | file ages | folders
SHA1:2fb7d59beed17f94613e3108de7460681d0e0a1c
User & Date: mistachkin 2016-06-11 05:23:21
Original Comment: Further coding style improvements for the new invalid_utf8() function.
Context
2016-06-14
05:00
merged from trunk check-in: d22c72bc user: sdr tags: invalid_utf8_table
2016-06-11
05:23
Further coding style improvements for the new invalid_utf8() function. Also fixes several C99-isms. check-in: 2fb7d59b user: mistachkin tags: invalid_utf8_table
00:41
reformatted invalid_utf8 to make it conform a bit better to existing style check-in: dd3bb22c user: sdr tags: invalid_utf8_table
Changes

Changes to src/lookslike.c.

130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147

148

149
150


151



152


153


154


155


156

157
158

159

160
161

162
163
164
165
166
167
168
169
170



171


172
173

174
175
176
177
178

179
180
181
182
183
184
185
186
187
188
189

190
191

192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213

214
215
216

217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
...
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
  }
  if( j>LENGTH_MASK ){
    flags |= LOOK_LONG;  /* Very long line -> binary */
  }
  return flags;
}


/*
** Scan the content of a blob and report whether it contains a byte
** sequence that is not acceptable UTF-8.  The rules follow
**   http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
** with two deliberate relaxations:
**
**   *  The two-byte "overlong" encoding of \u0000 (0xC0 0x80, a.k.a.
**      Modified UTF-8) is accepted, because languages such as Java
**      and Tcl produce it.
**
**   *  The CESU-8 and WTF-8 derivatives described in the same article
**      are accepted as well.
**
** Returns LOOK_INVALID as soon as a bad or truncated sequence is found,
** or 0 if the whole buffer was scanned without finding one.
*/
int invalid_utf8(const Blob *pContent){
  /*
  ** One row per family of multi-byte sequences.  Row layout:
  **   [0]        total sequence length in bytes
  **   [1],[2]    lowest and highest lead byte using this row
  **   [3],[4]    allowed range for the 1st trailing byte
  **   [5],[6]    allowed range for the 2nd trailing byte (if any)
  **   [7],[8]    allowed range for the 3rd trailing byte (if any)
  */
  static unsigned char aSeqC0[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
  static unsigned char aSeq2[]  = { 2, 0xC2, 0xDF, 0x80, 0xBF };
  static unsigned char aSeqE0[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
  static unsigned char aSeq3[]  = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
  static unsigned char aSeqF0[] = {
    4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
  };
  static unsigned char aSeq4[]  = {
    4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
  };
  static unsigned char aSeqF4[] = {
    4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
  };

  /* NULL-terminated list of every row above */
  static unsigned char *apSeq[] = {
    aSeqC0, aSeq2, aSeqE0, aSeq3, aSeqF0, aSeq4, aSeqF4, NULL
  };

  /* Lead byte -> row.  Entries left NULL are bytes that may never
  ** start a sequence.  Filled in on the first call only. */
  static unsigned char *apLead[256] = { NULL };
  static int leadReady = 0;

  const unsigned char *zCur;   /* next unread byte of the content */
  unsigned int nLeft;          /* number of bytes not yet consumed */

  if( !leadReady ){
    unsigned int iSeq;
    leadReady = 1;
    for(iSeq=0; apSeq[iSeq]!=NULL; ++iSeq){
      unsigned char *aRow = apSeq[iSeq];
      unsigned int b;
      for(b=aRow[1]; b<=aRow[2]; ++b){
        apLead[b] = aRow;
      }
    }
  }

  zCur = (unsigned char *)blob_buffer(pContent);
  nLeft = blob_size(pContent);

  while( nLeft>0 ){
    unsigned char lead = *zCur++;
    unsigned char *aRow;
    unsigned int nSeq, k;

    /* Plain ASCII needs no further checking */
    if( lead<0x80 ){
      --nLeft;
      continue;
    }

    /* A lead byte with no row can never begin a sequence */
    aRow = apLead[lead];
    if( aRow==NULL ) return LOOK_INVALID;

    /* The buffer must hold the complete sequence, lead byte included */
    nSeq = aRow[0];
    if( nLeft<nSeq ) return LOOK_INVALID;

    /* Trailing byte k is bounded by aRow[2k+1] .. aRow[2k+2] */
    for(k=1; k<nSeq; ++k){
      unsigned char c = zCur[k-1];
      if( c<aRow[2*k+1] || c>aRow[2*k+2] ){
        return LOOK_INVALID;
      }
    }

    /* Step over the trailing bytes; the lead byte is already consumed */
    zCur += nSeq-1;
    nLeft -= nSeq;
  }

  /* Reached the end of the buffer without finding anything invalid */
  return 0;
}


/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32
#    define WCHAR_T wchar_t
................................................................................
      fUnicode = 0;
    }else{
      fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
    }
    if( fUnicode ){
      lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
    }else{
      lookFlags = looks_like_utf8(&blob, 0)|invalid_utf8(&blob);
    }
  }
  fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
  fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
  fossil_print("Starts with UTF-16 BOM: %s\n",
               fUtf16?(bRevUtf16?"reversed":"yes"):"no");
  fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",







<










>
|
>
|
|
>
>
|
>
>
>
|
>
>
|
>
>
|
>
>
|
>
>
|
>


>
|
>

|
>





|
|
|

>
>
>
|
>
>
|
<
>



|

>



<
<
|
|
<

<
<
>

<
>


|





|
<


<

|
<


<

|
|
>
|

|
>




<

|

<







 







|







130
131
132
133
134
135
136

137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195

196
197
198
199
200
201
202
203
204
205


206
207

208


209
210

211
212
213
214
215
216
217
218
219
220

221
222

223
224

225
226

227
228
229
230
231
232
233
234
235
236
237
238

239
240
241

242
243
244
245
246
247
248
...
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
  }
  if( j>LENGTH_MASK ){
    flags |= LOOK_LONG;  /* Very long line -> binary */
  }
  return flags;
}


/*
** Scan the content of a blob and report whether it contains a byte
** sequence that is not acceptable UTF-8.  The rules follow
**   http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
** with two deliberate relaxations:
**
**   *  The two-byte "overlong" encoding of \u0000 (0xC0 0x80, a.k.a.
**      Modified UTF-8) is accepted, because languages such as Java
**      and Tcl produce it.
**
**   *  The CESU-8 and WTF-8 derivatives described in the same article
**      are accepted as well.
**
** Returns LOOK_INVALID as soon as a bad or truncated sequence is found,
** or LOOK_NONE if the whole buffer was scanned without finding one.
*/
int invalid_utf8(const Blob *pContent){
  /*
  ** One row per family of multi-byte sequences.  Row layout:
  **   [0]        total sequence length in bytes
  **   [1],[2]    lowest and highest lead byte using this row
  **   [3],[4]    allowed range for the 1st trailing byte
  **   [5],[6]    allowed range for the 2nd trailing byte (if any)
  **   [7],[8]    allowed range for the 3rd trailing byte (if any)
  */
  static unsigned char aSeqC0[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
  static unsigned char aSeq2[]  = { 2, 0xC2, 0xDF, 0x80, 0xBF };
  static unsigned char aSeqE0[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
  static unsigned char aSeq3[]  = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
  static unsigned char aSeqF0[] = {
    4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
  };
  static unsigned char aSeq4[]  = {
    4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
  };
  static unsigned char aSeqF4[] = {
    4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
  };

  /* NULL-terminated list of every row above */
  static unsigned char *apSeq[] = {
    aSeqC0, aSeq2, aSeqE0, aSeq3, aSeqF0, aSeq4, aSeqF4, NULL
  };

  /* Lead byte -> row.  Entries left NULL are bytes that may never
  ** start a sequence.  Filled in on the first call only. */
  static unsigned char *apLead[256] = { NULL };
  static int leadReady = 0;

  const unsigned char *zCur;   /* next unread byte of the content */
  unsigned int nLeft;          /* number of bytes not yet consumed */

  if( !leadReady ){
    unsigned int iSeq;
    leadReady = 1;
    for(iSeq=0; apSeq[iSeq]!=NULL; ++iSeq){
      unsigned char *aRow = apSeq[iSeq];
      unsigned int b;
      for(b=aRow[1]; b<=aRow[2]; ++b){
        apLead[b] = aRow;
      }
    }
  }

  zCur = (unsigned char *)blob_buffer(pContent);
  nLeft = blob_size(pContent);

  while( nLeft>0 ){
    unsigned char lead = *zCur++;
    unsigned char *aRow;
    unsigned int nSeq, k;

    /* Plain ASCII needs no further checking */
    if( lead<0x80 ){
      --nLeft;
      continue;
    }

    /* A lead byte with no row can never begin a sequence */
    aRow = apLead[lead];
    if( aRow==NULL ) return LOOK_INVALID;

    /* The buffer must hold the complete sequence, lead byte included */
    nSeq = aRow[0];
    if( nLeft<nSeq ) return LOOK_INVALID;

    /* Trailing byte k is bounded by aRow[2k+1] .. aRow[2k+2] */
    for(k=1; k<nSeq; ++k){
      unsigned char c = zCur[k-1];
      if( c<aRow[2*k+1] || c>aRow[2*k+2] ){
        return LOOK_INVALID;
      }
    }

    /* Step over the trailing bytes; the lead byte is already consumed */
    zCur += nSeq-1;
    nLeft -= nSeq;
  }

  /* Reached the end of the buffer without finding anything invalid */
  return LOOK_NONE;
}


/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32
#    define WCHAR_T wchar_t
................................................................................
      fUnicode = 0;
    }else{
      fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
    }
    if( fUnicode ){
      lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
    }else{
      lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob);
    }
  }
  fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
  fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
  fossil_print("Starts with UTF-16 BOM: %s\n",
               fUtf16?(bRevUtf16?"reversed":"yes"):"no");
  fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",