Fossil

Check-in [d3fc3772]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:moved static table data out of invalid_utf8 and pre-initialized the table to avoid run time overhead; also shrunk the table by 50% by removing the ASCII code points
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | invalid_utf8_table
Files: files | file ages | folders
SHA1:d3fc377276b80b413d1d0f2eed2cd5d57517d029
User & Date: sdr 2016-06-14 18:06:07
Original Comment: restructured the invalid_utf8 so that it doesn't have to initialize the table on the first pass and shrink the size of the table
Context
2016-06-14
18:08
merged from trunk check-in: 12675ab7 user: sdr tags: invalid_utf8_table
18:06
moved static table data out of invalid_utf8 and pre-initialized the table to avoid run time overhead; also shrunk the table by 50% by removing the ASCII code points check-in: d3fc3772 user: sdr tags: invalid_utf8_table
05:00
merged from trunk check-in: d22c72bc user: sdr tags: invalid_utf8_table
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/lookslike.c.

139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
**   http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
** except for the "overlong form" of \u0000 (Modified UTF-8)
** which is not considered invalid here: Some languages like
** Java and Tcl use it. This function also considers valid
** the derivatives CESU-8 & WTF-8 (as described in the same
** Wikipedia article referenced previously).
*/

int invalid_utf8(
  const Blob *pContent
){
  /* definitions for various UTF-8 sequence lengths */
  static unsigned char def_2a[] = {
    2, 0xC0, 0xC0, 0x80, 0x80
  };
  static unsigned char def_2b[] = {
    2, 0xC2, 0xDF, 0x80, 0xBF
  };
  static unsigned char def_3a[] = {
    3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF
  };
  static unsigned char def_3b[] = {
    3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF
  };
  static unsigned char def_4a[] = {
    4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
  };
  static unsigned char def_4b[] = {
    4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
  };
  static unsigned char def_4c[] = {
    4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
  };

  /* an array of all the definitions */
  static unsigned char* def_arr[] = {
    def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL
  };

  /* a table used for quick lookup of the definition that goes with a
   * particular lead byte */
  static unsigned char* lb_tab[256] = { NULL };

  /* a pointer to the table; NULL means not yet setup */
  static unsigned char** lb_ptr = NULL;

  /* buffer pointer and size */
  const unsigned char *z;
  unsigned int n;

  /* if the table pointer hasn't been initialized */
  if( lb_ptr==NULL ){
    unsigned char** pp;
    /* for each definition, set the lead byte table pointer to the
     * proper definition */
    lb_ptr = lb_tab;
    pp = def_arr;
    while( *pp!=NULL ){
      unsigned char lo = pp[0][1];
      unsigned char hi = pp[0][2];
      unsigned char i;
      for(i=lo; i<=hi; ++i){
        lb_ptr[i] = pp[0];
      }
      ++pp;
    }
  }
  z = (unsigned char *)blob_buffer(pContent);
  n = blob_size(pContent);
  /* while we haven't checked all the bytes in the buffer */
  while( n>0 ){
    /* ascii is trivial */
    if( *z<0x80 ){
      ++z;
      --n;
    }else{
      /* get the definition for this lead byte */
      unsigned char* def = lb_ptr[*z++];
      unsigned char i, len;

      /* if the definition doesn't exist, return invalid */
      if( !def ) return LOOK_INVALID;
      /* get the expected sequence length */
      len = *def;
      /* if there aren't enough bytes left, return invalid */







|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<
<
<
<
<
<
<
<
<
<








|







139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197










198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
**   http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
** except for the "overlong form" of \u0000 (Modified UTF-8)
** which is not considered invalid here: Some languages like
** Java and Tcl use it. This function also considers valid
** the derivatives CESU-8 & WTF-8 (as described in the same
** Wikipedia article referenced previously).
*/

/*
** Definitions for the various UTF-8 sequence shapes.
**
** Layout of every definition:
**   [0]       total length of the sequence in bytes, lead byte included
**   [1],[2]   inclusive low/high bounds of the lead bytes it applies to
**   [3]...    one inclusive low/high pair for each trailing byte
*/

/* 0xC0 0x80 only: the overlong form of \u0000 (Modified UTF-8) */
static const unsigned char us2a[] = {
  2, 0xC0, 0xC0, 0x80, 0x80
};
/* ordinary two-byte sequences; lead byte 0xC1 has no definition */
static const unsigned char us2b[] = {
  2, 0xC2, 0xDF, 0x80, 0xBF
};
/* lead byte 0xE0: the second byte is restricted to 0xA0..0xBF */
static const unsigned char us3a[] = {
  3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF
};
/* lead bytes 0xE1..0xEF: any continuation byte may follow */
static const unsigned char us3b[] = {
  3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF
};
/* lead byte 0xF0: the second byte is restricted to 0x90..0xBF */
static const unsigned char us4a[] = {
  4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
};
/* lead bytes 0xF1..0xF3: any continuation byte may follow */
static const unsigned char us4b[] = {
  4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
};
/* lead byte 0xF4: the second byte is restricted to 0x80..0x8F */
static const unsigned char us4c[] = {
  4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
};

/*
** A table used for quick lookup of the definition that goes with a
** particular lead byte.  ASCII bytes (0x00..0x7F) have no entry, so
** the table must be indexed with (lead byte - 0x80).  A NULL entry
** means the byte has no definition as a lead byte.
**
** Both the definitions and the table itself are const: nothing is
** filled in at run time.  The explicit size of 128 covers every byte
** value in the range 0x80..0xFF.
*/
static const unsigned char *const lb_tab[128] = {
  /* 0x80..0xBF: no definition */
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0xC0..0xDF: two-byte sequences (0xC1 has no definition) */
  us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
  us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
  us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
  us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
  /* 0xE0..0xEF: three-byte sequences */
  us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
  us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
  /* 0xF0..0xF4: four-byte sequences; 0xF5..0xFF: no definition */
  us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
};

int invalid_utf8(
  const Blob *pContent
){
  /* buffer pointer and size */
  const unsigned char *z = (unsigned char *)blob_buffer(pContent);
  unsigned int n = blob_size(pContent);











  /* while we haven't checked all the bytes in the buffer */
  while( n>0 ){
    /* ascii is trivial */
    if( *z<0x80 ){
      ++z;
      --n;
    }else{
      /* get the definition for this lead byte */
      unsigned char* def = lb_tab[(*z++)-0x80];
      unsigned char i, len;

      /* if the definition doesn't exist, return invalid */
      if( !def ) return LOOK_INVALID;
      /* get the expected sequence length */
      len = *def;
      /* if there aren't enough bytes left, return invalid */