Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:performance optimizations
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | invalid_utf8_table
Files: files | file ages | folders
SHA1:635f3b0300cffc2aa01ece178fe9684ca8120f0c
User & Date: sdr 2016-06-10 20:45:49
Context
2016-06-11
00:11
fixed a bad function name; had tested it external to fossil but didn't build/test before committing; mea culpa check-in: 2e7a6cb0 user: sdr tags: invalid_utf8_table
2016-06-10
20:45
performance optimizations check-in: 635f3b03 user: sdr tags: invalid_utf8_table
11:52
Merge additional test-cases from trunk. All pass now. check-in: 8a65d6f0 user: jan.nijtmans tags: invalid_utf8_table
Changes

Changes to src/lookslike.c.

141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
...
187
188
189
190
191
192
193








194
195
196
197
198
199
200
201










202
203
204
205
206

207
208
209
210
211
212

213
214
215
216
217
218
219
** except for the "overlong form" of \u0000 (Modified UTF-8)
** which is not considered invalid here: Some languages like
** Java and Tcl use it. This function also considers valid
** the derivatives CESU-8 & WTF-8 (as described in the same
** Wikipedia article referenced previously).
*/

int invalid_utf8(const Blob *pContent)
{
  /*
  ** Walks the buffer of pContent one UTF-8 sequence at a time.  Returns
  ** LOOK_INVALID as soon as a sequence has no matching definition, is cut
  ** short by the end of the buffer, or has a trailing byte outside the
  ** range its definition allows.  Returns 0 once every byte has been
  ** consumed.
  */

  /* definitions for various utf-8 sequence lengths */
  /*
  ** Layout of each definition: element 0 is the sequence length; it is
  ** followed by one (min, max) pair per byte of the sequence, lead byte
  ** first.  So byte i of a sequence is checked against def[1 + i*2] and
  ** def[1 + i*2 + 1].
  */
  /* 1 byte: plain ASCII */
  static unsigned char def_1a[] = { 1, 0x00, 0x7F };
  /* 2 bytes, lead C0: only C0 80 is accepted (the overlong form of U+0000) */
  static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
  /* 2 bytes, lead C2..DF (no definition covers lead byte C1) */
  static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
  /* 3 bytes, lead E0: second byte restricted to A0..BF */
  static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
  /* 3 bytes, lead E1..EF: ED is not special-cased, so encoded surrogates
  ** pass (consistent with accepting CESU-8 and WTF-8) */
  static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
  /* 4 bytes, lead F0: second byte restricted to 90..BF */
  static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
  /* 4 bytes, lead F1..F3 */
  static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
  /* 4 bytes, lead F4: second byte restricted to 80..8F */
  static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };

  /* an array of all the definitions */
  /* (NULL-terminated) */
  static unsigned char* def_arr[] = { def_1a, def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };

  /* a table used for quick lookup of the definition that goes with a particular lead byte */
  /* indexed by the lead byte value; a NULL entry means no definition for that byte */
  static unsigned char* lb_tab[256] = { NULL };

  /* a pointer to the table; NULL means not yet setup */
  static unsigned char** lb_ptr = NULL;

  /*
  ** NOTE(review): the row of dots below is the diff viewer's elision
  ** marker, not source code.  The hidden lines presumably fill lb_tab from
  ** def_arr on first use and point lb_ptr at it -- confirm against the
  ** full file.
  */
................................................................................
  /* buffer pointer and size */
  const unsigned char *z = (unsigned char *)blob_buffer(pContent);
  unsigned int n = blob_size(pContent);

  /* while we haven't checked all the bytes in the buffer */
  while (n > 0)
  {








    /* get the definition for this lead byte */
    unsigned char* def = lb_ptr[*z];
    unsigned char i;

    /* if the definition doesn't exist, or there aren't enough bytes left, return invalid */
    /* (def[0] is the full sequence length, lead byte included) */
    if (!def || (n < def[0]))
      return LOOK_INVALID;











    /* we already know byte #0 is good, so check the remaining bytes */
    for (i = 1; i < def[0]; ++i)
    {
      /* if the byte is outside the allowed range for this definition, return invalid */
      /* def[1 + i*2] is the minimum and def[1 + i*2 + 1] the maximum for byte i */
      if ((z[i] < def[1 + i * 2 + 0]) || (z[i] > def[1 + i * 2 + 1]))

        return LOOK_INVALID;
    }

    /* advance to the next sequence */
    z += def[0];
    n -= def[0];

  }

  /* we made it all the way through the buffer so it's not invalid */
  return 0;
}









|


<









|







 







>
>
>
>
>
>
>
>
|
|
|

|
|
|

>
>
>
>
>
>
>
>
>
>
|
|
|
|
<
>
|
|

|
<
|
>







141
142
143
144
145
146
147
148
149
150

151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
...
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222

223
224
225
226
227

228
229
230
231
232
233
234
235
236
** except for the "overlong form" of \u0000 (Modified UTF-8)
** which is not considered invalid here: Some languages like
** Java and Tcl use it. This function also considers valid
** the derivatives CESU-8 & WTF-8 (as described in the same
** Wikipedia article referenced previously).
*/

/*
** NOTE(review): the timeline text earlier on this page mentions a later
** check-in that "fixed a bad function name"; possibly that refers to the
** "_b" suffix used here -- confirm against the follow-up check-in.
*/
int invalid_utf8_b(const Blob *pContent)
{
  /*
  ** Walks the buffer of pContent.  Bytes below 0x80 are skipped one at a
  ** time; any other byte is treated as the lead byte of a multi-byte
  ** sequence and validated against the definitions below.  Returns
  ** LOOK_INVALID on the first failure and 0 once every byte has been
  ** consumed.
  */

  /* definitions for various utf-8 sequence lengths */
  /*
  ** Layout of each definition: element 0 is the sequence length; it is
  ** followed by one (min, max) pair per byte of the sequence, lead byte
  ** first.  There is no 1-byte definition here because ASCII is handled
  ** inline in the loop.
  */

  /* 2 bytes, lead C0: only C0 80 is accepted (the overlong form of U+0000) */
  static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
  /* 2 bytes, lead C2..DF (no definition covers lead byte C1) */
  static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
  /* 3 bytes, lead E0: second byte restricted to A0..BF */
  static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
  /* 3 bytes, lead E1..EF: ED is not special-cased, so encoded surrogates
  ** pass (consistent with accepting CESU-8 and WTF-8) */
  static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
  /* 4 bytes, lead F0: second byte restricted to 90..BF */
  static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
  /* 4 bytes, lead F1..F3 */
  static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
  /* 4 bytes, lead F4: second byte restricted to 80..8F */
  static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };

  /* an array of all the definitions */
  /* (NULL-terminated) */
  static unsigned char* def_arr[] = { def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };

  /* a table used for quick lookup of the definition that goes with a particular lead byte */
  /* indexed by the lead byte value; a NULL entry means no definition for that byte */
  static unsigned char* lb_tab[256] = { NULL };

  /* a pointer to the table; NULL means not yet setup */
  static unsigned char** lb_ptr = NULL;

  /*
  ** NOTE(review): the row of dots below is the diff viewer's elision
  ** marker, not source code.  The hidden lines presumably fill lb_tab from
  ** def_arr on first use and point lb_ptr at it -- confirm against the
  ** full file.
  */
................................................................................
  /* buffer pointer and size */
  const unsigned char *z = (unsigned char *)blob_buffer(pContent);
  unsigned int n = blob_size(pContent);

  /* while we haven't checked all the bytes in the buffer */
  while (n > 0)
  {
    /* ascii is trivial */
    if (*z < 0x80)
    {
      ++z;
      --n;
    }
    else
    {
      /* get the definition for this lead byte */
      /* (the post-increment also consumes the lead byte; n is adjusted later) */
      unsigned char* def = lb_ptr[*z++];
      unsigned char i, len;

      /* if the definition doesn't exist, return invalid */
      if (!def)
        return LOOK_INVALID;

      /* get the expected sequence length */
      len = *def;

      /* if there aren't enough bytes left, return invalid */
      /* (n still counts the lead byte here, so it is compared to the full length) */
      if (n < len)
        return LOOK_INVALID;

      /* skip the length & lead byte range */
      /* def now points at the (min, max) pair for the second byte */
      def += 3;

      /* we already know byte #0 is good, so check the remaining bytes */
      for (i = 1; i < len; ++i)
      {
        /* if the byte is outside the allowed range for this definition, return invalid */
        /*
        ** The left operand compares against the minimum and steps def to the
        ** maximum; the right operand compares against the maximum and steps
        ** both z and def on to the next byte/pair.  When the left operand is
        ** true the right one is skipped by short-circuit evaluation, which is
        ** harmless because the function returns immediately.
        */

        if ((*z < *def++) || (*z++ > *def++))
          return LOOK_INVALID;
      }

      /* advance to the next sequence */
      /* z has already been moved len bytes by the increments above, so only
      ** the remaining-byte count needs updating */

      n -= len;
    }
  }

  /* we made it all the way through the buffer so it's not invalid */
  return 0;
}