Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:shrunk size of lead byte table for invalid_utf8, and took a shortcut to invalidate lead bytes between 0x80 & 0xBF inclusive
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1:69328517f5726df77c55f842ad934a992f849036
User & Date: sdr 2016-06-16 22:14:29
Context
2016-06-17
07:24
Remove a function which isn't used anywhere check-in: e2a280fc user: jan.nijtmans tags: trunk
00:04
merged from trunk Closed-Leaf check-in: 8a877a7b user: sdr tags: invalid_utf8_table
2016-06-16
22:14
shrunk size of lead byte table for invalid_utf8, and took a shortcut to invalidate lead bytes between 0x80 & 0xBF inclusive check-in: 69328517 user: sdr tags: trunk
11:39
Minor further speed-up: Only increment pointer if really needed. check-in: 5be2e9cf user: jan.nijtmans tags: trunk
Changes

Changes to src/lookslike.c.

158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
...
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204


205
206
207
208
209
210
211
#define US4B  0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
#define US4C  0x80, 0x8F /* for lead byte 0xF4 */
#define US0A  0xFF, 0x00 /* for any other lead byte */

/* a table used for quick lookup of the definition that goes with a
 * particular lead byte */
static const unsigned char lb_tab[] = {
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
................................................................................
  unsigned char c, c2;

  if( n==0 ) return 0;  /* Empty file -> OK */
  c = *z;
  while( --n>0 ){
    c2 = c;
    c = *++z;
    if( c2>=0x80 ){
      const unsigned char *def = &lb_tab[(2*c2)-0x100];
      if( (c<*def) || (c>*++def) ){
        return LOOK_INVALID; /* Invalid UTF-8 */
      }
      if( c2>=0xe0 ){
        c = (c2<<1)|3;
      }else{
        c = ' ';
      }


    }
  }
  return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
}

/*
** Define the type needed to represent a Unicode (UTF-16) character.







<
<
<
<
<
<
<
<







 







|
|








>
>







158
159
160
161
162
163
164








165
166
167
168
169
170
171
...
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#define US4B  0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
#define US4C  0x80, 0x8F /* for lead byte 0xF4 */
#define US0A  0xFF, 0x00 /* for any other lead byte */

/* a table used for quick lookup of the definition that goes with a
 * particular lead byte */
static const unsigned char lb_tab[] = {








  US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
................................................................................
  unsigned char c, c2;

  if( n==0 ) return 0;  /* Empty file -> OK */
  c = *z;
  while( --n>0 ){
    c2 = c;
    c = *++z;
    if( c2>=0xC0 ){
      const unsigned char *def = &lb_tab[(2*c2)-0x180];
      if( (c<*def) || (c>*++def) ){
        return LOOK_INVALID; /* Invalid UTF-8 */
      }
      if( c2>=0xe0 ){
        c = (c2<<1)|3;
      }else{
        c = ' ';
      }
    }else if( c2>=0x80 ){
      return LOOK_INVALID;
    }
  }
  return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
}

/*
** Define the type needed to represent a Unicode (UTF-16) character.