Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:added a few comments
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | invalid_utf8_table
Files: files | file ages | folders
SHA1:63313a5f168933ddae7dec33807e8282d77573d7
User & Date: sdr 2016-06-15 15:00:15
Context
2016-06-16
07:46
Unroll loop for even greater speed check-in: cc09e003 user: jan.nijtmans tags: invalid_utf8_table
2016-06-15
15:00
added a few comments check-in: 63313a5f user: sdr tags: invalid_utf8_table
08:19
Simplifications and constification check-in: 6051c441 user: jan.nijtmans tags: invalid_utf8_table
Changes

Changes to src/lookslike.c.

141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
** which is not considered invalid here: Some languages like
** Java and Tcl use it. This function also considers valid
** the derivatives CESU-8 & WTF-8 (as described in the same
** wikipedia article referenced previously).
*/

/*
** Descriptions of the multi-byte UTF-8 sequences, one table per
** group of lead bytes.
**
** Layout of each table: element 0 is the total number of bytes in
** the sequence (the digit in the table name); it is followed by one
** pair of bytes for each of the remaining (length - 1) trail bytes.
** NOTE(review): each pair looks like the lowest and highest value
** accepted for that trail byte -- confirm against the code that
** reads these tables.
*/
static const unsigned char us2a[] = {   /* lead byte 0xC0 */
  2,                                    /* sequence length */
  0x80, 0x80                            /* trail byte 1 */
};
static const unsigned char us2b[] = {   /* lead bytes 0xC2-0xDF */
  2,                                    /* sequence length */
  0x80, 0xBF                            /* trail byte 1 */
};
static const unsigned char us3a[] = {   /* lead byte 0xE0 */
  3,                                    /* sequence length */
  0xA0, 0xBF,                           /* trail byte 1 */
  0x80, 0xBF                            /* trail byte 2 */
};
static const unsigned char us3b[] = {   /* lead bytes 0xE1-0xEF */
  3,                                    /* sequence length */
  0x80, 0xBF,                           /* trail byte 1 */
  0x80, 0xBF                            /* trail byte 2 */
};
static const unsigned char us4a[] = {   /* lead byte 0xF0 */
  4,                                    /* sequence length */
  0x90, 0xBF,                           /* trail byte 1 */
  0x80, 0xBF,                           /* trail byte 2 */
  0x80, 0xBF                            /* trail byte 3 */
};
static const unsigned char us4b[] = {   /* lead bytes 0xF1-0xF3 */
  4,                                    /* sequence length */
  0x80, 0xBF,                           /* trail byte 1 */
  0x80, 0xBF,                           /* trail byte 2 */
  0x80, 0xBF                            /* trail byte 3 */
};
static const unsigned char us4c[] = {   /* lead byte 0xF4 */
  4,                                    /* sequence length */
  0x80, 0x8F,                           /* trail byte 1 */
  0x80, 0xBF,                           /* trail byte 2 */
  0x80, 0xBF                            /* trail byte 3 */
};

/* a table used for quick lookup of the definition that goes with a
 * particular lead byte */
static const unsigned char* const lb_tab[] = {
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,







|


|


|


|


|


|


|







141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
** which is not considered invalid here: Some languages like
** Java and Tcl use it. This function also considers valid
** the derivatives CESU-8 & WTF-8 (as described in the same
** Wikipedia article referenced previously).
*/

/*
** Tables describing the multi-byte UTF-8 sequences, grouped by the
** lead byte(s) each one belongs to.
**
** Format: { length, pair for trail byte 1, pair for trail byte 2, ... }
** where length is the total byte count of the sequence, so a table
** holds (length - 1) pairs.
** NOTE(review): each pair appears to be the inclusive lower and upper
** limit for that trail byte -- confirm against the code that reads
** these tables.
*/

/* lead byte 0xC0 */
static const unsigned char us2a[] = { 2, 0x80, 0x80 };

/* lead bytes 0xC2-0xDF */
static const unsigned char us2b[] = { 2, 0x80, 0xBF };

/* lead byte 0xE0 */
static const unsigned char us3a[] = { 3, 0xA0, 0xBF, 0x80, 0xBF };

/* lead bytes 0xE1-0xEF */
static const unsigned char us3b[] = { 3, 0x80, 0xBF, 0x80, 0xBF };

/* lead byte 0xF0 */
static const unsigned char us4a[] = { 4, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };

/* lead bytes 0xF1-0xF3 */
static const unsigned char us4b[] = { 4, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };

/* lead byte 0xF4 */
static const unsigned char us4c[] = { 4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };

/* a table used for quick lookup of the definition that goes with a
 * particular lead byte */
static const unsigned char* const lb_tab[] = {
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,