Fossil

Check-in [1bbca2c3]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: Modify the comment formatter to avoid output of incomplete UTF-8 sequences, and to avoid line breaks inside UTF-8 sequences. See https://fossil-scm.org/forum/forumpost/1247e4a3c4 for detailed information and tests.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | comment-formatter-utf8
Files: files | file ages | folders
SHA3-256: 1bbca2c3f89b826d3350ca34a0e1a69a31180b72dcbece58f2714c87f7a8267e
User & Date: florian 2018-10-17 14:16:00.000
Context
2018-11-15
12:43
Add output buffering to the (non-legacy) comment printing algorithm, to reduce calls to fossil_print(). The resulting performance improvement can be up to a factor of 10, with a perceptible difference even for short comments (measured and tested on Windows with MSVC builds, and on Ubuntu with GCC builds). (For comparison: for the legacy comment printing algorithm, the extra UTF-8 checks added by this branch impair performance by 0.12-1.8%, depending on whether the input contains predominantly multi-byte vs. ASCII-only sequences.) ... (check-in: 16fde3ff user: florian tags: comment-formatter-utf8)
2018-10-17
14:16
Modify the comment formatter to avoid output of incomplete UTF-8 sequences, and to avoid line breaks inside UTF-8 sequences. See https://fossil-scm.org/forum/forumpost/1247e4a3c4 for detailed information and tests. ... (check-in: 1bbca2c3 user: florian tags: comment-formatter-utf8)
2018-10-12
16:14
Fix a comment on the "html" and "puts" TH1 commands. Before this fix, the meanings of the two commands were reversed. ... (check-in: 35563f3d user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/comformat.c.
223
224
225
226
227
228
229
























230
231
232
233
234
235
236
237
        break;
      }
      charCnt++;
    }else{
      charCnt++;
    }
    assert( c!='\n' || charCnt==0 );
























    fossil_print("%c", c);
    if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars;
    if( maxChars<=0 ) break;
    if( c=='\n' ) break;
  }
  if( charCnt>0 ){
    fossil_print("\n");
    lineCnt++;







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|







223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
        break;
      }
      charCnt++;
    }else{
      charCnt++;
    }
    assert( c!='\n' || charCnt==0 );
    /*
    ** Avoid output of incomplete UTF-8 sequences, and also avoid line breaks
    ** inside UTF-8 sequences. Incomplete, ill-formed and overlong sequences are
    ** kept together. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are
    ** allowed to initiate (ill-formed) 2- and 4-byte sequences, respectively,
    ** the other invalid lead bytes 0xF8 to 0xFF are treated as invalid 1-byte
    ** sequences (as lone trail bytes).
    */
    if( (c&0xc0)==0xc0 && zLine[index]!=0 ){  /* Any UTF-8 lead byte 11xxxxxx */
      char zUTF8[5]; /* Buffer to hold a UTF-8 sequence. */
      int cchUTF8=1; /* Code units consumed. */
      int maxUTF8=1; /* Expected sequence length. */
      zUTF8[0]=c;
      if( (c&0xe0)==0xc0 )maxUTF8=2;          /* UTF-8 lead byte 110vvvvv */
      else if( (c&0xf0)==0xe0 )maxUTF8=3;     /* UTF-8 lead byte 1110vvvv */
      else if( (c&0xf8)==0xf0 )maxUTF8=4;     /* UTF-8 lead byte 11110vvv */
      while( cchUTF8<maxUTF8 &&
              (zLine[index]&0xc0)==0x80 ){    /* UTF-8 trail byte 10vvvvvv */
        zUTF8[cchUTF8++] = zLine[index++];
      }
      zUTF8[cchUTF8]=0;
      fossil_print("%s", zUTF8);
    }
    else
      fossil_print("%c", c);
    if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars;
    if( maxChars<=0 ) break;
    if( c=='\n' ) break;
  }
  if( charCnt>0 ){
    fossil_print("\n");
    lineCnt++;
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
*/
static int comment_print_legacy(
  const char *zText, /* The comment text to be printed. */
  int indent,        /* Number of spaces to indent each non-initial line. */
  int width          /* Maximum number of characters per line. */
){
  int maxChars = width - indent;
  int si, sk, i, k;
  int doIndent = 0;
  char *zBuf;
  char zBuffer[400];
  int lineCnt = 0;

  if( width<0 ){
    comment_set_maxchars(indent, &maxChars);







|







281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
*/
static int comment_print_legacy(
  const char *zText, /* The comment text to be printed. */
  int indent,        /* Number of spaces to indent each non-initial line. */
  int width          /* Maximum number of characters per line. */
){
  int maxChars = width - indent;
  int si, sk, i, k, kc;
  int doIndent = 0;
  char *zBuf;
  char zBuffer[400];
  int lineCnt = 0;

  if( width<0 ){
    comment_set_maxchars(indent, &maxChars);
285
286
287
288
289
290
291
292
293




















294
295
296
297
298
299
300
301
      if( doIndent==0 ){
        fossil_print("\n");
        lineCnt = 1;
      }
      if( zBuf!=zBuffer) fossil_free(zBuf);
      return lineCnt;
    }
    for(sk=si=i=k=0; zText[i] && k<maxChars; i++){
      char c = zText[i];




















      if( fossil_isspace(c) ){
        si = i;
        sk = k;
        if( k==0 || zBuf[k-1]!=' ' ){
          zBuf[k++] = ' ';
        }
      }else{
        zBuf[k] = c;







|

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|







309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
      if( doIndent==0 ){
        fossil_print("\n");
        lineCnt = 1;
      }
      if( zBuf!=zBuffer) fossil_free(zBuf);
      return lineCnt;
    }
    for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
      char c = zText[i];
      kc++; /* Count complete UTF-8 sequences. */
      /*
      ** Avoid line breaks inside UTF-8 sequences. Incomplete, ill-formed and
      ** overlong sequences are kept together. The invalid lead bytes 0xC0 to
      ** 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and
      ** 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to
      ** 0xFF are treated as invalid 1-byte sequences (as lone trail bytes).
      */
      if( (c&0xc0)==0xc0 && zText[i+1]!=0 ){  /* Any UTF-8 lead byte 11xxxxxx */
        int cchUTF8=1; /* Code units consumed. */
        int maxUTF8=1; /* Expected sequence length. */
        if( (c&0xe0)==0xc0 )maxUTF8=2;        /* UTF-8 lead byte 110vvvvv */
        else if( (c&0xf0)==0xe0 )maxUTF8=3;   /* UTF-8 lead byte 1110vvvv */
        else if( (c&0xf8)==0xf0 )maxUTF8=4;   /* UTF-8 lead byte 11110vvv */
        zBuf[k++] = c;
        while( cchUTF8<maxUTF8 &&
                (zText[i+1]&0xc0)==0x80 ){    /* UTF-8 trail byte 10vvvvvv */
          zBuf[k++] = zText[++i];
        }
      }
      else if( fossil_isspace(c) ){
        si = i;
        sk = k;
        if( k==0 || zBuf[k-1]!=' ' ){
          zBuf[k++] = ' ';
        }
      }else{
        zBuf[k] = c;