Fossil

Check-in [d076853d]
Login

Check-in [d076853d]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Improve truncation of UTF-8 encoded title using a function by @florian.balmer per https://fossil-scm.org/forum/forumpost/6d90d5d99c
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: d076853d10a2f2f7b1812b1db4c45ef522060e0fdcddd6b865d79a164906bea8
User & Date: ashepilko 2020-03-06 17:08:26
Context
2020-03-06
17:27
Limit the max length of new forum thread's title. ... (check-in: 81fb5e76 user: ashepilko tags: trunk)
17:08
Improve truncation of UTF-8 encoded title using a function by @florian.balmer per https://fossil-scm.org/forum/forumpost/6d90d5d99c ... (check-in: d076853d user: ashepilko tags: trunk)
10:07
Update to Unicode-13 ... (check-in: b70a76e3 user: jan.nijtmans tags: trunk)
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/forum.c.

553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
  blob_set(&title, zThreadTitle);
  /* truncate the title when longer than max allowed;
   * in case of UTF-8 make sure the truncated string remains valid,
   * otherwise (different encoding?) pass as-is
   */
  if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){
    int len;
    len = utf8_nearest_codepoint(blob_str(&title), mxForumPostTitleLen);
    if( len ){
      blob_truncate(&title, len);
      blob_append(&title, "...", 3);
    }
  }
  style_header("%s%s", blob_str(&title), blob_size(&title) ? " - Forum" : "Forum");
  blob_reset(&title);







|







553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
  blob_set(&title, zThreadTitle);
  /* truncate the title when longer than max allowed;
   * in case of UTF-8 make sure the truncated string remains valid,
   * otherwise (different encoding?) pass as-is
   */
  if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){
    int len;
    len = utf8_codepoint_index(blob_str(&title), mxForumPostTitleLen);
    if( len ){
      blob_truncate(&title, len);
      blob_append(&title, "...", 3);
    }
  }
  style_header("%s%s", blob_str(&title), blob_size(&title) ? " - Forum" : "Forum");
  blob_reset(&title);

Changes to src/utf8.c.

319
320
321
322
323
324
325






























326
327
328
329
330
331
332
int utf8_nearest_codepoint(const char *zString, int maxByteIndex){
  int pos = maxByteIndex;   /* Byte offset currently being inspected. */
  int nLeft = 4;            /* No more than four bytes are examined. */

  /* Step backwards from maxByteIndex (that byte included) and stop at the
  ** first byte that IsUTF8TrailByte() rejects. */
  while( nLeft>0 && pos>=0 ){
    if( !IsUTF8TrailByte(zString[pos]) ) return pos;
    pos--;
    nLeft--;
  }

  /* Every inspected byte was reported as a trail byte (or there was nothing
  ** to inspect): hand the caller's index back unchanged. */
  return maxByteIndex;
}































/*
** Display UTF-8 on the console.  Return the number of
** Characters written. If stdout or stderr is redirected
** to a file, -1 is returned and nothing is written
** to the console.
*/







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
int utf8_nearest_codepoint(const char *zString, int maxByteIndex){
  int pos = maxByteIndex;   /* Byte offset currently being inspected. */
  int nLeft = 4;            /* No more than four bytes are examined. */

  /* Step backwards from maxByteIndex (that byte included) and stop at the
  ** first byte that IsUTF8TrailByte() rejects. */
  while( nLeft>0 && pos>=0 ){
    if( !IsUTF8TrailByte(zString[pos]) ) return pos;
    pos--;
    nLeft--;
  }

  /* Every inspected byte was reported as a trail byte (or there was nothing
  ** to inspect): hand the caller's index back unchanged. */
  return maxByteIndex;
}

/*
** Convert a code point count into a byte offset within the NUL-terminated
** UTF-8 string zString: skip nCodePoint sequences from the start and return
** the byte index reached.  If the string ends first, the index of the
** NUL terminator is returned.  A NULL zString, or an nCodePoint of zero or
** less, yields 0.
**
** No validation is performed; the lead byte alone decides how many bytes a
** sequence may span:
**
**     0xC0..0xDF   up to 2 bytes   (includes the invalid 0xC0 and 0xC1)
**     0xE0..0xEF   up to 3 bytes
**     0xF0..0xF7   up to 4 bytes   (includes the invalid 0xF5 to 0xF7)
**     other        exactly 1 byte  (ASCII, lone trail bytes, 0xF8..0xFF)
**
** Only trail bytes (10vvvvvv) that are actually present are consumed, so an
** incomplete, ill-formed or overlong sequence still counts as one sequence.
*/
int utf8_codepoint_index(const char *zString, int nCodePoint){
  int iByte = 0;   /* Byte offset of the next unread sequence. */
  int nSeq = 0;    /* Number of sequences skipped so far. */

  if( zString==0 ) return 0;
  while( zString[iByte]!=0 && nSeq<nCodePoint ){
    char lead = zString[iByte];
    int nExpect;   /* Sequence length announced by the lead byte. */
    int nTaken;    /* Bytes of this sequence actually present. */

    if( (lead&0xe0)==0xc0 ){
      nExpect = 2;                /* UTF-8 lead byte 110vvvvv */
    }else if( (lead&0xf0)==0xe0 ){
      nExpect = 3;                /* UTF-8 lead byte 1110vvvv */
    }else if( (lead&0xf8)==0xf0 ){
      nExpect = 4;                /* UTF-8 lead byte 11110vvv */
    }else{
      nExpect = 1;                /* Anything else stands alone. */
    }

    /* Accept trail bytes one at a time.  The first byte that is not a trail
    ** byte (the NUL terminator included) ends the sequence early, so the
    ** probe never reads past the end of the string. */
    for(nTaken=1; nTaken<nExpect; nTaken++){
      if( (zString[iByte+nTaken]&0xc0)!=0x80 ) break;
    }

    iByte += nTaken;
    nSeq++;
  }
  return iByte;
}

/*
** Display UTF-8 on the console.  Return the number of
** Characters written. If stdout or stderr is redirected
** to a file, -1 is returned and nothing is written
** to the console.
*/