Fossil

Check-in [837d9b5b]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Improvements to the way that searchable text is parsed out of documents, wiki, and check-in comments. The first line of the text is the title. Subsequent lines are message body. Still need to do this for tickets.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | search-enhancements
Files: files | file ages | folders
SHA1:837d9b5b18d9852365068105cbc58f2f8acc973a
User & Date: drh 2015-02-12 02:32:08
Context
2015-02-13
21:21
Merge enhancements and fixes from trunk. check-in: 23c86b50 user: drh tags: search-enhancements
2015-02-12
02:32
Improvements to the way that searchable text is parsed out of documents, wiki, and check-in comments. The first line of the text is the title. Subsequent lines are message body. Still need to do this for tickets. check-in: 837d9b5b user: drh tags: search-enhancements
2015-02-11
20:16
Striving to make search work better. These changes will require search indexes to be rebuilt, so they go into a branch for now. check-in: e0df4859 user: drh tags: search-enhancements
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/main.mk.

489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
 $(OBJDIR)/th_lang.o \
 $(OBJDIR)/th_tcl.o \
 $(OBJDIR)/cson_amalgamation.o


$(APPNAME):	$(OBJDIR)/headers $(OBJDIR)/codecheck1 $(OBJ) $(EXTRAOBJ)
	$(OBJDIR)/codecheck1 $(TRANS_SRC)
	$(TCC) -o $(APPNAME) $(OBJ) $(EXTRAOBJ) $(LIB)

# This rule prevents make from using its default rules to try build
# an executable named "manifest" out of the file named "manifest.c"
#
$(SRCDIR)/../manifest:
	# noop








|







489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
 $(OBJDIR)/th_lang.o \
 $(OBJDIR)/th_tcl.o \
 $(OBJDIR)/cson_amalgamation.o


$(APPNAME):	$(OBJDIR)/headers $(OBJDIR)/codecheck1 $(OBJ) $(EXTRAOBJ)
	$(OBJDIR)/codecheck1 $(TRANS_SRC)
	$(TCC) $(CFLAGS) -o $(APPNAME) $(OBJ) $(EXTRAOBJ) $(LIB)

# This rule prevents make from using its default rules to try build
# an executable named "manifest" out of the file named "manifest.c"
#
$(SRCDIR)/../manifest:
	# noop

Changes to src/search.c.

981
982
983
984
985
986
987




988
989
990
991
992
993
994
995
996
997
998








999

1000
1001
1002





1003
1004



1005
1006
1007
1008
1009
1010
1011
....
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
....
1071
1072
1073
1074
1075
1076
1077

1078
1079
1080
1081
1082
1083
1084
1085
1086
1087



1088
1089
1090
1091









1092
1093
1094
1095
1096
1097
1098
....
1129
1130
1131
1132
1133
1134
1135




















1136
1137
1138
1139
1140
1141
1142
  style_footer();
}


/*
** This is a helper function for search_stext().  Writing into pOut
** the search text obtained from pIn according to zMimetype.




*/
static void get_stext_by_mimetype(
  Blob *pIn,
  const char *zMimetype,
  Blob *pOut
){
  Blob html, title;
  blob_init(&html, 0, 0);
  blob_init(&title, 0, 0);
  if( zMimetype==0 ) zMimetype = "text/plain";
  if( fossil_strcmp(zMimetype,"text/x-fossil-wiki")==0 ){








    wiki_convert(pIn, &html, 0);

    html_to_plaintext(blob_str(&html), pOut);
  }else if( fossil_strcmp(zMimetype,"text/x-markdown")==0 ){
    markdown_to_html(pIn, &title, &html);





    html_to_plaintext(blob_str(&html), pOut);
  }else if( fossil_strcmp(zMimetype,"text/html")==0 ){



    html_to_plaintext(blob_str(pIn), pOut);
  }else{
    *pOut = *pIn;
    blob_init(pIn, 0, 0);
  }
  blob_reset(&html);
  blob_reset(&title);
................................................................................
  const char *zName,     /* Auxiliary information */
  Blob *pOut             /* OUT: Initialize to the search text */
){
  blob_init(pOut, 0, 0);
  switch( cType ){
    case 'd': {   /* Documents */
      Blob doc;
        content_get(rid, &doc);
      blob_to_utf8_no_bom(&doc, 0);
      get_stext_by_mimetype(&doc, mimetype_from_name(zName), pOut);
      blob_reset(&doc);
      break;
    }
    case 'w': {   /* Wiki */
      Manifest *pWiki = manifest_get(rid, CFTYPE_WIKI,0);
................................................................................
                            pOut);
      blob_reset(&wiki);
      manifest_destroy(pWiki);
      break;
    }
    case 'c': {   /* Check-in Comments */
      static Stmt q;

      db_static_prepare(&q,
         "SELECT coalesce(ecomment,comment)"
         "  ||' (user: '||coalesce(euser,user,'?')"
         "  ||', tags: '||"
         "  (SELECT group_concat(substr(tag.tagname,5),',')"
         "     FROM tag, tagxref"
         "    WHERE tagname GLOB 'sym-*' AND tag.tagid=tagxref.tagid"
         "      AND tagxref.rid=event.objid AND tagxref.tagtype>0)"
         "  ||')'"
         "  FROM event WHERE objid=:x AND type='ci'");



      db_bind_int(&q, ":x", rid);
      if( db_step(&q)==SQLITE_ROW ){
        db_column_blob(&q, 0, pOut);
        blob_append(pOut, "\n", 1);









      }
      db_reset(&q);
      break;
    }
    case 't': {   /* Tickets */
      static Stmt q1;
      Blob raw;
................................................................................
  Blob out;
  db_find_and_open_repository(0,0);
  if( g.argc!=5 ) usage("TYPE RID NAME");
  search_stext(g.argv[2][0], atoi(g.argv[3]), g.argv[4], &out);
  fossil_print("%s\n",blob_str(&out));
  blob_reset(&out);
}





















/* The schema for the full-text index
*/
static const char zFtsSchema[] =
@ -- One entry for each possible search result
@ CREATE TABLE IF NOT EXISTS "%w".ftsdocs(
@   rowid INTEGER PRIMARY KEY, -- Maps to the ftsidx.docid







>
>
>
>






|




>
>
>
>
>
>
>
>
|
>



>
>
>
>
>


>
>
>







 







|







 







>










>
>
>


<

>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
....
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
....
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114

1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
....
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
  style_footer();
}


/*
** This is a helper function for search_stext().  Writing into pOut
** the search text obtained from pIn according to zMimetype.
**
** The title of the document is the first line of text.  All subsequent
** lines are the body.  If the document has no title, the first line
** is blank.
*/
static void get_stext_by_mimetype(
  Blob *pIn,
  const char *zMimetype,
  Blob *pOut
){
  Blob html, title, tail;
  blob_init(&html, 0, 0);
  blob_init(&title, 0, 0);
  if( zMimetype==0 ) zMimetype = "text/plain";
  if( fossil_strcmp(zMimetype,"text/x-fossil-wiki")==0 ){
    Blob tail;
    blob_init(&tail, 0, 0);
    if( wiki_find_title(pIn, &title, &tail) ){
      blob_appendf(pOut, "%s\n", blob_str(&title));
      wiki_convert(&tail, &html, 0);
      blob_reset(&tail);
    }else{
      blob_append(pOut, "\n", 1);
      wiki_convert(pIn, &html, 0);
    }
    html_to_plaintext(blob_str(&html), pOut);
  }else if( fossil_strcmp(zMimetype,"text/x-markdown")==0 ){
    markdown_to_html(pIn, &title, &html);
    if( blob_size(&title) ){
      blob_appendf(pOut, "%s\n", blob_str(&title));
    }else{
      blob_append(pOut, "\n", 1);
    }
    html_to_plaintext(blob_str(&html), pOut);
  }else if( fossil_strcmp(zMimetype,"text/html")==0 ){
    if( doc_is_embedded_html(pIn, &title) ){
      blob_appendf(pOut, "%s\n", blob_str(&title));
    }
    html_to_plaintext(blob_str(pIn), pOut);
  }else{
    *pOut = *pIn;
    blob_init(pIn, 0, 0);
  }
  blob_reset(&html);
  blob_reset(&title);
................................................................................
  const char *zName,     /* Auxiliary information */
  Blob *pOut             /* OUT: Initialize to the search text */
){
  blob_init(pOut, 0, 0);
  switch( cType ){
    case 'd': {   /* Documents */
      Blob doc;
      content_get(rid, &doc);
      blob_to_utf8_no_bom(&doc, 0);
      get_stext_by_mimetype(&doc, mimetype_from_name(zName), pOut);
      blob_reset(&doc);
      break;
    }
    case 'w': {   /* Wiki */
      Manifest *pWiki = manifest_get(rid, CFTYPE_WIKI,0);
................................................................................
                            pOut);
      blob_reset(&wiki);
      manifest_destroy(pWiki);
      break;
    }
    case 'c': {   /* Check-in Comments */
      static Stmt q;
      static int isPlainText = -1;
      db_static_prepare(&q,
         "SELECT coalesce(ecomment,comment)"
         "  ||' (user: '||coalesce(euser,user,'?')"
         "  ||', tags: '||"
         "  (SELECT group_concat(substr(tag.tagname,5),',')"
         "     FROM tag, tagxref"
         "    WHERE tagname GLOB 'sym-*' AND tag.tagid=tagxref.tagid"
         "      AND tagxref.rid=event.objid AND tagxref.tagtype>0)"
         "  ||')'"
         "  FROM event WHERE objid=:x AND type='ci'");
      if( isPlainText<0 ){
        isPlainText = db_get_boolean("timeline-plaintext",0);
      }
      db_bind_int(&q, ":x", rid);
      if( db_step(&q)==SQLITE_ROW ){

        blob_append(pOut, "\n", 1);
        if( isPlainText ){
          db_column_blob(&q, 0, pOut);
        }else{
          Blob x;
          blob_init(&x,0,0);
          db_column_blob(&q, 0, &x);
          get_stext_by_mimetype(&x, "text/x-fossil-wiki", pOut);
          blob_reset(&x);
        }
      }
      db_reset(&q);
      break;
    }
    case 't': {   /* Tickets */
      static Stmt q1;
      Blob raw;
................................................................................
  Blob out;
  db_find_and_open_repository(0,0);
  if( g.argc!=5 ) usage("TYPE RID NAME");
  search_stext(g.argv[2][0], atoi(g.argv[3]), g.argv[4], &out);
  fossil_print("%s\n",blob_str(&out));
  blob_reset(&out);
}

/*
** COMMAND: test-convert-stext
**
** Usage: fossil test-convert-stext FILE MIMETYPE
**
** Read the content of FILE and convert it to stext according to MIMETYPE.
** Send the result to standard output.
*/
void test_convert_stext(void){
  Blob in, out;
  db_find_and_open_repository(0,0);
  if( g.argc!=4 ) usage("FILENAME MIMETYPE");
  blob_read_from_file(&in, g.argv[2]);
  blob_init(&out, 0, 0);
  get_stext_by_mimetype(&in, g.argv[3], &out);
  fossil_print("%s\n",blob_str(&out));
  blob_reset(&in);
  blob_reset(&out);
}

/* The schema for the full-text index
*/
static const char zFtsSchema[] =
@ -- One entry for each possible search result
@ CREATE TABLE IF NOT EXISTS "%w".ftsdocs(
@   rowid INTEGER PRIMARY KEY, -- Maps to the ftsidx.docid

Changes to src/wikiformat.c.

2107
2108
2109
2110
2111
2112
2113



2114
2115
2116
2117


2118
2119
2120
2121
2122
2123
2124
....
2138
2139
2140
2141
2142
2143
2144



2145
2146
2147
2148
2149
2150
2151
2152



2153

2154
2155
2156

2157
2158
2159
2160
2161
2162
2163
....
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182



2183
2184
2185
2186
2187

2188

2189
2190
2191
2192
2193
2194
2195
    blob_reset(&out);
  }
}

/*
** Remove all HTML markup from the input text.  The output written into
** pOut is pure text.



*/
void html_to_plaintext(const char *zIn, Blob *pOut){
  int n;
  int i, j;


  int nNL = 0;              /* Number of \n characters at the end of pOut */
  int nWS = 0;              /* True if pOut ends with whitespace */
  while( fossil_isspace(zIn[0]) ) zIn++;
  while( zIn[0] ){
    n = nextHtmlToken(zIn);
    if( zIn[0]=='<' && n>1 ){
      int isCloseTag;
................................................................................
          n = nextHtmlToken(zIn);
          if( fossil_strnicmp(zIn, "</style",7)==0 ) break;
          zIn += n;
        }
        if( zIn[0]=='<' ) zIn += n;
        continue;
      }



      if( !isCloseTag && (eType & (MUTYPE_BLOCK|MUTYPE_TABLE))!=0 ){
        if( nNL==0 ){
          blob_append(pOut, "\n", 1);
          nNL++;
        }
        nWS = 1;
      }
    }else if( fossil_isspace(zIn[0]) ){



      for(i=nNL=0; i<n; i++) if( zIn[i]=='\n' ) nNL++;

      if( !nWS ){
        blob_append(pOut, nNL ? "\n" : " ", 1);
        nWS = 1;

      }
    }else if( zIn[0]=='&' ){
      char c = '?';
      if( zIn[1]=='#' ){
        int x = atoi(&zIn[1]);
        if( x>0 && x<=127 ) c = x;
      }else{
................................................................................
          if( aEntity[jj].n==n && strncmp(aEntity[jj].z,zIn,n)==0 ){
            c = aEntity[jj].c;
            break;
          }
        }
      }
      if( fossil_isspace(c) ){
        if( nWS==0 ) blob_append(pOut, &c, 1);
        nWS = 1;
        nNL = c=='\n';
      }else{



        blob_append(pOut, &c, 1);
        nWS = nNL = 0;
      }
    }else{
      blob_append(pOut, zIn, n);

      nNL = nWS = 0;

    }
    zIn += n;
  }
  if( nNL==0 ) blob_append(pOut, "\n", 1);
}

/*







>
>
>




>
>







 







>
>
>
|







>
>
>
|
>
|
|
|
>







 







|



>
>
>

<


|
>

>







2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
....
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
....
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199

2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
    blob_reset(&out);
  }
}

/*
** Remove all HTML markup from the input text.  The output written into
** pOut is pure text.
**
** Put the title on the first line, if there is any <title> markup.
** If there is no <title>, then create a blank first line.
*/
void html_to_plaintext(const char *zIn, Blob *pOut){
  int n;
  int i, j;
  int inTitle = 0;          /* True between <title>...</title> */
  int seenText = 0;         /* True after first non-whitespace seen */
  int nNL = 0;              /* Number of \n characters at the end of pOut */
  int nWS = 0;              /* True if pOut ends with whitespace */
  while( fossil_isspace(zIn[0]) ) zIn++;
  while( zIn[0] ){
    n = nextHtmlToken(zIn);
    if( zIn[0]=='<' && n>1 ){
      int isCloseTag;
................................................................................
          n = nextHtmlToken(zIn);
          if( fossil_strnicmp(zIn, "</style",7)==0 ) break;
          zIn += n;
        }
        if( zIn[0]=='<' ) zIn += n;
        continue;
      }
      if( eTag==MARKUP_TITLE ){
        inTitle = !isCloseTag;
      }
      if( !isCloseTag && seenText && (eType & (MUTYPE_BLOCK|MUTYPE_TABLE))!=0 ){
        if( nNL==0 ){
          blob_append(pOut, "\n", 1);
          nNL++;
        }
        nWS = 1;
      }
    }else if( fossil_isspace(zIn[0]) ){
      if( seenText ){
        nNL = 0;
        if( !inTitle ){ /* '\n' -> ' ' within <title> */
          for(i=0; i<n; i++) if( zIn[i]=='\n' ) nNL++;
        }
        if( !nWS ){
          blob_append(pOut, nNL ? "\n" : " ", 1);
          nWS = 1;
        }
      }
    }else if( zIn[0]=='&' ){
      char c = '?';
      if( zIn[1]=='#' ){
        int x = atoi(&zIn[1]);
        if( x>0 && x<=127 ) c = x;
      }else{
................................................................................
          if( aEntity[jj].n==n && strncmp(aEntity[jj].z,zIn,n)==0 ){
            c = aEntity[jj].c;
            break;
          }
        }
      }
      if( fossil_isspace(c) ){
        if( nWS==0 && seenText ) blob_append(pOut, &c, 1);
        nWS = 1;
        nNL = c=='\n';
      }else{
        if( !seenText && !inTitle ) blob_append(pOut, "\n", 1);
        seenText = 1;
        nNL = nWS = 0;
        blob_append(pOut, &c, 1);

      }
    }else{
      if( !seenText && !inTitle ) blob_append(pOut, "\n", 1);
      seenText = 1;
      nNL = nWS = 0;
      blob_append(pOut, zIn, n);
    }
    zIn += n;
  }
  if( nNL==0 ) blob_append(pOut, "\n", 1);
}

/*