(10.1) By andygoth on 2020-09-15 20:03:16 edited from 10.0 in reply to 1 [source]
Here's what I've been doing for years. Maybe you'll find some inspiration in it.
#!/usr/bin/env tclsh
# Load required packages.
package require Tcl 8.6
# encHex --
# Encode by substituting most non-alphanumerics with hexadecimal codes.
#
# Arguments:
#   str       String to encode.
#   pattern   Regular expression selecting the text to encode.  The default
#             matches every character other than word characters and
#             - ^ , . / ' = + | ! $
#
# Results:
#   Copy of str in which every match of pattern is replaced by the %xx codes
#   (lowercase hex) of the UTF-8 bytes of the matched text.
proc encHex {str {pattern {[^-^,./'=+|!$\w]}}} {
    set pos 0
    while {[regexp -indices -start $pos -- $pattern $str range]} {
        lassign $range first last
        if {$last < $first} {
            # Empty match: there is nothing to encode.  Step past it so that a
            # pattern able to match the empty string cannot loop forever.
            set pos [expr {$first + 1}]
            if {$pos > [string length $str]} {
                break
            }
            continue
        }

        # Convert the whole match to UTF-8 first.  Scanning the raw string
        # would keep only the low byte of each character and would drop all but
        # the first character of a multi-character match.
        binary scan [encoding convertto utf-8\
                [string range $str $first $last]] H* hex
        set encoded [regsub -all {..} $hex {%&}]
        set str [string replace $str $first $last $encoded]

        # Resume after the inserted codes so their "%" is not encoded again.
        set pos [expr {$first + [string length $encoded]}]
    }
    return $str
}
# parseMarkdown --
# Basic Markdown parser that supports only a small subset of Markdown: inline
# HTML tags and entities, backtick-quoted code, emphasis written with one to
# three "*" or "_" markers, and backslash escapes.
#
# Arguments:
#   str   Markdown text to convert.
#
# Results:
#   Two-element list.  The first element is the HTML rendering of str.  The
#   second is the plain text, i.e. only the literal characters, with tags,
#   entities, and markup markers left out.
proc parseMarkdown {str} {
    # Initialize intermediate and result variables, then invoke parser.
    set index 0
    set html {}
    set plain {}
    apply {{{emph {}}} {
        # Get access to caller variables.  In a recursive call the caller is
        # the outer parser invocation, whose variables are themselves links to
        # the variables of parseMarkdown.
        upvar 1 str str index index html html plain plain

        # Helper procedure that appends a literal string to the result.  It
        # consumes up to "count" characters (default 1) starting at the current
        # index.  The HTML copy gets "<", ">", and "&" replaced by entities and
        # Markdown metacharacters backslash-escaped; the plain copy is verbatim.
        set literal {apply {{{count 1}} {
            if {$count > 0} {
                upvar 1 str str html html plain plain index index
                set start $index
                incr index $count
                if {$index > [string length $str]} {
                    set index [string length $str]
                }
                set chunk [string range $str $start [expr {$index - 1}]]
                append html [string map {
                    < &lt; > &gt; & &amp; _ \\_ * \\* ` \\` [ \\[ ] \\]
                } $chunk]
                append plain $chunk
            }
        }}}

        # Find the next supported Markdown introductory metacharacter.
        while {[regexp -start $index -indices {[<&`*_\\]} $str match]} {
            # Emit literal text since the start or the prior sequence.
            {*}$literal [expr {[lindex $match 0] - $index}]

            # Process this special sequence according to its first character.
            switch [string index $str $index] {
                < - & {
                    # Directly emit literal HTML tags and entities.
                    if {[regexp -start $index -expanded {
                        \A<(?:[^'\">]+|'[^']*'|\"[^\"]*\")+>    # HTML tag
                        |\A&[^;]+;                              # HTML entity
                    } $str match]} {
                        # Parse success.  Emit the HTML.
                        append html $match
                        incr index [string length $match]
                    } else {
                        # Parse error.  Emit the character literally.
                        {*}$literal
                    }
                } ` {
                    # Emit backtick-quoted text.
                    append html <code>
                    if {[regexp -start $index {\A`.*?`} $str match]} {
                        # Parse success.  Skip the opening backtick, emit the
                        # quoted text literally, then skip the closing backtick.
                        incr index
                        {*}$literal [expr {[string length $match] - 2}]
                        incr index
                    } else {
                        # Parse failure.  Emit the backtick literally.
                        {*}$literal
                    }
                    append html </code>
                } * - _ {
                    # Emit emphasized text.
                    if {$emph eq {}} {
                        # Beginning of emphasis.  Take a run of up to three
                        # identical markers and recurse into the parser, passing
                        # the run as the closing sequence to look for.
                        regexp -start $index {\A(.)\1{0,2}} $str nest
                        incr index [string length $nest]
                        switch [string length $nest] {
                            1 {append html <em>}
                            2 {append html <strong>}
                            3 {append html <strong><em>}
                        }
                        {*}[info level 0] $nest
                    } elseif {$emph eq [string range $str $index\
                            [expr {$index + [string length $emph] - 1}]]} {
                        # End of emphasis.  Stop the recursive parser.
                        switch [string length $emph] {
                            1 {append html </em>}
                            2 {append html </strong>}
                            3 {append html </em></strong>}
                        }
                        incr index [string length $emph]
                        return
                    } else {
                        # Parse error.  Emit the character literally.
                        {*}$literal
                    }
                } \\ {
                    # Skip the backslash and emit the next character literally.
                    incr index
                    {*}$literal
                }
            }
        }

        # Emit literal text following the final sequence.
        {*}$literal [expr {[string length $str] - $index}]
    }}

    # Return the HTML and plain text.
    list $html $plain
}
# Check command-line arguments.  The only accepted argument is the optional
# -notoc switch; it sets notoc to 1, and notoc is 0 when no argument is given.
if {![llength $argv]} {
    set notoc 0
} elseif {[llength $argv] > 1 || [lindex $argv 0] ne "-notoc"} {
    # Stop here on bad usage.  Continuing would leave notoc undefined and the
    # script would fail only after it had already started rewriting files.
    chan puts stderr "Usage: [file tail $argv0] ?-notoc?"
    exit 1
} else {
    set notoc 1
}
# Find documentation directory: the "doc" directory next to this script.
set dir [file join [file dirname [info script]] doc]

# Read titles and other information from all input files except index.md.
#   fileTitle    file name -> first line of the file
#   orderGroup   group order number -> group name
#   groupFiles   group name -> (file order number -> file name)
set fileTitle {}
set orderGroup {}
set groupFiles {}
foreach file [glob -directory $dir *.md] {
    if {[set tail [file tail $file]] ne "index.md"} {
        # Read file.  Decode explicitly as UTF-8 so that titles are written
        # back to index.md below in the same encoding they were read in.
        set chan [open $file]
        chan configure $chan -encoding utf-8
        set data [chan read $chan]
        chan close $chan

        # Extract information from file.  The title is its first line; group
        # and file order come from <!--GROUP=n,name--> and <!--ORDER=n--> lines.
        dict set fileTitle $tail [lindex [regexp -inline -line {.*} $data] 0]
        if {![regexp -line {^<!--GROUP=(\d+),(.*)-->$} $data _ order group]} {
            error "group definition not found in $tail"
        } elseif {![regexp -line {^<!--ORDER=(\d+)-->$} $data _ fileOrder]} {
            error "file order not found in $tail"
        }

        # Save group ordering.  A group order number names exactly one group.
        if {![dict exists $orderGroup $order]} {
            dict set orderGroup $order $group
        } elseif {[dict get $orderGroup $order] ne $group} {
            error "inconsistent group names \"$group\" and\
                    \"[dict get $orderGroup $order]\" for group order $order"
        }

        # Add file to group.  A file order number is unique within its group.
        if {![dict exists $groupFiles $group $fileOrder]} {
            dict set groupFiles $group $fileOrder $tail
        } else {
            error "file order $fileOrder in group \"$group\" used for both\
                    \"$tail\" and \"[dict get $groupFiles $group $fileOrder]\""
        }
    }
}

# Create documentation index file and header.  Write UTF-8 with LF line
# endings; a binary channel would keep only the low byte of each non-ASCII
# character in the titles.  The blank line after <!--TOC--> is required: the
# TOC substitution performed later replaces every non-empty line that follows
# the marker, and without the blank line it would delete the search form.
set chan [open [file join $dir index.md] w]
chan configure $chan -encoding utf-8 -translation lf
chan puts $chan\
{Documentation Index
===================
<!--This file is generated by docindex.tcl-->
<!--TOC-->

# Search Documentation
<form action="$ROOT/docsrch" method="GET">
<input type="text" name="s" size="40" autofocus="1">
<input type="submit" value="Search">
</form>}

# Process each group in order.
foreach {_ group} [lsort -integer -stride 2 -index 0 $orderGroup] {
    # Emit the group heading.
    chan puts $chan "\n# $group\n"

    # Emit the document list in order.
    foreach {_ tail} [lsort -integer -stride 2 -index 0\
            [dict get $groupFiles $group]] {
        chan puts $chan "- \[[dict get $fileTitle $tail]\]($tail)"
    }
}

# Add version selection and Vim modeline, then close the documentation index.
chan puts $chan {
# Alternate Documentation Versions
- [Most recent check-in on trunk](/doc/trunk/doc/)
- [Most recent check-in on any branch](/doc/tip/doc/)
- [Current checkout](/doc/ckout/doc/)
<!-- vim: set sts=4 sw=4 tw=80 et ft=markdown: -->}
chan close $chan
# Link back to table of contents, appended to every rewritten heading.
set top\
    {<a href="#table_of_contents" style="font-size:small;float:right">[top]</a>}

# Process each documentation file: rewrite its first- and second-level "#"
# headings and insert (or, with -notoc, remove) the table of contents that
# follows the <!--TOC--> marker.
foreach file [glob -directory $dir *.md] {
    # Read Markdown file.  Decode as UTF-8 to match the encoding used when the
    # file is written back below.
    set chan [open $file]
    chan configure $chan -encoding utf-8
    set data [chan read $chan]
    chan close $chan

    # Skip the file if it does not contain a TOC.
    if {[string first <!--TOC--> $data] < 0} {
        continue
    }

    # Identify all code blocks fenced by "```" lines.  Permit the opening code
    # fence to be followed by other characters, e.g. syntax mode name.
    set fences [regexp -indices -line -inline -all {^```(?:.|\n)*?^```$} $data]

    # Initialize dict listing all existing anchors, used to avoid duplication.
    set anchors {table_of_contents {}}

    # Build the TOC to contain all first- and second-level headings.  Consider
    # only headings using "#" and "##" marks, not underlines, and skip headings
    # that appear inside fenced code blocks.
    #   start       index at which the next heading search begins
    #   offset      net change in the length of data caused by rewritten
    #               headings; converts the original fence indices to current
    #   oldHeading  marks of the previous heading that was put in the TOC
    set start 0
    set offset 0
    set toc <!--TOC-->
    set oldHeading "#"
    if {!$notoc} {
        append toc "\n<span id=\"table_of_contents\"></span><div class=\"toc\"><ul>"
    }
    while {[regexp -indices -line -start $start {^##?[^#].*} $data match]} {
        # Place the start and end indices in their own variables.
        lassign $match match0 match1

        # Skip this match if it starts within a fenced code block.
        set skip 0
        foreach fence $fences {
            if {$match0 >= [lindex $fence 0] + $offset
             && $match0 <= [lindex $fence 1] + $offset} {
                set skip 1
                break
            }
        }
        if {$skip} {
            set start [expr {$match1 + 1}]
            continue
        }

        # Get line from input.
        set line [string range $data $match0 $match1]

        # Get heading level.
        regexp {^(#*)(.*)} $line _ heading line

        # Strip anchor tag, TOC link, and enclosing whitespace left by an
        # earlier run of this script.
        regsub {^ <span id=".*"></span>} $line {} line
        regsub { <a href="#.*".*} $line {} line
        set line [string trim $line]

        if {$notoc} {
            # Strip anchor and TOC link from the section header line.
            set line "$heading $line"
        } else {
            # Extract the title and convert to HTML and plain text.
            lassign [parseMarkdown [string trim $line]] html anchor

            # Compute unique anchor name from the lowercased plain text, adding
            # a numeric suffix when the name is already taken.
            set anchor [regsub -all {\W+} [string tolower $anchor] _]
            set anchor [encHex [string trim $anchor _]]
            if {[dict exists $anchors $anchor]} {
                for {set i 1} {[dict exists $anchors $anchor\_$i]} {incr i} {}
                append anchor _$i
            }
            dict set anchors $anchor {}

            # Build table of contents.  Open a nested list when going from "#"
            # to "##" and close it when going back to "#".
            if {$heading eq $oldHeading} {
                append toc \n
            } elseif {$heading eq "#"} {
                append toc </ul>\n
            } else {
                append toc \n<ul>
            }
            set oldHeading $heading
            append toc "<li><a href=\"#" $anchor \"> $html </a></li>

            # Add an anchor and a TOC link to the section header line.
            set line "$heading <span id=\"$anchor\"></span> $line $top"
        }

        # Replace the section header line.
        set data [string replace $data $match0 $match1 $line]

        # Update the start index and offset.
        set start [expr {[string length $line] + $match0}]
        incr offset [expr {$start - $match1 - 1}]
    }
    if {!$notoc} {
        # A document whose last heading is "##" still has its nested list open.
        # Close it on the same line so the TOC contains no empty line.
        if {$oldHeading eq "##"} {
            append toc </ul>
        }
        append toc \n</ul></div>
    }

    # Compute the new file contents before reopening the file, so that an error
    # here cannot leave the file truncated.  The substitution replaces the
    # marker and every non-empty line after it, up to the first blank line.
    set output [regsub -line {^<!--TOC-->$(?:\n.+$)*\n$}\
            $data [string map {& \\& \\ \\\\} $toc]\n]

    # Write Markdown file with the table of contents inserted or removed.  Use
    # UTF-8 with LF line endings; a binary channel would keep only the low byte
    # of each non-ASCII character.
    set chan [open $file w]
    chan configure $chan -encoding utf-8 -translation lf
    chan puts -nonewline $chan $output
    chan close $chan
}
# vim: set sts=4 sw=4 tw=80 et ft=tcl:
I thought I had posted about this on the mailing list, but when I searched, I could only find that I'd sent it to individual users.