Fossil User Forum: Automatic TOC, paragraph numbering, and header IDs

(10.1) By andygoth on 2020-09-15 20:03:16 edited from 10.0 in reply to 1 [source]

Here's what I've been doing for years. Maybe you might find some inspiration.
#!/usr/bin/env tclsh

# Load required packages.
package require Tcl 8.6

# encHex --
# Encode by substituting most non-alphanumerics with hexadecimal codes.
proc encHex {str {pattern {[^-^,./'=+|!$\w]}}} {
    set pos 0
    while {[regexp -indices -start $pos $pattern $str range]} {
        binary scan [string range $str {*}$range] H2 char
        set str [string replace $str {*}$range %$char]
        set pos [expr {[lindex $range 0] + 3}]
    }
    return $str
}

# parseMarkdown --
# Basic Markdown parser that supports only a small subset of Markdown.
proc parseMarkdown {str} {
    # Initialize intermediate and result variables, then invoke parser.
    set index 0
    set html {}
    set plain {}
    apply {{{emph {}}} {
        # Get access to caller variables.
        upvar 1 str str index index html html plain plain

        # Helper procedure that appends a literal string to the result.
        set literal {apply {{{count 1}} {
            if {$count > 0} {
                upvar 1 str str html html plain plain index index
                set start $index
                incr index $count
                if {$index > [string length $str]} {
                    set index [string length $str]
                }
                set chunk [string range $str $start [expr {$index - 1}]]
                append html [string map {
                    < &lt; > &gt; & &amp; _ \\_ * \\* ` \\` [ \\[ ] \\]
                } $chunk]
                append plain $chunk
            }
        }}}

        # Find the next supported Markdown introductory metacharacter.
        while {[regexp -start $index -indices {[<&`*_\\]} $str match]} {
            # Emit literal text since the start or the prior sequence.
            {*}$literal [expr {[lindex $match 0] - $index}]

            # Process this special sequence according to its first character.
            switch [string index $str $index] {
            < - & {
                # Directly emit literal HTML tags and entities.
                if {[regexp -start $index -expanded {
                    \A<(?:[^'\">]+|'[^']*'|\"[^\"]*\")+>    # HTML tag
                   |\A&[^;]+;                               # HTML entity
                } $str match]} {
                    # Parse success.  Emit the HTML.
                    append html $match
                    incr index [string length $match]
                } else {
                    # Parse error.  Emit the character literally.
                    {*}$literal
                }
            } ` {
                # Emit backtick-quoted text.
                append html <code>
                if {[regexp -start $index {\A`.*?`} $str match]} {
                    # Parse success.  Emit the backtick-quoted text literally.
                    incr index
                    {*}$literal [expr {[string length $match] - 2}]
                    incr index
                } else {
                    # Parse failure.  Emit the backtick literally.
                    {*}$literal
                }
                append html </code>
            } * - _ {
                # Emit emphasized text.
                if {$emph eq {}} {
                    # Beginning of emphasis.  Recurse into the parser.
                    regexp -start $index {\A(.)\1{0,2}} $str nest
                    incr index [string length $nest]
                    switch [string length $nest] {
                        1 {append html <em>}
                        2 {append html <strong>}
                        3 {append html <strong><em>}
                    }
                    {*}[info level 0] $nest
                } elseif {$emph eq [string range $str $index\
                        [expr {$index + [string length $emph] - 1}]]} {
                    # End of emphasis.  Stop the recursive parser.
                    switch [string length $emph] {
                        1 {append html </em>}
                        2 {append html </strong>}
                        3 {append html </em></strong>}
                    }
                    incr index [string length $emph]
                    return
                } else {
                    # Parse error.  Emit the character literally.
                    {*}$literal
                }
            } \\ {
                # Emit the backslashed character literally.
                incr index
                {*}$literal
            }}
        }

        # Emit literal text following the final sequence.
        {*}$literal [expr {[string length $str] - $index}]
    }}

    # Return the HTML and plain text.
    list $html $plain
}

# Check command-line arguments.
if {![llength $argv]} {
    set notoc 0
} elseif {[llength $argv] > 1 || [lindex $argv 0] ne "-notoc"} {
    chan puts stderr "Usage: [file tail $argv0] ?-notoc?"
} else {
    set notoc 1
}

# Find documentation directory.
set dir [file join [file dirname [info script]] doc]

# Read titles and other information from all input files except index.md.
set fileTitle {}
set orderGroup {}
set groupFiles {}
foreach file [glob -directory $dir *.md] {
    if {[set tail [file tail $file]] ne "index.md"} {
        # Read file.
        set chan [open $file]
        set data [chan read $chan]
        chan close $chan

        # Extract information from file.
        dict set fileTitle $tail [lindex [regexp -inline -line {.*} $data] 0]
        if {![regexp -line {^<!--GROUP=(\d+),(.*)-->$} $data _ order group]} {
            error "group definition not found in $tail"
        } elseif {![regexp -line {^<!--ORDER=(\d+)-->$} $data _ fileOrder]} {
            error "file order not found in $tail"
        }

        # Save group ordering.
        if {![dict exists $orderGroup $order]} {
            dict set orderGroup $order $group
        } elseif {[dict get $orderGroup $order] ne $group} {
            error "inconsistent group names \"$group\" and\
                    \"[dict get $orderGroup $order]\" for group order $order"
        }

        # Add file to group.
        if {![dict exists $groupFiles $group $fileOrder]} {
            dict set groupFiles $group $fileOrder $tail
        } else {
            error "file order $fileOrder in group \"$group\" used for both\
                    \"$tail\" and \"[dict get $groupFiles $group $fileOrder]\""
        }
    }
}

# Create documentation index file and header.
set chan [open [file join $dir index.md] wb]
chan puts $chan\
{Documentation Index
===================
<!--This file is generated by docindex.tcl-->
<!--TOC-->

# Search Documentation

<form action="$ROOT/docsrch" method="GET">
  <input type="text" name="s" size="40" autofocus="1">
  <input type="submit" value="Search">
</form>}

# Process each group in order.
foreach {_ group} [lsort -integer -stride 2 -index 0 $orderGroup] {
    # Emit the group heading.
    chan puts $chan "\n# $group\n"

    # Emit the document list in order.
    foreach {_ tail} [lsort -integer -stride 2 -index 0\
            [dict get $groupFiles $group]] {
        chan puts $chan "- \[[dict get $fileTitle $tail]\]($tail)"
    }
}

# Add version selection and Vim modeline, then close the documentation index.
puts $chan {
# Alternate Documentation Versions

- [Most recent check-in on trunk](/doc/trunk/doc/)
- [Most recent check-in on any branch](/doc/tip/doc/)
- [Current checkout](/doc/ckout/doc/)

<!-- vim: set sts=4 sw=4 tw=80 et ft=markdown: -->}
chan close $chan

# Link back to table of contents.
set top\
{<a href="#table_of_contents" style="font-size:small;float:right">[top]</a>}

# Process each documentation file.
foreach file [glob -directory $dir *.md] {
    # Read Markdown file.
    set chan [open $file]
    set data [chan read $chan]
    chan close $chan

    # Skip the file if it does not contain a TOC.
    if {[string first <!--TOC--> $data] < 0} {
        continue
    }

    # Identify all code blocks fenced by "```" lines.  Permit the opening code
    # fence to be followed by other characters, e.g. syntax mode name.
    set fences [regexp -indices -line -inline -all {^```(?:.|\n)*?^```$} $data]

    # Initialize dict listing all existing anchors, used to avoid duplication.
    set anchors {table_of_contents {}}

    # Build the TOC to contain all first- and second-level headings.  Consider
    # only headings using "#" and "##" marks, not underlines, and skip headings
    # that appear inside fenced code blocks.
    set start 0
    set offset 0
    set toc <!--TOC-->
    set oldHeading #
    if {!$notoc} {
        append toc "\n<span id=\"table_of_contents\"></span><div class=\"toc\"><ul>"
    }
    while {[regexp -indices -line -start $start {^##?[^#].*} $data match]} {
        # Place the start and end indices in their own variables.
        lassign $match match0 match1

        # Skip this match if it starts within a fenced code block.
        set skip 0
        foreach fence $fences {
            if {$match0 >= [lindex $fence 0] + $offset
             && $match0 <= [lindex $fence 1] + $offset} {
                set skip 1
                break
            }
        }
        if {$skip} {
            set start [expr {$match1 + 1}]
            continue
        }

        # Get line from input.
        set line [string range $data $match0 $match1]

        # Get heading level.
        regexp {^(#*)(.*)} $line _ heading line

        # Strip anchor tag, TOC link, and enclosing whitespace.
        regsub {^ <span id=".*"></span>} $line {} line
        regsub { <a href="#.*".*} $line {} line
        set line [string trim $line]

        if {$notoc} {
            # Strip anchor and TOC link from the section header line.
            set line "$heading $line"
        } else {
            # Extract the title and convert to HTML and plain text.
            lassign [parseMarkdown [string trim $line]] html anchor

            # Compute unique anchor name.
            set anchor [regsub -all {\W+} [string tolower $anchor] _]
            set anchor [encHex [string trim $anchor _]]
            if {[dict exists $anchors $anchor]} {
                for {set i 1} {[dict exists $anchors $anchor\_$i]} {incr i} {}
                append anchor _$i
            }
            dict set anchors $anchor {}

            # Build table of contents.
            if {$heading eq $oldHeading} {
                append toc \n
            } elseif {$heading eq "#"} {
                append toc </ul>\n
            } else {
                append toc \n<ul>
            }
            set oldHeading $heading
            append toc "<li><a href=\"#" $anchor \"> $html </a></li>

            # Add an anchor and a TOC link to the section header line.
            set line "$heading <span id=\"$anchor\"></span> $line $top"
        }

        # Replace the section header line.
        set data [string replace $data $match0 $match1 $line]

        # Update the start index and offset.
        set start [expr {[string length $line] + $match0}]
        incr offset [expr {$start - $match1 - 1}]
    }
    if {!$notoc} {
        append toc \n</ul></div>
    }

    # Write Markdown file with the table of contents inserted or removed.
    set chan [open $file wb]
    chan puts -nonewline $chan [regsub -line {^<!--TOC-->$(?:\n.+$)*\n$}\
            $data [string map {& \\& \\ \\\\} $toc]\n]
    chan close $chan
}

# vim: set sts=4 sw=4 tw=80 et ft=tcl:
I thought I posted about this on the mailing list, but when I searched I could only find I'd sent it to individual users.