#!/usr/bin/perl -w
$ID = q$Id: faq2html,v 1.36 2021/03/28 02:28:50 eagle Exp $;
#
# faq2html -- Convert some particular text formats into XHTML.
#
# Copyright 1999-2002, 2004-2005, 2008, 2010, 2013-2014, 2021
#     Russ Allbery
#
# This program is free software; you may redistribute it and/or modify it
# under the same terms as Perl itself.
#
# This program is an ad hoc set of heuristics and tricks, attempting to
# convert a few text file formats that I commonly use into reasonable XHTML.
# It's my opinion that general text to XHTML conversions is impossible due to
# the huge number of differing formats used by different people when writing
# text; this doesn't try to solve the general problem.  Rather, it's good
# enough to turn the FAQs I maintain into XHTML documents, which is all that I
# need of it.

##############################################################################
# Modules and declarations
##############################################################################

require 5.003;

use strict;
use vars qw($BUFFER $ID $IN $INDENT @INDENT @MONTHS $OUT %STATE $USEVALUE $WS);

use Getopt::Long qw(GetOptions);

# Replace with the month names you want to use, if you don't want English.
@MONTHS = qw(January February March April May June July August September
             October November December);

##############################################################################
# Utility functions
##############################################################################

# Turns section numbers at the beginning of lines in a paragraph into links.
# The link target is "#S" followed by the section number, which matches the
# name="S..." anchors that the main loop wraps around numbered headings.
sub contents {
    local $_ = shift;
    s%^(\s*([\d.]+)[.\)]\s+)(.*?)([ \t]*\n)%$1<a href="#S$2">$3</a>$4%gm;
    $_;
}

# Removes an initial bullet on a paragraph, replacing it with spaces.
sub debullet { local $_ = shift; s/(\s*)[-*o](\s)/$1 $2/; $_ }

# Unescape &, <, and > characters.  The ampersand is decoded last so that an
# escaped entity such as "&amp;lt;" comes back as "&lt;" rather than "<".
sub deescape {
    local $_ = shift;
    s/&gt;/>/g;
    s/&lt;/</g;
    s/&amp;/&/g;
    $_;
}

# Removes an initial number on a paragraph, replacing it with spaces so that
# the indentation of the remaining text is unchanged.
# NOTE(review): this sub was missing from the file although the main loop
# calls it; the body is modelled on debullet -- confirm against upstream.
sub denumber {
    local $_ = shift;
    s/^(\s*)(\d+[.\)])(\s)/$1 . ' ' x length ($2) . $3/e;
    $_;
}

# Removes the rule (a line of -, =, or ~) underlining a heading.
# NOTE(review): this sub was missing from the file although the main loop
# calls it; the pattern mirrors the underline accepted by is_heading --
# confirm against upstream.
sub derule {
    local $_ = shift;
    s/\n[-=~]+\s*$/\n/;
    $_;
}

# Turns *some text* into <strong>some text</strong>, while trying to be
# careful to avoid other uses of wildcards.
sub embolden {
    local $_ = shift;

    # Only treat *...* as emphasis when the opening star follows whitespace
    # (or starts the string) and the closing star is followed by punctuation
    # or whitespace, so that shell wildcards and the like are left alone.
    s%(^|\s)\*(\w.*?\S)\*([,.!?;\s])%$1<strong>$2</strong>$3%gs;
    $_;
}

# Escapes &, <, and > characters found in a string.  The ampersand must be
# escaped first so that the entities added afterwards are not re-escaped.
sub escape {
    local $_ = shift;
    s/&/&amp;/g;
    s/</&lt;/g;
    s/>/&gt;/g;
    $_;
}

# Returns the length of the indentation of a line or paragraph.
sub indent { $_[0] =~ /^(\s*)/; length $1 }

# Returns the number of lines in a paragraph, not counting trailing blanks.
sub lines { local $_ = shift; s/\s+$/\n/; tr/\n// }

# Returns a nicely formatted "Last modified" string from an RCS/CVS Id.  The
# third and fourth whitespace-separated fields of the Id are taken to be the
# revision and the date (year, month, day separated by / or -).
sub modified_id {
    my $id = shift;
    my ($version, $date) = (split (' ', $id))[2,3];
    my ($year, $month, $day) = split (m%[/-]%, $date);
    $day =~ s/^0//;
    my $revision = ($version =~ /\./) ? " (revision $version)" : '';
    'Last modified '. $MONTHS[$month - 1] . ' ' . $day . ', ' . $year
        . $revision;
}

# The same, but from a UNIX timestamp.
sub modified_timestamp {
    my $timestamp = shift;
    my ($year, $month, $day) = (localtime $timestamp)[5, 4, 3];
    $year += 1900;
    'Last modified ' . $MONTHS[$month] . ' ' . $day . ', ' . $year;
}

# Output some text to the $OUT file handle, adding any preserved whitespace
# from before first, but after any closing tags.
# NOTE(review): the closing-tag pattern was reconstructed from a damaged
# regex; confirm it against upstream.
sub output {
    local $_ = shift;
    if ($WS) {
        s%^(\s*(?:</[^>]+>\s*)*)%$1$WS%;
        $WS = '';
    }
    print $OUT $_, @_;
}

# Read a paragraph in from $IN.  If no argument is given or the argument is
# false, lines with nothing but whitespace are paragraph dividers; otherwise,
# only a completely blank line is a divider.  Use $BUFFER to store the
# unwanted next line.
sub slurp {
    my $ws = shift;
    my $p;
    local $_;
    $p = $BUFFER || '';

    # Body of the paragraph, up to (not including) the first divider line.
    $p .= $_ while (defined ($_ = <$IN>) && ($ws ? !/^$/ : /\S/));

    # The divider line itself, then any further whitespace-only lines.
    $p .= $_ if defined;
    $p .= $_ while (defined ($_ = <$IN>) && /^\s*$/);

    # Whatever stopped the loop starts the next paragraph (undef at EOF).
    $BUFFER = $_;
    $p;
}

# Remove all whitespace in a string.
sub smash { local $_ = shift; s/\s//g; $_ }

# Strip a number of characters of indentation from a line that's given by the
# second argument, returning the result.  Used to strip leading indentation
# off of
# <pre> text so that it isn't indented excessively just because in the
# text version it had to be indented relative to the surrounding text.
sub strip_indent {
    my ($text, $indent) = @_;

    # Nothing to remove when no positive indentation was supplied.
    return $text unless defined $indent && $indent > 0;

    # Drop exactly $indent leading spaces from every line that has them.
    $text =~ s/^ {$indent}//gm;
    return $text;
}

# Undoes HTML character escapes for &, <, and >.  The ampersand is decoded
# last so that a doubly escaped entity ("&amp;lt;") is only decoded once.
sub unescape {
    local $_ = shift;
    s/&lt;/</g;
    s/&gt;/>/g;
    s/&amp;/&/g;
    $_;
}

# Blank out a constant quoting prefix at the start of each line of a
# paragraph.  The prefix and the whitespace after it are replaced by the same
# number of spaces, so the text keeps its column position.
sub unquote {
    my ($string, $quote) = @_;
    my $prefix = qr/\Q$quote\E\s+/;
    $string =~ s{((?:^|\n)\s*)($prefix)}{$1 . (' ' x length $2)}ge;
    return $string;
}

# Replace tabs with spaces, assuming tab stops every eight columns.
sub untabify {
    local $_ = shift;

    # Expand the first run of tabs on a line, then repeat until none are
    # left.  $1 is the text before the tabs; it has to be put back in front
    # of the padding, and its length decides how far the first tab reaches.
    1 while s/^(.*?)(\t+)/$1 . ' ' x (length ($2) * 8 - length ($1) % 8)/me;
    $_;
}

# Given a special-character-escaped URL, wrap <a href> to that URL around
# it.  Remove a leading mailto: or news: in the link text.  The link target
# is the unescaped URL with all whitespace removed; the link text keeps the
# angle brackets (as entities) that surrounded the URL in the source text.
sub url {
    my $link = shift;
    my $text = $link;
    $link = smash (unescape $link);
    $text =~ s/^(?:mailto|news)://;
    '<a href="' . $link . '">&lt;' . $text . '&gt;</a>';
}

# Looks for a URL in <URL:...> or <...> form, with or without the URL: part,
# and wraps a link around it.  Body text is passed through escape before it
# gets here, so the angle brackets normally arrive as &lt; and &gt;; the raw
# form is accepted too for callers that pass unescaped text.
sub urlize {
    my $text = shift;
    $text =~ s%&lt;(?:URL:)?([a-z]{2,}:.+?)&gt;|<(?:URL:)?([a-z]{2,}:.+?)>%
        url (defined $1 ? $1 : $2)
    %gex;
    $text;
}

# Return a copy of the string with leading and trailing whitespace removed.
sub whitechomp {
    my ($string) = @_;
    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    return $string;
}

##############################################################################
# Identification functions
##############################################################################

# Expects a paragraph, returns whether it consists entirely of bullet items.
# The first line sets the bullet prefix; every other non-blank line must
# start either with that prefix or with the same prefix with the bullet
# character blanked out (a continuation line).  Paragraphs holding only one
# bullet return false, since those are handled separately so that they can be
# wrapped in paragraph tags.
sub is_allbullet {
    my ($para) = @_;
    my @lines = split ("\n", $para);
    return unless $lines[0] =~ /^(\s*[-*o]\s)\S/;
    my $bullet = $1;
    (my $space = $bullet) =~ s/[-*o]/ /;

    my $bullets = 0;
    for my $line (@lines) {
        next unless $line =~ /\S/;
        return unless $line =~ /^(?:\Q$bullet\E|\Q$space\E)\S/;
        $bullets++ if $line =~ /^\Q$bullet\E/;
    }
    return $bullets > 1;
}

# Expects a paragraph, returns whether it is made up of at least two lines
# that each start with a one- or two-digit number followed by "." or ")".
sub is_allnumbered {
    my ($para) = @_;
    return $para =~ /^(\s*\d\d?[.\)]\s.*\n){2,}\s*$/;
}

# Expects a paragraph, returns whether it contains nothing but capital
# letters, digits, whitespace, and a small set of punctuation characters.
sub is_allcaps {
    my ($para) = @_;
    return $para !~ m%[^A-Z0-9\s\"\(\),:.!/?-]%;
}

# Expects a paragraph, returns whether it looks like a series of deliberately
# short lines or a series of lines without internal space.  The last line is
# ignored, and a single-line paragraph is never considered broken.
sub is_broken {
    my ($para) = @_;
    $para =~ s/\s+$/\n/;
    my @lines = split ("\n", $para);
    return if @lines == 1;
    pop @lines;

    # Any line under 40 characters is enough on its own.
    my $tiny = grep { length ($_) < 40 } @lines;
    return 1 if $tiny;

    # Otherwise more than half the lines must be under 60 characters, or
    # every line must be a single word.
    my $short      = grep { length ($_) < 60 } @lines;
    my $many_short = $short >= int (@lines / 2) + 1;
    return $many_short || $para =~ /^(?:\s*\S+[ \t]*\n)+$/;
}

# Expects a paragraph, returns whether it starts with a bullet (-, *, or o)
# followed by whitespace.
sub is_bullet {
    my ($para) = @_;
    return $para =~ /^\s*[-*o]\s/;
}

# Expects a line, returns whether it's centered (in 74 columns): twice the
# leading whitespace plus the length of the text has to come within one
# column of 74.  The leading whitespace must also expand to at least eight
# columns so that ordinary paragraph lines are not caught by accident.
sub is_centered {
    my ($line) = @_;
    return $line =~ /^(\s+)(.+)/
        && abs (74 - length ($2) - 2 * length ($1)) < 2
        && length (untabify ($1)) >= 8;
}

# Expects a paragraph, returns whether it looks like a content listing: every
# line starts with a dotted section number followed by "." or ")" and a space
# or tab.
sub is_contents {
    my ($para) = @_;
    return $para =~ /^(?:\s*[\d.]+[.\)][ \t].*\n)+\s*$/;
}

# Expects a paragraph, returns whether it looks like a title and description:
# one or more lines at one indentation (the titles) followed by one or more
# lines at a second, deeper indentation (the description).
sub is_description {
    my ($para) = @_;
    return $para =~ /^(\s*)\S.*\n(?:\1\S.*\n)*(\s+)\S.*\n(?:\2\S.*\n)*\s*$/
        && length ($1) < length ($2);
}

# Expects a paragraph, returns whether it's a digest divider: exactly thirty
# dashes, optionally followed by whitespace.
sub is_divider {
    my ($para) = @_;
    return $para =~ /^-{30}\s*$/;
}

# Expects a line, returns whether it's a mail/news header: a name made of
# word characters and dashes at the start of the line, a colon, and
# whitespace.
sub is_header {
    my ($line) = @_;
    return $line =~ /^[\w-]+:\s/;
}

# Expects a paragraph, returns whether it's a heading.  This is heuristics
# all the way down, and plenty of other things can be mistaken for headings,
# so the tests are conservative:
#
#  * A single line outdented from the baseline is a heading if it has
#    internal whitespace or is shorter than 30 characters.
#  * A short line no more indented than the current indentation is a heading
#    if it is underlined with a rule or is written in all caps.
#  * Before any baseline has been seen, any such short line is accepted.
#  * Inside a contents block, numbered lines are never headings unless they
#    are outdented.
sub is_heading {
    my $para      = deescape (shift);
    my $indent    = indent ($para);
    my $nobase    = !defined $STATE{baseline};
    my $outdented = defined ($STATE{baseline}) && $indent < $STATE{baseline};

    # Table-of-contents entries look a lot like numbered headings.
    if (!$outdented && $STATE{contents} && $para =~ /^[\d.]+[.\)]\s/) {
        return;
    }

    my $even = !defined ($INDENT) || $indent <= $INDENT;
    return (
        ($outdented && lines ($para) == 1
            && ($para =~ /\S\s\S/ || length ($para) < 30))
        || ($even
            && $para =~ m%^\s*[ \w\"\(\),:./&-]{0,30}[\w\"\)]\s*\n[-=~]+\s*$%)
        || ($even
            && $para =~ m%^\s*[ A-Z0-9\"\(\),:./&-]{0,30}[A-Z0-9\"\)]\s*\n$%)
        || ($even && $nobase
            && $para =~ m%^\s*[ \w\"\(\),:./&-]{0,33}[\w\"\)]\s*\n$%)
    );
}

# Expects a line, returns whether it's an expanded RCS/CVS Id string: "$Id: "
# followed by anything and a closing "$", with only whitespace around it.
sub is_id {
    my ($line) = @_;
    return $line =~ /^\s*\$Id\: .*\$\s*$/;
}

# Expects a paragraph, returns whether it appears to have internal
# whitespace: some line has, after its first non-blank character, a tab,
# three spaces, or two spaces not preceded by sentence-ending punctuation,
# with more text following.
sub is_literal {
    my ($para) = @_;
    return $para =~ /^[ \t]*\S.*(?:[^.?!\"\)\]:*_\n]  |   |\t)\S/m;
}

# Expects a paragraph, returns the leading one- or two-digit number if the
# paragraph starts with a number followed by "." or ")" and whitespace, and
# undef otherwise.
sub is_numbered {
    my ($para) = @_;
    my $matched = $para =~ /^\s*(\d\d?)[.\)]\s/;
    return $matched ? $1 : undef;
}

# Expects a paragraph, returns true if the paragraph has inconsistent
# indentation, meaning that not every line starts with the same leading
# whitespace as the first.
sub is_offset {
    my ($para) = @_;

    # A leading number or bullet counts as indentation, so that an item
    # whose continuation lines line up with its text still looks consistent.
    $para =~ s/^(\s*(?:\d\d?)[.\)]\s)/' ' x length ($1)/e;
    $para =~ s/^(\s*[-*o]\s)/' ' x length ($1)/e;

    return $para !~ /^(\s*)\S.*\n(\1\S.*\n)*\s*$/;
}

# Expects a paragraph, returns the quote character if every line starts with
# the same punctuation character (optionally indented) and a false value
# otherwise.  The paragraph has to be at least two lines long.
sub is_quoted {
    my ($para) = @_;
    return $para =~ /^\s*([^\w\s\"\'])\s*.*\n(\s*\1\s*.*\n)+$/ && $1;
}

# Expects a paragraph, returns whether it's a rule: it starts (after optional
# whitespace) with - or = and holds nothing else but those two characters and
# whitespace.
sub is_rule {
    my ($para) = @_;
    return $para =~ /^\s*[-=][-=\s]*$/;
}

# Expects a paragraph, returns 1 if it ends with a sentence and 0 otherwise.
# A paragraph counts if it ends in ., ?, or ! (optionally followed by a
# closing parenthesis, bracket, or quote), or if it ends in a word followed
# by a colon.  As a special case, a URL counts as a sentence so that we don't
# wrap <pre> around URLs.
sub is_url;
sub is_sentence {
    my ($para) = @_;
    my $sentence = $para =~ /\S[.?!][\)\]\"]?\s*$/
        || $para =~ /^\s*\w.*\s\S+:\s*$/
        || is_url ($para);
    return $sentence ? 1 : 0;
}

# Expects a paragraph, returns whether it's the start of a signature block,
# which is a paragraph whose first line is exactly "-- " (dash, dash, space).
sub is_signature {
    my ($para) = @_;
    return $para =~ /^-- \n/;
}

# Expects a paragraph, returns whether it's a simple indented URL (already
# converted into a real link).  Two forms are accepted: a single <a ...>
# element whose link text contains no whitespace, and a bare <...> token,
# which is what the previous pattern matched.
sub is_url { $_[0] =~ m%^\s*(?:<a\s[^>]*>\S+</a>|<\S+>)\s*$% }

##############################################################################
# HTML constructors
##############################################################################

# Returns the XML declaration and DTD for XHTML.  With $USEVALUE set we claim
# "transitional" XHTML 1.0 compliance; we can't claim strict in that case
# solely because we use the value attribute in <li> in the absence of
# widespread implementation of CSS Level 2.  Otherwise we claim strict.
sub dtd {
    if ($USEVALUE) {
        qq(<?xml version="1.0" encoding="utf-8"?>\n)
            . qq(<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">)
            . "\n";
    } else {
        qq(<?xml version="1.0" encoding="utf-8"?>\n)
            . qq(<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">)
            . "\n";
    }
}

# An XML comment.  All arguments are concatenated and wrapped in the comment
# delimiters; the text is not checked for "--".
sub comment {
    my @data = @_;
    my $data = join ('', @data);
    '<!-- ' . $data . ' -->';
}

# The character set for the page; we assume UTF-8 for all pages.  Returns a
# <meta> element suitable for inclusion in <head>.
sub charset {
    qq(<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />);
}

# A link to a CSS style sheet.  Takes the URL of the style sheet, which is
# used as given (it is not escaped here).
sub style {
    my $style = shift;
    qq(<link rel="stylesheet" href="$style" type="text/css" />);
}

# The initial <html> tag, which is a bit complicated for XHTML since it has
# to carry the XHTML namespace.  Assume English output.
sub html {
    qq(<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">);
}

# Wrap a container around data, keeping the tags on the same line.  The first
# argument is the tag, optionally followed by a space and attributes; the
# attributes are left off the closing tag.  The closing tag is placed before
# any trailing whitespace in the data.
sub container {
    my ($tag, @data) = @_;
    my $data = join ('', @data);
    $data = '<' . $tag . '>' . $data;
    $tag =~ s/ .*//;
    $data =~ s%(\s*)$%</$tag>$1%;
    $data;
}

# Output a list item.  Takes the indentation, the item, and an optional third
# argument, which if specified is the number to use for the item (using the
# value attribute, which for some reason is deprecated under HTML 4.0 without
# any viable alternative for what I use it for).  The number is only used
# when $USEVALUE is set.
#
# If the innermost open structure element in @INDENT is a list item, it is
# closed first.  The new item is then pushed onto @INDENT and stays open.
sub li {
    my ($indent, $data, $value) = @_;
    $indent = 0 unless defined $indent;
    my $output = '';
    if (@INDENT && $INDENT[0][0] eq 'li') {
        $output .= "</li>\n";
        shift @INDENT;
    }
    unshift (@INDENT, [ 'li', $indent ]);
    undef $value unless $USEVALUE;
    my $tag = defined $value ? qq(<li value="$value">\n) : "<li>\n";
    $output . $tag . $data;
}

# Wrap a container around data, putting the opening and closing tags on lines
# of their own.  A newline is added to the data if it doesn't already end
# with one.  Attributes after a space in the tag are left off the closing
# tag.
sub paragraph {
    my ($tag, @data) = @_;
    my $data = join ('', @data);
    $data .= "\n" unless ($data =~ /\n$/);
    (my $close = $tag) =~ s/ .*//;
    '<' . $tag . ">\n" . $data . '</' . $close . ">\n";
}

# Multiparagraph structure is maintained based on indentation level.  The
# global variable @INDENT holds a stack of pairs of block elements and their
# corresponding indentation levels.  The possible structure elements are dl,
# dd, ul, ol, li, and blockquote.
#
# This function is used to start or end block structure elements.  It closes
# any pending open structure elements with an indent level greater than the
# indentation level given, and then closes any open structure elements with an
# indentation level equal to the one given if a new structure element is given
# and it is different than the open one.  Then, if a structure element is
# given, open a new block structure element with that indentation.
#
# One can pass attributes in for the opening tag; anything after a space will
# be stripped out for determining the close tag.
#
# Takes the indentation (default 0), an optional tag, and optional data to
# append after the opening tag.  Returns the markup to output.
sub start {
    my ($indent, $tag, $data) = @_;
    $indent = 0 unless defined $indent;

    # $e is the bare element name.  It is what goes on the stack, so that
    # the closing tag and the comparisons below never see attributes.
    my $e = $tag || '';
    $e =~ s/ .*//;
    $data = '' unless $data;
    my $output = '';
    while (@INDENT) {
        last if ($INDENT[0][1] < $indent);
        last if ($tag && $INDENT[0][1] == $indent && $INDENT[0][0] eq $e);
        last if ($INDENT[0][1] == $indent && !$tag && $INDENT[0][0] ne 'dl');
        $output .= "</$INDENT[0][0]>\n";
        shift @INDENT;
    }
    return $output unless $tag;
    if (!@INDENT || $indent > $INDENT[0][1]) {
        $output .= "<$tag>\n";
        unshift (@INDENT, [ $e, $indent ]);
    }
    $output . $data;
}

# Handle titles, which should have newlines turned into spaces and leading and
# trailing whitespace zapped.  Returns the text wrapped in a <title> element;
# the text is not escaped here.
sub title {
    local $_ = shift;
    s/\s*\n\s*/ /g;
    s/^\s+//;
    s/\s+$//;
    '<title>' . $_ . '</title>';
}

# Various containers.  Block-level elements whose tags go on their own lines
# are built with paragraph; elements whose tags stay on the same line as the
# text are built with container.
sub blockquote {
    return paragraph ('blockquote', @_);
}

sub head {
    return paragraph ('head', @_);
}

sub p {
    return paragraph ('p', @_);
}

sub dt {
    return container ('dt', @_);
}

sub h1 {
    return container ('h1', @_);
}

sub h2 {
    return container ('h2', @_);
}

sub h3 {
    return container ('h3', @_);
}

sub pre {
    return container ('pre', @_);
}

##############################################################################
# Header parsing
##############################################################################

# Parse the headers of a FAQ with news/mail headers.  On entry $_ holds the
# first line of the headers; on exit it holds the first line after them (or
# undef at end of input).  Returns a list of the author (From header), the
# title (Subject header, overridden by an HTML-title subheader), and the
# original author (Original-author subheader); any of these may be undef.
sub handle_faq_headers {
    my ($author, $title, $original);

    # Skip a leading "From " line.
    if (defined ($_) && /^From /) {
        $_ = <$IN>;
    }

    # The main headers.
    while (defined ($_) && is_header ($_)) {
        my ($header, $content) = /^([\w-]+):\s+(.*)/;

        # Continuation lines are appended to the content as they are.
        $_ = <$IN>;
        while (defined ($_) && /^\s+\S/) {
            $content .= $_;
            $_ = <$IN>;
        }

        my $name = lc $header;
        if ($name eq 'from') {
            $author = $content;
        } elsif ($name eq 'subject') {
            $title = $content;
        }
    }

    # Skip blank lines (either initial ones or ones after headers).
    while (defined ($_) && /^\s*$/) {
        $_ = <$IN>;
    }

    # The FAQ subheaders, if any, read the same way.
    while (defined ($_) && is_header ($_)) {
        my ($header, $content) = /^([\w-]+):\s+(.*)/;

        $_ = <$IN>;
        while (defined ($_) && /^\s+\S/) {
            $content .= $_;
            $_ = <$IN>;
        }

        my $name = lc $header;
        if ($name eq 'html-title') {
            $title = $content;
        } elsif ($name eq 'original-author') {
            $original = $content;
        }
    }

    return ($author, $title, $original);
}

# Parse headers in my documentation format: possibly indented "Name: value"
# lines with no continuation lines.  On entry $_ holds the first candidate
# line; on exit it holds the first line that isn't a header (or undef at end
# of input).  Returns a list of the author, the subject, and the revision
# string (only if it is an expanded RCS/CVS Id); any of these may be undef.
sub handle_doc_headers {
    my ($author, $subject, $id);

    while (defined ($_) && /^\s*[\w-]+:\s/) {
        my ($header, $content) = /^\s*([\w-]+):\s+(.*)/;
        $_ = <$IN>;

        my $name = lc $header;
        if ($name eq 'author') {
            $author = $content;
        } elsif ($name eq 'subject') {
            $subject = $content;
        } elsif ($name eq 'revision' && is_id ($content)) {
            $id = $content;
        }
    }

    return ($author, $subject, $id);
}

##############################################################################
# Main routine
##############################################################################

# Keep the full path for perldoc, and shorten $0 to the program name for use
# in error messages.
my $fullpath = $0;
$0 =~ s%.*/%%;

# Parse command-line options, if any.  Single-letter options may be bundled.
my ($help, $lastmod, $numbered, $style, $realtitle, $version);
Getopt::Long::config ('bundling');
my %option_for = (
    'h|help'          => \$help,
    'l|last-modified' => \$lastmod,
    's|style=s'       => \$style,
    't|title=s'       => \$realtitle,
    'u|use-value'     => \$USEVALUE,
    'v|version'       => \$version,
);
GetOptions (%option_for) or exit 1;

# -h: replace this process with perldoc showing our own documentation.
if ($help) {
    print "Feeding myself to perldoc, please wait....\n";
    exec ('perldoc', '-t', $fullpath) or die "$0: can't fork: $!\n";
}

# -v: print the name, revision, and date taken from the RCS Id, then exit.
if ($version) {
    my $banner = join (' ', (split (' ', $ID))[1 .. 3]);
    for ($banner) {
        s/,v\b//;
        s/(\S+)$/($1)/;
        tr%/%-%;
    }
    print "$banner\n";
    exit;
}

# Figure out what file we're going to be processing.  We can function as a
# filter if so desired: a missing input or "-" means standard input, and a
# missing output or "-" means standard output.
#
# The three-argument form of open is used so that file names starting or
# ending with characters such as ">", "|", or whitespace are opened as
# ordinary files.  This form needs Perl 5.6 or later.
my ($input, $output) = @ARGV;
if (defined $input && $input ne '-') {
    open (IN, '<', $input) or die "$0: can't open $input: $!\n";
    $IN = \*IN;
} else {
    $IN = \*STDIN;
}
if (defined $output && $output ne '-') {
    open (OUT, '>', $output) or die "$0: can't write to $output: $!\n";
    $OUT = \*OUT;
} else {
    $OUT = \*STDOUT;
}

# Check for a leading RCS/CVS version identifier.  For FAQs that I'm posting
# to Usenet using postfaq, this will always be the first line of the file
# stored on disk.
#
# From here on, $_ always holds the next unprocessed line of input (undef at
# end of input); the header parsers and the code below all rely on that.
my $id;
$_ = <$IN>;
if (is_id $_) {
    chomp ($id = $_);

    # Skip the Id line and any blank lines following it.
    do { $_ = <$IN> } while (defined && /^\s*$/);
}

# Check for the type of document.  First we see if it looks like a FAQ with
# news/mail headers, and if so we read those headers and the subheaders.
# Otherwise, we see if it looks like one of my documentation files and try to
# grab information from it if so.
#
# Header parsing is skipped entirely when a title was given with -t.
my ($author, $title, $original);
if (!$realtitle) {
    if (is_header ($_) || /^From /) {
        ($author, $title, $original) = handle_faq_headers;
    } else {
        my $newid;
        ($author, $title, $newid) = handle_doc_headers;

        # A Revision header holding an Id replaces any Id found on line one.
        $id = $newid if defined $newid;
    }
}

# Skip over whitespace after headers, and also skip over rules.
$_ = <$IN> while (defined && (/^\s*$/ || is_rule $_));

# See if we have a centered title at the top of the document.  If so, we'll
# make that the document title unless we also saw a Subject header.  Titles
# shouldn't be in all caps, though.
my $heading;
if (is_centered ($_)) {
    $heading = whitechomp $_;
    if (!$title) {
        $title = $heading;

        # Turn each all-caps word into a capitalized lowercase word.
        $title =~ s/\b([A-Z]+)\b/\L\u$1/g if (is_allcaps $title);
    }
    do { $_ = <$IN> } while (defined && (/^\s*$/ || is_rule $_));
}

# A title given on the command line wins over anything found in the file;
# the page heading falls back to the title when no centered heading was seen.
$title = $realtitle if $realtitle;
$heading ||= $title;
$heading = urlize $heading;

# Generate the heading of the HTML file.  If we haven't been able to find a
# title, fall back on the output file name and then on a fixed string.  We
# claim "transitional" XHTML 1.0 compliance; we can't claim strict solely
# because we use the value attribute in <li> in the absence of widespread
# implementation of CSS Level 2.
($version) = (split (' ', $ID))[2];
output dtd;
output "\n";
output html, "\n";
output head ("  ", title ($title || $output || 'faq2html output'),
             $style ? ("\n  ", style ($style)) : '', "\n  ",
             charset, "\n"), "\n";
output comment ($id), "\n" if $id;
output comment ("Converted to XHTML by faq2html version $version"), "\n\n";

# Open the body of the document, and print out the heading if we found one.
output "<body>\n\n";
output h1 ($heading), "\n" if $heading;

# If we have additional headers, print them out.  Otherwise, if we have author
# information from a From header, print that out under the main heading.
#
# If we have RCS/CVS Id information, add another subheading containing the
# last modified date.  Alternately, if the -l option was given, get the last
# modified date from the source file.  Existing subheadings that look like
# they're just Revision or Date strings are replaced by our more nicely
# formatted string.
#
# We go to some length here to avoid unnecessary <br /> tags.
#
# Note that </p> has to be on the end of the last line rather than the
# beginning of the next to work around a bug in lynx.
#
# NOTE(review): the attribute on the opening <p> tag was lost from the source;
# class="subhead" is a reconstruction -- confirm against the style sheet.
if ($heading) {
    my ($subheading, $modified);
    if ($id) {
        $modified = modified_id ($id);
    } elsif ($lastmod && defined $input && $input ne '-') {
        my $timestamp = (stat $input)[9];
        if ($timestamp) {
            $modified = modified_timestamp ($timestamp);
        }
    }

    # $subheading counts the lines output in the current subheading
    # paragraph; it is reset to 0 when a blank line ends a paragraph and
    # stays undef if no subheading was ever seen.
    while (defined && (/^\s*$/ || is_centered ($_) || $subheading)) {
        if (/^\s*$/) {
            do { $_ = <$IN> } while (defined && is_rule $_);
            if (defined && is_centered ($_)) {
                output "</p>\n\n" if $subheading;
                $subheading = 0;
                next;
            } else {
                last;
            }
        } else {
            output qq(<p class="subhead">\n) unless $subheading;
            output "<br />\n" if $subheading;
            $subheading++;
            if ($modified && (/\$Revision/ || /\$Date/)) {
                output '  ', $modified;
                undef $modified;
            } else {
                $_ = urlize (escape (whitechomp ($_)));
                output '  ', $_;
            }
        }
        do { $_ = <$IN> } while (defined && is_rule $_);
    }
    if (!defined $subheading && $author) {
        $subheading++;
        output qq(<p class="subhead">\n);
        output '  ', escape ($author);
        output "<br />\n  (originally by ", escape ($original), ')'
            if $original;
    }
    if ($modified) {
        output qq(<p class="subhead">\n) unless $subheading;
        output "<br />\n" if $subheading;
        output '  ', $modified;
        $subheading++;
    }
    output "</p>\n\n" if $subheading;
}

# Scan the actual body of the text.  We don't use paragraph mode, since it
# doesn't work with blank lines that contain whitespace; instead, we cobble
# together our own paragraph mode that does.  Note that $_ already has a
# non-blank line of input coming into this loop.
#
# $space holds the trailing whitespace stripped from the current paragraph.
# The continue block copies it to $WS so that output can emit it in front of
# the next paragraph, after any closing tags.
output "\n" if $heading;
$BUFFER = $_;
my $space;
while (defined $BUFFER) {
    $_ = slurp;

    # Ignore any text after a signature block.
    last if (is_signature $_);

    # If we just hit a digest divider, the next thing will likely be a
    # Subject: line that we want to turn into a section header.  Digest
    # section titles are always level 2 headers currently.
    if (is_divider $_) {
        $STATE{pre} = 0;
        output start (-1);
        undef $INDENT;
        ($WS) = /\n(\s*)$/;
        $_ = slurp;
        s/\n(\s*)$/\n/;
        $space = $1;
        if (s/^Subject:\s+//) {
            $STATE{contents} = /\bcontents\b/i;
            $_ = escape $_;
            if (/^([\d.]+)[.\)]\s/) {
                output h2 (container (qq(a name="S$1" id="S$1"), $_));
            } else {
                output h2 ($_);
            }
            next;
        }
    }

    # Treat lines of dash-type characters as rules.
    if (is_rule $_) {
        $STATE{pre} = 0;
        ($space) = /\n(\s*)$/;
        output start (-1), "\n";
        undef $INDENT;
        next
    }

    # Everything else needs to have special characters escaped.  We don't do
    # this earlier because if we want to allow < and > in rules, the escaping
    # would make our lives miserable.
    $_ = escape $_;

    # Do this before untabification and stashing of trailing whitespace, but
    # after escaping.  Check to see if this paragraph looks like literal text.
    # If so, we wrap it in <pre> and output it as is.  As a special exception
    # to our normal paragraph handling, this paragraph doesn't end until we
    # find a literal blank line; this hack lets full diffs be included in a
    # FAQ without confusing the parser.
    if (is_literal $_) {
        if (/\n[ \t]+$/) { $_ .= slurp (1) }
        output pre (strip_indent ($_, $INDENT));
        s/\n(\n\s*)$/\n/;
        $space = $1;
        $STATE{pre} = 1;
        next;
    }

    # Not literal text, so untabify it and stash whitespace.
    $_ = untabify $_;
    s/\n(\s*)$/\n/;
    $space = $1;
    my $indent = indent $_;

    # If the paragraph has inconsistent indentation, or is indented relative
    # to the baseline *and* the last paragraph we emitted was enclosed in
    # <pre>, assume that this paragraph belongs in <pre> as well.
    if ($STATE{pre}) {
        if (is_offset ($_) || (defined $INDENT && $indent > $INDENT)) {
            output pre (strip_indent ($_, $INDENT));
            next;
        } else {
            $STATE{pre} = 0;
        }
    }

    # Check for a heading.  We distinguish between level 2 headings and level
    # 3 headings as follows: The first heading we encounter is assumed to be a
    # level 2 heading, and any further headers at that same indentation level
    # are also level 2 headings.  If we detect any other headings at a greater
    # indent, they're marked as level 3.
    if (is_heading ($_)) {
        s/^\s+//;
        $STATE{contents} = /\bcontents\b/i;
        my $h;
        if (defined $STATE{h2}) {
            if ($indent <= $STATE{h2}) { $h = \&h2 }
            else                       { $h = \&h3 }
        } else {
            $STATE{h2} = $indent;
            $h = \&h2;
        }
        if (/^([\d.]+)[.\)]\s/) {
            my $anchor = qq(a name="S$1" id="S$1");
            output start, &$h (container ($anchor, derule ($_)));
        } else {
            output start, &$h (derule ($_));
        }
        $INDENT = $STATE{baseline};
        next;
    }

    # A sudden change to an indentation of 0 when that's less than our
    # indentation baseline is also a sign of literal text.
    if ($INDENT && $indent == 0 && $INDENT > 0 && defined ($STATE{baseline})
        && $STATE{baseline} > 0) {
        output pre (strip_indent ($_, $INDENT));
        $STATE{pre} = 1;
        next;
    }

    # We're dealing with a normal paragraph of some sort, so go ahead and turn
    # URLs into links.  Check whether the paragraph is broken first, though,
    # and stash that information, since turning URLs into links can
    # artificially lengthen lines.
    my $broken = is_broken $_;
    $_ = urlize $_;

    # Check to see if we're in a contents section, and if so if this paragraph
    # looks like a table of contents.  If so, turn all of the section headings
    # into links and assume broken text.
    if ($STATE{contents} && is_contents $_) { $_ = contents $_ }

    # Check for paragraphs that are entirely bulletted lines, and turn them
    # into unordered lists without <p> tags.
    if (is_allbullet $_) {
        my $last;
        my @lines = split (/\n/, $_);
        for (@lines) {
            next unless /\S/;
            if (is_bullet $_) {
                if (defined $last) {
                    output start ($INDENT, 'ul');
                    output li ($INDENT, embolden $last);
                }
                $last = debullet $_;
                $INDENT = indent $last;
            } else {
                $last .= "\n$_";
            }
        }
        if (defined $last) {
            output start ($INDENT, 'ul');
            output li ($INDENT, embolden $last);
        }
        next;
    }

    # Check for paragraphs that are entirely numbered lines, and turn them
    # into ordered lists without <p> tags.
    if (is_allnumbered $_) {
        my @lines = split (/\n/, $_);
        for (@lines) {
            next unless /\S/;
            my ($number) = /^(\d+)/;
            $_ = denumber $_;
            $INDENT = indent $_;
            output start ($INDENT, 'ol');
            output li ($INDENT, embolden ($_), $number);
        }
        next;
    }

    # Check for bulletted paragraphs and turn them into lists.
    if (is_bullet $_) {
        $_ = debullet $_;
        $INDENT = indent $_;
        output start ($INDENT, 'ul');
        output li ($INDENT, p (embolden $_));
        next;
    }

    # Check for paragraphs quoted with some character and turn them into
    # blockquotes provided they don't have inconsistent indentation.
    my $quote = is_quoted ($_);
    if ($quote && !$broken) {
        $_ = unquote ($_, $quote);
        $INDENT = indent $_;
        output start ($INDENT, 'blockquote', p (embolden $_));
        next;
    }

    # Check for numbered paragraphs and turn them into lists.  Broken text
    # and contents listings get an explicit line break before each new line.
    my $number = is_numbered ($_);
    if (defined $number) {
        my $contents = is_contents ($_);
        $_ = denumber $_;
        $INDENT = indent $_;
        s%(\n\s*\S)%<br />$1%g if ($broken || $contents);
        output start ($INDENT, 'ol');
        output li ($INDENT, p (embolden $_), $number);
        next;
    }

    # Check for things that look like description lists and handle them.  Note
    # that we don't allow indented description lists, because they're usually
    # something we actually want to make <pre>.  This is another fairly
    # fragile heuristic.
    if (is_description ($_) && defined $INDENT) {
        my (@title, $body);
        ($title[0], $body) = split ("\n", $_, 2);
        my ($space) = ($title[0] =~ /^(\s*)/);
        while ($body =~ /^$space\S/) {
            my $title;
            ($title, $body) = split ("\n", $body, 2);
            push (@title, $title);
        }
        if ($indent == $INDENT || indent ($body) == $INDENT) {
            @title = map { embolden ($_) } @title;
            my $title = join ("<br />\n", @title) . "\n";
            $INDENT = indent $body;
            $body =~ s%(\n\s*\S)%<br />$1%g if is_broken $body;
            output start ($indent, 'dl', dt ($title));
            output start ($INDENT, 'dd', (p (embolden $body)));
            next;
        }
    }

    # If the paragraph has inconsistent indentation, we should output it in
    # <pre>.
    if (is_offset $_) {
        output pre (strip_indent ($_, $INDENT));
        $STATE{pre} = 1;
        next;
    }

    # A sudden indentation change also means the paragraph should be
    # blockquoted.  We render broken blockquoted text in <pre>, which may not
    # be what's wanted for things like quotes of poetry... this is probably
    # worth looking at in more detail.
    if (defined $INDENT && $indent > $INDENT) {
        if ($broken || (lines ($_) == 1 && !is_sentence $_)) {
            output pre (strip_indent ($_, $INDENT));
            $STATE{pre} = 1;
        } else {
            $INDENT = $indent;
            output start ($INDENT, 'blockquote', p (embolden $_));
        }
        next;
    }

    # Close multiparagraph structure if we've outdented again.
    if ($INDENT && $indent < $INDENT) { output start ($indent) }

    # Looks like a normal paragraph.  Establish our indentation baseline if we
    # haven't already.
    if (!defined $STATE{baseline} && !$INDENT) {
        $STATE{baseline} = $indent;
    }
    $INDENT = $indent;
    s%(\n\s*\S)%<br />$1%g if $broken;
    output p (embolden $_);

} continue {
    $WS = $space;
}

# All done.  Close any block structure that is still open, then print out
# our closing tags for the body and the document.
output start (-1), "\n</body>\n</html>\n";
__END__


##############################################################################
# Documentation
##############################################################################

=head1 NAME

faq2html - Convert some particular text formats into XHTML

=head1 SYNOPSIS

B [B<-hluv>] [B<-s> I