#!/usr/bin/perl -w
$ID = q$Id: faq2html,v 1.36 2021/03/28 02:28:50 eagle Exp $;
#
# faq2html -- Convert some particular text formats into XHTML.
#
# Copyright 1999-2002, 2004-2005, 2008, 2010, 2013-2014, 2021
# Russ Allbery
#
# This program is free software; you may redistribute it and/or modify it
# under the same terms as Perl itself.
#
# This program is an ad hoc set of heuristics and tricks, attempting to
# convert a few text file formats that I commonly use into reasonable XHTML.
# It's my opinion that general text to XHTML conversions is impossible due to
# the huge number of differing formats used by different people when writing
# text; this doesn't try to solve the general problem. Rather, it's good
# enough to turn the FAQs I maintain into XHTML documents, which is all that I
# need of it.
##############################################################################
# Modules and declarations
##############################################################################
require 5.003;
use strict;
use vars qw($BUFFER $ID $IN $INDENT @INDENT @MONTHS $OUT %STATE $USEVALUE
$WS);
use Getopt::Long qw(GetOptions);
# Replace with the month names you want to use, if you don't want English.
@MONTHS = qw(January February March April May June July August September
October November December);
##############################################################################
# Utility functions
##############################################################################
# Turns section numbers at the beginning of lines in a paragraph into links.
sub contents {
local $_ = shift;
s%^(\s*([\d.]+)[.\)]\s+)(.*?)([ \t]*\n)%$1$3$4%gm;
$_;
}
# Removes an initial bullet on a paragraph, replacing it with spaces.
sub debullet { local $_ = shift; s/(\s*)[-*o](\s)/$1 $2/; $_ }
# Unescape &, <, and > characters.
sub deescape { local $_ = shift; s/>/>/g; s/</some text, while trying to be
# careful to avoid other uses of wildcards.
sub embolden {
local $_ = shift;
s%(^|\s)\*(\w.*?\S)\*([,.!?;\s])%$1$2$3%gs;
$_;
}
# Escapes &, <, and > characters found in a string.
sub escape { local $_ = shift; s/&/&/g; s/</g; s/>/>/g; $_ }
# Returns the length of the indentation of a line or paragraph.
sub indent { $_[0] =~ /^(\s*)/; length $1 }
# Returns the number of lines in a paragraph, not counting trailing blanks.
sub lines { local $_ = shift; s/\s+$/\n/; tr/\n// }
# Returns a nicely formatted "Last modified" string from an RCS/CVS Id.
sub modified_id {
my $id = shift;
my ($version, $date) = (split (' ', $id))[2,3];
my ($year, $month, $day) = split (m%[/-]%, $date);
$day =~ s/^0//;
my $revision = ($version =~ /\./) ? " (revision $version)" : '';
'Last modified '. $MONTHS[$month - 1] . ' ' . $day . ', ' . $year
. $revision;
}
# The same, but from a UNIX timestamp.
sub modified_timestamp {
my $timestamp = shift;
my ($year, $month, $day) = (localtime $timestamp)[5, 4, 3];
$year += 1900;
'Last modified ' . $MONTHS[$month] . ' ' . $day . ', ' . $year;
}
# Output some text to the $OUT file handle, adding any preserved whitespace
# from before first, but after any closing tags.
sub output {
local $_ = shift;
if ($WS) {
s%^(\s*(?:(?!body)[^>]+>\s*)*)%$1$WS%;
$WS = '';
}
print $OUT $_, @_;
}
# Read a paragraph in from $IN. If no argument is given or the argument is
# false, lines with nothing but whitespace are paragraph dividers; otherwise,
# only a completely blank line is a divider. Use $BUFFER to store the
# unwanted next line.
sub slurp {
my $ws = shift;
my $p;
local $_;
$p = $BUFFER || '';
$p .= $_ while (defined ($_ = <$IN>) && ($ws ? !/^$/ : /\S/));
$p .= $_ if defined;
$p .= $_ while (defined ($_ = <$IN>) && /^\s*$/);
$BUFFER = $_;
$p;
}
# Remove all whitespace in a string.
sub smash { local $_ = shift; s/\s//g; $_ }
# Strip a number of characters of indentation from a line that's given by the
# second argument, returning the result. Used to strip leading indentation
# off of
text so that it isn't indented excessively just because in the
# text version it had to be indented relative to the surrounding text.
sub strip_indent {
local $_ = shift;
my $indent = shift;
if (defined $indent && $indent > 0) {
s/^ {$indent}//gm;
}
$_;
}
# Undoes HTML character escapes.
sub unescape { local $_ = shift; s/&/&/g; s/<//g; $_ }
# Remove a constant prefix at the beginning of each line of a paragraph.
sub unquote {
my ($string, $quote) = @_;
$string =~ s/((?:^|\n)\s*)(\Q$quote\E\s+)/$1 . ' ' x length ($2)/ge;
$string;
}
# Replace tabs with spaces.
sub untabify {
local $_ = shift;
1 while s/^(.*?)(\t+)/' ' x (length ($2) * 8 - length ($1) % 8)/me;
$_;
}
# Given a special-character-escaped URL, wrap to that URL around
# it. Remove a leading mailto: in the link text.
sub url {
my $link = shift;
my $text = $link;
$link = smash (unescape $link);
$text =~ s/^(?:mailto|news)://;
'<' . $text . '>';
}
# Looks for a URL in form, with or without the URL: part, and wraps
# a link around it.
sub urlize {
my $text = shift;
$text =~ s%<(?:URL:)?([a-z]{2,}:.+?)>%url ($1)%ge;
$text;
}
# Remove whitespace at the beginning and end of a string.
sub whitechomp { local $_ = shift; s/^\s+//; s/\s+$//; $_ }
##############################################################################
# Identification functions
##############################################################################
# Expects a paragraph, returns whether it is composed entirely of bullet
# items. Take some care to avoid returning true for paragraphs that consist
# of a single bullet entry, since we want to handle those separately to wrap
# them in paragraph tags.
sub is_allbullet {
local $_ = shift;
my @lines = split ("\n", $_);
return if not $lines[0] =~ /^(\s*[-*o]\s)\S/;
my $bullet = $1;
my $space = $bullet;
$space =~ s/[-*o]/ /;
my $bullets = 0;
for (@lines) {
next if !/\S/;
return if !/^(?:\Q$bullet\E|\Q$space\E)\S/;
$bullets++ if /^\Q$bullet\E/;
}
return $bullets > 1;
}
# Expects a paragraph, returns whether every line is a numbered item with a
# simple number.
sub is_allnumbered { $_[0] =~ /^(\s*\d\d?[.\)]\s.*\n){2,}\s*$/ }
# Expects a paragraph, returns whether it's in all capital letters.
sub is_allcaps { $_[0] !~ m%[^A-Z0-9\s\"\(\),:.!/?-]% }
# Expects a paragraph, returns whether it looks like it's broken into a series
# of short lines or a series of lines without internal space. The last line
# of the paragraph doesn't matter for this determination.
sub is_broken {
local $_ = shift;
s/\s+$/\n/;
my @lines = split ("\n", $_);
return if @lines == 1;
pop @lines;
return 1 if grep { length ($_) < 40 } @lines;
my $short = grep { length ($_) < 60 } @lines;
($short >= int (@lines / 2) + 1) || /^(?:\s*\S+[ \t]*\n)+$/;
}
# Expects a paragraph, returns whether it's a bulletted item.
sub is_bullet { $_[0] =~ /^\s*[-*o]\s/ }
# Expects a line, returns whether it's centered (in 74 columns). Also require
# at least 10 spaces of whitespace so that we don't catch accidentally
# centered paragraph lines by mistake.
sub is_centered {
$_[0] =~ /^(\s+)(.+)/
&& abs (74 - length ($2) - length ($1) * 2) < 2
&& length (untabify $1) >= 8;
}
# Expects a paragraph, returns whether it looks like a content listing.
sub is_contents { $_[0] =~ /^(?:\s*[\d.]+[.\)][ \t].*\n)+\s*$/ }
# Expects a paragraph, returns whether it looks like a title and description.
# Allow for multiple titles.
sub is_description {
$_[0] =~ /^(\s*)\S.*\n(?:\1\S.*\n)*(\s+)\S.*\n(?:\2\S.*\n)*\s*$/
&& length ($1) < length ($2);
}
# Expects a paragraph, returns whether it's a digest divider.
sub is_divider { $_[0] =~ /^-{30}\s*$/ }
# Expects a line, returns whether it's a mail/news header.
sub is_header { $_[0] =~ /^[\w-]+:\s/ }
# Expects a paragraph, returns whether it's a heading. This is all about
# heuristics and guesses, and there are a number of other things we could
# confuse for headings, so we have to be careful. If it's a single line and
# outdented from the baseline, it's probably a heading. If it's at the
# baseline, check to see if it looks like a heading and either it's in all
# caps or there is a rule underneath it. If we haven't seen a baseline, be
# more accepting about headers. If we're inside a contents block, be even
# more careful and disallow numbered things that look like a heading unless
# they're outdented.
sub is_heading {
local $_ = deescape (shift);
my $indent = indent $_;
my $nobase = !defined $STATE{baseline};
my $outdented = defined ($STATE{baseline}) && $indent < $STATE{baseline};
return if (!$outdented && $STATE{contents} && /^[\d.]+[.\)]\s/);
my $even = !defined ($INDENT) || $indent <= $INDENT;
($outdented && lines ($_) == 1 && (/\S\s\S/ || length ($_) < 30))
|| ($even && m%^\s*[ \w\"\(\),:./&-]{0,30}[\w\"\)]\s*\n[-=~]+\s*$%)
|| ($even && m%^\s*[ A-Z0-9\"\(\),:./&-]{0,30}[A-Z0-9\"\)]\s*\n$%)
|| ($even && $nobase && m%^\s*[ \w\"\(\),:./&-]{0,33}[\w\"\)]\s*\n$%);
}
# Expects a line, returns whether it's an RCS/CVS Id string that has been
# correctly expanded.
sub is_id { $_[0] =~ /^\s*\$Id\: .*\$\s*$/ }
# Expects a paragraph, returns whether it appears to have internal whitespace.
sub is_literal { $_[0] =~ /^[ \t]*\S.*(?:[^.?!\"\)\]:*_\n] | |\t)\S/m }
# Expects a paragraph, returns undef if it doesn't look like a numbered
# paragraph or the number if it does.
sub is_numbered { ($_[0] =~ /^\s*(\d\d?)[.\)]\s/) ? $1 : undef }
# Expects a paragraph, returns true if the paragraph has inconsistent
# indentation.
sub is_offset {
local $_ = shift;
# Strip off a leading bullet or number and consider it whitespace in
# making this check.
s/^(\s*(?:\d\d?)[.\)]\s)/' ' x length ($1)/e;
s/^(\s*[-*o]\s)/' ' x length ($1)/e;
# Now, return true if the indentation isn't consistent.
!/^(\s*)\S.*\n(\1\S.*\n)*\s*$/
}
# Expects a paragraph, returns undef if not quoted or the quote character if
# it is quoted. Requires that the paragraph be at least two lines.
sub is_quoted { $_[0] =~ /^\s*([^\w\s\"\'])\s*.*\n(\s*\1\s*.*\n)+$/ && $1 }
# Expects a paragraph, returns whether it's a rule.
sub is_rule { $_[0] =~ /^\s*[-=][-=\s]*$/ }
# Expects a paragraph, returns whether it ends with a sentence. As a special
# case, a URL counts as a sentence so that we don't wrap
around URLs.
sub is_url;
sub is_sentence {
local $_ = shift;
return 1 if /\S[.?!][\)\]\"]?\s*$/;
return 1 if /^\s*\w.*\s\S+:\s*$/;
return 1 if is_url $_;
0;
}
# Expects a paragraph, returns whether it's the start of a signature block,
# defined to be a paragraph whose first line is exactly "-- ".
sub is_signature { $_[0] =~ /^-- \n/ }
# Expects a paragraph, returns whether it's a simple intented URL (already
# converted into a real link.
sub is_url { $_[0] =~ m%^\s*<\S+>\s*$% }
##############################################################################
# HTML constructors
##############################################################################
# Output the DTD for XHTML. We claim "transitional" XHTML 1.0 compliance; we
# can't claim strict solely because we use the value attribute in
in the
# absence of widespread implementation of CSS Level 2.
sub dtd {
if ($USEVALUE) {
qq(\n)
. qq()
. "\n";
} else {
qq(\n)
. qq()
. "\n";
}
}
# An XML comment.
sub comment {
my @data = @_;
my $data = join ('', @data);
'';
}
# The character set for the page; we assume UTF-8 for all pages.
sub charset {
qq();
}
# A link to a CSS style sheet.
sub style {
my $style = shift;
qq();
}
# The initial tag, which is a bit complicated for XHTML. Assume
# English output.
sub html {
qq();
}
# Wrap a container around data, keeping the tags on the same line.
sub container {
my ($tag, @data) = @_;
my $data = join ('', @data);
$data = '<' . $tag . '>' . $data;
$tag =~ s/ .*//;
$data =~ s%(\s*)$%$tag>$1%;
$data;
}
# Output a list item. Takes the indentation, the item, and an optional third
# argument, which if specified is the number to use for the item (using the
# value attribute, which for some reason is deprecated under HTML 4.0 without
# any viable alternative for what I use it for).
sub li {
my ($indent, $data, $value) = @_;
$indent = 0 unless defined $indent;
my $output = '';
if (@INDENT && $INDENT[0][0] eq 'li') {
$output .= "
\n";
$output . $tag . $data;
}
# Wrap a container around data, preserving trailing blank lines outside and
# putting the tags on lines of their own.
sub paragraph {
my ($tag, @data) = @_;
my $data = join ('', @data);
$data .= "\n" unless ($data =~ /\n$/);
'<' . $tag . ">\n" . $data . '' . $tag . ">\n";
}
# Multiparagraph structure is maintained based on indentation level. The
# global variable @INDENT holds a stack of pairs of block elements and their
# corresponding indentation levels. The possible structure elements are dl,
# dd, ul, ol, li, and blockquote.
#
# This function is used to start or end block structure elements. It closes
# any pending open structure elements with an indent level greater than the
# indentation level given, and then closes any open structure elements with an
# indentation level equal to the one given if a new structure element is given
# and it is different than the open one. Then, if a structure element is
# given, open a new block structure element with that indentation.
#
# One can pass attributes in for the opening tag; anything after a space will
# be stripped out for determining the close tag.
sub start {
my ($indent, $tag, $data) = @_;
$indent = 0 unless defined $indent;
my $e = $tag || '';
$e =~ s/ .*//;
$data = '' unless $data;
my $output = '';
while (@INDENT) {
last if ($INDENT[0][1] < $indent);
last if ($tag && $INDENT[0][1] == $indent && $INDENT[0][0] eq $tag);
last if ($INDENT[0][1] == $indent && !$tag && $INDENT[0][0] ne 'dl');
$output .= "$INDENT[0][0]>\n";
shift @INDENT;
}
return $output unless $tag;
if (!@INDENT || $indent > $INDENT[0][1]) {
$output .= "<$tag>\n";
unshift (@INDENT, [ $tag, $indent ]);
}
$output . $data;
}
# Handle titles, which should have newlines turned into spaces and leading and
# trailing whitespace zapped.
sub title {
local $_ = shift;
s/\s*\n\s*/ /g;
s/^\s+//;
s/\s+$//;
'' . $_ . '';
}
# Various containers.
sub blockquote { paragraph ('blockquote', @_) }
sub dt { container ('dt', @_) }
sub h1 { container ('h1', @_) }
sub h2 { container ('h2', @_) }
sub h3 { container ('h3', @_) }
sub head { paragraph ('head', @_) }
sub p { paragraph ('p', @_) }
sub pre { container ('pre', @_) }
##############################################################################
# Header parsing
##############################################################################
# Check to see if the header looks like that of a FAQ. If it doesn't, return
# undefs; otherwise, return a list consisting of the author, the title, and
# the original author if any was given.
sub handle_faq_headers {
my ($author, $title);
if (defined && /^From /) { $_ = <$IN> }
while (defined && is_header $_) {
my ($header, $content) = /^([\w-]+):\s+(.*)/;
# Deal with continuation lines.
$_ = <$IN>;
while (defined && /^\s+\S/) { $content .= $_; $_ = <$IN> }
# Save information we care about.
if (lc $header eq 'from') { $author = $content }
elsif (lc $header eq 'subject') { $title = $content }
}
# Skip blank lines (either initial ones or ones after headers.
$_ = <$IN> while (defined && /^\s*$/);
# Parse the FAQ subheaders, if any. If we see any, we use the HTML-title
# and the Original-author headers.
my $original;
while (defined && is_header $_) {
my ($header, $content) = /^([\w-]+):\s+(.*)/;
# Deal with continuation lines.
$_ = <$IN>;
while (defined && /^\s+\S/) { $content .= $_; $_ = <$IN> }
# Save information we care about.
if (lc $header eq 'html-title') { $title = $content }
elsif (lc $header eq 'original-author') { $original = $content }
}
# Return the information we found.
return ($author, $title, $original);
}
# Check to see if the header looks like my documentation format. If it
# doesn't, return undefs. Otherwise, return a list consisting of the author,
# the title, and the CVS revision string.
sub handle_doc_headers {
my ($author, $subject, $id);
while (defined && /^\s*[\w-]+:\s/) {
my ($header, $content) = /^\s*([\w-]+):\s+(.*)/;
$_ = <$IN>;
# Save information we care about.
if (lc $header eq 'author') { $author = $content }
elsif (lc $header eq 'subject') { $subject = $content }
elsif (lc $header eq 'revision') { $id = $content if is_id $content }
}
# Return the information we found.
return ($author, $subject, $id);
}
##############################################################################
# Main routine
##############################################################################
# Trim extraneous garbage from the path.
my $fullpath = $0;
$0 =~ s%.*/%%;
# Parse command-line options, if any.
my ($help, $lastmod, $numbered, $style, $realtitle, $version);
Getopt::Long::config ('bundling');
GetOptions ('h|help' => \$help,
'l|last-modified' => \$lastmod,
's|style=s' => \$style,
't|title=s' => \$realtitle,
'u|use-value' => \$USEVALUE,
'v|version' => \$version) or exit 1;
# If they asked for help, give them the documentation.
if ($help) {
print "Feeding myself to perldoc, please wait....\n";
exec ('perldoc', '-t', $fullpath) or die "$0: can't fork: $!\n";
}
# If they asked for the version number, print it and exit.
if ($version) {
my $version = join (' ', (split (' ', $ID))[1..3]);
$version =~ s/,v\b//;
$version =~ s/(\S+)$/($1)/;
$version =~ tr%/%-%;
print $version, "\n";
exit;
}
# Figure out what file we're going to be processing. We can function as a
# filter if so desired.
my ($input, $output) = @ARGV;
if (defined $input && $input ne '-') {
open (IN, $input) or die "$0: can't open $input: $!\n";
$IN = \*IN;
} else {
$IN = \*STDIN;
}
if (defined $output) {
open (OUT, "> $output") or die "$0: can't write to $output: $!\n";
$OUT = \*OUT;
} else {
$OUT = \*STDOUT;
}
# Check for a leading RCS/CVS version identifier. For FAQs that I'm posting
# to Usenet using postfaq, this will always be the first line of the file
# stored on disk.
my $id;
$_ = <$IN>;
if (is_id $_) {
chomp ($id = $_);
do { $_ = <$IN> } while (defined && /^\s*$/);
}
# Check for the type of document. First we see if it looks like a FAQ with
# news/mail headers, and if so we read those headers and the subheaders.
# Otherwise, we see if it looks like one of my documentation files and try to
# grab information from it if so.
my ($author, $title, $original);
if (!$realtitle) {
if (is_header ($_) || /^From /) {
($author, $title, $original) = handle_faq_headers;
} else {
my $newid;
($author, $title, $newid) = handle_doc_headers;
$id = $newid if defined $newid;
}
}
# Skip over whitespace after headers, and also skip over rules.
$_ = <$IN> while (defined && (/^\s*$/ || is_rule $_));
# See if we have a centered title at the top of the document. If so, we'll
# make that the document title unless we also saw a Subject header. Titles
# shouldn't be in all caps, though.
my $heading;
if (is_centered ($_)) {
$heading = whitechomp $_;
if (!$title) {
$title = $heading;
$title =~ s/\b([A-Z]+)\b/\L\u$1/g if (is_allcaps $title);
}
do { $_ = <$IN> } while (defined && (/^\s*$/ || is_rule $_));
}
$title = $realtitle if $realtitle;
$heading ||= $title;
$heading = urlize $heading;
# Generate the heading of the HTML file, using the filename as the title if we
# haven't been able to find a title. We claim "transitional" XHTML 1.0
# compliance; we can't claim strict solely because we use the value attribute
# in
in the absence of widespread implementation of CSS Level 2.
($version) = (split (' ', $ID))[2];
output dtd;
output "\n";
output html, "\n";
output head (" ", title ($title || $output || 'faq2html output'),
$style ? ("\n ", style ($style)) : '', "\n ",
charset, "\n"), "\n";
output comment ($id), "\n" if $id;
output comment ("Converted to XHTML by faq2html version $version"), "\n\n";
# Open the body of the document, and print out the heading if we found one.
output "\n\n";
output h1 ($heading), "\n" if $heading;
# If we have additional headers, print them out. Otherwise, if we have author
# information from a From header, print that out under the main heading.
#
# If we have RCS/CVS Id information, add another subheading containing the
# last modified date. Alternately, if the -l option was given, get the last
# modified date from the source file. Existing subheadings that look like
# they're just Revision or Date strings are replaced by our more nicely
# formatted string.
#
# We go to some length here to avoid unnecessary tags.
#
# Note that has to be on the end of the last line rather than the
# beginning of the next to work around a bug in lynx.
if ($heading) {
my ($subheading, $modified);
if ($id) {
$modified = modified_id ($id);
} elsif ($lastmod && $input ne '-') {
my $timestamp = (stat $input)[9];
if ($timestamp) {
$modified = modified_timestamp ($timestamp);
}
}
while (defined && (/^\s*$/ || is_centered ($_) || $subheading)) {
if (/^\s*$/) {
do { $_ = <$IN> } while (defined && is_rule $_);
if (defined && is_centered ($_)) {
output "\n
\n" if $subheading;
}
# Scan the actual body of the text. We don't use paragraph mode, since it
# doesn't work with blank lines that contain whitespace; instead, we cobble
# together our own paragraph mode that does. Note that $_ already has a
# non-blank line of input coming into this loop.
output "\n" if $heading;
$BUFFER = $_;
my $space;
while (defined $BUFFER) {
$_ = slurp;
# Ignore any text after a signature block.
last if (is_signature $_);
# If we just hit a digest divider, the next thing will likely be a
# Subject: line that we want to turn into a section header. Digest
# section titles are always level 2 headers currently.
if (is_divider $_) {
$STATE{pre} = 0;
output start (-1);
undef $INDENT;
($WS) = /\n(\s*)$/;
$_ = slurp;
s/\n(\s*)$/\n/;
$space = $1;
if (s/^Subject:\s+//) {
$STATE{contents} = /\bcontents\b/i;
$_ = escape $_;
if (/^([\d.]+)[.\)]\s/) {
output h2 (container (qq(a name="S$1" id="S$1"), $_));
} else {
output h2 ($_);
}
next;
}
}
# Treat lines of dash-type characters as rules.
if (is_rule $_) {
$STATE{pre} = 0;
($space) = /\n(\s*)$/;
output start (-1), "\n";
undef $INDENT;
next
}
# Everything else needs to have special characters escaped. We don't do
# this earlier because if we want to allow < and > in rules, the escaping
# would make our lives miserable.
$_ = escape $_;
# Do this before untabification and stashing of trailing whitespace, but
# after escaping. Check to see if this paragraph looks like literal text.
# If so, we wrap it in
and output it as is. As a special exception
# to our normal paragraph handling, this paragraph doesn't end until we
# find a literal blank line; this hack lets full diffs be included in a
# FAQ without confusing the parser.
if (is_literal $_) {
if (/\n[ \t]+$/) { $_ .= slurp (1) }
output pre (strip_indent ($_, $INDENT));
s/\n(\n\s*)$/\n/;
$space = $1;
$STATE{pre} = 1;
next;
}
# Not literal text, so untabify it and stash whitespace.
$_ = untabify $_;
s/\n(\s*)$/\n/;
$space = $1;
my $indent = indent $_;
# If the paragraph has inconsistent indentation, or is indented relative
# to the baseline *and* the last paragraph we emitted was enclosed in
#
, assume that this paragraph belongs in
as well.
if ($STATE{pre}) {
if (is_offset ($_) || (defined $INDENT && $indent > $INDENT)) {
output pre (strip_indent ($_, $INDENT));
next;
} else {
$STATE{pre} = 0;
}
}
# Check for a heading. We distinguish between level 2 headings and level
# 3 headings as follows: The first heading we encounter is assumed to be a
# level 2 heading, and any further headers at that same indentation level
# are also level 2 headings. If we detect any other headings at a greater
# indent, they're marked as level 3.
if (is_heading ($_)) {
s/^\s+//;
$STATE{contents} = /\bcontents\b/i;
my $h;
if (defined $STATE{h2}) {
if ($indent <= $STATE{h2}) { $h = \&h2 }
else { $h = \&h3 }
} else {
$STATE{h2} = $indent;
$h = \&h2;
}
if (/^([\d.]+)[.\)]\s/) {
my $anchor = qq(a name="S$1" id="S$1");
output start, &$h (container ($anchor, derule ($_)));
} else {
output start, &$h (derule ($_));
}
$INDENT = $STATE{baseline};
next;
}
# A sudden change to an indentation of 0 when that's less than our
# indentation baseline is also a sign of literal text.
if ($INDENT && $indent == 0 && $INDENT > 0 && defined ($STATE{baseline})
&& $STATE{baseline} > 0) {
output pre (strip_indent ($_, $INDENT));
$STATE{pre} = 1;
next;
}
# We're dealing with a normal paragraph of some sort, so go ahead and turn
# URLs into links. Check whether the paragraph is broken first, though,
# and stash that information, since turning URLs into links can
# artificially lengthen lines.
my $broken = is_broken $_;
$_ = urlize $_;
# Check to see if we're in a contents section, and if so if this paragraph
# looks like a table of contents. If so, turn all of the section headings
# into links and assume broken text.
if ($STATE{contents} && is_contents $_) { $_ = contents $_ }
# Check for paragraphs that are entirely bulletted lines, and turn them
# into unordered lists without
tags.
if (is_allbullet $_) {
my $last;
my @lines = split (/\n/, $_);
for (@lines) {
next unless /\S/;
if (is_bullet $_) {
if (defined $last) {
output start ($INDENT, 'ul');
output li ($INDENT, embolden $last);
}
$last = debullet $_;
$INDENT = indent $last;
} else {
$last .= "\n$_";
}
}
if (defined $last) {
output start ($INDENT, 'ul');
output li ($INDENT, embolden $last);
}
next;
}
# Check for paragraphs that are entirely numbered lines, and turn them
# into ordered lists without
tags.
if (is_allnumbered $_) {
my @lines = split (/\n/, $_);
for (@lines) {
next unless /\S/;
my ($number) = /^(\d+)/;
$_ = denumber $_;
$INDENT = indent $_;
output start ($INDENT, 'ol');
output li ($INDENT, embolden ($_), $number);
}
next;
}
# Check for bulletted paragraphs and turn them into lists.
if (is_bullet $_) {
$_ = debullet $_;
$INDENT = indent $_;
output start ($INDENT, 'ul');
output li ($INDENT, p (embolden $_));
next;
}
# Check for paragraphs quoted with some character and turn them into
# blockquotes provided they don't have inconsisted indentation.
my $quote = is_quoted ($_);
if ($quote && !$broken) {
$_ = unquote ($_, $quote);
$INDENT = indent $_;
output start ($INDENT, 'blockquote', p (embolden $_));
next;
}
# Check for numbered paragraphs and turn them into lists.
my $number = is_numbered ($_);
if (defined $number) {
my $contents = is_contents ($_);
$_ = denumber $_;
$INDENT = indent $_;
s%(\n\s*\S)% $1%g if ($broken || $contents);
output start ($INDENT, 'ol');
output li ($INDENT, p (embolden $_), $number);
next;
}
# Check for things that look like description lists and handle them. Note
# that we don't allow indented description lists, because they're usually
# something we actually want to make
. This is another fairly
# fragile heuristic.
if (is_description ($_) && defined $INDENT) {
my (@title, $body);
($title[0], $body) = split ("\n", $_, 2);
my ($space) = ($title[0] =~ /^(\s*)/);
while ($body =~ /^$space\S/) {
my $title;
($title, $body) = split ("\n", $body, 2);
push (@title, $title);
}
if ($indent == $INDENT || indent ($body) == $INDENT) {
@title = map { embolden ($_) } @title;
my $title = join (" \n", @title) . "\n";
$INDENT = indent $body;
$body =~ s%(\n\s*\S)% $1%g if is_broken $body;
output start ($indent, 'dl', dt ($title));
output start ($INDENT, 'dd', (p (embolden $body)));
next;
}
}
# If the paragraph has inconsistent indentation, we should output it in
#
.
if (is_offset $_) {
output pre (strip_indent ($_, $INDENT));
$STATE{pre} = 1;
next;
}
# A sudden indentation change also means the paragraph should be
# blockquoted. We render broken blockquoted text in
, which may not
# be what's wanted for things like quotes of poetry... this is probably
# worth looking at in more detail.
if (defined $INDENT && $indent > $INDENT) {
if ($broken || (lines ($_) == 1 && !is_sentence $_)) {
output pre (strip_indent ($_, $INDENT));
$STATE{pre} = 1;
} else {
$INDENT = $indent;
output start ($INDENT, 'blockquote', p (embolden $_));
}
next;
}
# Close multiparagraph structure if we've outdented again.
if ($INDENT && $indent < $INDENT) { output start ($indent) }
# Looks like a normal paragraph. Establish our indentation baseline if we
# haven't already.
if (!defined $STATE{baseline} && !$INDENT) {
$STATE{baseline} = $indent;
}
$INDENT = $indent;
s%(\n\s*\S)% $1%g if $broken;
output p (embolden $_);
} continue {
$WS = $space;
}
# All done. Print out our closing tags.
output start (-1), "\n\n\n";
__END__
##############################################################################
# Documentation
##############################################################################
=head1 NAME
faq2html - Convert some particular text formats into XHTML
=head1 SYNOPSIS
B [B<-hluv>] [B<-s> I