#!/usr/bin/perl
our $ID = q$Id: nan-archive,v 1.15 2009-01-02 21:59:40 eagle Exp $;
#
# nan-archive -- Archive posts to news.announce.newgroups.
#
# Copyright 2003, 2008 Russ Allbery <rra@stanford.edu>
#
# This program is free software; you may redistribute it and/or modify it
# under the same terms as Perl itself.

##############################################################################
# Local customization
##############################################################################

# The path to the news.announce.newgroups archive.
our $ARCHIVE = '/srv/nan/archive/archive';

# The e-mail address to which to report errors.
our $ERRORS = 'rra@stanford.edu';

# The path to the keyring used for PGPMoose verification.
our $KEYRING = '/srv/nan/archive/keyring';

# The log file into which to record actions taken.  The current year and
# month will be appended.
our $LOGBASE = '/srv/nan/archive/logs/log';
our $LOG;

# The PGPMoose signing identity.
our $SIGNER
    = 'Moderator of news.announce.newgroups <newgroups-request@isc.org>';

##############################################################################
# Modules and declarations
##############################################################################

require 5.006;

use strict;
use subs qw(log);

use Getopt::Long qw(GetOptions);
use Fcntl qw(LOCK_EX);
use IO::Handle ();
use News::Article;
use PGP::Sign ();
use POSIX qw(strftime);

##############################################################################
# Error reporting
##############################################################################

# Open the log file.
sub openlog {
    $LOG = $LOGBASE . '.' . strftime ('%Y-%m', gmtime);
    open (LOG, ">> $LOG") or die "$0: can't open $LOG: $!\n";
}

# Log a message to the log file.
sub log {
    my $log = $LOGBASE . '.' . strftime ('%Y-%m', gmtime);
    if ($log ne $LOG) {
        close LOG;
        openlog;
    }
    my $date = strftime ('%Y-%m-%d %T', gmtime);
    print LOG "$date [$$] ", @_, "\n";
    LOG->flush;
}

# Log a fatal message and then exit.
sub logdie {
    log ('FATAL: ', @_);
    exit 1;
}

# Report an error via e-mail and then exit.  This should not cause the article
# to be retried, since it was still received and saved successfully, but it
# won't be deleted out of the queue.
sub fail {
    my ($id, $message) = @_;
    my $sendmail = grep { -x $_ } qw(/usr/sbin/sendmail /usr/lib/sendmail);
    $sendmail ||= 'sendmail';
    open (MAIL, "|$sendmail -oi -oem $ERRORS")
        or die "$0: cannot fork sendmail: $!\n";
    print MAIL "To: $ERRORS\n";
    print MAIL "From: news.announce.newgroups Archiver <$ERRORS>\n";
    print MAIL "Subject: Error while processing $id\n";
    print MAIL "\n";
    print MAIL "Encountered the following error:\n\n";
    print MAIL "    $message\n\n";
    print MAIL "while processing $id\n";
    close MAIL;
    if ($? == 0) {
        logdie $message;
    } else {
        log ('FATAL: ', $message);
        logdie 'cannot send mail, sendmail exit status ', ($? >> 8), "\n";
    }
}

##############################################################################
# Message parsing
##############################################################################

# Parse an article into a hash of headers and an array containing the message.
# Takes the file handle to the article, its message ID, an array ref into
# which to put the message, and a hash ref into which to put parsed headers.
sub parse_article {
    my ($fh, $id, $message, $hdr) = @_;

    # Parse the headers first, checking for duplicates as we go.
    my $last;
    local $_;
    while (<$fh>) {
        s/\r\n\z/\n/;
        push (@$message, $_);
        last if ($_ eq "\n");
        if (/^(\S+):\s+(.+)/) {
            $last = lc $1;
            if (exists $$hdr{$last}) {
                $$hdr{$last} .= ' ' . $2;
            } else {
                $$hdr{$last} = $2;
            }
        } elsif (defined ($last) && /^(\s.+)/) {
            $$hdr{$last} .= $1;
        } else {
            log "$id broken headers";
            return 0;
        }
    }

    # Check the validity of the message.
    unless (@$message) {
        log "$id appears to be empty";
        return 0;
    }
    unless ($$hdr{'archive-name'}) {
        log "$id has no Archive-Name header";
        return 0;
    }
    if ($$hdr{'archive-name'} !~ m%^([A-Za-z][\w.]+/)?[a-z0-9+.-]+\s*$%) {
        log "$id has an invalid Archive-Name header";
        return 0;
    }

    # Slurp up the article body.
    while (<$fh>) {
        s/\r\n\z/\n/;
        last if ($_ eq ".\n");
        s/^\.\././;
        push (@$message, $_);
    }

    # Check the PGPMoose signature.
    my $article = News::Article->new ($message);
    unless ($article) {
        log "$id could not be parsed";
        return 0;
    }
    $PGP::Sign::PGPPATH = $KEYRING;
    my $signer = $article->verify_pgpmoose ('news.announce.newgroups');
    unless ($signer && $signer eq $SIGNER) {
        if (not defined $signer) {
            log "$id has an invalid signature";
        } else {
            $signer =~ s/\n/_/g;
            log "$id has an invalid signature from $signer";
        }
        return 0;
    }
    return 1;
}

# Given an Archive-Name header, turn that into a pathname relative to the root
# of the archive.
sub parse_archive {
    my ($archive) = @_;
    $archive =~ s/\s+$//;
    if ($archive =~ m%/%) {
        return $archive;
    } else {
        my ($hierarchy) = split (/\./, $archive, 2);
        $hierarchy = 'other.articles' unless $hierarchy;
        return "$hierarchy/$archive";
    }
}

##############################################################################
# Message archiving
##############################################################################

# Archive a message.  Takes the file into which to archive the message and the
# complete message as a reference to an array.  Uses a lock file at the top of
# the archive directory to prevent any contention.
sub archive {
    my ($file, $id, $article) = @_;
    fail ($id, "bad archive name $file")
        if (!$file || $file =~ m%^\.|^-|^/|\s%);
    fail ($id, "bad archive name $file")
        if (($file =~ tr%/%/%) != 1);
    $file = "$ARCHIVE/$file";
    my $dir = $file;
    $dir =~ s%/[^/]+$%%;
    fail ($id, "directory $dir does not exist") unless -d $dir;
    my $message = join ('', @$article);
    $message = 'From usenet ' . scalar (gmtime) . "\n" . $message;
    my $umask = umask 002;
    open (LOCK, "+> $ARCHIVE/.lock")
        or fail ($id, "cannot open lock file $ARCHIVE/.lock: $!");
    flock (LOCK, LOCK_EX)
        or fail ($id, "cannot lock $ARCHIVE/.lock: $!");
    open (FILE, ">> $file")
        or fail ($id, "cannot open $file: $!");
    print FILE "\n" if -s $file;
    print FILE $message
        or fail ($id, "cannot write to $file: $!");
    close FILE
        or fail ($id, "cannot write to $file: $!");
    close LOCK;
    umask $umask;
}

##############################################################################
# Main routine
##############################################################################

# Trim extraneous garbage from the path.
$| = 1;
my $fullpath = $0;
$0 =~ s%.*/%%;

# Parse command-line options.
my ($help, $version);
Getopt::Long::config ('bundling');
GetOptions ('h|help'    => \$help,
            'v|version' => \$version) or exit 1;
if ($help) {
    print "Feeding myself to perldoc, please wait....\n";
    exec ('perldoc', '-t', $fullpath);
} elsif ($version) {
    my $version = join (' ', (split (' ', $ID))[1..3]);
    $version =~ s/,v\b//;
    $version =~ s/(\S+)$/($1)/;
    $version =~ tr%/%-%;
    print $version, "\n";
    exit;
}

# Get permissions right.
umask 007;

# Open our log file.
openlog;

# Processing loop.  If file names were specified on the command-line, process
# that single file.  Otherwise, we listen on stdin and get file names and
# message IDs, one set per line, separated by spaces.
my ($file, $id);
if (@ARGV) {
    for my $file (@ARGV) {
        my $id = $file;

        # Parse the article.
        open (ART, $file) or fail ($id, "cannot open article $file: $!");
        my (@message, %hdr);
        next unless parse_article (\*ART, $id, \@message, \%hdr);

        # Archive the article.
        my $archive = parse_archive ($hdr{'archive-name'});
        archive ($archive, $id, \@message);
        log "$id archived (manual)";
    }
} else {
    while (<STDIN>) {
        ($file, $id) = split;

        # Parse the article.
        open (ART, $file) or fail ($id, "cannot open article $file: $!");
        my (@message, %hdr);
        next unless parse_article (\*ART, $id, \@message, \%hdr);

        # Archive the article.
        my $archive = parse_archive ($hdr{'archive-name'});
        archive ($archive, $id, \@message);
        log "$id archived";
    } continue {
        unlink $file or logdie "cannot unlink article $file: $!";
    }
}

##############################################################################
# Documentation
##############################################################################

=head1 NAME

nan-archive - Archive posts to news.announce.newgroups

=head1 SYNOPSIS

B<nan-archive> [I<file> ...]

=head1 REQUIREMENTS

Perl 5.6.0 or later and the Perl modules News::Article and PGP::Sign, both
of which are available from CPAN.

=head1 DESCRIPTION

B<nan-archive> processes messages posted to news.announce.newgroups,
verifying the article's PGPMoose signature and then archiving it according
to its Archive-Name header.  All actions are logged to a log file.

It expects to receive on standard input file names and message IDs, one
pair of file name and message ID per line.  The message ID is optional,
but is used for logging until the message has been parsed and therefore
omitting it may result in log messages that have no associated ID.  The
file is deleted when it has been successfully processed.

Optionally, instead, one or more files may be specified on the command
line.  In this case, they'll be processed as above, but they'll be noted
as manually processed in the logs.

The Archive-Name header field must be in one of three formats: the name of
a newsgroup (for articles affecting that group); a path formed from a
directory, a slash, and a filename; or some other filename, which will be
stored in the C<other.articles> directory.  If the Archive-Name header
field contains only a newsgroup name, it will be saved in a directory
named after the top-level hierarchy of that newsgroup name.

=head1 FILES

=over 4

=item F</srv/nan/archive/archive>

The archive area for all messages.  The top level should just be
directories, and all messages will be archived in those directories
according to the naming scheme described above.  The archived messages
will be stored in mbox format.  The archive is locked against multiple
simultaneous writers by using fcntl locking on a file named C<.lock> at
the top level.

=item F</srv/nan/archive/keyring/pubring.gpg>

The keyring used to verify messages using the PGPMoose protocol.  All
messages must be signed by a key stored in this keyring with a key ID
matching the one configured at the top of this script.

=item F</srv/nan/archive/logs/log.%Y-%m>

Where actions are logged.  %Y is replaced by the current four-digit year
and %m by the current two digit month.

=back

=head1 NOTES

B<nan-archive> should be run for each message posted to
news.announce.newgroups.  Its input format is designed to work with the
B<tinyleaf> server that will come with INN 2.5 and later and is available
from INN's Subversion repository, but it should also work as a channel
feed from pre-storage-API versions of INN (1.x).  It will not work without
modification via a channel feed from a current version of INN, since it
doesn't understand the storage API and doesn't know how to retrieve
articles by tokens.

To run B<nan-archive> with B<tinyleaf>, create C<archive>, C<logs>, and
C<spool> directories and the GnuPG keyring as described above.  Then, run
B<tinyleaf> on some port, configuring it to use the C<spool> directory and
run B<nan-archive>.  A typical B<tinyleaf> command line would be:

    tinyleaf /srv/nan-archive/spool /srv/nan-archive/scripts/nan-archive

I run B<tinyleaf> using B<tcpserver> (from the ucspi-tcp package), but
B<inetd> should work equally well.

Then, set up a feed of news.announce.newgroups to the system running
B<tinyleaf>.

=head1 SEE ALSO

inetd(8), tcpserver(1), tinyleaf(8)

L<http://www.eyrie.org/~eagle/software/scripts/> will have the latest
version of this script.

=head1 AUTHOR

Russ Allbery <rra@stanford.edu>

=head1 COPYRIGHT AND LICENSE

Copyright 2003, 2008 Russ Allbery <rra@stanford.edu>

This program is free software; you may redistribute it and/or modify it
under the same terms as Perl itself.

=cut