#!/usr/bin/perl
our $ID = q$Id: filter-syslog,v 2.3 2012/05/19 22:58:59 eagle Exp $;
#
# filter-syslog -- Filters a syslog file and mails the results.
#
# Written by Russ Allbery <rra@stanford.edu>
# Copyright 2002, 2003, 2004, 2006, 2007, 2009, 2010, 2011, 2012
#     The Board of Trustees of the Leland Stanford Junior University
#
# This program is free software; you can redistribute it and/or modify it
# under the same terms as Perl itself.

##############################################################################
# Site configuration
##############################################################################

# filter-syslog looks for filter-syslog.conf in the following directories.
# /etc/leland is included for backward-compatibility reasons at Stanford.
# The directories are tried in the order listed, and only when the
# configuration path given on the command line does not start with "/" or
# "./".
our @CONFIGDIR = qw(/etc /etc/leland);

# Global ignore regexes.  Lines matching these patterns are always ignored.
# These are Linux syslogd restart messages, which don't follow a normal format
# (they have the version number of syslogd after the name of the program).
# config_parse seeds $CONFIG{raw} with this list, so these patterns are
# matched against the whole line (minus trailing whitespace), just like a
# bare /regex/ line in the configuration file.
our @GLOBAL = (
    # "<timestamp> <host> syslogd <version>: restart..."
    qr/^(\w+  ?\d+ [\d:]+) \S+ syslogd [\w.\#]+: restart\b/,
    # "<timestamp> <host> exiting on signal 15"
    qr/^(\w+  ?\d+ [\d:]+) \S+ exiting on signal 15$/
);

# Syslog log parsing regexes.  These do the basic parse of the log line and
# any line that fails even this basic parse, except for a few specially
# recognized variations or globally filtered lines, will be resent to the
# reporting address.  Right now, these regexes are constrained by having to
# return the timestamp as $1, the hostname as $2, the ID (the program name) as
# $3, and the message itself as $4.  They are tried in order and the first
# match wins.
our @SYSLOG_REGEXES = (
    # "Mon DD HH:MM:SS host program: message", with an optional bracketed
    # three-word prefix before the message that is dropped (presumably the
    # Solaris "[ID number facility.level]" tag -- confirm).
    qr/^(\w+  ?\d+ [\d:]+) (\S+) (\S+): (?:\[\S+ \d+ \S+\] )?(.*)/,
    # Same layout with a YYYY-MM-DDT... timestamp; an optional leading
    # "digits: " before the message is dropped.
    qr/^(\d{4}-\d\d-\d\dT\S+) (\S+) (\S+): (?:\d+: )?(.*)/,
    # "Message forwarded from HOST: program: message".  The forwarding host
    # is discarded and the original host is returned as $2.
    qr/^(\w+  ?\d+ [\d:]+) \S+ Message forwarded from (\S+): (\S+): (.*)/,
    # "<PRI>1 timestamp host program pid - - message" (looks like the RFC 5424
    # layout -- confirm).  The program field may be empty.
    qr/^<\d+>1 (\d{4}-\d\d-\d\dT\S+) (\S+) (\S*) (?:-|\d+) - -  ?(.*)/
);

# Apache log-parsing regexes.  These return the Apache timestamp (which is the
# result of ctime, rather than the syslog timestamp format) in $1, the log
# level in $2, the client IP (if any) in $3, and the rest of the line in $4.
our @APACHE_REGEXES = (
    # "[ctime timestamp] [level] [client ADDRESS] message", where the
    # "[client ...]" part is optional.
    qr/^\[(\S+\s*\S+\s*\S+\s*\S+\s*\d+)\] \[(\S+)\](?: \[client (\S+)\])? (.*)/
);

# Stores the results of parsing the configuration file.  The keys are the
# lowercased variable names set in the file (alert, sender, subject) plus:
#
#     global      => regexes matched against the message of every program
#     ignore      => hash of program name to a list of message regexes
#     ignorematch => list of [ program regex, message regex ] pairs
#     raw         => regexes matched against the entire line
#     range       => list of [ start regex, end regex ] pairs
our %CONFIG;

# Set to true if we're not sending mail, just printing the filtered results to
# standard output.
our $NOMAIL;

##############################################################################
# Modules and declarations
##############################################################################

require 5.006;

use strict;

use Getopt::Long qw(GetOptions);
use Sys::Hostname qw(hostname);

##############################################################################
# Configuration parsing
##############################################################################

# Parse an individual configuration file and merge what it defines into
# %CONFIG.  Calls itself recursively to handle included files or directories.
# Note that there is currently no defense against recursive inclusion of files
# other than a simple check that a file doesn't include itself.
#
# The following kinds of lines are recognized, tested in this order.  Blank
# lines and lines starting with # are skipped, and anything else draws a
# warning and is otherwise ignored.
#
#     variable = value      stored as $CONFIG{variable}, name lowercased
#     include PATH          parse PATH, or every file in the directory PATH
#                           whose name does not contain a period
#     *: /regex/            appended to $CONFIG{global}
#     /program/: /regex/    appended to $CONFIG{ignorematch}
#     program: /regex/      appended to $CONFIG{ignore}{program}
#     /start/ ... /end/     appended to $CONFIG{range}
#     /regex/               appended to $CONFIG{raw}
#
# Takes the path of the file to parse.  Dies if the file or an included
# directory cannot be opened or if the file includes itself.  An invalid
# regex is also fatal since the patterns are compiled here with qr//.
sub config_parse_file {
    my ($file) = @_;
    local $_;

    # Three-argument open with a lexical handle.  The path may come from an
    # include line, and the two-argument form would treat leading or trailing
    # characters such as >, <, and | as an open mode.  A lexical handle is
    # also private to each level of the recursion.
    open (my $conf, '<', $file) or die "$0: cannot open $file: $!\n";
    while (<$conf>) {
        # Save the line number now.  $. follows whichever handle was read
        # last, so after parsing an included file it no longer describes our
        # position in this file.
        my $lineno = $.;
        chomp;
        next if /^\s*\#/;
        next if /^\s*$/;
        if (m%^\s*([^/]\S*)\s*=\s*(.*)%) {
            my ($param, $value) = ($1, $2);
            $param = lc $param;

            # Only an empty value is an error; "0" is a legitimate value.
            # The (empty) value is still stored, as before.
            if ($value eq '') {
                warn "$0:$file:$lineno: parse error, no value for variable\n";
            }
            $CONFIG{$param} = $value;
        } elsif (m%^\s*include\s+(\S+)\s*$%) {
            my $inc = $1;
            my @files;
            if (-d $inc) {
                opendir (my $dir, $inc)
                    or die "$0:$file:$lineno: cannot open directory $inc: $!\n";

                # Skip every name containing a period.  This drops . and ..
                # as well as files such as foo.bak or foo.rpmsave.
                @files = map { "$inc/$_" } grep { !/\./ } readdir $dir;
                closedir $dir;
            } else {
                @files = ($inc);
            }
            for my $included (@files) {
                if ($included eq $file) {
                    die "$0:$file:$lineno: config file recursively included\n";
                }
                config_parse_file ($included);
            }
        } elsif (m%^\s*(\S+):\s*/(.*)/\s*$%) {
            my ($program, $regex) = ($1, $2);
            if ($program eq '*') {
                push (@{ $CONFIG{global} }, qr/$regex/);
            } elsif ($program =~ m,^/(.+)/\z,) {
                my $data = [ qr/$1/, qr/$regex/ ];
                push (@{ $CONFIG{ignorematch} }, $data);
            } else {
                $CONFIG{ignore}{$program} ||= [];
                push (@{ $CONFIG{ignore}{$program} }, qr/$regex/);
            }
        } elsif (m%^\s*/(.*)/\s*\.\.\.\s*/(.*)/\s*$%) {
            my ($start, $end) = ($1, $2);
            push (@{ $CONFIG{range} }, [ qr/$start/, qr/$end/ ]);
        } elsif (m%^\s*/(.*)/\s*$%) {
            my $regex = $1;
            push (@{ $CONFIG{raw} }, qr/$regex/);
        } else {
            warn "$0:$file:$lineno: parse error, unknown line\n";
        }
    }
    close $conf;
}

# Parse a configuration file and fill out the %CONFIG hash.  Also make sure
# that all the required configuration parameters are set.
#
# Takes the path of the top-level configuration file.  The filter lists in
# %CONFIG are reset first, with the raw list seeded from @GLOBAL, and then
# the file and everything it includes is parsed.  A missing alert or subject
# only produces a warning.  Dies if alert, sender, or subject contains a
# backslash or a single quote, since those values are handed to sendmail and
# must not be able to break out of quoting.  Finally, the first $h in the
# subject is replaced with the local hostname.
sub config_parse {
    my ($file) = @_;
    $CONFIG{global} = [];
    $CONFIG{ignorematch} = [];
    $CONFIG{raw} = [ @GLOBAL ];
    $CONFIG{range} = [];
    config_parse_file ($file);

    for my $param (qw/alert subject/) {
        unless ($CONFIG{$param}) {
            warn "$0: parameter $param not set in $file\n";
        }
    }

    # sender is optional, but when present it is used on the sendmail command
    # line just like alert and so is subject to the same restriction.
    for my $param (qw/alert sender subject/) {
        next unless defined $CONFIG{$param};
        if ($CONFIG{$param} =~ /[\\\']/) {
            die "$0: $param setting must not contain \\ or '\n";
        }
    }
    $CONFIG{subject} =~ s/\$h/hostname/e if defined $CONFIG{subject};
}

##############################################################################
# Mail sending
##############################################################################

# Initialize mail sending by opening the MAIL file handle.
#
# If $NOMAIL is set, MAIL is just a duplicate of standard output and nothing
# else is written.  Otherwise, MAIL is a pipe to sendmail (the first of
# /usr/sbin/sendmail and /usr/lib/sendmail that is executable, falling back
# on the latter), invoked with the alert address as the recipient and, if a
# sender is configured, -f and that sender.  The From (only with a sender),
# To, and Subject headers and the blank line ending the headers are then
# written to the pipe.
#
# Dies if standard output cannot be duplicated or the child process cannot
# be created.
sub mail_init {
    if ($NOMAIL) {
        open (MAIL, '>&STDOUT') or die "$0: cannot dup stdout: $!\n";
        return;
    }

    my ($command) = grep { -x $_ }
        qw(/usr/sbin/sendmail /usr/lib/sendmail);
    $command ||= '/usr/lib/sendmail';
    my @args;
    push (@args, '-f', $CONFIG{sender}) if $CONFIG{sender};
    push (@args, $CONFIG{alert});

    # Fork with an implicit pipe and exec sendmail with an argument list
    # rather than building a shell command string.  The configured addresses
    # are then never parsed by a shell, whatever characters they contain.
    # (The forking form of open is used instead of the list form of a pipe
    # open since the latter requires a newer Perl than this script demands.)
    my $pid = open (MAIL, '|-');
    die "$0: unable to fork sendmail: $!\n" unless defined $pid;
    if ($pid == 0) {
        # Child.  exec only returns on failure; make sure the child never
        # falls through into the parent's code.
        exec { $command } $command, @args;
        warn "$0: unable to exec $command: $!\n";
        exit 127;
    }
    print MAIL "From: $CONFIG{sender}\n" if $CONFIG{sender};
    print MAIL "To: $CONFIG{alert}\nSubject: $CONFIG{subject}\n\n";
}

{
    # True once mail_init has been called, which happens when the first line
    # is reported.  Private to mail_line and mail_close.
    my $sending = 0;

    # Send a line via email.  This stores internal state to see whether we've
    # already initialized mail sending and does so if required, so no mail
    # is started at all if no line is ever reported.  Takes the line to
    # send, which is written to MAIL as-is.
    sub mail_line {
        my ($line) = @_;
        mail_init () unless $sending;
        $sending = 1;
        print MAIL $line;
    }

    # Finish sending the mail.  Does nothing if no line was ever sent.  When
    # mailing (as opposed to printing to standard output), dies if sendmail
    # did not exit successfully.
    sub mail_close {
        return unless $sending;
        close MAIL;
        return if $NOMAIL;
        return if $? == 0;

        # $? is -1 if the child's status could not be collected.  Otherwise
        # the low seven bits hold the signal that killed the child, if any,
        # and the high byte holds its exit status.
        if ($? == -1) {
            die "$0: cannot close pipe to sendmail: $!\n";
        }
        if ($? & 127) {
            die ("$0: sendmail killed by signal ", ($? & 127), "\n");
        }
        die ("$0: sendmail exited with status ", ($? >> 8), "\n");
    }
}

##############################################################################
# Filtering
##############################################################################

# Filtering has to maintain a bunch of state, all of which is stored in a hash
# whose reference is passed around in $state.  This has the following keys:
#
#     last  => whether the last line was printed
#     range => reference to an array of lines possibly part of a range
#     start => index of the regex that started the current range
#     end   => regex that would end the current range

# Run a buffered set of lines back through the normal filter.  This is used
# when a line looked like the start of an ignore range but the end of that
# range never showed up, either because the input ran out or because 1000
# lines were buffered without seeing it.  Mutually recursive with
# filter_line.
#
# Takes a reference to the array of buffered lines and the filter state.  The
# buffer and the end regex in the state are cleared before the lines are
# replayed, but the start index is left alone, which is what stops
# filter_line from treating the first line as the start of the same range
# again.  Replaying may itself leave lines buffered for some other range; if
# so, those are replayed in turn until nothing is left buffered.
sub reprocess_range {
    my ($range, $state) = @_;
    my $pending = $range;
    do {
        $state->{range} = [];
        delete $state->{end};
        for my $entry (@$pending) {
            filter_line ($entry, $state);
        }
        $pending = $state->{range};
    } while (@$pending);
}

# The core of the script.  Takes one line of syslog output and the filter
# state and filters the line.  The line is either dropped, buffered as part
# of a possible ignore range, or passed to mail_line to be reported.
# $state->{last} is kept up to date so that a following "last message
# repeated" line is reported only if the line it refers to was.
sub filter_line {
    my ($line, $state) = @_;

    # Solaris mark lines.
    if ($line =~ /-- MARK --/) {
        $state->{last} = 0;
        return;
    }

    # If we're in the middle of a range, see if we found the end of it.  Also
    # check whether we've seen as many lines as we're willing to search for
    # and are forcibly aborting the range.
    if ($state->{end}) {
        if ($line =~ /$state->{end}/) {
            $state->{range} = [];
            delete $state->{start};
            delete $state->{end};

            # The whole range, including this line, was suppressed, so the
            # previous line was not printed.  Without this, a following
            # "last message repeated" line would be reported based on
            # whatever was seen before the range started.
            $state->{last} = 0;
        } else {
            push (@{ $state->{range} }, $line);
            if (@{ $state->{range} } >= 1000) {
                reprocess_range ($state->{range}, $state);
            }
        }
        return;
    }

    # See if we're starting a range.  If we see a match to the start of a
    # range, we start accumulating lines until we match the end of that range.
    # Note, though, that if $state->{start} is set, we need to be sure not to
    # detect the same range start again; otherwise, we'll get into an infinite
    # loop.  ($state->{start} is only still set here when this line is the
    # first line of a range being reprocessed.)
    for (my $i = 0; $i < @{ $CONFIG{range} }; $i++) {
        next if (defined ($state->{start}) && $i <= $state->{start});
        my $start = $CONFIG{range}[$i][0];
        if ($line =~ /$start/) {
            $state->{range} = [ $line ];
            $state->{start} = $i;
            $state->{end} = $CONFIG{range}[$i][1];
            return;
        }
    }
    delete $state->{start};

    # Handle repeated messages by including the message repeated line if we
    # included the previous line.  The hostname is dropped from the line that
    # is reported.
    if ($line =~ /^(\S+\s*\S+\s*\S+) \S+ (last message repeated \d+ time)/) {
        mail_line ("$1 $2\n") if $state->{last};
        return;
    }

    # Remove trailing whitespace and then see if the line matches any of our
    # global filtering regexes.
    my $raw = $line;
    $raw =~ s/\s+$//;
    for my $regex (@{ $CONFIG{raw} }) {
        if ($raw =~ /$regex/) {
            $state->{last} = 0;
            return;
        }
    }

    # Parse the syslog or Apache log line.  Any line that we can't parse is
    # mailed.  $hostname and $client are captured as part of the parse but
    # are not currently used for anything.
    my ($timestamp, $hostname, $id, $body);
    for my $regex (@SYSLOG_REGEXES) {
        if ($line =~ /$regex/) {
            ($timestamp, $hostname, $id, $body) = ($1, $2, $3, $4);
            last;
        }
    }
    my ($client, $level);
    for my $regex (@APACHE_REGEXES) {
        if ($line =~ /$regex/) {
            ($timestamp, $level, $client, $body) = ($1, $2, $3, $4);
            $id = "apache-$level";
            last;
        }
    }
    unless ($timestamp) {
        mail_line ($line);
        $state->{last} = 1;
        return;
    }
    $body =~ s/\s+$//;

    # The program is the ID up to the first [, which strips any PID.  It can
    # fail to match if the ID field was empty, in which case treat the
    # program name as the empty string.
    my ($program) = ($id =~ /^([^\[]+)/);
    $program = '' unless defined $program;

    # Check to see if we're ignoring this message, and if not, start a report
    # (if we haven't already) and include it in the report.
    for my $regex (@{ $CONFIG{global} }) {
        if ($body =~ /$regex/) {
            $state->{last} = 0;
            return;
        }
    }
    if ($CONFIG{ignore}{$program}) {
        for my $regex (@{ $CONFIG{ignore}{$program} }) {
            if ($body =~ /$regex/) {
                $state->{last} = 0;
                return;
            }
        }
    }
    for my $data (@{ $CONFIG{ignorematch} }) {
        my ($match, $regex) = @$data;
        if ($program =~ /$match/ and $body =~ /$regex/) {
            $state->{last} = 0;
            return;
        }
    }
    mail_line ($line);
    $state->{last} = 1;
}

##############################################################################
# Implementation
##############################################################################

# Remember the path we were invoked as (perldoc needs it for --help) and then
# trim $0 down to the bare program name for error reporting.
my $fullpath = $0;
$0 =~ s%^.*/%%;

# Parse command-line options.  Single-letter options can be bundled and
# option names are case-sensitive.
my ($help, $hostname, $version);
Getopt::Long::config ('bundling', 'no_ignore_case');
my @optspec = (
    'help|h'     => \$help,
    'hostname|o' => \$hostname,
    'no-mail|n'  => \$NOMAIL,
    'version|v'  => \$version,
);
GetOptions (@optspec) or exit 1;

# --help hands the script to perldoc and never returns.  It takes precedence
# over --version, which prints the program name, version, and date taken
# from the RCS identifier in $ID and exits.
if ($help) {
    print "Feeding myself to perldoc, please wait....\n";
    exec ('perldoc', '-t', $fullpath) or die "$0: can't fork: $!\n";
}
if ($version) {
    my @fields = (split (' ', $ID))[1..3];
    my $banner = join (' ', @fields);
    $banner =~ s/,v\b//;
    $banner =~ s/(\S+)$/($1)/;
    $banner =~ tr%/%-%;
    print "$banner\n";
    exit 0;
}

# The path to the config file is the only argument.  Unless it starts with /
# or ./, use the first configuration directory that contains such a file,
# and otherwise leave the path as given.
my $config = shift || 'filter-syslog.conf';
if ($config !~ m%^\.?/%) {
    for my $dir (@CONFIGDIR) {
        my $candidate = "$dir/$config";
        next unless -f $candidate;
        $config = $candidate;
        last;
    }
}
config_parse ($config);

# Now, process our input one line at a time.  Output goes through a pipe to
# sendmail, which is only started if we find anything of note.
my %state = (range => []);
while (defined (my $line = <>)) {
    filter_line ($line, \%state);
}

# Lines still buffered at the end of input belong to a range whose end we
# never saw, so run them through the filter normally.
reprocess_range ($state{range}, \%state) if @{ $state{range} };
mail_close ();

__END__

##############################################################################
# Documentation
##############################################################################

=head1 NAME

filter-syslog - Filters a syslog file and mails the results

=head1 SYNOPSIS

filter-syslog [B<-hnov>] [I<config>] < I<syslog>

=head1 DESCRIPTION

B<filter-syslog> parses a log generated by syslog, filtering out all of
the boring lines as configured in I<config>, and then mails the remaining
lines to the address specified in I<config>.  It expects the log file on
standard input, and is designed to run from an analyze action in
newsyslog(8), although it can be used in other situations as well.  It can
also parse Apache error logs and do similar filtering.

If I<config> doesn't begin with C</> or C<./>, it's taken to be relative
to either F</etc> or F</etc/leland>, wherever the file is found (searched
in that order); if it is found in neither, it is opened relative to the
current directory.  If I<config> is not specified, it defaults to
F<filter-syslog.conf> and is looked for in both F</etc> and
F</etc/leland> in the same way.

Lines containing C<-- MARK --> and sysklogd restart messages on Linux,
which look like:

    Sep 10 04:02:07 example syslogd 1.4.1: restart.
    Apr  1 23:55:01 example syslogd 1.4.1#10: restart.
    Apr  1 23:55:01 example syslogd 1.4.1#10: restart (remote reception).
    Apr  1 23:55:09 example exiting on signal 15

are always ignored.

Messages of the form:

    Apr 28 07:09:40 10.1.1.1 Message forwarded from example.org: \
        program[36398]: some log message

(line split only for readability in this example) will be parsed exactly
as if they had said:

    Apr 28 07:09:40 example.org program[36398]: some log message

This format is used by OpenBSD for forwarded syslog messages.

Please note that this is not intended to be a security tool or a real-time
monitoring tool, but rather a tool to make sure that system administrators
are aware of unusual log messages that might indicate server problems or
failing hardware.  An intrusion detection system would work differently
and would be more paranoid, and a real-time monitoring tool wouldn't run
in batch mode.  There are other tools available to do that type of
monitoring.

=head1 OPTIONS

=over 4

=item B<-h>, B<--help>

Print out this documentation (which is done simply by feeding the script
to C<perldoc -t>).

=item B<-n>, B<--no-mail>

Rather than sending the results via e-mail, instead print out the
non-boring lines that would have been sent via e-mail to standard output.
Useful for testing filter rules.

=item B<-o>, B<--hostname>

Display the hostname field (from the input syslog) in the output.  In this
version the option is parsed but has no effect on the output.

=item B<-v>, B<--version>

Print the version of B<filter-syslog> and exit.

=back

=head1 CONFIGURATION FILE

There are three types of valid lines in the configuration file: variable
settings, filter patterns, and includes of other configuration files.  A
variable setting looks like:

    variable = value

where I<value> can contain whitespace (but can't begin with whitespace).
A filter pattern looks like one of:

    program: /regex/
    /program/: /regex/
    /regex/
    /regex/ ... /regex/

where I<program> is the name of a particular program (the filter line
will only apply to log entries from that program) and I<regex> is a Perl
regular expression matching lines that are "boring" and shouldn't be
reported.  Any trailing whitespace in the syslog line will be removed
before matching it against the regex.  Slashes (C</>) in I<regex> do not
have to be escaped.  Each of these lines must be all on one line.

When a line is in Apache error log format, the program for that line will
be set to C<apache-I<level>> where I<level> is the log level for that line
(C<notice>, C<warn>, etc.).

If I<program> is surrounded by slashes (C</>), it is a regex and any
program name that matches that regex will have that filter line applied.
If I<program> is C<*> I<regex> will be applied to all lines, regardless of
what program they're from.

If I<program> is not present, as in the last two forms, the regex is
matched against the entire syslog line, including the timestamp, and the
line will be ignored if the regex matches.  This can be used to match logs
in a non-standard format, such as ones without a program name or with a
program name containing spaces.

If the line contains two regexes separated by C<...>, this indicates a
range of lines.  All lines between a line matching the first regex and a
line matching the second regex will be ignored, including the matching
lines.  Both regular expressions are matched against the entire line,
including the timestamp and program.  There must be no more than 1000
lines in the range; if more than 1000 lines are encountered after the
start regex, B<filter-syslog> will stop looking for the end regex and then
parse all the lines normally.

Finally, a line like:

    include /path/to/file

includes another configuration file at F</path/to/file>.  The path can be
a directory instead of a file, in which case every file in that directory
whose name does not contain a period is included (in no defined order).
Note that this skips not only hidden files but also files such as
F<local.conf> or F<rules.bak>.

The following variables are recognized:

=over 4

=item alert

The address to which to mail the filtering results.  No mail will be sent
if all of the input lines are filtered out by the regexes provided.  This
variable must be set and may not contain any backslashes or single quotes.

=item sender

The address from which to mail the filtering results (used for the
envelope sender and the From: header).  If not set, no address will be
given to sendmail, which will result in the mail system picking some
default value based on the user B<filter-syslog> is running as.  The value
of this variable may not contain any backslashes or single quotes.

=item subject

The value to use for the Subject: header of the filtering results.  If you
include C<$h> in the value, it will be replaced with the hostname.  This
variable must be set.

=back

If there are any input lines that don't match one of the filter rules,
they will be mailed to the value of I<alert> with a subject given by
I<subject>.

=head1 EXAMPLES

Filter /var/log/syslog using /etc/syslog.filter as a configuration
file.

    filter-syslog syslog.filter < /var/log/syslog

Here's a sample configuration file that filters out normal Kerberos
messages and sends the result to root@example.com with a Subject: header
of C<example syslog filter results>:

    alert = root@example.com
    subject = example syslog filter results

    kftgtd:  /^connect from /
    klogind: /^connect from /
    kshd:    /^Executing .* for principal /
    kshd:    /^Shell process completed\.$/
    kshd:    /^connect from /

Instead of the three separate lines to filter out TCP wrappers messages,
one could instead use the line:

    *: /^connect from /

to filter out all syslog lines that begin with C<connect from>, but this
runs a larger risk of filtering out messages that would be of interest.

The rule:

    apache-warn: /^FastCGI: /

would filter out all Apache error log messages about FastCGI.  The rule:

    apache-debug: /.*/

would filter out everything in an Apache error.log logged at debug level.

The filter pattern:

    /^\w{3} [ :0-9]{11} \S+ \w+\[\d+\]: connect from /

would match any syslog line from any program beginning with C<connect
from>.  This regular expression is matched against the entire line; notice
that the timestamp and host identifier have to be matched, as well as the
PID in brackets after the program name.  In this specific case, there is
no reason to use such a rule since B<filter-syslog> can parse that line
into a program name and message, but this sort of rule can be used to
match any arbitrary syslog line that B<filter-syslog> may not otherwise be
able to parse.

Finally, the configuration line:

    /START/ ... /END/

would filter out every log line from a line containing C<START> through a
line containing C<END>, inclusive.  (This example isn't particularly
useful, but the regular expressions can of course be more complex.)

=head1 FILES

=over 4

=item F</etc>

=item F</etc/leland>

If the configuration file given on the command line isn't an absolute
path, it is looked for first in F</etc> and then in F</etc/leland>.  This
default can be changed by editing the beginning of this program.

=item F</etc/filter-syslog.conf>

=item F</etc/leland/filter-syslog.conf>

=item F<./filter-syslog.conf>

The default configuration file, if none is given.  The paths will be
searched in the above order.

=back

=head1 BUGS

The rule that ignores C<-- MARK --> lines, which are automatically
generated by (at least) Solaris syslogd at periodic intervals if
requested, could be exploited to hide messages from B<filter-syslog> that
an administrator may want to see.  Please again note that this is not a
security tool.  However, a better regex should be developed and used
instead, regardless.

There is no protection against inclusion loops (a configuration file that
includes another file which then includes the first file).

=head1 NOTES

As of version 1.20, B<filter-syslog> removes trailing whitespace from
syslog lines before seeing if the lines match the provided regexes.
Earlier versions did not do this.  You may need to change your regexes
when upgrading from 1.19 to 1.20.

=head1 SEE ALSO

newsyslog(8)

The current version of this program is available from its web page at
L<http://www.eyrie.org/~eagle/software/filter-syslog/>.

=head1 AUTHORS

Russ Allbery <rra@stanford.edu>.  Patch for B<--hostname> from Steve
Benson.

=head1 COPYRIGHT AND LICENSE

Copyright 2002, 2003, 2004, 2006, 2007, 2009, 2010, 2011, 2012 The Board
of Trustees of the Leland Stanford Junior University.

This program is free software; you may redistribute it and/or modify it
under the same terms as Perl itself.

=cut