#!/usr/bin/perl
our $ID = q$Id: filter-syslog,v 2.3 2012/05/19 22:58:59 eagle Exp $;
#
# filter-syslog -- Filters a syslog file and mails the results.
#
# Written by Russ Allbery <rra@stanford.edu>
# Copyright 2002, 2003, 2004, 2006, 2007, 2009, 2010, 2011, 2012
#     The Board of Trustees of the Leland Stanford Junior University
#
# This program is free software; you can redistribute it and/or modify it
# under the same terms as Perl itself.

##############################################################################
# Site configuration
##############################################################################

# filter-syslog looks for filter-syslog.conf in the following directories.
# /etc/leland is included for backward-compatibility reasons at Stanford.
our @CONFIGDIR = qw(/etc /etc/leland);

# Global ignore regexes.  Lines matching these patterns are always ignored.
# These are Linux syslogd restart messages, which don't follow a normal format
# (they have the version number of syslogd after the name of the program).
#
# Traditional syslog timestamps pad a single-digit day of the month with a
# space ("Apr  1 23:55:01"), so one or two spaces are accepted between the
# month and the day.
our @GLOBAL = (
    qr/^(\w+ {1,2}\d+ [\d:]+) \S+ syslogd [\w.\#]+: restart\b/,
    qr/^(\w+ {1,2}\d+ [\d:]+) \S+ exiting on signal 15$/
);

# Syslog log parsing regexes.  These do the basic parse of the log line and
# any line that fails even this basic parse, except for a few specially
# recognized variations or globally filtered lines, will be resent to the
# reporting address.  Right now, these regexes are constrained by having to
# return the timestamp as $1, the hostname as $2, the ID (the program name) as
# $3, and the message itself as $4.
our @SYSLOG_REGEXES = (
    # Traditional syslog, optionally with a bracketed three-field tag before
    # the message.
    qr/^(\w+ {1,2}\d+ [\d:]+) (\S+) (\S+): (?:\[\S+ \d+ \S+\] )?(.*)/,

    # ISO 8601 timestamp, optionally with a numeric prefix on the message.
    qr/^(\d{4}-\d\d-\d\dT\S+) (\S+) (\S+): (?:\d+: )?(.*)/,

    # "Message forwarded from <host>:" lines; the forwarding host is dropped
    # and the original host is returned as $2.
    qr/^(\w+ {1,2}\d+ [\d:]+) \S+ Message forwarded from (\S+): (\S+): (.*)/,

    # "<priority>1 timestamp host program pid - - message" lines.
    qr/^<\d+>1 (\d{4}-\d\d-\d\dT\S+) (\S+) (\S*) (?:-|\d+) - - ?(.*)/
);

# Apache log-parsing regexes.  These return the Apache timestamp (which is the
# result of ctime, rather than the syslog timestamp format) in $1, the log
# level in $2, the client IP (if any) in $3, and the rest of the line in $4.
our @APACHE_REGEXES = (
    qr/^\[(\S+\s*\S+\s*\S+\s*\S+\s*\d+)\] \[(\S+)\](?: \[client (\S+)\])? (.*)/
);

# Stores the results of parsing the configuration file.
our %CONFIG;

# Set to true if we're not sending mail, just printing the filtered results to
# standard output.
our $NOMAIL;

##############################################################################
# Modules and declarations
##############################################################################

require 5.006;

use strict;

use Getopt::Long qw(GetOptions);
use Sys::Hostname qw(hostname);

##############################################################################
# Configuration parsing
##############################################################################

# Parse an individual configuration file.  Calls itself recursively to handle
# included files or directories.  Note that there is currently no defense
# against recursive inclusion of files other than a simple check that a file
# doesn't include itself.
# Parse one configuration file and merge its contents into %CONFIG.
#
#   $file - path of the configuration file to read
#
# Recognized line forms, tried in this order:
#
#   variable = value            stored as $CONFIG{lc variable}
#   include PATH                parse PATH, or every file in directory PATH
#   program: /regex/            pushed onto $CONFIG{ignore}{program}
#   *: /regex/                  pushed onto $CONFIG{global}
#   /program/: /regex/          pushed onto $CONFIG{ignorematch}
#   /start/ ... /end/           pushed onto $CONFIG{range}
#   /regex/                     pushed onto $CONFIG{raw}
#
# Blank lines and lines whose first non-whitespace character is # are
# skipped.  Unrecognized lines and empty variable values produce a warning.
# Dies if the file or an included directory cannot be opened, or if a file
# directly includes itself.
sub config_parse_file {
    my ($file) = @_;
    local $_;

    # A lexical handle opened in explicit read mode: each recursive call gets
    # its own handle, and $file can never be interpreted as a mode or a pipe.
    open (my $conf, '<', $file) or die "$0: cannot open $file: $!\n";
    while (<$conf>) {
        chomp;
        next if /^\s*\#/;
        next if /^\s*$/;
        if (m%^\s*([^/]\S*)\s*=\s*(.*)%) {
            my ($param, $value) = ($1, $2);
            $param = lc $param;
            unless ($value) {
                warn "$0:$file:$.: parse error, no value for variable\n";
            }
            $CONFIG{$param} = $value;
        } elsif (m%^\s*include\s+(\S+)\s*$%) {
            my $inc = $1;
            my @files;
            if (-d $inc) {
                opendir (my $dir, $inc)
                    or die "$0:$file:$.: cannot open directory $inc: $!\n";

                # Skips every entry whose name contains a period, which also
                # drops the "." and ".." entries.
                @files = map { "$inc/$_" } grep { !/\./ } readdir $dir;
                closedir $dir;
            } else {
                @files = ($inc);
            }
            for my $included (@files) {
                if ($included eq $file) {
                    die "$0:$file:$.: config file recursively included\n";
                }
                config_parse_file ($included);
            }
        } elsif (m%^\s*(\S+):\s*/(.*)/\s*$%) {
            my ($program, $regex) = ($1, $2);
            if ($program eq '*') {
                push (@{ $CONFIG{global} }, qr/$regex/);
            } elsif ($program =~ m,^/(.+)/\z,) {
                my $match = $1;
                push (@{ $CONFIG{ignorematch} }, [ qr/$match/, qr/$regex/ ]);
            } else {
                $CONFIG{ignore}{$program} ||= [];
                push (@{ $CONFIG{ignore}{$program} }, qr/$regex/);
            }
        } elsif (m%^\s*/(.*)/\s*\.\.\.\s*/(.*)/\s*$%) {
            my ($start, $end) = ($1, $2);
            push (@{ $CONFIG{range} }, [ qr/$start/, qr/$end/ ]);
        } elsif (m%^\s*/(.*)/\s*$%) {
            my $regex = $1;
            push (@{ $CONFIG{raw} }, qr/$regex/);
        } else {
            warn "$0:$file:$.: parse error, unknown line\n";
        }
    }
    close $conf;
}

# Parse a configuration file and fill out the %CONFIG hash.  Also make sure
# that all the required configuration parameters are set.
# Reset the rule lists in %CONFIG, parse $file into %CONFIG, and validate the
# result.
#
#   $file - path of the top-level configuration file
#
# Warns if alert or subject is not set.  Dies if alert, sender, or subject
# contains a backslash or a single quote: alert and sender are placed inside
# single quotes on the sendmail command line built by mail_init, so such a
# value could break out of the quoting.  The first $h in subject is replaced
# with the local hostname.
sub config_parse {
    my ($file) = @_;
    $CONFIG{global}      = [];
    $CONFIG{ignorematch} = [];
    $CONFIG{raw}         = [ @GLOBAL ];
    $CONFIG{range}       = [];
    config_parse_file ($file);
    for my $param (qw/alert subject/) {
        unless ($CONFIG{$param}) {
            warn "$0: parameter $param not set in $file\n";
        }
    }
    for my $param (qw/alert sender subject/) {
        next unless defined $CONFIG{$param};
        if ($CONFIG{$param} =~ /[\\\']/) {
            die "$0: $param setting must not contain \\ or '\n";
        }
    }
    $CONFIG{subject} =~ s/\$h/hostname ()/e if defined $CONFIG{subject};
}

##############################################################################
# Mail sending
##############################################################################

# Initialize mail sending by opening the MAIL file handle.  If $NOMAIL is
# set, MAIL is a duplicate of standard output and no headers are written.
# Otherwise, MAIL is a pipe to sendmail (the first executable one of
# /usr/sbin/sendmail and /usr/lib/sendmail, defaulting to the latter) and the
# From (if sender is set), To, and Subject headers are written, followed by
# the blank line that ends the headers.  Dies if the handle cannot be opened.
sub mail_init {
    if ($NOMAIL) {
        open (MAIL, '>&STDOUT') or die "$0: cannot dup stdout: $!\n";
        return;
    }
    my ($command) = grep { -x $_ } qw(/usr/sbin/sendmail /usr/lib/sendmail);
    $command ||= '/usr/lib/sendmail';

    # This string is run by the shell.  config_parse rejects single quotes
    # and backslashes in both values, so they cannot escape the quoting.
    $command .= " -f '$CONFIG{sender}'" if $CONFIG{sender};
    $command .= " '$CONFIG{alert}'";
    open (MAIL, "| $command") or die "$0: unable to fork sendmail: $!\n";
    print MAIL "From: $CONFIG{sender}\n" if $CONFIG{sender};
    print MAIL "To: $CONFIG{alert}\nSubject: $CONFIG{subject}\n\n";
}

{
    # True once mail_init has been called, so that no mail is started (and
    # none is sent) unless at least one line survives filtering.
    my $sending = 0;

    # Send a line via email.  This stores internal state to see whether we've
    # already initialized mail sending and does so if required.  Takes the
    # line to send as its only argument.
    sub mail_line {
        my ($line) = @_;
        mail_init () unless $sending;
        $sending = 1;
        print MAIL $line;
    }

    # Finish sending the mail.  Does nothing if no line was ever sent.  When
    # actually mailing, dies if sendmail exited with a non-zero status.
    sub mail_close {
        if ($sending) {
            close MAIL;
            return if $NOMAIL;
            die ("$0: sendmail exited with status ", ($? >> 8), "\n")
                if $? != 0;
        }
    }
}

##############################################################################
# Filtering
##############################################################################

# Filtering has to maintain a bunch of state, all of which is stored in a hash
# whose reference is passed around in $state.
# The filter state hash ($state) has the following keys:
#
#     last  => whether the last line was printed
#     range => reference to an array of lines possibly part of a range
#     start => index of the regex that started the current range
#     end   => regex that would end the current range

# Reprocess a set of lines.  We call this function if we thought we might have
# the start of an ignore range, but we never found the end (or accumulated
# 1000 lines without finding the end).  Mutually recursive with filter_line.
#
#   $range - reference to the array of buffered lines to run again
#   $state - reference to the filter state hash
#
# The start index is deliberately left in $state so that filter_line does not
# match the first buffered line against the same range start again.
sub reprocess_range {
    my ($range, $state) = @_;
    $state->{range} = [];
    delete $state->{end};
    for my $line (@$range) {
        filter_line ($line, $state);
    }

    # Replaying the lines may itself have opened a range that never closed.
    if (@{ $state->{range} }) {
        reprocess_range ($state->{range}, $state);
    }
}

# The core of the script.  Takes one line of syslog output and the filter
# state and filters the line.
#
#   $line  - one raw input line, including its trailing newline
#   $state - reference to the filter state hash
#
# Lines that survive every filter are passed to mail_line.
sub filter_line {
    my ($line, $state) = @_;

    # Solaris mark lines.
    if ($line =~ /-- MARK --/) {
        $state->{last} = 0;
        return;
    }

    # If we're in the middle of a range, see if we found the end of it.  Also
    # check whether we've seen as many lines as we're willing to search for
    # and are forcibly aborting the range.
    if ($state->{end}) {
        if ($line =~ /$state->{end}/) {
            $state->{range} = [];
            delete $state->{start};
            delete $state->{end};
        } else {
            push (@{ $state->{range} }, $line);
            if (@{ $state->{range} } >= 1000) {
                reprocess_range ($state->{range}, $state);
            }
        }
        return;
    }

    # See if we're starting a range.  If we see a match to the start of a
    # range, we start accumulating lines until we match the end of that range.
    # Note, though, that if $state->{start} is set, we need to be sure not to
    # detect the same range start again; otherwise, we'll get into an infinite
    # loop.
    for (my $i = 0; $i < @{ $CONFIG{range} }; $i++) {
        next if (defined ($state->{start}) && $i <= $state->{start});
        my $start = $CONFIG{range}[$i][0];
        if ($line =~ /$start/) {
            $state->{range} = [ $line ];
            $state->{start} = $i;
            $state->{end} = $CONFIG{range}[$i][1];
            return;
        }
    }
    delete $state->{start};

    # Handle repeated messages by including the message repeated line if we
    # included the previous line.
    if ($line =~ /^(\S+\s*\S+\s*\S+) \S+ (last message repeated \d+ time)/) {
        mail_line ("$1 $2\n") if $state->{last};
        return;
    }

    # Remove trailing whitespace and then see if the line matches any of our
    # global filtering regexes.
    my $raw = $line;
    $raw =~ s/\s+$//;
    for my $regex (@{ $CONFIG{raw} }) {
        if ($raw =~ /$regex/) {
            $state->{last} = 0;
            return;
        }
    }

    # Parse the syslog or Apache log line.  Any line that we can't parse is
    # mailed.
    my ($timestamp, $hostname, $id, $body);
    for my $regex (@SYSLOG_REGEXES) {
        if ($line =~ /$regex/) {
            ($timestamp, $hostname, $id, $body) = ($1, $2, $3, $4);
            last;
        }
    }

    # The Apache regexes are tried even after a syslog match; an Apache match
    # overrides the syslog parse and synthesizes the ID from the log level.
    my ($client, $level);
    for my $regex (@APACHE_REGEXES) {
        if ($line =~ /$regex/) {
            ($timestamp, $level, $client, $body) = ($1, $2, $3, $4);
            $id = "apache-$level";
            last;
        }
    }
    unless ($timestamp) {
        mail_line ($line);
        $state->{last} = 1;
        return;
    }
    $body =~ s/\s+$//;

    # Check to see if we're ignoring this message, and if not, start a report
    # (if we haven't already) and include it in the report.  The program name
    # is the ID up to (but not including) the first "[".
    my ($program) = ($id =~ /^([^\[]+)/);
    for my $regex (@{ $CONFIG{global} }) {
        if ($body =~ /$regex/) {
            $state->{last} = 0;
            return;
        }
    }
    if ($CONFIG{ignore}{$program}) {
        for my $regex (@{ $CONFIG{ignore}{$program} }) {
            if ($body =~ /$regex/) {
                $state->{last} = 0;
                return;
            }
        }
    }
    for my $data (@{ $CONFIG{ignorematch} }) {
        my ($match, $regex) = @$data;
        if ($program =~ /$match/ and $body =~ /$regex/) {
            $state->{last} = 0;
            return;
        }
    }
    mail_line ($line);
    $state->{last} = 1;
}

##############################################################################
# Implementation
##############################################################################

# Clean up $0 for error reporting.
my $fullpath = $0;
$0 =~ s%^.*/%%;

# Parse command-line options.
# NOTE(review): $hostname is set by -o/--hostname but is not consulted
# anywhere in the visible code -- confirm whether the option is meant to
# change the output.
my ($help, $hostname, $version);
Getopt::Long::config ('bundling', 'no_ignore_case');
GetOptions ('help|h'     => \$help,
            'hostname|o' => \$hostname,
            'no-mail|n'  => \$NOMAIL,
            'version|v'  => \$version) or exit 1;
if ($help) {
    print "Feeding myself to perldoc, please wait....\n";
    exec ('perldoc', '-t', $fullpath) or die "$0: can't fork: $!\n";
} elsif ($version) {
    # Turn the RCS Id string into "name version (yyyy-mm-dd)".
    my $version = join (' ', (split (' ', $ID))[1..3]);
    $version =~ s/,v\b//;
    $version =~ s/(\S+)$/($1)/;
    $version =~ tr%/%-%;
    print $version, "\n";
    exit 0;
}

# The path to the config file is the only argument.  Paths starting with / or
# ./ are used as given; anything else is looked up in @CONFIGDIR.
my $config = shift || 'filter-syslog.conf';
unless ($config =~ m%^\.?/%) {
    for my $dir (@CONFIGDIR) {
        if (-f "$dir/$config") {
            $config = "$dir/$config";
            last;
        }
    }
}
config_parse ($config);

# Now, process our input.  We spit our output out through a pipe to sendmail
# if we find anything of note.
my %state = (range => []);
while (<>) {
    filter_line ($_, \%state);
}

# If we never saw the end of some range, reprocess that chunk.
if (@{ $state{range} }) {
    reprocess_range ($state{range}, \%state);
}

# Close the mail handle.  This dies if sendmail reported a failure.
mail_close ();

__END__

##############################################################################
# Documentation
##############################################################################

=head1 NAME

filter-syslog - Filters a syslog file and mails the results

=head1 SYNOPSIS

filter-syslog [B<-hnov>] [I<config>] < I<syslog>

=head1 DESCRIPTION

B<filter-syslog> parses a log generated by syslog, filtering out all of the
boring lines as configured in I<config>, and then mails the remaining lines
to the address specified in I<config>.  It expects the log file on standard
input, and is designed to run from an analyze action in newsyslog(8),
although it can be used in other situations as well.  It can also parse
Apache error logs and do similar filtering.

If I<config> isn't an absolute path, it's taken to be relative to either
F</etc> or F</etc/leland>, wherever the file is found (searched in that
order).  If I<config> is not specified, it defaults to F<filter-syslog.conf>
and is looked for in both F</etc> and F</etc/leland>.

Lines containing C<-- MARK --> and sysklogd restart messages on Linux, which
look like:

    Sep 10 04:02:07 example syslogd 1.4.1: restart.
    Apr  1 23:55:01 example syslogd 1.4.1#10: restart.
    Apr  1 23:55:01 example syslogd 1.4.1#10: restart (remote reception).
    Apr  1 23:55:09 example exiting on signal 15

are always ignored.

Messages of the form:

    Apr 28 07:09:40 10.1.1.1 Message forwarded from example.org: \
        program[36398]: some log message

(line split only for readability in this example) will be parsed exactly as
if they had said:

    Apr 28 07:09:40 example.org program[36398]: some log message

This format is used by OpenBSD for forwarded syslog messages.

Please note that this is not intended to be a security tool or a real-time
monitoring tool, but rather a tool to make sure that system administrators
are aware of unusual log messages that might indicate server problems or
failing hardware.
An intrusion detection system would work differently and would be more
paranoid, and a real-time monitoring tool wouldn't run in batch mode.  There
are other tools available to do that type of monitoring.

=head1 OPTIONS

=over 4

=item B<-h>, B<--help>

Print out this documentation (which is done simply by feeding the script to
C<perldoc -t>).

=item B<-n>, B<--no-mail>

Rather than sending the results via e-mail, instead print out the non-boring
lines that would have been sent via e-mail to standard output.  Useful for
testing filter rules.

=item B<-o>, B<--hostname>

Display the hostname field (from the input syslog) in the output.

=item B<-v>, B<--version>

Print the version of B<filter-syslog> and exit.

=back

=head1 CONFIGURATION FILE

There are three types of valid lines in the configuration file: variable
settings, filter patterns, and includes of other configuration files.  A
variable setting looks like:

    variable = value

where I<value> can contain whitespace (but can't begin with whitespace).  A
filter pattern looks like one of:

    program: /regex/
    /program/: /regex/
    /regex/
    /regex/ ... /regex/

where I<program> is the name of a particular program (the filter line will
only apply to log entries from that program) and I<regex> is a Perl regular
expression matching lines that are "boring" and shouldn't be reported.  Any
trailing whitespace in the syslog line will be removed before matching it
against the regex.  Slashes (C</>) in I<regex> do not have to be escaped.
Each of these lines must be all on one line.

When a line is in Apache error log format, the program for that line will be
set to C<apache-I<level>> where I<level> is the log level for that line
(C<warn>, C<error>, etc.).

If I<program> is surrounded by slashes (C<//>), it is a regex and any program
name that matches that regex will have that filter line applied.  If
I<program> is C<*>, I<regex> will be applied to all lines, regardless of what
program they're from.  If I<program> is not present, as in the last two
forms, the regex is matched against the entire syslog line, including the
timestamp, and the line will be ignored if the regex matches.
This can be used to match logs in a non-standard format, such as ones without
a program name or with a program name containing spaces.

If the line contains two regexes separated by C<...>, this indicates a range
of lines.  All lines between a line matching the first regex and a line
matching the second regex will be ignored, including the matching lines.
Both regular expressions are matched against the entire line, including the
timestamp and program.  There must be no more than 1000 lines in the range;
if more than 1000 lines are encountered after the start regex,
B<filter-syslog> will stop looking for the end regex and then parse all the
lines normally.

Finally, a line like:

    include /path/to/file

includes another configuration file at F</path/to/file>.  The path can be a
directory instead of a file, in which case every file in that directory whose
name does not contain a period is included (in no defined order).

The following variables are recognized:

=over 4

=item alert

The address to which to mail the filtering results.  No mail will be sent if
all of the input lines are filtered out by the regexes provided.  This
variable must be set and may not contain any backslashes or single quotes.

=item sender

The address from which to mail the filtering results (used for the envelope
sender and the From: header).  If not set, no address will be given to
sendmail, which will result in the mail system picking some default value
based on the user B<filter-syslog> is running as.  The value of this variable
may not contain any backslashes or single quotes.

=item subject

The value to use for the Subject: header of the filtering results.  If you
include C<$h> in the value, it will be replaced with the hostname.  This
variable must be set and may not contain any backslashes or single quotes.

=back

If there are any input lines that don't match one of the filter rules, they
will be mailed to the value of I<alert> with a subject given by I<subject>.

=head1 EXAMPLES

Filter /var/log/syslog using /etc/syslog.filter as a configuration file.

    filter-syslog syslog.filter < /var/log/syslog

Here's a sample configuration file that filters out normal Kerberos messages
and sends the result to root@example.com with a Subject: header of
C<example syslog filter results>:

    alert   = root@example.com
    subject = example syslog filter results
    kftgtd:  /^connect from /
    klogind: /^connect from /
    kshd:    /^Executing .* for principal /
    kshd:    /^Shell process completed\.$/
    kshd:    /^connect from /

Instead of the three separate lines to filter out TCP wrappers messages, one
could instead use the line:

    *: /^connect from /

to filter out all syslog lines that begin with C<connect from>, but this runs
a larger risk of filtering out messages that would be of interest.

The rule:

    apache-warn: /^FastCGI: /

would filter out all Apache error log messages about FastCGI that are logged
at the warn level.  The rule:

    apache-debug: /.*/

would filter out everything in an Apache error.log logged at debug level.

The filter pattern:

    /^\w{3} [ :0-9]{11} \S+ \w+\[\d+\]: connect from /

would match any syslog line from any program beginning with C<connect from>.
This regular expression is matched against the entire line; notice that the
timestamp and host identifier have to be matched, as well as the PID in
brackets after the program name.  In this specific case, there is no reason
to use such a rule since B<filter-syslog> can parse that line into a program
name and message, but this sort of rule can be used to match any arbitrary
syslog line that B<filter-syslog> may not otherwise be able to parse.

Finally, the configuration line:

    /START/ ... /END/

would filter out every log line from a line containing C<START> to a line
containing C<END>, inclusive.  (This example isn't particularly useful, but
the regular expressions can of course be more complex.)

=head1 FILES

=over 4

=item F</etc>

=item F</etc/leland>

If the configuration file given on the command line isn't an absolute path,
it is looked for first in F</etc> and then in F</etc/leland>.  This default
can be changed by editing the beginning of this program.

=item F</etc/filter-syslog.conf>

=item F</etc/leland/filter-syslog.conf>

=item F<./filter-syslog.conf>

The default configuration file, if none is given.
The paths will be searched in the above order.

=back

=head1 BUGS

The rule that ignores C<-- MARK --> lines, which are automatically generated
by (at least) Solaris syslogd at periodic intervals if requested, could be
exploited to hide messages from B<filter-syslog> that an administrator may
want to see.  Please again note that this is not a security tool.  However, a
better regex should be developed and used instead, regardless.

There is no protection against inclusion loops (a configuration file that
includes another file which then includes the first file).

=head1 NOTES

As of version 1.20, B<filter-syslog> removes trailing whitespace from syslog
lines before seeing if the lines match the provided regexes.  Earlier
versions did not do this.  You may need to change your regexes when upgrading
from 1.19 to 1.20.

=head1 SEE ALSO

newsyslog(8)

The current version of this program is available from its web page at
L<http://www.eyrie.org/~eagle/software/filter-syslog/>.

=head1 AUTHORS

Russ Allbery <rra@stanford.edu>.  Patch for B<--hostname> from Steve Benson.

=head1 COPYRIGHT AND LICENSE

Copyright 2002, 2003, 2004, 2006, 2007, 2009, 2010, 2011, 2012 The Board of
Trustees of the Leland Stanford Junior University.

This program is free software; you may redistribute it and/or modify it under
the same terms as Perl itself.

=cut