#!/usr/bin/perl
$version = '$Id: modcount,v 0.2 1996/11/14 05:28:07 eagle Exp $';
#
# modcount -- Do statistical analysis of the breakdown of moderated groups.
#             Copyright 1996 Russ Allbery <rra@cs.stanford.edu>
#
# This program is free software; you can redistribute it and/or modify it
# under the same terms as Perl itself.
#
# Syntax:
#    modcount [-8 | -h <hierachies>] <active>
#
#    <hierarchies> is a comma-separated list of top-level hierarchies to
#    check (no spaces).  <active> is the file name of an active file.  -8
#    specifies that the Big Eight should be checked.
#
# Edit the database at the end of this script to change how groups are
# classified.

require 5.001;

use strict;
use vars qw($version @out);

# Parse the command line and determine the extent of our check.
sub parse_opts {
    my @hierarchies;
    if ($ARGV[0] eq '-8') {
	@hierarchies = ('comp', 'humanities', 'misc', 'news', 'rec',
			'sci', 'soc', 'talk');
	shift @ARGV;
    } elsif ($ARGV[0] eq '-h') {
	shift @ARGV;
	@hierarchies = split (/,/, shift @ARGV)
	    or die "$0: -h requires an argument\n";
    }
    my $active = shift @ARGV or die "$0: no active file specified\n";
    if (@hierarchies) {
	my $regex = '^(' . join ('|', @hierarchies) . ')\.';
	return ($active, $regex);
    } else {
	return $active;
    }
}

# Parse the database at the end of the script.  rule <pattern> specifies
# that all groups matching that pattern are non-discussion groups.  except
# <group> d means that that particular group is a discussion group.  except
# <group> a (or anything other than d) says that that group is a
# non-discussion group.  Anything not matched by an exception or a rule is
# considered a discussion group.
sub parse_data {
    my ($rules, $exceptions) = @_;
    while (<DATA>) {
	my @rule = split;
	if ($rule[0] eq 'rule') {
	    push (@$rules, $rule[1]);
	} elsif ($rule[0] eq 'except') {
	    $exceptions->{$rule[1]} = ($rule[2] =~ /^d/) ? 0 : 1;
	}
    }
}

# Main routine, which opens and parses the active file and builds a list of
# groups placed into each category, as well as creating statistics for each
# top-level hierarchy.
{
    my ($active, $regex) = &parse_opts;
    my (@rules, %exceptions);
    &parse_data (\@rules, \%exceptions);

    my (%stats, @discuss, @nondiscuss);
    open (ACTIVE, $active) or die "cannot open active file $active: $!\n";
    
  GROUP:
    while (<ACTIVE>) {
	my @fields = split;
	if ($regex) { next unless $fields[0] =~ /$regex/o }
	my @components = split (/\./, $fields[0]);
	$stats{$components[0]}->[($fields[3] eq 'm') ? 1 : 0]++;
	next unless ($fields[3] eq 'm');
	if (defined $exceptions{$fields[0]}) {
	    if ($exceptions{$fields[0]}) {
		$stats{$components[0]}->[2]++;
		push (@nondiscuss, $fields[0]);
	    } else {
		$stats{$components[0]}->[3]++;
		push (@discuss, $fields[0]);
	    }
	} else {
	    my $rule;
	    foreach $rule (@rules) {
		if ($fields[0] =~ /\Q$rule/) {
		    $stats{$components[0]}->[2]++;
		    push (@nondiscuss, $fields[0]);
		    next GROUP;
		}
	    }
	    $stats{$components[0]}->[3]++;
	    push (@discuss, $fields[0]);
	}
    }

    my (@totals, $hierarchy);
    foreach $hierarchy (sort keys %stats) {
	my $total = $stats{$hierarchy}->[0] + $stats{$hierarchy}->[1];
	@out = ($hierarchy,
		   $stats{$hierarchy}->[0],
		   sprintf ("%.1f", $stats{$hierarchy}->[0] / $total * 100),
		   $stats{$hierarchy}->[2],
		   sprintf ("%.1f", $stats{$hierarchy}->[2] / $total * 100),
		   $stats{$hierarchy}->[3],
		   sprintf ("%.1f", $stats{$hierarchy}->[3] / $total * 100));
	write;
	$-++;
	$totals[0] += $total;
	$totals[1] += $out[1];
	$totals[2] += $out[3];
	$totals[3] += $out[5];
    }
    print "\n";
    @out = ('TOTALS',
	    $totals[1],
	    sprintf ("%.1f", $totals[1] / $totals[0] * 100),
	    $totals[2],
	    sprintf ("%.1f", $totals[2] / $totals[0] * 100),
	    $totals[3],
	    sprintf ("%.1f", $totals[3] / $totals[0] * 100));
    write;
    print '-' x 67 . "\n\nNon-discussion groups:\n\n";
    print join ("\n", sort @nondiscuss);
    print "\n\nDiscussion groups:\n\n";
    print join ("\n", sort @discuss);
}    

# Output formats to get a nice table listing.
format STDOUT_TOP =
Hierarchy          unmod            non-discuss         discuss
-------------------------------------------------------------------
.
format STDOUT =
@<<<<<<<<<<<<   @>>> - @>>>>%      @>>> - @>>>>%      @>>> - @>>>>%
$out[0], $out[1], $out[2], $out[3], $out[4], $out[5], $out[6]
.
__END__
rule .announce
rule .answers
rule .info
rule .binaries.
rule .sources.
rule .reviews
rule .creative
rule .stories
rule clari.
except comp.ai.jair.papers                n
except comp.ai.nlang-know-rep             n
except comp.ai.vision                     n
except comp.archives                      n
except comp.bugs.4bsd.ucb-fixes           n
except comp.doc.techreports               n
except comp.home.misc                     n
except comp.infosystems.kiosks            d
except comp.infosystems.www.authoring.cgi d
except comp.internet.library              n
except comp.internet.net-happenings       n
except comp.lang.sigplan                  n
except comp.laser-printers                n
except comp.mail.maps                     n
except comp.networks.noctools.tools       n
except comp.newprod                       n
except comp.org.eff.news                  n
except comp.org.fidonet                   n
except comp.patents                       n
except comp.protocols.iso.x400.gateway    n
except comp.research.japan                n
except comp.risks                         n
except comp.society                       n
except comp.society.cu-digest             n
except comp.std.lisp                      n
except comp.std.unix                      n
except comp.sys.concurrent                n
except comp.sys.ibm.pc.digest             n
except comp.sys.m68k.pc                   n
except comp.sys.mac.digest                n
except comp.theory.info-retrieval         d
except misc.activism.progressive          n
except misc.education.multimedia          n
except misc.health.injuries.rsi.moderated n
except misc.news.bosnia                   n
except misc.news.east-europe.rferl        n
except misc.news.southasia                n
except misc.test.moderated                n
except news.lists                         n
except rec.arts.erotica                   n
except rec.aviation.questions             n
except rec.boats.marketplace              n
except rec.food.recipes                   n
except rec.games.frp.archives             n
except rec.humor.funny                    n
except rec.humor.oracle                   n
except rec.music.promotional              n
except rec.sport.baseball.analysis        n
except sci.astro.hubble                   n
except sci.finance.abstracts              n
except sci.psychology.journals.psyche     n
except sci.psychology.journals.psycoloquy n
except sci.space.news                     n
except soc.politics                       n
except soc.politics.arms-d                n