#!/usr/bin/perl -w
$ID = q$Id: afs-balance,v 1.7 2007-10-16 00:01:53 eagle Exp $;
#
# afs-balance -- Balance AFS servers using AMPL.
#
# Written by Neil Crellin <neilc@wallaby.cc>
#        and Russ Allbery <rra@stanford.edu>
# Copyright 1998, 1999, 2005 Board of Trustees, Leland Stanford Jr. University
#
# This program is free software; you may redistribute it and/or modify it
# under the same terms as Perl itself.
#
# Takes a list of server/partition pairs as arguments (with . being a wildcard
# representing all partitions on that server), and generates an output file
# named vols.*.dat in the current directory (where * is a representation of
# the partitions being balanced) that is valid AMPL input for the constraint
# problem of the fewest volume transfers to provide a reasonable balance.
# Also post-processes the output from AMPL and generates the necessary
# commands to implement the balancing solution.
#
# Must be run on lsdb as a user with access to the password information for
# the afsdb database.

##############################################################################
# Site configuration
##############################################################################

# 1 - allowable fudge in the balance solution (how much each partition can
# differ in used space and accesses).
$FUDGE = 0.975;

##############################################################################
# Modules and declarations
##############################################################################

require 5.004;

use strict;
use vars qw($FUDGE $ID);

use Getopt::Long qw(GetOptions);
use Stanford::LSDB::AFSDB;

##############################################################################
# Utility functions
##############################################################################

# Sort by machine name, doing a numeric sort on a trailing digit and a
# lexicographic sort on the machine name.
sub machinesort ( @ ) {
    map { $$_[0] }
        sort { $$a[1] cmp $$b[1] || $$a[2] <=> $$b[2] }
            map { [ $_, split (/(\d+)$/, $_, 2) ] }
                @_
}

# Used for heredocs to make them more readable.
sub unquote ( $ ) {
    my ($string) = @_;
    $string =~ s/^:\s+//mg;
    return $string;
}

# Build a useful filename from the hash of servers and partitions that we're
# going to be looking at.  Takes a reference to the server hash.
sub filename ( $ ) {
    my ($servers) = @_;
    my $name = join ('+', map {
        my $server = $_;
        $server =~ s/^afssvr//;
        $$servers{$_}[0] eq 'vicep.'
            ? $server
            : map {
                  my $part = $_;
                  $part =~ s/^vicep//;
                  $part =~ tr/\[\]//d;
                  "$server$part"
              } @{ $$servers{$_} };
    } machinesort keys %$servers);
    return $name;
}

##############################################################################
# Server disk usage gathering
##############################################################################

# Given the comamnd-line arguments, build a hash of server names that we're
# affecting, where the value of each key is an array of partition
# specifications (which may include regex matches).  Return the hash as a
# list.
sub server_list ( @ ) {
    my (@args) = @_;
    my %servers;
    while (@args) {
        my ($server, $part) = splice (@args, 0, 2);
        $server = 'afssvr' . $server if ($server =~ /^\d+$/);
        if ($part eq '.') {
            $part = 'vicep.';
        } elsif ($part =~ /^[a-z]-[a-z]$/) {
            $part = "vicep[$part]";
        } else {
            $part = '/vicep' . $part unless ($part =~ m%^/vicep%);
            die "$0: invalid partition $part\n"
                if ($part !~ m%^/vicep[a-z]+$%);
            $part =~ s%^/%%;
        }
        push (@{ $servers{$server} }, $part);
    }
    for (keys %servers) {
        @{ $servers{$_} } = sort @{ $servers{$_} }
    }
    return %servers;
}

# Given a hash of server names, the values being anonymous arrays of
# partitions, populate hashes containing the partition capacities and the
# current usage of the given server partitions.  The keys of the latter two
# hashes will be <server>.<partition>.  Return the total capacity of all
# server partitions given.
sub partinfo ( $$$ ) {
    my ($servers, $parts, $capacity) = @_;
    local *PARTINFO;
    my ($header, $totalcapacity);
    for my $server (machinesort keys %$servers) {
        my @partitions = @{ $$servers{$server} };
        open (PARTINFO, "partinfo $server |") or die "$0: can't fork: $!\n";
        while (<PARTINFO>) {
            print, next if /^\s*Partition/ && !$header++;
            my ($partition) = m%\s/(vicep[a-z]+):% or next;
            next unless grep { $partition =~ /^$_$/ } @partitions;
            my $key = "$server.$partition";
            my $total = (split)[2];
            push (@$parts, $key);
            push (@$capacity, $total);
            $totalcapacity += $total;
            print;
        }
        close PARTINFO;
    }
    return $totalcapacity;
}

# Given a volume, return its size and the location of its RW copy as a list,
# or the empty list on error.
sub volinfo {
    my $volume = shift;
    local (*VEX, $_);
    my ($size, $server, $partition);
    open (VEX, "vos examine $volume |") or die "$0: can't fork vos: $!\n";
    while (<VEX>) {
        if (/^\Q$volume\E\s+\d+\s+(?:RW|RO)\s+(\d+)/) {
            $size = $1;
            ($server, $partition) = split (' ', scalar <VEX>);
            $server =~ s/\.stanford\.edu//i or next;
            last;
        }
    }
    1 while <VEX>;
    close VEX;
    return defined $size ? ($size, "$server $partition") : ();
}

# Given a list of server.partition names and a corresponding list of
# capacities, modify the lists so that the first will instead be a list of
# servers and the second will be a list of capacities of those servers.
sub compress_parts ( $$ ) {
    my ($parts, $capacities) = @_;
    my %new;
    for (my $i = 0; $i < @$parts; $i++) {
        my ($server) = split (/\./, $$parts[$i]);
        $new{$server} += $$capacities[$i];
    }
    my @newparts = sort keys %new;
    my @newcaps;
    for (@newparts) {
        push (@newcaps, $new{$_});
    }
    @$parts = @newparts;
    @$capacities = @newcaps;
}

##############################################################################
# Database queries
##############################################################################

# Submit a SQL query.
sub query_database ( $$$$;$ ) {
    my ($dbh, $what, $where, $condition, $order) = @_;
    my $request = "select $what from $where where $condition";
    $request .= " order by $order" if $order;
    return $dbh->selectall_arrayref ($request);
}

# Construct the constraint on our database queries to return only those
# volumes that we're balancing.  We first check to see if there are any
# read/write volumes on the partitions that we're looking at.  If there are,
# we balance read/write volumes; otherwise, we'll balance RO volumes.
#
# Takes the database handle and list of server.partition strings.  We return
# the SQL where clause that will select only the right partitions.
sub query_constraint ( $@ ) {
    my ($dbh, @parts) = @_;
    my $request = join (' or ', map {
        my ($server, $partition) = /^([^.]+)\.(.*)/;
        if ($partition eq 'vicep.') {
            "server = '$server'";
        } elsif ($partition =~ /^vicep\[([a-z-]+)\]$/) {
            my ($start, $end) = split ('-', $1);
            my @parts;
            for ($start .. $end) {
                push (@parts, "'/vicep$_'");
            }
            "(server = '$server' and part in (" . join (', ', @parts) . "))";
        } else {
            "(server = '$server' and part = '/$partition')";
        }
    } @parts);
    $request = "($request)";
    my $result = query_database ($dbh, 'count(*)', 'volumes',
                                 "$request and type = 'RW'");
    my $voltype = ($$result[0][0] > 0) ? 'RW' : 'RO';
    $request .= " and type = '$voltype'";
    return $request;
}

##############################################################################
# AMPL file generation
##############################################################################

# Given a total, a list of factors, a list of locations, and a fudge value,
# output a constraint listing each partition and a weighed ideal formed by
# multiplying the total, factor, and fudge.  If fudge is not provided, it
# defaults to $FUDGE.
sub constraint ( $\@\@;$ ) {
    my ($total, $factors, $parts, $fudge) = @_;
    $fudge ||= $FUDGE;
    my $output;
    for (my $i = 0; $i < @$parts; $i++) {
        $output .= "$$parts[$i]    ";
        $output .= int ($fudge * $total * $$factors[$i]) . "\n";
    }
    $output .= ";\n\n";
    return $output;
}

# Given a reference to an array of results, iterate through the array and
# print them all out given a printf format string to use.
sub printf_result ( $$$ ) {
    my ($fh, $format, $result) = @_;
    for my $line (@$result) {
        printf $fh $format, @$line;
    }
}

# Print out the data for equal volumes in each location.  We don't allow for
# any fudge in this at all.  Takes the database handle, the query, and the
# locations and factors arrays.
sub ampl_constraint_count ( $$\@\@ ) {
    my ($dbh, $request, $factors, $parts) = @_;

    # Calculate the ideal number of volumes per partition and output that to
    # the AMPL data file for use as a constraint.
    print VOLSDAT 'param: PARTS: eqnovols := '
        . '# defines set "PARTS" and param "eqnovols"' . "\n";
    my $result = query_database ($dbh, 'count(*)', 'volumes', $request);
    my $count = $$result[0][0];
    print VOLSDAT constraint ($count, @$factors, @$parts, 1.0);

    # Output the current number of volumes per partition.
    print VOLSDAT "# current volumes per partition\n";
    $result = query_database ($dbh, 'server, part, count(*)', 'volumes',
                              $request . " group by server, part");
    printf_result (\*VOLSDAT, "# %-12s %-8s: %5d\n", $result);
    print VOLSDAT "\n\n";
}

# Print out the data for roughly even amounts of disk usage in each location.
# Takes the database handle, the query, and the locations and factors arrays.
sub ampl_constraint_size ( $$\@\@ ) {
    my ($dbh, $request, $factors, $parts) = @_;

    # Calculate roughly even amounts of disk usage per partition and output
    # that to the AMPL data file for use as a constraint.
    print VOLSDAT "param capacity := # capacity constraint values\n";
    my $result = query_database ($dbh, 'sum(used)', 'volumes', $request);
    print VOLSDAT constraint ($$result[0][0], @$factors, @$parts);

    # Output the current disk usage per partition.
    print VOLSDAT "# current disk usage per partition\n";
    $result = query_database ($dbh, 'server, part, sum(used)', 'volumes',
                              $request . " group by server, part");
    printf_result (\*VOLSDAT, "# %-12s %-8s: %10d\n", $result);
    print VOLSDAT "\n\n";
}

# Print out the data for roughly even amounts of quota in each location.  We
# don't use this data currently; it over-constrains the problem too much and
# makes the solution take too long.  Takes the database handle, the query, and
# the locations and factors arrays.
sub ampl_constraint_quota ( $$\@\@ ) {
    my ($dbh, $request, $factors, $parts) = @_;

    # Calculate roughly even amounts of allocated quota per partition for use
    # as a constraint.  This isn't used by default.
    print VOLSDAT "param quotasum := # total quota constraint values\n";
    my $result = query_database ($dbh, 'sum(quota)', 'volumes', $request);
    print VOLSDAT constraint ($$result[0][0], @$factors, @$parts);

    # Output the current allocated quota per partition.
    print VOLSDAT "# current allocated quota per partition\n";
    $result = query_database ($dbh, 'server, part, sum(quota)', 'volumes',
                              $request . " group by server, part");
    printf_result (\*VOLSDAT, "# %-12s %-8s: %10d\n", $result);
    print VOLSDAT "\n\n";
}

# Print out data for roughly even numbers of accesses in each location.  Takes
# the database handle, the query, and the locations and factors arrays.
sub ampl_constraint_accesses ( $$\@\@ ) {
    my ($dbh, $request, $factors, $parts) = @_;

    # Calculate roughly even numbers of accesses per partition for use as a
    # constraint.  This assumes that large partitions can handle more
    # accesses, which isn't too bad but isn't quite what you want.
    print VOLSDAT "param eqaccess := # roughly equal accesses per partition\n";
    my $result = query_database ($dbh, 'sum(accesses)', 'volumes', $request);
    print VOLSDAT constraint ($$result[0][0], @$factors, @$parts);

    # Output the current number of accesses per partition.
    print VOLSDAT "# current accesses per partition\n";
    $result = query_database ($dbh, 'server, part, sum(accesses)', 'volumes',
                              $request . " group by server, part");
    printf_result (\*VOLSDAT, "# %-12s %-8s: %10d\n", $result);
    print VOLSDAT "\n\n";
}

# Print out the current sizes, quotas, and accesses of volumes.  Takes the
# database handle and the query.
sub ampl_volume_data ( $$ ) {
    my ($dbh, $request) = @_;

    # Output the names of all the affected volumes and their sizes.
    print VOLSDAT 'param: VOLS:  size := # defines "VOLS" and "size"', "\n";
    my $result = query_database ($dbh, 'volname, used', 'volumes', $request,
                                 'used desc');
    printf_result (\*VOLSDAT, "%-22s %12d\n", $result);
    print VOLSDAT ";\n\n";

    # Output the names of all the affected volumes and their quotas.  This
    # isn't used by default.
    print VOLSDAT "param quota := # quota of each volume\n";
    $result = query_database ($dbh, 'volname, quota', 'volumes', $request,
                              'used desc');
    printf_result (\*VOLSDAT, "%-22s %12d\n", $result);
    print VOLSDAT ";\n\n";

    # Output the names of all affected volumes and the number of accesses they
    # received the previous day.
    print VOLSDAT "param accesses := # how many accesses per volume\n";
    $result = query_database ($dbh, 'volname, accesses', 'volumes', $request,
                              'used desc');
    printf_result (\*VOLSDAT, "%-22s %12d\n", $result);
    print VOLSDAT ";\n\n";
}

# Print out the current locations of volumes, including server and partition.
# Takes the database handle, the SQL filter, and the array of locations.
sub ampl_locations ( $$\@ ) {
    my ($dbh, $request, $parts) = @_;

    # Build a table of location strings.  A location string is a
    # space-separated set of 0s and 1s for each server/partition pair we're
    # dealing with, with a 1 in the column corresponding to that location and
    # 0s in all other columns.  In other words, we're generating a zero-filled
    # matrix with a diagonal of 1s.
    my $vector = join (' ', 1, (0) x $#$parts);
    my %location;
    for (@$parts) {
        $location{$_} = $vector;
    } continue {
        $vector =~ s/1 0/0 1/;
    }

    # Output a table showing which partition each volume is located on.
    print VOLSDAT "param located (tr) : # where is volume now\n",
        join (' ', @$parts), " :=\n";
    my $result = query_database ($dbh, 'volname, server, part', 'volumes',
                                 $request, 'used desc');
    my ($volume, $server, $part);
    for my $line (@$result) {
        my ($volume, $server, $part) = @$line;
        for ($volume, $server, $part) {
            s/ //g;
        }
        $part =~ s%/%.%;
        if ($$parts[0] =~ /\./) {
            printf VOLSDAT "%-22s %s\n", $volume, $location{$server . $part};
        } else {
            printf VOLSDAT "%-22s %s\n", $volume, $location{$server};
        }
    }
    print VOLSDAT ";\n\n";
}

# Generate an AMPL data file, listing accesses, size, and volume count.  Takes
# the file name, a flag saying whether to balance only by server, the
# locations and capacity arrays, and the total capacity.
sub ampl_write_data ( $$\@\@$ ) {
    my ($name, $server, $parts, $capacity, $total) = @_;
    open (VOLSDAT, "> vols.$name.dat")
        or die "$0: can't create vols.$name.dat: $!\n";

    # Open a database connection and get the SQL constraint that selects just
    # the volumes we're dealing with.
    my $dbh = Stanford::LSDB::AFSDB->connect;
    my $request = query_constraint ($dbh, @$parts);

    # Modify the locations hash to only go by server if $server is set.
    if ($server) {
        compress_parts ($parts, $capacity);
    }

    # Determine the fraction of capacity represented by each location.
    my @factors;
    for (my $i = 0; $i < @$capacity; $i++) {
        $factors[$i] = $$capacity[$i] / $total;
    }

    # Print out the constraints on the solution.
    ampl_constraint_count ($dbh, $request, @factors, @$parts);
    ampl_constraint_size ($dbh, $request, @factors, @$parts);
    ampl_constraint_quota ($dbh, $request, @factors, @$parts);
    ampl_constraint_accesses ($dbh, $request, @factors, @$parts);

    # Print out the current volume data and locations.
    ampl_volume_data ($dbh, $request);
    ampl_locations ($dbh, $request, @$parts);

    # Done.
    close VOLSDAT;
    $dbh->disconnect;
}

# Generate the static portion of the AMPL input script.  Takes the name to use
# in the file name and the name of the model file to use.
sub ampl_write_control ( $$ ) {
    my ($name, $control) = @_;
    open (AMPLIN, "> vols.$name.in")
        or die "$0: can't create vols.$name.in: $!\n";
    print AMPLIN unquote (<<"EOA");
:       model /afs/ir/service/afs/data/$control.mod;
:       data vols.$name.dat;
:       options solver cplexamp;
:       options show_stats 1;
:       options cplex_options 'prestats=1 mipdisplay=1 mipsolutions=2';
:       display {i in PARTS} sum {j in VOLS} located[i,j];
:       display {i in PARTS} sum {j in VOLS} located[i,j]*accesses[j];
:       display {i in PARTS} sum {j in VOLS} located[i,j]*size[j];
:       display {i in PARTS} sum {j in VOLS} located[i,j]*quota[j];
:       solve;
:       display {i in PARTS} sum {j in VOLS} assigned[i,j];
:       display {i in PARTS} sum {j in VOLS} assigned[i,j]*accesses[j];
:       display {i in PARTS} sum {j in VOLS} assigned[i,j]*size[j];
:       display {i in PARTS} sum {j in VOLS} assigned[i,j]*quota[j];
:       display {i in PARTS, j in VOLS} assigned[i,j]*(1-located[i,j]);
:       quit;
EOA
    close AMPLIN;
}

##############################################################################
# Solution parsing
##############################################################################

# Expecting either a solution file in $ARGV[0] or the solution on standard
# input, parse the AMPL solution to the balancing problem and print to
# standard output a list of volumes that should be moved, their current
# location as a whitespace-separated server and partition pair, and the
# destination to which they should be moved as a whitespace-separated server
# and partition pair.  (This file can be processed with mvto -s -L.)
sub ampl_parse_solution () {
    local $_;
    while (<>) {
        last if $_ eq "assigned[i,j]*(1 - located[i,j]) [*,*] (tr)\n";
    }

    # If the next lines begin with #, they're mappings from codes to
    # server/partition pairs because all the pairs didn't fit across the
    # screen.  The pairs look like:
    #
    #       # $3 = afssvr1.vicepc
    #
    # Build a mapping table with them.
    my %mapping;
    while (<>) {
        last unless /^\#/;
        my ($variable, $value) = (split)[1,3];
        $mapping{"'" . $variable . "'"} = $value;
    }

    # The next line will be the column headers that tell us what columns
    # represent what servers and partitions.  The first column will be a ':';
    # the rest will either be server.partition or a variable that's in the
    # mapping hash.
    my @columns = split;
    die "$0: can't parse solution at line $.\n" unless $columns[0] eq ':';
    shift @columns;
    if ($columns[$#columns] eq ':=') {
        pop @columns;
    } else {
        die "$0: can't parse solution at line $.\n"
            unless (<> =~ /^\s*:=\s*$/);
    }
    for (@columns) {
        $_ = $mapping{$_} if defined $mapping{$_};
        s%\.% /%;
    }

    # Now we have a whole set of volume names followed by columns of 0s and at
    # most one 1.  If there are no 1s, the volume shouldn't be moved.
    # Otherwise, it should be moved from wherever it is now to the server and
    # partition indicated by the 1.
    while (<>) {
        last if /^;/;
        my ($volume, $movement) = split (' ', $_, 2);
        die "$0: can't parse solution at line $.\n" if $movement =~ /[^10\s]/;
        next if $movement !~ /^((?:0\s+)*)1/;
        my $prefix = $1;
        my $index = ($prefix =~ tr/0//);
        my $destination = $columns[$index];
        if ($destination !~ / /) {
            $destination .= ' .';
        }

        # Now, get the current location of the volume so that we can write
        # output suitable for processing with mvto -s.  This currently doesn't
        # work for read-only volumes.
        my ($size, $location) = volinfo $volume;
        warn "$0: unable to examine volume $volume, skipping\n" if !$location;
        print "$volume $location $destination\n";
    }
}

##############################################################################
# Main routine
##############################################################################

my $fullpath = $0;
$0 =~ s%^.*/%%;

# Parse command-line options.
my ($help, $partition, $results, $server, $version);
Getopt::Long::config ('bundling');
GetOptions ('f|fudge=f'   => \$FUDGE,
            'h|help'      => \$help,
            'p|partition' => \$partition,
            'r|results'   => \$results,
            's|server'    => \$server,
            'v|version'   => \$version) or exit 1;
if ($help) {
    print "Feeding myself to perldoc, please wait....\n";
    exec ('perldoc', '-t', $fullpath);
} elsif ($version) {
    my $version = join (' ', (split (' ', $ID))[1..3]);
    $version =~ s/,v\b//;
    $version =~ s/(\S+)$/($1)/;
    $version =~ tr%/%-%;
    print $version, "\n";
    exit;
}
die "Usage: $0 <server> <partition> [<server> <partition> ...]\n"
    unless (@ARGV % 2 == 0 || $results);
die "$0: only one of -s or -p may be given\n" if ($server && $partition);
die "$0: -r may not be combined with -p or -s\n"
    if ($results && ($server || $partition));
die "Usage: $0 -r [<solution-file>]\n" if ($results && @ARGV > 1);

# If called with -r, parse the AMPL results and generate the commands required
# to implement the balance.  Otherwise, read the servers and partitions from
# the command line, get the usage, and then write the AMPL input and data
# file.
if ($results) {
    ampl_parse_solution;
} else {
    my %servers = server_list (@ARGV);
    my (@parts, @capacity, @factors);
    my $totalcapacity = partinfo (\%servers, \@parts, \@capacity);
    my $name = filename (\%servers);
    ampl_write_data ($name, $server, @parts, @capacity, $totalcapacity);
    if ($partition) {
        ampl_write_control ($name, 'balance-size');
    } else {
        ampl_write_control ($name, 'balance');
    }
}
exit 0;
__END__

##############################################################################
# Documentation
##############################################################################

=head1 NAME

afs-balance - Balance AFS servers using AMPL

=head1 SYNOPSIS

afs-balance [B<-hpsv>] [B<-f> I<fudge>] I<server> I<partition> [I<server>
I<partition> ...]

afs-balance B<-r> [I<solution>]

=head1 REQUIREMENTS

The Stanford::LSDB::AFSDB Perl module (or some editing of the script to
substitute in some other way of connecting to the database), AMPL with the
CPLEX solver to solve the linear programming problem, B<partinfo> to obtain
partition capacity information, and B<mvto> to implement the solution.

=head1 DESCRIPTION

Drawing data about AFS volumes from the AFS Oracle database,
B<afs-balance> phrases an AFS server balancing problem as an AMPL problem
so that it can be solved with linear programming tools.

B<afs-balance> takes a list of server and partition pairs on the command
line and, for each volume on those servers and partitions, gathers the size,
quota, and accesses in the past day from the information loaded the previous
night into the AFS database.  It also determines the total capacity
available across all of those server partitions and then defines target
goals for balancing, specifying that each partition must contain volumes
that add up to at least 97.5% of its fair share of the total accesses and
total size.  (Quota is not used for balancing.)  This 97.5% fudge factor can
be adjusted with the B<-f> command-line argument.

AFS servers may be specified as a number, in which case C<afssvr> will be
implicitly added to the beginning of the number to form the system name.
The partition may be a single partition, a partition range expressed as
something like C<a-k>, or C<.> which indicates all partitions on that
server.

All of this data is written out into a file whose name starts with C<vols.>
and ends with C<.dat> and contains an abbreviated list of the servers and
partitions affected in the name.  An AMPL control file, referencing one of
the AMPL models used to specify the constraints, is then written out with
the same file name but ending in C<.in>.  AMPL should then be run and given
as input the C<.in> file, saving its output to a C<.out> file.

Finally, once AMPL generates a C<.out> file, B<afs-balance> should be run
on the C<.out> file with the B<-r> flag, saving its output in a separate
file.  This mode will parse the AMPL output and generate a list of volumes
that have to be moved, where they should be moved from, and where they
should be moved to.  That output can then be used as input to C<mvto -s -L>.

See the L<EXAMPLES> section for a detailed example.

By default, B<afs-balance> balances servers so that each partition has a
number of volumes proportionate to its share of the total space, satisfies
the size balance, and satisfies the accesses balance.  This can take a very
long time when attempting to balance across many partitions.  In this
situation, you may want to instead do the balance in two parts using B<-s>
and then B<-p> as described below.

If B<afs-balance> is run with the B<-s> option, rather than balancing at
the level of partitions, it will balance at the level of servers.
Restrictions on which partitions on each server are involved in the balance
will still be honored, but the AMPL solution will only indicate what server
the volume should be moved to rather than server and partition, and will
only attempt to equalize accesses, volumes, and space usage between the
servers.  The resulting output file, after processing with B<-r>, will tell
B<mvto> to put the volume on the least full partition on the destination
srver.

If B<afs-balance> is run with the B<-p> option, the balancing will be done
at the partition level but accesses will be ignored.  Balancing accesses
across partitions on a single server isn't particularly useful, and this
will allow the balance solution to be found far faster.

The combination of these two options can balance much larger cells.  First,
balance between some set of servers using B<-s> and implement that solution.
Then, wait for the AFS database to update its data and balance between
partitions using B<-p> on each of the individual servers involved.  It will
take longer and will mean more total volume moves, but will arrive at the
same quality of solution as doing a full balance.

=head1 OPTIONS

=over 4

=item B<-f> I<fudge>, B<--fudge>=I<fudge>

The ratio by which to reduce the size and access targets for each location
in the balance.  The default is 0.975.  Note that accesses can vary widely
and often some AFS volumes have significantly more accesses than others, so
setting this value too high can cause AMPL to fail to find a solution.

=item B<-h>, B<--help>

Print out this documentation (which is done simply by feeding the script to
C<perldoc -t>).

=item B<-p>, B<--partition>

Balance without regard to accesses, which is the mode that one should use
when balancing across the partitions of a single server.  The data file is
the same, but a different model will be used that doesn't include a
constraint on accesses.

=item B<-r>, B<--results>

Rather than generate the AMPL input files, process an AMPL output file and
produce a list of volumes, source locations, and destination locations
reflecting the volume moves that AMPL considered necessary.  This output
will go to standard output (although generally you want to redirect it to a
file) and will contain one line for each volume, formatted as:

    <volume> <server> <partition> <server> <partition>

where the first server/partition pair is where the volume is now, and the
second pair is where it should be moved to.

This output is suitable as input to C<mvto -s -L>.

=item B<-s>, B<--server>

Balance by server, ignoring partitions.  When these AMPL results are
processed by B<afs-balance> with the B<-r> flag, the resulting destination
information will specify the partition as C<.>.  (This means that if not all
partitions of some servers are valid destinations for this balance, you will
have to modify the output file before running B<mvto>.)

=item B<-v>, B<--version>

Print the version of B<afs-balance> and exit.

=back

=head1 EXAMPLES

To do a full balance of all partitions on afssvr2, partitions /vicepa
through /vicepc on afssvr5, and partition /vicepa on afssvr6, one would run
the command:

    afs-balance 2 . 5 a-c 6 a

B<afs-balance> will write out two files:

    vols.2+5a-c+6a.dat
    vols.2+5a-c+6a.in

To calculate the balancing solution, run:

    ampl < vols.2+5a-c+6a.in > vols.2+5a-c+6a.out

Once B<ampl> finishes, review the C<.out> file to make sure that AMPL found
a valid solution.  It starts by printing out data about the initial server
state and then prints out status information about whether it found a
solution.

If it did find a valid solution, run:

    afs-balance -r vols.2+5a-c+6a.out > vols.2+5a-c+6a.list

Review that list to make sure it looks reasonable and then apply the
balancing solution with:

    mvto -s -L vols.2+5a-c+6a.list

If, instead, you wished to balance all partitions between afssvr1, afssvr2,
afssvr3, and partitions /vicepa through /vicepk on afssvr4, this may be too
large of a problem for AMPL to solve in a reasonable amount of time.
Instead, you can use the two-phase approach.  Start by balancing between
just the servers:

    afs-balance -s 1 . 2 . 3 . 4 a-k
    ampl < vols.1+2+3+4a-k.in > vols.1+2+3+4a-k.out

Make sure that AMPL got a solution and then run:

    afs-balance -r vols.1+2+3+4a-k.out > vols.1+2+3+4a-k.list

Normally, you could just apply this list with B<mvto>, but note that only
partitions /vicepa through /vicepk are participating in the balance on
afssvr4 (the other partitions might be used for something else, like
read-only replicas).  B<afs-balance> is not currently smart enough to
figure that out, so you need to modify the output list to change any
occurance of C<afssvr4 .> to C<afssvr4 a-k>.  Once you've done that, run:

    mvto -s -L vols.1+2+3+4a-k.list

to apply the server balance.

Now, wait for a day for the nightly refresh of the AFS database to pick up
the new volume locations, and then run individual server balances for each
of the affected servers:

    afs-balance -p 1 .
    ampl < vols.1.in > vols.1.out
    afs-balance -r vols.1.out > vols.1.list
    mvto -s -L vols.1.out

(checking the AMPL output first before applying the results), repeating this
for each of the affected servers.

=head1 FILES

=over 4

=item F</afs/ir/service/afs/data/balance.mod>

=item F</afs/ir/service/afs/data/balance-size.mod>

The AMPL models used for balancing, which express the variables and
constraints involved in the AMPL program.  F<balance-size.mod> is identical
to F<balance.mod> but doesn't include the constraint on accesses (it's used
when B<afs-balance> is invoked with the B<-p> option).

=back

=head1 NOTES

Current, B<afs-balance> always requests the CPLEX solver in its AMPL input
file.  This is the solver that we use, and it works well for solving this
type of balancing problem.  We have not experimented with using any other
solver, but it's easy enough to change the script to specify a different
one.

Sometimes, AMPL will be unable to come up with any solution and will bail
out saying that the problem isn't feasible, meaning that there is no
possible arrangement of the volumes that satisfies the constraints.  Usually
this is due to one volume with an extremely high access count in the past
day, such that there is no combination of other volumes that can balance
that one out and give partitions roughly equal total access counts.  This
can also be caused by a single particularly large volume.

When this happens, there are a few things that you can do:

=over 4

=item 1.

Wait for the next day and try the balance again.  If the access spike is
anomalous, this often works.

=item 2.

Reduce the fudge parameter using the B<-f> option.  This tells AMPL to
tolerate more deviation in the sizes and accesses of the locations across
which it is balancing.  AMPL will still equalize the number of volumes, but
the solution won't be as nice in equal size and accesses.  Note that this
parameter affects both size and access count; there isn't a way to affect
just one and not the other.

=item 3.

Manually edit the resulting C<.dat> file and reduce the threshold
requirements for either size or accesses, whichever is causing problems (you
can usually tell by scanning the volume information and looking for
particular volumes with anomalously large values).

=item 4.

Lie to AMPL by editing the resulting C<.dat> file and reducing the access
count or size of the offending volume.  AMPL will then construct a solution
based on the value that you told it.  This is safe to do for accesses; be
careful with doing this for size and don't overfill a partition.  If you do
this, you'll also need to reduce all the threshold values for that statistic
accordingly, since otherwise since you've reduced the total access count (or
total size) there won't be enough to spread around to meet the thresholds
required for each partition.

=back

Unfortunately, none of the ways of working around AMPL's failure to find a
solution are automated.  They all require a bit of manual fiddling and
manual investigation.

=head1 BUGS

B<afs-balance> currently cannot handle balancing read-only replicas
properly.  It can prepare the AMPL problem, although the way it determines
what type of volume to balance is by checking to see if there are any
read/write volumes on the affected partitions and balancing only read/write
volumes if there are, which may not be what is desired.  It cannot, however,
produce a list for B<mvto> from the result, and the list that it does
produce will cause B<mvto> to do the wrong thing.

Using data from a SQL database of AFS volume information is required.  There
is no way of doing a balance from vos listvol output, even though enough
information would be available to do so.  (You can always just load the vos
listvol output into a database, but then you have to modify this script
since the table names are hard-coded.)

As mentioned above, B<-r> doesn't correctly handle the case where only some
of a server's partitions are participating in a balance run with B<-s>.  In
particular, it will output instructions that will cause B<mvto> to pick the
least loaded partition across the entire server, not limited to just the
participating partitions.  The output has to be modified before using
B<mvto>.

The B<partinfo> requirement isn't strictly necessary and is only nice for
the pretty output when preparing the balance.  B<afs-balance> should cope
if B<partinfo> isn't available.

=head1 AUTHORS

Written by Neil Crellin <neilc@wallaby.cc> and Russ Allbery
<rra@stanford.edu>, based on an idea and an AMPL model by Neil Crellin.

=head1 COPYRIGHT AND LICENSE

Copyright 1998, 1999, 2005 Board of Trustees, Leland Stanford
Jr. University.

This program is free software; you may redistribute it and/or modify it
under the same terms as Perl itself.

=head1 SEE ALSO

mvto(1), partinfo(1)

The AMPL and CPLEX implementation that we've always used for balancing is
the one from L<http://www.ilog.com/>.  This is commercial software that has
to be purchased.  It may be possible to use this program with a free version
of AMPL and a free solver, but we have not investigated doing so.

B<mvto> is available from L<http://www.eyrie.org/~eagle/software/mvto/>.
B<partinfo> is available from
L<http://www.eyrie.org/~eagle/software/partinfo/>.

The current version of this program is available its web page at
L<http://www.eyrie.org/~eagle/software/afs-balance/>.

=cut