#!/bin/sh
# Wipe dead job: Stop job's pbs_mom daemons, remove job files, restart pbs_mom.
# Usage: wipejob job-id
# Author: Ole Holm Nielsen, Ole.H.Nielsen@fysik.dtu.dk
#
# Steps performed:
#   1. Query the job with qstat and require that it is in state "R".
#   2. On every (pingable) node listed in exec_host: stop pbs_mom, remove the
#      job's files from the mom_priv/jobs spool directory, start pbs_mom.
#   3. Purge the job with "qdel -p".
#   4. Mail the saved qstat output to the superuser.
#
# Exit status: 0 when the wipe sequence has run, 1 on usage or inquiry errors,
# 2 when interrupted by a signal.

# Remote shell and reachability probe.  These hold a command plus its options,
# so they are expanded unquoted on purpose below.
SSH="ssh -n -x"
PING="/bin/ping -c 1 -w 3"

# Torque commands used
QSTAT=/usr/local/bin/qstat
QDEL=/usr/local/bin/qdel

# Mail program
MAIL=/bin/mail

# Name of this cluster
CLUSTERNAME=NIFLHEIM

# Mail to the superuser
SUPERUSERMAIL=root@audhumbla.fysik.dtu.dk

# Check command arguments
if test $# -ne 1
then
  echo "Usage: $0 job-id" >&2
  exit 1
fi

JOB=$1
action_done='wiped'

# The job-id is interpolated into an "rm -rf" that runs through a remote
# shell, so refuse empty values and anything outside a conservative character
# set (letters, digits, and the characters . _ @ : - [ ]).
case $JOB in
  '' | *[!][A-Za-z0-9._@:-]*)
    echo "Error: invalid job-id '$JOB'" >&2
    exit 1
    ;;
esac

# Temporary file holding the qstat output.  mktemp avoids a predictable name
# in /tmp.
JOBSTATUS=$(mktemp /tmp/jobstatus.XXXXXX) || {
  echo "Error: cannot create temporary file" >&2
  exit 1
}

# Remove the temporary file on every exit path, and turn the usual
# termination signals into "exit 2" so that the EXIT trap also runs for them.
# (Signal 19 from the historical list is not trapped: on Linux/x86 it is
# SIGSTOP, which cannot be caught.)
trap 'rm -f "$JOBSTATUS"' EXIT
trap 'exit 2' HUP INT QUIT ALRM TERM

# Check if this job-ID can be inquired successfully.
# (The qstat flag "-1" is only available from Torque 2.1)
if ! "$QSTAT" -f -1 "$JOB" > "$JOBSTATUS"
then
  echo "Error inquiring about job $JOB" >&2
  exit 1
fi

# Check the job state.  Lines are matched on the exact attribute name in the
# first field; this assumes the "name = value" layout of "qstat -f" output.
JOBSTATE=$(awk '$1 == "job_state" {print $3}' "$JOBSTATUS")
if test "$JOBSTATE" != "R"
then
  echo "The job $JOB is not running, it has state=$JOBSTATE" >&2
  exit 1
fi

# Get the Torque resource exec_host.
# The value is a "+"-separated list of node/cpu entries.  Split it, strip
# everything from the "/" onwards (so multi-digit cpu indices are handled
# too), and print each node name once, in first-seen order, because SMP
# nodes may be repeated.
NODELIST=$(awk '$1 == "exec_host" {
  n = split($3, entry, "+")
  for (i = 1; i <= n; i++) {
    sub(/\/.*/, "", entry[i])
    if (entry[i] != "" && !(entry[i] in seen)) {
      seen[entry[i]] = 1
      print entry[i]
    }
  }
}' "$JOBSTATUS")
if test -z "$NODELIST"
then
  echo "Error: The node list is empty" >&2
  exit 1
fi

# Get the number of nodes and node properties used
NODES=$(awk '$1 == "Resource_List.nodes" {print $3}' "$JOBSTATUS")
echo "This job uses $NODES nodes"
# NODELIST is left unquoted so the node names are printed on a single line.
# shellcheck disable=SC2086
echo "Nodelist for job-id $JOB:" $NODELIST

# Loop over nodes: stop pbs_mom, remove the job's files, start pbs_mom again.
# NODELIST, PING and SSH rely on word splitting here.
# shellcheck disable=SC2086
for node in $NODELIST
do
  echo "----- Node $node -----"
  # Redirect stdout first, then duplicate stderr onto it, so that both
  # streams of ping are discarded.
  if $PING "$node" > /dev/null 2>&1
  then
    $SSH "$node" service pbs_mom stop
    echo "Removing Torque job files for job $JOB on node $node"
    # The glob is expanded by the remote shell, not locally.
    $SSH "$node" "rm -rf /var/spool/torque/mom_priv/jobs/$JOB*"
    $SSH "$node" service pbs_mom start
  else
    echo "*** WARNING *** Cannot ping host ${node} !"
  fi
done

echo "Delete and purge job $JOB"
"$QDEL" -p "$JOB"

# Send mail to the superuser, with the saved qstat output as the body
echo "Sending mail to user $SUPERUSERMAIL"
"$MAIL" -s "WARNING: $CLUSTERNAME job $JOB has been ${action_done} by the superuser" "$SUPERUSERMAIL" < "$JOBSTATUS"

# The temporary file is removed by the EXIT trap.
exit 0