#!/bin/sh
#
#
#___INFO__MARK_BEGIN__
##########################################################################
#
#  The Contents of this file are made available subject to the terms of
#  the Sun Industry Standards Source License Version 1.2
#
#  Sun Microsystems Inc., March, 2001
#
#
#  Sun Industry Standards Source License Version 1.2
#  =================================================
#  The contents of this file are subject to the Sun Industry Standards
#  Source License Version 1.2 (the "License"); You may not use this file
#  except in compliance with the License. You may obtain a copy of the
#  License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html
#
#  Software provided under this License is provided on an "AS IS" basis,
#  WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
#  WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
#  MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
#  See the License for the specific provisions governing your rights and
#  obligations concerning the Software.
#
#  The Initial Developer of the Original Code is: Sun Microsystems, Inc.
#
#  Copyright: 2001 by Sun Microsystems, Inc.
#
#  All Rights Reserved.
#
##########################################################################
#___INFO__MARK_END__

#
# Restart a checkpointed job from its restart file and exit with the
# exit status of the restart command.
#
# Arguments:
#   $3  checkpoint directory; holds the shared ckpt.log and one
#       ckpt.<jobid> working directory per job
#   ($1 and $2 are only echoed to the log; $2 is referenced by the
#    disabled accounting code further down)
#
# Environment read:
#   JOB_ID, SGE_TASK_ID, JOB_SCRIPT
#   (SGE_ROOT only in the disabled accounting code)
#
# Files read from the per-job working directory:
#   job_pid, osjobid, chkpnt_<jobid>[.save]
#

set +u       # don't treat unset parameters as an error
set +e       # don't exit on bad command status

ckpt_dir=$3
ckpt_log=$ckpt_dir/ckpt.log

# The shared log is made world-writable so that jobs of any user can append.
if [ ! -f "$ckpt_log" ]; then
   touch "$ckpt_log"
   chmod 666 "$ckpt_log"
fi

# Non-array jobs have SGE_TASK_ID empty or "undefined"; their active_jobs
# directory is still named <JOB_ID>.1
case "$SGE_TASK_ID" in
   ""|undefined)
      jobid=$JOB_ID
      jobdir=$JOB_ID.1
      ;;
   *)
      jobid=$JOB_ID.$SGE_TASK_ID
      jobdir=$JOB_ID.$SGE_TASK_ID
      ;;
esac

#
# create temp directory for holding checkpoint info
#

tmpdir=$ckpt_dir/ckpt.$jobid
mkdir -p "$tmpdir"

# Everything below uses paths relative to $tmpdir (job_pid, osjobid,
# chkpnt_*). Continuing from some other directory could rename or restart
# an unrelated file, so give up if we cannot get there.
cd "$tmpdir" || {
   now=`date '+%D %T'`
   echo "$now Job $jobid cannot cd to $tmpdir, restart aborted" >> "$ckpt_log"
   echo "cannot cd to $tmpdir, restart aborted" >&2
   exit 1
}

#
# create log file
#

F=$tmpdir/checkpoint.log
touch "$F"
exec >> "$F" 2>&1

script_name=`basename "$0"`
called_at=`date`
caller=`id`

echo -------------------------------------------------------------
echo "$script_name called at $called_at"
echo "called by: $caller"
echo "with args: $*"

#
# Get original job_pid and osjobid
#

job_pid=`cat job_pid`
osjobid=`cat osjobid`

# restore the O.S. job identifier to the jobs directory
# NOTE: do we need to restore osjobid?
# NOTE: do we need to restore job_pid?

script_dir=`dirname "$JOB_SCRIPT"`
job_dir=$script_dir/../active_jobs/$jobdir

echo "original job_pid=$job_pid"
echo "original osjobid=$osjobid"

# Diagnostics only: show what is currently recorded for the job.
ls -la "$job_dir/job_pid"
cat "$job_dir/job_pid"
ls -la "$job_dir/osjobid"
cat "$job_dir/osjobid"

#echo $job_pid > $job_dir/job_pid
#echo $osjobid > $job_dir/osjobid

now=`date '+%D %T'`
echo "$now Job $jobid (job_pid=$job_pid, osjobid=$osjobid) restarting" >> "$ckpt_log"

#
# If previous restart file exists, we assume that we
# died during a checkpoint and we should recover using
# this file
#

if [ -f "chkpnt_$jobid.save" ]; then
   mv "chkpnt_$jobid.save" "chkpnt_$jobid"
fi

#
# Register restart file, just in case it's not registered. This could
# happen if the host died during the last checkpoint
#

echo /usr/bin/rsvresf "chkpnt_$jobid"
/usr/bin/rsvresf "chkpnt_$jobid"

#
# Now restart the job and wait for it to complete
#

echo /usr/bin/restart -f -i -w "chkpnt_$jobid"
/usr/bin/restart -f -i -w "chkpnt_$jobid"
exit_status=$?

echo "Exit status of restart command: $exit_status"

# Now be careful: The restart command is the parent process of the restarted
# job. Grid Engine is the parent process of the restart command.
# If the job was killed (probably due to a migration request), we need to
# tell our parent that by killing ourselves. Grid Engine will also detect an
# exit status > 128 analogous to a KILL

#if [ $exit_status = 1 ]
#then
#   jstat=`/bin/acctcom -j $job_id -b -p -Z -f -v /usr/adm/acct/day/pacct | $SGE_ROOT/ckpt/cray_parse_job_status $2`
#   echo "jobstatus $job_id $2 = $jstat"
#   if [ "$jstat" = "" ]
#   then
#      exit_status=100
#   elif [ "$jstat" = "0" ]
#   then
#      exit_status=0
#   else
#      exit_status=`expr $jstat + 128`
#   fi
#fi

# If killing ourselves didn't help or the exit_status was < 128 exit
# with the exit status of our child

now=`date '+%D %T'`
echo "$now Job $jobid (job_pid=$job_pid, osjobid=$osjobid) exiting, status=$exit_status" >> "$ckpt_log"

echo "Exiting with exit status: $exit_status"
exit "$exit_status"