#!/bin/ksh
#
#
#___INFO__MARK_BEGIN__
##########################################################################
#
#  The Contents of this file are made available subject to the terms of
#  the Sun Industry Standards Source License Version 1.2
#
#  Sun Microsystems Inc., March, 2001
#
#
#  Sun Industry Standards Source License Version 1.2
#  =================================================
#  The contents of this file are subject to the Sun Industry Standards
#  Source License Version 1.2 (the "License"); You may not use this file
#  except in compliance with the License. You may obtain a copy of the
#  License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html
#
#  Software provided under this License is provided on an "AS IS" basis,
#  WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
#  WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
#  MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
#  See the License for the specific provisions governing your rights and
#  obligations concerning the Software.
#
#  The Initial Developer of the Original Code is: Sun Microsystems, Inc.
#
#  Copyright: 2001 by Sun Microsystems, Inc.
#
#  All Rights Reserved.
#
##########################################################################
#___INFO__MARK_END__

# Restart a checkpointed job with "cpr -r" and wait until it has finished.
#
# Positional arguments (as used below):
#   $1  job identifier; names the per-job directory ckpt.$1 and the
#       checkpoint images cpr_$1.<n>
#   $2  pid that is polled with ps(1) after a successful restart
#   $3  checkpoint directory holding ckpt.log and the per-job directory
#
# Environment:
#   REQNAME  prefix of the per-job trace file ~/$REQNAME.co$1 (may be unset)
#
# Exit status:
#   0    "cpr -r" returned 0 and ps(1) no longer reports pid $2
#   100  a required argument is missing, the per-job directory cannot be
#        entered, or "cpr -r" returned a non-zero status

# REQNAME may legitimately be unset, so unset variables must not be fatal.
set +u

job_id=$1
job_pid=$2
ckpt_dir=$3

# Without these two values every path below would be built from an empty
# string (e.g. /ckpt.log), so refuse to continue.
if [ -z "$job_id" ] || [ -z "$ckpt_dir" ]; then
   print -u2 "usage: $(basename "$0") job_id job_pid ckpt_dir"
   exit 100
fi

ckpt_log=$ckpt_dir/ckpt.log

# The shared log is created world-writable on first use.
if [ ! -f "$ckpt_log" ]; then
   touch "$ckpt_log"
   chmod 666 "$ckpt_log"
fi

# create temp directory for holding checkpoint info
tmpdir=$ckpt_dir/ckpt.$job_id
mkdir -p "$tmpdir"
# "cpr -r" and the currcpr lookup below use relative names, so running them
# from any other directory would be wrong; give up instead.
cd "$tmpdir" || {
   print -u2 "$(basename "$0"): cannot change to directory $tmpdir"
   exit 100
}

# create log file
#F=$tmpdir/checkpoint.log
F=~/${REQNAME}.co${job_id}
touch "$F"

{
   # "--" ends option parsing so that the row of dashes is printed literally.
   print -- "-------------------------------------------------------------"
   print "$(basename "$0") called at $(date)"
   print "called by: $(id)"
   print "with args: $*"
} >> "$F" 2>&1

print "$(date +"%D %T") Job $job_id (pid=$job_pid) restarting" >> "$ckpt_log"

# currcpr holds the number of the checkpoint image to restart from.
# Anything other than "2" (including a missing file) selects image 1.
currcpr=
if [ -r currcpr ]; then
   currcpr=$(cat currcpr)
fi
if [ "$currcpr" != "2" ]; then
   currcpr=1
fi

ckpt_image=cpr_${job_id}.${currcpr}

# Now be careful: The restart command is the parent process of the restarted
# job. SGE is the parent process of the restart command.
# If the job was killed (probably due to a migration request), we need to
# tell our parent that by killing ourselves. SGE will also detect an
# exit status > 128 analogous to a KILL.
print "Restart command: cpr -r $ckpt_image" >> "$F" 2>&1
cpr -r "$ckpt_image" >> "$F" 2>&1
exit_status=$?
print "Exit status of restart command: $exit_status" >> "$F" 2>&1

# poll for job completion (based on job script pid)
if [ "$exit_status" -gt 0 ]; then
   # Every restart failure is reported with the same status.
   exit_status=100
else
   # Only the exit status of ps is of interest; discard its listing.
   while ps -p "$job_pid" > /dev/null 2>&1; do
      sleep 2
   done
fi

# This doesn't work under Irix 6.2, since the variable $$ is not
# correctly set
#if [ $exit_status -gt 128 ]; then
#   signal=`expr $exit_status - 128`
#   print Killing ourself: kill -$signal $$ >> $F 2>&1
#   /usr/bin/kill -$signal $pid >> $F 2>&1
#fi

# If killing ourselves didn't help or the exit_status was < 128 exit
# with the exit status of our child
print "$(date +"%D %T") Job $job_id (pid=$job_pid) exiting, status=$exit_status" >> "$ckpt_log"
print "Exiting with exit status: $exit_status" >> "$F" 2>&1

exit "$exit_status"