#!/bin/sh
#
#
#___INFO__MARK_BEGIN__
##########################################################################
#
#  The Contents of this file are made available subject to the terms of
#  the Sun Industry Standards Source License Version 1.2
#
#  Sun Microsystems Inc., March, 2001
#
#
#  Sun Industry Standards Source License Version 1.2
#  =================================================
#  The contents of this file are subject to the Sun Industry Standards
#  Source License Version 1.2 (the "License"); You may not use this file
#  except in compliance with the License. You may obtain a copy of the
#  License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html
#
#  Software provided under this License is provided on an "AS IS" basis,
#  WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
#  WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
#  MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
#  See the License for the specific provisions governing your rights and
#  obligations concerning the Software.
#
#  The Initial Developer of the Original Code is: Sun Microsystems, Inc.
#
#  Copyright: 2001 by Sun Microsystems, Inc.
#
#  All Rights Reserved.
#
##########################################################################
#___INFO__MARK_END__

#
# Checkpoint a running job with /usr/bin/chkpnt.
#
# Arguments:
#   $3  checkpoint directory (the only positional argument that is used)
#
# Environment read:
#   JOB_ID       job number
#   SGE_TASK_ID  array task number; "undefined" or empty for non-array jobs
#   JOB_SCRIPT   path of the job script; its directory is used to locate
#                ../active_jobs/<job>.<task>/{osjobid,job_pid}
#
# Files written:
#   <ckpt_dir>/ckpt.log                        one status line per checkpoint
#   <ckpt_dir>/ckpt.<jobid>/checkpoint.log     full output of this script
#   <ckpt_dir>/ckpt.<jobid>/osjobid, job_pid   cached identifiers
#   <ckpt_dir>/ckpt.<jobid>/chkpnt_<jobid>     restart file written by chkpnt
#
# Exit status: the exit status of chkpnt, or 1 if the checkpoint could not
# be attempted (missing checkpoint directory, unusable work directory, or
# unknown job pid).
#

set +u

ckpt_dir=$3

# Without a checkpoint directory every path below would be rooted at "/".
if [ -z "$ckpt_dir" ]; then
    echo "$0: checkpoint directory (third argument) is not set" >&2
    exit 1
fi

# The summary log is made world-writable on creation (mode 666).
if [ ! -f "$ckpt_dir/ckpt.log" ]; then
    touch "$ckpt_dir/ckpt.log"
    chmod 666 "$ckpt_dir/ckpt.log"
fi

# create temp directory for holding checkpoint info
if [ "$SGE_TASK_ID" = "undefined" ] || [ -z "$SGE_TASK_ID" ]; then
    jobid=$JOB_ID
    jobdir=$JOB_ID.1
else
    jobid=$JOB_ID.$SGE_TASK_ID
    jobdir=$JOB_ID.$SGE_TASK_ID
fi

tmpdir=$ckpt_dir/ckpt.$jobid
mkdir -p "$tmpdir"

# All restart-file operations below use relative names, so they must not
# run anywhere but inside $tmpdir.
cd "$tmpdir" || {
    echo "$0: cannot change to work directory $tmpdir" >&2
    exit 1
}

# create log file
F=$tmpdir/checkpoint.log
touch "$F"
exec >> "$F" 2>&1

echo -------------------------------------------------------------
echo `basename "$0"` called at `date`
echo called by: `id`
echo "with args: $*"
echo "SGE_TASK_ID=$SGE_TASK_ID"
echo "JOB_ID=$JOB_ID"

# Cray checkpoint workaround - delete the job script so chkpnt(1)
# will save and restore it.  For this to work, "shell_start_mode"
# should be set to "script_from_stdin" in the global cluster
# configuration

#rm -f $JOB_SCRIPT

#
# read_job_value NAME
#
# Print the identifier stored under NAME ("osjobid" or "job_pid").
# A non-empty file NAME in the current directory is used as a cache;
# otherwise the value is read from the active_jobs directory next to the
# job script's directory and cached for later invocations.  An empty value
# is never cached, so a failed lookup is retried on the next call.
#
# This function is invoked through command substitution, so its
# diagnostics are written to stderr, which is redirected to the per-job
# log above.
#
read_job_value() {
    name=$1

    if [ -s "$name" ]; then
        cat "$name"
        return
    fi

    spool_dir=`dirname "$JOB_SCRIPT"`/../active_jobs/$jobdir
    echo "job_dir=$spool_dir" >&2
    echo "JOB_SCRIPT=$JOB_SCRIPT" >&2

    value=`cat "$spool_dir/$name"`
    if [ -n "$value" ]; then
        echo "$value" > "$name"
    fi
    echo "$value"
}

# get the checkpoint identifier
osjobid=`read_job_value osjobid`
job_pid=`read_job_value job_pid`

# chkpnt cannot run without a pid.  Stop here, before the previous restart
# file is unregistered and renamed, so that it stays usable.
if [ -z "$job_pid" ]; then
    stamp=`date +"%D %T"`
    failure="$stamp Job $jobid checkpoint failed: job_pid could not be determined"
    echo "$failure"
    echo "$failure" >> "$ckpt_dir/ckpt.log"
    exit 1
fi

#
# Save any existing restart file, just in case the
# host crashes while checkpointing
#

echo Removing any old restart files from restart database
echo /usr/bin/rmresf -d "chkpnt_$jobid"
/usr/bin/rmresf -d "chkpnt_$jobid"
# On the first checkpoint there is nothing to save; mv then only reports
# an error in the log.
mv "chkpnt_$jobid" "chkpnt_$jobid.save"

#
# Checkpoint the job
#

echo /usr/bin/chkpnt -P "$job_pid" -v -f "chkpnt_$jobid"
/usr/bin/chkpnt -P "$job_pid" -v -f "chkpnt_$jobid"
cc=$?

# The saved copy is only dropped once a new restart file was written.
# NOTE(review): on failure the .save copy is left in place and not renamed
# back - confirm that the restart side knows to look for it.
if [ "$cc" -eq 0 ]; then
    rm -f "chkpnt_$jobid.save"
fi

# Build the status line once so both logs carry the same timestamp.
stamp=`date +"%D %T"`
status_line="$stamp Job $jobid (job_pid=$job_pid, osjobid=$osjobid) checkpointed, status=$cc"
echo "$status_line"
echo "$status_line" >> "$ckpt_dir/ckpt.log"

exit $cc