#!/usr/bin/ksh

#syntax:  error_recovery batchdir 
# $1 = batch directory
#----------------------------------------------------------
#-----------------------------------------------
# set some defaults
# The result is reported on stdout (the script ends with a bare exit):
#   prints 1 if error recovery is in effect
#   prints 0 if nothing can be done
#-----------------------------------------------
#--------------------------------
#set environment variables
#--------------------------------
. $WFU_PATH/global.site.ksh

#---------------------------------------
# Make sure an error has been detected
#---------------------------------------


#---------------------------------------
# Resolve the files used during error recovery.
# $1 = batch directory; every path below lives inside it.
#---------------------------------------
batchdir=$1
if test -z "$batchdir"
	then echo 'error_recovery: no batch directory given' >&2
	# without a directory every log path would resolve under /,
	# so report "nothing can be done" and stop
	echo 0
	exit
fi
batchlog=$batchdir/batch_log
batchlog_temp=$batchdir/error_recovery_batch_log
output_matlab=$batchdir/output_matlab
output_matlab_temp=$batchdir/error_recovery_output_matlab
batchque=$batchdir/batch_que
recovery_log=$batchdir/error_recovery_log
recovery_log_messages=$batchdir/error_recovery_log_messages

#---------------------------------------
# Locate the full list of scripts for this batch:
#   1. $batchdir/batch.flist when that file exists
#   2. otherwise whatever the getline helper returns for
#      entry 2 of $batchdir/batch_master.flist
# With neither file present the queue cannot be rebuilt, so
# log the reason in both logs, print 0 and stop.
#---------------------------------------
fullque=
if test -f "$batchdir/batch.flist"
	then fullque=$batchdir/batch.flist
fi
if test -z "$fullque"
	then batch_master=
	if test -f "$batchdir/batch_master.flist"
		then batch_master=$batchdir/batch_master.flist
	fi
	if test -z "$batch_master"
		# name the files that were looked for; $fullque is empty here
		then reason="unable to identify a full queue: neither $batchdir/batch.flist nor $batchdir/batch_master.flist was found"
		echo "$reason" >> "$recovery_log_messages"
		echo "$reason" >> "$batchlog"
		message='-------------NO ERROR RECOVERY PERFORMED---------------'
		echo "$message" >> "$recovery_log_messages"
		echo "$message" >> "$batchlog"
		echo 0
		exit
	fi
	fullque=$("$script_path/getline" "$batch_master" 2)
fi
# Settings used by the rest of the script
runtime=$SECONDS
max_tries=3			# recovery attempts allowed per batch
awk=awk
lastline=$script_path/lastline

# error_flag is whatever check_errors writes to its stdout
error_flag=$(check_errors $batchdir $runtime)

# Timestamp for the log entries: taken from the local clock first,
# then replaced by the timekeeper host's clock (via rsh) when
# rsh_flag is greater than 0
cdate=$(date)
current_time=$($script_path/systime)
if [ $rsh_flag -gt 0 ]; then
	cdate=$(rsh $timekeeper date)
	current_time=$(rsh $timekeeper $script_path/systime)
fi

# separator placed between the date and the systime value in log lines
db=' | '

echo timekeeper is $timekeeper >> $recovery_log_messages
echo entered error_recovery with $batchdir on $cdate$db$db$current_time >> $recovery_log_messages
#------------------------------------------------------
# A problem in display_batch is not worth recovering from.
# When the text getline returns for position 1 of the batch
# queue mentions display_batch, clear the error flag and
# record why in both logs.
#------------------------------------------------------
display_batch_flag=$(getline $batchque 1 | grep -c display_batch)
if [ $display_batch_flag -gt 0 ]; then
	error_flag=0
	message='FINAL DISPLAY BATCH PROBLEM---------NO ERROR RECOVERY PERFORMED'
	echo $message >> $recovery_log_messages
	echo $message >> $batchlog
fi


#------------------------------------------------------
# Inspect the kill file ($batchdir/kill) to decide whether
# error recovery is appropriate:
#   - contains 'kill_batch executed on' -> the kill came from
#     batch_control, so no recovery
#   - contains 'waitfor timeout'        -> remember the timeout;
#     if the message also mentions _sn3d, flag the sn3d restart
#   - file present but empty            -> no recovery
#------------------------------------------------------
sn3d_flag=0
wait_for_flag=0
kill_message=$(cat $batchdir/kill 2>>/dev/null)
kill_file=$(ls $batchdir/kill 2>>/dev/null)
len=${#kill_message}
# length of the name printed by ls; stays 0 when ls prints nothing
killfile_detected=${#kill_file}
if [ $len -gt 0 ]; then
	batch_control_kill=$(echo $kill_message | grep 'kill_batch executed on')
	if [ ${#batch_control_kill} -gt 0 ]; then
		error_flag=0
		message='KILL FILE GENERATED THROUGH BATCH_CONTROL---------NO ERROR RECOVERY PERFORMED'
		echo $message >> $recovery_log_messages
		echo $message >> $batchlog
	fi
	wait_for_timeout=$(echo $kill_message | grep 'waitfor timeout')
	if [ ${#wait_for_timeout} -gt 0 ]; then
		wait_for_flag=1
		sn3d=$(echo $kill_message | grep '_sn3d')
		if [ ${#sn3d} -gt 0 ]; then
			sn3d_flag=1
		fi
	fi
else
	# nothing could be read from the kill file
	message='EMPTY KILL FILE DETECTED---------NO ERROR RECOVERY PERFORMED'
	if [ $killfile_detected -gt 1 ]; then
		error_flag=0
		echo $message >> $recovery_log_messages
		echo $message >> $batchlog
	fi
fi
#if test $killfile_detected -eq 0
#	then message='NO KILL FILE DETECTED---------ERROR RECOVERY ACTIVE'
#	echo $message >> $recovery_log_messages
#	echo $message >> $batchlog
#fi


#-------------------------------------------
# A timeout while waiting for normalization (the sn3d file)
# would make every subsequent run die too.  Force the error
# flag on and try to jumpstart normalization by running
# quick_start on the parent of the batch directory.
#-------------------------------------------
if [ $sn3d_flag -eq 1 ]; then
	error_flag=1
	message='WAITFOR TIMEOUT ON sn3d file---------Attempting restart for sn3d'
	#sn3d_dir=`extract_pname $batchdir`
	sn3d_dir=$(dirname $batchdir)
	echo $message in $sn3d_dir >> $recovery_log_messages
	echo $message >> $batchlog
	# quick_start's output is captured so it does not reach stdout,
	# which carries only the final 0/1 answer
	startup=$($script_path/quick_start $sn3d_dir error_recovery=0)
fi

# No error left to recover from: print 0 and stop
if [ $error_flag -eq 0 ]; then
	echo 0
	exit
fi

#-----------------------------------------------
# Grid engine runs (grid_engine is 1): log whether error
# recovery is enabled, judged by grid_engine_error_recovery.
# When that setting is 0, clear the error flag, print 0 and stop.
#------------------------------------------------

if [ $grid_engine -eq 1 ]; then
	message='------------GRID ENGINE ERROR RECOVERY HAS BEEN DISABLED--------------------'
	#message_blank='***************************************'
	if [ $grid_engine_error_recovery -eq 1 ]; then
		message='------------GRID ENGINE ERROR RECOVERY HAS BEEN ENABLED--------------------'
	fi
	#echo "$message_blank" >> $batchlog
	echo $message >> $batchlog
	echo $message >> $recovery_log_messages
	#echo $message_blank >> $batchlog
	if [ $grid_engine_error_recovery -eq 0 ]; then
		error_flag=0
		echo 0
		exit
	fi
fi


#------------------------------------
# Work out which attempt this is.  The last line of the error
# recovery log carries the previous attempt number in its
# second field; with no such line this is attempt 1.
# Once max_tries attempts are on record, give up: print 0,
# note it in the batch log and copy the recovery log into the
# message log.
#------------------------------------
current_attempt=1
last_attempt=$($lastline $recovery_log 2>>/dev/null)
len=${#last_attempt}
if [ $len -gt 0 ]; then
	#attempts=`echo $last_attempt | $awk '{print $1}' `
	attempts=$(echo $last_attempt | $awk '{print $2}')
	if [ $attempts -ge $max_tries ]; then
		echo 0
		echo 'Maximal Error Recovery Performed' >> $batchlog
		cat $recovery_log >> $recovery_log_messages
		exit
	fi
	(( current_attempt = attempts + 1 ))
fi

#-----------------------------------------------------------------
# Decide how many scripts to reload from the end of the full queue.
#   total_scripts     - first line of $fullque
#   scripts_remaining - number of lines in $batchque that contain a
#                       letter or a comma
#   attempt 2         - reload one script more than remain, unless
#                       that would exceed total_scripts
#   attempt 3         - reload total_scripts
#   any other attempt - reload the scripts that remain
# Both counts are forced to plain integers because reload is later
# pasted into a tail option.
#-----------------------------------------------------------------
#total_scripts=`$script_path/getline $fullque 1`
total_scripts=$(head -1 "$fullque")
total_scripts=$((total_scripts + 0))

# The pattern is quoted so the shell cannot expand it as a filename
# glob.  grep -c prints the count directly; an empty or unreadable
# batch queue ends up as 0.
scripts_remaining=$(grep -c '[a-z,A-Z]' "$batchque")
scripts_remaining=$((scripts_remaining + 0))

case $current_attempt in
	2)	reload=$((scripts_remaining + 1))
		if test "$reload" -gt "$total_scripts"
			then reload=$scripts_remaining
		fi
		;;
	3)	reload=$total_scripts
		;;
	*)	reload=$scripts_remaining
		;;
esac

# Record the inputs and the decision in the message log
{
	echo "hostname is $(hostname)"
	echo "contents of $fullque"
	echo '-----------'
	cat "$fullque"
	echo '-----------'
	echo "batchque is $batchque"
	echo "current_attempt is $current_attempt"
	echo "total_scripts is $total_scripts"
	echo "scripts_remaining is $scripts_remaining"
	echo "reload is $reload"
} >> "$recovery_log_messages"





#--------------------------------
# Record this attempt in the error recovery log.  The second
# word of the line is the attempt number.
#--------------------------------
echo attempt $current_attempt scripts to reload $reload on $cdate$db$db$current_time>> $recovery_log

#--------------------------------
# Rebuild the batch queue from the last $reload lines of the
# full queue, going through a temporary file in the batch directory
#--------------------------------
cat $fullque | tail -$reload > $batchdir/batchtemp
rm $batchque
mv $batchdir/batchtemp $batchque

# Append the current batch log, followed by a banner naming this
# attempt, to the error recovery copy of the batch log
{
	cat $batchlog
	echo '  '
	echo '  '
	echo ERROR RECOVERY ATTEMPT $current_attempt
	echo '  '
	echo '  '
} >> $batchlog_temp

# Same for the matlab output; a missing output file is not reported
{
	cat $output_matlab 2>>/dev/null
	echo '*******************************'
	echo ERROR RECOVERY ATTEMPT $current_attempt
	echo '*******************************'
} >> $output_matlab_temp

# Log the command that was used and the queue it produced
{
	echo "cat $fullque | tail -$reload"
	echo contents of $batchque are now:
	cat $batchque
	echo '*******************************'
} >> $recovery_log_messages


# Error recovery is in effect
echo 1
exit








