#!/usr/bin/ksh
. $WFU_PATH/global.site.ksh

#---------------------------------------------------
# This program automatically checks the processing status of a batch.
# It looks for hung jobs and reinitiates quick_start when one is found.
# $1 = batchdir
# syntax:  T3_mighty_mouse $batchdir
#----------------------------------------------------

# Batch directory to watch (first command-line argument).
batchdir=$1

# Polling interval and overall lifetime, in hours ...
pollhours=1
killhours=10

# ... and the same two values converted to seconds.
sleeptime=$((pollhours * 60 * 60))
killtime=$((killhours * 60 * 60))
# For quick manual testing both can be shortened, e.g.:
#sleeptime=10
#killtime=10

# Files inside the batch directory that the rest of the script reads.
batch_log=$batchdir/batch_log
batchque=$batchdir/batch_que

# Set to 1 only when a hung batch has been detected (read by exit_batch).
restart_batch=0

# Launch banner.
echo "-----------------------------------------"
today=$(date)
echo "T3 mighty mouse launched on $today"

# The batch id is the last path component of line 1 of the batch log,
# after every '|' and every blank has been turned into '/'.
# (getline is an external helper; presumably it prints line N of a file.)
batch_id=$(getline $batch_log 1 | tr '| ' '//')
batch_id=$(basename $batch_id)
if [ ${#batch_id} -eq 0 ]; then
	echo UNABLE TO GET BATCH ID
	exit
fi

#------------------------------------------------
# Check if a previous instance is still running.
# Each earlier instance leaves a file named
# $batchdir/T3_mighty_mouse_shell<pid> holding the command that
# kills it. Every such file is handled on its own: its command is
# run and the file is removed. (Reading all files into one variable
# would glue several kill commands into a single command line.)
#------------------------------------------------
for stale_kill_file in "$batchdir"/T3_mighty_mouse_shell*
do
	# An unmatched pattern stays literal, so skip anything that is
	# not an existing regular file.
	test -f "$stale_kill_file" || continue
	kill_process=$(cat "$stale_kill_file")
	# NOTE(review): the file content is executed verbatim, so anyone
	# able to write into $batchdir controls this command, and the pid
	# it names may have been reused by an unrelated process.
	$kill_process
	rm -f "$stale_kill_file"
done


#--------------------------------------------------------------
# Drop the processing status file.
# The file holds the command that kills this very process; the
# same command is also echoed to stdout together with the batch id.
#--------------------------------------------------------------
lhost=$(hostname)
kill_process_file=$batchdir/T3_mighty_mouse_shell$$
kill_command="rsh $lhost kill -9 $$"
echo "$kill_command" > $kill_process_file
echo "$kill_command"
echo batch_id $batch_id


#------------------------------------------------------------------
# exit_batch: final clean-up, then terminate the script.
#   Globals read: exit_message, kill_process_file, restart_batch,
#                 batchdir, batch_log, script_path
#   Steps:
#     1. print $exit_message
#     2. remove this instance's kill-command file
#     3. if restart_batch is 1: print the last line of the batch
#        log, hand $batchdir/T3_mm_log, a subject and the contents
#        of $batchdir/batch_email to $script_path/mail_file, then
#        run $script_path/quick_start on the batch directory
#   Never returns to the caller: it always ends with "exit".
#------------------------------------------------------------------
exit_batch() {
	echo $exit_message
	rm $kill_process_file

	if [ $restart_batch -eq 1 ]; then
		echo $batchdir
		restart_subject=$(echo T3_mighty_mouse_reinitiating_$batchdir)
		# Left unquoted below on purpose: word splitting is preserved.
		notify_list=$(cat $batchdir/batch_email)
		failed_script=$(lastline $batch_log)
		echo 'The bad script is: '
		echo $failed_script
		$script_path/mail_file $batchdir/T3_mm_log $restart_subject $notify_list
		$script_path/quick_start $batchdir
	fi

	echo exiting T3 mighty mouse
	exit
}
	

#------------------------------------------------------------------
# check_logs: print the combined size, in bytes, of the batch logs.
#   Global read : batchdir
#   Global set  : log_len1 (the running total, also what is printed)
#   Output      : one integer on stdout - the sum of the sizes of
#                 output_unix, output_matlab, output_idl,
#                 output_cpufinder and batch_log inside $batchdir.
#   A log that is missing, is not a regular file, or whose size
#   cannot be read counts as 0.
#------------------------------------------------------------------
check_logs() {
	log_len1=0
	for cl_name in output_unix output_matlab output_idl output_cpufinder batch_log
	do
		cl_size=0
		# test -f follows symbolic links and rejects directories, whose
		# "ls -l" output would be a multi-line listing.
		if test -f "$batchdir/$cl_name"
		then
			# -L: report the size of the file a symlink points to, not of
			# the link itself, so growth of a linked log is still noticed.
			cl_size=$(ls -lL "$batchdir/$cl_name" 2>/dev/null | awk 'NR == 1 {print $5}')
		fi
		# Anything that is not a plain non-negative integer counts as 0.
		case $cl_size in
			''|*[!0-9]*) cl_size=0 ;;
		esac
		log_len1=$((log_len1 + cl_size))
	done
	echo $log_len1
}

#------------------------------------------------------------------
# Main polling loop.
# Once per $sleeptime seconds:
#   - stop if the batch id in the log is unreadable or has changed
#   - keep sleeping while job files or "waiting" files exist
#   - otherwise stop on a kill file, on an empty batch queue, or on
#     shrinking logs; if the logs did not grow at all, flag the batch
#     as hung so that exit_batch restarts it
#   - stop after $killtime seconds regardless
# exit_batch ends the script, so nothing after a call to it runs.
#------------------------------------------------------------------
log_len0=$(check_logs)
running=1
while test $running -eq 1
do
	restart_batch=0
	echo going to sleep...
	sleep $sleeptime
	echo Waking up...

	#-----------------------------------
	# First match up batch ids
	#-----------------------------------
	batch_id_flag=1
	# Derived exactly like the initial batch_id so the two are comparable.
	batch_id2=$(getline $batch_log 1 | tr '|' ' ' | tr ' ' '/')
	batch_id2=$(basename $batch_id2)
	if test ${#batch_id2} -eq 0
	then
		exit_message="UNABLE TO GET BATCH ID"
		batch_id_flag=0
		running=0
		exit_batch
	fi
	# String comparison: batch ids are path components, not necessarily
	# integers, so the numeric operator -ne could error out or treat two
	# different non-numeric ids as equal.
	if test "$batch_id" != "$batch_id2"
	then
		exit_message="BATCHID mismatch"
		batch_id_flag=0
		running=0
		exit_batch
	fi

	#-----------------------------------
	# Pending jobs or "waiting" files mean the batch is legitimately
	# idle: skip the hang checks for this round.
	#-----------------------------------
	go_back_to_sleep=0
	# stderr is dropped so that "no job files" does not log an error
	# on every poll.
	jobfile=$(ls $batchdir/job[0-9]* 2>/dev/null | grep job)
	if test ${#jobfile} -gt 0
	then
		go_back_to_sleep=1
		echo $jobfile pending....
	fi

	waiting_file=$(ls $batchdir | grep waiting)
	if test ${#waiting_file} -gt 0
	then
		go_back_to_sleep=1
		echo waiting for file
		# The names come from "ls $batchdir" and are relative to that
		# directory, so prefix it; there may be more than one match.
		for waiting_name in $waiting_file
		do
			cat "$batchdir/$waiting_name"
		done
	fi

	if test $go_back_to_sleep -eq 0
	then
		error_flag=0
		#error_flag=`check_errors $batchdir`
		#-----------------------
		#check for a kill file
		#-----------------------
		message="NO MESSAGE"
		killfile=$(ls $batchdir | grep kill)
		if test ${#killfile} -gt 0
		then
			error_flag=1
			message="------ERROR KILL FILE DETECTED---------"
		fi

		if test $error_flag -eq 1
		then
			exit_message="Batch Errors $message"
			running=0
			exit_batch
		fi

		#--------------------------------------
		#Check if the batch has been completed
		#--------------------------------------
		# An empty first line of the queue file means nothing is left to run.
		next_script=$(getline $batchque 1)
		if test ${#next_script} -eq 0
		then
			exit_message="THIS BATCH IS COMPLETED"
			running=0
			exit_batch
		fi

		#--------------------------------------
		# Compare total log size with the previous poll
		#--------------------------------------
		log_len1=$(check_logs)
		echo $log_len0 $log_len1
		if test $log_len1 -lt $log_len0
		then
			exit_message="LOG SHRINKAGE...THIS BATCH HAS PROBABLY BEEN RESTARTED"
			running=0
			exit_batch
		fi

		# No growth since the last poll: treat the batch as hung and
		# ask exit_batch to relaunch it.
		if test $log_len1 -eq $log_len0
		then
			exit_message="THIS BATCH IS HUNG...reinitiating quick_start"
			running=0
			restart_batch=1
			exit_batch
		fi

		log_len0=$log_len1
	fi

	# SECONDS is the shell's own count of seconds since it started.
	if test $SECONDS -gt $killtime
	then
		exit_message="T3 timeout...$killhours hours have elapsed"
		running=0
		exit_batch
	fi

done
restart_batch=0
exit_batch
exit
