# hpc-testing
# Copyright (C) 2018 SUSE LLC
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# Wait until the subnet manager (SM) seen from a host is on the expected LID.
#
# Runs `sminfo` on the host through the tpq helper (defined elsewhere) up to
# 10 times, pausing 2 seconds after every unsuccessful attempt, so that a
# freshly started or still-migrating SM gets time to settle.
#
# Globals:   SM_LID (written) - last value extracted from the sminfo output;
#            kept global because code outside this function may read it.
# Arguments: $1 - host on which sminfo is run
#            $2 - LID the SM is expected to be on (decimal)
# Outputs:   progress messages on stdout while waiting
# Returns:   0 as soon as the reported LID matches $2. If it never matches,
#            fatal_error is called; should that helper return, this function
#            returns 1.
_check_sm_lid()
{
	local host=$1
	local expected_lid=$2
	# Keep the loop counter local so a caller's own counter is not clobbered.
	local attempt

	for ((attempt = 1; attempt <= 10; attempt++)); do
		# sed only rewrites a line of the form "... sm lid N sm guid ...";
		# any other output (e.g. an sminfo error message) passes through
		# unchanged and therefore ends up in SM_LID as-is.
		SM_LID=$(tpq "$host" sminfo | sed -e 's/.*sm lid \([0-9]\+\) sm guid.*/\1/')

		case "$SM_LID" in
			'' | *[!0-9]*)
				# No usable LID: empty output, an error such as
				# "sminfo: iberror: failed: query", or multi-line noise.
				# New SM is still not up
				echo "SM still not up... Waiting a bit more .."
				sleep 2
				continue
				;;
		esac

		if [ "$SM_LID" != "$expected_lid" ]; then
			echo "WARNING:On host $host: SM is on LID $SM_LID instead of $expected_lid"
			echo "Waiting a bit to see how it goes..."
			sleep 2
			continue
		fi

		# We have the right SM lid
		return 0
	done

	# Every attempt failed; SM_LID holds whatever the last query produced.
	fatal_error "On host $host: SM is on LID $SM_LID instead of $expected_lid"
	return 1
}

# Exercise subnet manager (SM) migration and failover between two hosts.
#
# Sequence:
#   1. Stop any SM on both hosts and write a small opensm configuration file
#      on each of them (short polling timeout / retry count) to reduce
#      failover delays.
#   2. Start an SM on host1 (-p 5) and check both hosts see the SM on lid1.
#   3. Five times in a row: start an SM on host2 (-p 10) and check that both
#      hosts now see lid2 (migration), then kill the SM on host2 and check
#      that both hosts see lid1 again (failover).
#
# Relies on helpers defined elsewhere: kill_opensm, start_opensm, tp and
# _check_sm_lid. The options after the host are handed to start_opensm
# unchanged (presumably forwarded to opensm, where -p would be the SM
# priority - confirm against start_opensm).
#
# Arguments: $1 - first host
#            $2 - LID expected for the SM while it runs on the first host
#            $3 - second host
#            $4 - LID expected for the SM while it runs on the second host
test_sm_failover()
{
	local host1=$1
	local lid1=$2
	local host2=$3
	local lid2=$4
	# Configuration file written on each host and passed to start_opensm.
	local conf=hpc-testing-opensm.conf
	# Loop variables are local so they do not leak into the caller's scope.
	local host iter

	# Kill any SM just in case
	kill_opensm "$host1"
	kill_opensm "$host2"

	# Generate configuration files to reduce fail over delays
	for host in "$host1" "$host2"; do
		# The command is single-quoted so that it is handed to tp verbatim.
		tp "$host" 'echo -e "sminfo_polling_timeout 2000\npolling_retry_number 2" > '"$conf"
	done
	start_opensm "$host1" -p 5 -F "$conf"
	sleep 5

	_check_sm_lid "$host1" "$lid1"
	_check_sm_lid "$host2" "$lid1"

	# Run a bunch of times to make sure it really works
	for ((iter = 1; iter <= 5; iter++)); do
		echo "Migration test $iter"
		start_opensm "$host2" -p 10 -F "$conf"
		# Leave some time for the migration to work
		sleep 5

		_check_sm_lid "$host1" "$lid2"
		_check_sm_lid "$host2" "$lid2"

		echo "Failover test $iter"
		# Kill SM on H2
		kill_opensm "$host2"
		# Leave time for the fail over
		sleep 5

		_check_sm_lid "$host1" "$lid1"
		_check_sm_lid "$host2" "$lid1"
	done
}
