From e1f05831c7418d4ca8e92ab16e8c6fba9938fbd2 Mon Sep 17 00:00:00 2001
From: Steve Singer
Date: Sun, 11 May 2014 21:37:45 -0400
Subject: [PATCH] Bug 342 - Fix various failover issues

This commit fixes several related failover issues:

1. The query slonik used to get a node list during failover ignored
   non-failover targets.  This meant that we would never call preFailover
   on a node that wasn't a failover target, and that node might then still
   be trying to use a failed provider.

2. A node (say node 3) might process a FAILOVER_NODE event from node 2
   (the new origin) but might not yet have processed the last event from
   node 1, the failed node.  We need to make sure that the subscription
   paths and listen networks are updated so that we do not listen for
   events and data only via failed nodes.
---
 src/backend/slony1_funcs.sql |   52 ++++++++++++++++++++++++++++++++++--------
 src/slonik/slonik.c          |   44 ++++++++++++++++++++++++++---------
 2 files changed, 75 insertions(+), 21 deletions(-)

diff --git a/src/backend/slony1_funcs.sql b/src/backend/slony1_funcs.sql
index f2cbfcb..88a0e2c 100644
--- a/src/backend/slony1_funcs.sql
+++ b/src/backend/slony1_funcs.sql
@@ -1259,6 +1259,17 @@ begin
 	if found then
 		v_restart_required:=true;
 	end if;
+	--
+	-- if this node is receiving a subscription from the backup node
+	-- with a failed node as the provider we need to fix this.
+	update @NAMESPACE@.sl_subscribe set
+		sub_provider=p_backup_node
+		from @NAMESPACE@.sl_set
+		where set_id = sub_set
+		and set_origin=p_failed_node
+		and sub_provider = ANY(p_failed_nodes)
+		and sub_receiver=@NAMESPACE@.getLocalNodeId('_@CLUSTERNAME@');
+
 	-- ----
 	-- Terminate all connections of the failed node the hard way
 	-- ----
@@ -1443,7 +1454,7 @@ begin
 		where sub_set = v_set
 		and sub_provider=p_failed_node
 		and sub_receiver=receive_node.no_id
-		and receive_node.no_failed=false;
+		and receive_node.no_failed=false;

 	for v_row in select * from @NAMESPACE@.sl_table
 		where tab_set = v_set
@@ -1505,6 +1516,22 @@
 			end if;
 		end if;
 	end loop;
+
+	--If there are any subscriptions with
+	--the failed_node being the provider then
+	--we want to redirect those subscriptions
+	--to come from the backup node.
+	--
+	-- The backup node should be a valid
+	-- provider for all subscriptions served
+	-- by the failed node. (otherwise it
+	-- wouldn't be an allowable backup node).
+	update @NAMESPACE@.sl_subscribe
+		set sub_provider=p_backup_node
+		from @NAMESPACE@.sl_node
+		where sub_provider=p_failed_node
+		and sl_node.no_id=sub_receiver
+		and sl_node.no_failed=false;
 	update @NAMESPACE@.sl_node
 	set no_active=false
 	WHERE
@@ -2393,7 +2420,8 @@ begin
 		update @NAMESPACE@.sl_subscribe
 			set sub_provider = v_sub_last
 			where sub_set = p_set_id
-			and sub_receiver = v_sub_node;
+			and sub_receiver = v_sub_node
+			and sub_receiver <> v_sub_last;

 		v_sub_last = v_sub_node;
 		v_sub_node = v_sub_next;
@@ -5034,9 +5062,12 @@ begin
 	-- we use for this origin. We are a cascaded subscriber
 	-- for sets from this node.
 	else
-		if exists (select true from @NAMESPACE@.sl_set, @NAMESPACE@.sl_subscribe
+		if exists (select true from @NAMESPACE@.sl_set, @NAMESPACE@.sl_subscribe,
+			@NAMESPACE@.sl_node provider
 			where set_origin = v_row.origin
 			and sub_set = set_id
+			and sub_provider=provider.no_id
+			and provider.no_failed = false
 			and sub_receiver = v_row.receiver
 			and sub_active) then

@@ -5056,20 +5087,21 @@
 	if v_row.failed then

 	--for every failed node we delete all sl_listen entries
-	--except via providers (listed in sl_subscribe).
+	--except via providers (listed in sl_subscribe)
+	--or failover candidates (sl_failover_targets)
 	--we do this to prevent a non-failover candidate
 	--that is more ahead of the failover candidate from
 	--sending events to the failover candidate that
 	--are 'too far ahead'
 	delete from @NAMESPACE@.sl_listen where
 		li_origin=v_row.origin and
-		li_receiver=v_row.receiver
+		li_receiver=v_row.receiver
 		and li_provider not in
-		(select sub_provider from
-		@NAMESPACE@.sl_subscribe,
-		@NAMESPACE@.sl_set where
-		sub_set=set_id
-		and set_origin=v_row.origin);
+		(select sub_provider from
+		@NAMESPACE@.sl_subscribe,
+		@NAMESPACE@.sl_set where
+		sub_set=set_id
+		and set_origin=v_row.origin);
 	end if;
 	-- insert into @NAMESPACE@.sl_listen
 	--	(li_origin,li_provider,li_receiver)
diff --git a/src/slonik/slonik.c b/src/slonik/slonik.c
index 176890d..cf6215f 100644
--- a/src/slonik/slonik.c
+++ b/src/slonik/slonik.c
@@ -2987,7 +2987,7 @@ slonik_failed_node(SlonikStmt_failed_node * stmt)
 				 " on (sl_node.no_id=sl_failover_targets.backup_id "
 				 " and set_origin=%d )"
 				 " where no_id not in ( %s ) "
-				 " and backup_id not in ( %s ) "
+				 " and ( backup_id not in ( %s ) or backup_id is null) "
 				 " order by no_id; ",
 				 stmt->hdr.script->clustername,
 				 stmt->hdr.script->clustername,
@@ -3069,7 +3069,8 @@ slonik_failed_node(SlonikStmt_failed_node * stmt)
 			rc = -1;
 			goto cleanup;
 		}
-		if (PQgetvalue(res1, i, 0) != NULL)
+
+		if (! PQgetisnull(res1, i, 1) )
 		{
 			nodeinfo[i].failover_candidate = true;
 		}
@@ -3374,7 +3375,7 @@ fail_node_promote(SlonikStmt_failed_node * stmt,
 				  SlonDString * failed_node_list)
 {
 	int64		max_seqno = 0;
-	int			max_node_idx = 0;
+	int			max_node_idx = -1;
 	int			backup_idx = 0;
 	char		ev_seqno_c[64];
 	SlonDString query;
@@ -3383,7 +3384,8 @@ fail_node_promote(SlonikStmt_failed_node * stmt,
 	PGresult   *res1;
 	SlonikAdmInfo *adminfo1;
 	SlonikStmt_wait_event wait_event;
-
+	int64		backup_node_seqno = 0;
+

 	dstring_init(&query);

@@ -3396,10 +3398,9 @@ fail_node_promote(SlonikStmt_failed_node * stmt,

 		int64		ev_seqno;

-		if (!nodeinfo[i].failover_candidate)
-			continue;
-		if (nodeinfo[i].no_id == node_entry->backup_node)
-			backup_idx = i;
+		//if (!nodeinfo[i].failover_candidate)
+		//	continue;
+
 		slon_mkquery(&query,
 					 "select max(ev_seqno) "
 					 " from \"_%s\".sl_event "
@@ -3414,9 +3415,14 @@ fail_node_promote(SlonikStmt_failed_node * stmt,
 			goto cleanup;
 		}
 		slon_scanint64(PQgetvalue(res1, 0, 0), &ev_seqno);
-
+		if (nodeinfo[i].no_id == node_entry->backup_node)
+		{
+			backup_idx = i;
+			backup_node_seqno = ev_seqno;
+		}
 		nodeinfo[i].max_seqno = ev_seqno;
-		if (nodeinfo[i].max_seqno > max_seqno)
+
+		if (nodeinfo[i].max_seqno > max_seqno && nodeinfo[i].failover_candidate )
 		{
 			max_seqno = nodeinfo[i].max_seqno;
 			max_node_idx = i;
@@ -3424,18 +3430,34 @@
 		PQclear(res1);
 	}

+	if( max_node_idx == -1)
+	{
+		/**
+		 * no maximum ahead node was found.
+		 */
+	}
+
 	if (nodeinfo[max_node_idx].no_id != node_entry->backup_node)
 	{
 		if (nodeinfo[max_node_idx].max_seqno == nodeinfo[backup_idx].max_seqno)
 			max_node_idx = backup_idx;
 	}

-	adminfo1 = nodeinfo[max_node_idx].adminfo;
+
+
 	/*
 	 * Now execute all FAILED_NODE events on the most ahead candidate
+	 *
+	 * If there is no failover candidate we use the requested backup node.
 	 */
+	if(max_node_idx < 0)
+	{
+		max_node_idx = backup_idx;
+		max_seqno = backup_node_seqno;
+	}
+	adminfo1 = nodeinfo[max_node_idx].adminfo;

 	sprintf(ev_seqno_c, INT64_FORMAT, max_seqno);

 	slon_mkquery(&query,
 				 "lock table \"_%s\".sl_event_lock, \"_%s\".sl_config_lock;"
-- 
1.7.10.4
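
To make the sl_subscribe redirection that failedNode() gains in this patch
easier to follow, here is a small self-contained sketch of the same idea
against throwaway temp tables.  It is not part of the patch: the table layout
is reduced to just the columns the UPDATE touches, and the node ids
(1 = failed origin, 2 = backup node, 3 = healthy receiver, 4 = failed
receiver) are invented for the example.

  -- hypothetical illustration only: re-point subscriptions fed by the failed
  -- node at the backup node, but only for receivers that have not themselves failed
  create temp table sl_node (no_id int primary key, no_failed boolean);
  create temp table sl_subscribe (sub_set int, sub_provider int, sub_receiver int);

  insert into sl_node values (1, true), (2, false), (3, false), (4, true);
  insert into sl_subscribe values
      (1, 1, 3),    -- healthy receiver 3 was fed by failed node 1
      (1, 1, 4);    -- receiver 4 has failed as well; leave its row alone

  update sl_subscribe
      set sub_provider = 2              -- plays the role of p_backup_node
      from sl_node
      where sub_provider = 1            -- plays the role of p_failed_node
      and sl_node.no_id = sub_receiver
      and sl_node.no_failed = false;

  select * from sl_subscribe order by sub_receiver;
  -- (1, 2, 3): receiver 3 now pulls from the backup node
  -- (1, 1, 4): the failed receiver is left untouched

The second UPDATE the patch adds to failedNode() does this against the real
catalog for every non-failed receiver, while the first added UPDATE (guarded
by getLocalNodeId and ANY(p_failed_nodes)) covers the case where the local
node itself is still subscribed through one of the failed nodes.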