From e1f05831c7418d4ca8e92ab16e8c6fba9938fbd2 Mon Sep 17 00:00:00 2001
From: Steve Singer
Date: Sun, 11 May 2014 21:37:45 -0400
Subject: [PATCH] Bug 342 - Fix various failover issues

This commit fixes several related failover issues:

1. The query slonik used to get a node list during failover ignored
   non-failover targets.  This meant that we would never call preFailover
   on a node that wasn't a failover target, and that node might then still
   be trying to use a failed provider.

2. A node (say node 3) might process a FAILOVER_NODE event from node 2
   (the new origin) but might not yet have processed the last event from
   node 1, the failed node.  We need to make sure that the subscription
   paths and listen networks are updated so that we do not listen for
   events and data only via failed nodes.
---
 src/backend/slony1_funcs.sql |   52 ++++++++++++++++++++++++++++++++++--------
 src/slonik/slonik.c          |   44 ++++++++++++++++++++++++++---------
 2 files changed, 75 insertions(+), 21 deletions(-)

diff --git a/src/backend/slony1_funcs.sql b/src/backend/slony1_funcs.sql
index f2cbfcb..88a0e2c 100644
--- a/src/backend/slony1_funcs.sql
+++ b/src/backend/slony1_funcs.sql
@@ -1259,6 +1259,17 @@ begin
 	if found then
 		v_restart_required:=true;
 	end if;
+	--
+	-- if this node is receiving a subscription from the backup node
+	-- with a failed node as the provider we need to fix this.
+	update @NAMESPACE@.sl_subscribe set
+		sub_provider=p_backup_node
+		from @NAMESPACE@.sl_set
+		where set_id = sub_set
+		and set_origin=p_failed_node
+		and sub_provider = ANY(p_failed_nodes)
+		and sub_receiver=@NAMESPACE@.getLocalNodeId('_@CLUSTERNAME@');
+
 	-- ----
 	-- Terminate all connections of the failed node the hard way
 	-- ----
@@ -1443,7 +1454,7 @@ begin
 		where sub_set = v_set
 		and sub_provider=p_failed_node
 		and sub_receiver=receive_node.no_id
-		and receive_node.no_failed=false;
+		and receive_node.no_failed=false;

 	for v_row in select * from @NAMESPACE@.sl_table
 		where tab_set = v_set
@@ -1505,6 +1516,22 @@
 			end if;
 		end if;
 	end loop;
+
+	--If there are any subscriptions with
+	--the failed_node being the provider then
+	--we want to redirect those subscriptions
+	--to come from the backup node.
+	--
+	-- The backup node should be a valid
+	-- provider for all subscriptions served
+	-- by the failed node. (otherwise it
+	-- wouldn't be an allowable backup node).
+	update @NAMESPACE@.sl_subscribe
+		set sub_provider=p_backup_node
+		from @NAMESPACE@.sl_node
+		where sub_provider=p_failed_node
+		and sl_node.no_id=sub_receiver
+		and sl_node.no_failed=false;
 	update @NAMESPACE@.sl_node
 	set no_active=false
 	WHERE
@@ -2393,7 +2420,8 @@ begin
 		update @NAMESPACE@.sl_subscribe
 			set sub_provider = v_sub_last
 			where sub_set = p_set_id
-			and sub_receiver = v_sub_node;
+			and sub_receiver = v_sub_node
+			and sub_receiver <> v_sub_last;

 		v_sub_last = v_sub_node;
 		v_sub_node = v_sub_next;
@@ -5034,9 +5062,12 @@ begin
 	-- we use for this origin. We are a cascaded subscriber
 	-- for sets from this node.
 	else
-		if exists (select true from @NAMESPACE@.sl_set, @NAMESPACE@.sl_subscribe
+		if exists (select true from @NAMESPACE@.sl_set, @NAMESPACE@.sl_subscribe,
+			@NAMESPACE@.sl_node provider
 			where set_origin = v_row.origin
 			and sub_set = set_id
+			and sub_provider=provider.no_id
+			and provider.no_failed = false
 			and sub_receiver = v_row.receiver
 			and sub_active) then

@@ -5056,20 +5087,21 @@
 	if v_row.failed then

 	--for every failed node we delete all sl_listen entries
-	--except via providers (listed in sl_subscribe).
+	--except via providers (listed in sl_subscribe)
+	--or failover candidates (sl_failover_targets)
 	--we do this to prevent a non-failover candidate
 	--that is more ahead of the failover candidate from
 	--sending events to the failover candidate that
 	--are 'too far ahead'
 	delete from @NAMESPACE@.sl_listen where
 		li_origin=v_row.origin and
-		li_receiver=v_row.receiver
+		li_receiver=v_row.receiver
 		and li_provider not in
-		(select sub_provider from
-		@NAMESPACE@.sl_subscribe,
-		@NAMESPACE@.sl_set where
-		sub_set=set_id
-		and set_origin=v_row.origin);
+		(select sub_provider from
+		@NAMESPACE@.sl_subscribe,
+		@NAMESPACE@.sl_set where
+		sub_set=set_id
+		and set_origin=v_row.origin);
 	end if;
 	-- insert into @NAMESPACE@.sl_listen
 	--	(li_origin,li_provider,li_receiver)
diff --git a/src/slonik/slonik.c b/src/slonik/slonik.c
index 176890d..cf6215f 100644
--- a/src/slonik/slonik.c
+++ b/src/slonik/slonik.c
@@ -2987,7 +2987,7 @@ slonik_failed_node(SlonikStmt_failed_node * stmt)
 				 " on (sl_node.no_id=sl_failover_targets.backup_id "
 				 " and set_origin=%d )"
 				 " where no_id not in ( %s ) "
-				 " and backup_id not in ( %s ) "
+				 " and ( backup_id not in ( %s ) or backup_id is null) "
 				 " order by no_id; ",
 				 stmt->hdr.script->clustername,
 				 stmt->hdr.script->clustername,
@@ -3069,7 +3069,8 @@ slonik_failed_node(SlonikStmt_failed_node * stmt)
 			rc = -1;
 			goto cleanup;
 		}
-		if (PQgetvalue(res1, i, 0) != NULL)
+
+		if (! PQgetisnull(res1, i, 1) )
 		{
 			nodeinfo[i].failover_candidate = true;
 		}
@@ -3374,7 +3375,7 @@ fail_node_promote(SlonikStmt_failed_node * stmt,
 				  SlonDString * failed_node_list)
 {
 	int64		max_seqno = 0;
-	int			max_node_idx = 0;
+	int			max_node_idx = -1;
 	int			backup_idx = 0;
 	char		ev_seqno_c[64];
 	SlonDString query;
@@ -3383,7 +3384,8 @@ fail_node_promote(SlonikStmt_failed_node * stmt,
 	PGresult   *res1;
 	SlonikAdmInfo *adminfo1;
 	SlonikStmt_wait_event wait_event;
-
+	int64		backup_node_seqno = 0;
+

 	dstring_init(&query);

@@ -3396,10 +3398,9 @@ fail_node_promote(SlonikStmt_failed_node * stmt,

 		int64		ev_seqno;

-		if (!nodeinfo[i].failover_candidate)
-			continue;
-		if (nodeinfo[i].no_id == node_entry->backup_node)
-			backup_idx = i;
+		//if (!nodeinfo[i].failover_candidate)
+		//	continue;
+
 		slon_mkquery(&query,
 					 "select max(ev_seqno) "
 					 " from \"_%s\".sl_event "
@@ -3414,9 +3415,14 @@ fail_node_promote(SlonikStmt_failed_node * stmt,
 			goto cleanup;
 		}
 		slon_scanint64(PQgetvalue(res1, 0, 0), &ev_seqno);
-
+		if (nodeinfo[i].no_id == node_entry->backup_node)
+		{
+			backup_idx = i;
+			backup_node_seqno = ev_seqno;
+		}
 		nodeinfo[i].max_seqno = ev_seqno;
-		if (nodeinfo[i].max_seqno > max_seqno)
+
+		if (nodeinfo[i].max_seqno > max_seqno && nodeinfo[i].failover_candidate )
 		{
 			max_seqno = nodeinfo[i].max_seqno;
 			max_node_idx = i;
@@ -3424,18 +3430,34 @@
 		PQclear(res1);
 	}

+	if( max_node_idx == -1)
+	{
+		/**
+		 * no maximum ahead node was found.
+		 */
+	}
+
 	if (nodeinfo[max_node_idx].no_id != node_entry->backup_node)
 	{
 		if (nodeinfo[max_node_idx].max_seqno == nodeinfo[backup_idx].max_seqno)
 			max_node_idx = backup_idx;
 	}

-	adminfo1 = nodeinfo[max_node_idx].adminfo;
+
+
 	/*
 	 * Now execute all FAILED_NODE events on the most ahead candidate
+	 *
+	 * If there is no failover candidate we use the requested backup node.
 	 */
+	if(max_node_idx < 0)
+	{
+		max_node_idx = backup_idx;
+		max_seqno = backup_node_seqno;
+	}
+	adminfo1 = nodeinfo[max_node_idx].adminfo;

 	sprintf(ev_seqno_c, INT64_FORMAT, max_seqno);

 	slon_mkquery(&query,
 				 "lock table \"_%s\".sl_event_lock, \"_%s\".sl_config_lock;"
-- 
1.7.10.4
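
To make the sl_subscribe redirection that failedNode() gains in this patch
easier to follow, here is a small self-contained sketch of the same idea
against throwaway temp tables.  It is not part of the patch: the table layout
is reduced to just the columns the UPDATE touches, and the node ids
(1 = failed origin, 2 = backup node, 3 = healthy receiver, 4 = failed
receiver) are invented for the example.

  -- hypothetical illustration only: re-point subscriptions fed by the failed
  -- node at the backup node, but only for receivers that have not themselves failed
  create temp table sl_node (no_id int primary key, no_failed boolean);
  create temp table sl_subscribe (sub_set int, sub_provider int, sub_receiver int);

  insert into sl_node values (1, true), (2, false), (3, false), (4, true);
  insert into sl_subscribe values
      (1, 1, 3),    -- healthy receiver 3 was fed by failed node 1
      (1, 1, 4);    -- receiver 4 has failed as well; leave its row alone

  update sl_subscribe
      set sub_provider = 2              -- plays the role of p_backup_node
      from sl_node
      where sub_provider = 1            -- plays the role of p_failed_node
      and sl_node.no_id = sub_receiver
      and sl_node.no_failed = false;

  select * from sl_subscribe order by sub_receiver;
  -- (1, 2, 3): receiver 3 now pulls from the backup node
  -- (1, 1, 4): the failed receiver is left untouched

The second UPDATE the patch adds to failedNode() does this against the real
catalog for every non-failed receiver, while the first added UPDATE (guarded
by getLocalNodeId and ANY(p_failed_nodes)) covers the case where the local
node itself is still subscribed through one of the failed nodes.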