<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (8.20.10#820010-sha1:ace47f9899e9ee25d7157d59aa17ab06aee30d3d) at Wed Feb 07 19:56:09 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>OpenDaylight JIRA</title>
    <link>https://jira.opendaylight.org</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>8.20.10</version>
        <build-number>820010</build-number>
        <build-date>22-06-2022</build-date>
    </build-info>


<item>
            <title>[CONTROLLER-1675] Leadership transfer failed: Follower is not ready to become leader</title>
                <link>https://jira.opendaylight.org/browse/CONTROLLER-1675</link>
                <project id="10113" key="CONTROLLER">controller</project>
                    <description>&lt;p&gt;This happened &lt;span class=&quot;error&quot;&gt;&amp;#91;0&amp;#93;&lt;/span&gt; in CSIT for module-based shard with tell-based protocol. Karaf.log &lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt; shows large number of warnings such as:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2017-05-14 06:18:34,666 | WARN | lt-dispatcher-22 | FrontendClientMetadataBuilder | 199 - org.opendaylight.controller.sal-distributed-datastore - 1.5.0.Carbon | member-1-shard-default-config: Unknown history for purged transaction member-3-datastore-config-fe-0-chn-2-txn-1522-0, ignoring
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Here is the error and five infos before it:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; 2017-05-14 06:18:34,674 | INFO | lt-dispatcher-28 | ShardManager | 199 - org.opendaylight.controller.sal-distributed-datastore - 1.5.0.Carbon | shard-manager-config Received follower initial sync status for member-1-shard-default-config status sync done false
 2017-05-14 06:18:34,716 | INFO | ult-dispatcher-2 | ShardManager | 199 - org.opendaylight.controller.sal-distributed-datastore - 1.5.0.Carbon | shard-manager-config Received follower initial sync status for member-1-shard-default-config status sync done true
 2017-05-14 06:18:34,716 | INFO | ult-dispatcher-2 | ShardManager | 199 - org.opendaylight.controller.sal-distributed-datastore - 1.5.0.Carbon | shard-manager-config Received follower initial sync status for member-1-shard-default-config status sync done false
 2017-05-14 06:18:34,724 | INFO | lt-dispatcher-29 | ShardManager | 199 - org.opendaylight.controller.sal-distributed-datastore - 1.5.0.Carbon | shard-manager-config Received follower initial sync status for member-1-shard-default-config status sync done true
 2017-05-14 06:18:34,724 | INFO | lt-dispatcher-29 | ShardManager | 199 - org.opendaylight.controller.sal-distributed-datastore - 1.5.0.Carbon | shard-manager-config: Received LeaderStateChanged message: LeaderStateChanged [memberId=member-1-shard-default-config, leaderId=null, leaderPayloadVersion=5]
 2017-05-14 06:18:44,695 | ERROR | lt-dispatcher-21 | ClusterAdminRpcService | 201 - org.opendaylight.controller.sal-cluster-admin-impl - 1.5.0.Carbon | Leadership transfer failed for shard default.
 org.opendaylight.controller.cluster.raft.LeadershipTransferFailedException: Failed to transfer leadership to member-1-shard-default-config. Follower is not ready to become leader
 at org.opendaylight.controller.cluster.raft.RaftActor$1.onFailure(RaftActor.java:304)
 at org.opendaylight.controller.cluster.raft.RaftActorLeadershipTransferCohort.finish(RaftActorLeadershipTransferCohort.java:187)
 at org.opendaylight.controller.cluster.raft.RaftActorLeadershipTransferCohort.abortTransfer(RaftActorLeadershipTransferCohort.java:138)
 at org.opendaylight.controller.cluster.raft.RaftActorLeadershipTransferCohort$1.doCancel(RaftActorLeadershipTransferCohort.java:112)
 at org.opendaylight.controller.cluster.raft.TimedRunnable.cancel(TimedRunnable.java:47)
 at org.opendaylight.controller.cluster.raft.TimedRunnable.lambda$new$0(TimedRunnable.java:33)
 at org.opendaylight.controller.cluster.raft.RaftActor.handleCommand(RaftActor.java:264)
 at org.opendaylight.controller.cluster.common.actor.AbstractUntypedPersistentActor.onReceiveCommand(AbstractUntypedPersistentActor.java:31)
 at akka.persistence.UntypedPersistentActor.onReceive(PersistentActor.scala:170)
 at org.opendaylight.controller.cluster.common.actor.MeteringBehavior.apply(MeteringBehavior.java:104)
 at akka.actor.ActorCell$$anonfun$become$1.applyOrElse(ActorCell.scala:544)
 at akka.actor.Actor$class.aroundReceive(Actor.scala:497)
 at akka.persistence.UntypedPersistentActor.akka$persistence$Eventsourced$$super$aroundReceive(PersistentActor.scala:168)
 at akka.persistence.Eventsourced$$anon$1.stateReceive(Eventsourced.scala:664)
 at akka.persistence.Eventsourced$class.aroundReceive(Eventsourced.scala:183)
 at akka.persistence.UntypedPersistentActor.aroundReceive(PersistentActor.scala:168)
 at akka.actor.ActorCell.receiveMessage(ActorCell.scala:526)
 at akka.actor.ActorCell.invoke(ActorCell.scala:495)
 at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:257)
 at akka.dispatch.Mailbox.run(Mailbox.scala:224)
 at akka.dispatch.Mailbox.exec(Mailbox.scala:234)
 at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
 at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
 at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
 at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;0&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://logs.opendaylight.org/releng/jenkins092/controller-csit-3node-clustering-only-carbon/684/archives/log.html.gz#s1-s29-t3-k2-k5-k3-k1-k4-k7-k1&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/jenkins092/controller-csit-3node-clustering-only-carbon/684/archives/log.html.gz#s1-s29-t3-k2-k5-k3-k1-k4-k7-k1&lt;/a&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://logs.opendaylight.org/releng/jenkins092/controller-csit-3node-clustering-only-carbon/684/archives/odl1_karaf.log.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/jenkins092/controller-csit-3node-clustering-only-carbon/684/archives/odl1_karaf.log.gz&lt;/a&gt;&lt;/p&gt;</description>
                <environment>&lt;p&gt;Operating System: All&lt;br/&gt;
Platform: All&lt;/p&gt;</environment>
        <key id="26229">CONTROLLER-1675</key>
            <summary>Leadership transfer failed: Follower is not ready to become leader</summary>
                <type id="10104" iconUrl="https://jira.opendaylight.org/secure/viewavatar?size=xsmall&amp;avatarId=10303&amp;avatarType=issuetype">Bug</type>
                                                <status id="10003" iconUrl="https://jira.opendaylight.org/images/icons/status_generic.gif" description="">Confirmed</status>
                    <statusCategory id="2" key="new" colorName="blue-gray"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="-1">Unassigned</assignee>
                                    <reporter username="vrpolak">Vratko Polak</reporter>
                        <labels>
                    </labels>
                <created>Mon, 15 May 2017 10:51:38 +0000</created>
                <updated>Tue, 25 Jul 2023 08:24:37 +0000</updated>
                                                                            <component>clustering</component>
                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                                                                <comments>
                            <comment id="52210" author="jmorvay@cisco.com" created="Mon, 15 May 2017 11:16:53 +0000"  >&lt;p&gt;I will try to look into logs for more details, but generally I would suggest to retry requests to change shard leadership to local node in case of above-mentioned failure. There are some scenarios where we cannot grant leadership to some follower node, because the follower node is not up to date with current leader.&lt;/p&gt;</comment>
                            <comment id="52211" author="rovarga" created="Mon, 15 May 2017 14:42:44 +0000"  >&lt;p&gt;This is a problem of the follower not being fully caught up. As a first step create a dedicated subclass of LeadershipTransferFailedException, which will indicate that the followers are not fully caught up.&lt;/p&gt;</comment>
                            <comment id="52212" author="jmorvay@cisco.com" created="Wed, 17 May 2017 13:21:17 +0000"  >&lt;p&gt;Can you please provide debug logs for org.opendaylight.controller.cluster.raft?&lt;/p&gt;</comment>
                            <comment id="52213" author="anipbu" created="Wed, 17 May 2017 18:18:32 +0000"  >&lt;p&gt;We are looking to build Carbon RC2 tomorrow 5/18 at 23:59 UTC time assuming there are no blocker bugs.  Is there an ETA for when a fix can be merged and this bug resolved for stable/carbon branch?&lt;/p&gt;</comment>
                            <comment id="52214" author="jmorvay@cisco.com" created="Thu, 18 May 2017 07:27:25 +0000"  >&lt;p&gt;(In reply to A H from comment #4)&lt;br/&gt;
&amp;gt; We are looking to build Carbon RC2 tomorrow 5/18 at 23:59 UTC time assuming&lt;br/&gt;
&amp;gt; there are no blocker bugs.  Is there an ETA for when a fix can be merged and&lt;br/&gt;
&amp;gt; this bug resolved for stable/carbon branch?&lt;/p&gt;

&lt;p&gt;I am still analyzing this, so I cannot give you ETA yet. But I will try to do my best to update you with ETA and possibly resolve this today.&lt;/p&gt;</comment>
                            <comment id="52215" author="jmorvay@cisco.com" created="Fri, 19 May 2017 08:34:24 +0000"  >&lt;p&gt;We cannot reproduce this anymore on latest builds.&lt;/p&gt;</comment>
                            <comment id="52216" author="vrpolak" created="Mon, 22 May 2017 07:14:48 +0000"  >&lt;p&gt;This happened again &lt;span class=&quot;error&quot;&gt;&amp;#91;2&amp;#93;&lt;/span&gt;. Looks like a heisenbug.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;2&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://logs.opendaylight.org/releng/jenkins092/controller-csit-3node-clustering-only-carbon/715/archives/log.html.gz#s1-s24-t5-k2-k7-k3-k1-k4-k7-k1&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/jenkins092/controller-csit-3node-clustering-only-carbon/715/archives/log.html.gz#s1-s24-t5-k2-k7-k3-k1-k4-k7-k1&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="52217" author="rovarga" created="Mon, 22 May 2017 09:18:15 +0000"  >&lt;p&gt;At least some of this seems to be timing-dependent:&lt;/p&gt;

&lt;p&gt;2017-05-22 00:46:21,801 | INFO  | ult-dispatcher-2 | ClientActorBehavior              | 197 - org.opendaylight.controller.cds-access-client - 1.1.0.Carbon | member-2-frontend-datastore-config: refreshing backend for shard 0&lt;br/&gt;
2017-05-22 00:46:25,407 | ERROR | ult-dispatcher-2 | ClusterAdminRpcService           | 201 - org.opendaylight.controller.sal-cluster-admin-impl - 1.5.0.Carbon | Leadership transfer failed for shard default.&lt;br/&gt;
org.opendaylight.controller.cluster.raft.LeadershipTransferFailedException: Failed to transfer leadership to member-2-shard-default-config. Follower is not ready to become leader&lt;br/&gt;
        at org.opendaylight.controller.cluster.raft.RaftActor$1.onFailure(RaftActor.java:304)&lt;br/&gt;
        at org.opendaylight.controller.cluster.raft.RaftActor$1.onSuccess(RaftActor.java:294)&lt;br/&gt;
        at org.opendaylight.controller.cluster.raft.RaftActorLeadershipTransferCohort.finish(RaftActorLeadershipTransferCohort.java:185)&lt;br/&gt;
        at org.opendaylight.controller.cluster.raft.RaftActorLeadershipTransferCohort.lambda$transferComplete$0(RaftActorLeadershipTransferCohort.java:159)&lt;br/&gt;
        at org.opendaylight.controller.cluster.raft.RaftActor.handleCommand(RaftActor.java:264)&lt;br/&gt;
        at org.opendaylight.controller.cluster.common.actor.AbstractUntypedPersistentActor.onReceiveCommand(AbstractUntypedPersistentActor.java:31)&lt;br/&gt;
        at akka.persistence.UntypedPersistentActor.onReceive(PersistentActor.scala:170)&lt;br/&gt;
        at org.opendaylight.controller.cluster.common.actor.MeteringBehavior.apply(MeteringBehavior.java:104)&lt;br/&gt;
        at akka.actor.ActorCell$$anonfun$become$1.applyOrElse(ActorCell.scala:544)&lt;br/&gt;
        at akka.actor.Actor$class.aroundReceive(Actor.scala:497)&lt;br/&gt;
        at akka.persistence.UntypedPersistentActor.akka$persistence$Eventsourced$$super$aroundReceive(PersistentActor.scala:168)&lt;br/&gt;
        at akka.persistence.Eventsourced$$anon$1.stateReceive(Eventsourced.scala:664)&lt;br/&gt;
        at akka.persistence.Eventsourced$class.aroundReceive(Eventsourced.scala:183)&lt;br/&gt;
        at akka.persistence.UntypedPersistentActor.aroundReceive(PersistentActor.scala:168)&lt;br/&gt;
        at akka.actor.ActorCell.receiveMessage(ActorCell.scala:526)&lt;br/&gt;
        at akka.actor.ActorCell.invoke(ActorCell.scala:495)&lt;br/&gt;
        at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:257)&lt;br/&gt;
        at akka.dispatch.Mailbox.run(Mailbox.scala:224)&lt;br/&gt;
        at akka.dispatch.Mailbox.exec(Mailbox.scala:234)&lt;br/&gt;
        at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)&lt;br/&gt;
        at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)&lt;br/&gt;
        at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)&lt;br/&gt;
        at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)&lt;br/&gt;
2017-05-22 00:46:25,408 | INFO  | ult-dispatcher-2 | EmptyLocalActorRef               | 174 - com.typesafe.akka.slf4j - 2.4.17 | Message &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.actor.Status$Success&amp;#93;&lt;/span&gt; from Actor&lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.29.12.44:2550/user/shardmanager-config/member-1-shard-default-config#1342993691&amp;#93;&lt;/span&gt; to Actor&lt;span class=&quot;error&quot;&gt;&amp;#91;akka://opendaylight-cluster-data/temp/$g&amp;#93;&lt;/span&gt; was not delivered. &lt;span class=&quot;error&quot;&gt;&amp;#91;2&amp;#93;&lt;/span&gt; dead letters encountered. This logging can be turned off or adjusted with configuration settings &apos;akka.log-dead-letters&apos; and &apos;akka.log-dead-letters-during-shutdown&apos;.&lt;br/&gt;
2017-05-22 00:46:25,419 | INFO  | lt-dispatcher-21 | Shard                            | 192 - org.opendaylight.controller.sal-clustering-commons - 1.5.0.Carbon | member-2-shard-default-config (Candidate): Starting new election term 3&lt;/p&gt;

&lt;p&gt;I.e. just a millisecond after we time out, we are getting a success message and the target actor is initiating new elections.&lt;/p&gt;

&lt;p&gt;I think RequestLeadership message should include a timeout (or absolute deadline) by which the transfer needs to occur.&lt;/p&gt;</comment>
                            <comment id="52218" author="colin@colindixon.com" created="Mon, 22 May 2017 19:03:11 +0000"  >&lt;p&gt;Is this only using the tell-based protocol? Also, do we think this is a regression from earlier releases or not? I&apos;m trying to assess if this is really something we want to hold Carbon up for.&lt;/p&gt;</comment>
                            <comment id="52219" author="jmorvay@cisco.com" created="Tue, 23 May 2017 08:32:43 +0000"  >&lt;p&gt;(In reply to Colin Dixon from comment #9)&lt;br/&gt;
&amp;gt; Is this only using the tell-based protocol? Also, do we think this is a&lt;br/&gt;
&amp;gt; regression from earlier releases or not? I&apos;m trying to assess if this is&lt;br/&gt;
&amp;gt; really something we want to hold Carbon up for.&lt;/p&gt;

&lt;p&gt;This is new functionality, so no regression here. Downgrading importance to normal.&lt;/p&gt;</comment>
                            <comment id="52220" author="rovarga" created="Tue, 23 May 2017 11:30:06 +0000"  >&lt;p&gt;Well, it still affects leader transfer on shutdown &amp;#8211; which is not new functionality.&lt;/p&gt;

&lt;p&gt;Anyway after thinking about this a bit more, I think the problem comes from the timeout value. The code is:&lt;/p&gt;

&lt;p&gt;        // We&apos;ll wait an election timeout period for a new leader to be elected plus some cushion to take into&lt;br/&gt;
        // account the variance.&lt;br/&gt;
        final long electionTimeout = raftActor.getRaftActorContext().getConfigParams()&lt;br/&gt;
                .getElectionTimeOutInterval().toMillis();&lt;br/&gt;
        final int variance = raftActor.getRaftActorContext().getConfigParams().getElectionTimeVariance();&lt;br/&gt;
        newLeaderTimeoutInMillis = electionTimeout + variance * 2;&lt;/p&gt;

&lt;p&gt;This would be fine if the follower started acting on the request immediately, but under load the follower mailbox can have quite a few requests outstanding, and will not start acting on our request until it has completed going through them &amp;#8211; which introduces additional delay which we need to account for.&lt;/p&gt;

&lt;p&gt;The hard part is figuring out how much delay is tolerable, for which there is no hard-n-fast rule. What we need to ensure is that the target client knows that it needs to initiate elections and then that they were kicked off &amp;#8211; and the only tools available to us are RaftRPCs.&lt;/p&gt;

&lt;p&gt;Hence I think this really needs to be a two-step process involving the journal, as we need to know when the target follower has received the message and when it is expected to start acting on it.&lt;/p&gt;

&lt;p&gt;A first-cut idea is to store a journal entry and follow what happens to its replication &amp;#8211; starting current timer only after the target follower has appended it in its journal... but that needs more thought.&lt;/p&gt;</comment>
                            <comment id="52221" author="tpantelis" created="Tue, 23 May 2017 11:44:57 +0000"  >&lt;p&gt;I&apos;m not sure appending a special journal entry is necessary just to transfer leadership. Transferring leadership on shutdown is really not something that needs to be done or completed, hence the timeout - it&apos;s a best effort to speed up new leader election but we don&apos;t want to hold up shutdown indefinitely. If transfer doesn&apos;t complete then a new leader will eventually get elected anyway once a follower selection time out occurs.&lt;/p&gt;</comment>
                            <comment id="52222" author="rovarga" created="Tue, 23 May 2017 12:22:03 +0000"  >&lt;p&gt;That is true for the shutdown case, but not for the pure transfer case. What we are seeing is that the attempt to transfer indicates a failure (follower not ready) after which (milliseconds later) the follower actually initiates new election.&lt;/p&gt;

&lt;p&gt;Since this is exposed to applications via an asynchronous API, indicating a false negative is harmful, certainly more than a longer transfer period.&lt;/p&gt;</comment>
                            <comment id="52223" author="colin@colindixon.com" created="Tue, 23 May 2017 14:19:27 +0000"  >&lt;p&gt;Really simple question: is not using the tell-based protocol a workaround?&lt;/p&gt;</comment>
                            <comment id="52224" author="colin@colindixon.com" created="Tue, 23 May 2017 14:20:53 +0000"  >&lt;p&gt;&amp;gt; Well, it still affects leader transfer on shutdown &amp;#8211; which is not new functionality.&lt;/p&gt;

&lt;p&gt;While not new functionality, is this is a regression from prior releases or just a new bug we&apos;ve found that was present before?&lt;/p&gt;</comment>
                            <comment id="52225" author="tpantelis" created="Tue, 23 May 2017 14:21:48 +0000"  >&lt;p&gt;It doesn&apos;t matter if it&apos;s a requested transfer or not. It&apos;s still a best effort - no guarantee to fulfill the request - it still needs to be time-boxed. If it&apos;s a matter of it taking longer then expected when under load then the test needs to take that into account, ie wait longer or try the request again.&lt;/p&gt;

&lt;p&gt;Regardless, this does not need to block carbon - there has been no change in behavior or regression - it&apos;s just that no one has tested leadership transfer under load with certain expectations.&lt;/p&gt;</comment>
                            <comment id="52226" author="tpantelis" created="Tue, 23 May 2017 14:33:25 +0000"  >&lt;p&gt;(In reply to Colin Dixon from comment #15)&lt;br/&gt;
&amp;gt; &amp;gt; Well, it still affects leader transfer on shutdown &amp;#8211; which is not new functionality.&lt;br/&gt;
&amp;gt; &lt;br/&gt;
&amp;gt; While not new functionality, is this is a regression from prior releases or&lt;br/&gt;
&amp;gt; just a new bug we&apos;ve found that was present before?&lt;/p&gt;

&lt;p&gt;It&apos;s not a regression nor is it really a bug with leadership transfer. As I&apos;ve mentioned it&apos;s a best effort with a deadline. Maybe there&apos;s things we can do elsewhere to eliminate or lessen the chance for failure in the test case (eg switch to artery) but, for now, we probably need to lower expectations accordingly in the test.&lt;/p&gt;</comment>
                            <comment id="52227" author="rovarga" created="Tue, 23 May 2017 14:48:35 +0000"  >&lt;p&gt;Well, the severity is already normal, so that should clear up the Carbon angst.&lt;/p&gt;

&lt;p&gt;Tom, I am not arguing the shutdown case, although it may be interesting to reduce the churn potential even then.&lt;/p&gt;

&lt;p&gt;What I am arguing is the switchover case, which is a process exposed to application developers. The use case there is per-OF-switch-shard, which you want to have co-located with the active connection.&lt;/p&gt;

&lt;p&gt;My concern here is what the application sees and how it can interpret things. As things stand right now, we tell the application that the movement has failed (authoritatively) and milliseconds after that the move is initiated, but does not complete (I think).&lt;/p&gt;

&lt;p&gt;Now from end user perspective, the request to move a leader should be non-disruptive, i.e. it should be able to complete even under load. The way I see this happening is:&lt;/p&gt;

&lt;p&gt;1) OF switch connects&lt;br/&gt;
2) OFP requests leadership move&lt;br/&gt;
3) OFP starts dumping data into the DS&lt;br/&gt;
4) ... at some point later leadership move completes&lt;/p&gt;

&lt;p&gt;Now it is up to OFP to decide what to do if the move fails, but certainly another attempt is a valid choice &amp;#8211; which would mean that OFP is going to retry, further cause backend election churn &amp;#8211; slowing things down.&lt;/p&gt;

&lt;p&gt;Hence I would really like to provide a result which is authoritative: the move will not happen and there will be no further artefacts on the backend due to the attempt. Or the movement succeeds in due time &amp;#8211; if the app is willing to wait for the result, it can take minutes for all it cares. Since the process is based on a CompletionStage, the app can wait for a certain time for an authoritative response, or decide that it no longer cares (and enters the unknown territory).&lt;/p&gt;</comment>
                            <comment id="52228" author="tpantelis" created="Tue, 23 May 2017 16:05:45 +0000"  >&lt;p&gt;I assume you&apos;re referring to the &quot;CompletionStage&amp;lt;Void&amp;gt; makeLeaderLocal()&quot; API. &lt;/p&gt;

&lt;p&gt;There are several ways a transfer request can fail:&lt;/p&gt;

&lt;p&gt;1) There&apos;s no current leader to fulfill the request&lt;br/&gt;
2) The message to the leader to request leadership gets dropped and times out (eg leader node isn&apos;t available at that moment)&lt;br/&gt;
3) The leader is not able to get the requested node&apos;s log caught up within a specific period of time. This could occur if the requested node is not available or is far behind or is under a lot of load and is responding slowly.&lt;br/&gt;
4) The old leader cannot confirm that the new leader candidate actually became leader within a specific period of time. &lt;br/&gt;
5) The requested node is non-voting and thus cannot become leader (we should check for that case up front)&lt;/p&gt;

&lt;p&gt;So there&apos;s basically two phases to leadership transfer - first try to get the or some follower up-to-date such that if it initiates an election, it could win. After that, send TimeoutNow to immediately start an election. However there&apos;s still no guarantee the follower will win the election (or even receive the TimeoutNow message in a timely manner or at all). It &lt;b&gt;should&lt;/b&gt; get the prior leader&apos;s vote but if there&apos;s more than 3 nodes, it still needs a vote from at least one other node.  &lt;/p&gt;

&lt;p&gt;For 1), it could fail fast or wait for a period of time to see if a leader emerges and then retry the request (it might already do that in the ShardManager). This is an authoritative failure.&lt;/p&gt;

&lt;p&gt;For 2) and 3), it has put a deadline on it and aborted so these are authoritative failures.&lt;/p&gt;

&lt;p&gt;For 4), it may succeed and thus may not be an authoritative failure.&lt;/p&gt;

&lt;p&gt;For these failures, a client retry could succeed (we could add a flag in the exception). It also seems to make sense for the client to supply the deadline.&lt;/p&gt;

&lt;p&gt;For 5), it&apos;s an invalid request so fail fast (authoritative failure). This is a case where the move will not ever happen (unless the voting configuration is changed by the user) so no point in retrying.&lt;/p&gt;</comment>
                            <comment id="52229" author="vrpolak" created="Fri, 23 Jun 2017 13:50:48 +0000"  >&lt;p&gt;Something very similar to this happened &lt;span class=&quot;error&quot;&gt;&amp;#91;3&amp;#93;&lt;/span&gt; after considerable time of not being visible.&lt;br/&gt;
That is after a suite refactor (due to &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1711&quot; title=&quot;Listener registration lost when local replica is removed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1711&quot;&gt;&lt;del&gt;CONTROLLER-1711&lt;/del&gt;&lt;/a&gt;), so this is a first time there are three transaction writers in total, and there is a listener on the member which stays follower. (In one previous Sandbox run, this did not happen.)&lt;/p&gt;

&lt;p&gt;I do not see any undelivered messages in Karaf log of either the old leader &lt;span class=&quot;error&quot;&gt;&amp;#91;4&amp;#93;&lt;/span&gt; and expected new leader &lt;span class=&quot;error&quot;&gt;&amp;#91;5&amp;#93;&lt;/span&gt; (between movement initiation at 11:02:30,526 and giving up at 11:02:57,006), so this is not &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1717&quot; title=&quot;RequestTimeoutException due to &amp;quot;Failed to transfer leadership&amp;quot; after become-prefix-leader with RoleChangeNotification not delivered&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1717&quot;&gt;&lt;del&gt;CONTROLLER-1717&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;But I see multiple out-of-sync lines on the expected new leader, and there is also sync status flapping I did not expect after &lt;span class=&quot;error&quot;&gt;&amp;#91;6&amp;#93;&lt;/span&gt;, so the new failure fits comments of this Bug.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;3&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://logs.opendaylight.org/releng/jenkins092/controller-csit-3node-clustering-only-carbon/759/log.html.gz#s1-s36-t3-k2-k9-k3-k1-k4-k7-k1&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/jenkins092/controller-csit-3node-clustering-only-carbon/759/log.html.gz#s1-s36-t3-k2-k9-k3-k1-k4-k7-k1&lt;/a&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;4&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://logs.opendaylight.org/releng/jenkins092/controller-csit-3node-clustering-only-carbon/759/odl3_karaf.log.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/jenkins092/controller-csit-3node-clustering-only-carbon/759/odl3_karaf.log.gz&lt;/a&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;5&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://logs.opendaylight.org/releng/jenkins092/controller-csit-3node-clustering-only-carbon/759/odl1_karaf.log.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/jenkins092/controller-csit-3node-clustering-only-carbon/759/odl1_karaf.log.gz&lt;/a&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;6&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://bugs.opendaylight.org/show_bug.cgi?id=8618#c11&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://bugs.opendaylight.org/show_bug.cgi?id=8618#c11&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="52230" author="vrpolak" created="Mon, 18 Sep 2017 11:46:00 +0000"  >&lt;p&gt;Lowering severity to Minor, as no failures of this kind have been seen in around two months.&lt;/p&gt;

&lt;p&gt;From the comments I have a feeling such failures are still possible, in suites with greater data churn.&lt;/p&gt;</comment>
                            <comment id="70115" author="rovarga" created="Sat, 6 Nov 2021 11:06:31 +0000"  >&lt;p&gt;The unknown history part might be caused by &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1942&quot; title=&quot;DeadTransactionException during initialization&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1942&quot;&gt;&lt;del&gt;CONTROLLER-1942&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10000">
                    <name>Blocks</name>
                                                                <inwardlinks description="is blocked by">
                                        <issuelink>
            <issuekey id="26066">CONTROLLER-1512</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                            <customfield id="customfield_11400" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10208" key="com.atlassian.jira.plugin.system.customfieldtypes:textfield">
                        <customfieldname>External issue ID</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>8446</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10201" key="com.atlassian.jira.plugin.system.customfieldtypes:url">
                        <customfieldname>External issue URL</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue><![CDATA[https://bugs.opendaylight.org/show_bug.cgi?id=8446]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10206" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Issue Type</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10305"><![CDATA[Improvement]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10000" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>0|i02s1b:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>