<!-- 
RSS generated by JIRA (8.20.10#820010-sha1:ace47f9899e9ee25d7157d59aa17ab06aee30d3d) at Wed Feb 07 19:55:40 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>OpenDaylight JIRA</title>
    <link>https://jira.opendaylight.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>8.20.10</version>
        <build-number>820010</build-number>
        <build-date>22-06-2022</build-date>
    </build-info>


<item>
            <title>[CONTROLLER-1486] Clustering: Datastore may fail with &quot;Shard XXX has no leader. Try again later&quot;</title>
                <link>https://jira.opendaylight.org/browse/CONTROLLER-1486</link>
                <project id="10113" key="CONTROLLER">controller</project>
                    <description>&lt;p&gt;Found by clustering test run: &lt;a href=&quot;https://jenkins.opendaylight.org/releng/view/netconf/job/netconf-csit-3node-clustering-only-beryllium/53/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jenkins.opendaylight.org/releng/view/netconf/job/netconf-csit-3node-clustering-only-beryllium/53/&lt;/a&gt;&lt;br/&gt;
The relevant report is from odl2_karaf.log (see the test run artifacts or the attachment which contains a copy of the logs):&lt;/p&gt;

&lt;p&gt;2016-02-18 22:11:04,009 | WARN  | qtp862704672-67  | BrokerFacade                     | 211 - org.opendaylight.netconf.sal-rest-connector - 1.3.0.SNAPSHOT | Exception by reading OPERATIONAL via Restconf: /(urn:TBD:params:xml:ns:yang:network-topology?revision=2013-10-21)network-topology/topology/topology[&lt;/p&gt;
{(urn:TBD:params:xml:ns:yang:network-topology?revision=2013-10-21)topology-id=topology-netconf}
&lt;p&gt;]/node/node[&lt;/p&gt;
{(urn:TBD:params:xml:ns:yang:network-topology?revision=2013-10-21)node-id=netconf-test-device}
&lt;p&gt;] java.util.concurrent.ExecutionException: ReadFailedException{message=Error executeRead ReadData for path /(urn:TBD:params:xml:ns:yang:network-topology?revision=2013-10-21)network-topology/topology/topology[&lt;/p&gt;
{(urn:TBD:params:xml:ns:yang:network-topology?revision=2013-10-21)topology-id=topology-netconf}
&lt;p&gt;]/node/node[&lt;/p&gt;
{(urn:TBD:params:xml:ns:yang:network-topology?revision=2013-10-21)node-id=netconf-test-device}
&lt;p&gt;], errorList=[RpcError [message=Error executeRead ReadData for path /(urn:TBD:params:xml:ns:yang:network-topology?revision=2013-10-21)network-topology/topology/topology[&lt;/p&gt;
{(urn:TBD:params:xml:ns:yang:network-topology?revision=2013-10-21)topology-id=topology-netconf}
&lt;p&gt;]/node/node[&lt;/p&gt;
{(urn:TBD:params:xml:ns:yang:network-topology?revision=2013-10-21)node-id=netconf-test-device}
&lt;p&gt;], severity=ERROR, errorType=APPLICATION, tag=operation-failed, applicationTag=null, info=null, cause=org.opendaylight.controller.md.sal.common.api.data.DataStoreUnavailableException: Shard member-2-shard-topology-operational currently has no leader. Try again later.]]} 	at org.opendaylight.yangtools.util.concurrent.MappingCheckedFuture.wrapInExecutionException(MappingCheckedFuture.java:63)&lt;/p&gt;

&lt;p&gt;and in odl1_karaf.log (the timestamp is quite weird, according to it the error below happened 2 minutes BEFORE the error above):&lt;/p&gt;

&lt;p&gt;2016-02-18 22:09:12,905 | WARN  | lt-dispatcher-50 | ConcurrentDOMDataBroker          | 143 - org.opendaylight.controller.sal-distributed-datastore - 1.3.0.SNAPSHOT | Tx: DOM-CHAIN-0-0 Error during phase CAN_COMMIT, starting Abort akka.pattern.AskTimeoutException: Ask timed out on [ActorSelection&lt;span class=&quot;error&quot;&gt;&amp;#91;Anchor(akka.tcp://opendaylight-cluster-data@10.30.11.66:2550/), Path(/user/shardmanager-config/member-3-shard-topology-config)&amp;#93;&lt;/span&gt;] after &lt;span class=&quot;error&quot;&gt;&amp;#91;5000 ms&amp;#93;&lt;/span&gt; 	at akka.pattern.PromiseActorRef$$anonfun$1.apply$mcV$sp(AskSupport.scala:334)&lt;span class=&quot;error&quot;&gt;&amp;#91;128:com.typesafe.akka.actor:2.3.14&amp;#93;&lt;/span&gt; 	at akka.actor.Scheduler$$anon$7.run(Scheduler.scala:117)&lt;span class=&quot;error&quot;&gt;&amp;#91;128:com.typesafe.akka.actor:2.3.14&amp;#93;&lt;/span&gt; 	at scala.concurrent.Future$InternalCallbackExecutor$.unbatchedExecute(Future.scala:599)&lt;span class=&quot;error&quot;&gt;&amp;#91;125:org.scala-lang.scala-library:2.11.7.v20150622-112736-1fbce4612c&amp;#93;&lt;/span&gt; 	at scala.concurrent.BatchingExecutor$class.execute(BatchingExecutor.scala:109)&lt;span class=&quot;error&quot;&gt;&amp;#91;125:org.scala-lang.scala-library:2.11.7.v20150622-112736-1fbce4612c&amp;#93;&lt;/span&gt; 	at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:597)&lt;span class=&quot;error&quot;&gt;&amp;#91;125:org.scala-lang.scala-library:2.11.7.v20150622-112736-1fbce4612c&amp;#93;&lt;/span&gt; 	at akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(Scheduler.scala:467)&lt;span class=&quot;error&quot;&gt;&amp;#91;128:com.typesafe.akka.actor:2.3.14&amp;#93;&lt;/span&gt; 	at akka.actor.LightArrayRevolverScheduler$$anon$8.executeBucket$1(Scheduler.scala:419)&lt;span class=&quot;error&quot;&gt;&amp;#91;128:com.typesafe.akka.actor:2.3.14&amp;#93;&lt;/span&gt; 	at 
akka.actor.LightArrayRevolverScheduler$$anon$8.nextTick(Scheduler.scala:423)&lt;span class=&quot;error&quot;&gt;&amp;#91;128:com.typesafe.akka.actor:2.3.14&amp;#93;&lt;/span&gt; 	at akka.actor.LightArrayRevolverScheduler$$anon$8.run(Scheduler.scala:375)&lt;span class=&quot;error&quot;&gt;&amp;#91;128:com.typesafe.akka.actor:2.3.14&amp;#93;&lt;/span&gt; 	at java.lang.Thread.run(Thread.java:745)&lt;span class=&quot;error&quot;&gt;&amp;#91;:1.7.0_85&amp;#93;&lt;/span&gt; 2016-02-18 22:09:12,909 | ERROR | CommitFutures-1  | TopologyNodeWriter               | 240 - org.opendaylight.netconf.topology - 1.0.0.SNAPSHOT | org.opendaylight.controller.md.sal.binding.impl.BindingDOMTransactionChainAdapter@63d9f743: TransactionChain(DOM-CHAIN-0-0) TransactionCommitFailedException&lt;/p&gt;
{message=canCommit encountered an unexpected failure, errorList=[RpcError [message=canCommit encountered an unexpected failure, severity=ERROR, errorType=APPLICATION, tag=operation-failed, applicationTag=null, info=null, cause=akka.pattern.AskTimeoutException: Ask timed out on [ActorSelection[Anchor(akka.tcp://opendaylight-cluster-data@10.30.11.66:2550/), Path(/user/shardmanager-config/member-3-shard-topology-config)]] after [5000 ms]]]}
&lt;p&gt; FAILED! 2016-02-18 22:09:12,909 | ERROR | CommitFutures-2  | TopologyNodeWriter               | 240 - org.opendaylight.netconf.topology - 1.0.0.SNAPSHOT | topology-netconf: Transaction(init topology container) DOM-CHAIN-0-0 FAILED! TransactionCommitFailedException&lt;/p&gt;
{message=canCommit encountered an unexpected failure, errorList=[RpcError [message=canCommit encountered an unexpected failure, severity=ERROR, errorType=APPLICATION, tag=operation-failed, applicationTag=null, info=null, cause=akka.pattern.AskTimeoutException: Ask timed out on [ActorSelection[Anchor(akka.tcp://opendaylight-cluster-data@10.30.11.66:2550/), Path(/user/shardmanager-config/member-3-shard-topology-config)]] after [5000 ms]]]}

&lt;p&gt;According to the discussion with the developers, the most likely cause is something like this:&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Leader election fails or something tries to write to the datastore before the leader election is done.&lt;/li&gt;
	&lt;li&gt;Netconf topology hits the datastore failure and tries to restart.&lt;/li&gt;
	&lt;li&gt;Netconf topology crashes because it is already registered in entity ownership service.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1468&quot; title=&quot;[Clustering] Datastore operations failure when leader is down&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1468&quot;&gt;&lt;del&gt;CONTROLLER-1468&lt;/del&gt;&lt;/a&gt; might be relevant as it is about datastore operation failure when leader is down (in this case it appears leader is not known yet).&lt;/p&gt;</description>
                <environment>&lt;p&gt;Operating System: All&lt;br/&gt;
Platform: All&lt;/p&gt;</environment>
        <key id="26040">CONTROLLER-1486</key>
            <summary>Clustering: Datastore may fail with &quot;Shard XXX has no leader. Try again later&quot;</summary>
                <type id="10104" iconUrl="https://jira.opendaylight.org/secure/viewavatar?size=xsmall&amp;avatarId=10303&amp;avatarType=issuetype">Bug</type>
                                                <status id="5" iconUrl="https://jira.opendaylight.org/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="green"/>
                                    <resolution id="10002">Duplicate</resolution>
                                        <assignee username="-1">Unassigned</assignee>
                                    <reporter username="jbehran@cisco.com">Jozef Behran</reporter>
                        <labels>
                    </labels>
                <created>Fri, 19 Feb 2016 12:18:45 +0000</created>
                <updated>Thu, 19 Oct 2017 21:29:17 +0000</updated>
                            <resolved>Mon, 19 Sep 2016 14:44:38 +0000</resolved>
                                    <version>Post-Helium</version>
                                                    <component>clustering</component>
                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                                                                <comments>
                            <comment id="51293" author="jbehran@cisco.com" created="Fri, 19 Feb 2016 12:18:45 +0000"  >&lt;p&gt;Attachment logs.tgz has been added with description: Backup of logs from the test run exhibiting the failure&lt;/p&gt;</comment>
                            <comment id="51289" author="tpantelis" created="Fri, 19 Feb 2016 13:09:26 +0000"  >&lt;p&gt;The attached tar contains the log files with .xz extension which is some binary format. What is the .xz format and how to &quot;decrypt&quot; it?&lt;/p&gt;</comment>
                            <comment id="51290" author="tpantelis" created="Thu, 25 Feb 2016 03:12:13 +0000"  >&lt;p&gt;In looking at the logs, shard leaders were elected when the nodes started up, member-3 (10.30.11.66) became the leader of the topology-config shard:&lt;/p&gt;

&lt;p&gt;2016-02-18 22:09:06,412 | INFO  | lt-dispatcher-23 | ShardManager                     | 143 - org.opendaylight.controller.sal-distributed-datastore - 1.3.0.SNAPSHOT | shard-manager-config: Received role changed for member-3-shard-topology-config from Candidate to Leader&lt;/p&gt;

&lt;p&gt;member-1 (10.30.11.144) became the leader of the topology-oper shard.&lt;/p&gt;

&lt;p&gt;The akka cluster leader was member-1 however it lost connection with member-3 about 5 sec later:&lt;/p&gt;

&lt;p&gt;2016-02-18 22:09:11,817 | WARN  | lt-dispatcher-22 | ClusterCoreDaemon                | 129 - com.typesafe.akka.slf4j - 2.3.14 | Cluster Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.11.144:2550&amp;#93;&lt;/span&gt; - Marking node(s) as UNREACHABLE &lt;span class=&quot;error&quot;&gt;&amp;#91;Member(address = akka.tcp://opendaylight-cluster-data@10.30.11.66:2550, status = Up)&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;It reconnected 2 sec later:&lt;/p&gt;

&lt;p&gt;2016-02-18 22:09:13,816 | INFO  | lt-dispatcher-27 | kka://opendaylight-cluster-data) | 129 - com.typesafe.akka.slf4j - 2.3.14 | Cluster Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.11.144:2550&amp;#93;&lt;/span&gt; - Marking node(s) as REACHABLE &lt;span class=&quot;error&quot;&gt;&amp;#91;Member(address = akka.tcp://opendaylight-cluster-data@10.30.11.66:2550, status = Up)&amp;#93;&lt;/span&gt; &lt;/p&gt;

&lt;p&gt;Same with member-2. So there was an ~8-10 sec period of disconnect which resulted in the DataStoreUnavailableException and AskTimeoutException errors.&lt;/p&gt;

&lt;p&gt;What caused the disconnect? Don&apos;t know. The nodes were still starting up so it could&apos;ve been due to a long GC pause or there was an actual issue in the network causing a temporary outage.&lt;/p&gt;

&lt;p&gt;For Boron, we intend to look into making it more resilient to short disconnects and timeouts, possibly by retrying transactions.&lt;/p&gt;</comment>
                            <comment id="51291" author="ranjithkumar_t@hcl.com" created="Mon, 19 Sep 2016 13:39:31 +0000"  >&lt;p&gt;Steps to be followed:&lt;br/&gt;
=====================&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;Downloaded the latest distribution patch &quot;distribution-karaf-0.5.0-20160902.020649-4739.tar.gz&quot;.&lt;/li&gt;
	&lt;li&gt;Changes made in akka.conf configuration file.&lt;/li&gt;
	&lt;li&gt;Installed required features for openstack (odl-ovsdb-openstack).&lt;/li&gt;
	&lt;li&gt;Clustering happens and initial network topology is initiated in operational data store.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Observation:&lt;br/&gt;
============&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;The following are the cluster nodes 10.106.138.110, 10.106.138.154 and 10.106.138.155.&lt;/li&gt;
	&lt;li&gt;The Leader node is 10.106.138.110; the remaining two nodes are followers.&lt;/li&gt;
	&lt;li&gt;In a 3-node cluster, if we log out of or kill the shard Leader node, one of the remaining two nodes is elected as the Leader, but I am not getting any exception like &quot;DataStoreUnavailableException&quot; for a specific member in the karaf log.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;I have tested manually many times but the bug was not reproduced.&lt;/p&gt;

&lt;p&gt;I have attached the karaf log for your reference.&lt;/p&gt;</comment>
                            <comment id="51294" author="ranjithkumar_t@hcl.com" created="Mon, 19 Sep 2016 13:41:11 +0000"  >&lt;p&gt;Attachment 5391 logs.zip has been added with description: karaf logs for all odl controllers&lt;/p&gt;</comment>
                            <comment id="51292" author="tpantelis" created="Mon, 19 Sep 2016 14:44:38 +0000"  >&lt;p&gt;It appears inflight transactions failed due to a temporary disconnect. As mentioned there&apos;s work underway to retry transactions in this case. Marking this as a duplicate of &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1483&quot; title=&quot;akka.pattern.AskTimeoutException on follower while BGP peer introduces 1M prefixes&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1483&quot;&gt;&lt;del&gt;CONTROLLER-1483&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10000">
                    <name>Blocks</name>
                                                                <inwardlinks description="is blocked by">
                                        <issuelink>
            <issuekey id="26037">CONTROLLER-1483</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10002">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="26037">CONTROLLER-1483</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="13567" name="5391 logs.zip" size="106139" author="ranjithkumar_t@hcl.com" created="Mon, 19 Sep 2016 13:41:11 +0000"/>
                            <attachment id="13566" name="logs.tgz" size="48157" author="jbehran@cisco.com" created="Fri, 19 Feb 2016 12:18:45 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                            <customfield id="customfield_11400" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10208" key="com.atlassian.jira.plugin.system.customfieldtypes:textfield">
                        <customfieldname>External issue ID</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>5391</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10201" key="com.atlassian.jira.plugin.system.customfieldtypes:url">
                        <customfieldname>External issue URL</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue><![CDATA[https://bugs.opendaylight.org/show_bug.cgi?id=5391]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                <customfield id="customfield_10000" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>0|i02qvb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>