<!-- 
RSS generated by JIRA (8.20.10#820010-sha1:ace47f9899e9ee25d7157d59aa17ab06aee30d3d) at Wed Feb 07 19:55:16 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>OpenDaylight JIRA</title>
    <link>https://jira.opendaylight.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>8.20.10</version>
        <build-number>820010</build-number>
        <build-date>22-06-2022</build-date>
    </build-info>


<item>
            <title>[CONTROLLER-1333] Clustering : Performance issues in BGP scale test</title>
                <link>https://jira.opendaylight.org/browse/CONTROLLER-1333</link>
                <project id="10113" key="CONTROLLER">controller</project>
                    <description>&lt;p&gt;After connecting BGP to a feed, it timeouts after about 3 minutes with ask.timeout exceptions. Build is: distribution-karaf-0.3.0-20150522.085838-1738. I&apos;ll send steps to reproduce in an email. Here&apos;s the exception:&lt;/p&gt;

&lt;p&gt;2015-05-22 07:48:52,740 | INFO  | oupCloseable-3-1 | AbstractBGPSessionNegotiator     | 227 - org.opendaylight.bgpcep.bgp-rib-impl - 0.4.0.SNAPSHOT | BGP Session established successfully.&lt;br/&gt;
2015-05-22 07:53:15,620 | ERROR | lt-dispatcher-39 | LocalThreePhaseCommitCohort      | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Failed to prepare transaction member-1-txn-6043 on backend&lt;br/&gt;
akka.pattern.AskTimeoutException: Ask timed out on &lt;a href=&quot;#-1012054239)]&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;ActorSelection[Anchor(akka://opendaylight-cluster-data/), Path(/user/shardmanager-operational/member-1-shard-default-operational#-1012054239)]&lt;/a&gt; after &lt;span class=&quot;error&quot;&gt;&amp;#91;30000 ms&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.pattern.PromiseActorRef$$anonfun$1.apply$mcV$sp(AskSupport.scala:334)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.actor.Scheduler$$anon$7.run(Scheduler.scala:117)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694)&lt;span class=&quot;error&quot;&gt;&amp;#91;151:org.scala-lang.scala-library:2.10.4.v20140209-180020-VFINAL-b66a39653b&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:691)&lt;span class=&quot;error&quot;&gt;&amp;#91;151:org.scala-lang.scala-library:2.10.4.v20140209-180020-VFINAL-b66a39653b&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(Scheduler.scala:467)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.actor.LightArrayRevolverScheduler$$anon$8.executeBucket$1(Scheduler.scala:419)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.actor.LightArrayRevolverScheduler$$anon$8.nextTick(Scheduler.scala:423)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.actor.LightArrayRevolverScheduler$$anon$8.run(Scheduler.scala:375)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at java.lang.Thread.run(Thread.java:745)&lt;span class=&quot;error&quot;&gt;&amp;#91;:1.7.0_79&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-05-22 07:53:15,621 | ERROR | lt-dispatcher-38 | LocalThreePhaseCommitCohort      | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Failed to prepare transaction member-1-txn-4520 on backend&lt;br/&gt;
akka.pattern.AskTimeoutException: Ask timed out on &lt;a href=&quot;#-1012054239)]&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;ActorSelection[Anchor(akka://opendaylight-cluster-data/), Path(/user/shardmanager-operational/member-1-shard-default-operational#-1012054239)]&lt;/a&gt; after &lt;span class=&quot;error&quot;&gt;&amp;#91;30000 ms&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.pattern.PromiseActorRef$$anonfun$1.apply$mcV$sp(AskSupport.scala:334)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.actor.Scheduler$$anon$7.run(Scheduler.scala:117)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694)&lt;span class=&quot;error&quot;&gt;&amp;#91;151:org.scala-lang.scala-library:2.10.4.v20140209-180020-VFINAL-b66a39653b&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:691)&lt;span class=&quot;error&quot;&gt;&amp;#91;151:org.scala-lang.scala-library:2.10.4.v20140209-180020-VFINAL-b66a39653b&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(Scheduler.scala:467)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.actor.LightArrayRevolverScheduler$$anon$8.executeBucket$1(Scheduler.scala:419)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.actor.LightArrayRevolverScheduler$$anon$8.nextTick(Scheduler.scala:423)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.actor.LightArrayRevolverScheduler$$anon$8.run(Scheduler.scala:375)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at java.lang.Thread.run(Thread.java:745)&lt;span class=&quot;error&quot;&gt;&amp;#91;:1.7.0_79&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-05-22 07:53:15,622 | WARN  | lt-dispatcher-23 | ConcurrentDOMDataBroker          | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx: DOM-CHAIN-6-2369 Error during phase CAN_COMMIT, starting Abort&lt;br/&gt;
akka.pattern.AskTimeoutException: Ask timed out on &lt;a href=&quot;#-1012054239)]&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;ActorSelection[Anchor(akka://opendaylight-cluster-data/), Path(/user/shardmanager-operational/member-1-shard-default-operational#-1012054239)]&lt;/a&gt; after &lt;span class=&quot;error&quot;&gt;&amp;#91;30000 ms&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.pattern.PromiseActorRef$$anonfun$1.apply$mcV$sp(AskSupport.scala:334)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.actor.Scheduler$$anon$7.run(Scheduler.scala:117)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694)&lt;span class=&quot;error&quot;&gt;&amp;#91;151:org.scala-lang.scala-library:2.10.4.v20140209-180020-VFINAL-b66a39653b&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:691)&lt;span class=&quot;error&quot;&gt;&amp;#91;151:org.scala-lang.scala-library:2.10.4.v20140209-180020-VFINAL-b66a39653b&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(Scheduler.scala:467)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.actor.LightArrayRevolverScheduler$$anon$8.executeBucket$1(Scheduler.scala:419)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.actor.LightArrayRevolverScheduler$$anon$8.nextTick(Scheduler.scala:423)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at akka.actor.LightArrayRevolverScheduler$$anon$8.run(Scheduler.scala:375)&lt;span class=&quot;error&quot;&gt;&amp;#91;154:com.typesafe.akka.actor:2.3.10&amp;#93;&lt;/span&gt;&lt;br/&gt;
        at java.lang.Thread.run(Thread.java:745)&lt;span class=&quot;error&quot;&gt;&amp;#91;:1.7.0_79&amp;#93;&lt;/span&gt;&lt;/p&gt;</description>
                <environment>&lt;p&gt;Operating System: All&lt;br/&gt;
Platform: All&lt;/p&gt;</environment>
        <key id="25887">CONTROLLER-1333</key>
            <summary>Clustering : Performance issues in BGP scale test</summary>
                <type id="10104" iconUrl="https://jira.opendaylight.org/secure/viewavatar?size=xsmall&amp;avatarId=10303&amp;avatarType=issuetype">Bug</type>
                                                <status id="5" iconUrl="https://jira.opendaylight.org/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="green"/>
                                    <resolution id="10000">Done</resolution>
                                        <assignee username="moraja@cisco.com">Moiz Raja</assignee>
                                    <reporter username="dkutenicsova">Dana Kutenicsova</reporter>
                        <labels>
                    </labels>
                <created>Fri, 22 May 2015 12:28:31 +0000</created>
                <updated>Wed, 10 Jun 2015 14:50:08 +0000</updated>
                            <resolved>Wed, 10 Jun 2015 14:50:08 +0000</resolved>
                                    <version>Post-Helium</version>
                                                    <component>clustering</component>
                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                                                                <comments>
                            <comment id="50674" author="moraja@cisco.com" created="Wed, 27 May 2015 01:09:58 +0000"  >&lt;p&gt;I enabled debug logging, grepped for TimeoutException and then grepped on some of the transactions that were failing. I see a pattern (which I cannot explain yet). Most of the transactions that fail with the timeout happen because &quot;FindPrimaryShard&quot; fails with a timeout.&lt;/p&gt;

&lt;p&gt;See this snippet -&amp;gt; &lt;a href=&quot;http://pastebin.com/yBh2nrSs&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://pastebin.com/yBh2nrSs&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;A couple of curious things,&lt;/p&gt;

&lt;p&gt;a. The AkkaTimeout shows the actor to be the shard&lt;br/&gt;
b. The timeout is 30 seconds which is the same as the commit timeout&lt;/p&gt;

&lt;p&gt;This does not make sense because the find primary request is sent to the ShardManager which is the parent of Shard.&lt;/p&gt;</comment>
                            <comment id="50675" author="moraja@cisco.com" created="Wed, 27 May 2015 01:50:59 +0000"  >&lt;p&gt;I reran the test and now I see the timeouts to be 5 seconds with the actor still being the shard - puzzling - I&apos;m wondering if there is some mixup happening here and the timeout is being delivered to the wrong handler.&lt;/p&gt;</comment>
                            <comment id="50676" author="moraja@cisco.com" created="Wed, 27 May 2015 02:32:56 +0000"  >&lt;p&gt;Based on what I am seeing I think the following patch (which was merged May 26th) will help because it avoids looking up the primary shard unless ShardManager actually removes it from the PrimaryShardInfo cache.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/20571/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/20571/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Dana, if you could try this out it would be good. I&apos;m hitting some trouble building an integration distribution otherwise would have tried it myself.&lt;/p&gt;</comment>
                            <comment id="50677" author="tpantelis" created="Wed, 27 May 2015 02:40:13 +0000"  >&lt;p&gt;That is odd. What&apos;s even odder is that the ActorContext uses the shard-initialization-timeout for FindPrimary which is set to 5 min. &lt;/p&gt;

&lt;p&gt;Did you have ActorContext and ShardManager debug on as well? If the PrimaryShardInfo future wasn&apos;t cached yet then we should&apos;ve seen the &quot;Sending message FindPrimary...&quot; output from ActorContext. It seems the PrimaryShardInfo future was not yet cached b/c the &quot;Find primary for shard default failed&quot; output was logged from an akka dispatcher thread which indicates it was invoked via the OnComplete callback. The AbstractTransactionContextFactory first checks Future#isCompleted to elide the async OnComplete wait on the Future so if it was already completed then the output would&apos;ve been logged on the &quot;oupCloseable-3-1&quot; thread. &lt;/p&gt;

&lt;p&gt;Getting back to the AskTimeoutException indicating the &quot;default&quot; shard actor path, a possible explanation is that the call to actorSelection in ActorSelection#onPrimaryShardFound is what threw the AskTimeoutException. However, where did the 30000 ms timeout come from? Also the &quot;findPrimaryShardAsync received ...&quot; debug output should&apos;ve been seen (assuming you had ActorContext debug on).&lt;/p&gt;

&lt;p&gt;In the paste bin, the NoOpTransactionContext didn&apos;t print the AskTimeoutException stack trace - possibly b/c it was already printed earlier too many times. Hopefully that would give an indication as to the source of the ask failure.&lt;/p&gt;</comment>
                            <comment id="50678" author="moraja@cisco.com" created="Wed, 27 May 2015 02:47:01 +0000"  >&lt;p&gt;Since bugzilla won&apos;t let me I&apos;ve uploaded the whole debug enabled log file here &lt;a href=&quot;https://cisco.box.com/s/eesdiopbocetz2qkv4orjy3inknde7v1&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://cisco.box.com/s/eesdiopbocetz2qkv4orjy3inknde7v1&lt;/a&gt;. &lt;/p&gt;

&lt;p&gt;Note that in this log you will see Akka timeouts happening at 5 seconds but the actor is still the Shard.&lt;/p&gt;</comment>
                            <comment id="50679" author="tpantelis" created="Wed, 27 May 2015 05:35:19 +0000"  >&lt;p&gt;Notice the time interval between these 2 log entries for tx member-1-txn-4668:&lt;/p&gt;

&lt;p&gt;2015-05-26 21:11:39,003 | DEBUG | oupCloseable-3-1 | SingleCommitCohortProxy          | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx member-1-txn-4668 canCommit&lt;br/&gt;
2015-05-26 21:18:51,324 | DEBUG | ult-dispatcher-3 | bstractTransactionContextFactory | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx member-1-txn-4668: Find primary for shard default failed&lt;/p&gt;

&lt;p&gt;Over 7 minutes passed between the time it was created and readied by the caller and the time it failed to find the primary shard.&lt;/p&gt;

&lt;p&gt;The ActorContext sent out 2 FindPrimary messages ~9 sec after the tx was created:&lt;/p&gt;

&lt;p&gt;2015-05-26 21:11:48,516 | DEBUG | lt-dispatcher-39 | ActorContext                     | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Sending message class org.opendaylight.controller.cluster.datastore.messages.FindPrimary to Actor&lt;a href=&quot;#967672581&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;akka://opendaylight-cluster-data/user/shardmanager-operational#967672581&lt;/a&gt;&lt;br/&gt;
2015-05-26 21:11:48,517 | DEBUG | ult-dispatcher-5 | ActorContext                     | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Sending message class org.opendaylight.controller.cluster.datastore.messages.FindPrimary to Actor&lt;a href=&quot;#967672581&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;akka://opendaylight-cluster-data/user/shardmanager-operational#967672581&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;I assume 1 of them was for this tx. It appears the 9 sec gap is primarily due to waiting for the previous ready Future to complete. The requests were processed by the ShardManager and the responses came back quickly to the ActorContext:&lt;/p&gt;

&lt;p&gt;2015-05-26 21:11:48,517 | DEBUG | lt-dispatcher-39 | ShardManager                     | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | shard-manager-operational: Found primary for default: LocalPrimaryShardFound &lt;span class=&quot;error&quot;&gt;&amp;#91;primaryPath=akka.tcp://opendaylight-cluster-data@127.0.0.1:2550/user/shardmanager-operational/member-1-shard-default-operational#-701003399, localShardDataTree=org.opendaylight.yangtools.yang.data.impl.schema.tree.InMemoryDataTree@58bde7ca&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-05-26 21:11:48,517 | DEBUG | lt-dispatcher-39 | ShardManager                     | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | shard-manager-operational: Found primary for default: LocalPrimaryShardFound &lt;span class=&quot;error&quot;&gt;&amp;#91;primaryPath=akka.tcp://opendaylight-cluster-data@127.0.0.1:2550/user/shardmanager-operational/member-1-shard-default-operational#-701003399, localShardDataTree=org.opendaylight.yangtools.yang.data.impl.schema.tree.InMemoryDataTree@58bde7ca&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;2015-05-26 21:11:48,517 | DEBUG | ult-dispatcher-5 | ActorContext                     | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | findPrimaryShardAsync received: LocalPrimaryShardFound &lt;span class=&quot;error&quot;&gt;&amp;#91;primaryPath=akka.tcp://opendaylight-cluster-data@127.0.0.1:2550/user/shardmanager-operational/member-1-shard-default-operational#-701003399, localShardDataTree=org.opendaylight.yangtools.yang.data.impl.schema.tree.InMemoryDataTree@58bde7ca&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-05-26 21:11:48,517 | DEBUG | lt-dispatcher-33 | ActorContext                     | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | findPrimaryShardAsync received: LocalPrimaryShardFound &lt;span class=&quot;error&quot;&gt;&amp;#91;primaryPath=akka.tcp://opendaylight-cluster-data@127.0.0.1:2550/user/shardmanager-operational/member-1-shard-default-operational#-701003399, localShardDataTree=org.opendaylight.yangtools.yang.data.impl.schema.tree.InMemoryDataTree@58bde7ca&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;So the FindPrimary message itself was quick. Also it succeeded so the AskTimeoutEx didn&apos;t come from that message. So what accounts for the 7 min gap between &quot;findPrimaryShardAsync received&quot; and the AbstractTransactionContextFactory receiving the Future callback? There could&apos;ve been a long GC pause (for that long?). Or it could&apos;ve taken that long for a thread to become available and process the OnComplete callback (not likely). But neither of those account for the AskTimeoutEx. &lt;/p&gt;

&lt;p&gt;I suspect it was onFindPrimaryShardFound that took 7 min. The only code I see that could be the culprit are the call to actorSelection and the call to onShardInfoUpdated while holding the lock. The latter shouldn&apos;t take that long even if the lock was contended. The call to actorSelection looks suspicious. I&apos;m not sure what it does internally - maybe it tries to &quot;ping&quot; the shard actor but times out and throws the AskTimeoutEx - again that would explain the presence of the shard actor path in the AskTimeoutEx. It would be interesting to put a try/catch around actorSelection.&lt;/p&gt;

&lt;p&gt;I agree that &lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/20571/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/20571/&lt;/a&gt; should help here. It should alleviate the excessive FindPrimary messages.&lt;/p&gt;</comment>
                            <comment id="50680" author="moraja@cisco.com" created="Wed, 27 May 2015 06:02:22 +0000"  >&lt;p&gt;Another curious thing about most of the transactions failing with AskTimeout - there were at most 1 or 2 modifications in those transactions. Why would that be when most transactions seem to have thousands?&lt;/p&gt;</comment>
                            <comment id="50681" author="moraja@cisco.com" created="Wed, 27 May 2015 06:26:39 +0000"  >&lt;p&gt;Tom, I don&apos;t think that actorSelection taking a long time is the root cause. Creating an actorSelection is supposed to be cheap. Using an actor selection is more expensive than an ActorRef. If we go with your theory then actorSelection would have to do an ask to the actor and wait for a response. &lt;/p&gt;

&lt;p&gt;Note that the timeouts we are seeing 5s/30s are the kind of timeouts we have set. &lt;/p&gt;

&lt;p&gt;I think there is something else going on here.&lt;/p&gt;

&lt;p&gt;Also note a bunch of consecutive transactions failed.&lt;/p&gt;

&lt;p&gt;550777:2015-05-26 21:18:51,324 | DEBUG | ult-dispatcher-3 | bstractTransactionContextFactory | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx member-1-txn-4668: Find primary for shard default failed&lt;br/&gt;
550905:2015-05-26 21:19:01,529 | DEBUG | ult-dispatcher-3 | bstractTransactionContextFactory | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx member-1-txn-4669: Find primary for shard default failed&lt;br/&gt;
551017:2015-05-26 21:19:01,531 | DEBUG | ult-dispatcher-3 | bstractTransactionContextFactory | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx member-1-txn-4670: Find primary for shard default failed&lt;br/&gt;
551092:2015-05-26 21:19:04,493 | DEBUG | ult-dispatcher-3 | bstractTransactionContextFactory | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx member-1-txn-4671: Find primary for shard default failed&lt;br/&gt;
551165:2015-05-26 21:19:04,495 | DEBUG | ult-dispatcher-3 | bstractTransactionContextFactory | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx member-1-txn-4672: Find primary for shard default failed&lt;br/&gt;
551288:2015-05-26 21:19:09,490 | DEBUG | ult-dispatcher-3 | bstractTransactionContextFactory | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx member-1-txn-4673: Find primary for shard default failed&lt;br/&gt;
551348:2015-05-26 21:19:09,493 | DEBUG | ult-dispatcher-3 | bstractTransactionContextFactory | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx member-1-txn-4674: Find primary for shard default failed&lt;/p&gt;</comment>
                            <comment id="50682" author="dkutenicsova" created="Wed, 27 May 2015 09:53:08 +0000"  >&lt;p&gt;I tried build &lt;a href=&quot;https://nexus.opendaylight.org/content/repositories/opendaylight.snapshot/org/opendaylight/integration/distribution-karaf/0.3.0-SNAPSHOT/distribution-karaf-0.3.0-20150527.075221-1862.tar.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://nexus.opendaylight.org/content/repositories/opendaylight.snapshot/org/opendaylight/integration/distribution-karaf/0.3.0-SNAPSHOT/distribution-karaf-0.3.0-20150527.075221-1862.tar.gz&lt;/a&gt; , which should contain the change 20571. From my point of view it didn&apos;t help, the logs are in ~/27-05-fail-karaf.log&lt;/p&gt;</comment>
                            <comment id="50683" author="tpantelis" created="Wed, 27 May 2015 12:15:31 +0000"  >&lt;p&gt;An actorSelection shouldn&apos;t take that long but then again nothing should &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.opendaylight.org/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt; But something took 7 min and the logs indicate it wasn&apos;t FindPrimary...&lt;/p&gt;

&lt;p&gt;I think the bunch of consecutive failed transactions is b/c they were all backed up then timed out similarly.&lt;/p&gt;

&lt;p&gt;I agree the 5s/30s timeouts look like ours - that part is odd. The 5s is more odd b/c that would correspond to the operation timeout but we only use that now for remote tx operations - all these txs are local.&lt;/p&gt;

&lt;p&gt;(In reply to Moiz Raja from comment #8)&lt;br/&gt;
&amp;gt; Tom, I don&apos;t think that actorSelection taking a long time is the root cause.&lt;br/&gt;
&amp;gt; Creating an actorSelection is supposed to be cheap. Using an actor selection&lt;br/&gt;
&amp;gt; is more expensive than an ActorRef. If we go with your theory then&lt;br/&gt;
&amp;gt; actorSelection would have to do an ask to the actor and wait for a response. &lt;br/&gt;
&amp;gt; &lt;br/&gt;
&amp;gt; Note that the timeouts we are seeing 5s/30s are the kind of timeouts we have&lt;br/&gt;
&amp;gt; set. &lt;br/&gt;
&amp;gt; &lt;br/&gt;
&amp;gt; I think there is something else going on here.&lt;br/&gt;
&amp;gt; &lt;br/&gt;
&amp;gt; Also note a bunch of consecutive transactions failed.&lt;br/&gt;
&amp;gt; &lt;br/&gt;
&amp;gt; 550777:2015-05-26 21:18:51,324 | DEBUG | ult-dispatcher-3 |&lt;br/&gt;
&amp;gt; bstractTransactionContextFactory | 169 -&lt;br/&gt;
&amp;gt; org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx&lt;br/&gt;
&amp;gt; member-1-txn-4668: Find primary for shard default failed&lt;br/&gt;
&amp;gt; 550905:2015-05-26 21:19:01,529 | DEBUG | ult-dispatcher-3 |&lt;br/&gt;
&amp;gt; bstractTransactionContextFactory | 169 -&lt;br/&gt;
&amp;gt; org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx&lt;br/&gt;
&amp;gt; member-1-txn-4669: Find primary for shard default failed&lt;br/&gt;
&amp;gt; 551017:2015-05-26 21:19:01,531 | DEBUG | ult-dispatcher-3 |&lt;br/&gt;
&amp;gt; bstractTransactionContextFactory | 169 -&lt;br/&gt;
&amp;gt; org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx&lt;br/&gt;
&amp;gt; member-1-txn-4670: Find primary for shard default failed&lt;br/&gt;
&amp;gt; 551092:2015-05-26 21:19:04,493 | DEBUG | ult-dispatcher-3 |&lt;br/&gt;
&amp;gt; bstractTransactionContextFactory | 169 -&lt;br/&gt;
&amp;gt; org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx&lt;br/&gt;
&amp;gt; member-1-txn-4671: Find primary for shard default failed&lt;br/&gt;
&amp;gt; 551165:2015-05-26 21:19:04,495 | DEBUG | ult-dispatcher-3 |&lt;br/&gt;
&amp;gt; bstractTransactionContextFactory | 169 -&lt;br/&gt;
&amp;gt; org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx&lt;br/&gt;
&amp;gt; member-1-txn-4672: Find primary for shard default failed&lt;br/&gt;
&amp;gt; 551288:2015-05-26 21:19:09,490 | DEBUG | ult-dispatcher-3 |&lt;br/&gt;
&amp;gt; bstractTransactionContextFactory | 169 -&lt;br/&gt;
&amp;gt; org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx&lt;br/&gt;
&amp;gt; member-1-txn-4673: Find primary for shard default failed&lt;br/&gt;
&amp;gt; 551348:2015-05-26 21:19:09,493 | DEBUG | ult-dispatcher-3 |&lt;br/&gt;
&amp;gt; bstractTransactionContextFactory | 169 -&lt;br/&gt;
&amp;gt; org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx&lt;br/&gt;
&amp;gt; member-1-txn-4674: Find primary for shard default failed&lt;/p&gt;</comment>
                            <comment id="50684" author="tpantelis" created="Wed, 27 May 2015 12:23:04 +0000"  >&lt;p&gt;I can&apos;t get to ~/27-05-fail-karaf.log  &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.opendaylight.org/images/icons/emoticons/sad.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;

&lt;p&gt;(In reply to Dana Kutenicsova from comment #9)&lt;br/&gt;
&amp;gt; I tried build&lt;br/&gt;
&amp;gt; &lt;a href=&quot;https://nexus.opendaylight.org/content/repositories/opendaylight.snapshot/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://nexus.opendaylight.org/content/repositories/opendaylight.snapshot/&lt;/a&gt;&lt;br/&gt;
&amp;gt; org/opendaylight/integration/distribution-karaf/0.3.0-SNAPSHOT/distribution-&lt;br/&gt;
&amp;gt; karaf-0.3.0-20150527.075221-1862.tar.gz , which should contain the change&lt;br/&gt;
&amp;gt; 20571. From my point of view it didn&apos;t help, the logs are in&lt;br/&gt;
&amp;gt; ~/27-05-fail-karaf.log&lt;/p&gt;</comment>
                            <comment id="50685" author="moraja@cisco.com" created="Wed, 27 May 2015 15:18:00 +0000"  >&lt;p&gt;Here is 27-05-karaf.log &lt;a href=&quot;https://cisco.box.com/s/m5o616g8qsye2urcvr844zfim2l5ikt5&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://cisco.box.com/s/m5o616g8qsye2urcvr844zfim2l5ikt5&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;But it does not have debug logging. I&apos;ll enable that again and run the test then post the log after.&lt;/p&gt;</comment>
                            <comment id="50686" author="moraja@cisco.com" created="Wed, 27 May 2015 16:39:46 +0000"  >&lt;p&gt;Debug logs &lt;a href=&quot;https://cisco.box.com/s/qbw7pexrgkesgbuqpjgtij2p7kojf324&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://cisco.box.com/s/qbw7pexrgkesgbuqpjgtij2p7kojf324&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="50687" author="tpantelis" created="Wed, 27 May 2015 19:05:09 +0000"  >&lt;p&gt;The default-operational shard sent the LeaderStateChanged event at :&lt;/p&gt;

&lt;p&gt;2015-05-27 12:20:34,187 | INFO  | er-data-akka.actor.default-dispatcher-34 | ShardManager                     | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | shard-manager-operational: Received LeaderStateChanged message: LeaderStateChanged &lt;span class=&quot;error&quot;&gt;&amp;#91;memberId=member-1-shard-default-operational, leaderId=member-1-shard-default-operational&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;The FindPrimary message was sent and received 5 sec later&lt;/p&gt;

&lt;p&gt;2015-05-27 12:20:39,113 | DEBUG | er-data-akka.actor.default-dispatcher-16 | ActorContext                     | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | findPrimaryShardAsync received: LocalPrimaryShardFound &lt;span class=&quot;error&quot;&gt;&amp;#91;primaryPath=akka.tcp://opendaylight-cluster-data@127.0.0.1:2550/user/shardmanager-operational/member-1-shard-default-operational#-1521885687, localShardDataTree=org.opendaylight.yangtools.yang.data.impl.schema.tree.InMemoryDataTree@3eb14c15&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;In fact this is the only such log entry for default-operational, which makes sense b/c it should&apos;ve been cached thereafter and there were no more LeaderStateChanged events to clear it.&lt;/p&gt;

&lt;p&gt;Also we see that the local data tree was resolved so it must have gotten put in the cache:&lt;/p&gt;

&lt;p&gt;2015-05-27 12:20:39,114 | DEBUG | er-data-akka.actor.default-dispatcher-16 | bstractTransactionContextFactory | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Shard default resolved to local data tree - adding local factory&lt;/p&gt;

&lt;p&gt;But yet a tx ~35 sec later failed to find the primary shard. Notice the 11 min gap. &lt;/p&gt;

&lt;p&gt;2015-05-27 12:21:14,949 | DEBUG | er-data-akka.actor.default-dispatcher-19 | SingleCommitCohortProxy          | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx member-1-txn-7907 canCommit&lt;br/&gt;
2015-05-27 12:32:15,902 | DEBUG | er-data-akka.actor.default-dispatcher-32 | bstractTransactionContextFactory | 169 - org.opendaylight.controller.sal-distributed-datastore - 1.2.0.SNAPSHOT | Tx member-1-txn-7907: Find primary for shard default failed&lt;/p&gt;

&lt;p&gt;But I see no FindPrimary message going out between 12:21:14 and 12:32:15 which indicates it got the Future from the cache. But how could the Future indicate failure when we only put immediate, successful Futures in the cache? Plus other transactions at 12:20:39+ succeeded b/c I see ReadyLocalTransaction messages going out.&lt;/p&gt;

&lt;p&gt;It sort of seems like there&apos;s something funky going on with akka/scala Futures - almost like it somehow loses its completed state and gets some unrelated failure associated to it.&lt;/p&gt;

&lt;p&gt;Maybe we should try switching to use ListenableFutures for the cache...&lt;/p&gt;</comment>
                            <comment id="50688" author="moraja@cisco.com" created="Wed, 27 May 2015 20:41:18 +0000"  >&lt;p&gt;Could the future returned to us be mutable? Could it be reused by akka and set to something else? What if we cached the PrimaryShardInfo instead of the future?&lt;/p&gt;</comment>
                            <comment id="50689" author="moraja@cisco.com" created="Wed, 27 May 2015 21:15:30 +0000"  >&lt;p&gt;Mutable futures are making less sense to me now. In PrimaryShardInfoFutureCache#putSuccessful we create a new future instead of storing the one from akka#ask. &lt;/p&gt;

&lt;p&gt;Now it only looks like this may be a case of a confused future...&lt;/p&gt;</comment>
                            <comment id="50690" author="tpantelis" created="Wed, 27 May 2015 21:56:56 +0000"  >&lt;p&gt;Yeah - it doesn&apos;t make sense. We need more debug logging - I&apos;ll push a patch for that. I want to try to prove that the scala/akka Future got messed up somehow or maybe I did something wrong in the isCompleted code block.&lt;/p&gt;

&lt;p&gt;We really don&apos;t need to use a scala Future here anyway - ListenableFutures are more of a known commodity  - I feel more comfortable with them - I&apos;ve read the source code and fully understand how they work. I don&apos;t really know enough scala to be able to understand their code at this point.&lt;/p&gt;</comment>
                            <comment id="50691" author="moraja@cisco.com" created="Wed, 27 May 2015 23:53:18 +0000"  >&lt;p&gt;Tom, hold off on the ListenableFuture changes till we put in more diagnostics. I&apos;ll try to push a diagnostic patch as well.&lt;/p&gt;</comment>
                            <comment id="50692" author="moraja@cisco.com" created="Thu, 4 Jun 2015 05:14:27 +0000"  >&lt;p&gt;This appears to have been fixed by &lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/21749/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/21749/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="50693" author="moraja@cisco.com" created="Tue, 9 Jun 2015 14:30:26 +0000"  >&lt;p&gt;On my machine running a test with 100K routes takes 15 mins for CDS vs 4.5 mins for IMDS.&lt;/p&gt;</comment>
                            <comment id="50694" author="moraja@cisco.com" created="Wed, 10 Jun 2015 14:50:08 +0000"  >&lt;p&gt;Performance is acceptable now : 1M routes ingested in 5m&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10002">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="25626">CONTROLLER-1072</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                            <customfield id="customfield_11400" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10208" key="com.atlassian.jira.plugin.system.customfieldtypes:textfield">
                        <customfieldname>External issue ID</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>3340</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10201" key="com.atlassian.jira.plugin.system.customfieldtypes:url">
                        <customfieldname>External issue URL</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue><![CDATA[https://bugs.opendaylight.org/show_bug.cgi?id=3340]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10206" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Issue Type</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10300"><![CDATA[Bug]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10204" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>ODL SR Target Milestone</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10315"><![CDATA[Lithium]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                    <customfield id="customfield_10000" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>0|i02pxb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>