<!-- 
RSS generated by JIRA (8.20.10#820010-sha1:ace47f9899e9ee25d7157d59aa17ab06aee30d3d) at Wed Feb 07 19:56:34 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>OpenDaylight JIRA</title>
    <link>https://jira.opendaylight.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>8.20.10</version>
        <build-number>820010</build-number>
        <build-date>22-06-2022</build-date>
    </build-info>


<item>
            <title>[CONTROLLER-1838] follower reports 401 (unauthorized) and 500 (Internal Error) when leader is isolated.</title>
                <link>https://jira.opendaylight.org/browse/CONTROLLER-1838</link>
                <project id="10113" key="CONTROLLER">controller</project>
                    <description>&lt;p&gt;This is to track the sporadic (but very frequent) failures we see in this &lt;a href=&quot;https://jenkins.opendaylight.org/releng/view/controller/job/controller-csit-3node-rest-clust-cars-perf-only-fluorine/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;job &lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The general flow of the job seems to be as follows:&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;set up a 3 node cluster&lt;/li&gt;
	&lt;li&gt;run a &lt;a href=&quot;https://github.com/opendaylight/integration-test/blob/master/tools/odl-mdsal-clustering-tests/scripts/cluster_rest_script.py&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;python tool &lt;/a&gt; to add 10k cars, using one of the &lt;b&gt;follower&lt;/b&gt; nodes. It&apos;s determining leadership with the &quot;car&quot; shard on the config side.&lt;/li&gt;
	&lt;li&gt;make sure that cars are being added, by ensuring the count of cars at /restconf/config/car:cars is increasing&lt;/li&gt;
	&lt;li&gt;isolate the &lt;b&gt;leader&lt;/b&gt; node by blocking all traffic outbound from the leader node&apos;s ip to the two follower ips.&lt;/li&gt;
	&lt;li&gt;verify a new leader gets elected&lt;/li&gt;
	&lt;li&gt;wait for the tool to finish adding all 10k cars.&lt;/li&gt;
	&lt;li&gt;check that the new leader will report that 10k cars are added. &lt;b&gt;THIS FAILS&lt;/b&gt;&lt;/li&gt;
	&lt;li&gt;add the isolated node back&lt;/li&gt;
	&lt;li&gt;delete the 10k cars&lt;/li&gt;
	&lt;li&gt;stop the test tool with a CTRL-C&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;I can&apos;t make total sense of it yet, but the robot logs in step 6 show this output&lt;br/&gt;
from the test tool:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
2018-06-18 02:18:38,781 INFO: Add 10000 car(s) to 10.30.170.28:8181 (1 per request) 2018-06-18 02:18:47,348 INFO: Request started at Mon Jun 18 02:18:42 2018 finished with following detais 2018-06-18 02:18:47,349 INFO: Request started at Mon Jun 18 02:18:42 2018 finished with following detais 2018-06-18 02:18:47,349 INFO: &amp;lt;PreparedRequest [POST]&amp;gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;but in the last step that stops the tool, there is a flushing of the test tool&apos;s stdout and &lt;br/&gt;
we can see this:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
http:&lt;span class=&quot;code-comment&quot;&gt;//10.30.170.28:8181/restconf/config/car:cars 2018-06-18 02:18:47,349 INFO: &amp;lt;PreparedRequest [POST]&amp;gt; http://10.30.170.28:8181/restconf/config/car:cars 2018-06-18 02:18:47,350 INFO: Headers {&lt;span class=&quot;code-quote&quot;&gt;&apos;Content-Length&apos;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&apos;127&apos;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&apos;Content-Type&apos;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&apos;application/json&apos;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&apos;Authorization&apos;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&apos;Basic YWRtaW46YWRtaW4=&apos;&lt;/span&gt;}: 2018-06-18 02:18:47,350 INFO: Headers {&lt;span class=&quot;code-quote&quot;&gt;&apos;Content-Length&apos;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&apos;127&apos;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&apos;Content-Type&apos;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&apos;application/json&apos;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&apos;Authorization&apos;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&apos;Basic YWRtaW46YWRtaW4=&apos;&lt;/span&gt;}: 2018-06-18 02:18:47,350 INFO: Body: {&lt;span class=&quot;code-quote&quot;&gt;&quot;car-entry&quot;&lt;/span&gt;: [{&lt;span class=&quot;code-quote&quot;&gt;&quot;category&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;my_category&quot;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&quot;model&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;model361&quot;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&quot;manufacturer&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;my_manufacturer&quot;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&quot;id&quot;&lt;/span&gt;: 361, &lt;span class=&quot;code-quote&quot;&gt;&quot;year&quot;&lt;/span&gt;: 
&lt;span class=&quot;code-quote&quot;&gt;&quot;2015&quot;&lt;/span&gt;}]} 2018-06-18 02:18:47,350 INFO: Body: {&lt;span class=&quot;code-quote&quot;&gt;&quot;car-entry&quot;&lt;/span&gt;: [{&lt;span class=&quot;code-quote&quot;&gt;&quot;category&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;my_category&quot;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&quot;model&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;model353&quot;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&quot;manufacturer&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;my_manufacturer&quot;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&quot;id&quot;&lt;/span&gt;: 353, &lt;span class=&quot;code-quote&quot;&gt;&quot;year&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;2015&quot;&lt;/span&gt;}]} 2018-06-18 02:18:47,351 INFO: Response: &amp;lt;html&amp;gt; &amp;lt;head&amp;gt; &amp;lt;meta http-equiv=&lt;span class=&quot;code-quote&quot;&gt;&quot;Content-Type&quot;&lt;/span&gt; content=&lt;span class=&quot;code-quote&quot;&gt;&quot;text/html;charset=utf-8&quot;&lt;/span&gt;/&amp;gt; &amp;lt;title&amp;gt;Error 401 Unauthorized&amp;lt;/title&amp;gt; &amp;lt;/head&amp;gt; &amp;lt;body&amp;gt;&amp;lt;h2&amp;gt;HTTP ERROR 401&amp;lt;/h2&amp;gt; &amp;lt;p&amp;gt;Problem accessing /restconf/config/car:cars. 
Reason: &amp;lt;pre&amp;gt; Unauthorized&amp;lt;/pre&amp;gt;&amp;lt;/p&amp;gt; &amp;lt;/body&amp;gt; &amp;lt;/html&amp;gt; 2018-06-18 02:18:47,352 INFO: Response: {&lt;span class=&quot;code-quote&quot;&gt;&quot;errors&quot;&lt;/span&gt;:{&lt;span class=&quot;code-quote&quot;&gt;&quot;error&quot;&lt;/span&gt;:[{&lt;span class=&quot;code-quote&quot;&gt;&quot;error-type&quot;&lt;/span&gt;:&lt;span class=&quot;code-quote&quot;&gt;&quot;application&quot;&lt;/span&gt;,&lt;span class=&quot;code-quote&quot;&gt;&quot;error-tag&quot;&lt;/span&gt;:&lt;span class=&quot;code-quote&quot;&gt;&quot;operation-failed&quot;&lt;/span&gt;,&lt;span class=&quot;code-quote&quot;&gt;&quot;error-message&quot;&lt;/span&gt;:&lt;span class=&quot;code-quote&quot;&gt;&quot;read execution failed&quot;&lt;/span&gt;,&lt;span class=&quot;code-quote&quot;&gt;&quot;error-info&quot;&lt;/span&gt;:&lt;span class=&quot;code-quote&quot;&gt;&quot;ReadFailedException{message=Error checking DataExists &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; path /(urn:opendaylight:params:xml:ns:yang:controller:config:sal-clustering-it:car?revision=2014-08-18)cars/car-entry/car-entry[{(urn:opendaylight:params:xml:ns:yang:controller:config:sal-clustering-it:car?revision=2014-08-18)id=353}], errorList=[RpcError [message=Error checking DataExists &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; path /(urn:opendaylight:params:xml:ns:yang:controller:config:sal-clustering-it:car?revision=2014-08-18)cars/car-entry/car-entry[{(urn:opendaylight:params:xml:ns:yang:controller:config:sal-clustering-it:car?revision=2014-08-18)id=353}], severity=ERROR, errorType=APPLICATION, tag=operation-failed, applicationTag=&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;, info=&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;, cause=akka.pattern.AskTimeoutException: Ask timed out on [ActorSelection[Anchor(akka.tcp://opendaylight-cluster-data@10.30.170.29:2550/), 
Path(/user/shardmanager-config/member-1-shard-car-config/shard-car-member-2:datastore-config@0:11946_12283#-1964161993)]] after [5000 ms]. Sender[&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;] sent message of type \&quot;&lt;/span&gt;org.opendaylight.controller.cluster.datastore.messages.DataExists\&lt;span class=&quot;code-quote&quot;&gt;&quot;.]]}\n\tat org.opendaylight.controller.cluster.datastore.RemoteTransactionContext$2.onComplete(RemoteTransactionContext.java:242)\n\tat akka.dispatch.OnComplete.internal(Future.scala:260)\n\tat akka.dispatch.OnComplete.internal(Future.scala:258)\n\tat akka.dispatch.japi$CallbackBridge.apply(Future.scala:188)\n\tat akka.dispatch.japi$CallbackBridge.apply(Future.scala:185)\n\tat scala.concurrent.impl.CallbackRunnable.run(Promise.scala:60)\n\tat akka.dispatch.BatchingExecutor$AbstractBatch.processBatch(BatchingExecutor.scala:55)\n\tat akka.dispatch.BatchingExecutor$BlockableBatch.$anonfun$run$1(BatchingExecutor.scala:91)\n\tat scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:12)\n\tat scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:81)\n\tat akka.dispatch.BatchingExecutor$BlockableBatch.run(BatchingExecutor.scala:91)\n\tat akka.dispatch.TaskInvocation.run(AbstractDispatcher.scala:40)\n\tat akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(ForkJoinExecutorConfigurator.scala:43)\n\tat akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)\n\tat akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)\n\tat akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)\n\tat akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)\nCaused by: akka.pattern.AskTimeoutException: Ask timed out on [ActorSelection[Anchor(akka.tcp://opendaylight-cluster-data@10.30.170.29:2550/), Path(/user/shardmanager-config/member-1-shard-car-config/shard-car-member-2:datastore-config@0:11946_12283#-1964161993)]] after [5000 
ms]. Sender[&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;] sent message of type \&quot;&lt;/span&gt;org.opendaylight.controller.cluster.datastore.messages.DataExists\&lt;span class=&quot;code-quote&quot;&gt;&quot;.\n\tat akka.pattern.PromiseActorRef$.$anonfun$defaultOnTimeout$1(AskSupport.scala:595)\n\tat akka.pattern.PromiseActorRef$.$anonfun$apply$1(AskSupport.scala:605)\n\tat akka.actor.Scheduler$$anon$4.run(Scheduler.scala:140)\n\tat scala.concurrent.Future$InternalCallbackExecutor$.unbatchedExecute(Future.scala:870)\n\tat scala.concurrent.BatchingExecutor.execute(BatchingExecutor.scala:109)\n\tat scala.concurrent.BatchingExecutor.execute$(BatchingExecutor.scala:103)\n\tat scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:868)\n\tat akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(LightArrayRevolverScheduler.scala:328)\n\tat akka.actor.LightArrayRevolverScheduler$$anon$4.executeBucket$1(LightArrayRevolverScheduler.scala:279)\n\tat akka.actor.LightArrayRevolverScheduler$$anon$4.nextTick(LightArrayRevolverScheduler.scala:283)\n\tat akka.actor.LightArrayRevolverScheduler$$anon$4.run(LightArrayRevolverScheduler.scala:235)\n\tat java.lang.&lt;span class=&quot;code-object&quot;&gt;Thread&lt;/span&gt;.run(&lt;span class=&quot;code-object&quot;&gt;Thread&lt;/span&gt;.java:748)\n&quot;&lt;/span&gt;}]}} 2018-06-18 02:18:47,352 INFO: &amp;lt;Response [401]&amp;gt; Unauthorized 2018-06-18 02:18:47,353 INFO: &amp;lt;Response [500]&amp;gt; Internal Server Error&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;That is where I think we need to dive in. It seems that there may be some hiccup&lt;br/&gt;
on the follower node, when the leader is being isolated where typical REST interactions&lt;br/&gt;
have trouble. Looks like we got an unauthorized 401, which matches up timestamps&lt;br/&gt;
with this from the &lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/controller-csit-3node-rest-clust-cars-perf-only-fluorine/119/odl2_karaf.log.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;karaf log&lt;/a&gt;:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
2018-06-18T02:18:47,330 | WARN  | qtp414495725-104 | MDSALDynamicAuthorizationFilter  | 195 - org.opendaylight.aaa.shiro - 0.8.0.SNAPSHOT | MDSAL attempt to read Http Authz Container failed, disallowing access
org.opendaylight.controller.md.sal.common.api.data.ReadFailedException: read execution failed
	at org.opendaylight.controller.cluster.datastore.compat.LegacyDOMStoreAdapter$1.newWithCause(LegacyDOMStoreAdapter.java:40) ~[?:?]
	at org.opendaylight.controller.cluster.datastore.compat.LegacyDOMStoreAdapter$1.newWithCause(LegacyDOMStoreAdapter.java:36) ~[?:?]
	at org.opendaylight.yangtools.util.concurrent.ExceptionMapper.apply(ExceptionMapper.java:91) ~[308:org.opendaylight.yangtools.util:2.0.5]
	at org.opendaylight.yangtools.util.concurrent.ExceptionMapper.apply(ExceptionMapper.java:40) ~[308:org.opendaylight.yangtools.util:2.0.5]
	at org.opendaylight.mdsal.common.api.MappingCheckedFuture.mapException(MappingCheckedFuture.java:62) ~[?:?]
	at org.opendaylight.mdsal.common.api.MappingCheckedFuture.wrapInExecutionException(MappingCheckedFuture.java:66) ~[?:?]
	at org.opendaylight.mdsal.common.api.MappingCheckedFuture.get(MappingCheckedFuture.java:79) ~[?:?]
	at com.google.common.util.concurrent.Uninterruptibles.getUninterruptibly(Uninterruptibles.java:168) ~[32:com.google.guava:23.6.0.jre]
	at com.google.common.util.concurrent.Futures.getDone(Futures.java:1436) ~[32:com.google.guava:23.6.0.jre]
	at com.google.common.util.concurrent.AbstractTransformFuture.run(AbstractTransformFuture.java:85) ~[32:com.google.guava:23.6.0.jre]
	at com.google.common.util.concurrent.MoreExecutors$DirectExecutor.execute(MoreExecutors.java:398) ~[32:com.google.guava:23.6.0.jre]
	at com.google.common.util.concurrent.AbstractFuture.executeListener(AbstractFuture.java:1015) ~[32:com.google.guava:23.6.0.jre]
	at com.google.common.util.concurrent.AbstractFuture.complete(AbstractFuture.java:868) ~[32:com.google.guava:23.6.0.jre]
	at com.google.common.util.concurrent.AbstractFuture.setException(AbstractFuture.java:713) ~[32:com.google.guava:23.6.0.jre]
	at com.google.common.util.concurrent.SettableFuture.setException(SettableFuture.java:54) ~[32:com.google.guava:23.6.0.jre]
	at org.opendaylight.controller.cluster.datastore.RemoteTransactionContext$2.onComplete(RemoteTransactionContext.java:241) ~[?:?]
	at akka.dispatch.OnComplete.internal(Future.scala:260) ~[?:?]
	at akka.dispatch.OnComplete.internal(Future.scala:258) ~[?:?]
	at akka.dispatch.japi$CallbackBridge.apply(Future.scala:188) ~[?:?]
	at akka.dispatch.japi$CallbackBridge.apply(Future.scala:185) ~[?:?]
	at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:60) ~[337:org.scala-lang.scala-library:2.12.5.v20180316-130912-VFINAL-30a1428]
	at akka.dispatch.BatchingExecutor$AbstractBatch.processBatch(BatchingExecutor.scala:55) ~[?:?]
	at akka.dispatch.BatchingExecutor$BlockableBatch.$anonfun$run$1(BatchingExecutor.scala:91) ~[?:?]
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:12) [337:org.scala-lang.scala-library:2.12.5.v20180316-130912-VFINAL-30a1428]
	at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:81) [337:org.scala-lang.scala-library:2.12.5.v20180316-130912-VFINAL-30a1428]
	at akka.dispatch.BatchingExecutor$BlockableBatch.run(BatchingExecutor.scala:91) [35:com.typesafe.akka.actor:2.5.11]
	at akka.dispatch.TaskInvocation.run(AbstractDispatcher.scala:40) [35:com.typesafe.akka.actor:2.5.11]
	at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(ForkJoinExecutorConfigurator.scala:43) [35:com.typesafe.akka.actor:2.5.11]
	at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) [35:com.typesafe.akka.actor:2.5.11]
	at akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) [35:com.typesafe.akka.actor:2.5.11]
	at akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) [35:com.typesafe.akka.actor:2.5.11]
	at akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) [35:com.typesafe.akka.actor:2.5.11]
Caused by: org.opendaylight.controller.md.sal.common.api.data.ReadFailedException: Error checking ReadData &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; path /(urn:opendaylight:params:xml:ns:yang:aaa?revision=2016-12-14)http-authorization
	at org.opendaylight.controller.cluster.datastore.RemoteTransactionContext$2.onComplete(RemoteTransactionContext.java:242) ~[?:?]
	... 16 more
Caused by: akka.pattern.AskTimeoutException: Ask timed out on [ActorSelection[Anchor(akka.tcp:&lt;span class=&quot;code-comment&quot;&gt;//opendaylight-cluster-data@10.30.170.29:2550/), Path(/user/shardmanager-config/member-1-shard-&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-config/shard-&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-member-2:datastore-config@0:11947_12284#-916780901)]] after [5000 ms]. Sender[&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;] sent message of type &lt;span class=&quot;code-quote&quot;&gt;&quot;org.opendaylight.controller.cluster.datastore.messages.ReadData&quot;&lt;/span&gt;.
&lt;/span&gt;	at akka.pattern.PromiseActorRef$.$anonfun$defaultOnTimeout$1(AskSupport.scala:595) ~[?:?]
	at akka.pattern.PromiseActorRef$.$anonfun$apply$1(AskSupport.scala:605) ~[?:?]
	at akka.actor.Scheduler$$anon$4.run(Scheduler.scala:140) ~[?:?]
	at scala.concurrent.Future$InternalCallbackExecutor$.unbatchedExecute(Future.scala:870) ~[?:?]
	at scala.concurrent.BatchingExecutor.execute(BatchingExecutor.scala:109) ~[?:?]
	at scala.concurrent.BatchingExecutor.execute$(BatchingExecutor.scala:103) ~[?:?]
	at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:868) ~[?:?]
	at akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(LightArrayRevolverScheduler.scala:328) ~[?:?]
	at akka.actor.LightArrayRevolverScheduler$$anon$4.executeBucket$1(LightArrayRevolverScheduler.scala:279) ~[?:?]
	at akka.actor.LightArrayRevolverScheduler$$anon$4.nextTick(LightArrayRevolverScheduler.scala:283) ~[?:?]
	at akka.actor.LightArrayRevolverScheduler$$anon$4.run(LightArrayRevolverScheduler.scala:235) ~[?:?]
	at java.lang.&lt;span class=&quot;code-object&quot;&gt;Thread&lt;/span&gt;.run(&lt;span class=&quot;code-object&quot;&gt;Thread&lt;/span&gt;.java:748) ~[?:?]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I can&apos;t find where that internal error 500 comes from though.&lt;/p&gt;

&lt;p&gt;I would not expect one healthy node in the majority, when a single node dies, to start&lt;br/&gt;
giving me 500 or 401; I would expect it to just take my requests and handle them.&lt;/p&gt;

&lt;p&gt;In this specific example, only 487 of the 10k cars were configured, so it seems there&lt;br/&gt;
was some major trouble.&lt;/p&gt;

&lt;p&gt;all three odl karaf logs:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/controller-csit-3node-rest-clust-cars-perf-only-fluorine/119/odl1_karaf.log.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;initial leader &lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/controller-csit-3node-rest-clust-cars-perf-only-fluorine/119/odl2_karaf.log.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;follower that was pointed at by the test tool &lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/controller-csit-3node-rest-clust-cars-perf-only-fluorine/119/odl3_karaf.log.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;other follower that was just minding its own business &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.opendaylight.org/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt; &lt;/a&gt;&lt;/p&gt;
</description>
                <environment></environment>
        <key id="30151">CONTROLLER-1838</key>
            <summary>follower reports 401 (unauthorized) and 500 (Internal Error) when leader is isolated.</summary>
                <type id="10104" iconUrl="https://jira.opendaylight.org/secure/viewavatar?size=xsmall&amp;avatarId=10303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.opendaylight.org/images/icons/priorities/critical.svg">High</priority>
                        <status id="10004" iconUrl="https://jira.opendaylight.org/images/icons/status_generic.gif" description="">Verified</status>
                    <statusCategory id="3" key="done" colorName="green"/>
                                    <resolution id="10000">Done</resolution>
                                        <assignee username="tpantelis">Tom Pantelis</assignee>
                                    <reporter username="jluhrsen">Jamo Luhrsen</reporter>
                        <labels>
                            <label>csit:3node</label>
                    </labels>
                <created>Tue, 19 Jun 2018 18:39:47 +0000</created>
                <updated>Thu, 9 Aug 2018 14:00:42 +0000</updated>
                            <resolved>Thu, 12 Jul 2018 05:49:35 +0000</resolved>
                                    <version>Fluorine</version>
                                    <fixVersion>Fluorine</fixVersion>
                                    <component>clustering</component>
                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                                                                                                            <comments>
                            <comment id="63494" author="tpantelis" created="Tue, 19 Jun 2018 19:10:06 +0000"  >&lt;p&gt;Notice the&#160;AskTimeoutException. This&#160;can happen for transaction(s) in flight when the leader is isolated, if the timing is right. So this is expected with the ask-based protocol&#160;however the new tell-based protocol promises to alleviate inflight failures (via retries) as was discussed on the call.&lt;/p&gt;

&lt;p&gt;The 401 is b/c of a read failure in aaa - apparently it relies on the mdsal DS for authentication (I didn&apos;t know that).&lt;/p&gt;</comment>
                            <comment id="63496" author="jluhrsen" created="Tue, 19 Jun 2018 19:15:48 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=tpantelis&quot; class=&quot;user-hover&quot; rel=&quot;tpantelis&quot;&gt;tpantelis&lt;/a&gt;, just to be clear, if we run this job with the tell-based protocol we will expect to never hit this problem? I think I can&lt;br/&gt;
quickly try that in the sandbox and assuming it passes, we can run in a frequent loop to get the confidence we want.&lt;/p&gt;</comment>
                            <comment id="63497" author="tpantelis" created="Tue, 19 Jun 2018 19:20:06 +0000"  >&lt;p&gt;From my understanding, it shouldn&apos;t occur.&lt;/p&gt;</comment>
                            <comment id="63499" author="rgoulding" created="Tue, 19 Jun 2018 19:53:53 +0000"  >&lt;p&gt;Not for authentication, just authorization.&#160; We derive the authorization rules for MdsalDynamicAuthorizationFilter&#160;from data stored in MD-SAL, the same place everyone stores their data.&#160; If it is isolated, then we cannot get that info, and we fail-closed.&#160; So yes, this is correct behavior.&lt;/p&gt;</comment>
                            <comment id="63501" author="shague@redhat.com" created="Tue, 19 Jun 2018 19:57:30 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=tpantelis&quot; class=&quot;user-hover&quot; rel=&quot;tpantelis&quot;&gt;tpantelis&lt;/a&gt; what would be the outcome from this issue - that all operations using aaa will fail?&lt;/p&gt;

&lt;p&gt;Assuming we can&apos;t switch to tell-based, should something be fixed here in aaa or somewhere else? I am guessing we would hit this same error in the downstream jobs also.&lt;/p&gt;</comment>
                            <comment id="63506" author="tpantelis" created="Tue, 19 Jun 2018 20:04:57 +0000"  >&lt;p&gt;aaa is simply doing a read and it fails. Whether it should fail-fast and disallow access is another story. That&apos;s the safest approach as it doesn&apos;t know if it should allow access. However it could cache the last known password and configuration - other systems work that way or provide the option to behave that way.&lt;/p&gt;

&lt;p&gt;Either way it probably doesn&apos;t matter since if aaa can&apos;t access the DS then the user request likely won&apos;t be able to either.&lt;/p&gt;</comment>
                            <comment id="63509" author="rgoulding" created="Tue, 19 Jun 2018 20:10:59 +0000"  >&lt;p&gt;That isn&apos;t entirely true.&#160; First off, it isn&apos;t authentication;&#160; that is handled through H2 still.&#160; It is Authorization.&#160; Authorization&#160;data shouldn&apos;t be cached IMHO, and I wouldn&apos;t merge a patch that built a cache for it.&#160; There needs to be a single source of truth;&#160; it isn&apos;t like other data, which can be stale.&lt;/p&gt;</comment>
                            <comment id="63510" author="ecelgp" created="Tue, 19 Jun 2018 20:12:46 +0000"  >&lt;p&gt;Jamo, can you try with tell-based protocol? here is an example:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/63592/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/63592/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="63511" author="rgoulding" created="Tue, 19 Jun 2018 20:14:28 +0000"  >&lt;p&gt;It does beg the question if we should pre-empt AAA 401 with a check in RESTCONF which returns 5xx in the case of isolation.&#160; I would be behind that, but don&apos;t have cycles to contribute.&lt;/p&gt;</comment>
                            <comment id="63513" author="tpantelis" created="Tue, 19 Jun 2018 20:27:35 +0000"  >&lt;p&gt;I know what you&apos;re saying but I don&apos;t completely agree with that.&#160;It could use the last known good config if the latest config can&apos;t be obtained. We did&#160;similar&#160;in a previous product I worked on years back to allow that based on user-enablement - it was specifically requested (I recall Windows does that too wrt LDAP authentication). Also I think the initial &quot;admin&quot; user should have &quot;super&quot;&#160;privileges, ie it should be authorized for everything so there&apos;s always a way in (otherwise users can shoot themselves in the foot with a mis-configuration and not have any way to back it out other than to completely re-install - seen that happen before).&#160;&#160;&lt;/p&gt;

&lt;p&gt;Anyway I&apos;m digressing.... it is what it is now - CSIT can expect 401 if HA is lost.&lt;/p&gt;</comment>
                            <comment id="63514" author="rgoulding" created="Tue, 19 Jun 2018 20:40:25 +0000"  >&lt;p&gt;I envision a situation in which someone gets canned when a node is in isolation but still has access via RESTCONF until the node rejoins the cluster.&#160; This is a bit silly I guess, but man do I not want my name on that CVE &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.opendaylight.org/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;We could probably also just make a super user account at some point too.&#160; I&apos;d say admin, but at this point most people unfortunately only use that account!&#160; At least from non-ONAP context.&lt;/p&gt;</comment>
                            <comment id="63519" author="tpantelis" created="Tue, 19 Jun 2018 21:27:25 +0000"  >&lt;p&gt;In my experience, the initial user created on installation is automatically a super-user to bootstrap things. It can&apos;t be deleted nor have its privileges downgraded - again a safety net to avoid shooting oneself in the foot.&#160;&lt;/p&gt;</comment>
                            <comment id="63520" author="shague@redhat.com" created="Tue, 19 Jun 2018 21:34:04 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=rgoulding&quot; class=&quot;user-hover&quot; rel=&quot;rgoulding&quot;&gt;rgoulding&lt;/a&gt; &lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=tpantelis&quot; class=&quot;user-hover&quot; rel=&quot;tpantelis&quot;&gt;tpantelis&lt;/a&gt; What exactly would fail during this time when the authorization is failing because aaa can&apos;t read from ds? All restconf calls?  Does that mean all restconf calls should not be relied upon? Meaning in the context of our csit tests, there are scenarios where the tests use restconf to verify states and if we don&apos;t see a 200 for instance, we would fail a test.&lt;/p&gt;

&lt;p&gt;In this case, the test itself is not written properly and should be changed to not rely on the restconf calls.&lt;/p&gt;</comment>
                            <comment id="63521" author="tpantelis" created="Tue, 19 Jun 2018 21:38:40 +0000"  >&lt;p&gt;If the DS isn&apos;t available then it appears currently all restconf calls will fail with 401 b/c it can&apos;t authenticate.&lt;/p&gt;</comment>
                            <comment id="63522" author="rgoulding" created="Tue, 19 Jun 2018 21:43:53 +0000"  >&lt;p&gt;So the MDSALDynamicAuthorizationFilter is a Shiro AuthorizationFilter that returns boolean whether &quot;isAccessAllowed(...)&quot;.&#160; It is executed on ingress of REST requests to anything protected by AAA.&#160; Part of isAccessAllowed(...) is getting the existing authorization rules &lt;span class=&quot;error&quot;&gt;&amp;#91;0&amp;#93;&lt;/span&gt;.&#160; If it fails w/ a ReadFailedException (i.e., an indication of Node isolation), then we return false &lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt;, which results in a 401.&lt;/p&gt;

&lt;p&gt;Is this correct behavior?&#160; Well it functions as designed, but it ain&apos;t pretty. Really, we ought to&#160;put a check in RESTCONF to check if a Node is isolated prior to executing any AAA Filter code.&#160; In other words, a Filter in front of AAA which will return a 5XX if the node is isolated, preferably with a nice message (though one could argue how much disclosure we should give the client!).&lt;/p&gt;

&lt;p&gt;If you are mapping 200 to failure, well that means any time a REST call is made against an isolated node, you will have a test failure.&#160; Since&#160;isolation cases can occur and are &quot;expected&quot; in our target environment, I doubt that you want to map directly to failure.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;0&amp;#93;&lt;/span&gt;&#160;&lt;a href=&quot;https://github.com/opendaylight/aaa/blob/master/aaa-shiro/impl/src/main/java/org/opendaylight/aaa/shiro/realm/MDSALDynamicAuthorizationFilter.java#L53&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/opendaylight/aaa/blob/master/aaa-shiro/impl/src/main/java/org/opendaylight/aaa/shiro/realm/MDSALDynamicAuthorizationFilter.java#L53&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt;&#160;&lt;a href=&quot;https://github.com/opendaylight/aaa/blob/master/aaa-shiro/impl/src/main/java/org/opendaylight/aaa/shiro/realm/MDSALDynamicAuthorizationFilter.java#L91&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/opendaylight/aaa/blob/master/aaa-shiro/impl/src/main/java/org/opendaylight/aaa/shiro/realm/MDSALDynamicAuthorizationFilter.java#L91&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="63523" author="rgoulding" created="Tue, 19 Jun 2018 21:48:26 +0000"  >&lt;p&gt;We could do that.&#160; I know some stakeholders have expressed desire for this in the past, but it hasn&apos;t been prioritized by anyone thus far.&#160; It really wouldn&apos;t take that much.&#160; But as you stated before, we still have that issue of RESTCONF calls probably failing anyway since that code is attempting reads against the DS on an isolated node still.&lt;/p&gt;</comment>
                            <comment id="63524" author="shague@redhat.com" created="Tue, 19 Jun 2018 21:59:21 +0000"  >&lt;p&gt;Not mapping a 200 to failure, but the opposite - if we don&apos;t get a 200 when querying for something we consider that a failure. Take a case where a test starts, some events happen that should write something to datastore. To validate that happened the test would read from datastore via restconf expecting to see a 200 for the resource requested. But if during this isolated condition all restconf calls are going to fail, then that test is always going to fail - regardless if the ds was updated correctly or not. That itself might be another badly written part of this test.&lt;/p&gt;</comment>
                            <comment id="63525" author="jluhrsen" created="Tue, 19 Jun 2018 22:08:39 +0000"  >&lt;p&gt;Thanks &lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=ecelgp&quot; class=&quot;user-hover&quot; rel=&quot;ecelgp&quot;&gt;ecelgp&lt;/a&gt;. I see you have a similar patch to something I &lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/62387/1/csit/testplans/netvirt-3node-openstack.txt&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;tried &lt;/a&gt; (with bad results) like 10 months ago.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;anyway, here&apos;s the &lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/73209/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;cars/people version &lt;/a&gt; and I&apos;m trying it in the &lt;a href=&quot;https://jenkins.opendaylight.org/sandbox/job/jamo-controller-csit-3node-rest-clust-cars-perf-only-fluorine/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;sandbox&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="63526" author="jluhrsen" created="Tue, 19 Jun 2018 22:13:00 +0000"  >&lt;p&gt;At the risk of sounding stupid here, can I double check that I understand this properly?&lt;/p&gt;

&lt;p&gt;this specific issue in this JIRA has some external REST client talking to a single happy node. That node&lt;br/&gt;
is never isolated. So, in that sense, this external client is never attempting to talk to an isolated node.&lt;/p&gt;

&lt;p&gt;But, I think the idea here is that this node has to do its sideways stuff to make reads/writes to the leader&lt;br/&gt;
node. And there is a chance that when we block that leader to isolate it, this happy node is doing some&lt;br/&gt;
transactions to that node and those will fail. So those errors and responses (401 and 500) kind of bubble&lt;br/&gt;
up from the side (leader isolated node) and back down through the happy node to the rest client.&lt;/p&gt;

&lt;p&gt;correct?&lt;/p&gt;</comment>
                            <comment id="63527" author="shague@redhat.com" created="Tue, 19 Jun 2018 22:20:05 +0000"  >&lt;p&gt;In the netvirt 3node jobs we have hundreds of those &quot;MDSAL attempt to read Http Authz Container failed, disallowing access&quot; errors. No idea if they cause problems, but they are there in the jobs.&lt;/p&gt;</comment>
                            <comment id="63528" author="tpantelis" created="Wed, 20 Jun 2018 00:31:14 +0000"  >&lt;p&gt;yes. All transactions have to go to the leader. During isolation there is a period of time where there is no leader that is able to make progress until it stabilizes with a new leader in the majority partition (assuming it&apos;s able to do so). Transactions initiated during that window in the majority partition may still timeout and fail. No transactions initiated in the isolated partition will succeed until the partition heals. So if the external tool was&#160;adding cars on the isolated node, with the &quot;ask&quot;-based protocol, the transactions initiated during isolation will all fail. If it was adding cars on one of the other nodes they &lt;b&gt;could&lt;/b&gt; all succeed if none were initiated during the ~5-6 sec&#160;period&#160;before akka detects the leader node is unreachable.&lt;/p&gt;

&lt;p&gt;The new &quot;tell&quot;-based protocol is intended to retry transactions in the hope that they will eventually succeed. Of course even that has its upper limit deadline. If 2 nodes were down for&#160;many minutes/hours you would expect it to eventually give up and fail.&#160;&lt;/p&gt;</comment>
                            <comment id="63529" author="tpantelis" created="Wed, 20 Jun 2018 00:45:46 +0000"  >&lt;p&gt;yeah DS access&#160;may still fail even if it got thru aaa. But other NB endpoints could still be accessed, eg RPCs, netconf mount point, jolokia etc.&#160;An admin might want to access JMX data to check the akka cluster state etc. But with no access&#160;via the NB, an admin has no visibility into the system (unless thru CLI).&lt;/p&gt;</comment>
                            <comment id="63532" author="jluhrsen" created="Wed, 20 Jun 2018 05:23:00 +0000"  >&lt;p&gt;thanks for all the explanations, &lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=tpantelis&quot; class=&quot;user-hover&quot; rel=&quot;tpantelis&quot;&gt;tpantelis&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;I&apos;m wondering what the negative side effects are to dial down the timeout from 5s to 1s? that won&apos;t get rid of this&lt;br/&gt;
timing period, but it would prevent these issues in a chatty REST environment (some). But, any latency or&lt;br/&gt;
other network blip would start causing leadership changes.&lt;/p&gt;

&lt;p&gt;Anyway, let&apos;s see how the tell-based stuff behaves with our simple car/people job. still waiting on results.&lt;/p&gt;</comment>
                            <comment id="63550" author="tpantelis" created="Wed, 20 Jun 2018 12:21:26 +0000"  >&lt;p&gt;yup - reducing the time out will open the door for false positives. Garbage collection is the biggest culprit and accounts for most of that ~5s default cushion. You can&apos;t avoid a &quot;blackout&quot; period altogether - it happens with networking protocols like spanning tree etc.&#160; The best we can do I think is to retry to hopefully overcome relatively short transient failures.&#160;&lt;/p&gt;</comment>
                            <comment id="63554" author="rovarga" created="Wed, 20 Jun 2018 13:36:16 +0000"  >&lt;p&gt;Note that tell-based protocol will abort (i.e. render frontend unusable) if it cannot make forward progress in 10 minutes (or so). Instance restart is required to recover from that.&lt;/p&gt;</comment>
                            <comment id="63555" author="jluhrsen" created="Wed, 20 Jun 2018 17:21:36 +0000"  >&lt;p&gt;looking for help.&lt;/p&gt;

&lt;p&gt;this tell-based config is having trouble in the job. &lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/builder-copy-sandbox-logs/125/jamo-controller-csit-3node-rest-clust-cars-perf-only-fluorine/5/robot-plugin/log.html.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;Example robot logs&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;basically, the cars suite runs first and passes (like it will do every so often), then a suite runs that&lt;br/&gt;
will change to tell-based protocol which passes. Then we run the cars suite again. What happens&lt;br/&gt;
there:&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;get cluster state so it knows leader and two followers&lt;/li&gt;
	&lt;li&gt;start adding cars&lt;/li&gt;
	&lt;li&gt;isolate leader&lt;/li&gt;
	&lt;li&gt;try to talk to a follower&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;that last step fails, because there is a read timeout (restconf read). The &lt;a href=&quot;https://logs.opendaylight.org/sandbox/vex-yul-odl-jenkins-2/jamo-controller-csit-3node-rest-clust-cars-perf-only-fluorine/5/odl2_karaf.log.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;karaf.log &lt;/a&gt; for that controller&lt;br/&gt;
has a lot of repeating logs that seem interesting.&lt;/p&gt;

&lt;p&gt;like a whole mess of these:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
2018-06-20T16:49:08,981 | WARN  | opendaylight-cluster-data-shard-dispatcher-45 | FrontendClientMetadataBuilder    | 217 - org.opendaylight.controller.sal-distributed-datastore - 1.8.0.SNAPSHOT | member-2-shard-car-config: Unknown history &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; aborted transaction member-1-datastore-config-fe-0-txn-19-1, ignoring
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;followed by a bunch of:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
2018-06-20T16:51:07,052 | INFO  | opendaylight-cluster-data-shard-dispatcher-27 | Shard                            | 209 - org.opendaylight.controller.sal-clustering-commons - 1.8.0.SNAPSHOT | member-2-shard-topology-config (Leader): Cannot append entries because sender&apos;s term 2 is less than 3
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;then some of this:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
2018-06-20T16:51:11,046 | INFO  | opendaylight-cluster-data-shard-dispatcher-26 | Shard                            | 209 - org.opendaylight.controller.sal-clustering-commons - 1.8.0.SNAPSHOT | member-2-shard-car-config (Leader): handleAppendEntriesReply - received unsuccessful reply: AppendEntriesReply [term=3, success=&lt;span class=&quot;code-keyword&quot;&gt;false&lt;/span&gt;, followerId=member-1-shard-car-config, logLastIndex=1595, logLastTerm=2, forceInstallSnapshot=&lt;span class=&quot;code-keyword&quot;&gt;false&lt;/span&gt;, payloadVersion=5, raftVersion=3], leader snapshotIndex: 1595
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;can I get some ideas of what to try, where to look?&lt;/p&gt;</comment>
                            <comment id="63557" author="tpantelis" created="Wed, 20 Jun 2018 19:36:13 +0000"  >&lt;p&gt;So the attached karaf log was from member-2. After the 2nd startup&#160;at&#160;2018-06-20T16:47:35,134, looking at the cars-config shard, member-1 was initially the leader:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2018-06-20T16:47:48,178 | INFO&#160; | opendaylight-cluster-data-shard-dispatcher-27 | ShardManager&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;| 217 - org.opendaylight.controller.sal-distributed-datastore - 1.8.0.SNAPSHOT | shard-manager-config: Received LeaderStateChanged message: LeaderStateChanged [memberId=member-2-shard-car-config, leaderId=member-1-shard-car-config, leaderPayloadVersion=5]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Then 10.30.170.22, which I assume is member-1, became unreachable:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2018-06-20T16:49:23,879 | INFO  | opendaylight-cluster-data-shard-dispatcher-30 | Shard                            | 209 - org.opendaylight.controller.sal-clustering-commons - 1.8.0.SNAPSHOT | member-2-shard-car-config (Follower): Leader akka.tcp://opendaylight-cluster-data@10.30.170.22:2550 is unreachable
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2018-06-20T16:49:23,896 | INFO  | opendaylight-cluster-data-shard-dispatcher-30 | ShardManager                     | 217 - org.opendaylight.controller.sal-distributed-datastore - 1.8.0.SNAPSHOT | shard-manager-config: Received LeaderStateChanged message: LeaderStateChanged [memberId=member-2-shard-car-config, leaderId=member-2-shard-car-config, leaderPayloadVersion=5]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Then there&apos;s a bunch of these:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2018-06-20T16:51:07,079 | INFO  | opendaylight-cluster-data-shard-dispatcher-45 | Shard                            | 209 - org.opendaylight.controller.sal-clustering-commons - 1.8.0.SNAPSHOT | member-2-shard-car-config (Leader): Cannot append entries because sender&apos;s term 2 is less than 3
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I assume this is during the time of isolation however it appears these messages are coming from member-1. Is traffic just blocked one way? Unless this was after the isolation was lifted but not sure if the test got that far.&lt;/p&gt;

&lt;p&gt;I don&apos;t see any significant ERRORs in the log (just 2 related to the karaf shell). And no WARNs except for those listed here. So it&apos;s unclear from this log why the rest request timed out. The test step Verify_New_Car_Leader_Elected that failed  indicates it&apos;s attempting to access jolokia to get the leader status but it appears the actual request that failed was attempting /restconf/modules. Either way, it may be that aaa auth was trying to read from the DS but spun. &quot;tell&quot;-based has different behavior with retries so I think it has a much longer timeout period - &lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=rovarga&quot; class=&quot;user-hover&quot; rel=&quot;rovarga&quot;&gt;rovarga&lt;/a&gt; knows that code much better than me. I assume aaa would be hitting the default-config shard - the member-2 log indicates member-3 was the leader but I don&apos;t see any indication in that log either.  &lt;/p&gt;

&lt;p&gt;The log from member-1 shows a lot of &quot;Follower is out-of-sync...&quot; messages which corresponds to the &quot;handleAppendEntriesReply - received unsuccessful reply...&quot; on member-2. Something strange going on there... Also these should not have occurred during the isolation period if traffic to/from member-1 was blocked.... &lt;/p&gt;

&lt;p&gt;It would be ideal to manually run the isolation test steps - one can control the steps and verify and debug the live system at various points.&lt;/p&gt;

&lt;p&gt;The isolation scenario is the hardest - perhaps we should first get the other test scenarios working reliably first (unless they already are &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.opendaylight.org/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;).&lt;/p&gt;
</comment>
                            <comment id="63558" author="tpantelis" created="Wed, 20 Jun 2018 19:48:44 +0000"  >&lt;p&gt;If the read timeouts are due to aaa trying to access the DS, perhaps CSIT can install odl-restconf-noauth to take that out of the equation. At least then&#160;Verify_New_Car_Leader_Elected could succeed and convey the results.&lt;/p&gt;</comment>
                            <comment id="63561" author="jluhrsen" created="Wed, 20 Jun 2018 20:58:33 +0000"  >&lt;blockquote&gt;&lt;p&gt;If the read timeouts are due to aaa trying to access the DS, perhaps CSIT can install odl-restconf-noauth to take that out of the equation. At least then&#160;Verify_New_Car_Leader_Elected could succeed and convey the results.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;I tried this, and the first run was good.&lt;/p&gt;</comment>
                            <comment id="63562" author="jluhrsen" created="Wed, 20 Jun 2018 21:55:16 +0000"  >&lt;p&gt;well, another run failed with the same symptoms even with the odl-restconf-noauth&lt;/p&gt;</comment>
                            <comment id="63563" author="tpantelis" created="Wed, 20 Jun 2018 22:08:09 +0000"  >&lt;p&gt;I think we&apos;re going to have to try to run the isolation steps manually. I think we really need live systems. As noted above, after isolation (or what I think from the log is when it occurred), I see another member became car leader as expected and it snapshotted at 19999 log entries which indicates thousands of cars were created after isolation. Also the logs indicate the partition was healed after about 2 min and member-1 joined back and was&#160;receiving and replying to messages&#160;by the new leader. I see no reason in the logs for the rest&#160;timeout which wasn&apos;t even hitting the DS anyway (appears it was hitting restconf/modules).&lt;/p&gt;

&lt;p&gt;How is the rest of the suite? Is the isolation test the only one failing?&lt;/p&gt;</comment>
                            <comment id="63564" author="jluhrsen" created="Wed, 20 Jun 2018 22:08:31 +0000"  >&lt;p&gt;some notes:&lt;/p&gt;

&lt;p&gt;the isolation is done by blocking all traffic OUTPUT from the leader. So, the other two nodes will stop hearing from it, but it will still be hearing from them&lt;/p&gt;

&lt;p&gt;I don&apos;t have a quick local setup to reproduce this manually, but can work on it. Still, it&apos;s very easy to try things out with CSIT in the sandbox and just&lt;br/&gt;
throw mud against the wall. Keep the ideas coming.&lt;/p&gt;

&lt;p&gt;the /restconf/modules endpoint is just the first endpoint in the list of requests that will be made on the way for the test to determine the leader. That&lt;br/&gt;
just happens to be first. I&apos;m guessing all of the rest endpoints are going to fail if that one does.&lt;/p&gt;

&lt;p&gt;remember, the tests aren&apos;t dealing with the node that is marked in isolation. Once those iptables rules are in place on that initial leader,&lt;br/&gt;
all the interactions are done with the two other nodes that used to be a follower and now should have one leader and one follower.&lt;/p&gt;</comment>
                            <comment id="63565" author="jluhrsen" created="Wed, 20 Jun 2018 22:13:56 +0000"  >&lt;p&gt;this is the whole suite. see the 10 steps in the description for the high level.&lt;/p&gt;

&lt;p&gt;do you think a totally stopped node test, instead of isolation is going to be better for us to look at?&lt;/p&gt;</comment>
                            <comment id="63566" author="jluhrsen" created="Wed, 20 Jun 2018 22:20:57 +0000"  >&lt;p&gt;btw, if you read below, I think we can say that tell-based protocol was not any kind of silver bullet here.&lt;/p&gt;</comment>
                            <comment id="63567" author="tpantelis" created="Wed, 20 Jun 2018 22:30:13 +0000"  >&lt;p&gt;Our&#160;comments are getting crossed. From&#160;what I see in logs, things appear to be working as expected. The&#160;downside with automated tests is that you have to try to analyze things post-mortem - sometimes it really helps to have live systems to poke around in.&#160;&#160;&lt;/p&gt;</comment>
                            <comment id="63568" author="jluhrsen" created="Wed, 20 Jun 2018 22:45:26 +0000"  >&lt;p&gt;of course. I don&apos;t disagree with having a live setup to debug being better (assuming we can reproduce). I don&apos;t have one right now. do you?&lt;/p&gt;</comment>
                            <comment id="63569" author="tpantelis" created="Wed, 20 Jun 2018 22:53:25 +0000"  >&lt;p&gt;I don&apos;t have access to a 3&#160;node env right now. The best I can do is run 3&#160;ODL instances on the same VM on my MAC.&#160;Not sure what&#160;the system commands are to block traffic - in&#160;my case they would each run on a different akka port.&lt;/p&gt;</comment>
                            <comment id="63570" author="tpantelis" created="Wed, 20 Jun 2018 22:58:51 +0000"  >&lt;p&gt;Doesn&apos;t it do that too? From what I recall 3 years ago when I did some work on the tests, it would&#160;stop a node and verify continuity and also 2 to verify expected failures.&#160; It also tested/verified recovery from persistence. Assuming all those tests still exist, are they running reliably?&lt;/p&gt;</comment>
                            <comment id="63571" author="jluhrsen" created="Wed, 20 Jun 2018 23:00:05 +0000"  >&lt;p&gt;I can try to get a setup going as well, but might take me a day or so.&lt;/p&gt;

&lt;p&gt;here&apos;s the commands the CSIT is using to isolate the leader:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;sudo /sbin/iptables -I OUTPUT -p all --source $LEADER_IP --destination $LEADER_IP -j DROP
sudo /sbin/iptables -I OUTPUT -p all --source $LEADER_IP --destination $FOLLOWER_1_IP -j DROP
sudo /sbin/iptables -I OUTPUT -p all --source $LEADER_IP --destination $FOLLOWER_2_IP -j DROP
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;note that first line is odd, as it blocks traffic to and from its own ip. I think that&apos;s probably just a relic&lt;br/&gt;
of keeping it easy in the test code. not sure. I don&apos;t think it matters if you have it or not.&lt;/p&gt;</comment>
                            <comment id="63572" author="tpantelis" created="Wed, 20 Jun 2018 23:00:32 +0000"  >&lt;p&gt;Back then the isolation test didn&apos;t exist.&lt;/p&gt;</comment>
                            <comment id="63573" author="tpantelis" created="Wed, 20 Jun 2018 23:02:34 +0000"  >&lt;p&gt;I assume it&apos;s possible to just block a port.&lt;/p&gt;</comment>
                            <comment id="63574" author="jluhrsen" created="Wed, 20 Jun 2018 23:04:18 +0000"  >&lt;p&gt;yes, we can do that too. are you asking to try that in CSIT? or how to do it for yourself locally?&lt;/p&gt;</comment>
                            <comment id="63575" author="tpantelis" created="Wed, 20 Jun 2018 23:07:41 +0000"  >&lt;p&gt;That would be for running 3 instances on a single VM on my MAC - each instance has IP 127.0.0.1 but different ports.&lt;/p&gt;</comment>
                            <comment id="63576" author="jluhrsen" created="Wed, 20 Jun 2018 23:10:59 +0000"  >&lt;p&gt;try this:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;sudo /sbin/iptables -I OUTPUT -p tcp --dport 2550 -j DROP
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="63597" author="tpantelis" created="Thu, 21 Jun 2018 12:34:37 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=jluhrsen&quot; class=&quot;user-hover&quot; rel=&quot;jluhrsen&quot;&gt;jluhrsen&lt;/a&gt; &lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=ecelgp&quot; class=&quot;user-hover&quot; rel=&quot;ecelgp&quot;&gt;ecelgp&lt;/a&gt; There&apos;s been a lot of comments flying around - I didn&apos;t get an answer here. However I cloned integration/test and I do see the original tests or at least most of them under&#160;csit/suites/controller/Clustering_Datastore.&#160;carpeople_crud.robot tests basic functionality. car_failover_crud.robot is one that stops a node to test HA. There&apos;s&#160;car_failover_crud_isolation.robot that uses isolation. The one you&apos;re running is under&#160;ThreeNodes_Datastore (not sure how it differs from&#160;car_failover_crud_isolation).&#160; There&apos;s also a bunch of other tests spread out in different directories - dom_data_broker, cluster_singleton, singleton_service - I think all those were Vratko&apos;s so i assume no one&apos;s maintained those or really know what they do and what state they&apos;re in.&lt;/p&gt;

&lt;p&gt;So do the ones under&#160;Clustering_Datastore (maybe sans isolation) run reliably? I think that&apos;s a good first step - start simple/basic and work our way up.&lt;/p&gt;</comment>
                            <comment id="63607" author="jluhrsen" created="Thu, 21 Jun 2018 15:58:39 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=tpantelis&quot; class=&quot;user-hover&quot; rel=&quot;tpantelis&quot;&gt;tpantelis&lt;/a&gt; that suite you are referring to is doing &lt;a href=&quot;https://jenkins.opendaylight.org/releng/view/controller/job/controller-csit-3node-clustering-all-fluorine/123/robot/controller-clustering.txt/Car%20Failover%20Crud/graph?zoomSignificant=false&amp;amp;failedOnly=false&amp;amp;criticalOnly=false&amp;amp;maxBuildsToShow=0&amp;amp;hd=true&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;fine &lt;/a&gt;, it seems:&lt;/p&gt;

&lt;p&gt;So, that more simple case is working which is good. It&apos;s definitely being a little more gentle with the tests,&lt;br/&gt;
where it&apos;s doing some CRUD stuff with cars/people, then stopping and waiting patiently for a leader to&lt;br/&gt;
be downed and a new leader to take over, then some more CRUD.&lt;/p&gt;

&lt;p&gt;The job we are looking at, is proactively trying to use the cluster while a leader is being isolated.&lt;/p&gt;

&lt;p&gt;Also, I ran some different combos (tell based and not) in the sandbox, and made a new suite (to kill the leader,&lt;br/&gt;
instead of isolating). All combos (I want to double check) seem to be exhibiting the same fundamental symptoms&lt;br/&gt;
that all of the &quot;cars&quot; are not added to the cluster when downing &lt;b&gt;or&lt;/b&gt; isolating a node, and it will come with the&lt;br/&gt;
tell based and non-tell based protocols. I &lt;b&gt;THINK&lt;/b&gt; it&apos;s a little less frequent or easy to hit when it&apos;s the tell-based&lt;br/&gt;
config using the kill suite, if that gives any clues to anything.&lt;/p&gt;

&lt;p&gt;Additionally, the more I think about this problem the more it&apos;s reminding me of some issue we think is&lt;br/&gt;
happening in netvirt 3node jobs. Essentially, there is a point when some openstack services think the&lt;br/&gt;
networking services are down (ODL not responding) so they mark compute nodes as invalid and doing&lt;br/&gt;
openstack instance creates end up with instances in ERROR state. My hunch is that every time we see&lt;br/&gt;
that issue, we&apos;ll be able to trace it back to a test that downed/isolated a cluster node and the openstack&lt;br/&gt;
services noticed they couldn&apos;t talk (401s or 500s) to the ODL cluster.&lt;/p&gt;</comment>
                            <comment id="63616" author="rovarga" created="Thu, 21 Jun 2018 16:57:12 +0000"  >&lt;p&gt;tell-based protocol has these timeouts:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;dead leader detection: 30 seconds of no activity from leader, we will start looking for new leader&lt;/li&gt;
	&lt;li&gt;request timeout: 2 minutes of no response to request, the request times out&lt;/li&gt;
	&lt;li&gt;frontend abort: 15 minutes of no forward progress, the frontend will abort, client node needs to be restarted&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;We can eventually get rid of the request timeout, but we need it now to make UT work.&lt;/p&gt;</comment>
                            <comment id="63618" author="tpantelis" created="Thu, 21 Jun 2018 20:28:35 +0000"  >&lt;p&gt;I tested locally with &quot;ask&quot; protocol running the car stress-test RPC creating 500 cars at 10/sec. I tested with graceful leader shutdown and killing the leader, both 5 times. I also manually did GET on car:cars&#160;quickly over and over for a bit&#160;after stopping the leader - these succeeded.&#160; &#160;With&#160;graceful leader shutdown, all the cars were successfully created in each run. With leader kill, I saw some car failures during transition which is expected.&lt;/p&gt;</comment>
                            <comment id="63619" author="tpantelis" created="Thu, 21 Jun 2018 21:39:57 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=jluhrsen&quot; class=&quot;user-hover&quot; rel=&quot;jluhrsen&quot;&gt;jluhrsen&lt;/a&gt; Re: the suppression of the first 4 failed loops and only displaying the failure from the last one... the downside is that we lose the details of the first 4.&#160;It would be interesting to see if the first follower that succeeded in the last try also had failures previously and if the second one had the same 5 sec timeout failures each time. Maybe it succeeded in a prior try but the first one failed etc.&#160; So I think it would be good to show all the failures at least for now.&lt;/p&gt;</comment>
                            <comment id="63620" author="tpantelis" created="Fri, 22 Jun 2018 00:29:27 +0000"  >&lt;p&gt;Also I asked Ryan about the HTTP request logging but that is not upstream. I supposed it could be re-invented upstream or maybe &lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=ecelgp&quot; class=&quot;user-hover&quot; rel=&quot;ecelgp&quot;&gt;ecelgp&lt;/a&gt;&#160;could&#160;help to facilitate a quicker path &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.opendaylight.org/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="63629" author="tpantelis" created="Fri, 22 Jun 2018 13:02:52 +0000"  >&lt;p&gt;One possible theory for the read timeout is that the internal HTTP server (jersey/jetty whatever) gets overloaded. I assume it has a thread pool and queue to handle the incoming requests.&#160;During the leader&#160;transition, add car requests from the python&#160;script will be delayed and thus block the thread.&#160;Over time&#160;all threads in the pool are in use and blocked and the queue builds up which may&#160;result in the client timing out.&#160;&#160;&lt;/p&gt;</comment>
                            <comment id="63649" author="ariel.adam" created="Sun, 24 Jun 2018 07:18:41 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=tpantelis&quot; class=&quot;user-hover&quot; rel=&quot;tpantelis&quot;&gt;tpantelis&lt;/a&gt; can you think of a way aside from logs to check the status of the threads and queues on the HTTP server?&#160;&lt;/p&gt;

&lt;p&gt;For the purpose of testing can we compile an HTTP server with logs and manually push it in?&lt;/p&gt;</comment>
                            <comment id="63650" author="tpantelis" created="Sun, 24 Jun 2018 11:44:36 +0000"  >&lt;p&gt;&#160;I haven&apos;t looked into that yet. But if you or a&#160;colleague could help on that front that would be great.&lt;/p&gt;</comment>
                            <comment id="63686" author="jluhrsen" created="Tue, 26 Jun 2018 06:22:50 +0000"  >&lt;p&gt;This would explain it, but now we gotta prove it.&lt;/p&gt;

&lt;p&gt;I have a good local setup I can use for this (not my laptop), but it does not have the cluster-test-app features,&lt;br/&gt;
so I can&apos;t do the add-cars trick.&lt;/p&gt;

&lt;p&gt;Trying to figure out some other quick/easy &quot;create&quot; I can script to simulate this scenario.&lt;/p&gt;

&lt;p&gt;BTW, like &lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=tpantelis&quot; class=&quot;user-hover&quot; rel=&quot;tpantelis&quot;&gt;tpantelis&lt;/a&gt; and &lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=thapar&quot; class=&quot;user-hover&quot; rel=&quot;thapar&quot;&gt;thapar&lt;/a&gt; I cannot reproduce the troubles yet. I will additionally try&lt;br/&gt;
on a different setup to use a distribution with the cluster test app features so I can also &lt;br/&gt;
run the add-cars script.&lt;/p&gt;</comment>
                            <comment id="63702" author="tpantelis" created="Tue, 26 Jun 2018 12:23:17 +0000"  >&lt;p&gt;I tried it with the cars stress-test RPC but that&apos;s not using restconf like the python script. The theory would hold water if the script sends restconf requests&#160; simultaneously. But if it sends requests synchronously one at a time then there would only be one outstanding request at a time which shouldn&apos;t overload.&lt;/p&gt;</comment>
                            <comment id="63709" author="jluhrsen" created="Tue, 26 Jun 2018 15:26:02 +0000"  >&lt;p&gt;it does, from what I can tell. It&apos;s using 10 threads in the job, so I assume that is up to 10 rest calls in parallel. I will be trying to get all of this running locally today and see what happens.&lt;/p&gt;</comment>
                            <comment id="63721" author="jluhrsen" created="Tue, 26 Jun 2018 22:17:43 +0000"  >&lt;p&gt;I was able to run this tool in our sandbox with only a single thread (which takes longer, as expected), but that too resulted in&lt;br/&gt;
restconf dying on us. back to the drawing board for ideas.&lt;/p&gt;

&lt;p&gt;for reference:&lt;br/&gt;
&lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/builder-copy-sandbox-logs/143/jamo-controller-csit-3node-rest-clust-cars-perf-only-fluorine/6/robot-plugin/log.html.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/builder-copy-sandbox-logs/143/jamo-controller-csit-3node-rest-clust-cars-perf-only-fluorine/6/robot-plugin/log.html.gz&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="63726" author="thapar" created="Wed, 27 Jun 2018 07:53:26 +0000"  >&lt;p&gt;I was able to run the tool locally and get the 500 error, kind of. All the cars addition went through in the end, but bringing leader down, up and then the new leader down and up again caused some 500 errors. Not sure if same issue as this one, but at least it shows the exception that shows up in 500.&lt;/p&gt;

&lt;p&gt;&amp;lt;title&amp;gt;Error 500 &amp;lt;/title&amp;gt;&lt;br/&gt;
&amp;lt;/head&amp;gt;&lt;br/&gt;
&amp;lt;body&amp;gt;&lt;br/&gt;
&amp;lt;h2&amp;gt;HTTP ERROR: 500&amp;lt;/h2&amp;gt;&lt;br/&gt;
&amp;lt;p&amp;gt;Problem accessing /jolokia/read/org.opendaylight.controller:type=DistributedConfigDatastore,Category=Shards,name=member-3-shard-default-config. Reason:&lt;br/&gt;
&amp;lt;pre&amp;gt;    org.apache.shiro.session.UnknownSessionException: There is no session with id &lt;span class=&quot;error&quot;&gt;&amp;#91;6cabdaaf-1727-4144-a075-07940bfe15c5&amp;#93;&lt;/span&gt;&amp;lt;/pre&amp;gt;&amp;lt;/p&amp;gt;&lt;br/&gt;
&amp;lt;hr /&amp;gt;&lt;br/&gt;
I&apos;ve attached the relevant karaf log.&lt;/p&gt;

&lt;p&gt;I was tracking status through Tim&apos;s monitor script in odltools and I saw 401 pop up briefly when follower became leader. I think it has something to do with whether the follower we&apos;re tracking becomes leader or not. Maybe we should log/track leadership status in the jobs that are running the test?&lt;/p&gt;</comment>
                            <comment id="63727" author="rovarga" created="Wed, 27 Jun 2018 12:56:14 +0000"  >&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;ReadTimeout: HTTPConnectionPool(host=&lt;span class=&quot;code-quote&quot;&gt;&apos;10.30.170.108&apos;&lt;/span&gt;, port=8181): Read timed out. (read timeout=5.0)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;this looks like a client-side timeout, i.e. RESTCONF has not responded within 5 seconds.&lt;/p&gt;</comment>
                            <comment id="63728" author="tpantelis" created="Wed, 27 Jun 2018 12:59:34 +0000"  >&lt;p&gt;yes it is - the next question is why is restconf/jetty not responding...&lt;/p&gt;</comment>
                            <comment id="63730" author="rovarga" created="Wed, 27 Jun 2018 13:15:00 +0000"  >&lt;p&gt;I suggest taking a look at &lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/73455/5&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/73455/5&lt;/a&gt; and &lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/73475/1&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/73475/1&lt;/a&gt; to see if they improve the situation (in which case this boils down to AAA). Otherwise we require debug logs to see what is going on in the restconf code.&lt;/p&gt;</comment>
                            <comment id="63731" author="tpantelis" created="Wed, 27 Jun 2018 13:22:18 +0000"  >&lt;p&gt;yeah - the aaa DS access was discussed earlier in this Jira (I know it&apos;s huge and hard to follow).&#160; To take that out of the equation, I suggested installing restconf noauth feature - I believe all subsequent testing has been done with that.&#160;&lt;/p&gt;</comment>
                            <comment id="63737" author="rovarga" created="Wed, 27 Jun 2018 13:53:40 +0000"  >&lt;p&gt;Right, but doesn&apos;t that just disable &lt;b&gt;authentication&lt;/b&gt;, with &lt;b&gt;authorization&lt;/b&gt; being still in the picture?&lt;/p&gt;</comment>
                            <comment id="63739" author="tpantelis" created="Wed, 27 Jun 2018 14:40:44 +0000"  >&lt;p&gt;hmm - I assumed it would disable both - have to check on that...&lt;/p&gt;</comment>
                            <comment id="63769" author="jluhrsen" created="Thu, 28 Jun 2018 04:14:37 +0000"  >&lt;p&gt;I&apos;ve taken the distro created from c/73475 which is on top of c/73455 and am running it against this&lt;br/&gt;
cars job in the sandbox and have seen 14 consecutive passes. This is with the tell-based protocol.&lt;/p&gt;

&lt;p&gt;this is promising.&lt;/p&gt;

&lt;p&gt;there is something failing now, however, that I wasn&apos;t expecting. The &lt;a href=&quot;https://jenkins.opendaylight.org/releng/user/jluhrsen/my-views/view/controller%203node/job/controller-csit-3node-rest-clust-cars-perf-only-fluorine/129/robot/controller-rest-clust-cars-perf.txt/010%20Crud%20Mdsal%20Perf/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;Crud mdsal perf suite &lt;/a&gt;&lt;br/&gt;
is having a test case (purchasing cars) that is coming up short of its 10k number. Haven&apos;t&lt;br/&gt;
looked any closer, but there is no node isolation/downing in this suite, and it seems to pass&lt;br/&gt;
fine in the releng (ask based, and without these aaa patches) every time.&lt;/p&gt;

&lt;p&gt;Anyway, let me keep collecting data with the aaa patches. I&apos;ll run some jobs without the&lt;br/&gt;
tell based protocol for starters.&lt;/p&gt;</comment>
                            <comment id="63770" author="jluhrsen" created="Thu, 28 Jun 2018 05:40:17 +0000"  >&lt;p&gt;right, so ask-based with the c/73475 distro is giving the same results as what we initially&lt;br/&gt;
reported in this jira (500 error &quot;read execution failed&quot;, seen on the first try).&lt;/p&gt;

&lt;p&gt;Wondering what to do now?&lt;/p&gt;

&lt;p&gt;Did we have an explanation for why a script talking to /restconf (to add cars) would get&lt;br/&gt;
a 500 internal error when the leader is isolated? &lt;/p&gt;

&lt;p&gt;Also, this info is coming from the scripts stdout and there is no hint of it in any of the&lt;br/&gt;
karaf.log files (that I can see). Any ideas why it&apos;s not coming as an ERROR in the&lt;br/&gt;
logs?&lt;/p&gt;
</comment>
                            <comment id="63777" author="tpantelis" created="Thu, 28 Jun 2018 12:09:57 +0000"  >&lt;p&gt;500 is internal server error - could come from anywhere. If it&apos;s emanating from restconf code then there should be additional error info in the output&#160;corresponding&#160;to the RFC (I know you&apos;ve seen it before). Either way we need to see the full HTTP response output.&lt;/p&gt;</comment>
                            <comment id="63780" author="jluhrsen" created="Thu, 28 Jun 2018 14:07:01 +0000"  >&lt;p&gt;do we need more than this:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
2018-06-28 04:32:23,034 INFO: Request started at Thu Jun 28 04:32:17 2018 finished with following detais
2018-06-28 04:32:23,034 INFO: &amp;lt;PreparedRequest [POST]&amp;gt; http:&lt;span class=&quot;code-comment&quot;&gt;//10.30.170.131:8181/restconf/config/car:cars
&lt;/span&gt;2018-06-28 04:32:23,034 INFO: Headers {&lt;span class=&quot;code-quote&quot;&gt;&apos;Content-Length&apos;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&apos;127&apos;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&apos;Content-Type&apos;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&apos;application/json&apos;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&apos;Authorization&apos;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&apos;Basic YWRtaW46YWRtaW4=&apos;&lt;/span&gt;}:
2018-06-28 04:32:23,035 INFO: Body: {&lt;span class=&quot;code-quote&quot;&gt;&quot;car-entry&quot;&lt;/span&gt;: [{&lt;span class=&quot;code-quote&quot;&gt;&quot;category&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;my_category&quot;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&quot;model&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;model701&quot;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&quot;manufacturer&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;my_manufacturer&quot;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&quot;id&quot;&lt;/span&gt;: 701, &lt;span class=&quot;code-quote&quot;&gt;&quot;year&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;2015&quot;&lt;/span&gt;}]}
2018-06-28 04:32:23,035 INFO: Response: {&lt;span class=&quot;code-quote&quot;&gt;&quot;errors&quot;&lt;/span&gt;:{&lt;span class=&quot;code-quote&quot;&gt;&quot;error&quot;&lt;/span&gt;:[{&lt;span class=&quot;code-quote&quot;&gt;&quot;error-type&quot;&lt;/span&gt;:&lt;span class=&quot;code-quote&quot;&gt;&quot;application&quot;&lt;/span&gt;,&lt;span class=&quot;code-quote&quot;&gt;&quot;error-tag&quot;&lt;/span&gt;:&lt;span class=&quot;code-quote&quot;&gt;&quot;operation-failed&quot;&lt;/span&gt;,&lt;span class=&quot;code-quote&quot;&gt;&quot;error-message&quot;&lt;/span&gt;:&lt;span class=&quot;code-quote&quot;&gt;&quot;read execution failed&quot;&lt;/span&gt;,&lt;span class=&quot;code-quote&quot;&gt;&quot;error-info&quot;&lt;/span&gt;:&lt;span class=&quot;code-quote&quot;&gt;&quot;ReadFailedException{message=Error checking DataExists &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; path /(urn:opendaylight:params:xml:ns:yang:controller:config:sal-clustering-it:car?revision=2014-08-18)cars/car-entry/car-entry[{(urn:opendaylight:params:xml:ns:yang:controller:config:sal-clustering-it:car?revision=2014-08-18)id=701}], errorList=[RpcError [message=Error checking DataExists &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; path /(urn:opendaylight:params:xml:ns:yang:controller:config:sal-clustering-it:car?revision=2014-08-18)cars/car-entry/car-entry[{(urn:opendaylight:params:xml:ns:yang:controller:config:sal-clustering-it:car?revision=2014-08-18)id=701}], severity=ERROR, errorType=APPLICATION, tag=operation-failed, applicationTag=&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;, info=&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;, cause=akka.pattern.AskTimeoutException: Ask timed out on [ActorSelection[Anchor(akka.tcp:&lt;span class=&quot;code-comment&quot;&gt;//opendaylight-cluster-data@10.30.170.170:2550/), 
Path(/user/shardmanager-config/member-1-shard-car-config/shard-car-member-2:datastore-config@0:7884_728#-1820175598)]] after [5000 ms]. Sender[&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;] sent message of type \&quot;&lt;/span&gt;org.opendaylight.controller.cluster.datastore.messages.DataExists\&lt;span class=&quot;code-quote&quot;&gt;&quot;.]]}\n\tat org.opendaylight.controller.cluster.datastore.RemoteTransactionContext$2.onComplete(RemoteTransactionContext.java:247)\n\tat akka.dispatch.OnComplete.internal(Future.scala:260)\n\tat akka.dispatch.OnComplete.internal(Future.scala:258)\n\tat akka.dispatch.japi$CallbackBridge.apply(Future.scala:188)\n\tat akka.dispatch.japi$CallbackBridge.apply(Future.scala:185)\n\tat scala.concurrent.impl.CallbackRunnable.run(Promise.scala:60)\n\tat akka.dispatch.BatchingExecutor$AbstractBatch.processBatch(BatchingExecutor.scala:55)\n\tat akka.dispatch.BatchingExecutor$BlockableBatch.$anonfun$run$1(BatchingExecutor.scala:91)\n\tat scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:12)\n\tat scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:81)\n\tat akka.dispatch.BatchingExecutor$BlockableBatch.run(BatchingExecutor.scala:91)\n\tat akka.dispatch.TaskInvocation.run(AbstractDispatcher.scala:40)\n\tat akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(ForkJoinExecutorConfigurator.scala:43)\n\tat akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)\n\tat akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)\n\tat akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)\n\tat akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)\nCaused by: akka.pattern.AskTimeoutException: Ask timed out on [ActorSelection[Anchor(akka.tcp://opendaylight-cluster-data@10.30.170.170:2550/), Path(/user/shardmanager-config/member-1-shard-car-config/shard-car-member-2:datastore-config@0:7884_728#-1820175598)]] after [5000 ms]. 
Sender[&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;] sent message of type \&quot;&lt;/span&gt;org.opendaylight.controller.cluster.datastore.messages.DataExists\&lt;span class=&quot;code-quote&quot;&gt;&quot;.\n\tat akka.pattern.PromiseActorRef$.$anonfun$defaultOnTimeout$1(AskSupport.scala:595)\n\tat akka.pattern.PromiseActorRef$.$anonfun$apply$1(AskSupport.scala:605)\n\tat akka.actor.Scheduler$$anon$4.run(Scheduler.scala:140)\n\tat scala.concurrent.Future$InternalCallbackExecutor$.unbatchedExecute(Future.scala:870)\n\tat scala.concurrent.BatchingExecutor.execute(BatchingExecutor.scala:109)\n\tat scala.concurrent.BatchingExecutor.execute$(BatchingExecutor.scala:103)\n\tat scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:868)\n\tat akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(LightArrayRevolverScheduler.scala:328)\n\tat akka.actor.LightArrayRevolverScheduler$$anon$4.executeBucket$1(LightArrayRevolverScheduler.scala:279)\n\tat akka.actor.LightArrayRevolverScheduler$$anon$4.nextTick(LightArrayRevolverScheduler.scala:283)\n\tat akka.actor.LightArrayRevolverScheduler$$anon$4.run(LightArrayRevolverScheduler.scala:235)\n\tat java.lang.&lt;span class=&quot;code-object&quot;&gt;Thread&lt;/span&gt;.run(&lt;span class=&quot;code-object&quot;&gt;Thread&lt;/span&gt;.java:748)\n&quot;&lt;/span&gt;}]}}
&lt;/span&gt;2018-06-28 04:32:23,035 INFO: &amp;lt;Response [500]&amp;gt; Internal Server Error
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="63782" author="tpantelis" created="Thu, 28 Jun 2018 14:12:39 +0000"  >&lt;p&gt;yeah that&apos;s the error info. The usual&#160;AskTimeoutException&#160;with ask-based.&lt;/p&gt;</comment>
                            <comment id="63783" author="jluhrsen" created="Thu, 28 Jun 2018 14:14:59 +0000"  >&lt;p&gt;ok.&lt;/p&gt;

&lt;p&gt;what next then?&lt;/p&gt;

&lt;p&gt;why is this not showing up in the karaf.log?&lt;/p&gt;

&lt;p&gt;how can we avoid this failure in CSIT?&lt;/p&gt;</comment>
                            <comment id="63784" author="tpantelis" created="Thu, 28 Jun 2018 14:23:32 +0000"  >&lt;p&gt;There should be a WARN logged by CDS. restconf only logs request failures to DEBUG I believe.&lt;/p&gt;

&lt;p&gt;You can&apos;t reliably avoid this with ask-based as discussed before. If a tx hits at the right time during leader transition it may time out.&lt;/p&gt;</comment>
                            <comment id="63786" author="tpantelis" created="Thu, 28 Jun 2018 14:29:33 +0000"  >&lt;p&gt;A recourse is to retry the request on the client side.&#160;&#160;&lt;/p&gt;</comment>
                            <comment id="63787" author="jluhrsen" created="Thu, 28 Jun 2018 14:32:38 +0000"  >&lt;p&gt;s/avoid/workaround&lt;/p&gt;

&lt;p&gt;We don&apos;t want to have any failures in these jobs. we want them to pass 100% every time so we&lt;br/&gt;
can use them as benchmarks without having to dig all the way to the bottom of each failure to finally realize it was a random chance ask-timeout.&lt;/p&gt;

&lt;p&gt;Do we have to add retries to the python tool? (looks like it already claims to do that)&lt;/p&gt;

&lt;p&gt;How long is the window expected to be here (MAX)?&lt;/p&gt;

&lt;p&gt;Can we tweak some timers in the clustering config to reduce the chance?&lt;/p&gt;
</comment>
                            <comment id="63790" author="tpantelis" created="Thu, 28 Jun 2018 15:13:29 +0000"  >&lt;p&gt;The window depends&#160;mostly on akka internals and how it detects failed nodes. Time out settings can be&#160;tweaked but there will always be a period of time of leader transition and re-convergence, no matter how small, where a tx can fail if inflight.&#160; The recourse is to retry the entire transaction, whether internally or on the client-side. ask-based does not retry&#160;inflight&#160;transactions - tell-based was introduced to do that.&#160; The test you&apos;re running can never be guaranteed 100% success with&#160;ask-based.&lt;/p&gt;</comment>
                            <comment id="63791" author="jluhrsen" created="Thu, 28 Jun 2018 15:36:09 +0000"  >&lt;p&gt;no...&lt;/p&gt;

&lt;p&gt;please help me figure this out.&lt;/p&gt;

&lt;p&gt;let&apos;s say you are a real user and you want to add 10k cars with an external client. So you start&lt;br/&gt;
adding them (X per request). Somewhere in the middle you get a 500 response to your request,&lt;br/&gt;
but you know that might happen because of a cluster hiccup.&lt;/p&gt;

&lt;p&gt;what are you going to do next?&lt;/p&gt;

&lt;p&gt;Is it just a simple retry of that exact same request?  Assuming that, how many times would you&lt;br/&gt;
retry, and how often? When do you give up and start worrying that something major is broken?&lt;/p&gt;</comment>
                            <comment id="63793" author="rovarga" created="Thu, 28 Jun 2018 15:56:39 +0000"  >&lt;p&gt;This is the usual state reconciliation. What you do depends on what you are doing, but a fair recovery strategy is:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;read what&apos;s there&lt;/li&gt;
	&lt;li&gt;figure out what needs to go there&lt;/li&gt;
	&lt;li&gt;continue pushing&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Retry/backoff/escalation is an application-specific thing &#8211; in telco&apos;s there&apos;s like 20% of ONAP to deal with that.&lt;/p&gt;</comment>
                            <comment id="63794" author="jluhrsen" created="Thu, 28 Jun 2018 16:05:58 +0000"  >&lt;p&gt;sure. I&apos;ll see what I can figure out from the client side here.&lt;/p&gt;

&lt;p&gt;still wondering things like:&lt;/p&gt;

&lt;p&gt;how long should I plan to deal with errored responses, before marking a failure that we will all know is legit showing a problem?&lt;/p&gt;

&lt;p&gt;btw, for this very specific case when we are trying to create X cars in a single request, and we get that 500 ask-timeout&lt;br/&gt;
response, does that mean the entire request failed, or is it possible that &quot;some&quot; of the cars would have been created?&lt;/p&gt;</comment>
                            <comment id="63795" author="rovarga" created="Thu, 28 Jun 2018 16:07:54 +0000"  >&lt;p&gt;AskTimeout means the transaction is doomed, other transactions are not affected (unless they report an error).&lt;/p&gt;

&lt;p&gt;As for granular error reporting ... yeah, someone has to go and define the error contracts on the NB.&lt;/p&gt;</comment>
                            <comment id="63796" author="jluhrsen" created="Thu, 28 Jun 2018 16:25:16 +0000"  >&lt;p&gt;it might be this simple:&lt;br/&gt;
  &lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/73209/13/tools/odl-mdsal-clustering-tests/scripts/cluster_rest_script.py&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/73209/13/tools/odl-mdsal-clustering-tests/scripts/cluster_rest_script.py&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;I have one example of this working, but it looks like there may be more than 2m of retries (see timestamps&lt;br/&gt;
below from the script output (first two lines)):&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;2018-06-28 15:43:20,657 INFO: Add 10000 car(s) to 10.30.170.52:8181 (1 per request)
2018-06-28 15:45:25,211 INFO: Response code(s) got per number of requests: {204: 1003}
2018-06-28 15:45:25,212 INFO: Response code(s) got per number of requests: {204: 996}
2018-06-28 15:45:25,233 INFO: Response code(s) got per number of requests: {204: 993}
2018-06-28 15:45:25,234 INFO: Response code(s) got per number of requests: {204: 998}
2018-06-28 15:45:25,234 INFO: Response code(s) got per number of requests: {204: 996}
2018-06-28 15:45:25,235 INFO: Response code(s) got per number of requests: {204: 1004}
2018-06-28 15:45:25,260 INFO: Response code(s) got per number of requests: {204: 1013}
2018-06-28 15:45:25,261 INFO: Response code(s) got per number of requests: {204: 1005}
2018-06-28 15:45:25,261 INFO: Response code(s) got per number of requests: {204: 1001}
2018-06-28 15:45:25,261 INFO: Response code(s) got per number of requests: {204: 991}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I still want to know proper timing bounds to use, so we can still have failures when real problems are under&lt;br/&gt;
the hood.&lt;/p&gt;</comment>
                            <comment id="63936" author="jluhrsen" created="Thu, 5 Jul 2018 20:40:18 +0000"  >&lt;p&gt;To update this one, with &lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/73475/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;this AAA patch&apos;s &lt;/a&gt; distribution, and &lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/73209/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;these small changes in the CSIT code &lt;/a&gt; we are&lt;br/&gt;
no longer hitting the major problem of RESTCONF being unavailable (read timeouts). We do still have some periods of inaccessibility where we&lt;br/&gt;
will get some 500 errors in ask-based, but those are expected (askTimeouts) and our python test tool is now made to retry on those. If/when&lt;br/&gt;
we merge the AAA and CSIT patches, we can close this bug.&lt;/p&gt;

&lt;p&gt;while that is good news, it should be noted that a new kind of failure is now coming with the AAA patch in a different suite in&lt;br/&gt;
the same job. So, if we merge the AAA patch, we will need a new jira to track it and find a way to fix it. the TL;DR for that is&lt;br/&gt;
that we are doing &quot;add cars&quot;, then &quot;add people&quot;, then &quot;purchase cars&quot; with validation between each. The validation for the&lt;br/&gt;
&quot;purchase&quot; step is failing where we are usually near, but not all the way to the full 10k purchases. This failure is there &lt;b&gt;most&lt;/b&gt;&lt;br/&gt;
of the time, but not always. There is no leader up/down or isolation in this suite. It happens in both ask and tell based&lt;br/&gt;
protocols. That&apos;s all I know at this point.&lt;/p&gt;</comment>
                            <comment id="64018" author="jluhrsen" created="Thu, 12 Jul 2018 05:49:35 +0000"  >&lt;p&gt;two aaa patches:&lt;br/&gt;
&lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/73475/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/73475/&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/73455/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/73455/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;and one int/test patch:&lt;br/&gt;
&lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/73209/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/73209/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="64019" author="jluhrsen" created="Thu, 12 Jul 2018 05:51:06 +0000"  >&lt;p&gt;The jobs are now passing 100% for this problem.&lt;/p&gt;

&lt;p&gt;A new, less severe problem is now happening in a different suite of&lt;br/&gt;
this same job. We will open a new bug for this.&lt;/p&gt;</comment>
                            <comment id="64026" author="ecelgp" created="Thu, 12 Jul 2018 18:59:23 +0000"  >&lt;p&gt;Awesome &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.opendaylight.org/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10003">
                    <name>Relates</name>
                                                                <inwardlinks description="relates to">
                                        <issuelink>
            <issuekey id="30150">NETVIRT-1315</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="14721" name="karaf.log.1.gz" size="364954" author="thapar" created="Wed, 27 Jun 2018 07:54:52 +0000"/>
                    </attachments>
                <subtasks>
                            <subtask id="30182">CONTROLLER-1839</subtask>
                            <subtask id="30183">CONTROLLER-1840</subtask>
                            <subtask id="30184">CONTROLLER-1841</subtask>
                            <subtask id="30185">CONTROLLER-1842</subtask>
                    </subtasks>
                <customfields>
                                                                            <customfield id="customfield_11400" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10002" key="com.pyxis.greenhopper.jira:gh-epic-link">
                        <customfieldname>Epic Link</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>NETVIRT-996</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    <customfield id="customfield_10000" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>0|i03fov:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>