<?xml version="1.0" encoding="UTF-8"?>
<!--
RSS generated by JIRA (8.20.10#820010-sha1:ace47f9899e9ee25d7157d59aa17ab06aee30d3d) at Wed Feb 07 19:56:36 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>OpenDaylight JIRA</title>
    <link>https://jira.opendaylight.org</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>8.20.10</version>
        <build-number>820010</build-number>
        <build-date>22-06-2022</build-date>
    </build-info>


<item>
            <title>[CONTROLLER-1849] controller not coming up healthy after being killed and restarted (401 after 5m)</title>
                <link>https://jira.opendaylight.org/browse/CONTROLLER-1849</link>
                <project id="10113" key="CONTROLLER">controller</project>
                    <description>&lt;p&gt;Here is an &lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/openflowplugin-csit-3node-clustering-only-oxygen/300/robot-plugin/log.html.gz#s1-s6-t2-k2-k6-k1-k1-k2-k8&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;example &lt;/a&gt; of this problem from the robot logs.&lt;/p&gt;

&lt;p&gt;the &lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/openflowplugin-csit-3node-clustering-only-oxygen/300/odl1_karaf.log.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;karaf log &lt;/a&gt; is pretty messy if you look from&lt;br/&gt;
the point when it was restarted until the end. Search on timestamp &quot;Jun 19, 2018 1:31:27&quot;&lt;br/&gt;
for when the restart was initiated.&lt;/p&gt;

&lt;p&gt;The controller was initially killed with &apos;kill -9&apos;&lt;/p&gt;

&lt;p&gt;The controller logs, and robot logs can be found &lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/openflowplugin-csit-3node-clustering-only-oxygen/300/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;here &lt;/a&gt;&lt;/p&gt;</description>
                <environment></environment>
        <key id="30272">CONTROLLER-1849</key>
            <summary>controller not coming up healthy after being killed and restarted (401 after 5m)</summary>
                <type id="10104" iconUrl="https://jira.opendaylight.org/secure/viewavatar?size=xsmall&amp;avatarId=10303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="5" iconUrl="https://jira.opendaylight.org/images/icons/priorities/trivial.svg">Lowest</priority>
                        <status id="5" iconUrl="https://jira.opendaylight.org/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="green"/>
                                    <resolution id="10001">Won&apos;t Do</resolution>
                                        <assignee username="tpantelis">Tom Pantelis</assignee>
                                    <reporter username="jluhrsen">Jamo Luhrsen</reporter>
                        <labels>
                            <label>csit:3node</label>
                    </labels>
                <created>Tue, 3 Jul 2018 22:12:39 +0000</created>
                <updated>Thu, 13 Dec 2018 03:57:20 +0000</updated>
                            <resolved>Thu, 13 Dec 2018 03:57:20 +0000</resolved>
                                    <version>Oxygen</version>
                                                        <due></due>
                            <votes>1</votes>
                                    <watches>7</watches>
                                                                                                                <comments>
                            <comment id="63888" author="jluhrsen" created="Tue, 3 Jul 2018 22:16:37 +0000"  >&lt;p&gt;&lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/openflowplugin-csit-3node-clustering-only-oxygen/291&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;another example &lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="63889" author="jluhrsen" created="Tue, 3 Jul 2018 22:19:02 +0000"  >&lt;p&gt;and &lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/openflowplugin-csit-3node-clustering-only-oxygen/288/robot-plugin/log.html.gz#s1-s6-t2-k2-k4-k1-k1-k2-k8-k1&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;another &lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="63892" author="tpantelis" created="Wed, 4 Jul 2018 01:59:37 +0000"  >&lt;p&gt;After the restart, here&apos;s a grep of all akka output:&lt;/p&gt;

&lt;p&gt;2018-06-19T13:31:55,180 | INFO&#160; | opendaylight-cluster-data-akka.actor.default-dispatcher-3 | Slf4jLogger&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; | 48 - com.typesafe.akka.slf4j - 2.5.11 | Slf4jLogger started&lt;/p&gt;

&lt;p&gt;2018-06-19T13:31:55,254 | INFO&#160; | opendaylight-cluster-data-akka.actor.default-dispatcher-4 | Remoting &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; | 48 - com.typesafe.akka.slf4j - 2.5.11 | Starting remoting&lt;/p&gt;

&lt;p&gt;2018-06-19T13:31:55,437 | INFO&#160; | opendaylight-cluster-data-akka.actor.default-dispatcher-15 | Remoting &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; | 48 - com.typesafe.akka.slf4j - 2.5.11 | Remoting started; listening on addresses :&lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;2018-06-19T13:31:55,466 | INFO&#160; | opendaylight-cluster-data-akka.actor.default-dispatcher-2 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550&amp;#93;&lt;/span&gt; - Starting up...&lt;/p&gt;

&lt;p&gt;2018-06-19T13:31:55,529 | INFO&#160; | opendaylight-cluster-data-akka.actor.default-dispatcher-15 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550&amp;#93;&lt;/span&gt; - Registered cluster JMX MBean &lt;span class=&quot;error&quot;&gt;&amp;#91;akka:type=Cluster&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;2018-06-19T13:31:55,530 | INFO&#160; | opendaylight-cluster-data-akka.actor.default-dispatcher-15 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550&amp;#93;&lt;/span&gt; - Started up successfully&lt;/p&gt;

&lt;p&gt;&lt;b&gt;2018-06-19T13:32:07,632 | INFO&#160; | opendaylight-cluster-data-akka.actor.default-dispatcher-14 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550&amp;#93;&lt;/span&gt; - Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550&amp;#93;&lt;/span&gt; is JOINING, roles &lt;span class=&quot;error&quot;&gt;&amp;#91;member-1, dc-default&amp;#93;&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;

&lt;p&gt;2018-06-19T13:32:07,648 | INFO&#160; | opendaylight-cluster-data-akka.actor.default-dispatcher-14 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550&amp;#93;&lt;/span&gt; - Leader is moving node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550&amp;#93;&lt;/span&gt; to &lt;span class=&quot;error&quot;&gt;&amp;#91;Up&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;2018-06-19T13:36:40,475 | INFO&#160; | SystemReadyService-0 | TestBundleDiag &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; | 238 - org.opendaylight.infrautils.ready-impl - 1.3.3.SNAPSHOT | OK com.typesafe.akka.slf4j:2.5.11: OSGi state = Active, Karaf bundleState = Active, due to: Declarative Services&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;What that tells me is that it wasn&apos;t able to connect to another node so it made itself leader and joined itself thereby forming a cluster island. This is member-1 which I assume is the first seed node listed so it has special behavior that allows it to do that (the other seed nodes cannot).&#160; &#160;This is governed by the&#160;seed-node-timeout setting in the akka.conf which we set to 12s. So from the log, between&#160;13:31:55 and&#160;13:32:07, which is 12s, it tried to connect to either of the other 2 nodes but was unable to do so, thus it thinks no other node is up and forms its own cluster. The shards don&apos;t hear from a leader so they go to Candidate and&#160;keep starting elections over and over trying to become leader.&lt;/p&gt;

&lt;p&gt;So either there was an issue in the CSIT environment that prevented connectivity or an issue with akka that prevented it from connecting to either of the other 2 nodes (or maybe a combination of both). Either way, can&apos;t tell anything more from this log. Do we still have the logs from the other 2 nodes? Perhaps there might be akka messages in one or both of those logs indicating connection attempts from member-1.&lt;/p&gt;

&lt;p&gt;I assume this Is this Oxygen? - can you&#160;set the affected version?&lt;/p&gt;</comment>
                            <comment id="63893" author="tpantelis" created="Wed, 4 Jul 2018 02:41:21 +0000"  >&lt;p&gt;Looking at odl2&apos;s log, I see a&#160;bunch of these repeated:&lt;/p&gt;

&lt;p&gt;2018-06-19T13:31:55,865 | INFO&#160; | opendaylight-cluster-data-akka.actor.default-dispatcher-3 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.123:2550&amp;#93;&lt;/span&gt; - Received InitJoin message from [Actor&lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550/system/cluster/core/daemon/firstSeedNodeProcess-1#915987295&amp;#93;&lt;/span&gt;] to &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.123:2550&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;2018-06-19T13:31:55,865 | INFO&#160; | opendaylight-cluster-data-akka.actor.default-dispatcher-3 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.123:2550&amp;#93;&lt;/span&gt; - Sending InitJoinAck message from node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.123:2550&amp;#93;&lt;/span&gt; to [Actor&lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550/system/cluster/core/daemon/firstSeedNodeProcess-1#915987295&amp;#93;&lt;/span&gt;]&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Followed by a bunch of these:&lt;/p&gt;

&lt;p&gt;2018-06-19T13:32:18,275 | INFO&#160; | opendaylight-cluster-data-akka.actor.default-dispatcher-24 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.123:2550&amp;#93;&lt;/span&gt; - Leader can currently not perform its duties, reachability status: [akka.tcp://opendaylight-cluster-data@10.30.170.123:2550 -&amp;gt; akka.tcp://opendaylight-cluster-data@10.30.170.115:2550: Unreachable &lt;span class=&quot;error&quot;&gt;&amp;#91;Unreachable&amp;#93;&lt;/span&gt; (3), akka.tcp://opendaylight-cluster-data@10.30.170.158:2550 -&amp;gt; akka.tcp://opendaylight-cluster-data@10.30.170.115:2550: Unreachable &lt;span class=&quot;error&quot;&gt;&amp;#91;Unreachable&amp;#93;&lt;/span&gt; (2)], member status: &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550 Up seen=false, akka.tcp://opendaylight-cluster-data@10.30.170.123:2550 Up seen=true, akka.tcp://opendaylight-cluster-data@10.30.170.158:2550 Up seen=true&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;And we see similar in odl3&apos;s log:&lt;/p&gt;

&lt;p&gt;2018-06-19T13:31:55,862 | INFO&#160; | opendaylight-cluster-data-akka.actor.default-dispatcher-3 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.158:2550&amp;#93;&lt;/span&gt; - Received InitJoin message from [Actor&lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550/system/cluster/core/daemon/firstSeedNodeProcess-1#915987295&amp;#93;&lt;/span&gt;] to &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.158:2550&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;2018-06-19T13:31:55,862 | INFO&#160; | opendaylight-cluster-data-akka.actor.default-dispatcher-3 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.158:2550&amp;#93;&lt;/span&gt; - Sending InitJoinAck message from node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.158:2550&amp;#93;&lt;/span&gt; to [Actor&lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550/system/cluster/core/daemon/firstSeedNodeProcess-1#915987295&amp;#93;&lt;/span&gt;]&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;So join messages are getting thru to the other nodes but no indication why odl1 (10.30.170.115) was not joined back in. Another akka mystery. I can try opening a bug in akka and see if we get any support.&lt;/p&gt;</comment>
                            <comment id="63906" author="tpantelis" created="Thu, 5 Jul 2018 02:08:04 +0000"  >&lt;p&gt;So odl1 was restarted several times over the course of an hour. The time previous to 06-19T13:31:27 when it didn&apos;t rejoin was at 06-19T13:30:06. On odl2, we see:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2018-06-19T13:30:32,044 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-25 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.123:2550] - Received InitJoin message from [Actor[akka.tcp://opendaylight-cluster-data@10.30.170.115:2550/system/cluster/core/daemon/firstSeedNodeProcess-1#383916203]] to [akka.tcp://opendaylight-cluster-data@10.30.170.123:2550]

2018-06-19T13:30:32,045 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-25 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.123:2550] - Sending InitJoinAck message from node [akka.tcp://opendaylight-cluster-data@10.30.170.123:2550] to [Actor[akka.tcp://opendaylight-cluster-data@10.30.170.115:2550/system/cluster/core/daemon/firstSeedNodeProcess-1#383916203]]

2018-06-19T13:30:32,146 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-25 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.123:2550] - New incarnation of existing member [Member(address = akka.tcp://opendaylight-cluster-data@10.30.170.115:2550, status = Up)] is trying to join. Existing will be removed from the cluster and then new member will be allowed to join.

2018-06-19T13:30:32,147 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-25 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.123:2550] - Marking unreachable node [akka.tcp://opendaylight-cluster-data@10.30.170.115:2550] as [Down]

2018-06-19T13:30:33,282 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-4 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.123:2550] - Leader is removing unreachable node [akka.tcp://opendaylight-cluster-data@10.30.170.115:2550]

2018-06-19T13:30:42,777 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-9 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.123:2550] - Received InitJoin message from [Actor[akka.tcp://opendaylight-cluster-data@10.30.170.115:2550/system/cluster/core/daemon/firstSeedNodeProcess-2#-1096982950]] to [akka.tcp://opendaylight-cluster-data@10.30.170.123:2550]

2018-06-19T13:30:42,777 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-9 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.123:2550] - Sending InitJoinAck message from node [akka.tcp://opendaylight-cluster-data@10.30.170.123:2550] to [Actor[akka.tcp://opendaylight-cluster-data@10.30.170.115:2550/system/cluster/core/daemon/firstSeedNodeProcess-2#-1096982950]]

2018-06-19T13:30:42,825 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-9 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.123:2550] - Node [akka.tcp://opendaylight-cluster-data@10.30.170.115:2550] is JOINING, roles [member-1, dc-default]

2018-06-19T13:30:44,278 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-25 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.123:2550] - Leader is moving node [akka.tcp://opendaylight-cluster-data@10.30.170.115:2550] to [Up]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;On that restart, odl1 was let back in as expected. About 30 sec later it was killed again:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2018-06-19T13:31:15,640 | WARN  | opendaylight-cluster-data-akka.actor.default-dispatcher-9 | NettyTransport                   | 48 - com.typesafe.akka.slf4j - 2.5.11 | Remote connection to [/10.30.170.115:50464] failed with java.io.IOException: Connection reset by peer

2018-06-19T13:31:15,642 | WARN  | opendaylight-cluster-data-akka.actor.default-dispatcher-22 | ReliableDeliverySupervisor       | 48 - com.typesafe.akka.slf4j - 2.5.11 | Association with remote system [akka.tcp://opendaylight-cluster-data@10.30.170.115:2550] has failed, address is now gated for [5000] ms. Reason: [Disassociated] 

2018-06-19T13:31:19,275 | WARN  | opendaylight-cluster-data-akka.actor.default-dispatcher-9 | ClusterCoreDaemon                | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.123:2550] - Marking node(s) as UNREACHABLE [Member(address = akka.tcp://opendaylight-cluster-data@10.30.170.115:2550, status = Up)]. Node roles [member-2, dc-default]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So odl1 was killed and restarted twice in &amp;lt; 2 minutes. The second time it wasn&apos;t allowed back into the cluster within the 12s seed node timeout period. I wonder if there&apos;s some logic or edge case in akka which caused it to delay the rejoin due to the quick kills and restarts. Maybe a longer seed-node-timeout setting like 25s would help. I can try to reproduce manually by doing quick kills/restarts.&lt;/p&gt;</comment>
                            <comment id="63922" author="tpantelis" created="Thu, 5 Jul 2018 13:16:33 +0000"  >&lt;p&gt;So looking at odl1 during the prior successful restart at&#160;06-19T13:30:06, we see:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2018-06-19T13:30:42,821 | INFO&#160; | opendaylight-cluster-data-akka.actor.default-dispatcher-6 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.115:2550] - Received InitJoinAck message from [Actor[akka.tcp://opendaylight-cluster-data@10.30.170.123:2550/system/cluster/core/daemon#1793055218]] to [akka.tcp://opendaylight-cluster-data@10.30.170.115:2550]

2018-06-19T13:30:42,854 | INFO&#160; | opendaylight-cluster-data-akka.actor.default-dispatcher-22 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.115:2550] - Welcome from [akka.tcp://opendaylight-cluster-data@10.30.170.123:2550]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So akka does print out the InitJoinAck response from another node. We don&apos;t see this during the 06-19T13:31:27 restart. I wonder if it&apos;s possible there was only one-way network communication between 10.30.170.115 and the other nodes during this time - the behavior seems to indicate that...&lt;/p&gt;</comment>
                            <comment id="63937" author="jluhrsen" created="Thu, 5 Jul 2018 20:48:22 +0000"  >&lt;p&gt;I am assuming (based on newer comments) that you found the other logs&lt;br/&gt;
you wanted.&lt;/p&gt;

&lt;p&gt;yes, oxygen (fyi, you can see what version in the url of the csit job). I have&lt;br/&gt;
updated the affected version (sorry for forgetting that)&lt;/p&gt;</comment>
                            <comment id="63938" author="tpantelis" created="Thu, 5 Jul 2018 20:52:05 +0000"  >&lt;p&gt;yeah I notices your link afterwards. I figured it was Oxygen since I saw config system output but I purged that stuff from Fluorine &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.opendaylight.org/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="63939" author="jluhrsen" created="Thu, 5 Jul 2018 21:03:58 +0000"  >&lt;p&gt;I&apos;m trying to better understand how the suite is working (it&apos;s new to me as&lt;br/&gt;
well), but here&apos;s the high level robot code:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;    Kill Switchs Old Owner    ${switch_name}
    Restart Switchs Old Owner    ${switch_name}
    Kill Switchs Successor    ${switch_name}
    Restart Switchs Successor    ${switch_name}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The kill keywords are first figuring out the &quot;owner&quot; or &quot;successor&quot; before&lt;br/&gt;
killing the controller. So, in this specific case I think the first keyword finds&lt;br/&gt;
that odl1 was found to be the Owner and killed. So, I guess odl2 or odl3&lt;br/&gt;
would now become the owner. Then odl1 is restarted and maybe becomes&lt;br/&gt;
the successor for that switch. If that&apos;s the case, then the step to kill the&lt;br/&gt;
successory will again take down odl1 and restart it.&lt;/p&gt;

&lt;p&gt;I&apos;m hoping that infra communication being broken and only one-way is&lt;br/&gt;
so unlikely that we can ignore that for now, but I have no way to prove&lt;br/&gt;
that.&lt;/p&gt;

&lt;p&gt;I suppose one effort we can take is to write some automation to quickly&lt;br/&gt;
take a controller down/up and check for whatever the smoking gun is in&lt;br/&gt;
this case. &lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=tpantelis&quot; class=&quot;user-hover&quot; rel=&quot;tpantelis&quot;&gt;tpantelis&lt;/a&gt; is there some string or set of strings we can poll&lt;br/&gt;
on in the karaf.log to know if we hit this &quot;controller on an island&quot;&lt;br/&gt;
scenario? I can work on something to run repeatedly looking for this&lt;br/&gt;
problem.&lt;/p&gt;</comment>
                            <comment id="63940" author="tpantelis" created="Thu, 5 Jul 2018 21:25:04 +0000"  >&lt;p&gt;yeah the broken infra one-way communication theory seems unlikely. But the behavior seen in the logs  is consistent with there being broken one-way communication somewhere so that&apos;s why I mentioned it. The logs show odl1 sent InitAckJoin to both odl2 and odl3 and both show they replied with InitAckJoinAck. But we don&apos;t the see the InitAckJoinAck received message in odl1 as is seen in a successful rejoin scenario. We also don&apos;t see odl2 or odl3 marking odl1 as REACHABLE even tho both received multiple messages from odl1. According to my experience with akka and reading their docs, there shouldn&apos;t be any reason odl2 or odl3 shouldn&apos;t allow odl1 back in provided there&apos;s full 2-way communication. Unless there&apos;s some edge-case bug in akka where it still has stale state for odl1....  akka has strict rules wrt convergence where all surviving nodes have to agree to remove the old node incarnation and let the new node incarnation join.&lt;/p&gt;

&lt;p&gt;I&apos;m going to try locally to keep killing/restarting the first seed node - hopefully tomorrow.    &lt;/p&gt;

&lt;p&gt;As far as what to look for....  you don&apos;t want to see odl1 reporting this after the restart:&lt;/p&gt;

&lt;p&gt;2018-06-19T13:32:07,632 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-14 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550&amp;#93;&lt;/span&gt; - Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550&amp;#93;&lt;/span&gt; is JOINING, roles &lt;span class=&quot;error&quot;&gt;&amp;#91;member-1, dc-default&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;2018-06-19T13:32:07,648 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-14 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550&amp;#93;&lt;/span&gt; - Leader is moving node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550&amp;#93;&lt;/span&gt; to &lt;span class=&quot;error&quot;&gt;&amp;#91;Up&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;That means it joined itself and &quot;Up&quot;ed itself.  You want to see &lt;/p&gt;

&lt;p&gt;2018-06-19T13:30:42,854 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-22 | Cluster(akka://opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.115:2550&amp;#93;&lt;/span&gt; - Welcome from &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.123:2550&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;meaning one of the other 2 nodes, 10.30.170.123 in this case, let it back into the surviving cluster.&lt;/p&gt;</comment>
                            <comment id="63941" author="jluhrsen" created="Thu, 5 Jul 2018 21:41:33 +0000"  >&lt;p&gt;gotcha, so if I just write some script to down/up a single node and check&lt;br/&gt;
for &quot;$ip.&lt;b&gt;Leader is moving node.&lt;/b&gt;$ip.* to &lt;span class=&quot;error&quot;&gt;&amp;#91;Up&amp;#93;&lt;/span&gt;&quot; and exit if found, I&apos;ll catch&lt;br/&gt;
this island case. I can also poll for that &quot;Welcome&quot; message to know&lt;br/&gt;
when to keep trying.&lt;/p&gt;</comment>
                            <comment id="63942" author="tpantelis" created="Thu, 5 Jul 2018 21:49:19 +0000"  >&lt;p&gt;It has to be a 3-node setup where you down/up the first seed node listed in the akka.conf.  As I mentioned earlier, the first seed node has special significance in that it bootstraps the cluster so it must be running in order to initially form a cluster and is the only node that can join itself. All subsequent seed nodes listed must be able to contact another seed and join with it. So you wouldn&apos;t see this 2 cluster island issue if a non-first seed node  (odl2 or odl3) were restarted. &lt;/p&gt;</comment>
                            <comment id="63943" author="jluhrsen" created="Thu, 5 Jul 2018 22:05:28 +0000"  >&lt;p&gt;ok, makes sense. Can you tell me how to figure out the first seed node? I assume there is something in the logs.&lt;/p&gt;</comment>
                            <comment id="63944" author="tpantelis" created="Thu, 5 Jul 2018 22:51:54 +0000"  >&lt;p&gt;Open configuration/initial/akka.conf - look for &quot;seed-nodes = ...&quot;   - the first one in that list.&lt;/p&gt;</comment>
                            <comment id="63960" author="tpantelis" created="Sun, 8 Jul 2018 16:16:39 +0000"  >&lt;p&gt;I think it would be good to determine if this issue also occurs if the node is gracefully shut down. I read somewhere that&#160;forcefully killing a process does not cleanly shut down the ports and thus a&#160;port&#160;may stay blocked for a while depending on the operating system.&#160;I&apos;m not sure how such a state would manifest in akka however - we didn&apos;t see any error/failure reported by akka.&lt;/p&gt;</comment>
                            <comment id="63975" author="jluhrsen" created="Mon, 9 Jul 2018 16:31:53 +0000"  >&lt;p&gt;this is obviously a good idea and will be a good data point, but I can&apos;t really do this at this time. This&lt;br/&gt;
problem is only seen infrequently in CSIT and I haven&apos;t reproduced it locally. So, without a solid&lt;br/&gt;
way to hit it, adding the graceful shutdown to CSIT won&apos;t tell us much.&lt;/p&gt;

&lt;p&gt;I am currently trying to reproduce locally.&lt;/p&gt;</comment>
                            <comment id="63981" author="jluhrsen" created="Mon, 9 Jul 2018 22:49:43 +0000"  >&lt;p&gt;here&apos;s a simple bash script I&apos;m using to try to locally reproduce. I&apos;m using the netvirt-ha-docker tooling to make a local cluster.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;#!/bin/bash

&lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;true&lt;/span&gt;
&lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt;
  sudo docker exec -i odl_172.28.5.1 ps aux | 
                                     grep karaf.main | grep -v grep | 
                                     awk &lt;span class=&quot;code-quote&quot;&gt;&apos;{print &lt;span class=&quot;code-quote&quot;&gt;&quot;kill -9&quot;&lt;/span&gt;,$2}&apos;&lt;/span&gt; | 
                                     sudo docker exec -i odl_172.28.5.1 sh
  # pick a random value between 1-90 seconds &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; when we start karaf again
  sleepy_time=$((RANDOM%90))
  echo &lt;span class=&quot;code-quote&quot;&gt;&quot;Waiting $sleepy_time seconds after killing karaf, before starting&quot;&lt;/span&gt;
  sleep $sleepy_time
  sudo docker exec -i odl_172.28.5.1 /odlha/karaf/target/assembly/bin/start

  tries=0
  until [ $tries -ge 60 ]
  &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt;
      ((tries++))
      echo &lt;span class=&quot;code-quote&quot;&gt;&quot;$tries th check &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 200 code&quot;&lt;/span&gt;
      RESP=$(curl --silent --write-out &lt;span class=&quot;code-quote&quot;&gt;&quot;HTTPSTATUS:%{http_code}&quot;&lt;/span&gt; -u &lt;span class=&quot;code-quote&quot;&gt;&quot;admin:admin&quot;&lt;/span&gt; http:&lt;span class=&quot;code-comment&quot;&gt;//172.28.5.1:8181/jolokia/read/org.opendaylight.controller:Category\=ShardManager,name\=shard-manager-config,type\=DistributedConfigDatastore)
&lt;/span&gt;      CODE=$(echo $RESP | tr -d &lt;span class=&quot;code-quote&quot;&gt;&apos;\n&apos;&lt;/span&gt; | sed -e &lt;span class=&quot;code-quote&quot;&gt;&apos;s/.*HTTPSTATUS:&lt;span class=&quot;code-comment&quot;&gt;//&apos;&lt;/span&gt;)
&lt;/span&gt;
      &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; [ $CODE -eq &lt;span class=&quot;code-quote&quot;&gt;&quot;200&quot;&lt;/span&gt; ]; then
          &lt;span class=&quot;code-keyword&quot;&gt;break&lt;/span&gt;
      fi
      sleep 2;
  done

  # we&apos;ll have looped &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 120s (60 tries) above &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; 200 never came, so maybe things are broken
  &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; [ $tries -eq 60 ]; then
      echo &lt;span class=&quot;code-quote&quot;&gt;&quot;Might have caught the bugger&quot;&lt;/span&gt;; exit 1
  fi
done
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="63982" author="jluhrsen" created="Mon, 9 Jul 2018 23:22:06 +0000"  >&lt;p&gt;I basically tried the above script manually all morning (maybe 20-30 times), but once I scripted it and stopped looking&lt;br/&gt;
I hit the bug on the 13th iteration.&lt;/p&gt;

&lt;p&gt;I do see this (smoking gun, I think):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
2018-07-09T23:11:59,187 | INFO  | opendaylight-cluster-data-akka.actor.&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-dispatcher-59 | Cluster(akka:&lt;span class=&quot;code-comment&quot;&gt;//opendaylight-cluster-data) | 48 - com.typesafe.akka.slf
&lt;/span&gt;4j - 2.5.11 | Cluster Node [akka.tcp:&lt;span class=&quot;code-comment&quot;&gt;//opendaylight-cluster-data@172.28.5.1:2550] - Leader is moving node [akka.tcp://opendaylight-cluster-data@172.28.5.1:2550] to [
&lt;/span&gt;Up]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=tpantelis&quot; class=&quot;user-hover&quot; rel=&quot;tpantelis&quot;&gt;tpantelis&lt;/a&gt;, let me try to reproduce this a few more times so we can get a sense of how frequent I can hit it.&lt;/p&gt;

&lt;p&gt;Assuming I can hit it with regularity, I&apos;ll try your idea of doing a &quot;stop&quot; instead of a &quot;kill -9&quot;.&lt;/p&gt;

&lt;p&gt;In the meantime, is there anything else to try/look for while doing this to help get to the root cause.&lt;/p&gt;

&lt;p&gt;BTW, this is all done on a single laptop, so I hope that can rule out infra networking issues.&lt;/p&gt;</comment>
                            <comment id="63983" author="tpantelis" created="Mon, 9 Jul 2018 23:40:39 +0000"  >&lt;p&gt;One other thing I can think of is to bump the&#160;seed-node-timeout from 12s to say 25s. It&apos;s specified in&#160; the&#160;configuration/factory/akka.conf but that file is overwritten on feature install. You&apos;d want to override in&#160;configuration/initial/akka.conf which is preserved.&lt;/p&gt;

&lt;p&gt;I think running 3 instances on the same box (separate dockers I assume) should rule out networking issues. Given that, it&apos;s likely some edge case issue in akka. As mentioned before it may have to do with killing/restarting in relatively quick succession. Let&apos;s try to characterize it as much as we can then we can open an issue with akka and see what happens from there.&lt;/p&gt;</comment>
                            <comment id="64009" author="jluhrsen" created="Tue, 10 Jul 2018 23:02:52 +0000"  >&lt;p&gt;I posted &lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/73899/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;my script to gerrit &lt;/a&gt; since I kept tweaking&lt;br/&gt;
it and wanted to make it available for others if they were so inclined to try this out locally as well.&lt;/p&gt;

&lt;p&gt;Over the past 24 hours I was able to reproduce this 6 times. Each reproduction took iterations of&lt;br/&gt;
71, 13, 20, 25, 205, 30 tries. This was using the default seed-node-timeout of 12s.&lt;/p&gt;

&lt;p&gt;Currently, I&apos;m running this script with seed-node-timeout of 30s. I will report back.&lt;/p&gt;</comment>
                            <comment id="64067" author="ecelgp" created="Tue, 17 Jul 2018 16:21:33 +0000"  >&lt;p&gt;Jamo, did you finally try with seed-node-timeout of 30s?&lt;br/&gt;
I am asking because I saw the issue yesterday:&lt;br/&gt;
&lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/openflowplugin-csit-3node-clustering-only-fluorine/147/robot-plugin/log.html.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/openflowplugin-csit-3node-clustering-only-fluorine/147/robot-plugin/log.html.gz&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="64068" author="jluhrsen" created="Tue, 17 Jul 2018 21:07:51 +0000"  >&lt;p&gt;I was reproducing these kinds of issues locally, and for a week I was able to hit this&lt;br/&gt;
kind of issue where killing/restarting a node was giving me a 401 unauthorized&lt;br/&gt;
after automating it and letting it try over and over. Another problem that might be&lt;br/&gt;
related is that cluster syncstatus gets stuck at &apos;false&apos;. I think that is tracked in &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1768&quot; title=&quot;SyncStatus stays false for more than 5minutes after bringing 2 of 3 nodes down and back up.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1768&quot;&gt;&lt;del&gt;CONTROLLER-1768&lt;/del&gt;&lt;/a&gt;.&lt;br/&gt;
This &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1768&quot; title=&quot;SyncStatus stays false for more than 5minutes after bringing 2 of 3 nodes down and back up.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1768&quot;&gt;&lt;del&gt;CONTROLLER-1768&lt;/del&gt;&lt;/a&gt; bug was coming when I moved seed-node-timeout to 30s,&lt;br/&gt;
but I was not able to locally reproduce this bug &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1849&quot; title=&quot;controller not coming up healthy after being killed and restarted (401 after 5m)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1849&quot;&gt;&lt;del&gt;CONTROLLER-1849&lt;/del&gt;&lt;/a&gt; any longer,&lt;br/&gt;
even when moving back to 12s.&lt;/p&gt;

&lt;p&gt;During our kernel call today, one idea was that we can do away with dynamicAuthorization&lt;br/&gt;
altogether. &lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=tpantelis&quot; class=&quot;user-hover&quot; rel=&quot;tpantelis&quot;&gt;tpantelis&lt;/a&gt; mentioned he could push a patch for that, but it looked simple&lt;br/&gt;
enough so I already made one.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/74157/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/74157/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="64069" author="tpantelis" created="Tue, 17 Jul 2018 22:16:55 +0000"  >&lt;p&gt;Looking at the logs from Luis test, we see similar behavior where odl1 didn&apos;t rejoin the cluster and joined itself after 22s. It did get an InitJoinAck response from odl2.&lt;/p&gt;

&lt;p&gt;odl1:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2018-07-16T11:18:20,251 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-21 | Cluster(akka://opendaylight-cluster-data) | 41 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.94:2550] - Received InitJoinAck message from [Actor[akka.tcp://opendaylight-cluster-data@10.30.170.26:2550/system/cluster/core/daemon#-1596937295]] to [akka.tcp://opendaylight-cluster-data@10.30.170.94:2550]

2018-07-16T11:18:42,585 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-19 | Cluster(akka://opendaylight-cluster-data) | 41 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.94:2550] - Node [akka.tcp://opendaylight-cluster-data@10.30.170.94:2550] is JOINING, roles [member-1, dc-default]

2018-07-16T11:18:42,614 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-19 | Cluster(akka://opendaylight-cluster-data) | 41 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.94:2550] - Leader is moving node [akka.tcp://opendaylight-cluster-data@10.30.170.94:2550] to [Up]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt; 

&lt;p&gt;odl2:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; 
2018-07-16T11:18:14,848 | WARN  | opendaylight-cluster-data-akka.actor.default-dispatcher-21 | ReliableDeliverySupervisor       | 41 - com.typesafe.akka.slf4j - 2.5.11 | Association with remote system [akka.tcp://opendaylight-cluster-data@10.30.170.94:2550] has failed, address is now gated for [5000] ms. Reason: [Association failed with [akka.tcp://opendaylight-cluster-data@10.30.170.94:2550]] Caused by: [Connection refused: /10.30.170.94:2550]

2018-07-16T11:18:20,105 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-2 | Cluster(akka://opendaylight-cluster-data) | 41 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.26:2550] - Received InitJoin message from [Actor[akka.tcp://opendaylight-cluster-data@10.30.170.94:2550/system/cluster/core/daemon/firstSeedNodeProcess-1#-2094984711]] to [akka.tcp://opendaylight-cluster-data@10.30.170.26:2550]

2018-07-16T11:18:20,106 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-2 | Cluster(akka://opendaylight-cluster-data) | 41 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.26:2550] - Sending InitJoinAck message from node [akka.tcp://opendaylight-cluster-data@10.30.170.26:2550] to [Actor[akka.tcp://opendaylight-cluster-data@10.30.170.94:2550/system/cluster/core/daemon/firstSeedNodeProcess-1#-2094984711]]

2018-07-16T11:18:20,291 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-18 | Cluster(akka://opendaylight-cluster-data) | 41 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.26:2550] - New incarnation of existing member [Member(address = akka.tcp://opendaylight-cluster-data@10.30.170.94:2550, status = Up)] is trying to join. *Existing will be removed from the cluster and then new member will be allowed to join.*

2018-07-16T11:18:20,292 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-18 | Cluster(akka://opendaylight-cluster-data) | 41 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.26:2550] - *Marking unreachable node [akka.tcp://opendaylight-cluster-data@10.30.170.94:2550] as [Down]*

2018-07-16T11:18:21,226 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-22 | Cluster(akka://opendaylight-cluster-data) | 41 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.26:2550] - *Leader is removing unreachable node [akka.tcp://opendaylight-cluster-data@10.30.170.94:2550]*

2018-07-16T11:18:30,564 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-2 | Cluster(akka://opendaylight-cluster-data) | 41 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.26:2550] - Received InitJoin message from [Actor[akka.tcp://opendaylight-cluster-data@10.30.170.94:2550/system/cluster/core/daemon/firstSeedNodeProcess-2#2006731653]] to [akka.tcp://opendaylight-cluster-data@10.30.170.26:2550]

2018-07-16T11:18:30,565 | INFO  | opendaylight-cluster-data-akka.actor.default-dispatcher-2 | Cluster(akka://opendaylight-cluster-data) | 41 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.170.26:2550] - Sending InitJoinAck message from node [akka.tcp://opendaylight-cluster-data@10.30.170.26:2550] to [Actor[akka.tcp://opendaylight-cluster-data@10.30.170.94:2550/system/cluster/core/daemon/firstSeedNodeProcess-2#2006731653]]

several more InitJoin/InitJoinAck sequences...

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt; 

&lt;p&gt;Similar InitJoin/InitJoinAck sequences on odl3. &lt;/p&gt;

&lt;p&gt;In this sequence, odl2 actually saw that odl1 was restarted with a new incarnation, marked it as Down, and stated it was removing the previous unreachable incarnation. It then is supposed to allow the new incarnation in but never did which led odl1 to join itself after the 12s seed-node-timeout. Strange. I would say this definitely is an issue with akka. &lt;/p&gt;</comment>
                            <comment id="64070" author="tpantelis" created="Tue, 17 Jul 2018 23:13:30 +0000"  >&lt;p&gt;I opened &lt;a href=&quot;https://github.com/akka/akka/issues/25361&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/akka/akka/issues/25361&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="64124" author="ecelgp" created="Thu, 19 Jul 2018 21:16:05 +0000"  >&lt;p&gt;When you run OF cluster test many times like I did in sandbox this week, I can see the issue of controller not joining the cluster after kill+start also occurs in instances different than 1. For example see this run in sandbox where instance 3 is killed and started and does not join:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://logs.opendaylight.org/sandbox/vex-yul-odl-jenkins-2/openflowplugin-csit-3node-clustering-only-fluorine/50/robot-plugin/log.html.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/sandbox/vex-yul-odl-jenkins-2/openflowplugin-csit-3node-clustering-only-fluorine/50/robot-plugin/log.html.gz&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;I think the root cause for this may be the same as with instance 1, so do we want to open a new bug or just track it here?&lt;/p&gt;</comment>
                            <comment id="64125" author="tpantelis" created="Thu, 19 Jul 2018 21:21:04 +0000"  >&lt;p&gt;It&apos;s not exactly the same scenario since only the first seed node can join itself - the others cannot and will try forever to re-join. But I would say the fact that it doesn&apos;t rejoin is the same issue. So I think we can track it here.&lt;/p&gt;</comment>
                            <comment id="64126" author="ecelgp" created="Thu, 19 Jul 2018 21:21:36 +0000"  >&lt;p&gt;Also from my test this is the most frequent issue the OF cluster suite has today, it is also very apparent because during the suite we are killing and starting instances a few times. &lt;/p&gt;</comment>
                            <comment id="64127" author="tpantelis" created="Thu, 19 Jul 2018 21:23:11 +0000"  >&lt;p&gt;Have you tried w/o killing it, ie shut it down gracefully? It would be interesting to see if it happens in that case.&lt;/p&gt;</comment>
                            <comment id="64128" author="ecelgp" created="Thu, 19 Jul 2018 21:24:50 +0000"  >&lt;p&gt;Yes, that was my point, at the root there is the issue of an instance not joining regardless of what happens later. I really thought this problem was resolved in akka recent upgrade but in the end it just happens less often than before.&lt;/p&gt;</comment>
                            <comment id="64129" author="tpantelis" created="Thu, 19 Jul 2018 21:27:16 +0000"  >&lt;p&gt;right - failure to rejoin is the root issue. &lt;/p&gt;</comment>
                            <comment id="64130" author="ecelgp" created="Thu, 19 Jul 2018 21:31:38 +0000"  >&lt;p&gt;Yes, I think shutdown works or fails less often, let me change most kills to shutdown and see what happens.&lt;/p&gt;</comment>
                            <comment id="64131" author="tpantelis" created="Thu, 19 Jul 2018 21:40:18 +0000"  >&lt;p&gt;I&apos;d like to see if it doesn&apos;t fail at all if shutdown gracefully. If that&apos;s the case then that gives us another data point to further characterize the issue and we can report that on the akka issue.&lt;/p&gt;</comment>
                            <comment id="64144" author="ecelgp" created="Sat, 21 Jul 2018 00:05:34 +0000"  >&lt;p&gt;After 1 day running I only see failure in the kill+start scenario which means something in the kill adds to the problem.&lt;/p&gt;</comment>
                            <comment id="64146" author="tpantelis" created="Sun, 22 Jul 2018 13:50:25 +0000"  >&lt;p&gt;I got a response from akka folks - they&apos;ve asked us to enable debug.&lt;/p&gt;

&lt;p&gt;In configuration/initial/akka.conf, merge in these settings:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;akka {
  loglevel = DEBUG 
  remote { 
     ...
     log-received-messages = on 
     log-sent-messages = on
  }
 ...
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In etc/org.ops4j.pax.logging.cfg, add:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;log4j2.logger.cluster.name=akka.cluster
log4j2.logger.cluster.level=DEBUG
log4j2.logger.remote.name=akka.remote
log4j2.logger.remote.level=DEBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This will produce a lot of output so I&apos;d suggest to pare down the tests to only what&apos;s needed to repro, eg don&apos;t re-run the tests with tell-based and don&apos;t run the script that creates 10k cars. Also when restarting the node after kill, delete the previous log so it starts with a clean one.&lt;/p&gt;</comment>
                            <comment id="64147" author="ecelgp" created="Mon, 23 Jul 2018 03:41:46 +0000"  >&lt;p&gt;I started a small suite with all debugs on in sandbox and so far no luck reproducing the problem &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.opendaylight.org/images/icons/emoticons/sad.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="64154" author="tpantelis" created="Mon, 23 Jul 2018 13:48:15 +0000"  >&lt;p&gt;Jamo had a test script running in sandbox that was reproducing it - perhaps get that going again. The akka folks have also suggested we try artery to see if it reproduces - so once we get a test/script that reproduces it.&lt;/p&gt;</comment>
                            <comment id="64163" author="jluhrsen" created="Mon, 23 Jul 2018 21:13:41 +0000"  >&lt;p&gt;At this point, I think I&apos;m still able to reproduce this, but the high level symptom is now a 404 on /restconf or /jolokia&lt;/p&gt;

&lt;p&gt;I did not try with the debugs enabled yet. I wanted to double check my automation could hit it.&lt;/p&gt;

&lt;p&gt;I can see this in the karaf.log, which I have been assuming is our smoking gun (1st seed node is on an island)&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;2018-07-23T20:04:13,072 | INFO  | opendaylight-cluster-data-akka.actor.&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-dispatcher-51 | Cluster(akka:&lt;span class=&quot;code-comment&quot;&gt;//opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@172.28.5.1:2550] - Leader is moving node [akka.tcp://opendaylight-cluster-data@172.28.5.1:2550] to [Up]&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I didn&apos;t run this with the debugs, but something else is bothersome. Seems like a lot of blueprint errors are&lt;br/&gt;
coming. Could this be some other kind of bug that just ends up looking the same at the end? I attached&lt;br/&gt;
the karaf.log from the 1st seed node (I trimmed out everything except the logs after the final restart before&lt;br/&gt;
it hit the 404) &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/attachment/14788/14788_karaf.log.since_last_restart.xz&quot; title=&quot;karaf.log.since_last_restart.xz attached to CONTROLLER-1849&quot;&gt;karaf.log.since_last_restart.xz&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.opendaylight.org/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;/p&gt;</comment>
                            <comment id="64164" author="tpantelis" created="Mon, 23 Jul 2018 21:18:58 +0000"  >&lt;p&gt;If shard leaders don&apos;t converge then there will be blueprint timeouts in other apps b/c the DS isn&apos;t ready. Most apps depend on the DS. Even if an app does startup, there will be timeout failures when it tries to access the DS.&lt;/p&gt;</comment>
                            <comment id="64165" author="jluhrsen" created="Mon, 23 Jul 2018 21:21:09 +0000"  >&lt;p&gt;ok, so to be clear, we think this is the same exact bug right?&lt;/p&gt;

&lt;p&gt;what about that 404 then? I thought with dynamic auth disabled... oh wait, that was for 401. ok, so 404 is probably expected now right?&lt;br/&gt;
when we hit the bug, I mean.&lt;/p&gt;</comment>
                            <comment id="64166" author="tpantelis" created="Mon, 23 Jul 2018 21:31:48 +0000"  >&lt;p&gt;404 is &quot;not found&quot;. I assume that&apos;s coming from jetty so AAA and restconf haven&apos;t started b/c they depend on the DS. So it all comes back to the root cause, ie akka not rejoining.&lt;/p&gt;

&lt;p&gt;At this point we need a reproduction with the akka debug enabled so we can send that off to them.  A minimal repro would be best, ie bring up the 3 nodes, verify cluster is OK, then kill/restart the first seed node over and over. So don&apos;t create any cars/people etc.  Also delete the karaf log before each restart until it fails.&lt;/p&gt;</comment>
                            <comment id="64167" author="jluhrsen" created="Mon, 23 Jul 2018 21:36:32 +0000"  >&lt;blockquote&gt;&lt;p&gt;404 is &quot;not found&quot;. I assume that&apos;s coming from jetty so AAA and restconf haven&apos;t started b/c they depend on the DS. So it all comes back to the root cause, ie akka not rejoining.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;got it. that&apos;s what I was hoping.&lt;/p&gt;
&lt;blockquote&gt;&lt;p&gt;At this point we need a reproduction with the akka debug enabled so we can send that off to them. A minimal repro would be best, ie bring up the 3 nodes, verify cluster is OK, then kill/restart the first seed node over and over. So don&apos;t create any cars/people etc. Also delete the karaf log before each restart until it fails.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;that&apos;s all I am doing. it&apos;s just a slightly improved version of &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1849?focusedCommentId=63981&amp;amp;page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-63981&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;this &lt;/a&gt;&lt;/p&gt;

&lt;p&gt;my automation is not going to delete the karaf.log, but I assume I can manually truncate the logs to make it&lt;br/&gt;
 easier. It&apos;s really just looking for some known entry on a restart and finding the last one. I think it&apos;s as simple as finding the last&lt;br/&gt;
 line with the string &quot;SimpleFileLock lock&quot; to know when the last restart was done.&lt;/p&gt;</comment>
                            <comment id="64168" author="tpantelis" created="Mon, 23 Jul 2018 21:51:54 +0000"  >&lt;p&gt;yeah. It would make it a little easier to delete the log if possible but it&apos;s OK. &lt;/p&gt;</comment>
                            <comment id="64180" author="ecelgp" created="Tue, 24 Jul 2018 16:44:05 +0000"  >&lt;p&gt;Lets keep this bug open for the issue we see in OFP of instance failing to join cluster after it is killed and started and shows auth error 401 when jolokia/REST is invoked on it:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/openflowplugin-csit-3node-clustering-only-oxygen/336/robot-plugin/log.html.gz#s1-s2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/openflowplugin-csit-3node-clustering-only-oxygen/336/robot-plugin/log.html.gz#s1-s2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="64182" author="jluhrsen" created="Tue, 24 Jul 2018 18:32:43 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=tpantelis&quot; class=&quot;user-hover&quot; rel=&quot;tpantelis&quot;&gt;tpantelis&lt;/a&gt;, what do you want here? &lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=ecelgp&quot; class=&quot;user-hover&quot; rel=&quot;ecelgp&quot;&gt;ecelgp&lt;/a&gt;&apos;s link above is to a node (&lt;b&gt;NOT&lt;/b&gt; the 1st seed node) that was bounced&lt;br/&gt;
and came back up giving a 401.&lt;/p&gt;

&lt;p&gt;We don&apos;t have that 401 any more when we got rid of dynamic auth for the case when we get an island leader on the first&lt;br/&gt;
seed node. At least I can&apos;t hit it with my local reproduction.&lt;/p&gt;

&lt;p&gt;New bug, or just keep this one to track all of these very similar problems?&lt;/p&gt;</comment>
                            <comment id="64183" author="ecelgp" created="Tue, 24 Jul 2018 18:34:36 +0000"  >&lt;p&gt;This is seen in the starting instance (member-2):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
2018-07-24T11:31:56,731 | WARN  | opendaylight-cluster-data-akka.actor.&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-dispatcher-2 | JoinSeedNodeProcess              | 48 - com.typesafe.akka.slf4j - 2.5.11 | Couldn&apos;t join seed nodes after [16] attempts, will &lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt; again. seed-nodes=[akka.tcp:&lt;span class=&quot;code-comment&quot;&gt;//opendaylight-cluster-data@10.30.171.71:2550, akka.tcp://opendaylight-cluster-data@10.30.171.61:2550]&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="64184" author="tpantelis" created="Tue, 24 Jul 2018 18:39:13 +0000"  >&lt;p&gt;Looks  like that was in Oxygen - not sure of the state there. I&apos;ve been focusing on Fluorine.  That looks like the same issue except it wasn&apos;t the first seed node. Again, root cause is a node not rejoining, first seed node or not. I think we see it much more frequently with the first seed node b/c it can join itself with a deadline of only 12s - the others will just keep trying to join the existing cluster indefinitely which I think has a 5 min deadline in CSIT. &lt;/p&gt;</comment>
                            <comment id="64185" author="jluhrsen" created="Tue, 24 Jul 2018 18:42:37 +0000"  >&lt;p&gt;I&apos;ve &lt;b&gt;only&lt;/b&gt; been using Oxygen in my local reproduction of this. I thought we could not even hit the island node issue if it wasn&apos;t the first seed node?&lt;/p&gt;</comment>
                            <comment id="64186" author="tpantelis" created="Tue, 24 Jul 2018 18:45:44 +0000"  >&lt;p&gt;It won&apos;t hit the island node issue but... see my previous note. To our code it doesn&apos;t matter - if the node doesn&apos;t join, we can&apos;t converge shard leaders.&lt;/p&gt;</comment>
                            <comment id="64187" author="jluhrsen" created="Tue, 24 Jul 2018 18:50:54 +0000"  >&lt;p&gt;ok, so what was the big deal about only doing this with the first seed node?&lt;br/&gt;
and how do we explain the 401 now?&lt;/p&gt;</comment>
                            <comment id="64188" author="tpantelis" created="Tue, 24 Jul 2018 18:56:40 +0000"  >&lt;p&gt;Because up until now we&apos;ve only seen it for the first seed node and, as I explained above, it is much more likely to happen.&lt;/p&gt;

&lt;p&gt;I don&apos;t know why there would still be a 401.&lt;/p&gt;</comment>
                            <comment id="64189" author="jluhrsen" created="Tue, 24 Jul 2018 22:05:14 +0000"  >&lt;p&gt;I think/thought I had this reproduced locally with the akka debugs enabled, but I&apos;m having trouble making sense of my logs&lt;br/&gt;
and timestamps.&lt;/p&gt;

&lt;p&gt;I do see this this output repeated 31 times at the end of my karaf log:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Jul 24, 2018 12:52:34 AM org.apache.karaf.main.lock.SimpleFileLock lock
INFO: Trying to lock /odlha/karaf/target/assembly/lock
Jul 24, 2018 12:52:34 AM org.apache.karaf.main.lock.SimpleFileLock lock
INFO: Lock acquired
Jul 24, 2018 12:52:34 AM org.apache.karaf.main.Main$KarafLockCallback lockAquired
INFO: Lock acquired. Setting startlevel to 100
Jul 24, 2018 12:54:29 AM org.apache.karaf.main.lock.SimpleFileLock lock
INFO: Trying to lock /odlha/karaf/target/assembly/lock
Jul 24, 2018 12:54:29 AM org.apache.karaf.main.lock.SimpleFileLock lock
INFO: Lock acquired
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;which reminds me of &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1845&quot; title=&quot;Karaf takes 7 minutes to start&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1845&quot;&gt;&lt;del&gt;CONTROLLER-1845&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;but, the timestamps of those messages are off, compared to the message right before they started:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
2018-07-24T00:51:10,559 | INFO  | Blueprint Event Dispatcher: 1 | BlueprintBundleTracker           | 199 - org.opendaylight.controller.blueprint - 0.8.3.SNAPSHOT | Blueprint container &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; bun
dle org.opendaylight.netvirt.qosservice-impl_0.6.3.SNAPSHOT [359] was successfully created
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I think the karaf made logs are using a different method of creating time stamps.&lt;/p&gt;

&lt;p&gt;Also, my other two controllers, which I stopped at the same time (all three got a final kill -9 at the same time) have&lt;br/&gt;
the same final timestamp in their logs (2018-07-24T05:24:32), but I don&apos;t see that timestamp in my first node&lt;br/&gt;
(the one I&apos;m bouncing)&lt;/p&gt;

&lt;p&gt;very confused.&lt;/p&gt;

&lt;p&gt;I&apos;ll try again, with some additional steps in my bouncer scripts to keep track of timestamps better.&lt;/p&gt;</comment>
                            <comment id="64194" author="ecelgp" created="Tue, 24 Jul 2018 23:18:08 +0000"  >&lt;p&gt;Coming back to: &lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/openflowplugin-csit-3node-clustering-only-oxygen/336/robot-plugin/log.html.gz#s1-s2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/openflowplugin-csit-3node-clustering-only-oxygen/336/robot-plugin/log.html.gz#s1-s2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;I see the restarting member-2 fails to join:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
2018-07-24T11:28:43,869 | INFO  | opendaylight-cluster-data-akka.actor.&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-dispatcher-4 | Slf4jLogger                      | 48 - com.typesafe.akka.slf4j - 2.5.11 | Slf4jLogger started
2018-07-24T11:28:43,912 | INFO  | opendaylight-cluster-data-akka.actor.&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-dispatcher-4 | Remoting                         | 48 - com.typesafe.akka.slf4j - 2.5.11 | Starting remoting
2018-07-24T11:28:44,143 | INFO  | opendaylight-cluster-data-akka.actor.&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-dispatcher-6 | Remoting                         | 48 - com.typesafe.akka.slf4j - 2.5.11 | Remoting started; listening on addresses :[akka.tcp:&lt;span class=&quot;code-comment&quot;&gt;//opendaylight-cluster-data@10.30.171.67:2550]
&lt;/span&gt;2018-07-24T11:28:44,168 | INFO  | opendaylight-cluster-data-akka.actor.&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-dispatcher-6 | Cluster(akka:&lt;span class=&quot;code-comment&quot;&gt;//opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.171.67:2550] - Starting up...
&lt;/span&gt;2018-07-24T11:28:44,268 | INFO  | opendaylight-cluster-data-akka.actor.&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-dispatcher-4 | Cluster(akka:&lt;span class=&quot;code-comment&quot;&gt;//opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.171.67:2550] - Registered cluster JMX MBean [akka:type=Cluster]
&lt;/span&gt;2018-07-24T11:28:44,269 | INFO  | opendaylight-cluster-data-akka.actor.&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-dispatcher-4 | Cluster(akka:&lt;span class=&quot;code-comment&quot;&gt;//opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.171.67:2550] - Started up successfully
&lt;/span&gt;
2018-07-24T11:29:08,440 | WARN  | opendaylight-cluster-data-akka.actor.&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-dispatcher-23 | JoinSeedNodeProcess              | 48 - com.typesafe.akka.slf4j - 2.5.11 | Couldn&apos;t join seed nodes after [2] attempts, will &lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt; again. seed-nodes=[akka.tcp:&lt;span class=&quot;code-comment&quot;&gt;//opendaylight-cluster-data@10.30.171.71:2550, akka.tcp://opendaylight-cluster-data@10.30.171.61:2550]&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;while the other 2 members receive and respond to the join requests:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
2018-07-24T11:28:44,623 | INFO  | opendaylight-cluster-data-akka.actor.&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-dispatcher-3 | Cluster(akka:&lt;span class=&quot;code-comment&quot;&gt;//opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.171.71:2550] - Received InitJoin message from [Actor[akka.tcp://opendaylight-cluster-data@10.30.171.67:2550/system/cluster/core/daemon/joinSeedNodeProcess-1#1711052280]] to [akka.tcp://opendaylight-cluster-data@10.30.171.71:2550]
&lt;/span&gt;2018-07-24T11:28:44,624 | INFO  | opendaylight-cluster-data-akka.actor.&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-dispatcher-3 | Cluster(akka:&lt;span class=&quot;code-comment&quot;&gt;//opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.171.71:2550] - Sending InitJoinAck message from node [akka.tcp://opendaylight-cluster-data@10.30.171.71:2550] to [Actor[akka.tcp://opendaylight-cluster-data@10.30.171.67:2550/system/cluster/core/daemon/joinSeedNodeProcess-1#1711052280]]
&lt;/span&gt;
2018-07-24T11:28:56,431 | INFO  | opendaylight-cluster-data-akka.actor.&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-dispatcher-3 | Cluster(akka:&lt;span class=&quot;code-comment&quot;&gt;//opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.171.71:2550] - Received InitJoin message from [Actor[akka.tcp://opendaylight-cluster-data@10.30.171.67:2550/system/cluster/core/daemon/joinSeedNodeProcess-1#1711052280]] to [akka.tcp://opendaylight-cluster-data@10.30.171.71:2550]
&lt;/span&gt;2018-07-24T11:28:56,434 | INFO  | opendaylight-cluster-data-akka.actor.&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-dispatcher-3 | Cluster(akka:&lt;span class=&quot;code-comment&quot;&gt;//opendaylight-cluster-data) | 48 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@10.30.171.71:2550] - Sending InitJoinAck message from node [akka.tcp://opendaylight-cluster-data@10.30.171.71:2550] to [Actor[akka.tcp://opendaylight-cluster-data@10.30.171.67:2550/system/cluster/core/daemon/joinSeedNodeProcess-1#1711052280]]&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This really seems an issue in the restarting member not getting or processing InitJoinAck from the other 2.&lt;/p&gt;</comment>
                            <comment id="64196" author="tpantelis" created="Wed, 25 Jul 2018 00:13:01 +0000"  >&lt;p&gt;yup - it&apos;s the same pattern I&apos;ve seen in numerous logs.&lt;/p&gt;</comment>
                            <comment id="64197" author="tpantelis" created="Wed, 25 Jul 2018 00:45:30 +0000"  >&lt;p&gt;I think it will be easier to make sense out of it if it restarts with a clean log.&lt;/p&gt;</comment>
                            <comment id="64198" author="tpantelis" created="Wed, 25 Jul 2018 01:48:57 +0000"  >&lt;p&gt;So the 401&apos;s occur in Oxygen b/c AAA shiro doesn&apos;t initialize due to the DS not ready:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2018-07-24T11:33:26,723 | ERROR | paxweb-extender-1-thread-1 | EnvironmentLoader                | 143 - org.apache.shiro.web - 1.3.2 | Shiro environment initialization failed
java.lang.RuntimeException: Error obtaining AAAShiroProvider
	at org.opendaylight.aaa.shiro.web.env.KarafIniWebEnvironment.init(KarafIniWebEnvironment.java:76) ~[189:org.opendaylight.aaa.shiro:0.7.3]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Without shiro, we can&apos;t authenticate. This code was refactored in Fluorine such that I believe we no longer get 401&apos;s but 404&apos;s.&lt;/p&gt;

&lt;p&gt;Again either way, it&apos;s all due to shards not converging due to akka not rejoining. &lt;/p&gt;</comment>
                            <comment id="64244" author="jluhrsen" created="Wed, 25 Jul 2018 22:10:28 +0000"  >&lt;p&gt;quick note that I let myself get sidetracked by &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1845&quot; title=&quot;Karaf takes 7 minutes to start&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1845&quot;&gt;&lt;del&gt;CONTROLLER-1845&lt;/del&gt;&lt;/a&gt; since it bit me pretty irritatingly when&lt;br/&gt;
trying to repro this one with the extra akka debugs. When 1845 hits, it makes the repro efforts for this&lt;br/&gt;
worthless.&lt;/p&gt;</comment>
                            <comment id="64307" author="jluhrsen" created="Fri, 27 Jul 2018 21:56:09 +0000"  >&lt;p&gt;Here are the logs with these extra debugs from a recent reproduction in my local setup &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/attachment/14800/14800_karaf.1.log&quot; title=&quot;karaf.1.log attached to CONTROLLER-1849&quot;&gt;karaf.1.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.opendaylight.org/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/attachment/14801/14801_karaf.2.log&quot; title=&quot;karaf.2.log attached to CONTROLLER-1849&quot;&gt;karaf.2.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.opendaylight.org/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/attachment/14802/14802_karaf.3.log&quot; title=&quot;karaf.3.log attached to CONTROLLER-1849&quot;&gt;karaf.3.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.opendaylight.org/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;karaf.1.log is the 1st seed node, and the node I am doing the kill -9 and start repeatedly until we hit this bug.&lt;/p&gt;

&lt;p&gt;I&apos;ve trimmed karaf.1.log down to only include the logs since it&apos;s final restart. You can find this log (which we think&lt;br/&gt;
 is our smoking gun) in it:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;2018-07-27T10:25:04,024 | INFO | opendaylight-cluster-data-akka.actor.&lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;-dispatcher-55 | Cluster(akka:&lt;span class=&quot;code-comment&quot;&gt;//opendaylight-cluster-data) | 90 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node [akka.tcp://opendaylight-cluster-data@172.28.5.1:2550] - Leader is moving node [akka.tcp://opendaylight-cluster-data@172.28.5.1:2550] to [Up]&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;you can see 172.28.5.1 is moving itself to Leader &lt;span class=&quot;error&quot;&gt;&amp;#91;Up&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;The other two karaf.log files are from the other controllers, and I have trimmed them out to only include logs from the&lt;br/&gt;
timestamp of the 1st seed node&apos;s last startup.&lt;/p&gt;</comment>
                            <comment id="64308" author="tpantelis" created="Fri, 27 Jul 2018 21:58:15 +0000"  >&lt;p&gt;nice - thanks&lt;/p&gt;</comment>
                            <comment id="64429" author="tpantelis" created="Tue, 31 Jul 2018 12:26:04 +0000"  >&lt;p&gt;I attached the logs to the akka issue. The bottom line is that node1 was able to send messages (InitJoin) to nodes 2 and 3 but no messages from nodes 2 and 3 were received by node1 (InitJoinAck, Heartbeat). We&apos;ll see what they think but I suspect the next step might be to run wireshark or something to see what&apos;s actually getting on the wire. I think the issue is on node1&apos;s side - otherwise it would mean both nodes 2 and 3 would experience the same issue simultaneously which is much less likely. Plus nodes 2 and 3 were communicating back and forth fine.&lt;/p&gt;</comment>
                            <comment id="64432" author="shague@redhat.com" created="Tue, 31 Jul 2018 14:15:59 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=jluhrsen&quot; class=&quot;user-hover&quot; rel=&quot;jluhrsen&quot;&gt;jluhrsen&lt;/a&gt; look at &lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/74585/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/74585/&lt;/a&gt; for how we added the tcpdump for port 6653 ofp traffic on devstack nodes. That function can be used to start one for akka traffic. We might need a variant &lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/74586/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/74586/&lt;/a&gt; because the tests stop the tcpdump based on the grep of tcpdump in ps, so at the end of each suite it will kill all running tcpdump&apos;s. That check should be more selective if we need to keep the akka tcpdump running for the whole job duration. If the job is a single suite then it is not a problem.&lt;/p&gt;</comment>
                            <comment id="64460" author="tpantelis" created="Wed, 1 Aug 2018 14:38:35 +0000"  >&lt;p&gt;Here&apos;s the docs for Akka artery : &lt;a href=&quot;https://doc.akka.io/docs/akka/2.5.11/remoting-artery.html&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://doc.akka.io/docs/akka/2.5.11/remoting-artery.html&lt;/a&gt;. They have added TCP support since I last looked at it but another article I read suggests it isn&apos;t quite ready for production. I would think the default is UDP but it can be explicitly set via &lt;em&gt;transport = aeron-udp&lt;/em&gt; as outlined in the &lt;b&gt;Preparing your ActorSystem for Remoting&lt;/b&gt; section. Also seed node addresses must start with &quot;akka://opendaylight-cluster-data...&quot;, ie drop the &quot;.tcp&quot;.&lt;/p&gt;</comment>
                            <comment id="64635" author="jluhrsen" created="Tue, 14 Aug 2018 22:35:56 +0000"  >&lt;p&gt;I was able to reproduce this with a few more debugs happening. I&apos;m doing a netstat to see which ports are open and I did a tcpdump for&lt;br/&gt;
all traffic on port 2550 for this node that comes up and joins itself as leader (island).&lt;/p&gt;

&lt;p&gt;I didn&apos;t see any difference in the output of the netstat command from a passing run and a failing run. It looks like this:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
tcp        0      0 127.0.0.11:39681        0.0.0.0:*               LISTEN      -
tcp6       0      0 :::44444                :::*                    LISTEN      12969/java
tcp6       0      0 :::8101                 :::*                    LISTEN      12969/java
tcp6       0      0 127.0.0.1:1099          :::*                    LISTEN      12969/java
tcp6       0      0 127.0.0.1:35949         :::*                    LISTEN      12969/java
tcp6       0      0 :::8181                 :::*                    LISTEN      12969/java
tcp6       0      0 172.28.5.1:2550         :::*                    LISTEN      12969/java
tcp6       0      0 :::34519                :::*                    LISTEN      12969/java
udp        0      0 127.0.0.11:43904        0.0.0.0:*                           -
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;what&apos;s weird though, is that this controller 1 (first seed node) doesn&apos;t have port 8185 lit up, in the passing or failing&lt;br/&gt;
case. I haven&apos;t figured that out yet, and probably just noise in all of this, but wanted to point it out. So, for example,&lt;br/&gt;
the netstat in controller 2 looks like this:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
tcp        0      0 127.0.0.11:38895        0.0.0.0:*               LISTEN      -
tcp6       0      0 127.0.0.1:36513         :::*                    LISTEN      4201/java
tcp6       0      0 :::8101                 :::*                    LISTEN      4201/java
tcp6       0      0 127.0.0.1:1099          :::*                    LISTEN      4201/java
tcp6       0      0 :::46097                :::*                    LISTEN      4201/java
tcp6       0      0 :::8181                 :::*                    LISTEN      4201/java
tcp6       0      0 172.28.5.2:2550         :::*                    LISTEN      4201/java
tcp6       0      0 :::8185                 :::*                    LISTEN      4201/java
tcp6       0      0 :::44444                :::*                    LISTEN      4201/java [^104.pcap] 
udp        0      0 127.0.0.11:47322        0.0.0.0:*                           -
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Attached are two packet captures that open fine in wireshark. filename 104.pcap is from&lt;br/&gt;
a successful run where the controller was killed and restarted and successfully joined the
cluster. filename 105.pcap is from the failure case. also attached is 105.1.log which is the&lt;br/&gt;
karaf log from the controller 1 in the failure case. I don&apos;t have karaf logs from the other&lt;br/&gt;
controllers in the failure case, nor any from the passing case(s).&lt;/p&gt;

&lt;p&gt; &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/attachment/14914/14914_104.pcap&quot; title=&quot;104.pcap attached to CONTROLLER-1849&quot;&gt;104.pcap&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.opendaylight.org/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;  &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/attachment/14915/14915_105.pcap&quot; title=&quot;105.pcap attached to CONTROLLER-1849&quot;&gt;105.pcap&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.opendaylight.org/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;  &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/attachment/14916/14916_105.1.log&quot; title=&quot;105.1.log attached to CONTROLLER-1849&quot;&gt;105.1.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.opendaylight.org/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;/p&gt;</comment>
                            <comment id="64640" author="tpantelis" created="Wed, 15 Aug 2018 15:42:25 +0000"  >&lt;p&gt;The packet capture basically shows what we&apos;ve seen in the logs, ie node1 is sending out InitJoin messages to 2 &amp;amp; 3 and InitJoinAck messages are coming back. Also Heartbeat messages are incoming from 2 &amp;amp; 3 along with our AppendEntries heartbeats. So node1 is sending out messages but incoming messages are not getting processed for some reason. I&apos;ll update the akka ticket.&lt;/p&gt;

&lt;p&gt;I think the next steps are:&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;try it with graceful shutdown and see if it repros&lt;/li&gt;
&lt;/ul&gt;


&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;try &lt;em&gt;akka.remote.use-passive-connections = off&lt;/em&gt; as the akka engineer suggested&lt;/li&gt;
&lt;/ul&gt;


&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;try artery&lt;/li&gt;
&lt;/ul&gt;


</comment>
                            <comment id="64641" author="jluhrsen" created="Wed, 15 Aug 2018 18:15:03 +0000"  >&lt;p&gt;How can we debug this further to root cause it? You noted in the &lt;a href=&quot;https://github.com/akka/akka/issues/25361&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;akka bug &lt;/a&gt; we know&lt;br/&gt;
the packets are coming from the other nodes, but the akka debugs we tried before do not show those being received. Is there no&lt;br/&gt;
lower level debugs we can enable in karaf to try to trace the packets being processed?&lt;/p&gt;

&lt;p&gt;The akka folks on that github bug have suggested we try adding extra debugs in akka itself and trying with&lt;br/&gt;
a snapshot of that. Can you help me do that?&lt;/p&gt;

&lt;p&gt;Everything you&apos;ve listed above is a workaround idea. I can try all of those and maybe they will give us some&lt;br/&gt;
clues, but we have some debug momentum at this point and I think we should keep fighting.&lt;/p&gt;

&lt;p&gt;BTW, I&apos;ve now run more than 800 iterations of this reproduction script locally with a graceful shutdown and&lt;br/&gt;
have not seen the issue. I think that&apos;s safe to use as confirmation that it is not coming with a graceful&lt;br/&gt;
restart. Not that it &lt;b&gt;only&lt;/b&gt; comes with a kill -9 restart, but at least we have a little extra data.&lt;/p&gt;</comment>
                            <comment id="64642" author="tpantelis" created="Wed, 15 Aug 2018 18:39:33 +0000"  >&lt;p&gt;&amp;lt; lower level debugs we can enable in karaf to try to trace the packets being processed?&lt;/p&gt;

&lt;p&gt;It&apos;s not karaf - it&apos;s akka code.&lt;/p&gt;

&lt;p&gt;&amp;gt;The akka folks on that github bug have suggested we try adding extra debugs in akka itself and trying with&lt;br/&gt;
&amp;gt; a snapshot of that. Can you help me do that?&lt;/p&gt;

&lt;p&gt;Their code has to be modified to add more debug - I think they mean they would add it and supply a snapshot. &lt;/p&gt;

&lt;p&gt;In the end we may have to workaround this or at least doc it as a known anomaly. Even if they find the root cause and are able to fix it, that would likely come in a new akka version that we would have to eventually upgrade to (unless they can patch the version we use). artery is not a workaround - we want to switch to it anyway - hopefully in Neon - we&apos;ll have to longer term anyway since the netty-based version will be removed at some point.   &lt;/p&gt;


</comment>
                            <comment id="64643" author="jluhrsen" created="Wed, 15 Aug 2018 19:42:12 +0000"  >&lt;blockquote&gt;
&lt;p&gt;&amp;lt; lower level debugs we can enable in karaf to try to trace the packets being processed?&lt;/p&gt;

&lt;p&gt;It&apos;s not karaf - it&apos;s akka code.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;I know it&apos;s akka code that&apos;s not receiving the initJoinAck, but that doesn&apos;t mean it&apos;s getting&lt;br/&gt;
dropped in akka code does it? Maybe it&apos;s dropped earlier in whatever connection handling&lt;br/&gt;
we have? There has to be something grabbing these packets to hand up to akka? Forget&lt;br/&gt;
I said &quot;karaf&quot;. Like netty or whatever that magic is we are using in ODL? Obviously I don&apos;t&lt;br/&gt;
fully know these details. Maybe akka is doing all this work itself.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;&amp;gt;The akka folks on that github bug have suggested we try adding extra debugs in akka itself and trying with&lt;br/&gt;
&amp;gt; a snapshot of that. Can you help me do that?&lt;/p&gt;

&lt;p&gt;Their code has to be modified to add more debug - I think they mean they would add it and supply a snapshot.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;That&apos;s not how I took it. Could you clarify that with them? I don&apos;t mind to keep digging here and&lt;br/&gt;
make further progress on the root cause and possible fix.&lt;/p&gt;

&lt;blockquote&gt;

&lt;p&gt;In the end we may have to workaround this or at least doc it as a known anomaly. Even if they find the root cause and are able to fix it, that would likely come in a new akka version that we would have to eventually upgrade to (unless they can patch the version we use). artery is not a workaround - we want to switch to it anyway - hopefully in Neon - we&apos;ll have to longer term anyway since the netty-based version will be removed at some point.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;p&gt;artery &lt;b&gt;is&lt;/b&gt; a workaround. If the bug is not in artery and we move to artery we are not fixing any bug. We&lt;br/&gt;
are ignoring it and moving to something that doesn&apos;t have it.&lt;/p&gt;

&lt;p&gt;and, if the move to artery is similar to our move to tell-based, then who knows when that will happen.&lt;/p&gt;

&lt;p&gt;we&apos;ve upgraded akka before, so I&apos;m not understanding why that is something to worry about.&lt;/p&gt;

&lt;p&gt;I &lt;b&gt;will&lt;/b&gt; try with artery at some point.&lt;/p&gt;</comment>
                            <comment id="64644" author="tpantelis" created="Wed, 15 Aug 2018 20:27:01 +0000"  >&lt;p&gt;&amp;gt;  Maybe akka is doing all this work itself.&lt;/p&gt;

&lt;p&gt;Yes it is - all of the code in question is  akka&lt;/p&gt;


&lt;p&gt;&amp;gt; That&apos;s not how I took it. Could you clarify that with them? I don&apos;t mind to keep digging here and&lt;br/&gt;
&amp;gt; make further progress on the root cause and possible fix.&lt;/p&gt;

&lt;p&gt;&quot;*Next step would probably be to add some additional logging to Akka *&quot;  - there is no more debug logging to enable - code has to be added to add more detailed logging.&lt;/p&gt;

&lt;p&gt;&amp;gt; atery is a workaround. If the bug is not in artery and we move to artery we are not fixing any bug. We&lt;br/&gt;
&amp;gt; are ignoring it and moving to something that doesn&apos;t have it.&lt;/p&gt;

&lt;p&gt;yes b/c artery is their next-generation remoting layer. The netty-based one is essentially deprecated and will be removed. So we have to go to it anyway. In fact we&apos;re probably lucky they&apos;re even helping us at all (for free anyway). &lt;/p&gt;</comment>
                            <comment id="64645" author="jluhrsen" created="Wed, 15 Aug 2018 20:59:42 +0000"  >
&lt;blockquote&gt;

&lt;p&gt;&amp;gt; Maybe akka is doing all this work itself.&lt;/p&gt;

&lt;p&gt;Yes it is - all of the code in question is akka&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;ok, I just must be confused with the netty we are using in ODL for other things and&lt;br/&gt;
what you mean by netty-based akka.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;&amp;gt; That&apos;s not how I took it. Could you clarify that with them? I don&apos;t mind to keep digging here and&lt;br/&gt;
&amp;gt; make further progress on the root cause and possible fix.&lt;/p&gt;

&lt;p&gt;&quot;*Next step would probably be to add some additional logging to Akka *&quot; - there is no more debug logging to enable - code has to be added to add more detailed logging.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;well, to be fair, I took what you quoted above as them suggesting &lt;b&gt;we&lt;/b&gt; make the&lt;br/&gt;
changes for additional logging, not that they would.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;&amp;gt; atery is a workaround. If the bug is not in artery and we move to artery we are not fixing any bug. We&lt;br/&gt;
&amp;gt; are ignoring it and moving to something that doesn&apos;t have it.&lt;/p&gt;

&lt;p&gt;yes b/c artery is their next-generation remoting layer. The netty-based one is essentially deprecated and will be removed. So we have to go to it anyway. In fact we&apos;re probably lucky they&apos;re even helping us at all (for free anyway).&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;ok, fair enough. Let me try artery next.&lt;/p&gt;</comment>
                            <comment id="64648" author="tpantelis" created="Wed, 15 Aug 2018 21:30:34 +0000"  >&lt;p&gt;&amp;gt; well, to be fair, I took what you quoted above as them suggesting we make the&lt;br/&gt;
&amp;gt; changes for additional logging, not that they would.&lt;/p&gt;

&lt;p&gt;yeah - I think you&apos;re right. I&apos;ll look into that when I have some cycles....&lt;/p&gt;</comment>
                            <comment id="64731" author="jluhrsen" created="Thu, 23 Aug 2018 05:58:19 +0000"  >&lt;p&gt;having a really hard time making artery work, but still am trying.&lt;/p&gt;</comment>
                            <comment id="64934" author="jluhrsen" created="Wed, 12 Sep 2018 05:13:21 +0000"  >&lt;p&gt;I neglected to update this, but a week or two ago I was finally able to get artery to work and force&lt;br/&gt;
the shared memory disk usage to be cleaned on every kill/restart iteration. I was able to run the&lt;br/&gt;
reproduction script for more than 400 iterations without hitting this bug. I think that&apos;s good enough&lt;br/&gt;
for now.&lt;/p&gt;

&lt;p&gt;So, maybe this means our root cause is going to be somewhere in the netty transport for akka. I&lt;br/&gt;
know the plan is to eventually move everything to tell based w/ artery. I don&apos;t think there is anyone&lt;br/&gt;
willing to keep digging on a root cause and fix for this. This is also not coming if we do graceful&lt;br/&gt;
restarts.&lt;/p&gt;

&lt;p&gt;We can change all of our tests to only do graceful restarts which should hopefully prevent this&lt;br/&gt;
bug from marking failures in our CSIT. When we get to a legit tell/artery implementation I think&lt;br/&gt;
we should try to get back to our kill/restart cases as those are still legit cases imho.&lt;/p&gt;</comment>
                            <comment id="64944" author="tpantelis" created="Thu, 13 Sep 2018 16:41:02 +0000"  >&lt;p&gt;&amp;gt;&#160;I don&apos;t think there is anyone willing to keep digging on a root cause and fix for this.&#160;&lt;/p&gt;

&lt;p&gt;It would be either me or Robert and I don&apos;t have cycles (nor desire &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.opendaylight.org/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;). I think I can speak for Robert as well. Also this is a rather obscure and extreme test scenario and not very likely it would be seen in production anyway.&lt;/p&gt;

&lt;p&gt;&amp;gt;&#160;We can change all of our tests to only do graceful restarts...&lt;/p&gt;

&lt;p&gt;Sounds good.&lt;/p&gt;</comment>
                            <comment id="64949" author="shague@redhat.com" created="Mon, 17 Sep 2018 01:47:58 +0000"  >&lt;p&gt;I converted the NetVirt CSIT to use graceful start and stop. It looks promising and we don&apos;t see the OLFE&apos;s and other weird exceptions, but it does bring out other issues. openflowplugin and ovsdb have various bundles shutting down, but not all at once, and the listening ports are still open. The switches continue sending to the ODL shutting down. We need to clean those issues up.&lt;/p&gt;

&lt;p&gt;That being said, it seems like we are adapting the tests to accommodate bugs. I could see if we kept the test as is, then maybe they shouldn&apos;t fail or we should have checks to know that there will be issues during this time and not fail.&lt;/p&gt;</comment>
                            <comment id="64950" author="tpantelis" created="Mon, 17 Sep 2018 02:35:20 +0000"  >&lt;p&gt;Graceful shutdown should be the norm in production so CSIT should be testing it and, if bundles have issues shutting down gracefully, they should be addressed. &quot;kill -9&quot; should be a last resort as it could leave persistence&#160;in a corrupted state. So it sounds like introducing graceful shutdown is uncovering issues which is a good thing.&lt;/p&gt;</comment>
                            <comment id="64964" author="ecelgp" created="Mon, 17 Sep 2018 16:24:11 +0000"  >&lt;p&gt;FYI I left 1 test in OFP doing the kill so at least I can track how often the kill fails:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/74412/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/74412/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="65450" author="jluhrsen" created="Tue, 30 Oct 2018 17:44:42 +0000"  >&lt;p&gt;This basic problem of a controller not coming up healthy after restarting still happens. Yes, the last example&lt;br/&gt;
I saw was in the &apos;kill -9&apos; scenario. So I suppose we want to keep this bug open, but I&apos;ll lower the priority of it. I really don&apos;t&lt;br/&gt;
know who/when/how it will ever be addressed though. There is just nobody to work on it, and &apos;kill -9&apos; is hopefully&lt;br/&gt;
not a common way for people to stop ODL.&lt;/p&gt;</comment>
                            <comment id="65452" author="tpantelis" created="Tue, 30 Oct 2018 19:47:55 +0000"  >&lt;p&gt;It&apos;s an issue in akka and it would not be a trivial effort to learn that large code base (not to mention scala) plus it&apos;s an old code base that has essentially been replaced by artery, which is why the akka devs aren&apos;t anxious to jump on it (at least not for free). Plus it&apos;s an extreme test scenario (killing/restarting the process quickly over and over) that realistically wouldn&apos;t be seen in production (which I&apos;m sure is why it&apos;s never been reported in the 10 years akka&apos;s been around) or if it ever did, you got bigger problems than the node not rejoining the cluster after like the 50th time or whatever &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.opendaylight.org/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="65453" author="jluhrsen" created="Tue, 30 Oct 2018 20:06:05 +0000"  >&lt;p&gt;fair enough, but just to be clear, this is not happening because of killing/restarting quickly over and over for 50 times. yes, that was the script&lt;br/&gt;
I tried to share that made it easier to reproduce, but the failures are still seen in our automation where it&apos;s killing, waiting to make sure&lt;br/&gt;
the process is dead, then starting and again waiting/polling for the controller to come up (so not a 401) and be in syncstatus true.&lt;/p&gt;

&lt;p&gt;besides the &quot;kill -9&quot; part, it&apos;s pretty gentle.&lt;/p&gt;

&lt;p&gt;again, I understand nobody wants or plans to work on it. I have lowered the priority.&lt;/p&gt;

&lt;p&gt;maybe tell-based will be better, but that work is yet to be done and the jobs we have for tell based have failures to figure&lt;br/&gt;
out as well&lt;/p&gt;</comment>
                            <comment id="65454" author="tpantelis" created="Tue, 30 Oct 2018 21:14:52 +0000"  >&lt;p&gt;It&apos;s not related to tell-based - that&apos;s our stuff.  It&apos;s in akka or a lib it uses (eg netty) or possibly at the TCP-level where something is stale due to the process being killed. AFAIK, from you and Luis testing, it does not occur if the process is gracefully shutdown nor does it occur after the first kill/restart (ie it takes multiple iterations if it occurs at all). Also it has not been seen to occur with the new remoting code base, artery, (with UDP at least) so IMO, it makes more sense to focus on switching to artery rather than spending the time to try to learn, debug, possibly fix the old akka code base where it appears Lightbend has little to no interest in continuing to support (at least for free). &lt;/p&gt;</comment>
                            <comment id="65455" author="jluhrsen" created="Tue, 30 Oct 2018 21:24:32 +0000"  >&lt;p&gt;yeah, I said &quot;tell-based&quot;, but confused that with &quot;artery&quot;. my bad.&lt;/p&gt;</comment>

&lt;p&gt;but yeah, it&apos;s &lt;b&gt;very&lt;/b&gt; clear this bug is not getting fixed and that there is no interest from anyone to&lt;br/&gt;
look any deeper. Gotta keep it open though, so we can point back to it when our automation&lt;br/&gt;
continues to have these failures. We have moved a lot of the tests to use graceful stop/start&lt;br/&gt;
so hopefully it&apos;s not as frequent any more.&lt;/p&gt;
</comment>
                            <comment id="65624" author="rovarga" created="Wed, 14 Nov 2018 12:54:01 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=tpantelis&quot; class=&quot;user-hover&quot; rel=&quot;tpantelis&quot;&gt;tpantelis&lt;/a&gt; I noticed &lt;a href=&quot;https://github.com/akka/akka/issues/25632&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/akka/akka/issues/25632&lt;/a&gt; in 2.5.17 (which we still cannot have, as things are in-flux and we&apos;ll hopefully get 2.5.19). Is that something that can help?&lt;/p&gt;</comment>
                            <comment id="65641" author="tpantelis" created="Wed, 14 Nov 2018 13:40:54 +0000"  >&lt;p&gt;That is related to quarantining but that doesn&apos;t happen in this scenario (that (usually) happens with node isolation) plus, even if the killed node was somehow quarantined, restarting clears the quarantined state.  No split brain occurs here - it&apos;s a simple node kill/restart.&lt;/p&gt;</comment>
                            <comment id="65979" author="jluhrsen" created="Thu, 13 Dec 2018 03:57:02 +0000"  >&lt;p&gt;moved this to Low priority and closing as we aren&apos;t seeing failures any more because we moved to&lt;br/&gt;
a graceful stop/start, but I&apos;m sure we would see it if we went back to the kill -9. Also, it&lt;br/&gt;
doesn&apos;t occur when we use artery (even with kill -9) and hopefully that is going to be&lt;br/&gt;
what is default in the near future.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10003">
                    <name>Relates</name>
                                            <outwardlinks description="relates to">
                                        <issuelink>
            <issuekey id="30150">NETVIRT-1315</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="28817">CONTROLLER-1790</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="14914" name="104.pcap" size="429265" author="jluhrsen" created="Tue, 14 Aug 2018 22:31:53 +0000"/>
                            <attachment id="14916" name="105.1.log" size="161715" author="jluhrsen" created="Tue, 14 Aug 2018 22:35:48 +0000"/>
                            <attachment id="14915" name="105.pcap" size="1743302" author="jluhrsen" created="Tue, 14 Aug 2018 22:32:04 +0000"/>
                            <attachment id="14800" name="karaf.1.log" size="593549" author="jluhrsen" created="Fri, 27 Jul 2018 21:50:00 +0000"/>
                            <attachment id="14801" name="karaf.2.log" size="5499466" author="jluhrsen" created="Fri, 27 Jul 2018 21:50:04 +0000"/>
                            <attachment id="14802" name="karaf.3.log" size="5136992" author="jluhrsen" created="Fri, 27 Jul 2018 21:50:03 +0000"/>
                            <attachment id="14788" name="karaf.log.since_last_restart.xz" size="85192" author="jluhrsen" created="Mon, 23 Jul 2018 21:13:37 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                            <customfield id="customfield_11400" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    <customfield id="customfield_10202" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Priority</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10304"><![CDATA[Low]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10000" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>0|i03gev:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>