<!-- 
RSS generated by JIRA (8.20.10#820010-sha1:ace47f9899e9ee25d7157d59aa17ab06aee30d3d) at Wed Feb 07 19:56:44 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>OpenDaylight JIRA</title>
    <link>https://jira.opendaylight.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>8.20.10</version>
        <build-number>820010</build-number>
        <build-date>22-06-2022</build-date>
    </build-info>


<item>
            <title>[CONTROLLER-1901] cluster node quarantined, but the node did not auto restart when restore the network connection </title>
                <link>https://jira.opendaylight.org/browse/CONTROLLER-1901</link>
                <project id="10113" key="CONTROLLER">controller</project>
                    <description>&lt;p&gt;In a three-node cluster environment, one cluster member is manually isolated from the network, and then the network is restored. After the network is restored, the isolated cluster member is not restarted.&lt;/p&gt;
&lt;h4&gt;&lt;a name=&quot;Environment%3A&quot;&gt;&lt;/a&gt;Environment:&lt;/h4&gt;

&lt;p&gt;3 cluster nodes&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;member-01&#65306;172.20.14.162
member-02&#65306;172.20.14.163
member-03&#65306;172.20.14.164&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;odl-version: Oxygen-sr4(0.8.4)&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;
&lt;h4&gt;&lt;a name=&quot;Stepstoreproduce%3A&quot;&gt;&lt;/a&gt;Steps to reproduce:&lt;/h4&gt;
&lt;ol&gt;
	&lt;li&gt;config cluster&lt;/li&gt;
	&lt;li&gt;cluster nodes start&lt;/li&gt;
	&lt;li&gt;install feature:odl-mdsal-all&lt;/li&gt;
	&lt;li&gt;add reject routes on node01:&lt;br/&gt;
 route add -host 172.20.14.163 reject&lt;br/&gt;
 route add -host 172.20.14.164 reject&lt;/li&gt;
	&lt;li&gt;a few minutes later, delete the reject routes:&lt;br/&gt;
 route del -host 172.20.14.163 reject&lt;br/&gt;
 route del -host 172.20.14.164 reject&lt;/li&gt;
	&lt;li&gt;the log keeps printing &quot;is still unreachable or has not been restarted. Keeping it quarantined.&quot;&lt;/li&gt;
	&lt;li&gt;the node does not restart&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;The cluster config uses the default settings; for example, on node01:&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;odl-cluster-data {
  akka {
    remote {
      artery {
        enabled = off
        canonical.hostname = &quot;172.20.14.162&quot;
        canonical.port = 2550
      }

      netty.tcp {
        hostname = &quot;172.20.14.162&quot;
        port = 2550
      }

      # when under load we might trip a false positive on the failure detector
      transport-failure-detector {
        # heartbeat-interval = 4 s
        # acceptable-heartbeat-pause = 16s #
      }
    }

    cluster {
      # Remove &quot;.tcp&quot; when using artery.
      seed-nodes = [&quot;akka.tcp://opendaylight-cluster-data@172.20.14.162:2550&quot;, &quot;akka.tcp://opendaylight-cluster-data@172.20.14.163:2550&quot;, &quot;akka.tcp://opendaylight-cluster-data@172.20.14.164:2550&quot;]
      roles = [&quot;member-1&quot;]
    }

    persistence {
      # By default the snapshots/journal directories live in KARAF_HOME. You can choose to put it somewhere else by
      # modifying the following two properties. The directory location specified may be a relative or absolute path.
      # The relative path is always relative to KARAF_HOME.
      snapshot-store.local.dir = &quot;target/snapshots&quot;
      journal.leveldb.dir = &quot;target/journal&quot;
      journal {
        leveldb {
          # Set native = off to use a Java-only implementation of leveldb.
          # Note that the Java-only version is not currently considered by Akka to be production quality.
          # native = off
        }
      }
    }
  }
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</description>
                <environment>&lt;p&gt;3 cluster nodes&lt;br/&gt;
member-01&#65306;172.20.14.162&lt;br/&gt;
member-02&#65306;172.20.14.163&lt;br/&gt;
member-03&#65306;172.20.14.164&lt;/p&gt;

&lt;p&gt;odl-version:Oxygen-sr4(0.8.4)&lt;/p&gt;

&lt;p&gt;1,config cluster&lt;br/&gt;
2,cluster nodes start&lt;br/&gt;
3,install feature:odl-mdsal-all&lt;br/&gt;
4,add reject route on node01 as:&lt;br/&gt;
   route add -host 172.20.14.163 reject&lt;br/&gt;
   route add -host 172.20.14.164 reject&lt;/p&gt;

&lt;p&gt;5,few minutes later delete reject route as :&lt;br/&gt;
   route del -host 172.20.14.163 reject&lt;br/&gt;
   route del -host 172.20.14.164 reject&lt;/p&gt;

&lt;p&gt;6, log always print &quot;is still unreachable or has not been restarted. Keeping it quarantined.&quot;&lt;br/&gt;
7, node did not restart&lt;/p&gt;




&lt;p&gt;The cluster config uses the default settings; for example, on node01:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;odl-cluster-data {
  akka {
    remote {
      artery {
        enabled = off
        canonical.hostname = &quot;172.20.14.162&quot;
        canonical.port = 2550
      }

      netty.tcp {
        hostname = &quot;172.20.14.162&quot;
        port = 2550
      }

      # when under load we might trip a false positive on the failure detector
      # transport-failure-detector {
        # heartbeat-interval = 4 s
        # acceptable-heartbeat-pause = 16s
      # }
    }

    cluster {
      # Remove &quot;.tcp&quot; when using artery.
      seed-nodes = [&quot;akka.tcp://opendaylight-cluster-data@172.20.14.162:2550&quot;,
                                &quot;akka.tcp://opendaylight-cluster-data@172.20.14.163:2550&quot;,
                                &quot;akka.tcp://opendaylight-cluster-data@172.20.14.164:2550&quot;]

      roles = [&quot;member-1&quot;]

    }

    persistence {
      # By default the snapshots/journal directories live in KARAF_HOME. You can choose to put it somewhere else by
      # modifying the following two properties. The directory location specified may be a relative or absolute path.
      # The relative path is always relative to KARAF_HOME.

      # snapshot-store.local.dir = &quot;target/snapshots&quot;
      # journal.leveldb.dir = &quot;target/journal&quot;

      journal {
        leveldb {
          # Set native = off to use a Java-only implementation of leveldb.
          # Note that the Java-only version is not currently considered by Akka to be production quality.

          # native = off
        }
      }
    }
  }
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</environment>
        <key id="31744">CONTROLLER-1901</key>
            <summary>cluster node quarantined, but the node did not auto restart when restore the network connection </summary>
                <type id="10104" iconUrl="https://jira.opendaylight.org/secure/viewavatar?size=xsmall&amp;avatarId=10303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.opendaylight.org/images/icons/priorities/major.svg">Medium</priority>
                        <status id="5" iconUrl="https://jira.opendaylight.org/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="green"/>
                                    <resolution id="10000">Done</resolution>
                                        <assignee username="Bosong">Bo Song</assignee>
                                    <reporter username="Bosong">Bo Song</reporter>
                        <labels>
                            <label>csit:3node</label>
                    </labels>
                <created>Thu, 13 Jun 2019 08:24:41 +0000</created>
                <updated>Wed, 2 Jun 2021 09:31:35 +0000</updated>
                            <resolved>Wed, 2 Jun 2021 09:31:35 +0000</resolved>
                                    <version>Oxygen SR4</version>
                                    <fixVersion>Sodium SR4</fixVersion>
                                    <component>clustering</component>
                        <due>Tue, 11 Jun 2019 00:00:00 +0000</due>
                            <votes>0</votes>
                                    <watches>1</watches>
                                                                                                                <comments>
                            <comment id="66899" author="bosong" created="Thu, 13 Jun 2019 09:02:20 +0000"  >&lt;p&gt;When testing on n-sr2, the quarantined node restarts automatically as expected.&lt;br/&gt;
Reading the log, I noticed that on n-sr2 the log prints &quot;Got quarantined by akka.tcp://opendaylight-cluster-data@xx.xx.xx.xx:2550&quot; before the auto-restart.&lt;br/&gt;
So I checked the o-sr4 code and added logging, and found that the node did not receive any message, so the restart method is never called.&lt;/p&gt;

&lt;p&gt;method  &quot;onReceive&quot;:&lt;br/&gt;
&lt;a href=&quot;https://github.com/opendaylight/controller/blob/release/oxygen-sr4/opendaylight/md-sal/sal-clustering-commons/src/main/java/org/opendaylight/controller/cluster/common/actor/QuarantinedMonitorActor.java&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;QuarantinedMonitorActor&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Now I&apos;m trying to solve this problem, hope I can get some help about this problem, thanks.&lt;/p&gt;</comment>
                            <comment id="66925" author="bosong" created="Tue, 25 Jun 2019 08:04:37 +0000"  >&lt;p&gt;I have tested Fluorine-SR2, and it still has this problem.&lt;br/&gt;
Here are the problems I found:&lt;br/&gt;
odl-controller subscribes to the &#8220;ThisActorSystemQuarantinedEvent&#8221; and calls the karaf.restart method. &lt;br/&gt;
Oxygen-sr4 depends on a new akka version, and akka does not publish &#8220;ThisActorSystemQuarantinedEvent&#8221; during this process. So I changed the subscribed event from &#8220;ThisActorSystemQuarantinedEvent&#8221; to RemotingLifecycleEvent (its super class) and found that akka publishes &quot;AssociationErrorEvent&quot; when the network is restored, with this detailed log:&lt;/p&gt;

&lt;p&gt;_onReceive AssociationErrorEvent AssociationError &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@172.20.14.162:2550&amp;#93;&lt;/span&gt; -&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@172.20.14.163:2550&amp;#93;&lt;/span&gt;: Error &lt;span class=&quot;error&quot;&gt;&amp;#91;Invalid address: akka.tcp://opendaylight-cluster-data@172.20.14.163:2550&amp;#93;&lt;/span&gt; [&lt;br/&gt;
akka.remote.InvalidAssociation: Invalid address: akka.tcp://opendaylight-cluster-data@172.20.14.163:2550&lt;br/&gt;
Caused by: akka.remote.transport.Transport$InvalidAssociationException: The remote system has a UID that has been quarantined. Association aborted._&lt;/p&gt;

&lt;p&gt;I read the akka code that publishes the event; the publishing logic has changed, but I have not discovered much.&lt;br/&gt;
I have already made a fix in my odl project based on the &quot;AssociationErrorEvent&quot;, and I will write up the details of the modification soon. But I do not intend to submit it to the community, because I feel this solution is not good; it is just for your reference.&lt;/p&gt;</comment>
                            <comment id="66942" author="bosong" created="Wed, 26 Jun 2019 07:08:27 +0000"  >&lt;p&gt;Here is my solution (based on stable/oxygen):&lt;br/&gt;
  When an &quot;AssociationErrorEvent&quot; is received and its message contains &quot;The remote system has a UID that has been quarantined&quot;, the node gets ready to call the restart method.&lt;br/&gt;
Before restarting, it counts the number of remote addresses so that only the single isolated node is restarted. Previous designs might restart the two other nodes; my approach keeps the service uninterrupted.&lt;br/&gt;
This solution works only for three-node clusters; clusters with more nodes may have problems. I&apos;ve tested it several times in a three-node environment and it&apos;s stable.&lt;/p&gt;

&lt;p&gt;I have pushed my code to my GitHub:&lt;br/&gt;
&lt;a href=&quot;https://github.com/WillSongBo/controller/commit/4468d5b726fc1c57b7c5d631694a11165e081867#diff-815a2b08de862966176ed6838be6c0d8&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;change-diff&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;just for your reference. Looking forward to better solutions&lt;img class=&quot;emoticon&quot; src=&quot;https://jira.opendaylight.org/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;.&lt;/p&gt;
</comment>
                            <comment id="69243" author="bosong" created="Wed, 2 Jun 2021 09:29:43 +0000"  >&lt;h2&gt;&lt;a name=&quot;CONTROLLER1904havealreadybeensolved&quot;&gt;&lt;/a&gt;&lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1904&quot; title=&quot;DistributedEntityOwnershipService may silently lose registrations&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1904&quot;&gt;&lt;del&gt;CONTROLLER-1904&lt;/del&gt;&lt;/a&gt; has already been solved&lt;/h2&gt;</comment>
                            <comment id="69244" author="bosong" created="Wed, 2 Jun 2021 09:31:35 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1941&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.opendaylight.org/browse/CONTROLLER-1941&lt;/a&gt;&#160;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10003">
                    <name>Relates</name>
                                            <outwardlinks description="relates to">
                                        <issuelink>
            <issuekey id="32709">CONTROLLER-1941</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="15223" name="log.rar" size="76798" author="Bosong" created="Thu, 13 Jun 2019 08:18:19 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                            <customfield id="customfield_11400" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    <customfield id="customfield_10202" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Priority</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10301"><![CDATA[Normal]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10000" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>0|i03o6f:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>