<!-- 
RSS generated by JIRA (8.20.10#820010-sha1:ace47f9899e9ee25d7157d59aa17ab06aee30d3d) at Wed Feb 07 20:15:04 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>OpenDaylight JIRA</title>
    <link>https://jira.opendaylight.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>8.20.10</version>
        <build-number>820010</build-number>
        <build-date>22-06-2022</build-date>
    </build-info>


<item>
            <title>[NETCONF-454] Three node cluster does not recover from  isolation+ rejoin of a member.</title>
                <link>https://jira.opendaylight.org/browse/NETCONF-454</link>
                <project id="10142" key="NETCONF">netconf</project>
                    <description>&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;Aim:&lt;br/&gt;
Verify that a three node cluster can recover from  isolation+rejoin of a member.&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;Description:&lt;br/&gt;
A 3 node cluster was deployed. The netconf test tool was  connected and we looked behind  the mountpoint from each member of the cluster.&lt;br/&gt;
We then found the leader of the config-topology shard and the config-inventory shard.&lt;br/&gt;
A different member was isolated. So in this case the leader was member 2. Member-1 was isolated using iptables.&lt;br/&gt;
We looked beyond the mount point  of the netconf device from each member of the cluster.&lt;br/&gt;
The isolated member returned HTTP 401 when trying to view the netconf-topology using RESTCONF&lt;br/&gt;
Member-2 ,Member-3 returned HTTP 200 and data when trying to view the netconf-topology and look beyond the  mount point.&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;Logs are attached. Member-1 was rejoined at approx 04:44&lt;/p&gt;

&lt;p&gt;Results when trying to view the netconf-topology after rejoin&lt;br/&gt;
Try 1) Member 1: Null Member-2: 404 Member-3: 500&lt;br/&gt;
Try 2) Member 1: 200 Member-2: 404 Member-3: 200&lt;br/&gt;
Try 3) Member 1: 200 Member-2: 401 Member-3: 200&lt;/p&gt;

&lt;p&gt;Results when  looking beyond the  mount point after rejoin.&lt;br/&gt;
Try 1) Member 1: 404 Member-2: 404 Member-3: 500&lt;br/&gt;
Try 2) Member 1: 404 Member-2: 404 Member-3: 500&lt;br/&gt;
Try 3) Member 1: 404 Member-2: 401 Member-3: 500&lt;/p&gt;


&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;Test artifact&lt;br/&gt;
Carbon&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;Test setup&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;Member-1: 10.10.199.105 &lt;br/&gt;
Member-2: 10.10.199.88 &lt;br/&gt;
Member-3: 10.10.199.183 &lt;br/&gt;
Netconf device: 10.10.199.185&lt;/p&gt;




&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;Steps&lt;br/&gt;
Configure a 3 node cluster&lt;br/&gt;
Start odl on members&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;Set  the  log levels for clustering.  &lt;br/&gt;
log:set DEBUG org.opendaylight.controller.cluster  &lt;br/&gt;
log:set INFO org.opendaylight.controller.cluster.datastore.node.utils  &lt;/p&gt;


&lt;ol&gt;
	&lt;li&gt;Features loaded:&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;odl-jolokia odl-netconf-clustered-topology odl-netconf-connector-all odl-restconf odl-netconf-mdsal&lt;/p&gt;


&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;REST API CALLS&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
	&lt;li&gt;Find the leader of the shards on DistributedConfigDatastore.&lt;/li&gt;
&lt;/ol&gt;


&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;
		&lt;ol&gt;
			&lt;li&gt;Leader of shard-topology-config on DistributedConfigDatastore  on each member&lt;/li&gt;
		&lt;/ol&gt;
		&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;curl --request GET \&lt;br/&gt;
  --url &apos;http://10.10.199.XXX:8181/jolokia/read/org.opendaylight.controller:Category=Shards,name=member-1-shard-topology-config,type=DistributedConfigDatastore&apos; \&lt;br/&gt;
  --header &apos;accept: application/json&apos; \&lt;br/&gt;
  --header &apos;authorization: Basic YWRtaW46YWRtaW4=&apos; \&lt;br/&gt;
  --header &apos;cache-control: no-cache&apos; \&lt;br/&gt;
  --header &apos;content-type: application/json&apos; \&lt;br/&gt;
  --header &apos;postman-token: 19ee9a88-1c09-6762-6a54-0b44c8cba583&apos;&lt;/p&gt;

&lt;p&gt;Result:  &lt;/p&gt;




&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;
		&lt;ol&gt;
			&lt;li&gt;Leader of shard-inventory-config on DistributedConfigDatastore on each member&lt;/li&gt;
		&lt;/ol&gt;
		&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;curl --request GET \&lt;br/&gt;
  --url &apos;http://10.10.199.XXX:8181/jolokia/read/org.opendaylight.controller:Category=Shards,name=member-1-shard-inventory-config,type=DistributedConfigDatastore&apos; \&lt;br/&gt;
  --header &apos;accept: application/json&apos; \&lt;br/&gt;
  --header &apos;authorization: Basic YWRtaW46YWRtaW4=&apos; \&lt;br/&gt;
  --header &apos;cache-control: no-cache&apos; \&lt;br/&gt;
  --header &apos;content-type: application/json&apos; \&lt;br/&gt;
  --header &apos;postman-token: 1dc38630-8b41-10e4-cdfa-fb53639c543a&apos;&lt;/p&gt;

&lt;p&gt;Result:  &lt;/p&gt;




&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;
		&lt;ol&gt;
			&lt;li&gt;Confirm no device added to network topology&lt;/li&gt;
		&lt;/ol&gt;
		&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;curl --request GET \&lt;br/&gt;
  --url &lt;a href=&quot;http://10.10.199.105:8181/restconf/operational/network-topology:network-topology&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://10.10.199.105:8181/restconf/operational/network-topology:network-topology&lt;/a&gt; \&lt;br/&gt;
  --header &apos;accept: application/xml&apos; \&lt;br/&gt;
  --header &apos;authorization: Basic YWRtaW46YWRtaW4=&apos; \&lt;br/&gt;
  --header &apos;cache-control: no-cache&apos; \&lt;br/&gt;
  --header &apos;content-type: application/xml&apos; \&lt;br/&gt;
  --header &apos;postman-token: 1633d048-98ee-ae9a-00dd-b9f33749cb70&apos; &lt;/p&gt;

&lt;p&gt;Result:&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;
		&lt;ol&gt;
			&lt;li&gt;Start device on testtools-vm&lt;/li&gt;
		&lt;/ol&gt;
		&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;java -Xmx1G -XX:MaxPermSize=256M -Dorg.apache.sshd.registerBouncyCastle=false -jar netconf-testtool-1.2-Carbon-executable.jar  --device-count 1 --debug true --schemas-dir ./schemas --md-sal true &lt;/p&gt;




&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;
		&lt;ol&gt;
			&lt;li&gt;Add device&lt;/li&gt;
		&lt;/ol&gt;
		&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;curl --request PUT \&lt;br/&gt;
  --url &lt;a href=&quot;http://10.10.199.105:8181/restconf/config/network-topology:network-topology/topology/topology-netconf/node/netconf-test-device&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://10.10.199.105:8181/restconf/config/network-topology:network-topology/topology/topology-netconf/node/netconf-test-device&lt;/a&gt; \&lt;br/&gt;
  --header &apos;authorization: Basic YWRtaW46YWRtaW4=&apos; \&lt;br/&gt;
  --header &apos;cache-control: no-cache&apos; \&lt;br/&gt;
  --header &apos;content-type: application/xml&apos; \&lt;br/&gt;
  --header &apos;postman-token: a3868282-fbca-58e4-08ee-fc522a730e30&apos; \&lt;br/&gt;
  --data &apos; &amp;lt;node xmlns=&quot;urn:TBD:params:xml:ns:yang:network-topology&quot;&amp;gt;\r\n   &amp;lt;node-id&amp;gt;netconf-test-device&amp;lt;/node-id&amp;gt;\r\n   &amp;lt;host xmlns=&quot;urn:opendaylight:netconf-node-topology&quot;&amp;gt;10.10.199.185&amp;lt;/host&amp;gt;\r\n   &amp;lt;port xmlns=&quot;urn:opendaylight:netconf-node-topology&quot;&amp;gt;17830&amp;lt;/port&amp;gt;\r\n   &amp;lt;username xmlns=&quot;urn:opendaylight:netconf-node-topology&quot;&amp;gt;root&amp;lt;/username&amp;gt;\r\n   &amp;lt;password xmlns=&quot;urn:opendaylight:netconf-node-topology&quot;&amp;gt;root&amp;lt;/password&amp;gt;\r\n   &amp;lt;tcp-only xmlns=&quot;urn:opendaylight:netconf-node-topology&quot;&amp;gt;false&amp;lt;/tcp-only&amp;gt;\r\n   &amp;lt;keepalive-delay xmlns=&quot;urn:opendaylight:netconf-node-topology&quot;&amp;gt;0&amp;lt;/keepalive-delay&amp;gt;\r\n &amp;lt;/node&amp;gt;\r\n \r\n &apos;&lt;/p&gt;

&lt;p&gt;Result:&lt;br/&gt;
HTTP 200  &lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;
		&lt;ol&gt;
			&lt;li&gt;Confirm device  is  connected  in the operational topology on each of  the nodes.&lt;/li&gt;
		&lt;/ol&gt;
		&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;



&lt;p&gt;curl --request GET \&lt;br/&gt;
  --url &lt;a href=&quot;http://10.10.199.XXX:8181/restconf/operational/network-topology:network-topology&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://10.10.199.XXX:8181/restconf/operational/network-topology:network-topology&lt;/a&gt; \&lt;br/&gt;
  --header &apos;accept: application/xml&apos; \&lt;br/&gt;
  --header &apos;authorization: Basic YWRtaW46YWRtaW4=&apos; \&lt;br/&gt;
  --header &apos;cache-control: no-cache&apos; \&lt;br/&gt;
  --header &apos;content-type: application/xml&apos; \&lt;br/&gt;
  --header &apos;postman-token: 1633d048-98ee-ae9a-00dd-b9f33749cb70&apos; &lt;/p&gt;

&lt;p&gt;Result:&lt;br/&gt;
HTTP 200  &lt;/p&gt;


&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;
		&lt;ol&gt;
			&lt;li&gt;look beyond the  mount point from each of the nodes&lt;/li&gt;
		&lt;/ol&gt;
		&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;curl --request GET \&lt;br/&gt;
  --url &lt;a href=&quot;http://10.10.199.XXX:8181/restconf/operational/network-topology:network-topology/topology/topology-netconf/node/netconf-test-device/yang-ext:mount&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://10.10.199.XXX:8181/restconf/operational/network-topology:network-topology/topology/topology-netconf/node/netconf-test-device/yang-ext:mount&lt;/a&gt; \&lt;br/&gt;
  --header &apos;accept: application/xml&apos; \&lt;br/&gt;
  --header &apos;authorization: Basic YWRtaW46YWRtaW4=&apos; \&lt;br/&gt;
  --header &apos;cache-control: no-cache&apos; \&lt;br/&gt;
  --header &apos;content-type: application/xml&apos; \&lt;br/&gt;
  --header &apos;postman-token: e17517ae-f008-b96e-43ef-960804d7df87&apos;&lt;/p&gt;


&lt;p&gt;Result:&lt;br/&gt;
HTTP 200  &lt;/p&gt;


&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;
		&lt;ol&gt;
			&lt;li&gt;Find leader of shard-topology-config on DistributedConfigDatastore on each member after isolation&lt;/li&gt;
		&lt;/ol&gt;
		&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;Member-XXX&lt;br/&gt;
	curl --request GET \&lt;br/&gt;
	  --url &apos;http://10.10.199.XXX:8181/jolokia/read/org.opendaylight.controller:Category=Shards,name=member-1-shard-inventory-config,type=DistributedConfigDatastore&apos; \&lt;br/&gt;
	  --header &apos;accept: application/json&apos; \&lt;br/&gt;
	  --header &apos;authorization: Basic YWRtaW46YWRtaW4=&apos; \&lt;br/&gt;
	  --header &apos;cache-control: no-cache&apos; \&lt;br/&gt;
	  --header &apos;content-type: application/json&apos; \&lt;br/&gt;
	  --header &apos;postman-token: 607b0a83-e559-f1c6-dfb6-c18fde1c9364&apos;&lt;/p&gt;</description>
                <environment>&lt;p&gt;Operating System: All&lt;br/&gt;
Platform: All&lt;/p&gt;</environment>
        <key id="21467">NETCONF-454</key>
            <summary>Three node cluster does not recover from  isolation+ rejoin of a member.</summary>
                <type id="10104" iconUrl="https://jira.opendaylight.org/secure/viewavatar?size=xsmall&amp;avatarId=10303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.opendaylight.org/images/icons/priorities/critical.svg">High</priority>
                        <status id="10003" iconUrl="https://jira.opendaylight.org/images/icons/status_generic.gif" description="">Confirmed</status>
                    <statusCategory id="2" key="new" colorName="blue-gray"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="ivanhrasko">Ivan Hrasko</assignee>
                                    <reporter username="ricjhill@gmail.com">RichardHill</reporter>
                        <labels>
                            <label>pt</label>
                    </labels>
                <created>Wed, 16 Aug 2017 12:40:33 +0000</created>
                <updated>Mon, 22 Jan 2024 21:55:19 +0000</updated>
                                            <version>2.0.2</version>
                                    <fixVersion>7.0.0</fixVersion>
                    <fixVersion>5.0.10</fixVersion>
                    <fixVersion>6.0.7</fixVersion>
                                    <component>netconf</component>
                        <due></due>
                            <votes>1</votes>
                                    <watches>7</watches>
                                                                                                                <comments>
                            <comment id="40159" author="ricjhill@gmail.com" created="Wed, 16 Aug 2017 12:49:44 +0000"  >&lt;p&gt;The postman collection was run  to investigate the state of cluster&lt;br/&gt;
Collection examines each member of cluster and verifies if:&lt;/p&gt;

&lt;ul&gt;
	&lt;li&gt;netconf-topology is available&lt;/li&gt;
	&lt;li&gt;netconf device is connected&lt;/li&gt;
	&lt;li&gt;data available behind the mount point of netconf device&lt;/li&gt;
&lt;/ul&gt;


&lt;ul&gt;
	&lt;li&gt;leader of shard-topology-config&lt;/li&gt;
	&lt;li&gt;leader  of shard-inventory-config&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;The collection was run before and after isolation, &lt;br/&gt;
after isolation we waited for some minutes  then ran again&lt;br/&gt;
After rejoin the collection was run almost immediately then after a wait for  a few minutes.&lt;/p&gt;</comment>
                            <comment id="40181" author="ricjhill@gmail.com" created="Wed, 16 Aug 2017 12:49:44 +0000"  >&lt;p&gt;Attachment postman-collection-and-results-isolation-and-rejoin.zip has been added with description: postman results and collection used to examine cluster&lt;/p&gt;</comment>
                            <comment id="40160" author="ricjhill@gmail.com" created="Wed, 16 Aug 2017 13:17:52 +0000"  >
&lt;ol&gt;
	&lt;li&gt;Isolation of member-1 time:&lt;br/&gt;
approx 2017-08-16 04:35&lt;/li&gt;
&lt;/ol&gt;


&lt;ol&gt;
	&lt;li&gt;Rejoin time:&lt;br/&gt;
approx 2017-08-16 04:44&lt;/li&gt;
&lt;/ol&gt;
</comment>
                            <comment id="40182" author="ricjhill@gmail.com" created="Wed, 16 Aug 2017 13:17:52 +0000"  >&lt;p&gt;Attachment logs-carbon-isolation-rejoin.zip has been added with description: logs from each member of cluster&lt;/p&gt;</comment>
                            <comment id="40161" author="ricjhill@gmail.com" created="Wed, 16 Aug 2017 13:29:44 +0000"  >&lt;p&gt;Error seen on member-1 after rejoin&lt;/p&gt;

&lt;p&gt;2017-08-16 04:44:18,623 | ERROR |  Bundle Shutdown | NetconfTopologyManager           | 303 - org.opendaylight.netconf.topology-singleton - 1.2.0.Carbon | Error at closing topology context. InstanceIdentifier: KeyedInstanceIdentifier&lt;/p&gt;
{targetType=interface org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.network.topology.topology.Node, path=[org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.NetworkTopology, org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.network.topology.Topology[key=TopologyKey [_topologyId=Uri [_value=topology-netconf]]], org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.network.topology.topology.Node[key=NodeKey [_nodeId=Uri [_value=netconf-test-device]]]]}
&lt;p&gt;java.lang.NullPointerException: RemoteDevice&lt;/p&gt;
{netconf-test-device}
&lt;p&gt;: Device communicator was not created.&lt;br/&gt;
	at com.google.common.base.Preconditions.checkNotNull(Preconditions.java:226)&lt;br/&gt;
	at org.opendaylight.netconf.topology.singleton.impl.RemoteDeviceConnectorImpl.stopRemote&lt;/p&gt;</comment>
                            <comment id="40162" author="ricjhill@gmail.com" created="Wed, 16 Aug 2017 13:31:13 +0000"  >&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;Error seen on member-3 after rejoin&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;2017-08-16 04:45:09,241 | ERROR | qtp666241474-424 | ContainerResponse                | 179 - com.sun.jersey.jersey-server - 1.17.0 | The RuntimeException could not be mapped to a response, re-throwing to the HTTP container&lt;br/&gt;
java.lang.IllegalStateException: Can&apos;t create ProxyReadTransaction&lt;br/&gt;
	at org.opendaylight.netconf.topology.singleton.impl.ProxyDOMDataBroker.newReadOnlyTransaction(ProxyDOMDataBroker.java:76)&lt;br/&gt;
	at org.opendaylight.netconf.sal.restconf.impl.BrokerFacade.readOperationalData(BrokerFacade.java:207)&lt;span class=&quot;error&quot;&gt;&amp;#91;307:org.opendaylight.netconf.sal-rest-connector:1.5.0.Carbon&amp;#93;&lt;/span&gt;&lt;br/&gt;
	at org.opendaylight.netconf.sal.restconf.impl.RestconfImpl.readOperationalData(RestconfImpl.java:736)&lt;span class=&quot;error&quot;&gt;&amp;#91;307:org.opendaylight.netconf.sal-rest-connector:1.5.0.Carbon&amp;#93;&lt;/span&gt;&lt;br/&gt;
	at org.opendaylight.netconf.sal.restconf.impl.StatisticsRestconfServiceWrapper.readOperationalData(StatisticsRestconfServiceWrapper.java:116)[307:org.opendaylight.netconf.sal-rest&lt;/p&gt;</comment>
                            <comment id="40163" author="ricjhill@gmail.com" created="Thu, 17 Aug 2017 12:41:03 +0000"  >&lt;p&gt;java -version&lt;br/&gt;
java version &quot;1.8.0_131&quot;&lt;br/&gt;
Java(TM) SE Runtime Environment (build 1.8.0_131-b11)&lt;br/&gt;
Java HotSpot(TM) 64-Bit Server VM (build 25.131-b11, mixed mode)&lt;/p&gt;</comment>
                            <comment id="40164" author="rovarga" created="Thu, 17 Aug 2017 13:41:54 +0000"  >&lt;p&gt;This is a problem in netconf, not controller.&lt;/p&gt;</comment>
                            <comment id="40165" author="tolvecky@frinx.io" created="Thu, 17 Aug 2017 14:32:58 +0000"  >&lt;p&gt;I think there is also a problem in clustering as well, because node-2 did not respond to restconf requests that were querying md-sal topology:&lt;br/&gt;
&amp;gt; Results when trying to view the netconf-topology after rejoin&lt;br/&gt;
&amp;gt; Try 1) Member 1: Null Member-2: 404 Member-3: 500&lt;br/&gt;
&amp;gt; Try 2) Member 1: 200 Member-2: 404 Member-3: 200&lt;br/&gt;
&amp;gt; Try 3) Member 1: 200 Member-2: 401 Member-3: 200&lt;/p&gt;

&lt;p&gt;Reading logs of node-2 I see TimeoutException in blueprint. This happened after the node got quarantined:&lt;br/&gt;
2017-08-16 04:43:58,092 | WARN  | ult-dispatcher-3 | QuarantinedMonitorActor          | 239 - org.opendaylight.controller.sal-clustering-commons - 1.5.0.Carbon | Got quarantined&lt;/p&gt;

&lt;p&gt;Fix for &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1627&quot; title=&quot;LinkageError for union type after switchover or warm restart of instances cluster causing write tx to fail&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1627&quot;&gt;&lt;del&gt;CONTROLLER-1627&lt;/del&gt;&lt;/a&gt; should resolve that problem.&lt;/p&gt;</comment>
                            <comment id="40166" author="vrpolak" created="Mon, 21 Aug 2017 13:20:05 +0000"  >&lt;p&gt;&amp;gt; Try 1) Member 1: Null Member-2: 404 Member-3: 500&lt;/p&gt;

&lt;p&gt;What is the Jolokia response in this case?&lt;br/&gt;
I see similar failures in Netconf cluster tests even without prior isolation, but the Jolokia response &lt;span class=&quot;error&quot;&gt;&amp;#91;0&amp;#93;&lt;/span&gt; is different from a corresponding Nitrogen Bug &lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt;.&lt;/p&gt;

&lt;p&gt;&amp;gt; Fix for &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1627&quot; title=&quot;LinkageError for union type after switchover or warm restart of instances cluster causing write tx to fail&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1627&quot;&gt;&lt;del&gt;CONTROLLER-1627&lt;/del&gt;&lt;/a&gt; should resolve that problem.&lt;/p&gt;

&lt;p&gt;We will test the fix.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;0&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://logs.opendaylight.org/releng/jenkins092/netconf-csit-3node-clustering-only-carbon/630/log.html.gz#s1-s5-t13-k2-k2-k8-k1-k2-k1-k1-k3-k1&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/jenkins092/netconf-csit-3node-clustering-only-carbon/630/log.html.gz#s1-s5-t13-k2-k2-k8-k1-k2-k1-k1-k3-k1&lt;/a&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://bugs.opendaylight.org/show_bug.cgi?id=9006#c4&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://bugs.opendaylight.org/show_bug.cgi?id=9006#c4&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="40167" author="ricjhill@gmail.com" created="Tue, 22 Aug 2017 09:43:08 +0000"  >&lt;p&gt;Regarding  &lt;/p&gt;

&lt;p&gt;&amp;gt;&amp;gt; What is the Jolokia response in this case?&lt;/p&gt;

&lt;p&gt;&amp;gt;&amp;gt; Member 1: Null Member-2: 404 Member-3: 500&lt;/p&gt;

&lt;p&gt;The response is recorded in the postman results attached. It&apos;s possibly not clear from the bug report which result files map to which Try.&lt;/p&gt;


&lt;p&gt;Netconf-testtool-reconnecton.postman_test_run_after_rejoin.json = Try 1&lt;br/&gt;
Netconf-testtool-reconnecton.postman_test_run_after_rejoin1.json = Try 2&lt;br/&gt;
Netconf-testtool-reconnecton.postman_test_run_after_rejoin2.json = Try 3&lt;/p&gt;



&lt;p&gt;Netconf-testtool-reconnecton.postman_test_run_after_rejoin.json:   Line 693 shows the results for  Try 1&lt;/p&gt;



&lt;p&gt;		{&lt;br/&gt;
			&quot;id&quot;: &quot;6b6505be-db1e-ca25-28a3-9b4fb52f8473&quot;,&lt;br/&gt;
			&quot;name&quot;: &quot;View  operational network-topology member-1&quot;,&lt;br/&gt;
			&quot;url&quot;: &quot;http://&lt;tt&gt;controller-1-ip-addr&lt;/tt&gt;:&lt;tt&gt;port&lt;/tt&gt;/restconf/operational/network-topology:network-topology&quot;,&lt;br/&gt;
			&quot;totalTime&quot;: 0,&lt;br/&gt;
			&quot;responseCode&quot;: &lt;/p&gt;
{
				&quot;code&quot;: 0,
				&quot;name&quot;: &quot;&quot;,
				&quot;detail&quot;: &quot;&quot;
			}
&lt;p&gt;,&lt;br/&gt;
			&quot;tests&quot;: &lt;/p&gt;
{
				&quot;Response Body &quot;: true
			}
&lt;p&gt;,&lt;br/&gt;
			&quot;testPassFailCounts&quot;: {&lt;br/&gt;
				&quot;Response Body &quot;: &lt;/p&gt;
{
					&quot;pass&quot;: 1,
					&quot;fail&quot;: 0
				}
&lt;p&gt;			},&lt;br/&gt;
			&quot;times&quot;: [&lt;br/&gt;
				1008&lt;br/&gt;
			],&lt;br/&gt;
			&quot;allTests&quot;: [&lt;/p&gt;
				{
					&quot;Response Body &quot;: true
				}
&lt;p&gt;			],&lt;br/&gt;
			&quot;time&quot;: 1008,&lt;br/&gt;
			&quot;totalRequestTime&quot;: 1008,&lt;br/&gt;
			&quot;iterationResults&quot;: {}&lt;br/&gt;
		}&lt;/p&gt;</comment>
                            <comment id="40168" author="vrpolak" created="Tue, 22 Aug 2017 11:24:38 +0000"  >&lt;p&gt;I am still not sure if there are two different bugs, or just two descriptions of the same bug, so I have opened &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1754&quot; title=&quot;Carbon: Sporadic cluster failure when member is restarted in Netconf cluster test&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1754&quot;&gt;&lt;del&gt;CONTROLLER-1754&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="40169" author="ricjhill@gmail.com" created="Tue, 22 Aug 2017 12:46:50 +0000"  >&lt;p&gt;Regarding &lt;/p&gt;

&lt;p&gt;&amp;gt;&amp;gt; I am still not sure if there are two different bugs,&lt;/p&gt;

&lt;p&gt;What information do you  need to help investigate?&lt;/p&gt;</comment>
                            <comment id="40170" author="vrpolak" created="Fri, 25 Aug 2017 12:55:30 +0000"  >&lt;p&gt;&amp;gt; odl-jolokia odl-netconf-clustered-topology odl-netconf-connector-all odl-restconf odl-netconf-mdsal&lt;/p&gt;

&lt;p&gt;I think odl-netconf-connector-all is not really compatible with odl-netconf-clustered-topology. Does this happen without any odl-netconf-connector- feature?&lt;/p&gt;

&lt;p&gt;&amp;gt; What information do you need to help investigate?&lt;/p&gt;

&lt;p&gt;Ideally, we would want a Robot suite contributed to Integration/Test so that we can run it with various ODL builds on Sandbox. I think this suite:&lt;br/&gt;
&lt;a href=&quot;https://github.com/opendaylight/integration-test/blob/master/csit/suites/netconf/clustering/entity.robot#L111&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/opendaylight/integration-test/blob/master/csit/suites/netconf/clustering/entity.robot#L111&lt;/a&gt;&lt;br/&gt;
with ClusterManagement.Isolate_Member_From_List_Or_All (and ClusterManagement.Rejoin_Member_From_List_Or_All at line 138) would work, but I know you have to wait carefully after rejoin, it takes a while before the rejoining member is fully up-to-date.&lt;/p&gt;

&lt;p&gt;Sadly, I am on PTO next week, otherwise I would attempt to create such a suite myself.&lt;/p&gt;</comment>
                            <comment id="40171" author="ricjhill@gmail.com" created="Tue, 5 Sep 2017 11:08:50 +0000"  >&lt;p&gt;I&apos;ll push something&lt;/p&gt;</comment>
                            <comment id="40172" author="ricjhill@gmail.com" created="Wed, 20 Sep 2017 12:09:25 +0000"  >&lt;p&gt;Ive  tried to push this but  I  struggled  working out  why  coala was rejecting  the  submission.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/62765/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/62765/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;TOX validation failed and  was not  sure why. Can you  point me to some  documents  about  configuring &quot;coala&quot; please. After trial and error I  eventually removed some commented out code and added a linebreak in the   git  command  which  got it  to pass.&lt;/p&gt;</comment>
                            <comment id="40173" author="vrpolak" created="Thu, 21 Sep 2017 14:05:24 +0000"  >&lt;p&gt;&amp;gt; &lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/62765/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/62765/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;I have adapted that to run on Sandbox (without other suites). The test failed as described. See run &lt;span class=&quot;error&quot;&gt;&amp;#91;4&amp;#93;&lt;/span&gt; and archived logs &lt;span class=&quot;error&quot;&gt;&amp;#91;5&amp;#93;&lt;/span&gt; (both will get deleted over weekend).&lt;/p&gt;

&lt;p&gt;That was run against current Carbon snapshot build, and errors in logs look somewhat different. I will try to set DEBUG verbosity (on appropriate packages I have yet to narrow down) to see more details.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://jenkins.opendaylight.org/sandbox/job/netconf-csit-3node-clustering-all-carbon/1/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jenkins.opendaylight.org/sandbox/job/netconf-csit-3node-clustering-all-carbon/1/&lt;/a&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;5&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://logs.opendaylight.org/sandbox/jenkins091/netconf-csit-3node-clustering-all-carbon/1/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/sandbox/jenkins091/netconf-csit-3node-clustering-all-carbon/1/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&amp;gt; TOX validation failed and  was not  sure why.&lt;/p&gt;

&lt;p&gt;If you have Bash with activated virtualenv with Python 2.7 and tox, you can type &quot;tox&quot; in your test repository to run the same checks. Error messages are easier to locate that way.&lt;/p&gt;</comment>
                            <comment id="40174" author="vrpolak" created="Thu, 21 Sep 2017 14:11:38 +0000"  >&lt;p&gt;&amp;gt; see more details&lt;/p&gt;

&lt;p&gt;From INFO logs I got an impression that device owner has moved after rejoin, so this may be hitting the same Singleton deficiency as &lt;a href=&quot;https://jira.opendaylight.org/browse/MDSAL-291&quot; title=&quot;Member isolation test fails in Carbon and Nitrogen&quot; class=&quot;issue-link&quot; data-issue-key=&quot;MDSAL-291&quot;&gt;&lt;del&gt;MDSAL-291&lt;/del&gt;&lt;/a&gt; is hitting.&lt;/p&gt;</comment>
                            <comment id="40175" author="vrpolak" created="Thu, 21 Sep 2017 18:39:55 +0000"  >&lt;p&gt;&amp;gt; I will try to set DEBUG verbosity&lt;/p&gt;

&lt;p&gt;Run #3: &lt;span class=&quot;error&quot;&gt;&amp;#91;6&amp;#93;&lt;/span&gt;.&lt;br/&gt;
Member-1 was the old device owner to get isolated and it was the one not responding correctly after the rejoin. Member-2 is the new device owner.&lt;/p&gt;

&lt;p&gt;&amp;gt; may be hitting the same Singleton deficiency as &lt;a href=&quot;https://jira.opendaylight.org/browse/MDSAL-291&quot; title=&quot;Member isolation test fails in Carbon and Nitrogen&quot; class=&quot;issue-link&quot; data-issue-key=&quot;MDSAL-291&quot;&gt;&lt;del&gt;MDSAL-291&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Nope. According to member-1 log &lt;span class=&quot;error&quot;&gt;&amp;#91;7&amp;#93;&lt;/span&gt;, RELEASING_OWNERSHIP state did not get stuck:&lt;br/&gt;
2017-09-21 15:23:00,694 | DEBUG | ult-dispatcher-2 | ClusterSingletonServiceGroupImpl | 155 - org.opendaylight.mdsal.singleton-dom-impl - 2.2.2.SNAPSHOT | Service group KeyedInstanceIdentifier&lt;/p&gt;
{targetType=interface org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.network.topology.topology.Node, path=[org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.NetworkTopology, org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.network.topology.Topology[key=TopologyKey [_topologyId=Uri [_value=topology-netconf]]], org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.network.topology.topology.Node[key=NodeKey [_nodeId=Uri [_value=netconf-test-device]]]]}
&lt;p&gt; switching from RELEASING_OWNERSHIP to STANDBY&lt;/p&gt;

&lt;p&gt;Currently, I am suspicious about the way the new master removes &lt;span class=&quot;error&quot;&gt;&amp;#91;8&amp;#93;&lt;/span&gt; a slave mount point (not sure on which member):&lt;br/&gt;
2017-09-21 15:22:57,627 | INFO  | lt-dispatcher-50 | NetconfTopologyContext           | 282 - org.opendaylight.netconf.topology-singleton - 1.2.2.SNAPSHOT | Master was selected: IpAddress [_ipv4Address=Ipv4Address &lt;span class=&quot;error&quot;&gt;&amp;#91;_value=10.29.15.99&amp;#93;&lt;/span&gt;]&lt;br/&gt;
2017-09-21 15:22:57,627 | DEBUG | lt-dispatcher-50 | NetconfNodeManager               | 282 - org.opendaylight.netconf.topology-singleton - 1.2.2.SNAPSHOT | RemoteDevice&lt;/p&gt;
{netconf-test-device}: Sending message to unregister slave mountpoint on Actor&lt;a href=&quot;#-411647032&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;akka://opendaylight-cluster-data/user/$v#-411647032&lt;/a&gt;&lt;br/&gt;
2017-09-21 15:22:57,627 | DEBUG | lt-dispatcher-50 | NetconfNodeManager               | 282 - org.opendaylight.netconf.topology-singleton - 1.2.2.SNAPSHOT | RemoteDevice{netconf-test-device}
&lt;p&gt;: Sending poison pill to Actor&lt;a href=&quot;#-411647032&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;akka://opendaylight-cluster-data/user/$v#-411647032&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;6&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://jenkins.opendaylight.org/sandbox/job/netconf-csit-3node-clustering-all-carbon/3/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jenkins.opendaylight.org/sandbox/job/netconf-csit-3node-clustering-all-carbon/3/&lt;/a&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;7&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://logs.opendaylight.org/sandbox/jenkins091/netconf-csit-3node-clustering-all-carbon/3/odl1_karaf.log.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/sandbox/jenkins091/netconf-csit-3node-clustering-all-carbon/3/odl1_karaf.log.gz&lt;/a&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;8&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://logs.opendaylight.org/sandbox/jenkins091/netconf-csit-3node-clustering-all-carbon/3/odl2_karaf.log.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/sandbox/jenkins091/netconf-csit-3node-clustering-all-carbon/3/odl2_karaf.log.gz&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="40176" author="vrpolak" created="Mon, 25 Sep 2017 12:58:39 +0000"  >&lt;p&gt;&amp;gt; Run #3: &lt;span class=&quot;error&quot;&gt;&amp;#91;6&amp;#93;&lt;/span&gt;.&lt;/p&gt;

&lt;p&gt;For this week, &lt;span class=&quot;error&quot;&gt;&amp;#91;9&amp;#93;&lt;/span&gt; seems to be the same, although I have not looked into karaf.log yet.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;9&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://jenkins.opendaylight.org/sandbox/job/netconf-csit-3node-clustering-only-carbon/1/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jenkins.opendaylight.org/sandbox/job/netconf-csit-3node-clustering-only-carbon/1/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="40177" author="vrpolak" created="Tue, 26 Sep 2017 12:13:58 +0000"  >&lt;p&gt;&amp;gt; &lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/62765/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/62765/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The suite is merged. I am still investigating whether the failure is in Singleton implementation.&lt;/p&gt;</comment>
                            <comment id="40178" author="vrpolak" created="Wed, 18 Oct 2017 14:54:43 +0000"  >&lt;p&gt;Finally I think I understand what is going on well enough to write a comment.&lt;/p&gt;

&lt;p&gt;First, Netconf clustered topology is based on Singleton service, which is based on Entity ownership service, which uses a specific Shard to store its data, and when the previous Leader of that shard is isolated, it enters IsolatedLeader state and it cannot make progress. Only after rejoining does the affected member learn it has lost its ownership and that another member is already the owner. The isolation suite frequently passes because Entity ownership service takes some time to notify the member, and until that happens it still considers itself a master, keeps opened SSH connection to the device, and everything looks good from the Robot point of view.&lt;/p&gt;

&lt;p&gt;I am less sure on details of what happens when this bug is present:&lt;br/&gt;
There is a listener on config topology changes. If a device is configured, the listener (on each member) registers a singleton application, and starts another (slave) listener, this time on operational topology. Singleton service instantiates the application on one member (called master). Master stops the local slave listener, and creates a netconf connector, updating the operational topology status. When connector connects, master creates a local mdsal &quot;master&quot; mountpoint others can use to access the device. Slaves get notified (by &quot;connected&quot; status in operational topology) and create local mdsal &quot;slave&quot; mountpoints (to forward requests to the master).&lt;br/&gt;
So far so good, but the catch is that the writes to operational topology are only done by a component which handles the connection, which assumes it is the only connection to the specified device. But in the isolation scenario we have both the old master and the new master.&lt;br/&gt;
If the singleton service decides the current owner should not be an owner, netconf code does the following (perhaps in a different order): removes the master mount point, tears down the netconf connection, updates the operational topology and creates the slave listener again. Other members get notified (by seeing the device has no longer status connected in operational topology) and they remove their slave mountpoints. Only after the previous owner has done everything, singleton service should instantiate the application on a new owner.&lt;/p&gt;

&lt;p&gt;That would work well if it was true that there is always at most one owner across the cluster. But in the isolation scenario, the new owner updates the operational topology first, and then the old owner deletes that without the new owner realizing. Also, other slaves interpret that as device disconnecting and they remove their mount points as well, resulting in this Bug.&lt;/p&gt;

&lt;p&gt;I am not sure whether Entity ownership service could be modified to guarantee at most one owner across cluster even in isolation scenarios.&lt;br/&gt;
If not, Netconf code has to accommodate for late old owner writes.&lt;/p&gt;

&lt;p&gt;P.S.: This week Sandbox job with perhaps too much traces: &lt;span class=&quot;error&quot;&gt;&amp;#91;10&amp;#93;&lt;/span&gt;.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;10&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://jenkins.opendaylight.org/sandbox/job/netconf-csit-3node-clustering-only-oxygen/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jenkins.opendaylight.org/sandbox/job/netconf-csit-3node-clustering-only-oxygen/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="40179" author="vrpolak" created="Wed, 18 Oct 2017 15:08:50 +0000"  >&lt;p&gt;&amp;gt; This week Sandbox job&lt;/p&gt;

&lt;p&gt;Selected lines from karaf log &lt;span class=&quot;error&quot;&gt;&amp;#91;11&amp;#93;&lt;/span&gt;:&lt;/p&gt;

&lt;p&gt;2017-10-18 12:12:49,195 | INFO  | nsole user karaf | core                             | 167 - org.apache.karaf.log.core - 4.0.10 | ROBOT MESSAGE: Starting test Find_And_Isolate_Device_Entity_Owner&lt;/p&gt;

&lt;p&gt;2017-10-18 12:12:51,052 | INFO  | nsole user karaf | core                             | 167 - org.apache.karaf.log.core - 4.0.10 | ROBOT MESSAGE: Starting test Wait_For_New_Owner_To_Appear&lt;/p&gt;

&lt;p&gt;2017-10-18 12:13:01,731 | INFO  | nsole user karaf | core                             | 167 - org.apache.karaf.log.core - 4.0.10 | ROBOT MESSAGE: Starting test Rejoin_Original_Entity_Owner&lt;/p&gt;

&lt;p&gt;2017-10-18 12:13:02,385 | DEBUG | lt-dispatcher-21 | sterSingletonServiceProviderImpl | 298 - org.opendaylight.mdsal.singleton-dom-impl - 2.4.0.SNAPSHOT | Ownership change for ClusterSingletonService Provider DOMEntityOwnershipChange [entity=DOMEntity [type=org.opendaylight.mdsal.ServiceEntityType, id=/(urn:opendaylight:params:xml:ns:yang:mdsal:core:general-entity?revision=2015-09-30)entity/entity[{(urn:opendaylight:params:xml:ns:yang:mdsal:core:general-entity?revision=2015-09-30)name=KeyedInstanceIdentifier{targetType=interface org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.network.topology.topology.Node, path=[org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.NetworkTopology, org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.network.topology.Topology[key=TopologyKey [_topologyId=Uri &lt;span class=&quot;error&quot;&gt;&amp;#91;_value=topology-netconf&amp;#93;&lt;/span&gt;]], org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.network.topology.topology.Node[key=NodeKey [_nodeId=Uri &lt;span class=&quot;error&quot;&gt;&amp;#91;_value=netconf-test-device&amp;#93;&lt;/span&gt;]]]}}]], state=LOCAL_OWNERSHIP_LOST_NEW_OWNER &lt;span class=&quot;error&quot;&gt;&amp;#91;wasOwner=true, isOwner=false, hasOwner=true&amp;#93;&lt;/span&gt;, inJeopardy=false]&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;11&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://logs.opendaylight.org/sandbox/jenkins091/netconf-csit-3node-clustering-only-oxygen/48/odl1_karaf.log.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/sandbox/jenkins091/netconf-csit-3node-clustering-only-oxygen/48/odl1_karaf.log.gz&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="40180" author="vrpolak" created="Wed, 18 Oct 2017 15:59:17 +0000"  >&lt;p&gt;&amp;gt; I am not sure whether Entity ownership service could be modified to&lt;br/&gt;
&amp;gt; guarantee at most one owner across cluster even in isolation scenarios.&lt;/p&gt;

&lt;p&gt;If I am reading this &lt;span class=&quot;error&quot;&gt;&amp;#91;12&amp;#93;&lt;/span&gt; comment correctly, two owners during cluster partition is an expected outcome.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;12&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://github.com/opendaylight/controller/blob/release/nitrogen/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/datastore/entityownership/EntityOwnershipShard.java#L462-L467&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/opendaylight/controller/blob/release/nitrogen/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/datastore/entityownership/EntityOwnershipShard.java#L462-L467&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="59732" author="tcere" created="Thu, 19 Oct 2017 12:54:53 +0000"  >&lt;p&gt;I am pretty sure that singleton service should be trying to shut down the service on the isolated node. Although if we are doing any datastore operations in the close of the Netconf service based on singleton we wont shutdown until the datastore is available so if im right we will have two owners on rejoin. No idea atm what the appropriate solution would be I would have to think about this.&lt;/p&gt;</comment>
                            <comment id="59797" author="vrpolak" created="Thu, 26 Oct 2017 10:29:58 +0000"  >&lt;p&gt;Workaround attempt (master branch): &lt;span class=&quot;error&quot;&gt;&amp;#91;13&amp;#93;&lt;/span&gt;.&lt;br/&gt;
Not fully tested yet, but ready for code review.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;13&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/64440/8&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/64440/8&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="59800" author="vrpolak" created="Thu, 26 Oct 2017 13:01:24 +0000"  >&lt;p&gt;&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;13&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;Testing shows two issues. One is specific to the current patch not cleaning up operational state &lt;span class=&quot;error&quot;&gt;&amp;#91;14&amp;#93;&lt;/span&gt; (3 occurrences), other looks like a controller bug &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1784&quot; title=&quot;DataTreeChangeListener when registered during isolation (possibly) does not receive notifications even after rejoin&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1784&quot;&gt;&lt;del&gt;CONTROLLER-1784&lt;/del&gt;&lt;/a&gt; (1 occurrence).&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;14&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://git.opendaylight.org/gerrit/#/c/64706/2/netconf/sal-netconf-connector/src/main/java/org/opendaylight/netconf/sal/connect/netconf/sal/NetconfDeviceTopologyAdapter.java@144&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/#/c/64706/2/netconf/sal-netconf-connector/src/main/java/org/opendaylight/netconf/sal/connect/netconf/sal/NetconfDeviceTopologyAdapter.java@144&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="69031" author="ivanhrasko" created="Wed, 31 Mar 2021 09:36:38 +0000"  >&lt;p&gt;The problem is partially caused by mdsal-clustering behavior and partially by netconf-clustering behavior.&lt;/p&gt;

&lt;p&gt;The problem in mdsal-clustering is that after the isolation the previous leader becomes leader again, and in general we can say that there are more leader transitions than necessary.&lt;/p&gt;

&lt;p&gt;I believe the sequence should be as this:&lt;br/&gt;
 we have a leader, the leader gets isolated, in the meantime new leader is elected, after rejoin the previous leader accepts its follower role to the new leader.&lt;/p&gt;

&lt;p&gt;But after applying this patch &lt;a href=&quot;https://git.opendaylight.org/gerrit/c/netconf/+/95511&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/c/netconf/+/95511&lt;/a&gt; we get logs like this:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;21
 2021-03-24T15:52:32,600 | INFO | opendaylight-cluster-data-akka.actor.default-dispatcher-36 | NetconfTopologyContext | 289 - org.opendaylight.netconf.topology-singleton - 1.13.2.SNAPSHOT | Master was selected: IpAddress{_ipv4Address=Ipv4Address{_value=192.168.56.25}}
21
 2021-03-24T15:52:36,819 | INFO | remote-connector-processing-executor-10 | MasterSalFacade | 289 - org.opendaylight.netconf.topology-singleton - 1.13.2.SNAPSHOT | Device RemoteDevice{netconf-test-device} connected - registering master mount point
ISOLATION 15:52:43
23
 2021-03-24T15:52:52,229 | INFO | opendaylight-cluster-data-akka.actor.default-dispatcher-36 | NetconfTopologyContext | 289 - org.opendaylight.netconf.topology-singleton - 1.13.2.SNAPSHOT | Master was selected: IpAddress{_ipv4Address=Ipv4Address{_value=192.168.56.25}}
23
 2021-03-24T15:52:53,496 | INFO | remote-connector-processing-executor-12 | MasterSalFacade | 289 - org.opendaylight.netconf.topology-singleton - 1.13.2.SNAPSHOT | Device RemoteDevice{netconf-test-device} connected - registering master mount point
NEW OWNER 15:52:53.660
21
 2021-03-24T15:52:55,988 | INFO | opendaylight-cluster-data-akka.actor.default-dispatcher-20 | NetconfTopologyContext | 289 - org.opendaylight.netconf.topology-singleton - 1.13.2.SNAPSHOT | Master was removed: IpAddress{_ipv4Address=Ipv4Address{_value=192.168.56.25}}
21
 2021-03-24T15:52:55,991 | INFO | opendaylight-cluster-data-akka.actor.default-dispatcher-20 | MasterSalFacade | 289 - org.opendaylight.netconf.topology-singleton - 1.13.2.SNAPSHOT | Device RemoteDevice{netconf-test-device} disconnected - unregistering master mount point
REJOIN 15:53:57
21
 2021-03-24T15:54:00,702 | INFO | opendaylight-cluster-data-akka.actor.default-dispatcher-20 | NetconfTopologyContext | 289 - org.opendaylight.netconf.topology-singleton - 1.13.2.SNAPSHOT | Master was selected: IpAddress{_ipv4Address=Ipv4Address{_value=192.168.56.25}}
21
 2021-03-24T15:54:00,909 | INFO | remote-connector-processing-executor-1 | MasterSalFacade | 289 - org.opendaylight.netconf.topology-singleton - 1.13.2.SNAPSHOT | Device RemoteDevice{netconf-test-device} connected - registering master mount point
21
 2021-03-24T15:54:08,477 | INFO | opendaylight-cluster-data-akka.actor.default-dispatcher-23 | NetconfTopologyContext | 289 - org.opendaylight.netconf.topology-singleton - 1.13.2.SNAPSHOT | Master was removed: IpAddress{_ipv4Address=Ipv4Address{_value=192.168.56.25}}
RESYNC 15:54:08.844
22
 2021-03-24T15:54:09,253 | INFO | opendaylight-cluster-data-akka.actor.default-dispatcher-19 | NetconfTopologyContext | 289 - org.opendaylight.netconf.topology-singleton - 1.13.2.SNAPSHOT | Master was selected: IpAddress{_ipv4Address=Ipv4Address{_value=192.168.56.25}}
23
 2021-03-24T15:54:09,261 | INFO | opendaylight-cluster-data-akka.actor.default-dispatcher-37 | NetconfTopologyContext | 289 - org.opendaylight.netconf.topology-singleton - 1.13.2.SNAPSHOT | Master was removed: IpAddress{_ipv4Address=Ipv4Address{_value=192.168.56.25}}
23
 2021-03-24T15:54:09,268 | INFO | opendaylight-cluster-data-akka.actor.default-dispatcher-37 | MasterSalFacade | 289 - org.opendaylight.netconf.topology-singleton - 1.13.2.SNAPSHOT | Device RemoteDevice{netconf-test-device} disconnected - unregistering master mount point
22
 2021-03-24T15:54:09,478 | INFO | remote-connector-processing-executor-1 | MasterSalFacade | 289 - org.opendaylight.netconf.topology-singleton - 1.13.2.SNAPSHOT | Device RemoteDevice{netconf-test-device} connected - registering master mount point
22
 2021-03-24T15:54:11,938 | INFO | opendaylight-cluster-data-notification-dispatcher-49 | NetconfTopologyContext | 289 - org.opendaylight.netconf.topology-singleton - 1.13.2.SNAPSHOT | Master was removed: IpAddress{_ipv4Address=Ipv4Address{_value=192.168.56.25}}
22
 2021-03-24T15:54:11,961 | INFO | opendaylight-cluster-data-notification-dispatcher-49 | MasterSalFacade | 289 - org.opendaylight.netconf.topology-singleton - 1.13.2.SNAPSHOT | Device RemoteDevice{netconf-test-device} disconnected - unregistering master mount point
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;The problem in netconf clustering is that the NetconfTopologyContext class is using the RemoteDeviceConnector class to control the leader-follower transition. Its methods work well for connecting and disconnecting the device but they are misused for leadership change.&lt;/p&gt;

&lt;p&gt;When a node becomes the leader, RemoteDeviceConnector#startRemoteDeviceConnection is called. This method starts the &quot;master&quot; mount point and writes the status &quot;connected&quot; for the device to the operational datastore.&lt;br/&gt;
 The followers are listening for changes in operational datastore and when they receive &quot;connected&quot; they create their &quot;slave&quot; mountpoints.&lt;/p&gt;

&lt;p&gt;When a node becomes a follower, RemoteDeviceConnector#stopRemoteDeviceConnection is called. This method stops the &quot;master&quot; mountpoint, writes &quot;connecting&quot; for the device to the operational datastore ... and finally deletes the node data from the operational datastore. And that&apos;s the problem. After the isolation of the previous leader, when the new leader is elected it writes the status &quot;connected&quot; and followers establish their &quot;slave&quot; mountpoints. But after rejoin we can get a disconnect with receiving &quot;connecting&quot;, and after that all node data are deleted from the operational datastore.&lt;/p&gt;

&lt;p&gt;I have made a test run without removing the config data for the device in the teardown, but I ended up in the state where all &quot;slave&quot; mountpoints were down and the operational data for the node were gone. The controller was not able to recover the connection. The only solution was to restart the test device!&lt;/p&gt;

&lt;p&gt;I suggest we need becomeLeader() and becomeFollower() methods which will do:&lt;/p&gt;

&lt;p&gt;becomeLeader() will write &quot;connecting&quot; to operational datastore to cause reconnect on followers (they need to drop &quot;slave&quot; mount points redirected to previous leader), then it will establish &quot;master&quot; mount point and if that succeeds it can write &quot;connected&quot; to operational datastore - thus notifying followers to create &quot;slave&quot; mountpoints.&lt;/p&gt;

&lt;p&gt;becomeFollower() will just stop the &quot;master&quot; mount point and write nothing to the operational datastore. I think that the current implementation, by writing to the operational datastore after the node is no longer the leader, breaks the netconf clustering idea that only the leader can write the device status.&lt;/p&gt;

&lt;p&gt;That looks simple but the implementation of RemoteDeviceConnector using NetconfConnectorDTO class is making this hard problem. We will need to rework some of the classes and unit tests.&lt;/p&gt;</comment>
                            <comment id="69039" author="ivanhrasko" created="Wed, 7 Apr 2021 13:02:24 +0000"  >&lt;p&gt;&lt;b&gt;After fixing the previous problems we occasionally get to the following problem:&lt;/b&gt;&lt;/p&gt;

&lt;p&gt;Node 22 was the leader.&lt;br/&gt;
 We have isolated node 22 from nodes 21 and 23 using iptables.&lt;br/&gt;
 During isolation nodes 21 and 23 were able to provide mountpoints.&lt;/p&gt;

&lt;p&gt;We have broken the isolation and noticed the following logs:&lt;/p&gt;

&lt;p&gt;&lt;b&gt;node 21:&lt;/b&gt;&lt;br/&gt;
 no logs at all (it means that this node did not even get operational datastore notifications)&lt;/p&gt;

&lt;p&gt;&lt;b&gt;node 22:&lt;/b&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2021-04-07T14:35:42.142869+02:00[Europe/Bratislava]: I am master
2021-04-07T14:35:42.289366+02:00[Europe/Bratislava]: Updating data: true from master: akka.tcp://opendaylight-cluster-data@192.168.56.22:2550
2021-04-07T14:35:47.022939+02:00[Europe/Bratislava]: I am follower
2021-04-07T14:35:47.039052+02:00[Europe/Bratislava]: modification...
2021-04-07T14:35:47.049861+02:00[Europe/Bratislava]: Connecting slave mount point to: akka.tcp://opendaylight-cluster-data@192.168.56.22:2550
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&lt;b&gt;node 23:&lt;/b&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; 2021-04-07T14:35:46.612573+02:00[Europe/Bratislava]: modification...
 2021-04-07T14:35:46.626221+02:00[Europe/Bratislava]: Disconnecting slave mountpoint
 2021-04-07T14:35:47.040825+02:00[Europe/Bratislava]: modification...
 2021-04-07T14:35:47.048678+02:00[Europe/Bratislava]: Connecting slave mount point to: akka.tcp://opendaylight-cluster-data@192.168.56.22:2550
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;We see that node 21 is somehow out of game and nodes 22 and 23 were trying to form leader - follower relationship.&lt;br/&gt;
 But node 22 failed to hold its leadership, became a follower and created a non-working &quot;slave&quot; mountpoint to itself.&lt;br/&gt;
 We can only speculate that probably the node 21 was supposed to be new leader but failed to receive notification about that.&lt;/p&gt;

&lt;p&gt;&lt;b&gt;Note:&lt;/b&gt;&lt;br/&gt;
 the logs you can see are not from the application code but from IDE debugger:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;I am master - means that nodes instantiateServiceInstance method was called&lt;/li&gt;
	&lt;li&gt;I am follower - means that node closeServiceInstance method was called&lt;/li&gt;
	&lt;li&gt;Updating data: ... - means that the node is writing data to operational DS (by writing true the leader informs its followers to create &quot;slave&quot; mount points)&lt;/li&gt;
	&lt;li&gt;modification - means that node received notification about change in operational DS&lt;/li&gt;
	&lt;li&gt;Disconnecting slave mount point - means the node is disconnecting its &quot;slave&quot; mountpoint&lt;/li&gt;
	&lt;li&gt;Connecting slave mount point to: ... - means that node is connecting its &quot;slave&quot; mountpoint to leader with IP address as you can see in debugger message&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;instantiateServiceInstance and closeServiceInstance methods are implemented in:&lt;br/&gt;
 org.opendaylight.netconf.topology.singleton.impl.NetconfTopologyContext implements ClusterSingletonService, AutoCloseable&lt;/p&gt;

&lt;p&gt;.. and here is the output of &lt;b&gt;/rests/data/entity-owners:entity-owners&lt;/b&gt;:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-json&quot;&gt;
{
  &lt;span class=&quot;code-quote&quot;&gt;&quot;entity-owners:entity-owners&quot;&lt;/span&gt;: {
    &lt;span class=&quot;code-quote&quot;&gt;&quot;entity-type&quot;&lt;/span&gt;: [
      {
        &lt;span class=&quot;code-quote&quot;&gt;&quot;type&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;org.opendaylight.mdsal.ServiceEntityType&quot;&lt;/span&gt;,
        &lt;span class=&quot;code-quote&quot;&gt;&quot;entity&quot;&lt;/span&gt;: [
          {
            &lt;span class=&quot;code-quote&quot;&gt;&quot;id&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;/odl-general-entity:entity[name=&lt;span class=&quot;code-quote&quot;&gt;&apos;KeyedInstanceIdentifier{targetType=&lt;span class=&quot;code-keyword&quot;&gt;interface&lt;/span&gt; org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.network.topology.topology.Node, path=[org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.NetworkTopology, org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.network.topology.Topology[key=TopologyKey{_topologyId=Uri{_value=topology-netconf}}], org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.network.topology.topology.Node[key=NodeKey{_nodeId=Uri{_value=netconf-test-device}}]]}&apos;&lt;/span&gt;]&quot;&lt;/span&gt;,
            &lt;span class=&quot;code-quote&quot;&gt;&quot;candidate&quot;&lt;/span&gt;: [
              {
                &lt;span class=&quot;code-quote&quot;&gt;&quot;name&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;member-1&quot;&lt;/span&gt;
              },
              {
                &lt;span class=&quot;code-quote&quot;&gt;&quot;name&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;member-3&quot;&lt;/span&gt;
              },
              {
                &lt;span class=&quot;code-quote&quot;&gt;&quot;name&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;member-2&quot;&lt;/span&gt;
              }
            ],
            &lt;span class=&quot;code-quote&quot;&gt;&quot;owner&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;member-1&quot;&lt;/span&gt;
          }
        ]
      },
      {
        &lt;span class=&quot;code-quote&quot;&gt;&quot;type&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;org.opendaylight.mdsal.AsyncServiceCloseEntityType&quot;&lt;/span&gt;,
        &lt;span class=&quot;code-quote&quot;&gt;&quot;entity&quot;&lt;/span&gt;: [
          {
            &lt;span class=&quot;code-quote&quot;&gt;&quot;id&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;/odl-general-entity:entity[name=&lt;span class=&quot;code-quote&quot;&gt;&apos;KeyedInstanceIdentifier{targetType=&lt;span class=&quot;code-keyword&quot;&gt;interface&lt;/span&gt; org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.network.topology.topology.Node, path=[org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.NetworkTopology, org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.network.topology.Topology[key=TopologyKey{_topologyId=Uri{_value=topology-netconf}}], org.opendaylight.yang.gen.v1.urn.tbd.params.xml.ns.yang.network.topology.rev131021.network.topology.topology.Node[key=NodeKey{_nodeId=Uri{_value=netconf-test-device}}]]}&apos;&lt;/span&gt;]&quot;&lt;/span&gt;,
            &lt;span class=&quot;code-quote&quot;&gt;&quot;candidate&quot;&lt;/span&gt;: [
              {
                &lt;span class=&quot;code-quote&quot;&gt;&quot;name&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;member-1&quot;&lt;/span&gt;
              }
            ],
            &lt;span class=&quot;code-quote&quot;&gt;&quot;owner&quot;&lt;/span&gt;: &lt;span class=&quot;code-quote&quot;&gt;&quot;member-1&quot;&lt;/span&gt;
          }
        ]
      }
    ]
  }
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="69103" author="ivanhrasko" created="Tue, 13 Apr 2021 17:18:35 +0000"  >&lt;p&gt;We have reproduced this problem again.&lt;/p&gt;

&lt;p&gt;The problem is as follows:&lt;br/&gt;
you can see that at time 2021-04-12T14:16:33.861299+02:00&lt;br/&gt;
node 23 as the leader is writing data to operational datastore.&lt;/p&gt;

&lt;p&gt;But the isolated leader node 21 after isolation got notification that&lt;br/&gt;
it is leader again at 2021-04-12T14:17:36.064744+02:00&lt;br/&gt;
and is writing to operational DS at 2021-04-12T14:17:36.292559+02:00.&lt;br/&gt;
It holds the leadership only for 4 seconds and then becomes a follower.&lt;br/&gt;
That is not correct because the real leader is still node 23.&lt;/p&gt;

&lt;p&gt;The thing that makes Netconf clustering to crash is that node 21&lt;br/&gt;
got notification that it is leader and then that it is follower and node 23&lt;br/&gt;
has no information about this &quot;chaos&quot;. Node 23 never got notification&lt;br/&gt;
that it lost its leadership (and probably that never even happened), thus&lt;br/&gt;
node 23 has no chance to write the correct data to the operational&lt;br/&gt;
datastore again.&lt;/p&gt;

&lt;p&gt;This results in incorrect leader address stored in operational DS as you can&lt;br/&gt;
see at time 2021-04-12T14:17:40.151471+02:00 when node 21 is connecting&lt;br/&gt;
&quot;slave&quot; mountpoint to itself.&lt;/p&gt;

&lt;p&gt;21&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2021-04-12T14:16:16.658098+02:00[Europe/Bratislava]: Registering node manager: null
2021-04-12T14:16:16.672608+02:00[Europe/Bratislava]: registering..
2021-04-12T14:16:16.692363+02:00[Europe/Bratislava]: I am leader
2021-04-12T14:16:16.699679+02:00[Europe/Bratislava]: Unregistering node manager: org.opendaylight.netconf.topology.singleton.impl.NetconfNodeManager@58d4a02
2021-04-12T14:16:18.089251+02:00[Europe/Bratislava]: connecting device.. true
2021-04-12T14:16:18.106551+02:00[Europe/Bratislava]: Updating data: true from master: akka.tcp://opendaylight-cluster-data@192.168.56.21:2550
2021-04-12T14:16:34.811554+02:00[Europe/Bratislava]: I am follower
2021-04-12T14:16:34.826534+02:00[Europe/Bratislava]: Disconnecting device.. false
2021-04-12T14:16:34.841825+02:00[Europe/Bratislava]: Registering node manager: org.opendaylight.netconf.topology.singleton.impl.NetconfNodeManager@58d4a02
2021-04-12T14:16:34.856144+02:00[Europe/Bratislava]: modification...
2021-04-12T14:16:34.863838+02:00[Europe/Bratislava]: Connecting slave mount point to: akka.tcp://opendaylight-cluster-data@192.168.56.21:2550
2021-04-12T14:17:36.064744+02:00[Europe/Bratislava]: I am leader
2021-04-12T14:17:36.076016+02:00[Europe/Bratislava]: Unregistering node manager: org.opendaylight.netconf.topology.singleton.impl.NetconfNodeManager@9603f02
2021-04-12T14:17:36.263286+02:00[Europe/Bratislava]: connecting device.. true
2021-04-12T14:17:36.292559+02:00[Europe/Bratislava]: Updating data: true from master: akka.tcp://opendaylight-cluster-data@192.168.56.21:2550
2021-04-12T14:17:40.090901+02:00[Europe/Bratislava]: I am follower
2021-04-12T14:17:40.118272+02:00[Europe/Bratislava]: Disconnecting device.. false
2021-04-12T14:17:40.130646+02:00[Europe/Bratislava]: Registering node manager: org.opendaylight.netconf.topology.singleton.impl.NetconfNodeManager@9603f02
2021-04-12T14:17:40.144148+02:00[Europe/Bratislava]: modification...
2021-04-12T14:17:40.151471+02:00[Europe/Bratislava]: Connecting slave mount point to: akka.tcp://opendaylight-cluster-data@192.168.56.21:2550
Disconnected from the target VM, address: &apos;192.168.56.21:5005&apos;, transport: &apos;socket&apos;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;22&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2021-04-12T14:16:17.677425+02:00[Europe/Bratislava]: Registering node manager: null
2021-04-12T14:16:17.700051+02:00[Europe/Bratislava]: registering..
2021-04-12T14:16:17.703418+02:00[Europe/Bratislava]: modification...
2021-04-12T14:16:17.725131+02:00[Europe/Bratislava]: Disconnecting slave mountpoint
2021-04-12T14:16:18.140807+02:00[Europe/Bratislava]: modification...
2021-04-12T14:16:18.147951+02:00[Europe/Bratislava]: Connecting slave mount point to: akka.tcp://opendaylight-cluster-data@192.168.56.21:2550
2021-04-12T14:16:20.613060+02:00[Europe/Bratislava]: master: Actor[akka.tcp://opendaylight-cluster-data@192.168.56.21:2550/user/akka.tcp:opendaylight-cluster-data@192.168.56.21:2550_netconf-test-device#770934166]
2021-04-12T14:16:33.702581+02:00[Europe/Bratislava]: modification...
2021-04-12T14:16:33.712800+02:00[Europe/Bratislava]: Disconnecting slave mountpoint
2021-04-12T14:16:33.885460+02:00[Europe/Bratislava]: modification...
2021-04-12T14:16:33.891728+02:00[Europe/Bratislava]: Connecting slave mount point to: akka.tcp://opendaylight-cluster-data@192.168.56.23:2550
2021-04-12T14:17:31.334643+02:00[Europe/Bratislava]: master: Actor[akka.tcp://opendaylight-cluster-data@192.168.56.23:2550/user/akka.tcp:opendaylight-cluster-data@192.168.56.23:2550_netconf-test-device#-189182088]
2021-04-12T14:17:40.008555+02:00[Europe/Bratislava]: modification...
2021-04-12T14:17:40.018033+02:00[Europe/Bratislava]: Disconnecting slave mountpoint
2021-04-12T14:17:40.121272+02:00[Europe/Bratislava]: modification...
2021-04-12T14:17:40.127474+02:00[Europe/Bratislava]: Connecting slave mount point to: akka.tcp://opendaylight-cluster-data@192.168.56.21:2550
Disconnected from the target VM, address: &apos;192.168.56.22:5005&apos;, transport: &apos;socket&apos;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;23&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2021-04-12T14:16:17.676719+02:00[Europe/Bratislava]: Registering node manager: null
2021-04-12T14:16:17.693843+02:00[Europe/Bratislava]: registering..
2021-04-12T14:16:17.696230+02:00[Europe/Bratislava]: modification...
2021-04-12T14:16:17.714960+02:00[Europe/Bratislava]: Disconnecting slave mountpoint
2021-04-12T14:16:18.142020+02:00[Europe/Bratislava]: modification...
2021-04-12T14:16:18.149924+02:00[Europe/Bratislava]: Connecting slave mount point to: akka.tcp://opendaylight-cluster-data@192.168.56.21:2550
2021-04-12T14:16:33.668206+02:00[Europe/Bratislava]: I am leader
2021-04-12T14:16:33.675129+02:00[Europe/Bratislava]: Unregistering node manager: org.opendaylight.netconf.topology.singleton.impl.NetconfNodeManager@3254b5b4
2021-04-12T14:16:33.847092+02:00[Europe/Bratislava]: connecting device.. true
2021-04-12T14:16:33.861299+02:00[Europe/Bratislava]: Updating data: true from master: akka.tcp://opendaylight-cluster-data@192.168.56.23:2550
Disconnected from the target VM, address: &apos;192.168.56.23:5005&apos;, transport: &apos;socket&apos;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We have created workaround for this in &lt;a href=&quot;https://git.opendaylight.org/gerrit/c/netconf/+/95755&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.opendaylight.org/gerrit/c/netconf/+/95755&lt;/a&gt;&lt;br/&gt;
where we added a 15s delay for any node that is possibly an &quot;isolated leader after rejoin&quot;, preventing&lt;br/&gt;
it from writing data to the operational datastore if it fails to hold its leadership for at least a 15s time period.&lt;/p&gt;

&lt;p&gt;We conclude that the root of this problem is not in Netconf clustering but in mdsal clustering.&lt;/p&gt;</comment>
                            <comment id="69501" author="ivanhrasko" created="Tue, 3 Aug 2021 13:29:23 +0000"  >&lt;p&gt;We have to stop the work on this task because of the currently non-working ClusterSingletonService, for the following reasons:&lt;/p&gt;

&lt;p&gt;The base logic of the Netconf clustering is located in:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; class NetconfTopologyContext implements ClusterSingletonService, AutoCloseable
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The principle is based on:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;ClusterSingletonService#instantiateServiceInstance
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;ClusterSingletonService#closeServiceInstance
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;methods.&lt;/p&gt;

&lt;p&gt;The NetconfTopologyContext class expects that the #instantiateServiceInstance method is invoked when&lt;br/&gt;
 the node on which it is running becomes the leader.&lt;br/&gt;
 When the leader is isolated and a new leader is elected, it is expected that #instantiateServiceInstance is invoked on the new leader&apos;s node.&lt;br/&gt;
 Invoking of #instantiateServiceInstance is used to run process of establishing a &quot;master&quot; mountpoint to the device.&lt;/p&gt;

&lt;p&gt;The same is expected with #closeServiceInstance.&lt;br/&gt;
 When a node is no longer the leader, this method should be invoked.&lt;br/&gt;
 Invoking #closeServiceInstance is used to run the process of establishing &quot;slave&quot; mountpoints (through the &quot;master&quot; mountpoint) to the device.&lt;/p&gt;

&lt;p&gt;Since changes in clustering have been made for task: &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1982&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.opendaylight.org/browse/CONTROLLER-1982&lt;/a&gt;&lt;br/&gt;
 the #instantiateServiceInstance method is invoked on the current leader node only when the topology node is created. When the leader changes, the method is not invoked on the new leader.&lt;br/&gt;
 Thus the new leader does not establish a &quot;master&quot; mountpoint connection to the device and followers still hold (no longer working) &quot;slave&quot; mountpoints to the previous leader.&lt;/p&gt;

&lt;p&gt;Similarly, #closeServiceInstance is called only on deleting the topology node on the original leader.&lt;/p&gt;

&lt;p&gt;Note:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/rests/data/entity-owners:entity-owners?content=nonconfig
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;is no longer available; thus, to find out which node is the leader, we have switched to using:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/jolokia/read/org.opendaylight.controller:Category=Shards,name=\{member}-shard-topology-operational,type=DistributedOperationalDatastore
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="69510" author="rovarga" created="Tue, 10 Aug 2021 20:54:36 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1992&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.opendaylight.org/browse/CONTROLLER-1992&lt;/a&gt; tracks missing entity ownership state knobs. Note that the jolokia endpoint does not really have much correlation to EOS decisions.&lt;/p&gt;</comment>
                            <comment id="69511" author="rovarga" created="Tue, 10 Aug 2021 20:59:34 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=ivanhrasko&quot; class=&quot;user-hover&quot; rel=&quot;ivanhrasko&quot;&gt;ivanhrasko&lt;/a&gt;&#160; can recreate the lack of callback invocation in &lt;a href=&quot;https://github.com/opendaylight/controller/blob/master/opendaylight/md-sal/eos-dom-akka/src/test/java/org/opendaylight/controller/eos/akka/ThreeNodeBaseTest.java&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/opendaylight/controller/blob/master/opendaylight/md-sal/eos-dom-akka/src/test/java/org/opendaylight/controller/eos/akka/ThreeNodeBaseTest.java&lt;/a&gt; ?&lt;/p&gt;</comment>
                            <comment id="71591" author="rovarga" created="Sun, 6 Nov 2022 19:14:06 +0000"  >&lt;p&gt;The patches are marked as work-in-progress. Please update them and this issue once they are ready.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10002">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="28634">CONTROLLER-1784</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10003">
                    <name>Relates</name>
                                            <outwardlinks description="relates to">
                                        <issuelink>
            <issuekey id="34301">CONTROLLER-1992</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="relates to">
                                        <issuelink>
            <issuekey id="36948">NETCONF-1039</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="12727" name="logs-carbon-isolation-rejoin.zip" size="594459" author="ricjhill@gmail.com" created="Wed, 16 Aug 2017 13:17:52 +0000"/>
                            <attachment id="12726" name="postman-collection-and-results-isolation-and-rejoin.zip" size="94713" author="ricjhill@gmail.com" created="Wed, 16 Aug 2017 12:49:44 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                            <customfield id="customfield_11400" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10208" key="com.atlassian.jira.plugin.system.customfieldtypes:textfield">
                        <customfieldname>External issue ID</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>8999</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10201" key="com.atlassian.jira.plugin.system.customfieldtypes:url">
                        <customfieldname>External issue URL</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue><![CDATA[https://bugs.opendaylight.org/show_bug.cgi?id=8999]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10206" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Issue Type</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10300"><![CDATA[Bug]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10000" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>0|i01yn3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>