<!-- 
RSS generated by JIRA (8.20.10#820010-sha1:ace47f9899e9ee25d7157d59aa17ab06aee30d3d) at Wed Feb 07 19:56:36 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>OpenDaylight JIRA</title>
    <link>https://jira.opendaylight.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>8.20.10</version>
        <build-number>820010</build-number>
        <build-date>22-06-2022</build-date>
    </build-info>


<item>
            <title>[CONTROLLER-1848] Sporadic CSIT failure in RPC tests (501 when hitting /restconf/operations/basic-rpc-test:basic-global)</title>
                <link>https://jira.opendaylight.org/browse/CONTROLLER-1848</link>
                <project id="10113" key="CONTROLLER">controller</project>
                    <description>&lt;p&gt;The suite seems to kill and restart the &quot;brt&quot; (not sure what brt means yet) owner and at that point there is&lt;br/&gt;
some trouble (501 response) when trying to hit the &quot;/restconf/operations/basic-rpc-test:basic-global&quot;&lt;br/&gt;
endpoint.&lt;/p&gt;

&lt;p&gt;the response text is:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 	{&lt;span class=&quot;code-quote&quot;&gt;&quot;errors&quot;&lt;/span&gt;:{&lt;span class=&quot;code-quote&quot;&gt;&quot;error&quot;&lt;/span&gt;:[{&lt;span class=&quot;code-quote&quot;&gt;&quot;error-type&quot;&lt;/span&gt;:&lt;span class=&quot;code-quote&quot;&gt;&quot;application&quot;&lt;/span&gt;,&lt;span class=&quot;code-quote&quot;&gt;&quot;error-tag&quot;&lt;/span&gt;:&lt;span class=&quot;code-quote&quot;&gt;&quot;operation-not-supported&quot;&lt;/span&gt;,&lt;span class=&quot;code-quote&quot;&gt;&quot;error-message&quot;&lt;/span&gt;:&lt;span class=&quot;code-quote&quot;&gt;&quot;No implementation of RPC AbsoluteSchemaPath{path=[(urn:opendaylight:controller:basic-rpc-test?revision=2016-01-20)basic-global]} available&quot;&lt;/span&gt;}]}}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="30270">CONTROLLER-1848</key>
            <summary>Sporadic CSIT failure in RPC tests (501 when hitting /restconf/operations/basic-rpc-test:basic-global)</summary>
                <type id="10104" iconUrl="https://jira.opendaylight.org/secure/viewavatar?size=xsmall&amp;avatarId=10303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.opendaylight.org/images/icons/priorities/major.svg">Medium</priority>
                        <status id="5" iconUrl="https://jira.opendaylight.org/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="green"/>
                                    <resolution id="10002">Duplicate</resolution>
                                        <assignee username="tpantelis">Tom Pantelis</assignee>
                                    <reporter username="vpickard">Victor Pickard</reporter>
                        <labels>
                            <label>csit:3node</label>
                    </labels>
                <created>Tue, 3 Jul 2018 14:36:37 +0000</created>
                <updated>Wed, 1 Aug 2018 12:22:43 +0000</updated>
                            <resolved>Mon, 30 Jul 2018 17:50:58 +0000</resolved>
                                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                                                                <comments>
                            <comment id="63868" author="tpantelis" created="Tue, 3 Jul 2018 14:43:48 +0000"  >&lt;p&gt;That message is normal from akka - it&apos;s reporting a node is unreachable which I assume is part of the test.&#160;&lt;/p&gt;</comment>
                            <comment id="63931" author="jluhrsen" created="Thu, 5 Jul 2018 19:14:23 +0000"  >&lt;p&gt;as I commented in the kernel call, this jira was opened because of a failing test. I think &lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=vpickard&quot; class=&quot;user-hover&quot; rel=&quot;vpickard&quot;&gt;vpickard&lt;/a&gt; was just trying to find some&lt;br/&gt;
more specific reason for the failure (to be helpful) and not necessarily calling out that error message.&lt;/p&gt;

&lt;p&gt;maybe it will be better to use more generic jira summaries like &quot;CSIT test case XYZ intermittently failing&quot; with some comments&lt;br/&gt;
showing certain suspect logs like &quot;Leader can not perform its duties&quot; so we can keep better track of our issues? just an idea,&lt;br/&gt;
but I&apos;m not sure if this one getting closed resulted in a new one being open, because I doubt the intermittent test failure was&lt;br/&gt;
addressed yet.&lt;/p&gt;</comment>
                            <comment id="63932" author="tpantelis" created="Thu, 5 Jul 2018 19:23:14 +0000"  >&lt;p&gt;Sure. You can reopen this and change summary etc or open a new one - doesn&apos;t really matter. I thought this was created b/c you saw that ominous message in the log and thought it was an issue.&lt;/p&gt;</comment>
                            <comment id="63933" author="jluhrsen" created="Thu, 5 Jul 2018 19:30:16 +0000"  >&lt;p&gt;here is the original bug description for posterity&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Controller clustering CSIT is failing intermittently, I see this in the karaf logs 7 times for this particular run &lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt;&lt;/p&gt;



&lt;p&gt;2018-07-03T06:49:24,867 | INFO | opendaylight-cluster-data-akka.actor.default-dispatcher-33 | Cluster(akka://opendaylight-cluster-data) | 51 - com.typesafe.akka.slf4j - 2.5.11 | Cluster Node &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.209:2550&amp;#93;&lt;/span&gt; - Leader can currently not perform its duties, reachability status: [akka.tcp://opendaylight-cluster-data@10.30.170.209:2550 -&amp;gt; akka.tcp://opendaylight-cluster-data@10.30.170.221:2550: Unreachable &lt;span class=&quot;error&quot;&gt;&amp;#91;Unreachable&amp;#93;&lt;/span&gt; (1), akka.tcp://opendaylight-cluster-data@10.30.170.220:2550 -&amp;gt; akka.tcp://opendaylight-cluster-data@10.30.170.221:2550: Unreachable &lt;span class=&quot;error&quot;&gt;&amp;#91;Unreachable&amp;#93;&lt;/span&gt; (1)], member status: &lt;span class=&quot;error&quot;&gt;&amp;#91;akka.tcp://opendaylight-cluster-data@10.30.170.209:2550 Up seen=true, akka.tcp://opendaylight-cluster-data@10.30.170.220:2550 Up seen=true, akka.tcp://opendaylight-cluster-data@10.30.170.221:2550 Up seen=false&amp;#93;&lt;/span&gt;&lt;/p&gt;



&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://jenkins.opendaylight.org/sandbox/job/controller-csit-3node-clustering-vpickard-all-oxygen/13/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jenkins.opendaylight.org/sandbox/job/controller-csit-3node-clustering-vpickard-all-oxygen/13/&lt;/a&gt;&lt;/p&gt;&lt;/blockquote&gt;</comment>
                            <comment id="63934" author="jluhrsen" created="Thu, 5 Jul 2018 19:34:19 +0000"  >&lt;p&gt;thanks &lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=tpantelis&quot; class=&quot;user-hover&quot; rel=&quot;tpantelis&quot;&gt;tpantelis&lt;/a&gt;, I just reopened this jira. the description had a link to the failing job, so I use that to&lt;br/&gt;
add a few details.&lt;/p&gt;

&lt;p&gt;we need to keep digging from here.&lt;/p&gt;</comment>
                            <comment id="63955" author="vpickard" created="Fri, 6 Jul 2018 20:35:01 +0000"  >&lt;p&gt;Archived logs since this is a sandbox job&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/builder-copy-sandbox-logs/163/controller-csit-3node-clustering-vpickard-all-oxygen/13/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/builder-copy-sandbox-logs/163/controller-csit-3node-clustering-vpickard-all-oxygen/13/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="64042" author="jhershbe" created="Sun, 15 Jul 2018 10:35:22 +0000"  >&lt;p&gt;BRT stands for &quot;Basic RPC Test&quot; &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.opendaylight.org/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="64047" author="tpantelis" created="Mon, 16 Jul 2018 03:03:31 +0000"  >&lt;p&gt;We need to understand what the test does and need the log files in order to do post-mortem analysis.&lt;/p&gt;</comment>
                            <comment id="64053" author="vpickard" created="Mon, 16 Jul 2018 17:08:17 +0000"  >&lt;p&gt;The archived logs are here:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/builder-copy-sandbox-logs/163/controller-csit-3node-clustering-vpickard-all-oxygen/13/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/builder-copy-sandbox-logs/163/controller-csit-3node-clustering-vpickard-all-oxygen/13/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="64055" author="tpantelis" created="Mon, 16 Jul 2018 18:07:30 +0000"  >&lt;p&gt;So the &quot;old owner&quot; was odl3. There are no errors in the odl3 karaf log after restart around 2018-07-03T06:50:20. In fact there&apos;s no log messages at all after 6:52:28 - the robot log.html indicates the basic-global RPC was sent around 6:54. We need to enable debug for org.opendaylight.controller.remote.rpc and try to reproduce.&lt;/p&gt;

&lt;p&gt;If possible, in the Restart_Killed_Member step, it would be helpful to save off the karaf.log and delete it so the node starts with a clean log. &lt;/p&gt;</comment>
                            <comment id="64071" author="tpantelis" created="Tue, 17 Jul 2018 23:57:36 +0000"  >&lt;p&gt;Looking at the Run_Rpc keyword, it issues the RPC once.  When a node restarts, it takes a bit of time for it to learn about RPC registrations from other nodes via the gossip functionality. So I suspect the test issued the RPC before odl3 knew about the remote RPC implementation, hence the 501 &quot;No implementation&quot; error. So the test should retry via Wait Until Keyword Succeeds. &lt;/p&gt;</comment>
                            <comment id="64073" author="jluhrsen" created="Wed, 18 Jul 2018 04:47:54 +0000"  >&lt;p&gt;what&apos;s a reasonable time for this though. looking at the log &lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=vpickard&quot; class=&quot;user-hover&quot; rel=&quot;vpickard&quot;&gt;vpickard&lt;/a&gt; posted in the comment above, the&lt;br/&gt;
killed member is restarted, and it polls for ~2 minutes until syncstatus is true, then it&apos;s another 1.5m before&lt;br/&gt;
the RPC is issued. So, 3.5m from the time of restart, but probably more importantly, 1.5m since the cluster&lt;br/&gt;
sync was said to be ok.&lt;/p&gt;

&lt;p&gt;we can certainly add a WUKS to that step, but it seems like it&apos;s already happening quite a long time later&lt;br/&gt;
for a cluster member to not know about the RPC. right?&lt;/p&gt;</comment>
                            <comment id="64077" author="tpantelis" created="Wed, 18 Jul 2018 12:22:02 +0000"  >&lt;p&gt;I didn&apos;t notice 1.5m passed. That should be plenty of time. Does it clean the data dir before restarting? If so, then it&apos;s re-installing the world so that can take time.  It was just an idea...&lt;/p&gt;

&lt;p&gt;We need a failure run with org.opendaylight.controller.remote.rpc debug enabled. Maybe pull out that one suite and run over and over in the sandbox or whatever?&lt;/p&gt;</comment>
                            <comment id="64109" author="jhershbe" created="Thu, 19 Jul 2018 11:42:27 +0000"  >&lt;p&gt;The few times I&apos;ve seen this reproduced it seems that it is caused by 2/3 of the nodes being down or otherwise unreachable. Manual explicit tests of this have produced the same outcome - when 2/3 nodes are down (or in &quot;deadlock&quot;) rpc calls return 501.&lt;/p&gt;</comment>
                            <comment id="64110" author="tpantelis" created="Thu, 19 Jul 2018 12:06:36 +0000"  >&lt;p&gt;So the BasicRpcTestProvider implements the basic-global RPC. It is also a ClusterSingletonService - the RPC is registered when it gets the singleton lock and is spun up. Therefore with 2/3 nodes down and the prior lock owner was one of those 2 then consensus is lost and the remaining node isn&apos;t able to become the lock owner. Therefore there is no BasicRpcTestProvider instance running and no basic-global RPC implementation.&lt;/p&gt;</comment>
                            <comment id="64122" author="jhershbe" created="Thu, 19 Jul 2018 15:09:13 +0000"  >&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="64412" author="jluhrsen" created="Mon, 30 Jul 2018 17:50:58 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=jhershbe&quot; class=&quot;user-hover&quot; rel=&quot;jhershbe&quot;&gt;jhershbe&lt;/a&gt; looked back through the history of Oxygen jobs and found that the failures&lt;br/&gt;
seen are coming because of two existing bugs:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1849&quot; title=&quot;controller not coming up healthy after being killed and restarted (401 after 5m)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1849&quot;&gt;&lt;del&gt;CONTROLLER-1849&lt;/del&gt;&lt;/a&gt; and &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1742&quot; title=&quot;RetiredGenerationException in cluster sanity suite&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1742&quot;&gt;&lt;del&gt;CONTROLLER-1742&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;marking this as duplicate because of that.&lt;/p&gt;</comment>
                            <comment id="64454" author="jhershbe" created="Wed, 1 Aug 2018 06:41:34 +0000"  >&lt;p&gt;From what I can tell this works via EOS. Not a question, just recording this for posterity.&lt;/p&gt;</comment>
                            <comment id="64459" author="tpantelis" created="Wed, 1 Aug 2018 12:22:43 +0000"  >&lt;p&gt;yes - EOS via ClusterSingletonService&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10003">
                    <name>Relates</name>
                                            <outwardlinks description="relates to">
                                        <issuelink>
            <issuekey id="30150">NETVIRT-1315</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                            <customfield id="customfield_11400" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                <customfield id="customfield_10000" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>0|i03gef:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>