<!-- 
RSS generated by JIRA (8.20.10#820010-sha1:ace47f9899e9ee25d7157d59aa17ab06aee30d3d) at Wed Feb 07 19:56:36 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>OpenDaylight JIRA</title>
    <link>https://jira.opendaylight.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>8.20.10</version>
        <build-number>820010</build-number>
        <build-date>22-06-2022</build-date>
    </build-info>


<item>
            <title>[CONTROLLER-1851] Sporadic failures in controller-csit-3node-rest-clust-cars-perf</title>
                <link>https://jira.opendaylight.org/browse/CONTROLLER-1851</link>
                <project id="10113" key="CONTROLLER">controller</project>
                    <description>&lt;p&gt;Verify Purchases test case of Crud MDSAL Perf &lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt; has started failing sporadically after fix for &lt;a href=&quot;https://jira.opendaylight.org/browse/CONTROLLER-1838&quot; title=&quot;follower reports 401 (unauthorized) and 500 (Internal Error) when leader is isolated.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;CONTROLLER-1838&quot;&gt;&lt;del&gt;CONTROLLER-1838&lt;/del&gt;&lt;/a&gt; got merged &lt;span class=&quot;error&quot;&gt;&amp;#91;may or may not be related&amp;#93;&lt;/span&gt;. There is no node restart, just simple CRUD operations in a cluster setup and occasionally about 9550 of the 10000 purchases go through.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://jenkins.opendaylight.org/releng/view/controller/job/controller-csit-3node-rest-clust-cars-perf-only-fluorine/142/robot/controller-rest-clust-cars-perf.txt/010%20Crud%20Mdsal%20Perf/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jenkins.opendaylight.org/releng/view/controller/job/controller-csit-3node-rest-clust-cars-perf-only-fluorine/142/robot/controller-rest-clust-cars-perf.txt/010%20Crud%20Mdsal%20Perf/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;2&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://jenkins.opendaylight.org/releng/view/controller/job/controller-csit-3node-rest-clust-cars-perf-only-fluorine/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jenkins.opendaylight.org/releng/view/controller/job/controller-csit-3node-rest-clust-cars-perf-only-fluorine/&lt;/a&gt;&lt;/p&gt;</description>
                <environment></environment>
        <key id="30323">CONTROLLER-1851</key>
            <summary>Sporadic failures in controller-csit-3node-rest-clust-cars-perf</summary>
                <type id="10104" iconUrl="https://jira.opendaylight.org/secure/viewavatar?size=xsmall&amp;avatarId=10303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.opendaylight.org/images/icons/priorities/critical.svg">High</priority>
                        <status id="5" iconUrl="https://jira.opendaylight.org/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="green"/>
                                    <resolution id="10003">Cannot Reproduce</resolution>
                                        <assignee username="tpantelis">Tom Pantelis</assignee>
                                    <reporter username="thapar">Vishal Thapar</reporter>
                        <labels>
                            <label>csit:3node</label>
                    </labels>
                <created>Thu, 12 Jul 2018 06:27:01 +0000</created>
                <updated>Tue, 14 Aug 2018 16:32:39 +0000</updated>
                            <resolved>Tue, 14 Aug 2018 16:32:39 +0000</resolved>
                                    <version>Fluorine</version>
                                    <fixVersion>Fluorine</fixVersion>
                                    <component>clustering</component>
                        <due></due>
                            <votes>0</votes>
                                    <watches>3</watches>
                                                                                                                <comments>
                            <comment id="64043" author="tpantelis" created="Sun, 15 Jul 2018 12:23:39 +0000"  >&lt;p&gt;The first robot failure I see is in Purchase Cars -&amp;gt;&#160;Read Until Prompt -&amp;gt;&#160;&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
No match found for &apos;&amp;gt;&apos; in 6 seconds

Output:

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I&apos;m not clear on what this step is doing. &lt;br/&gt;
&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=thapar&quot; class=&quot;user-hover&quot; rel=&quot;thapar&quot;&gt;thapar&lt;/a&gt; Can you provide more information/context for this failure?&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="64044" author="thapar" created="Sun, 15 Jul 2018 12:33:42 +0000"  >&lt;p&gt;That is not an error. It is waiting for test script to finish and it detects it from the prompt, hence waiting for &apos;&amp;gt;&apos;. If it doesn&apos;t get it in 6 seconds it waits for it again. It is just the way most tests are written, wait for n seconds with 5 retries. If you notice very next step it goes through. That is why it is not marked a failure. Actual failure occurs in Verify Purchases.&lt;/p&gt;</comment>
                            <comment id="64045" author="tpantelis" created="Sun, 15 Jul 2018 14:18:42 +0000"  >&lt;p&gt;And that fails due to &quot;Keyword &apos;Purchase Is Completed&apos; failed after retrying for 5 minutes. The last error was: 9946.0 != 10000.0&quot;. That doesn&apos;t tell me much. If some car purchase RPCs are failing then it would be helpful to see the error reasons. Can you please point me to that? Or if the test isn&apos;t outputting such failures then we need it to.   Also it would be helpful to know how the script issues RPCs, eg does it round-robin to the nodes, or send all to 1 node and if so which one...&lt;/p&gt;

&lt;p&gt;So is it that the failure was triggered b/c the script didn&apos;t finish by the expected deadline time? &lt;/p&gt;</comment>
                            <comment id="64046" author="tpantelis" created="Mon, 16 Jul 2018 02:44:14 +0000"  >&lt;p&gt;A car purchase is done via the buyCar RPC (implemented by PurchaseCarProvider). This publishes a yang notification that is received by the PeopleCarListener which writes a car-person item to the data store. The PurchaseCarProvider logs the message &quot;Routed RPC buyCar : generating notification for buying car &lt;span class=&quot;error&quot;&gt;&amp;#91;{}&amp;#93;&lt;/span&gt;&quot; and PeopleCarListener logs &quot;Successfully added car-person entry: &lt;span class=&quot;error&quot;&gt;&amp;#91;{}&amp;#93;&lt;/span&gt;&quot; on successful transaction. So searching the odl1_karaf.log.gz (using grep &quot;&amp;lt;message&amp;gt;&quot; odl1_karaf.log.gz |wc -l) we see:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2461 &quot;Routed RPC buyCar&quot; messages
2461 &quot;Successfully added car-person entry&quot; messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In odl2_karaf.log.gz:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;5586 &quot;Routed RPC buyCar&quot; messages
5586 &quot;Successfully added car-person entry&quot; messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In  odl3_karaf.log.gz:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;11899 &quot;Routed RPC buyCar&quot; messages
11899 &quot;Successfully added car-person entry&quot; messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So in each the number of buyCar RPCs matches the number of car-person items successfully created in the DS. There were no failures. In fact there were no ERRORs logged at all except a few for &quot;Karaf ssh console user karaf | ShellUtil&quot;.&lt;/p&gt;

&lt;p&gt;The total number of car-person items created is 19946 - I assume 9946 were created during the first run and 10000 after the tell-based restart.  From the log output, the first run only received 9946 buyCar RPCs. So either there&apos;s an issue with the script where it didn&apos;t actually send 10K or the missing RPCs never reached the PurchaseCarProvider.&lt;/p&gt;

&lt;p&gt;I assume cluster_rest_script_purchase_cars.log.gz is the correct output file. According to that 10K requests completed and returned 200 (no failures):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2018-07-11 04:24:45,916 INFO: Add 10000 purchase(s) to 10.30.170.153,10.30.170.134,10.30.170.111:8181 (1 per request)
2018-07-11 04:25:22,715 INFO: Response code(s) got per number of requests: {200: 1990}
2018-07-11 04:25:22,717 INFO: Response code(s) got per number of requests: {200: 2051}
2018-07-11 04:25:22,718 INFO: Response code(s) got per number of requests: {200: 1413}
2018-07-11 04:25:22,721 INFO: Response code(s) got per number of requests: {200: 1574}
2018-07-11 04:25:22,723 INFO: Response code(s) got per number of requests: {200: 1414}
2018-07-11 04:25:22,724 INFO: Response code(s) got per number of requests: {200: 1558}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;However since there&apos;s only one such log file, I assume this is from the second run which succeeded with 10k created which means the one from the first run got overwritten. So we don&apos;t know the request/response data from the first run. We need to retain both.&lt;/p&gt;

&lt;p&gt;On a side note, I have a suggestion for the tests to make post-mortem analysis easier which I&apos;ve mentioned before. On node restart, save off the karaf log file to a different name, eg karaf.log.run1, and delete the karaf.log before restarting so it starts with a clean log. This will make it much easier to tell which log messages go with each run.&lt;/p&gt;

&lt;p&gt;In fact, I would suggest to separate the ask-based and tell-based into different tests instead of running them back to back in the same test. I think this would make tracking and analysis easier and avoids the issue of log files being overwritten and having to explicitly retain the ones from the prior run. Also we had decided we&apos;re only focusing on ask-based right now.&lt;/p&gt;</comment>
                            <comment id="64051" author="thapar" created="Mon, 16 Jul 2018 08:54:47 +0000"  >&lt;p&gt;Thanks a ton, Tom. Yes, it does look like the log files get overwritten. We should either retain the logs or run them as separate tests. In netvirt we normally log the test case/suite name into the karaf logs too, which makes it easier to map logs to individual test cases, and is better than a separate file for each run. I&apos;m still getting familiar with controller CSIT; I will make the changes once I figure out how/where to do them.&lt;/p&gt;</comment>
                            <comment id="64074" author="thapar" created="Wed, 18 Jul 2018 05:50:25 +0000"  >&lt;p&gt;Most recent run has failures in second run &lt;span class=&quot;error&quot;&gt;&amp;#91;3&amp;#93;&lt;/span&gt;, so we got the right logs for verify purchases &lt;span class=&quot;error&quot;&gt;&amp;#91;4&amp;#93;&lt;/span&gt;. There are total of 78 errors, all 501, which matches with result - 9922.0 != 10000.0&lt;/p&gt;

&lt;p&gt;2018-07-18 02:29:30,430 INFO: &amp;lt;PreparedRequest &lt;span class=&quot;error&quot;&gt;&amp;#91;POST&amp;#93;&lt;/span&gt;&amp;gt; &lt;a href=&quot;http://10.30.170.157:8181/restconf/operations/car-purchase:buy-car&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://10.30.170.157:8181/restconf/operations/car-purchase:buy-car&lt;/a&gt;&lt;br/&gt;
2018-07-18 02:29:30,430 INFO: Response: {&quot;errors&quot;:{&quot;error&quot;:[{&quot;error-type&quot;:&quot;application&quot;,&quot;error-tag&quot;:&quot;operation-not-supported&quot;,&quot;error-message&quot;:&quot;No implementation of RPC AbsoluteSchemaPath{path=[(urn:opendaylight:params:xml:ns:yang:controller:config:sal-clustering-it:car-purchase?revision=2014-08-18)buy-car]} available&quot;}]}}&lt;br/&gt;
2018-07-18 02:29:30,431 INFO: Headers {&apos;Content-Length&apos;: &apos;149&apos;, &apos;Content-Type&apos;: &apos;application/json&apos;, &apos;Authorization&apos;: &apos;Basic YWRtaW46YWRtaW4=&apos;}:&lt;br/&gt;
2018-07-18 02:29:30,432 INFO: &amp;lt;Response &lt;span class=&quot;error&quot;&gt;&amp;#91;501&amp;#93;&lt;/span&gt;&amp;gt; Not Implemented&lt;br/&gt;
2018-07-18 02:29:30,433 INFO: Body: {&quot;input&quot;: {&quot;car-purchase:person&quot;: &quot;/people:people/people:person&lt;span class=&quot;error&quot;&gt;&amp;#91;people:id=&amp;#39;9687&amp;#39;&amp;#93;&lt;/span&gt;&quot;, &quot;car-purchase:car-id&quot;: &quot;9687&quot;, &quot;car-purchase:person-id&quot;: &quot;9687&quot;}}&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;3&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/controller-csit-3node-rest-clust-cars-perf-only-fluorine/149/robot-plugin/log.html.gz#s1-s4-t6&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/controller-csit-3node-rest-clust-cars-perf-only-fluorine/149/robot-plugin/log.html.gz#s1-s4-t6&lt;/a&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;4&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/controller-csit-3node-rest-clust-cars-perf-only-fluorine/149/cluster_rest_script_purchase_cars.log.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/controller-csit-3node-rest-clust-cars-perf-only-fluorine/149/cluster_rest_script_purchase_cars.log.gz&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="64075" author="thapar" created="Wed, 18 Jul 2018 08:43:35 +0000"  >&lt;p&gt;Logs copied to &lt;span class=&quot;error&quot;&gt;&amp;#91;5&amp;#93;&lt;/span&gt; to avoid purge during the weekend.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;5&amp;#93;&lt;/span&gt; &lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/builder-copy-sandbox-logs/238&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/builder-copy-sandbox-logs/238&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="64083" author="tpantelis" created="Wed, 18 Jul 2018 13:38:05 +0000"  >&lt;p&gt;So the routed RPCs for buyCar are registered via callback of successful person commit (PeopleProvider).  Searching for the log message &quot;RPC addPerson : routed rpc registered&quot;, we see:&lt;/p&gt;

&lt;p&gt;odl1: 4947&lt;br/&gt;
odl2: 4306&lt;br/&gt;
odl3: 10747&lt;/p&gt;

&lt;p&gt;This adds up to 15053 which doesn&apos;t seem to make sense. There were 2 runs so I&apos;d expect 20k.&lt;/p&gt;

&lt;p&gt;On odl1, I see the RPC registered for person 9687. Unfortunately the script doesn&apos;t output the node IP on failure so we don&apos;t know which node it sent the buyCar RPC to for person 9687. We need it to print the entire URL, ie including host IP. My suspicion is that the RPC reg had not yet been distributed to the &quot;purchasing&quot; node. If so then the script would need to retry.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=thapar&quot; class=&quot;user-hover&quot; rel=&quot;thapar&quot;&gt;thapar&lt;/a&gt; So next steps:&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Please take out the tell-based restart so it only runs the suite once.&lt;/li&gt;
	&lt;li&gt;Enable org.opendaylight.controller.remote.rpc debug (that will produce a lot of output with 10K items)&lt;/li&gt;
	&lt;li&gt;Change the script to print the entire URL, ie including host, on failure&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="64084" author="thapar" created="Wed, 18 Jul 2018 14:11:34 +0000"  >&lt;p&gt;Isn&apos;t this line showing the URL to which request is made:&lt;br/&gt;
2018-07-18 02:29:30,651 INFO: &amp;lt;PreparedRequest &lt;span class=&quot;error&quot;&gt;&amp;#91;POST&amp;#93;&lt;/span&gt;&amp;gt; &lt;a href=&quot;http://10.30.170.157:8181/restconf/operations/car-purchase:buy-car&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://10.30.170.157:8181/restconf/operations/car-purchase:buy-car&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;I am having some issues pushing to sandbox, have raised helpdesk ticket and will do the changes once they&apos;re resolved.&lt;/p&gt;</comment>
                            <comment id="64086" author="tpantelis" created="Wed, 18 Jul 2018 14:36:47 +0000"  >&lt;p&gt;yeah - sorry - I don&apos;t know why I didn&apos;t see that.&lt;/p&gt;</comment>
                            <comment id="64087" author="thapar" created="Wed, 18 Jul 2018 15:02:12 +0000"  >&lt;p&gt;No problem. I&apos;ll push the change for tell only.&lt;/p&gt;</comment>
                            <comment id="64088" author="tpantelis" created="Wed, 18 Jul 2018 15:04:02 +0000"  >&lt;p&gt;ask-based only, ie take out the tell-based restart.&lt;/p&gt;</comment>
                            <comment id="64089" author="thapar" created="Wed, 18 Jul 2018 15:07:36 +0000"  >&lt;p&gt;Ah yes, sorry for poorly worded response.&lt;/p&gt;</comment>
                            <comment id="64171" author="thapar" created="Tue, 24 Jul 2018 09:48:57 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=tpantelis&quot; class=&quot;user-hover&quot; rel=&quot;tpantelis&quot;&gt;tpantelis&lt;/a&gt; Can you look at &lt;a href=&quot;https://logs.opendaylight.org/sandbox/vex-yul-odl-jenkins-2/thapar-controller-csit-3node-rest-clust-cars-perf-only-fluorine/1/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/sandbox/vex-yul-odl-jenkins-2/thapar-controller-csit-3node-rest-clust-cars-perf-only-fluorine/1/&lt;/a&gt; and see if you get all the logs? This change is to retain logs from both runs and sandbox was testing the patch for it.&lt;/p&gt;</comment>
                            <comment id="64175" author="tpantelis" created="Tue, 24 Jul 2018 13:08:30 +0000"  >&lt;p&gt;Sorry I&apos;m not clear on what you&apos;re asking... &quot;see if you get all the logs&quot; for what? &lt;/p&gt;</comment>
                            <comment id="64176" author="thapar" created="Tue, 24 Jul 2018 13:16:54 +0000"  >&lt;p&gt;I meant if it was still overwriting the logs, I was having some network issues and couldn&apos;t open it, but now I see there is still only one file for car purchase log.&lt;/p&gt;</comment>
                            <comment id="64469" author="thapar" created="Thu, 2 Aug 2018 07:32:21 +0000"  >&lt;p&gt;After separating out, not seeing the failures in ask based for over 30 runs now.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jenkins.opendaylight.org/sandbox/user/thapar/my-views/view/MyView/job/thapar-controller-csit-3node-rest-clust-cars-perf-only-fluorine/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jenkins.opendaylight.org/sandbox/user/thapar/my-views/view/MyView/job/thapar-controller-csit-3node-rest-clust-cars-perf-only-fluorine/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="64626" author="jluhrsen" created="Tue, 14 Aug 2018 16:30:12 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=thapar&quot; class=&quot;user-hover&quot; rel=&quot;thapar&quot;&gt;thapar&lt;/a&gt; please close this if the jobs are not hitting this failure any more&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                            <customfield id="customfield_11400" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                <customfield id="customfield_10000" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>0|i03gpz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>