<!-- 
RSS generated by JIRA (8.20.10#820010-sha1:ace47f9899e9ee25d7157d59aa17ab06aee30d3d) at Wed Feb 07 20:24:20 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>OpenDaylight JIRA</title>
    <link>https://jira.opendaylight.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>8.20.10</version>
        <build-number>820010</build-number>
        <build-date>22-06-2022</build-date>
    </build-info>


<item>
            <title>[NETVIRT-1549] ping failures to external network PNF</title>
                <link>https://jira.opendaylight.org/browse/NETVIRT-1549</link>
                <project id="10144" key="NETVIRT">netvirt</project>
                    <description>&lt;p&gt;There are some sporadic failures in the external_network suite around PNF.&lt;/p&gt;

&lt;p&gt;as of now, I see two types:&lt;/p&gt;

&lt;p&gt;1) the initial ping to the PNF in our apex job fails, but it appears the flow rule ends up being&lt;br/&gt;
programmed eventually such that subsequent test cases will pass.&lt;/p&gt;

&lt;p&gt;2) the ping to the PNF fails &lt;b&gt;after&lt;/b&gt; floating IP assignment is done in our non-conntrack&lt;br/&gt;
(e.g. &quot;controller&quot;) job.&lt;/p&gt;

&lt;p&gt;some things to note:&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;the apex &lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/builder-copy-sandbox-logs/598/jamo-netvirt-csit-1node-0cmb-1ctl-2cmp-apex-queens-gate-snat-conntrack-neon/1/robot-plugin/log_full.html.gz#s1-s2-t1&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;failure example here &lt;/a&gt; fails the first ping from instance 1 to the PNF, and&lt;br/&gt;
the flow rule to hit the PNF is not there, but the 2nd ping from instance 2 passes and&lt;br/&gt;
the flow rule is then seen in both compute nodes. At that point, all further test cases&lt;br/&gt;
are passing. That seems to indicate some really slow learning/programming or&lt;br/&gt;
possibly the initial ping from instance 1 never even triggered the path to learning and&lt;br/&gt;
programming that proper flow.&lt;/li&gt;
&lt;/ul&gt;


&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;the apex job is conntrack based&lt;/li&gt;
&lt;/ul&gt;


&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;the &quot;controller&quot; job skips the first initial ping test case as it&apos;s not expected to work&lt;br/&gt;
in the non-conntrack job anyway. But when the controller job fails you can see the&lt;br/&gt;
flow tables through the whole test suite and the PNF flow is never programmed.&lt;br/&gt;
&lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/netvirt-csit-1node-0cmb-1ctl-2cmp-openstack-queens-upstream-stateful-neon/830/robot-plugin/log_full.html.gz#s1-s3&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;example job here &lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;


&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;we don&apos;t ever seem to hit this type of sporadic failure in the conntrack job that&lt;br/&gt;
is based on devstack, only in the apex job. One major difference is in how fast&lt;br/&gt;
the tests start from when ODL is brought up. It&apos;s a matter of a few minutes at&lt;br/&gt;
most, as opposed to the devstack based job where it can be 30-40m with ODL&lt;br/&gt;
running before devstack is done stacking and tests begin.&lt;/li&gt;
&lt;/ul&gt;

</description>
                <environment></environment>
        <key id="31260">NETVIRT-1549</key>
            <summary>ping failures to external network PNF</summary>
                <type id="10104" iconUrl="https://jira.opendaylight.org/secure/viewavatar?size=xsmall&amp;avatarId=10303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.opendaylight.org/images/icons/priorities/critical.svg">High</priority>
                        <status id="3" iconUrl="https://jira.opendaylight.org/images/icons/statuses/inprogress.png" description="This issue is being actively worked on at the moment by the assignee.">In Progress</status>
                    <statusCategory id="4" key="indeterminate" colorName="yellow"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="xcheara">Chetan Arakere Gowdru</assignee>
                                    <reporter username="jluhrsen">Jamo Luhrsen</reporter>
                        <labels>
                            <label>apex:gate</label>
                            <label>csit</label>
                            <label>csit:failures</label>
                            <label>csit:sporadic</label>
                    </labels>
                <created>Mon, 7 Jan 2019 22:47:20 +0000</created>
                <updated>Fri, 31 Jan 2020 18:41:56 +0000</updated>
                                                                            <component>General</component>
                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                                                                <comments>
                            <comment id="66145" author="shague@redhat.com" created="Tue, 8 Jan 2019 14:18:49 +0000"  >&lt;p&gt;Adding email thread:&lt;br/&gt;
jira here:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/browse/NETVIRT-1549&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.opendaylight.org/browse/NETVIRT-1549&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;we can maybe debug further there, but I&apos;m starting to wonder if&lt;br/&gt;
we have two different bugs here.&lt;/p&gt;

&lt;p&gt;in the one apex example I gave the failure was in the first initial&lt;br/&gt;
pnf ping which is expected to pass. In that job, it fails, but the&lt;br/&gt;
flow eventually gets programmed and further test cases are ok.&lt;/p&gt;

&lt;p&gt;in the devstack job that we were looking at eventually it was&lt;br/&gt;
failing in the test case after the floating ip was assigned. in&lt;br/&gt;
that case, the flow never got programmed at all.&lt;/p&gt;

&lt;p&gt;also, Aswin noted and I verified that the devstack job for&lt;br/&gt;
conntrack does not see this problem. &lt;b&gt;BUT&lt;/b&gt; the apex job does&lt;br/&gt;
see it which is also conntrack, so I don&apos;t think we can say&lt;br/&gt;
that it&apos;s not there in conntrack.&lt;/p&gt;

&lt;p&gt;It happened on the first gate job I ran in the sandbox just now,&lt;br/&gt;
so hopefully we can figure it out and resolve it so I can move&lt;br/&gt;
on with my plan to make the voting gate job.&lt;/p&gt;

&lt;p&gt;JamO&lt;/p&gt;


&lt;p&gt;On 1/7/19 9:36 AM, Aswin Suryanarayanan wrote:&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt; On Mon, Jan 7, 2019 at 10:45 PM Sam Hague &amp;lt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;mailto:shague@redhat.com&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;shague@redhat.com&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.opendaylight.org/images/icons/mail_small.gif&quot; height=&quot;12&quot; width=&quot;13&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&#160;&amp;lt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;mailto:[shague@redhat.com&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;[shague@redhat.com&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.opendaylight.org/images/icons/mail_small.gif&quot; height=&quot;12&quot; width=&quot;13&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;|mailto:shague@redhat.com]&amp;gt;&amp;gt; wrote:&lt;br/&gt;
&lt;span class=&quot;image-wrap&quot; style=&quot;&quot;&gt;&lt;img src=&quot;https://ssl.gstatic.com/ui/v1/icons/mail/images/cleardot.gif&quot; style=&quot;border: 0px solid black&quot; /&gt;&lt;/span&gt;&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160;On Mon, Jan 7, 2019 at 11:41 AM Aswin Suryanarayanan &amp;lt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;mailto:asuryana@redhat.com&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;asuryana@redhat.com&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.opendaylight.org/images/icons/mail_small.gif&quot; height=&quot;12&quot; width=&quot;13&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&#160;&amp;lt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;mailto:[asuryana@redhat.com&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;[asuryana@redhat.com&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.opendaylight.org/images/icons/mail_small.gif&quot; height=&quot;12&quot; width=&quot;13&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;|mailto:asuryana@redhat.com]&amp;gt;&amp;gt; wrote:&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160;When I look at &lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt; I see that the ARP to the PNF(10.10.10.253) is never answered. Can we suspect anything wrong&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160;with the PNF? Is it another virtual node?&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160;Would increasing the number of pings help here? Or the ttl piece? One thing I did add in the ping patch was I added&lt;br/&gt;
&amp;gt;&#160; &#160; &#160;-W1 to the ping args - this sets the wait timeout to 1s. The timeout applies when no response is received, so it&lt;br/&gt;
&amp;gt;&#160; &#160; &#160;made sense to lower this to minimum because we retry via the keyword rather than let ping retry - gives us better&lt;br/&gt;
&amp;gt;&#160; &#160; &#160;control.&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160;The -t1 was always there and that means 1 ttl. Maybe that should be bumped and could help? Not sure how to verify&lt;br/&gt;
&amp;gt;&#160; &#160; &#160;this, but when you say the arp is never received I wonder if there is some other hop sometimes. How did you verify&lt;br/&gt;
&amp;gt;&#160; &#160; &#160;the ARP is not answered - via debug logs, mdsal, flows, punt to controller or tcpdump?&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt; Don&apos;t think it is a timing issue. Even though the ping test failed, if the PNF sends a response ODL will learn it even after&#160;&lt;br/&gt;
&amp;gt; the test case. I was expecting this log to be present&#160; for 10.10.10.253, which is not there.&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt; AbstractIpLearnNotificationHandler | 368 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | Received ARP/NA for sender MAC E6:3F:53:B4:D9:CF and sender IP 10.10.10.253 via interface 48400575920930:br-physnet1-pa:trunk&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt; The below confirms ARP was sent.&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt; ArpUtils&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;| 367 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | SubnetRoutePacketInHandler: sendArpRequest dpnId 70852734155410, actions [], groupId 210022, senderIPAddress 10.10.10.5, targetIPAddress 10.10.10.253&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt; It could be an issue with the pipeline too. But since the ARP learning is successful for the dc-gw(10.10.10.250) I&#160;&lt;br/&gt;
&amp;gt; tend to think pipeline should be fine. Because both of them hit almost the same flows.&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160;ping -W1 -t1 -c1 10.10.10.253&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160;Surprisingly conntrack job&lt;span class=&quot;error&quot;&gt;&amp;#91;2&amp;#93;&lt;/span&gt; seems to be not hitting this for the time being. &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.opendaylight.org/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160;&lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt;&lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/netvirt-csit-1node-0cmb-1ctl-2cmp-openstack-queens-upstream-stateful-neon/830/odl_1/odl1_karaf.log.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/netvirt-csit-1node-0cmb-1ctl-2cmp-openstack-queens-upstream-stateful-neon/830/odl_1/odl1_karaf.log.gz&lt;/a&gt;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160;&lt;span class=&quot;error&quot;&gt;&amp;#91;2&amp;#93;&lt;/span&gt;&lt;a href=&quot;https://jenkins.opendaylight.org/releng/job/netvirt-csit-1node-0cmb-1ctl-2cmp-openstack-queens-gate-stateful-snat-conntrack-fluorine/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jenkins.opendaylight.org/releng/job/netvirt-csit-1node-0cmb-1ctl-2cmp-openstack-queens-gate-stateful-snat-conntrack-fluorine/&lt;/a&gt;&lt;br/&gt;
&amp;gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160;On Mon, Jan 7, 2019 at 9:48 PM Jamo Luhrsen &amp;lt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;mailto:jluhrsen@redhat.com&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;jluhrsen@redhat.com&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.opendaylight.org/images/icons/mail_small.gif&quot; height=&quot;12&quot; width=&quot;13&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&#160;&amp;lt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;mailto:[jluhrsen@redhat.com&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;[jluhrsen@redhat.com&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.opendaylight.org/images/icons/mail_small.gif&quot; height=&quot;12&quot; width=&quot;13&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;|mailto:jluhrsen@redhat.com]&amp;gt;&amp;gt; wrote:&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160;yep, I noticed this too before the break. this is after Sam&apos;s fix to make our&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160;ping more redundant to get around the issue you saw before with a slight delay&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160;for flows to get programmed.&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160;JamO&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160;On 1/7/19 4:12 AM, Sam Hague wrote:&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &amp;gt; Aswin,&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &amp;gt;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &amp;gt; looks like the PNF is randomly failing again. &lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt; is the report showing the below test failing in the job&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160;numbers&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &amp;gt; listed. Any ideas?&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &amp;gt;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &amp;gt; Ping External Network PNF from Vm Instance 1 After Floating IP Ass... &lt;span class=&quot;error&quot;&gt;&amp;#91;733, 743, 802, 815, 819, 820, 828&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &amp;gt;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &amp;gt; Thanks, Sam&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &amp;gt;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &amp;gt;&lt;br/&gt;
&amp;gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160;&lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/netvirt-job-reports/87/netvirt-csit-1node-0cmb-1ctl-2cmp-openstack-queens-upstream-stateful-neon.console.txt.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/netvirt-job-reports/87/netvirt-csit-1node-0cmb-1ctl-2cmp-openstack-queens-upstream-stateful-neon.console.txt.gz&lt;/a&gt;&lt;br/&gt;
&amp;gt;&#160;&lt;br/&gt;
&#160;&lt;br/&gt;
&#160;&lt;br/&gt;
&#160;&lt;br/&gt;
&#160;&lt;br/&gt;
&#160;&lt;/p&gt;</comment>
                            <comment id="66146" author="shague@redhat.com" created="Tue, 8 Jan 2019 14:20:17 +0000"  >&lt;p&gt;&#160;&lt;/p&gt;


&lt;p&gt;On Mon, Jan 7, 2019 at 10:35 PM Jamo Luhrsen &amp;lt;jluhrsen@redhat.com&amp;gt; wrote:&lt;br/&gt;
Aswin,&lt;/p&gt;

&lt;p&gt;I saw this is happening in the apex/gate job too:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jenkins.opendaylight.org/releng/job/netvirt-csit-1node-0cmb-1ctl-2cmp-apex-queens-gate-snat-conntrack-neon/159/robot/openstack/02_external_network/Initial%20Ping%20To%20External%20Network%20PNF%20from%20Vm%20Instance%201/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jenkins.opendaylight.org/releng/job/netvirt-csit-1node-0cmb-1ctl-2cmp-apex-queens-gate-snat-conntrack-neon/159/robot/openstack/02_external_network/Initial%20Ping%20To%20External%20Network%20PNF%20from%20Vm%20Instance%201/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The gate job issue was for the run I started. I have fixed it as part of this patch. This should remove some of the exceptions too.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt;https://git.opendaylight.org/gerrit/#/c/73180/8/vpnmanager/impl/src/main/java/org/opendaylight/netvirt/vpnmanager/iplearn/AbstractIpLearnNotificationHandler.java&lt;br/&gt;
in that apex job, the PNF is on the control node which is also the odl node.&lt;/p&gt;

&lt;p&gt;in the non-apex job, the PNF is on the control node, but odl is on its own&lt;br/&gt;
node.&lt;/p&gt;

&lt;p&gt;also, note that the apex job is running with conntrack.&lt;/p&gt;

&lt;p&gt;should we just get a JIRA filed and start asking for others to help&lt;br/&gt;
too? I think it might be one of our final bugs to figure out before&lt;br/&gt;
making apex jobs fully gating.&lt;br/&gt;
Did the apex job fail recently? Agree it could be something else too; we can file a Jira for this.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;JamO&lt;/p&gt;

&lt;p&gt;On 1/7/19 8:41 AM, Aswin Suryanarayanan wrote:&lt;br/&gt;
&amp;gt; When I look at &lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt; I see that the ARP to the PNF(10.10.10.253) is never answered. Can we suspect anything wrong with the &lt;br/&gt;
&amp;gt; PNF? Is it another virtual node?&lt;br/&gt;
&amp;gt; &lt;br/&gt;
&amp;gt; Surprisingly conntrack job&lt;span class=&quot;error&quot;&gt;&amp;#91;2&amp;#93;&lt;/span&gt; seems to be not hitting this for the time being. &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.opendaylight.org/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;br/&gt;
&amp;gt; &lt;br/&gt;
&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/netvirt-csit-1node-0cmb-1ctl-2cmp-openstack-queens-upstream-stateful-neon/830/odl_1/odl1_karaf.log.gz&lt;br/&gt;
&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;2&amp;#93;&lt;/span&gt;https://jenkins.opendaylight.org/releng/job/netvirt-csit-1node-0cmb-1ctl-2cmp-openstack-queens-gate-stateful-snat-conntrack-fluorine/&lt;br/&gt;
&amp;gt; &lt;br/&gt;
&amp;gt; On Mon, Jan 7, 2019 at 9:48 PM Jamo Luhrsen &amp;lt;jluhrsen@redhat.com &amp;lt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;mailto:jluhrsen@redhat.com&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;jluhrsen@redhat.com&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.opendaylight.org/images/icons/mail_small.gif&quot; height=&quot;12&quot; width=&quot;13&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&amp;gt;&amp;gt; wrote:&lt;br/&gt;
&amp;gt; &lt;br/&gt;
&amp;gt; yep, I noticed this too before the break. this is after Sam&apos;s fix to make our&lt;br/&gt;
&amp;gt; ping more redundant to get around the issue you saw before with a slight delay&lt;br/&gt;
&amp;gt; for flows to get programmed.&lt;br/&gt;
&amp;gt; &lt;br/&gt;
&amp;gt; JamO&lt;br/&gt;
&amp;gt; &lt;br/&gt;
&amp;gt; On 1/7/19 4:12 AM, Sam Hague wrote:&lt;br/&gt;
&amp;gt; &amp;gt; Aswin,&lt;br/&gt;
&amp;gt; &amp;gt;&lt;br/&gt;
&amp;gt; &amp;gt; looks like the PNF is randomly failing again. &lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt; is the report showing the below test failing in the job numbers&lt;br/&gt;
&amp;gt; &amp;gt; listed. Any ideas?&lt;br/&gt;
&amp;gt; &amp;gt;&lt;br/&gt;
&amp;gt; &amp;gt; Ping External Network PNF from Vm Instance 1 After Floating IP Ass... &lt;span class=&quot;error&quot;&gt;&amp;#91;733, 743, 802, 815, 819, 820, 828&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;gt; &amp;gt;&lt;br/&gt;
&amp;gt; &amp;gt; Thanks, Sam&lt;br/&gt;
&amp;gt; &amp;gt;&lt;br/&gt;
&amp;gt; &amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;1&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;gt; &amp;gt;&lt;br/&gt;
&amp;gt; &lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/netvirt-job-reports/87/netvirt-csit-1node-0cmb-1ctl-2cmp-openstack-queens-upstream-stateful-neon.console.txt.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/netvirt-job-reports/87/netvirt-csit-1node-0cmb-1ctl-2cmp-openstack-queens-upstream-stateful-neon.console.txt.gz&lt;/a&gt;&lt;br/&gt;
&amp;gt;&lt;/p&gt;</comment>
                            <comment id="66151" author="jluhrsen" created="Wed, 9 Jan 2019 16:38:49 +0000"  >&lt;p&gt;btw, &lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=aswins&quot; class=&quot;user-hover&quot; rel=&quot;aswins&quot;&gt;aswins&lt;/a&gt; the apex job ODL karaf log is &lt;a href=&quot;https://logs.opendaylight.org/releng/vex-yul-odl-jenkins-1/builder-copy-sandbox-logs/598/jamo-netvirt-csit-1node-0cmb-1ctl-2cmp-apex-queens-gate-snat-conntrack-neon/1/controller_sos/sosreport-controllerreport-20190107194050/opt/opendaylight/data/log/karaf.log.gz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;here&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="66154" author="aswins" created="Wed, 9 Jan 2019 19:01:55 +0000"  >&lt;p&gt;I could see that the first ARP sent is not replied to, and we are receiving it back via the external interface and ignoring it since we are the sender. So the flows should be fine.&lt;/p&gt;

&lt;div class=&quot;panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;Not working&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;panelContent&quot;&gt;
&lt;p&gt;2019-01-07T19:31:49,627 | INFO  | pool-20-thread-1 | SubnetRoutePacketInHandler       | 363 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | SUBNETROUTE: onPacketReceived: Processing IP Packet received with Source IP 10.10.10.26 and Target IP 10.10.10.253 and vpnId 100001&lt;br/&gt;
2019-01-07T19:31:49,631 | INFO  | pool-20-thread-1 | ArpUtils                         | 363 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | SubnetRoutePacketInHandler: sendArpRequest dpnId 238484387869016, actions [], groupId 210012, senderIPAddress 10.10.10.26, targetIPAddress 10.10.10.253&lt;br/&gt;
2019-01-07T19:31:49,635 | ERROR | pool-20-thread-1 | SubnetRoutePacketInHandler       | 363 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | Vpn interface tun9c586aac495 doesn&apos;t exist.&lt;br/&gt;
2019-01-07T19:31:49,640 | INFO  | pool-20-thread-1 | ArpNotificationHandler           | 363 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | ArpNotification Non-Gratuitous Request Received from interface 75753836899984:br-datacentre-:trunk and IP 10.10.10.26 having MAC FA:16:3E:25:14:FE target destination 10.10.10.253, ignoring..&lt;br/&gt;
2019-01-07T19:31:49,640 | INFO  | pool-20-thread-1 | ArpNotificationHandler           | 363 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | ArpNotification Non-Gratuitous Request Received from interface 260122599241271:br-datacentre-:trunk and IP 10.10.10.26 having MAC FA:16:3E:25:14:FE target destination 10.10.10.253, ignoring..&lt;/p&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;When the same ARP is sent again it works, though the sender and other parameters remain the same.&lt;/p&gt;

&lt;div class=&quot;panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;Working&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;panelContent&quot;&gt;
&lt;p&gt;2019-01-07T19:32:17,325 | INFO  | pool-20-thread-1 | SubnetRoutePacketInHandler       | 363 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | SUBNETROUTE: onPacketReceived: Processing IP Packet received with Source IP 10.10.10.26 and Target IP 10.10.10.253 and vpnId 100001&lt;br/&gt;
2019-01-07T19:32:17,326 | INFO  | pool-20-thread-1 | ArpUtils                         | 363 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | SubnetRoutePacketInHandler: sendArpRequest dpnId 238484387869016, actions [], groupId 210012, senderIPAddress 10.10.10.26, targetIPAddress 10.10.10.253&lt;br/&gt;
2019-01-07T19:32:17,327 | ERROR | pool-20-thread-1 | SubnetRoutePacketInHandler       | 363 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | Vpn interface tun39aee49f130 doesn&apos;t exist.&lt;br/&gt;
2019-01-07T19:32:17,332 | INFO  | pool-20-thread-1 | ArpNotificationHandler           | 363 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | ArpNotification Response Received from interface 238484387869016:br-datacentre-:trunk and IP 10.10.10.253 having MAC 1A:F5:1B:58:E8:89, learning MAC&lt;br/&gt;
2019-01-07T19:32:17,333 | INFO  | pool-20-thread-1 | AbstractIpLearnNotificationHandler | 363 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | Received ARP/NA for sender MAC 1A:F5:1B:58:E8:89 and sender IP 10.10.10.253 via interface 238484387869016:br-datacentre-:trun&lt;/p&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;Though we have many pings, only one reached the controller; all the rest are dropped. This is due to the rule below, which creates a drop rule with a 10s timeout along with sending the packet to the controller. This is to make the controller less loaded and give it some time to learn the MAC via ARP. Once the ARP is resolved, a new flow will be installed in table 21 and the ping does not reach table 22. So all the ping retries are over before the 10s timeout expires, and the controller tries only one ARP for the test case. Can we have a bigger interval between the pings so that the controller gets another chance to ARP?&lt;/p&gt;


&lt;div class=&quot;panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;Flow&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;panelContent&quot;&gt;
&lt;p&gt;table=22, n_packets=1, n_bytes=98, priority=0,ip actions=CONTROLLER:65535,learn(table=22,hard_timeout=10,priority=10,cookie=0x8000010,eth_type=0x800,NXM_OF_IP_DST[],OXM_OF_METADATA&lt;span class=&quot;error&quot;&gt;&amp;#91;1..23&amp;#93;&lt;/span&gt;)&lt;/p&gt;
&lt;/div&gt;&lt;/div&gt;

</comment>
                            <comment id="66155" author="shague@redhat.com" created="Thu, 10 Jan 2019 01:45:42 +0000"  >&lt;p&gt;Yes, we can easily bump the retries, from 8 to 16 for example. The current call has retries set to 8 and each try is 1s. That was a recent change so that makes sense this started around same time.&lt;/p&gt;

&lt;p&gt;Question is why does the different path happen? And why randomly? Wonder if we need to configure the deployment of the pnf differently.&lt;/p&gt;</comment>
                            <comment id="66159" author="aswins" created="Thu, 10 Jan 2019 12:48:26 +0000"  >&lt;p&gt;Yes, we have yet to figure out why this is random. But since we are seeing the ARP we send, we are sure ODL is sending it out. So the only thing to check is if ODL ignored the response for some reason or the PNF didn&apos;t send one. Do we get some packet count for the PNF?&lt;/p&gt;

&lt;div class=&quot;panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;ARP with router IP and PNF as destination&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;panelContent&quot;&gt;
&lt;p&gt;2019-01-07T19:31:49,640 | INFO | pool-20-thread-1 | ArpNotificationHandler | 363 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | ArpNotification Non-Gratuitous Request Received from interface 260122599241271:br-datacentre-:trunk and IP 10.10.10.26 having MAC FA:16:3E:25:14:FE target destination 10.10.10.253, ignoring..&lt;/p&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="66162" author="jluhrsen" created="Thu, 10 Jan 2019 23:56:30 +0000"  >&lt;p&gt;This is not reproducing any more. In more than 70 tries now it did not come back. This is regarding the&lt;br/&gt;
apex job failure. I&apos;ll let my looping sandbox job keep running, but at this point I don&apos;t know how much&lt;br/&gt;
time we want to sink in to it any more.&lt;/p&gt;

&lt;p&gt;We have the other flavor of this bug (in the devstack, non-conntrack job) that we may want to worry&lt;br/&gt;
about at some point, but even that hasn&apos;t happened in over a week.&lt;/p&gt;</comment>
                            <comment id="66169" author="aswins" created="Fri, 11 Jan 2019 15:01:04 +0000"  >&lt;p&gt;The devstack flavor has the same root cause. In the non-conntrack job there is only one test case for the PNF; all the rest are skipped. So we never see the route learned, as there is no second test case.&lt;/p&gt;

&lt;div class=&quot;panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;devstack log&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;panelContent&quot;&gt;
&lt;p&gt;2019-01-07T11:06:43,240 | INFO  | pool-14-thread-1 | SubnetRoutePacketInHandler       | 367 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | SUBNETROUTE: onPacketReceived: Processing IP Packet received with Source IP 10.10.10.13 and Target IP 10.10.10.253 and vpnId 100024&lt;br/&gt;
2019-01-07T11:06:43,249 | INFO  | pool-14-thread-1 | ArpUtils                         | 367 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | SubnetRoutePacketInHandler: sendArpRequest dpnId 70852734155410, actions [], groupId 210022, senderIPAddress 10.10.10.5, targetIPAddress 10.10.10.253&lt;br/&gt;
2019-01-07T11:06:43,264 | INFO  | pool-14-thread-1 | ArpNotificationHandler           | 367 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | ArpNotification Non-Gratuitous Request Received from interface 273634256852812:br-physnet1-pa:trunk and IP 10.10.10.5 having MAC FA:16:3E:60:70:D0 target destination 10.10.10.253, ignoring..&lt;br/&gt;
2019-01-07T11:06:43,266 | INFO  | pool-14-thread-1 | ArpNotificationHandler           | 367 - org.opendaylight.netvirt.vpnmanager-impl - 0.8.0.SNAPSHOT | ArpNotification Non-Gratuitous Request Received from interface 206713479921264:br-physnet1-pa:trunk and IP 10.10.10.5 having MAC FA:16:3E:60:70:D0 target destination 10.10.10.253, ignoring..&lt;/p&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="67712" author="xcheara" created="Thu, 30 Jan 2020 07:24:51 +0000"  >&lt;p&gt;@Jamo, &lt;/p&gt;

&lt;p&gt;I could see the External Network PNF cases are stable in magnesium.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jenkins.opendaylight.org/releng/view/netvirt-csit/job/netvirt-csit-1node-1cmb-0ctl-0cmp-openstack-rocky-upstream-stateful-snat-conntrack-magnesium/69/robot/openstack/03_external_network/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jenkins.opendaylight.org/releng/view/netvirt-csit/job/netvirt-csit-1node-1cmb-0ctl-0cmp-openstack-rocky-upstream-stateful-snat-conntrack-magnesium/69/robot/openstack/03_external_network/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Do we still see this as an open issue ??&lt;/p&gt;</comment>
                            <comment id="67802" author="jluhrsen" created="Fri, 31 Jan 2020 18:41:56 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.opendaylight.org/secure/ViewProfile.jspa?name=xcheara&quot; class=&quot;user-hover&quot; rel=&quot;xcheara&quot;&gt;xcheara&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;it&apos;s been a really long time since I paid attention to netvirt failures, but I found this &lt;a href=&quot;https://jenkins.opendaylight.org/releng/view/netvirt-csit/job/netvirt-csit-1node-0cmb-1ctl-2cmp-openstack-rocky-upstream-stateful-magnesium/146//robot/openstack/03_external_network/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;job&lt;/a&gt; which seems to still&lt;br/&gt;
have it.&lt;/p&gt;

&lt;p&gt;a couple more:&lt;br/&gt;
&lt;a href=&quot;https://jenkins.opendaylight.org/releng/view/netvirt-csit/job/netvirt-csit-1node-0cmb-1ctl-2cmp-openstack-queens-upstream-stateful-magnesium/135/robot/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jenkins.opendaylight.org/releng/view/netvirt-csit/job/netvirt-csit-1node-0cmb-1ctl-2cmp-openstack-queens-upstream-stateful-magnesium/135/robot/&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://jenkins.opendaylight.org/releng/view/netvirt-csit/job/netvirt-csit-1node-1cmb-0ctl-0cmp-openstack-queens-upstream-stateful-magnesium/70/robot/openstack/03_external_network/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jenkins.opendaylight.org/releng/view/netvirt-csit/job/netvirt-csit-1node-1cmb-0ctl-0cmp-openstack-queens-upstream-stateful-magnesium/70/robot/openstack/03_external_network/&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                            <customfield id="customfield_11400" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                <customfield id="customfield_10000" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>0|i03lvj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>