Quantcast

Any news about "msk0 watchdog timeout" regression in 10-RELEASE?

classic Classic list List threaded Threaded
9 messages Options
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Any news about "msk0 watchdog timeout" regression in 10-RELEASE?

Vitaly Magerya
Hi, folks. I've just upgraded to 10.0-RELEASE, and my msk(4) card
stopped working; it would work for a few minutes, and then it will
start printing "msk0 watchdog timeout" messages with an interrupt
storm accompanying it. I think this problem was described earlier
this month in [1].

My question is: was there a workaround found, or should I just
downgrade back to 9.2?

[1] https://lists.freebsd.org/pipermail/freebsd-stable/2014-January/076676.html
_______________________________________________
[hidden email] mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-stable
To unsubscribe, send any mail to "[hidden email]"
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: Any news about "msk0 watchdog timeout" regression in 10-RELEASE?

Curtis Villamizar

In message <[hidden email]>
Vitaly Magerya writes:
 

> Hi, folks. I've just upgraded to 10.0-RELEASE, and my msk(4) card
> stopped working; it would work for a few minutes, and then it will
> start printing "msk0 watchdog timeout" messages with an interrupt
> storm accompanying it. I think this problem was described earlier
> this month in [1].
>  
> My question is: was there a workaround found, or should I just
> downgrade back to 9.2?
>  
> [1] https://lists.freebsd.org/pipermail/freebsd-stable/2014-January/076676.html


I have mine working but I haven't done a lot of reboots to see if it
is a "fix" or luck.

There is a lot of junk that you won't need in the code that is running
well for me.  But here it is, as-is warts and all.

I've been swamped lately and haven't had time to look at this further.

Curtis


Notes-

  1.  the change in watchdog code has no effect (not hit when working,
      does not fix things when not working).

  2.  Lots of printf thingies in there that you can delete if you
      like.  If things work you don't hit this code.

  3.  Changes to the interrupt handler also seem to do nothing (good or
      bad) if things are working.

  4.  Why this is working for me is at this point a mystery but
      whether it works for you gives us another data point.


Index: if_msk.c
===================================================================
--- if_msk.c (revision 260441)
+++ if_msk.c (working copy)
@@ -2161,6 +2161,10 @@
  count = imin(4096, roundup2(count, 1024));
  sc->msk_stat_count = count;
  stat_sz = count * sizeof(struct msk_stat_desc);
+#if 1
+ device_printf(sc->msk_dev,
+      "msk_status_dma_alloc: %d %lu\n", count, stat_sz);
+#endif
  error = bus_dma_tag_create(
     bus_get_dma_tag(sc->msk_dev), /* parent */
     MSK_STAT_ALIGN, 0, /* alignment, boundary */
@@ -2975,6 +2979,14 @@
  }
 }
 
+#if 1
+static uint32_t msk_last_status = 0;
+static uint16_t last_stat_put_idx = 0;
+static uint32_t last_msk_control = 0;
+static uint16_t last_good_stat_put_idx = 0;
+static uint32_t last_good_msk_control = 0;
+#endif
+
 static void
 msk_watchdog(struct msk_if_softc *sc_if)
 {
@@ -2995,7 +3007,70 @@
  return;
  }
 
- if_printf(ifp, "watchdog timeout\n");
+#if 1
+ if_printf(ifp,
+"watchdog timeout: 0x%08x\n  (0x%04x 0x%08x) (0x%04x 0x%08x) 0x%08x 0x%08x\n",
+  msk_last_status,
+  sc_if->msk_softc->msk_stat_cons, last_msk_control,
+  last_good_stat_put_idx, last_good_msk_control,
+  last_stat_put_idx, sc_if->msk_softc->msk_stat_count);
+ {
+  struct msk_softc *sc = sc_if->msk_softc;
+  uint16_t cons, count;
+  struct msk_stat_desc *sd;
+  uint32_t control;
+#if 0
+  char linebuf[8192];
+  char *pt = linebuf;
+  size_t bytes = 8192;
+  size_t used;
+
+  count = sc->msk_stat_count;
+  for (cons = 0; cons < count; ++cons) {
+    if ((cons > 0) && ((cons & 0xff) == 0)) {
+      if_printf(ifp, "%s\n", linebuf);
+      pt = linebuf;
+      bytes = sizeof(linebuf);
+    }
+    if ((cons & 7) == 0) {
+      snprintf(pt, bytes - 1, "\n%03x ", cons);
+      used = strlen(pt); pt += used; bytes -= used;
+    } else if ((cons & 3) == 0) {
+      snprintf(pt, bytes - 1, " ");
+      used = strlen(pt); pt += used; bytes -= used;
+    }
+    sd = &sc->msk_stat_ring[cons];
+    control = le32toh(sd->msk_control);
+    snprintf(pt, bytes - 1, " %08x", control);
+    used = strlen(pt); pt += used; bytes -= used;
+  }
+  if_printf(ifp, "%s\n\n", linebuf);
+#endif
+  /* bump the count if we got stuck on HW_OWNER */
+  if (((msk_last_status & Y2_IS_STAT_BMU) != 0)
+      && (sc->msk_stat_cons != last_stat_put_idx)
+      && ((last_msk_control & HW_OWNER) == 0)) {
+    /* Sync status LEs. */
+    bus_dmamap_sync(sc->msk_stat_tag, sc->msk_stat_map,
+    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
+    cons = sc->msk_stat_cons;
+    count = sc->msk_stat_count;
+    do {
+      MSK_INC(cons, count);
+      sd = &sc->msk_stat_ring[cons];
+      control = le32toh(sd->msk_control);
+    } while ((cons != sc->msk_stat_cons)
+     && ((control & HW_OWNER) == 0));
+    if (cons != sc->msk_stat_cons) {
+      if_printf(ifp, "msk_stat_cons changed 0x%04x -> 0x%04x\n",
+ sc->msk_stat_cons, cons);
+      sc->msk_stat_cons = cons;
+    }
+  }
+ }
+#else
+ if_printf(ifp, "watchdog timeout: status\n");
+#endif
  ifp->if_oerrors++;
  ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
  msk_init_locked(sc_if);
@@ -3599,8 +3674,12 @@
  int rxput[2];
  struct msk_stat_desc *sd;
  uint32_t control, status;
- int cons, len, port, rxprog;
+ int len, port, rxprog;
+ uint16_t cons;
 
+#if 1
+ last_stat_put_idx = CSR_READ_2(sc, STAT_PUT_IDX);
+#endif
  if (sc->msk_stat_cons == CSR_READ_2(sc, STAT_PUT_IDX))
  return (0);
 
@@ -3614,8 +3693,15 @@
  for (;;) {
  sd = &sc->msk_stat_ring[cons];
  control = le32toh(sd->msk_control);
+#if 1
+ last_msk_control = control;
+#endif
  if ((control & HW_OWNER) == 0)
  break;
+#if 1
+ last_good_stat_put_idx = cons;
+ last_good_msk_control = control;
+#endif
  control &= ~HW_OWNER;
  sd->msk_control = htole32(control);
  status = le32toh(sd->msk_status);
@@ -3689,6 +3775,11 @@
  if (rxput[MSK_PORT_B] > 0)
  msk_rxput(sc->msk_if[MSK_PORT_B]);
 
+#if 1
+ last_stat_put_idx = CSR_READ_2(sc, STAT_PUT_IDX);
+#endif
+ if ((control & HW_OWNER) == 0)
+ return 1;
  return (sc->msk_stat_cons != CSR_READ_2(sc, STAT_PUT_IDX));
 }
 
@@ -3742,8 +3833,11 @@
  CSR_WRITE_4(sc, B0_IMSK, sc->msk_intrmask);
  CSR_READ_4(sc, B0_IMSK);
  }
- if ((status & Y2_IS_HW_ERR) != 0)
+ if ((status & Y2_IS_HW_ERR) != 0) {
  msk_intr_hwerr(sc);
+ device_printf(sc->msk_dev,
+      "Y2_IS_HW_ERR is set: status 0x%x\n", status);
+ }
 
  domore = msk_handle_events(sc);
  if ((status & Y2_IS_STAT_BMU) != 0 && domore == 0)
@@ -3762,6 +3856,17 @@
     !IFQ_DRV_IS_EMPTY(&ifp1->if_snd))
  msk_start_locked(ifp1);
 
+#if 1
+#define Y2_IS_OTHER_INTR \
+ (Y2_IS_ASF | Y2_IS_POLL_CHK | Y2_IS_IRQ_SW | Y2_IS_TIMINT | \
+ Y2_IS_CHK_TXS2 | Y2_IS_PSM_ACK | Y2_IS_PTP_TIST | Y2_IS_CHK_TXS1)
+ if ((status & (Y2_IS_OTHER_INTR)) != 0) {
+    device_printf(sc->msk_dev, "unknown interupt bits 0x%x\n",
+  status & (Y2_IS_OTHER_INTR));
+ }
+ msk_last_status = status;
+#endif
+
  MSK_UNLOCK(sc);
 }
 
Index: if_mskreg.h
===================================================================
--- if_mskreg.h (revision 260441)
+++ if_mskreg.h (working copy)
@@ -156,7 +156,7 @@
 #define DEVICEID_DLINK_DGE560SX 0x4002
 #define DEVICEID_DLINK_DGE560T 0x4b00
 
-#define BIT_31 (1 << 31)
+#define BIT_31 (1U << 31)
 #define BIT_30 (1 << 30)
 #define BIT_29 (1 << 29)
 #define BIT_28 (1 << 28)
@@ -2329,8 +2329,13 @@
  */
 #if (BUS_SPACE_MAXADDR > 0xFFFFFFFF)
 #define MSK_64BIT_DMA
+#if 1
+#define MSK_TX_RING_CNT 256
+#define MSK_RX_RING_CNT 256
+#else
 #define MSK_TX_RING_CNT 384
 #define MSK_RX_RING_CNT 512
+#endif
 #else
 #undef MSK_64BIT_DMA
 #define MSK_TX_RING_CNT 256
@@ -2539,8 +2544,8 @@
  bus_addr_t msk_stat_ring_paddr;
  int msk_int_holdoff;
  int msk_process_limit;
- int msk_stat_cons;
- int msk_stat_count;
+ uint16_t msk_stat_cons;
+ uint16_t msk_stat_count;
  struct mtx msk_mtx;
 };
 
_______________________________________________
[hidden email] mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-stable
To unsubscribe, send any mail to "[hidden email]"
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: Any news about "msk0 watchdog timeout" regression in 10-RELEASE?

Vitaly Magerya
On 01/21/14 21:56, Curtis Villamizar wrote:
> I have mine working but I haven't done a lot of reboots to see if it
> is a "fix" or luck.
>
> There is a lot of junk that you won't need in the code that is running
> well for me.  But here it is, as-is warts and all.
>
> I've been swamped lately and haven't had time to look at this further.

I've tried the patch, and the testing went like this:
1) Reboot into fixed kernel => msk0 shows watchdog timeouts.
2) Reboot again => no timeouts, but the interrupt storm is still there.
3) Disable the machine completely for 15 minutes (take out the battery
too; it's a laptop), boot fixed kernel => msk works fine.
4) Reboot one more time => msk still works fine.
5) Reboot into 10-RELEASE kernel => watchdog timeouts.
6) Disable the machine completely for 15 minutes, boot fixed kernel =>
still watchdog timeouts.
7) Disable the machine for 30 minutes, boot fixed kernel => nope, still
doesn't work.

So, there was a success once (step 3), but I was not able to reproduce
it after that. Seems to be random.
_______________________________________________
[hidden email] mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-stable
To unsubscribe, send any mail to "[hidden email]"
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: Any news about "msk0 watchdog timeout" regression in 10-RELEASE?

Curtis Villamizar

In message <[hidden email]>
Vitaly Magerya writes:

>
> On 01/21/14 21:56, Curtis Villamizar wrote:
> > I have mine working but I haven't done a lot of reboots to see if it
> > is a "fix" or luck.
> >
> > There is a lot of junk that you won't need in the code that is running
> > well for me.  But here it is, as-is warts and all.
> >
> > I've been swamped lately and haven't had time to look at this further.
>  
> I've tried the patch, and the testing went like this:
> 1) Reboot into fixed kernel => msk0 shows watchdog timeouts.
> 2) Reboot again => no timeouts, but the interrupt storm is still there.
> 3) Disable the machine completely for 15 minutes (take out the battery
>    too; it's a laptop), boot fixed kernel => msk works fine.
> 4) Reboot one more time => msk still works fine.
> 5) Reboot into 10-RELEASE kernel => watchdog timeouts.
> 6) Disable the machine completely for 15 minutes, boot fixed kernel =>
>    still watchdog timeouts.
> 7) Disable the machine for 30 minutes, boot fixed kernel => nope, still
>    doesn't work.
>  
> So, there was a success once (step 3), but I was not able to reproduce
> it after that. Seems to be random.


In my case I didn't have a problem if I didn't reboot the original
kernel but I only tried a few reboots.  I can't see how a chip could
retain any state after 30 minutes of no power so you are right that we
don't have a fix.  I haven't had time to look at this further and
don't generally reboot this machine (uptime 16 days since last I
looked at this).

When I'm no longer quite so swamped I'll look at this again.  It seems
we are the only two reporting this problem.  Please send lines of
this form from dmesg:

  mskc0: <Marvell Yukon 88E8057 Gigabit Ethernet> port 0xe800-0xe8ff
  mem 0xfebfc000-0xfebfffff irq 19 at device 0.0 on pci2

  msk0: <Marvell Technology Group Ltd. Yukon Ultra 2 Id 0xba Rev 0x00>
  on mskc0

That may indicate we have very similar chips.  If not, this msk
problem may be more widespread.

Curtis
_______________________________________________
[hidden email] mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-stable
To unsubscribe, send any mail to "[hidden email]"
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: Any news about "msk0 watchdog timeout" regression in 10-RELEASE?

Vitaly Magerya
On 01/25/14 21:35, Curtis Villamizar wrote:
> When I'm no longer quite so swamped I'll look at this again.  It seems
> we are the only two reporting this problem.

To everyone reading this list: if you have an msk(4) NIC that doesn't
work on 10-RELEASE, now is the time to speak up.

> Please send lines of these form from dmesg:
>
>   mskc0: <Marvell Yukon 88E8057 Gigabit Ethernet> port 0xe800-0xe8ff
>   mem 0xfebfc000-0xfebfffff irq 19 at device 0.0 on pci2
>
>   msk0: <Marvell Technology Group Ltd. Yukon Ultra 2 Id 0xba Rev 0x00>
>   on mskc0
>
> That may indicate we have very similar chips.  If not, this msk
> problem may be more widespread.

Mine goes like this:

  mskc0: <Marvell Yukon 88E8040 Fast Ethernet> port 0x2000-0x20ff
  mem 0xf0200000-0xf0203fff irq 18 at device 0.0 on pci9

  msk0: <Marvell Technology Group Ltd. Yukon FE+ Id 0xb8 Rev 0x00>
  on mskc0

Pretty different chips it seems.
_______________________________________________
[hidden email] mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-stable
To unsubscribe, send any mail to "[hidden email]"
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: Any news about "msk0 watchdog timeout" regression in 10-RELEASE?

YongHyeon PYUN
On Sat, Jan 25, 2014 at 09:50:50PM +0200, Vitaly Magerya wrote:

> On 01/25/14 21:35, Curtis Villamizar wrote:
> > When I'm no longer quite so swamped I'll look at this again.  It seems
> > we are the only two reporting this problem.
>
> To everyone reading this list: if you have an msk(4) NIC that doesn't
> work on 10-RELEASE, now is the time to speak up.
>
> > Please send lines of these form from dmesg:
> >
> >   mskc0: <Marvell Yukon 88E8057 Gigabit Ethernet> port 0xe800-0xe8ff
> >   mem 0xfebfc000-0xfebfffff irq 19 at device 0.0 on pci2
> >
> >   msk0: <Marvell Technology Group Ltd. Yukon Ultra 2 Id 0xba Rev 0x00>
> >   on mskc0
> >
> > That may indicate we have very similar chips.  If not, this msk
> > problem may be more widespread.
>
> Mine goes like this:
>
>   mskc0: <Marvell Yukon 88E8040 Fast Ethernet> port 0x2000-0x20ff
>   mem 0xf0200000-0xf0203fff irq 18 at device 0.0 on pci9
>
>   msk0: <Marvell Technology Group Ltd. Yukon FE+ Id 0xb8 Rev 0x00>
>   on mskc0
>
> Pretty different chips it seems.

Please try r261577.
_______________________________________________
[hidden email] mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-stable
To unsubscribe, send any mail to "[hidden email]"
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: Any news about "msk0 watchdog timeout" regression in 10-RELEASE?

Curtis Villamizar

In message <[hidden email]>
Yonghyeon PYUN writes:
 

> On Sat, Jan 25, 2014 at 09:50:50PM +0200, Vitaly Magerya wrote:
> > On 01/25/14 21:35, Curtis Villamizar wrote:
> > > When I'm no longer quite so swamped I'll look at this again.  It seems
> > > we are the only two reporting this problem.
> >
> > To everyone reading this list: if you have an msk(4) NIC that doesn't
> > work on 10-RELEASE, now is the time to speak up.
> >
> > > Please send lines of these form from dmesg:
> > >
> > >   mskc0: <Marvell Yukon 88E8057 Gigabit Ethernet> port 0xe800-0xe8ff
> > >   mem 0xfebfc000-0xfebfffff irq 19 at device 0.0 on pci2
> > >
> > >   msk0: <Marvell Technology Group Ltd. Yukon Ultra 2 Id 0xba Rev 0x00>
> > >   on mskc0
> > >
> > > That may indicate we have very similar chips.  If not, this msk
> > > problem may be more widespread.
> >
> > Mine goes like this:
> >
> >   mskc0: <Marvell Yukon 88E8040 Fast Ethernet> port 0x2000-0x20ff
> >   mem 0xf0200000-0xf0203fff irq 18 at device 0.0 on pci9
> >
> >   msk0: <Marvell Technology Group Ltd. Yukon FE+ Id 0xb8 Rev 0x00>
> >   on mskc0
> >
> > Pretty different chips it seems.
>  
> Please try r261577.


Just update sys/dev/msk, or do I need more than that?

Curtis
_______________________________________________
[hidden email] mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-stable
To unsubscribe, send any mail to "[hidden email]"
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: Any news about "msk0 watchdog timeout" regression in 10-RELEASE?

Curtis Villamizar
In reply to this post by YongHyeon PYUN

In message <[hidden email]>
Yonghyeon PYUN writes:
 

> On Sat, Jan 25, 2014 at 09:50:50PM +0200, Vitaly Magerya wrote:
> > On 01/25/14 21:35, Curtis Villamizar wrote:
> > > When I'm no longer quite so swamped I'll look at this again.  It seems
> > > we are the only two reporting this problem.
> >
> > To everyone reading this list: if you have an msk(4) NIC that doesn't
> > work on 10-RELEASE, now is the time to speak up.
> >
> > > Please send lines of these form from dmesg:
> > >
> > >   mskc0: <Marvell Yukon 88E8057 Gigabit Ethernet> port 0xe800-0xe8ff
> > >   mem 0xfebfc000-0xfebfffff irq 19 at device 0.0 on pci2
> > >
> > >   msk0: <Marvell Technology Group Ltd. Yukon Ultra 2 Id 0xba Rev 0x00>
> > >   on mskc0
> > >
> > > That may indicate we have very similar chips.  If not, this msk
> > > problem may be more widespread.
> >
> > Mine goes like this:
> >
> >   mskc0: <Marvell Yukon 88E8040 Fast Ethernet> port 0x2000-0x20ff
> >   mem 0xf0200000-0xf0203fff irq 18 at device 0.0 on pci9
> >
> >   msk0: <Marvell Technology Group Ltd. Yukon FE+ Id 0xb8 Rev 0x00>
> >   on mskc0
> >
> > Pretty different chips it seems.
>  
> Please try r261577.


Yonghyeon,

OK.  I assumed that you meant only sys/dev/msk and to use svn update
-r261577 in "head".  The only diff of any consequence relative to the
stable-10 branch is:

@@ -3749,9 +3750,6 @@
        if ((status & Y2_IS_STAT_BMU) != 0 && domore == 0)
                CSR_WRITE_4(sc, STAT_CTRL, SC_STAT_CLR_IRQ);
 
-       /* Clear TWSI IRQ. */
-       if ((status & Y2_IS_TWSI_RDY) != 0)
-               CSR_WRITE_4(sc, B2_I2C_IRQ, 1);
        /* Reenable interrupts. */
        CSR_WRITE_4(sc, B0_Y2_SP_ICR, 2);

I used the r261577 in "head" and this failed on first reboot.  That
was after a long power down (shut off power strip).

I rebooted with the kernel that I had been using and it worked on first
reboot with no power down.

Rather limited testing but the fail on first reboot tells us what we
need to know.

Thanks for your continued interest in this.

Curtis
_______________________________________________
[hidden email] mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-stable
To unsubscribe, send any mail to "[hidden email]"
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: Any news about "msk0 watchdog timeout" regression in 10-RELEASE?

porkshoulder
At the risk of sounding ignorant, what can I do to fix this issue on my box?
I installed 10.0-RELEASE on my desktop at home.

I don't have the exact output from dmesg, but I collected the following:

Marvell Yukon Ultra 2 Id 0xba Rev 0x00
Marvell Yukon 88E8057 Gigabit Ethernet
(Marvell 88E1149 Gigabit PHY)

The msk(4) driver is loaded. I can ping anything just fine, but as soon as I try a 'portsnap fetch', the interface essentially "freezes".

Anyway, can I patch the driver or should I just play with 9.2-RELEASE (I'm new to FreeBSD)?
Thanks!
-Joe
Loading...