Dragonfly under KVM
Gary Allan
dragonfly at gallan.co.uk
Sat Jun 7 08:29:46 PDT 2008
Matthew Dillon wrote:
:I've managed to get gdb attached and get some information.
:
:The kernel is getting caught in a while loop in lwkt_acquire. I can
:reliably trigger this with with a "make -j8 buildworld" under a SMP
:kernel (Otherwise identical to GENERIC, no optimisations.) The OS is
:completely unresponsive and all four cpu cores are running at 100%.
:
:I've included the debug information.
:
:Program received signal SIGINT, Interrupt.
:lwkt_acquire (td=0xc6a59e70) at /usr/src/sys/kern/lwkt_thread.c:1048
:1048 while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK))
:(gdb) l
:1043 mygd = mycpu;
:1044 if (gd != mycpu) {
:1045 cpu_lfence();
:1046 KKASSERT((td->td_flags & TDF_RUNQ) == 0);
:1047 crit_enter_gd(mygd);
:1048 while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK))
:1049 cpu_lfence();
:1050 td->td_gd = mygd;
:1051 TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
:1052 td->td_flags &= ~TDF_MIGRATING;
:(gdb) p td->td_flags
:$1 = 8390177
:(gdb) p td
:$2 = (thread_t) 0xc6a59e70
:(gdb) bt
:#0 lwkt_acquire (td=0xc6a59e70) at /usr/src/sys/kern/lwkt_thread.c:1048
:#1 0xc02c66af in bsd4_select_curproc (gd=0xff800000) at
:/usr/src/sys/kern/usched_bsd4.c:358
:#2 0xc02c6829 in bsd4_release_curproc (lp=0xea634c00) at
:/usr/src/sys/kern/usched_bsd4.c:322
:#3 0xc04b8239 in passive_release (td=0xdfe8aba0) at
:..
:
:Continuing execution does not appear to cause any problems.
:I can provide additional debugging info if required but I'm unsure of
:how to proceed with this myself.
:
:Regards
:
:Gary
This is great info. One thing, what do you mean by 'KVM'? What
is DragonFly running under, exactly?
I think once I understand the environment I may be able to figure out
why the acquisition loop is blowing up.
-Matt
Matthew Dillon
<dillon at backplane.com>
I've triggered another lock up and this time got a different trace.
Execution appears to be looping indefinitely inside LWKT code.
Debugging gives the output below. Again all four core are running at 100%.
Program received signal SIGINT, Interrupt.
lwkt_process_ipiq_core (sgd=<value optimized out>, ip=0xc67a7000, frame=0x0)
at /usr/src/sys/kern/lwkt_ipiq.c:522
522 while (wi - (ri = ip->ip_rindex) > 0) {
(gdb) l
517 *
518 * Note: due to additional IPI operations that the callback function
519 * may make, it is possible for both rindex and windex to
advance and
520 * thus for rindex to advance passed our cached windex.
521 */
522 while (wi - (ri = ip->ip_rindex) > 0) {
523 ri &= MAXCPUFIFO_MASK;
524 copy_func = ip->ip_func[ri];
525 copy_arg1 = ip->ip_arg1[ri];
526 copy_arg2 = ip->ip_arg2[ri];
(gdb) p wi
$5 = 356278
(gdb) p ip->ip_rindex
$6 = 356278
(gdb) bt
#0 lwkt_process_ipiq_core (sgd=<value optimized out>, ip=0xc67a7000,
frame=0x0) at /usr/src/sys/kern/lwkt_ipiq.c:522
#1 0xc02c94ad in lwkt_process_ipiq () at /usr/src/sys/kern/lwkt_ipiq.c:452
#2 0xc02c9830 in lwkt_send_ipiq3 (target=0xff808000,
func=0xc02c8519 <lwkt_schedule>, arg1=0xc0600170, arg2=0)
at /usr/src/sys/kern/lwkt_ipiq.c:185
#3 0xc02c863c in lwkt_schedule (td=0xc0600170)
at /usr/src/sys/sys/thread2.h:244
#4 0xc02c71e6 in bsd4_setrunqueue (lp=0xe5f8b400)
at /usr/src/sys/kern/usched_bsd4.c:551
#5 0xc02c72be in bsd4_acquire_curproc (lp=0xe5f8b400)
at /usr/src/sys/kern/usched_bsd4.c:271
#6 0xc04b9603 in syscall2 (frame=0xe5a02d40)
at /usr/src/sys/platform/pc32/i386/trap.c:349
#7 0xc04a3396 in Xint0x80_syscall ()
at /usr/src/sys/platform/pc32/i386/exception.s:876
#8 0xe5a02d40 in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
(gdb) c
Continuing.
Program received signal SIGINT, Interrupt.
lwkt_process_ipiq_core (sgd=<value optimized out>, ip=0xc67ac000, frame=0x0)
at /usr/src/sys/kern/lwkt_ipiq.c:558
558 return(wi != ip->ip_windex);
(gdb) l
553 * ipiq. ip_npoll is left set as long as possible to reduce the
554 * number of IPIs queued by the originating cpu, but must be cleared
555 * *BEFORE* checking windex.
556 */
557 atomic_poll_release_int(&ip->ip_npoll);
558 return(wi != ip->ip_windex);
559 }
560
561 static void
562 lwkt_sync_ipiq(void *arg)
(gdb) p wi
$7 = 357733
(gdb) p ip->ip_windex
$8 = 357733
(gdb) bt
#0 lwkt_process_ipiq_core (sgd=<value optimized out>, ip=0xc67ac000,
frame=0x0) at /usr/src/sys/kern/lwkt_ipiq.c:558
#1 0xc02c94ad in lwkt_process_ipiq () at /usr/src/sys/kern/lwkt_ipiq.c:452
#2 0xc02c9830 in lwkt_send_ipiq3 (target=0xff808000,
func=0xc02c8519 <lwkt_schedule>, arg1=0xc0600170, arg2=0)
at /usr/src/sys/kern/lwkt_ipiq.c:185
#3 0xc02c863c in lwkt_schedule (td=0xc0600170)
at /usr/src/sys/sys/thread2.h:244
#4 0xc02c71e6 in bsd4_setrunqueue (lp=0xe5f8b400)
at /usr/src/sys/kern/usched_bsd4.c:551
#5 0xc02c72be in bsd4_acquire_curproc (lp=0xe5f8b400)
at /usr/src/sys/kern/usched_bsd4.c:271
#6 0xc04b9603 in syscall2 (frame=0xe5a02d40)
at /usr/src/sys/platform/pc32/i386/trap.c:349
#7 0xc04a3396 in Xint0x80_syscall ()
at /usr/src/sys/platform/pc32/i386/exception.s:876
#8 0xe5a02d40 in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
(gdb) c
Continuing.
Program received signal SIGINT, Interrupt.
lwkt_process_ipiq_core (sgd=<value optimized out>, ip=0xc67b1000, frame=0x0)
at /usr/src/sys/kern/lwkt_ipiq.c:558
558 return(wi != ip->ip_windex);
(gdb) p wi
$9 = 372884
(gdb) p ip->ip_windex
$10 = 372884
(gdb) bt
#0 lwkt_process_ipiq_core (sgd=<value optimized out>, ip=0xc67b1000,
frame=0x0) at /usr/src/sys/kern/lwkt_ipiq.c:558
#1 0xc02c94ad in lwkt_process_ipiq () at /usr/src/sys/kern/lwkt_ipiq.c:452
#2 0xc02c9830 in lwkt_send_ipiq3 (target=0xff808000,
func=0xc02c8519 <lwkt_schedule>, arg1=0xc0600170, arg2=0)
at /usr/src/sys/kern/lwkt_ipiq.c:185
#3 0xc02c863c in lwkt_schedule (td=0xc0600170)
at /usr/src/sys/sys/thread2.h:244
#4 0xc02c71e6 in bsd4_setrunqueue (lp=0xe5f8b400)
at /usr/src/sys/kern/usched_bsd4.c:551
#5 0xc02c72be in bsd4_acquire_curproc (lp=0xe5f8b400)
at /usr/src/sys/kern/usched_bsd4.c:271
#6 0xc04b9603 in syscall2 (frame=0xe5a02d40)
at /usr/src/sys/platform/pc32/i386/trap.c:349
#7 0xc04a3396 in Xint0x80_syscall ()
at /usr/src/sys/platform/pc32/i386/exception.s:876
#8 0xe5a02d40 in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
(gdb) c
Continuing.
Program received signal SIGINT, Interrupt.
0xc02c93a5 in lwkt_process_ipiq_core (sgd=<value optimized out>,
ip=0xff8001e8, frame=0x0) at /usr/src/sys/kern/lwkt_ipiq.c:522
522 while (wi - (ri = ip->ip_rindex) > 0) {
(gdb) p wi
$11 = 1343383299
(gdb) p ip->ip_rindex
$12 = 1343383298
(gdb) bt
#0 0xc02c93a5 in lwkt_process_ipiq_core (sgd=<value optimized out>,
ip=0xff8001e8, frame=0x0) at /usr/src/sys/kern/lwkt_ipiq.c:522
#1 0xc02c94df in lwkt_process_ipiq () at /usr/src/sys/kern/lwkt_ipiq.c:458
#2 0xc02c9830 in lwkt_send_ipiq3 (target=0xff808000,
func=0xc02c8519 <lwkt_schedule>, arg1=0xc0600170, arg2=0)
at /usr/src/sys/kern/lwkt_ipiq.c:185
#3 0xc02c863c in lwkt_schedule (td=0xc0600170)
at /usr/src/sys/sys/thread2.h:244
#4 0xc02c71e6 in bsd4_setrunqueue (lp=0xe5f8b400)
at /usr/src/sys/kern/usched_bsd4.c:551
#5 0xc02c72be in bsd4_acquire_curproc (lp=0xe5f8b400)
at /usr/src/sys/kern/usched_bsd4.c:271
#6 0xc04b9603 in syscall2 (frame=0xe5a02d40)
at /usr/src/sys/platform/pc32/i386/trap.c:349
#7 0xc04a3396 in Xint0x80_syscall ()
at /usr/src/sys/platform/pc32/i386/exception.s:876
#8 0xe5a02d40 in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
(gdb) c
Continuing.
Program received signal SIGINT, Interrupt.
0xc02c9495 in lwkt_process_ipiq () at /usr/src/sys/kern/lwkt_ipiq.c:452
452 while (lwkt_process_ipiq_core(sgd, &ip[gd->gd_cpuid], NULL))
(gdb) l
447 for (n = 0; n < ncpus; ++n) {
448 if (n != gd->gd_cpuid) {
449 sgd = globaldata_find(n);
450 ip = sgd->gd_ipiq;
451 if (ip != NULL) {
452 while (lwkt_process_ipiq_core(sgd, &ip[gd->gd_cpuid], NULL))
453 ;
454 }
455 }
456 }
(gdb) bt
#0 0xc02c9495 in lwkt_process_ipiq () at /usr/src/sys/kern/lwkt_ipiq.c:452
#1 0xc02c9830 in lwkt_send_ipiq3 (target=0xff808000, Cannot access
memory at address 0x8
func=0xc02c8519 <lwkt_schedule>, arg1=0xc0600170, arg2=0)
at /usr/src/sys/kern/lwkt_ipiq.c:185
#2 0xc02c863c in lwkt_schedule (td=0xc0600170)
at /usr/src/sys/sys/thread2.h:244
#3 0xc02c71e6 in bsd4_setrunqueue (lp=0xe5f8b400)
at /usr/src/sys/kern/usched_bsd4.c:551
#4 0xc02c72be in bsd4_acquire_curproc (lp=0xe5f8b400)
at /usr/src/sys/kern/usched_bsd4.c:271
#5 0xc04b9603 in syscall2 (frame=0xe5a02d40)
at /usr/src/sys/platform/pc32/i386/trap.c:349
#6 0xc04a3396 in Xint0x80_syscall ()
at /usr/src/sys/platform/pc32/i386/exception.s:876
#7 0xe5a02d40 in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
(gdb) c
Continuing.
Program received signal SIGINT, Interrupt.
0xc02c944c in lwkt_process_ipiq_core (sgd=<value optimized out>,
ip=<value optimized out>, frame=0x0) at
/usr/src/sys/kern/lwkt_ipiq.c:559
559 }
(gdb) l
554 * number of IPIs queued by the originating cpu, but must be cleared
555 * *BEFORE* checking windex.
556 */
557 atomic_poll_release_int(&ip->ip_npoll);
558 return(wi != ip->ip_windex);
559 }
560
561 static void
562 lwkt_sync_ipiq(void *arg)
563 {
(gdb) p wi
$13 = 357733
(gdb) p ip->ip_windex
Cannot access memory at address 0x8
(gdb) bt
#0 0xc02c944c in lwkt_process_ipiq_core (sgd=<value optimized out>,
ip=<value optimized out>, frame=0x0) at
/usr/src/sys/kern/lwkt_ipiq.c:559
#1 0xc02c94ad in lwkt_process_ipiq () at /usr/src/sys/kern/lwkt_ipiq.c:452
#2 0xc02c9830 in lwkt_send_ipiq3 (target=0xff808000,
func=0xc02c8519 <lwkt_schedule>, arg1=0xc0600170, arg2=0)
at /usr/src/sys/kern/lwkt_ipiq.c:185
#3 0xc02c863c in lwkt_schedule (td=0xc0600170)
at /usr/src/sys/sys/thread2.h:244
#4 0xc02c71e6 in bsd4_setrunqueue (lp=0xe5f8b400)
at /usr/src/sys/kern/usched_bsd4.c:551
#5 0xc02c72be in bsd4_acquire_curproc (lp=0xe5f8b400)
at /usr/src/sys/kern/usched_bsd4.c:271
#6 0xc04b9603 in syscall2 (frame=0xe5a02d40)
at /usr/src/sys/platform/pc32/i386/trap.c:349
#7 0xc04a3396 in Xint0x80_syscall ()
at /usr/src/sys/platform/pc32/i386/exception.s:876
#8 0xe5a02d40 in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
(gdb) c
Continuing.
Program received signal SIGINT, Interrupt.
lwkt_process_ipiq_core (sgd=<value optimized out>, ip=0xc67ac000, frame=0x0)
at /usr/src/sys/kern/lwkt_ipiq.c:558
558 return(wi != ip->ip_windex);
(gdb) l
553 * ipiq. ip_npoll is left set as long as possible to reduce the
554 * number of IPIs queued by the originating cpu, but must be cleared
555 * *BEFORE* checking windex.
556 */
557 atomic_poll_release_int(&ip->ip_npoll);
558 return(wi != ip->ip_windex);
559 }
560
561 static void
562 lwkt_sync_ipiq(void *arg)
(gdb) p wi
$24 = 357733
(gdb) p ip->ip_windex
$25 = 357733
(gdb) bt
#0 lwkt_process_ipiq_core (sgd=<value optimized out>, ip=0xc67ac000,
frame=0x0) at /usr/src/sys/kern/lwkt_ipiq.c:558
#1 0xc02c94ad in lwkt_process_ipiq () at /usr/src/sys/kern/lwkt_ipiq.c:452
#2 0xc02c9830 in lwkt_send_ipiq3 (target=0xff808000,
func=0xc02c8519 <lwkt_schedule>, arg1=0xc0600170, arg2=0)
at /usr/src/sys/kern/lwkt_ipiq.c:185
#3 0xc02c863c in lwkt_schedule (td=0xc0600170)
at /usr/src/sys/sys/thread2.h:244
#4 0xc02c71e6 in bsd4_setrunqueue (lp=0xe5f8b400)
at /usr/src/sys/kern/usched_bsd4.c:551
#5 0xc02c72be in bsd4_acquire_curproc (lp=0xe5f8b400)
at /usr/src/sys/kern/usched_bsd4.c:271
#6 0xc04b9603 in syscall2 (frame=0xe5a02d40)
at /usr/src/sys/platform/pc32/i386/trap.c:349
#7 0xc04a3396 in Xint0x80_syscall ()
at /usr/src/sys/platform/pc32/i386/exception.s:876
#8 0xe5a02d40 in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
(gdb) c
Continuing.
More information about the Bugs
mailing list