IPIQ messaging patch #1 for SMP

Matthew Dillon dillon at apollo.backplane.com
Sun Apr 10 00:02:55 PDT 2005


    This will probably be committed sometime next week as well.  I would
    appreciate some additional testing of the patch (it requires an SMP
    system, of course).

    This patch optimizes the IPI messaging code as follows:

    * Does not send additional HW IPIs if the target cpu is still processing
      IPI messages related to a prior HW IPI.  This *greatly* reduces the
      overhead of sending back-to-back IPI messages to another cpu and
      removes most APIC stall conditions.  As a consequence, systems
      now get more IPIQ FIFO-full events (previously the APIC stall meant
      that the target cpu tended to drain the FIFO before the originating
      cpu could fill it up).  From testing, the number of FIFO-full
      conditions that occur appears reasonable.  A rough model of the
      interlock is sketched after this list.

      Plus, if we ever needed it, this also means that we can pipeline large
      numbers of IPI messages between cpus at nearly line-rate, with
      no additional interrupt overhead beyond the first message.

    * Implement a passive IPI message that does not signal the target cpu
      at all.  The target cpu will process the message the next time it
      polls the queue (no less often than once a tick).  Adjust the
      slab free() code to use this new API when freeing blocks owned by
      a different cpu.  Such messages have huge target latencies (obviously)
      but cost virtually nothing on either the originating or target cpus
      because no hardware IPI is generated for the message.  A toy
      illustration of the passive path also follows this list.

    * Improve the IPI Messaging FIFO's hysteresis code a bit.
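
    For anyone who wants the shape of the new interlock without digging
    through the diff first, here is a rough userspace model of it.  The
    struct, the helper names and the use of C11 atomics are all mine;
    only the acquire-before-IPI / release-before-recheck ordering follows
    the patch.

/*
 * Rough model of the ip_npoll interlock, NOT the kernel code.
 * Build with: cc -std=c11 -o ipiq_model ipiq_model.c
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct ipiq_model {
    atomic_uint npoll;      /* bit 0 set while an IPI is "in flight" */
    unsigned    windex;     /* written only by the sending cpu */
    unsigned    rindex;     /* written only by the target cpu */
};

/*
 * Sender: append a message, then decide whether a hardware IPI is needed.
 * Only the sender that sees the interlock clear (fetch-or returns 0 in
 * bit 0) raises the IPI; everyone else just appends, which is where the
 * ipiq_avoided counter in the patch comes from.
 */
static bool
model_send(struct ipiq_model *ip)
{
    ++ip->windex;           /* the real code stores ip_func/ip_arg first */
    return (atomic_fetch_or(&ip->npoll, 1) & 1) == 0;
}

/*
 * Target: drain the FIFO, release the interlock, then re-check windex.
 * The release must come *before* the final windex check (and must be
 * cache-synchronizing, which is why the patch turns the release into a
 * locked btrl), otherwise a message queued in the race window would
 * neither raise an IPI nor be noticed.
 */
static bool
model_process(struct ipiq_model *ip)
{
    while (ip->rindex != ip->windex)
        ++ip->rindex;       /* the real code runs ip_func[rindex] here */
    atomic_store(&ip->npoll, 0);
    return ip->rindex != ip->windex;    /* caller loops if true */
}

int
main(void)
{
    struct ipiq_model ip = { 0, 0, 0 };

    printf("first send raises an IPI:  %d\n", model_send(&ip));    /* 1 */
    printf("second send raises an IPI: %d\n", model_send(&ip));    /* 0 */
    model_process(&ip);
    printf("send after drain raises an IPI: %d\n", model_send(&ip));
    return 0;
}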
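
    And a toy illustration of the passive path, again with made-up names:
    the producer only publishes into the ring, no interrupt is ever
    raised, and the consumer finds the work the next time it polls; in
    the kernel that poll is lwkt_process_ipiq(), run from the per-cpu
    clock interrupt at the latest.

/*
 * Toy model of a passive IPI send, NOT the kernel code.
 */
#include <stdio.h>

#define FIFO_SIZE   16
#define FIFO_MASK   (FIFO_SIZE - 1)

static void *fifo_arg[FIFO_SIZE];
static unsigned windex, rindex;

static void
send_passive(void *arg)
{
    fifo_arg[windex & FIFO_MASK] = arg; /* publish the message */
    ++windex;                           /* then advance the write index */
    /* no cpu_send_ipiq() here; that is the whole point */
}

static void
poll_on_tick(void)
{
    while (rindex != windex) {
        printf("owner cpu frees %p\n", fifo_arg[rindex & FIFO_MASK]);
        ++rindex;
    }
}

int
main(void)
{
    int blocks[3];

    for (int i = 0; i < 3; ++i)
        send_passive(&blocks[i]);       /* cross-cpu free()s pile up */
    poll_on_tick();                     /* target cpu's next poll/tick */
    return 0;
}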

						-Matt

Index: i386/i386/mp_machdep.c
===================================================================
RCS file: /cvs/src/sys/i386/i386/mp_machdep.c,v
retrieving revision 1.32
diff -u -r1.32 mp_machdep.c
--- i386/i386/mp_machdep.c	27 Feb 2005 10:57:24 -0000	1.32
+++ i386/i386/mp_machdep.c	10 Apr 2005 03:34:17 -0000
@@ -2398,16 +2398,23 @@
 	mem_range_AP_init();
 
 	/*
+	 * Once we go active we must process any IPIQ messages that may
+	 * have been queued, because no actual IPI will occur until we
+	 * set our bit in the smp_active_mask.  If we don't the IPI
+	 * message interlock could be left set which would also prevent
+	 * further IPIs.
+	 *
 	 * The idle loop doesn't expect the BGL to be held and while
 	 * lwkt_switch() normally cleans things up this is a special case
 	 * because we returning almost directly into the idle loop.
 	 *
 	 * The idle thread is never placed on the runq, make sure
-	 * nothing we've done put it thre.
+	 * nothing we've done put it there.
 	 */
 	KKASSERT(curthread->td_mpcount == 1);
 	smp_active_mask |= 1 << mycpu->gd_cpuid;
 	initclocks_pcpu();	/* clock interrupts (via IPIs) */
+	lwkt_process_ipiq();
 	rel_mplock();
 	KKASSERT((curthread->td_flags & TDF_RUNQ) == 0);
 }
Index: i386/include/atomic.h
===================================================================
RCS file: /cvs/src/sys/i386/include/atomic.h,v
retrieving revision 1.8
diff -u -r1.8 atomic.h
--- i386/include/atomic.h	29 Jul 2004 20:31:13 -0000	1.8
+++ i386/include/atomic.h	10 Apr 2005 04:50:28 -0000
@@ -127,10 +127,13 @@
 ATOMIC_ASM(subtract, long,  "subl %1,%0",  v)
 
 /*
- * atomic_poll_acquire_int(P)	Returns non-zero on success, 0 on failure
+ * atomic_poll_acquire_int(P)	Returns non-zero on success, 0 if the lock
+ *				has already been acquired.
  * atomic_poll_release_int(P)
  *
- * Currently these are hacks just to support the NDIS driver.
+ * These support the NDIS driver and are also used for IPIQ interlocks
+ * between cpus.  Both the acquisition and release must be 
+ * cache-synchronizing instructions.
  */
 
 #if defined(KLD_MODULE)
@@ -154,7 +157,7 @@
 void
 atomic_poll_release_int(volatile u_int *p)
 {
-	__asm __volatile("movl $0,%0" : "+m" (*p));
+	__asm __volatile(MPLOCKED "btrl $0,%0" : "+m" (*p));
 }
 
 #endif
Index: kern/kern_slaballoc.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_slaballoc.c,v
retrieving revision 1.28
diff -u -r1.28 kern_slaballoc.c
--- kern/kern_slaballoc.c	2 Apr 2005 15:53:56 -0000	1.28
+++ kern/kern_slaballoc.c	10 Apr 2005 05:16:09 -0000
@@ -754,12 +754,13 @@
 
     /*
      * If we do not own the zone then forward the request to the
-     * cpu that does.
+     * cpu that does.  Since the timing is non-critical, a passive
+     * message is sent.
      */
     if (z->z_CpuGd != gd) {
 	*(struct malloc_type **)ptr = type;
 #ifdef SMP
-	lwkt_send_ipiq(z->z_CpuGd, free_remote, ptr);
+	lwkt_send_ipiq_passive(z->z_CpuGd, free_remote, ptr);
 #else
 	panic("Corrupt SLZone");
 #endif
Index: kern/lwkt_ipiq.c
===================================================================
RCS file: /cvs/src/sys/kern/lwkt_ipiq.c,v
retrieving revision 1.8
diff -u -r1.8 lwkt_ipiq.c
--- kern/lwkt_ipiq.c	16 Jul 2004 05:51:10 -0000	1.8
+++ kern/lwkt_ipiq.c	10 Apr 2005 05:24:02 -0000
@@ -92,9 +92,12 @@
 #endif
 
 #ifdef SMP
-static __int64_t ipiq_count;
-static __int64_t ipiq_fifofull;
-static __int64_t ipiq_cscount;
+static __int64_t ipiq_count;	/* total calls to lwkt_send_ipiq*() */
+static __int64_t ipiq_fifofull;	/* number of fifo full conditions detected */
+static __int64_t ipiq_avoided;	/* interlock with target avoids cpu ipi */
+static __int64_t ipiq_passive;	/* passive IPI messages */
+static __int64_t ipiq_cscount;	/* number of cpu synchronizations */
+static int ipiq_optimized = 1;	/* XXX temporary sysctl */
 #endif
 
 #ifdef _KERNEL
@@ -102,7 +105,10 @@
 #ifdef SMP
 SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_count, CTLFLAG_RW, &ipiq_count, 0, "");
 SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_fifofull, CTLFLAG_RW, &ipiq_fifofull, 0, "");
+SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_avoided, CTLFLAG_RW, &ipiq_avoided, 0, "");
+SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_passive, CTLFLAG_RW, &ipiq_passive, 0, "");
 SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_cscount, CTLFLAG_RW, &ipiq_cscount, 0, "");
+SYSCTL_INT(_lwkt, OID_AUTO, ipiq_optimized, CTLFLAG_RW, &ipiq_optimized, 0, "");
 #endif
 
 #endif
@@ -118,14 +124,18 @@
  * on the cpu<->cpu ipiq matrix.  Each cpu owns a unique ipiq FIFO for every
  * possible target cpu.  The FIFO can be written.
  *
- * YYY If the FIFO fills up we have to enable interrupts and process the
- * IPIQ while waiting for it to empty or we may deadlock with another cpu.
- * Create a CPU_*() function to do this!
+ * If the FIFO fills up we have to enable interrupts to avoid an APIC
+ * deadlock and process pending IPIQs while waiting for it to empty.   
+ * Otherwise we may soft-deadlock with another cpu whose FIFO is also full.
  *
  * We can safely bump gd_intr_nesting_level because our crit_exit() at the
  * end will take care of any pending interrupts.
  *
- * Must be called from a critical section.
+ * The actual hardware IPI is avoided if the target cpu is already processing
+ * the queue from a prior IPI.  It is possible to pipeline IPI messages
+ * very quickly between cpus due to the FIFO hysteresis.
+ *
+ * Need not be called from a critical section.
  */
 int
 lwkt_send_ipiq(globaldata_t target, ipifunc_t func, void *arg)
@@ -149,18 +159,86 @@
     ip = &gd->gd_ipiq[target->gd_cpuid];
 
     /*
-     * We always drain before the FIFO becomes full so it should never
-     * become full.  We need to leave enough entries to deal with 
-     * reentrancy.
+     * Do not allow the FIFO to become full.  Interrupts must be physically
+     * enabled while we liveloop to avoid deadlocking the APIC.
+     */
+    if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
+	unsigned int eflags = read_eflags();
+
+	if (atomic_poll_acquire_int(&ip->ip_npoll) || ipiq_optimized == 0)
+	    cpu_send_ipiq(target->gd_cpuid);
+	cpu_enable_intr();
+	++ipiq_fifofull;
+	while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
+	    KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
+	    lwkt_process_ipiq();
+	}
+	write_eflags(eflags);
+    }
+
+    /*
+     * Queue the new message
      */
-    KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO);
     windex = ip->ip_windex & MAXCPUFIFO_MASK;
     ip->ip_func[windex] = (ipifunc2_t)func;
     ip->ip_arg[windex] = arg;
     cpu_mb1();
     ++ip->ip_windex;
+    --gd->gd_intr_nesting_level;
+
+    /*
+     * signal the target cpu that there is work pending.
+     */
+    if (atomic_poll_acquire_int(&ip->ip_npoll)) {
+	cpu_send_ipiq(target->gd_cpuid);
+    } else {
+	if (ipiq_optimized == 0)
+	    cpu_send_ipiq(target->gd_cpuid);
+	++ipiq_avoided;
+    }
+    crit_exit();
+    return(ip->ip_windex);
+}
+
+/*
+ * Similar to lwkt_send_ipiq() but this function does not actually initiate
+ * the IPI to the target cpu unless the FIFO has become too full, so it is
+ * very fast.
+ *
+ * This function is used for non-critical IPI messages, such as memory
+ * deallocations.  The queue will typically be flushed by the target cpu at
+ * the next clock interrupt.
+ *
+ * Need not be called from a critical section.
+ */
+int
+lwkt_send_ipiq_passive(globaldata_t target, ipifunc_t func, void *arg)
+{
+    lwkt_ipiq_t ip;
+    int windex;
+    struct globaldata *gd = mycpu;
+
+    KKASSERT(target != gd);
+    crit_enter();
+    ++gd->gd_intr_nesting_level;
+#ifdef INVARIANTS
+    if (gd->gd_intr_nesting_level > 20)
+	panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
+#endif
+    KKASSERT(curthread->td_pri >= TDPRI_CRIT);
+    ++ipiq_count;
+    ++ipiq_passive;
+    ip = &gd->gd_ipiq[target->gd_cpuid];
+
+    /*
+     * Do not allow the FIFO to become full.  Interrupts must be physically
+     * enabled while we liveloop to avoid deadlocking the APIC.
+     */
     if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
 	unsigned int eflags = read_eflags();
+
+	if (atomic_poll_acquire_int(&ip->ip_npoll) || ipiq_optimized == 0)
+	    cpu_send_ipiq(target->gd_cpuid);
 	cpu_enable_intr();
 	++ipiq_fifofull;
 	while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
@@ -169,22 +247,33 @@
 	}
 	write_eflags(eflags);
     }
+
+    /*
+     * Queue the new message
+     */
+    windex = ip->ip_windex & MAXCPUFIFO_MASK;
+    ip->ip_func[windex] = (ipifunc2_t)func;
+    ip->ip_arg[windex] = arg;
+    cpu_mb1();
+    ++ip->ip_windex;
     --gd->gd_intr_nesting_level;
-    cpu_send_ipiq(target->gd_cpuid);	/* issues mem barrier if appropriate */
+
+    /*
+     * Do not signal the target cpu, it will pick up the IPI when it next
+     * polls (typically on the next tick).
+     */
     crit_exit();
     return(ip->ip_windex);
 }
 
 /*
- * Send an IPI request passively, return 0 on success and ENOENT on failure.
- * This routine does not recursive through lwkt_process_ipiq() nor does it
- * block trying to queue the actual IPI.  If we successfully queue the
- * message but fail to queue the IPI, we still count it as a success.
- * The occassional small race against a target cpu HLT is recovered at
- * the next clock interrupt.
+ * Send an IPI request without blocking, return 0 on success, ENOENT on 
+ * failure.  The actual queueing of the hardware IPI may still force us
+ * to spin and process incoming IPIs but that will eventually go away
+ * when we've gotten rid of the other general IPIs.
  */
 int
-lwkt_send_ipiq_passive(globaldata_t target, ipifunc_t func, void *arg)
+lwkt_send_ipiq_nowait(globaldata_t target, ipifunc_t func, void *arg)
 {
     lwkt_ipiq_t ip;
     int windex;
@@ -198,22 +287,24 @@
     ++ipiq_count;
     ip = &gd->gd_ipiq[target->gd_cpuid];
 
-    if (ip->ip_windex - ip->ip_rindex >= MAXCPUFIFO - 1) {
+    if (ip->ip_windex - ip->ip_rindex >= MAXCPUFIFO * 3 / 2)
 	return(ENOENT);
-    }
     windex = ip->ip_windex & MAXCPUFIFO_MASK;
     ip->ip_func[windex] = (ipifunc2_t)func;
     ip->ip_arg[windex] = arg;
     cpu_mb1();
     ++ip->ip_windex;
+
     /*
-     * passive mode doesn't work yet :-( 
+     * This isn't a passive IPI, we still have to signal the target cpu.
      */
-#if 1
-    cpu_send_ipiq(target->gd_cpuid);
-#else
-    cpu_send_ipiq_passive(target->gd_cpuid);
-#endif
+    if (atomic_poll_acquire_int(&ip->ip_npoll)) {
+	cpu_send_ipiq(target->gd_cpuid);
+    } else {
+	if (ipiq_optimized == 0)
+	    cpu_send_ipiq(target->gd_cpuid);
+	++ipiq_avoided;
+    }
     return(0);
 }
 
@@ -372,6 +463,14 @@
 	/* YYY memory barrier */
 	ip->ip_xindex = ip->ip_rindex;
     }
+
+    /*
+     * Return non-zero if there are more IPI messages pending on this
+     * ipiq.  ip_npoll is left set as long as possible to reduce the
+     * number of IPIs queued by the originating cpu, but must be cleared
+     * *BEFORE* checking windex.
+     */
+    atomic_poll_release_int(&ip->ip_npoll);
     return(wi != ip->ip_windex);
 }
 
Index: kern/lwkt_token.c
===================================================================
RCS file: /cvs/src/sys/kern/lwkt_token.c,v
retrieving revision 1.11
diff -u -r1.11 lwkt_token.c
--- kern/lwkt_token.c	1 Feb 2005 22:36:26 -0000	1.11
+++ kern/lwkt_token.c	10 Apr 2005 04:57:08 -0000
@@ -142,7 +142,7 @@
 		refs->tr_magic = LWKT_TOKREF_MAGIC2;	/* MP synched slowreq*/
 		refs->tr_reqgd = gd;
 		tok->t_reqcpu = gd;	/* MP unsynchronized 'fast' req */
-		if (lwkt_send_ipiq_passive(dgd, lwkt_reqtoken_remote, refs)) {
+		if (lwkt_send_ipiq_nowait(dgd, lwkt_reqtoken_remote, refs)) {
 		    /* failed */
 		    refs->tr_magic = LWKT_TOKREF_MAGIC1;
 		    break;
@@ -253,15 +253,17 @@
 	 */
 #if defined(MAKE_TOKENS_SPIN)
 	int x = 40000000;
+	int y = 10;
 	crit_enter();
 	while (lwkt_chktokens(td) == 0) {
 	    lwkt_process_ipiq();
 	    lwkt_drain_token_requests();
 	    if (--x == 0) {
 		x = 40000000;
-		printf("CHKTOKEN loop %d\n", gd->gd_cpuid);
+		printf("CHKTOKEN looping on cpu %d\n", gd->gd_cpuid);
 #ifdef _KERNEL
-		Debugger("x");
+		if (--y == 0)
+			panic("CHKTOKEN looping on cpu %d", gd->gd_cpuid);
 #endif
 	    }
 	    splz();
Index: sys/thread.h
===================================================================
RCS file: /cvs/src/sys/sys/thread.h,v
retrieving revision 1.60
diff -u -r1.60 thread.h
--- sys/thread.h	14 Jan 2005 02:20:24 -0000	1.60
+++ sys/thread.h	10 Apr 2005 04:56:12 -0000
@@ -144,7 +144,7 @@
     int		ip_windex;      /* only written by source cpu */
     ipifunc2_t	ip_func[MAXCPUFIFO];
     void	*ip_arg[MAXCPUFIFO];
-    int		ip_npoll;
+    u_int	ip_npoll;	/* synchronization to avoid excess IPIs */
 } lwkt_ipiq;
 
 /*
@@ -374,6 +374,7 @@
 extern void lwkt_setcpu_self(struct globaldata *rgd);
 extern int  lwkt_send_ipiq(struct globaldata *targ, ipifunc_t func, void *arg);
 extern int  lwkt_send_ipiq_passive(struct globaldata *targ, ipifunc_t func, void *arg);
+extern int  lwkt_send_ipiq_nowait(struct globaldata *targ, ipifunc_t func, void *arg);
 extern int  lwkt_send_ipiq_bycpu(int dcpu, ipifunc_t func, void *arg);
 extern int  lwkt_send_ipiq_mask(cpumask_t mask, ipifunc_t func, void *arg);
 extern void lwkt_wait_ipiq(struct globaldata *targ, int seq);




