atomic 64 bit add for pentium+

Joerg Sonnenberger joerg at britannica.bec.de
Wed Dec 8 19:00:15 PST 2004


Hi all,
as promised on commits@, here is a generic atomic 64-bit add operator for
Pentium+ and the necessary change for gencount_inc. Also attached is
a small hack for cpuperf that was used to produce the numbers below.

The good news is that gencount_inc can be made critical-section free;
the bad news is the performance of cmpxchg8b on the P4. Like so many
other ops, it totally sucks.

My P4 notebook: 115.857nS/loop for cmpxchg8b, compared to 7.517nS/loop
for cmpxchg.
Leaf (AMD64): 6.788nS/loop for cmpxchg8b, compared to 1.293nS/loop for
cmpxchg.

Intel sucks.

Conclusion: The overhead on AMD64 is much lower and seems completely
acceptable; for the P4 it depends. Matt, what's the speed of critical
sections on P4?

I'd like to get some numbers for other processors as well.

Joerg
Index: atomic.h
===================================================================
RCS file: /home/joerg/wd/repository/dragonflybsd/src/sys/i386/include/atomic.h,v
retrieving revision 1.8
diff -u -r1.8 atomic.h
--- atomic.h	29 Jul 2004 20:31:13 -0000	1.8
+++ atomic.h	9 Dec 2004 02:26:30 -0000
@@ -159,4 +159,26 @@
 
 #endif
 
+#if defined(I586_CPU) || defined(I686_CPU)
+/* Atomically add v to the 64 bit value at *p; needs cmpxchg8b (CPUID_CX8). */
+/* A failed cmpxchg8b reloads %edx:%eax, so the sum is rebuilt every pass. */
+static __inline void
+atomic_add_long_long(unsigned long long *p, unsigned long long v)
+{
+	__asm __volatile(
+	    "movl %0, %%eax\n"		/* expected old value, low half */
+	    "\tmovl 4+%0, %%edx\n"	/* expected old value, high half */
+	    "1:\n"
+	    "\tmovl %%eax, %%ebx\n"	/* start the sum from the old value */
+	    "\tmovl %%edx, %%ecx\n"
+	    "\taddl %1, %%ebx\n"	/* add low halves, sets carry */
+	    "\tadcl %2, %%ecx\n"	/* add high halves plus that carry */
+	    "\tlock; cmpxchg8b %0\n"	/* store %ecx:%ebx if *p is unchanged */
+	    "\tjnz 1b"			/* *p changed underneath us: retry */
+	    :
+	    : "m" (*p), "g" ((u_long)v), "g" ((u_long)(v >> 32))
+	    : "memory", "cc", "ax", "bx", "cx", "dx");
+}
+#endif
+
 #endif /* ! _MACHINE_ATOMIC_H_ */
Index: gencount.h
===================================================================
RCS file: /home/joerg/wd/repository/dragonflybsd/src/sys/i386/include/gencount.h,v
retrieving revision 1.1
diff -u -r1.1 gencount.h
--- gencount.h	8 Dec 2004 23:19:51 -0000	1.1
+++ gencount.h	9 Dec 2004 02:39:04 -0000
@@ -41,6 +41,8 @@
 #error "no user-servicable parts inside"
 #endif
 
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
 #include <sys/types.h>
 #include <sys/thread2.h>
 
@@ -60,6 +62,12 @@
 static __inline void
 gencount_inc(gencount_t *gencnt)
 {
+#if defined(I586_CPU) || defined(I686_CPU)
+	if (cpu_feature & CPUID_CX8) {
+		atomic_add_long_long((uint64_t *)gencnt, 1);
+		return;
+	}
+#endif
 	crit_enter();
 	if (++gencnt->high == 0)
 		++gencnt->low;
Index: Makefile
===================================================================
RCS file: /home/joerg/wd/repository/dragonflybsd/src/test/cpuperf/Makefile,v
retrieving revision 1.2
diff -u -r1.2 Makefile
--- Makefile	9 Feb 2004 18:15:35 -0000	1.2
+++ Makefile	9 Dec 2004 02:43:30 -0000
@@ -4,7 +4,7 @@
 
 .PATH: ${.CURDIR}/../sysperf
 TARGETS=/tmp/cpu_add  /tmp/cpu_ladd /tmp/cpu_cmpadd /tmp/cpu_cmpexg \
-	/tmp/cpu_lcmpexg /tmp/cpu_call
+	/tmp/cpu_lcmpexg /tmp/cpu_call /tmp/cpu_cmpexg8b /tmp/cpu_lcmpexg8b
 
 CFLAGS= -O2 -g -I../sysperf
 
@@ -22,9 +22,15 @@
 /tmp/cpu_cmpexg: cputest.c cpu_cmpexg.S blib.c
 	$(CC) $(CFLAGS) ${.ALLSRC} -o ${.TARGET}
 
+/tmp/cpu_cmpexg8b: cputest.c cpu_cmpexg8b.S blib.c
+	$(CC) $(CFLAGS) ${.ALLSRC} -o ${.TARGET}
+
 /tmp/cpu_lcmpexg: cputest.c cpu_lcmpexg.S blib.c
 	$(CC) $(CFLAGS) ${.ALLSRC} -o ${.TARGET}
 
+/tmp/cpu_lcmpexg8b: cputest.c cpu_lcmpexg8b.S blib.c
+	$(CC) $(CFLAGS) ${.ALLSRC} -o ${.TARGET}
+
 /tmp/cpu_call: cputest.c cpu_call.S blib.c
 	$(CC) $(CFLAGS) ${.ALLSRC} -o ${.TARGET}
 
Index: cpu_cmpexg8b.S
===================================================================
RCS file: cpu_cmpexg8b.S
diff -N cpu_cmpexg8b.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ cpu_cmpexg8b.S	9 Dec 2004 02:41:38 -0000
@@ -0,0 +1,31 @@
+/* $DragonFly: src/test/cpuperf/cpu_cmpexg.S,v 1.1 2004/02/09 18:08:54 dillon Exp $ */
+
+	.globl	test_dummy
+	.globl	test_load
+	.globl	test_str
+
+	.p2align 5
+test_dummy:			/* baseline: test_load without the cmpxchg8b */
+	movl	4(%esp),%ecx	/* first stack argument: pointer to scratch memory */
+	movl	$0,%edx
+	movl	$1,%eax
+	movl	%ebx,%edx
+	movl	$0,(%ecx)	/* zero the first 32 bits of the scratch memory */
+	addl	$3,%eax
+	ret
+
+	.p2align 5
+test_load:			/* timed routine: test_dummy plus one cmpxchg8b */
+	movl	4(%esp),%ecx	/* first stack argument: pointer to the 64 bit operand */
+	movl	$0,%edx		/* %edx:%eax = 0:1 is the value compared with memory */
+	movl	$1,%eax
+	cmpxchg8b	(%ecx)	/* instruction under test */
+	movl	%ebx,%edx
+	movl	$0,(%ecx)	/* zero the low 32 bits of the operand */
+	addl	$3,%eax
+	ret
+
+	.p2align 5
+test_str:
+	.asciz	"non-locked cmpxchg8b (successful exchange) in pipeline"
+
Index: cpu_lcmpexg8b.S
===================================================================
RCS file: cpu_lcmpexg8b.S
diff -N cpu_lcmpexg8b.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ cpu_lcmpexg8b.S	9 Dec 2004 02:43:02 -0000
@@ -0,0 +1,31 @@
+/* $DragonFly: src/test/cpuperf/cpu_lcmpexg.S,v 1.1 2004/02/09 18:08:54 dillon Exp $ */
+
+	.globl	test_dummy
+	.globl	test_load
+	.globl	test_str
+
+	.p2align 5
+test_dummy:			/* baseline: test_load without the locked cmpxchg8b */
+	movl	4(%esp),%ecx	/* first stack argument: pointer to scratch memory */
+	movl	$0,%edx
+	movl	$1,%eax
+	movl	%ebx,%edx
+	movl	$0,(%ecx)	/* zero the first 32 bits of the scratch memory */
+	addl	$3,%eax
+	ret
+
+	.p2align 5
+test_load:			/* timed routine: test_dummy plus one locked cmpxchg8b */
+	movl	4(%esp),%ecx	/* first stack argument: pointer to the 64 bit operand */
+	movl	$0,%edx		/* %edx:%eax = 0:1 is the value compared with memory */
+	movl	$1,%eax
+	lock; cmpxchg8b	(%ecx)	/* instruction under test */
+	movl	%ebx,%edx
+	movl	$0,(%ecx)	/* zero the low 32 bits of the operand */
+	addl	$3,%eax
+	ret
+
+	.p2align 5
+test_str:
+	.asciz	"bus-locked cmpxchg8b (successful exchange) in pipeline"
+
Index: cputest.c
===================================================================
RCS file: /home/joerg/wd/repository/dragonflybsd/src/test/cpuperf/cputest.c,v
retrieving revision 1.1
diff -u -r1.1 cputest.c
--- cputest.c	9 Feb 2004 18:08:54 -0000	1.1
+++ cputest.c	9 Dec 2004 02:48:36 -0000
@@ -11,7 +11,7 @@
 
 extern char test_str[];
 
-int junk;
+int junk[10];
 
 int
 main(int ac, char **av)
@@ -24,19 +24,19 @@
     printf("CPUTEST %s\n", test_str);
     start_timing();
     for (i = 0; ; ++i) {
-	test_load(&junk);
+	test_load(junk);
 	if ((i & 65535) == 0 && get_timing() > 1000000)
 	    break;
     }
     ttl = i * 4;
     start_timing();
     for (i = 0; i < ttl; ++i) {
-	test_dummy(&junk);
+	test_dummy(junk);
     }
     us1 = get_timing();
     start_timing();
     for (i = 0; i < ttl; ++i) {
-	test_load(&junk);
+	test_load(junk);
     }
     us2 = get_timing();
     stop_timing2(ttl, us2 - us1, "instruction overhead:");




More information about the Submit mailing list