syscall messaging interface API
Matthew Dillon
dillon at apollo.backplane.com
Thu Jul 24 10:55:17 PDT 2003
:> Syscalls speed is critical if you use a hybrid user/kernel space thread
:> blocking/cond-var primitive to do 1:1 threading. Linux's "futex" mechanism
:> exploits and optimizes it to serve the "greater threading good". :)
:>
:> It shouldn't be ignored.
:
:Even in Linux, the contested case is rare. I've found that a
:*busy* login server handles on the order of 10,000 syscalls per
:second, while syscall overhead is on the order of 500 ns. The
:optimized syscall path in Linux 2.4.9 saves about 11% over FreeBSD
:4.X, at the expense of special cases for syscalls with more than
:four arguments and other kludges. So the potential improvement
:we're talking about here is probably well under 0.01%.
:
:Even if you did manage to come up with a program where ``syscall
:throughput'' is significant, batching the calls as Matt suggested
:would provide much better performance benefits than microoptimization.
Right. I really think batching is the solution to these sorts
of specialized situations. For example, if you had a massively
multi-threaded program doing hundreds of thousands of I/O ops a
second it would be quite reasonable for the threading system to
simply build up chains of syscall messages as the threads try to
run them and then dispatch them all to the kernel in one fell
swoop.
The absolute best cooperative-switching code I can write that's
still usable in a real threading system takes about 36ns to
switch between userland threads on a 1.2GHz P4 (around 46ns
on my 1GHz P3). I've included it below as an example. With
such low overhead this means that one can basically use a
userland threading abstraction to treat system calls as normal
blocking entities, but have the threading system queue the
system call messages and then dispatch a whole bunch at once
rather than running them one at a time.
-Matt
Matthew Dillon
<dillon at xxxxxxxxxxxxx>
/*
* sw.S
*/
/*
 * Register save/restore macros used by the switch code. Only the
 * i386 callee-saved registers (%ebx, %esi, %edi, %ebp) are
 * preserved across a cooperative switch; the C ABI already allows
 * the remaining registers to be clobbered by a function call.
 * Three variants are provided; exactly one is enabled via #if.
 */
.text
.globl qswitch, qrestore, qinit, qirestore, qstart
#if 0
/* variant 1 (disabled): pushal/popal save all eight GPRs */
#define PUSHAL pushal /* these are slow */
#define POPAL popal
#endif
#if 1
/* variant 2 (enabled): push/pop just the four preserved registers */
/* call-used only */
#define PUSHAL \
pushl %ebx ; \
pushl %esi ; \
pushl %edi ; \
pushl %ebp
#define POPAL \
popl %ebp ; \
popl %edi ; \
popl %esi ; \
popl %ebx
#endif
#if 0
/* variant 3 (disabled): same registers via explicit esp adjust + moves */
/* call-used only */
#define PUSHAL \
subl $16,%esp ; \
movl %ebx,(%esp) ; \
movl %esi,4(%esp) ; \
movl %edi,8(%esp) ; \
movl %ebp,12(%esp)
#define POPAL \
movl (%esp),%ebx ; \
movl 4(%esp),%esi ; \
movl 8(%esp),%edi ; \
movl 12(%esp),%ebp ; \
addl $16,%esp
#endif
/* qswitch(&olddesc, newdesc) */
/*
 * Cooperatively switch from the current thread to the thread whose
 * descriptor (saved stack pointer) is newdesc. The current thread's
 * callee-saved registers and a resume address (qrestore) are pushed
 * on its own stack, the resulting stack pointer is stored through
 * olddesc, and then the new thread's stack is adopted; the final
 * 'ret' jumps to whatever resume address sits on top of that stack
 * (qrestore for a previously switched-out thread, qirestore for a
 * freshly qinit'ed one).
 */
qswitch:
movl 8(%esp),%eax /* new state */
movl 4(%esp),%edx /* save old state */
PUSHAL /* save callee-saved regs on the old stack */
pushl $qrestore /* resume address for when we are switched back */
movl %esp,(%edx) /* *olddesc = current stack pointer */
movl %eax,%esp /* adopt the new thread's stack */
ret /* jump via the resume address on the new stack */
/* qstart(newdesc) */
/*
 * Enter the first thread: adopt newdesc as the stack pointer and
 * 'ret' into the resume address stored there. Unlike qswitch, the
 * caller's context is NOT saved, so qstart never returns to its
 * caller.
 */
qstart:
movl 4(%esp),%esp
ret
/*
 * Resume point pushed by qswitch: restore the callee-saved
 * registers from the stack and return to the instruction after the
 * thread's original qswitch call.
 */
qrestore:
POPAL
ret
/* newdesc = qinit(topofstk, func, arg) */
/*
 * Build the bootstrap frame for a new thread at the top of its
 * stack and return the descriptor (initial stack pointer) in %eax.
 * Frame layout relative to the returned pointer:
 *    0: qirestore  resume address consumed by qstart/qswitch
 *    4: func       entered when qirestore returns
 *    8: 0          fake return address for func (func must not return)
 *   12: arg        func's single argument, at 4(%esp) on entry
 */
qinit:
movl 4(%esp),%eax /* %eax = topofstk */
subl $16,%eax /* reserve the 4-word bootstrap frame */
movl $qirestore,0(%eax)
movl 8(%esp),%ecx /* %ecx = func */
movl %ecx,4(%eax)
movl $0,8(%eax) /* NULL return address: func must never return */
movl 12(%esp),%ecx /* %ecx = arg */
movl %ecx,12(%eax)
ret /* return eax */
/*
 * Resume point for a thread that has never run: there are no saved
 * registers to pop, so just 'ret' into func using the bootstrap
 * frame that qinit laid out.
 */
qirestore:
ret
/*
* swtest.c
*
* Test best-case userland thread switching speed
*/
/* context-switch primitives implemented in sw.S */
void qswitch(int **oldsw, int *newsw);
void qstart(int *newsw);
int *qinit(void *topstk, void *func, void *arg);
static void do_task1(void *arg);
static void do_task2(void *arg);
/* per-thread descriptors: each holds the thread's saved stack pointer */
int *task1;
int *task2;
/* 16 KB of stack per thread; the top (end) of each array is passed to qinit */
char stk1[16384];
char stk2[16384];
int count; /* switch iterations completed, shared by both threads */
/*
 * Build descriptors for the two benchmark threads and transfer
 * control to the first; qstart replaces the stack and never
 * returns, so execution ends via exit() inside do_task1.
 * NOTE(review): NULL is used with no visible #include (and exit()
 * elsewhere lacks <stdlib.h>) -- presumably tolerated by the 2003
 * compiler; verify before building with a modern toolchain.
 */
int
main(int ac, char **av)
{
/* stacks grow downward, so hand qinit the end of each stack array */
task1 = qinit(stk1 + sizeof(stk1), do_task1, NULL);
task2 = qinit(stk2 + sizeof(stk2), do_task2, NULL);
/*start_timing();*/
qstart(task1);
}
/*
 * Thread 1 body: hand control to task2, count the round trip, and
 * after ten million iterations end the benchmark.
 * NOTE(review): exit() is used without #include <stdlib.h> --
 * relies on an implicit declaration; confirm against the original
 * build environment.
 */
static void
do_task1(void *arg)
{
	while (1) {
		qswitch(&task1, task2);
		++count;
		if (count > 10000000) {
			/*stop_timing(count, "uthread_switch");*/
			exit(0);
		}
	}
}
/*
 * Thread 2 body: bump the shared counter and yield back to task1,
 * forever; do_task1 terminates the process when it has seen enough
 * iterations.
 */
static void
do_task2(void *arg)
{
	while (1) {
		count++;
		qswitch(&task2, task1);
	}
}
More information about the Kernel
mailing list