Current stable tag slip status 23-Mar-2005

Thu Mar 24 04:27:21 PST 2005

:dillon wrote @ Wed, 23 Mar 2005 13:21:50 -0800 (PST):
:
:>     NFS TCP connection failure  - still diagnosing
:>         (reported by someone through Jeffrey Hsu)
:
:That someone was probably me.
:
:> 	The report is that a large volume of NFS operations over a TCP
:> 	mount result in the TCP connection dying.  A TCP trace seems to
:> 	show the TCP connection getting a FIN and resetting. 
:> 
:> 	We don't have any further information, so we don't know why the TCP
:> 	connection was closed.  It could have been an nfsd being killed on
:> 	the server, or it could have been something else.
:
:The problems exists since month and also with a kernel from today.
:The system is a VIA Epia (might be the vr0).
:The cp dies with "Input/output error".
:
:
:Andy

    Try this patch.  It took 8 hours to find the bug, but I definitely
    found it.  The patch should fix it but I haven't completely tested
    it.

    What is happening is that the TCP protocol stack is upcalling into
    the NFS RPC parsing code on the NFS server, which then reads the mbuf
    chain directly off the TCP socket's receive buffer and processes it.

    At the same time the TCP socket may be placed on the work queue for
    any number of NFSD threads which may also call into the tcp stack to
    retrieve tcp data and then try to parse out RPC's.

    The bug is that due to a blocking condition in the parsing code,
    nfsrv_getstream(), which occurs when it has to split an mbuf (when
    multiple RPCs are present in the same mbuf, which happens under heavy
    loads), and due to another blocking condition in soreceive() (the 
    sockbuf's lock), it is possible for the NFS server to read portions
    of the TCP stream out of order.  This corrupts the TCP stream and causes
    the NFS server to disconnect it.  The NFS client typically reconnects
    automatically but RPCs can get corrupted before the actual disconnect 
    and cause errors to be returned for filesytem ops issued by the client.

    The NFS code is designed to be capable of massively parallel operations,
    with both TCP and UDP mounts, so it isn't as simple as having a single
    thread reading RPC requests, doing the operation, and writing out
    responses.

    This patch should correct the problem and also pipeline TCP ops a bit
    better.  It will be committed on thursday if my overnight testing
    succeeds.

					-Matt
					Matthew Dillon 
					<dillon at xxxxxxxxxxxxx>


Index: nfs_socket.c
===================================================================
RCS file: /cvs/src/sys/vfs/nfs/nfs_socket.c,v
retrieving revision 1.23
diff -u -r1.23 nfs_socket.c

--- nfs_socket.c	24 Mar 2005 06:44:27 -0000	1.23
+++ nfs_socket.c	24 Mar 2005 12:09:10 -0000
@@ -2054,6 +2054,7 @@
 nfsrv_rcv(struct socket *so, void *arg, int waitflag)
 {
 	struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
+	struct nfsrv_rec *rec;
 	struct mbuf *m;
 	struct mbuf *mp;
 	struct sockaddr *nam;
@@ -2062,25 +2063,49 @@
 
 	if ((slp->ns_flag & SLP_VALID) == 0)
 		return;
-#ifdef notdef
+
 	/*
-	 * Define this to test for nfsds handling this under heavy load.
+	 * Only allow two completed RPC records to build up before we
+	 * stop reading data from the socket.  Otherwise we could eat
+	 * an infinite amount of memory parsing and queueing up RPC
+	 * requests.  This should give pretty good feedback to the TCP
+	 * layer and prevents a memory crunch for other protocols.
+	 *
+	 * Note that the same service socket can be dispatched to several
+	 * nfs servers simultaniously.
+	 *
+	 * the tcp protocol callback calls us with MB_DONTWAIT.  
+	 * nfsd calls us with MB_WAIT (typically).
 	 */
-	if (waitflag == MB_DONTWAIT) {
-		slp->ns_flag |= SLP_NEEDQ; goto dorecs;
+	rec = STAILQ_FIRST(&slp->ns_rec);
+	if (rec && waitflag == MB_DONTWAIT && STAILQ_NEXT(rec, nr_link)) {
+		slp->ns_flag |= SLP_NEEDQ;
+		goto dorecs;
 	}
-#endif
+
+	/*
+	 * Handle protocol specifics to parse an RPC request.  We always
+	 * pull from the socket using non-blocking I/O.
+	 */
 	auio.uio_td = NULL;
 	if (so->so_type == SOCK_STREAM) {
 		/*
-		 * If there are already records on the queue, defer soreceive()
-		 * to an nfsd so that there is feedback to the TCP layer that
-		 * the nfs servers are heavily loaded.
+		 * The data has to be read in an orderly fashion from a TCP
+		 * stream, unlike a UDP socket.  It is possible for soreceive
+		 * and/or nfsrv_getstream() to block, so make sure only one
+		 * entity is messing around with the TCP stream at any given
+		 * moment.  The receive sockbuf's lock in soreceive is not
+		 * sufficient.
+		 *
+		 * Note that this procedure can be called from any number of
+		 * NFS severs *OR* can be upcalled directly from a TCP
+		 * protocol thread.
 		 */
-		if (STAILQ_FIRST(&slp->ns_rec) && waitflag == MB_DONTWAIT) {
+		if (slp->ns_flag & SLP_GETSTREAM) {
 			slp->ns_flag |= SLP_NEEDQ;
 			goto dorecs;
 		}
+		slp->ns_flag |= SLP_GETSTREAM;
 
 		/*
 		 * Do soreceive().
@@ -2093,6 +2118,7 @@
 				slp->ns_flag |= SLP_NEEDQ;
 			else
 				slp->ns_flag |= SLP_DISCONN;
+			slp->ns_flag &= ~SLP_GETSTREAM;
 			goto dorecs;
 		}
 		m = mp;
@@ -2108,7 +2134,8 @@
 		slp->ns_rawend = m;
 
 		/*
-		 * Now try and parse record(s) out of the raw stream data.
+		 * Now try and parse as many record(s) as we can out of the
+		 * raw stream data.
 		 */
 		error = nfsrv_getstream(slp, waitflag);
 		if (error) {
@@ -2117,7 +2144,12 @@
 			else
 				slp->ns_flag |= SLP_NEEDQ;
 		}
+		slp->ns_flag &= ~SLP_GETSTREAM;
 	} else {
+		/*
+		 * For UDP soreceive typically pulls just one packet, loop
+		 * to get the whole batch.
+		 */
 		do {
 			auio.uio_resid = 1000000000;
 			flags = MSG_DONTWAIT;
@@ -2151,13 +2183,17 @@
 	}
 
 	/*
-	 * Now try and process the request records, non-blocking.
+	 * If we were upcalled from the tcp protocol layer and we have
+	 * fully parsed records ready to go, or there is new data pending,
+	 * or something went wrong, try to wake up an nfsd thread to deal
+	 * with it.
 	 */
 dorecs:
 	if (waitflag == MB_DONTWAIT &&
-		(STAILQ_FIRST(&slp->ns_rec)
-		 || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
+	    (STAILQ_FIRST(&slp->ns_rec)
+	     || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)))) {
 		nfsrv_wakenfsd(slp);
+	}
 }
 
 /*
@@ -2174,15 +2210,10 @@
 	struct mbuf *om, *m2, *recm;
 	u_int32_t recmark;
 
-	if (slp->ns_flag & SLP_GETSTREAM)
-		panic("nfs getstream");
-	slp->ns_flag |= SLP_GETSTREAM;
 	for (;;) {
 	    if (slp->ns_reclen == 0) {
-		if (slp->ns_cc < NFSX_UNSIGNED) {
-			slp->ns_flag &= ~SLP_GETSTREAM;
+		if (slp->ns_cc < NFSX_UNSIGNED)
 			return (0);
-		}
 		m = slp->ns_raw;
 		if (m->m_len >= NFSX_UNSIGNED) {
 			bcopy(mtod(m, caddr_t), (caddr_t)&recmark, NFSX_UNSIGNED);
@@ -2212,7 +2243,6 @@
 			log(LOG_ERR, "%s (%d) from nfs client\n",
 			    "impossible packet length",
 			    slp->ns_reclen);
-			slp->ns_flag &= ~SLP_GETSTREAM;
 			return (EPERM);
 		}
 	    }
@@ -2247,7 +2277,6 @@
 					m->m_len -= slp->ns_reclen - len;
 					len = slp->ns_reclen;
 				} else {
-					slp->ns_flag &= ~SLP_GETSTREAM;
 					return (EWOULDBLOCK);
 				}
 			} else if ((len + m->m_len) == slp->ns_reclen) {
@@ -2266,7 +2295,6 @@
 		slp->ns_cc -= len;
 		slp->ns_reclen = 0;
 	    } else {
-		slp->ns_flag &= ~SLP_GETSTREAM;
 		return (0);
 	    }