From cace564f8b6260e806f5e28d7f192fd0e0c603ed Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Sep 2016 10:52:50 -0400 Subject: svcrdma: Tail iovec leaves an orphaned DMA mapping The ctxt's count field is overloaded to mean the number of pages in the ctxt->pages array and the number of SGEs in the ctxt->sge array. Typically these two numbers are the same. However, when an inline RPC reply is constructed from an xdr_buf with a tail iovec, the head and tail often occupy the same page, but each is DMA mapped independently. In that case, ->count equals the number of pages, but it does not equal the number of SGEs. There's one more SGE, for the tail iovec. Hence there is one more DMA mapping than there are pages in the ctxt->pages array. This isn't a real problem until the server's iommu is enabled. Then each RPC reply that has content in that iovec orphans a DMA mapping that consists of real resources. krb5i and krb5p always populate that tail iovec. After a couple million sent krb5i/p RPC replies, the NFS server starts behaving erratically. Reboot is needed to clear the problem. Fixes: 9d11b51ce7c1 ("svcrdma: Fix send_reply() scatter/gather set-up") Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index d6917b896d3a..3584bc8864c4 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -86,6 +86,7 @@ struct svc_rdma_op_ctxt { unsigned long flags; enum dma_data_direction direction; int count; + unsigned int mapped_sges; struct ib_sge sge[RPCSVC_MAXPAGES]; struct page *pages[RPCSVC_MAXPAGES]; }; @@ -193,6 +194,14 @@ struct svcxprt_rdma { #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD +/* Track DMA maps for this transport and context */ +static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt) +{ + ctxt->mapped_sges++; + atomic_inc(&rdma->sc_dma_used); +} + /* svc_rdma_backchannel.c */ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, -- cgit v1.2.3 From 5d48709656584420f31b361c4b1a3ebf1d68b225 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Sep 2016 10:53:07 -0400 Subject: rpcrdma: RDMA/CM private message data structure Introduce data structure used by both client and server to exchange implementation details during RDMA/CM connection establishment. This is an experimental out-of-band exchange between Linux RPC-over-RDMA Version One implementations, replacing the deprecated CCP (see RFC 5666bis). The purpose of this extension is to enable prototyping of features that might be introduced in a subsequent version of RPC-over-RDMA. Suggested by Christoph Hellwig and Devesh Sharma. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/rpc_rdma.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index 3b1ff38f0c37..a7da6bf56610 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -41,6 +41,7 @@ #define _LINUX_SUNRPC_RPC_RDMA_H #include <linux/types.h> +#include <linux/bitops.h> #define RPCRDMA_VERSION 1 #define rpcrdma_version cpu_to_be32(RPCRDMA_VERSION) @@ -129,4 +130,38 @@ enum rpcrdma_proc { #define rdma_done cpu_to_be32(RDMA_DONE) #define rdma_error cpu_to_be32(RDMA_ERROR) +/* + * Private extension to RPC-over-RDMA Version One. + * Message passed during RDMA-CM connection set-up. + * + * Add new fields at the end, and don't permute existing + * fields. + */ +struct rpcrdma_connect_private { + __be32 cp_magic; + u8 cp_version; + u8 cp_flags; + u8 cp_send_size; + u8 cp_recv_size; +} __packed; + +#define rpcrdma_cmp_magic __cpu_to_be32(0xf6ab0e18) + +enum { + RPCRDMA_CMP_VERSION = 1, + RPCRDMA_CMP_F_SND_W_INV_OK = BIT(0), +}; + +static inline u8 +rpcrdma_encode_buffer_size(unsigned int size) +{ + return (size >> 10) - 1; +} + +static inline unsigned int +rpcrdma_decode_buffer_size(u8 val) +{ + return ((unsigned int)val + 1) << 10; +} + #endif /* _LINUX_SUNRPC_RPC_RDMA_H */ -- cgit v1.2.3 From 25d55296dd3eac23adb2ae46b67b65bf73b22fb2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Sep 2016 10:53:23 -0400 Subject: svcrdma: support Remote Invalidation Support Remote Invalidation. A private message is exchanged with the client upon RDMA transport connect that indicates whether Send With Invalidation may be used by the server to send RPC replies. The invalidate_rkey is arbitrarily chosen from among rkeys present in the RPC-over-RDMA header's chunk lists. Send With Invalidate improves performance only when clients can recognize, while processing an RPC reply, that an rkey has already been invalidated. 
That has been submitted as a separate change. In the future, the RPC-over-RDMA protocol might support Remote Invalidation properly. The protocol needs to enable signaling between peers to indicate when Remote Invalidation can be used for each individual RPC. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 3584bc8864c4..cc3ae16eac68 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -137,6 +137,7 @@ struct svcxprt_rdma { int sc_ord; /* RDMA read limit */ int sc_max_sge; int sc_max_sge_rd; /* max sge for read target */ + bool sc_snd_w_inv; /* OK to use Send With Invalidate */ atomic_t sc_sq_count; /* Number of SQ WR on queue */ unsigned int sc_sq_depth; /* Depth of SQ */ -- cgit v1.2.3 From b4c8eb0379c8d3300a54210edd2235fd1e81a8a6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Sep 2016 16:28:26 -0400 Subject: nfs: add a new NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK constant As defined in RFC 5661, section 18.16. Signed-off-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- include/uapi/linux/nfs4.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index 2b871e0858d9..4ae62796bfde 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -39,8 +39,9 @@ #define NFS4_FH_VOL_MIGRATION 0x0004 #define NFS4_FH_VOL_RENAME 0x0008 -#define NFS4_OPEN_RESULT_CONFIRM 0x0002 -#define NFS4_OPEN_RESULT_LOCKTYPE_POSIX 0x0004 +#define NFS4_OPEN_RESULT_CONFIRM 0x0002 +#define NFS4_OPEN_RESULT_LOCKTYPE_POSIX 0x0004 +#define NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK 0x0020 #define NFS4_SHARE_ACCESS_MASK 0x000F #define NFS4_SHARE_ACCESS_READ 0x0001 -- cgit v1.2.3 From 09bb8bfffd29c3dffb72bc2c69a062dfb1ae624c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 4 Aug 2016 10:19:06 +1000 Subject: exportfs: be careful to only return expected errors. When nfsd calls fh_to_dentry, it expects ESTALE or ENOMEM as errors. In particular it can be tempting to return ENOENT, but this is not handled well by nfsd. Rather than requiring strict adherence to error code conventions by filesystems, treat all unexpected error codes the same as ESTALE. This is safest. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields --- include/linux/exportfs.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index b03c0625fa6e..5ab958cdc50b 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -157,12 +157,13 @@ struct fid { * @fh_to_dentry is given a &struct super_block (@sb) and a file handle * fragment (@fh, @fh_len). It should return a &struct dentry which refers * to the same file that the file handle fragment refers to. If it cannot, - * it should return a %NULL pointer if the file was found but no acceptable - * &dentries were available, or an %ERR_PTR error code indicating why it - * couldn't be found (e.g. %ENOENT or %ENOMEM). 
Any suitable dentry can be - * returned including, if necessary, a new dentry created with d_alloc_root. - * The caller can then find any other extant dentries by following the - * d_alias links. + * it should return a %NULL pointer if the file cannot be found, or an + * %ERR_PTR error code of %ENOMEM if a memory allocation failure occurred. + * Any other error code is treated like %NULL, and will cause an %ESTALE error + * for callers of exportfs_decode_fh(). + * Any suitable dentry can be returned including, if necessary, a new dentry + * created with d_alloc_root. The caller can then find any other extant + * dentries by following the d_alias links. * * fh_to_parent: * Same as @fh_to_dentry, except that it returns a pointer to the parent -- cgit v1.2.3