From 06799a9085e12a778fe2851db550ab5911ad28fe Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 31 Jul 2022 15:41:05 +0300 Subject: net: bonding: replace dev_trans_start() with the jiffies of the last ARP/NS The bonding driver piggybacks on time stamps kept by the network stack for the purpose of the netdev TX watchdog, and this is problematic because it does not work with NETIF_F_LLTX devices. It is hard to say why the driver looks at dev_trans_start() of the slave->dev, considering that this is updated even by non-ARP/NS probes sent by us, and even by traffic not sent by us at all (for example PTP on physical slave devices). ARP monitoring in active-backup mode appears to still work even if we track only the last TX time of actual ARP probes. Signed-off-by: Vladimir Oltean Acked-by: Jay Vosburgh Signed-off-by: Jakub Kicinski --- include/net/bonding.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bonding.h b/include/net/bonding.h index 6e78d657aa05..afd606df149a 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -161,8 +161,9 @@ struct slave { struct net_device *dev; /* first - useful for panic debug */ struct bonding *bond; /* our master */ int delay; - /* all three in jiffies */ + /* all 4 in jiffies */ unsigned long last_link_up; + unsigned long last_tx; unsigned long last_rx; unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS]; s8 link; /* one of BOND_LINK_XXXX */ @@ -540,6 +541,16 @@ static inline unsigned long slave_last_rx(struct bonding *bond, return slave->last_rx; } +static inline void slave_update_last_tx(struct slave *slave) +{ + WRITE_ONCE(slave->last_tx, jiffies); +} + +static inline unsigned long slave_last_tx(struct slave *slave) +{ + return READ_ONCE(slave->last_tx); +} + #ifdef CONFIG_NET_POLL_CONTROLLER static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) -- cgit v1.2.3 From e2dcac2f58f5a95ab092d1da237ffdc0da1832cf Mon Sep 17 00:00:00 2001 From: Jinghao Jia Date: Fri, 29 Jul 2022 20:17:13 +0000 Subject: BPF: Fix potential bad pointer dereference in bpf_sys_bpf() The bpf_sys_bpf() helper function allows an eBPF program to load another eBPF program from within the kernel. In this case the argument union bpf_attr pointer (as well as the insns and license pointers inside) is a kernel address instead of a userspace address (which is the case of a usual bpf() syscall). To make the memory copying process in the syscall work in both cases, bpfptr_t was introduced to wrap around the pointer and distinguish its origin. Specifically, when copying memory contents from a bpfptr_t, a copy_from_user() is performed in case of a userspace address and a memcpy() is performed for a kernel address. This can lead to problems because the in-kernel pointer is never checked for validity. The problem happens when an eBPF syscall program tries to call bpf_sys_bpf() to load a program but provides a bad insns pointer -- say 0xdeadbeef -- in the bpf_attr union. The helper calls __sys_bpf() which would then call bpf_prog_load() to load the program. bpf_prog_load() is responsible for copying the eBPF instructions to the newly allocated memory for the program; it creates a kernel bpfptr_t for insns and invokes copy_from_bpfptr(). Internally, all bpfptr_t operations are backed by the corresponding sockptr_t operations, which performs direct memcpy() on kernel pointers for copy_from/strncpy_from operations. Therefore, the code is always happy to dereference the bad pointer to trigger a un-handle-able page fault and in turn an oops. However, this is not supposed to happen because at that point the eBPF program is already verified and should not cause a memory error. Sample KASAN trace: [ 25.685056][ T228] ================================================================== [ 25.685680][ T228] BUG: KASAN: user-memory-access in copy_from_bpfptr+0x21/0x30 [ 25.686210][ T228] Read of size 80 at addr 00000000deadbeef by task poc/228 [ 25.686732][ T228] [ 25.686893][ T228] CPU: 3 PID: 228 Comm: poc Not tainted 5.19.0-rc7 #7 [ 25.687375][ T228] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS d55cb5a 04/01/2014 [ 25.687991][ T228] Call Trace: [ 25.688223][ T228] [ 25.688429][ T228] dump_stack_lvl+0x73/0x9e [ 25.688747][ T228] print_report+0xea/0x200 [ 25.689061][ T228] ? copy_from_bpfptr+0x21/0x30 [ 25.689401][ T228] ? _printk+0x54/0x6e [ 25.689693][ T228] ? _raw_spin_lock_irqsave+0x70/0xd0 [ 25.690071][ T228] ? copy_from_bpfptr+0x21/0x30 [ 25.690412][ T228] kasan_report+0xb5/0xe0 [ 25.690716][ T228] ? copy_from_bpfptr+0x21/0x30 [ 25.691059][ T228] kasan_check_range+0x2bd/0x2e0 [ 25.691405][ T228] ? copy_from_bpfptr+0x21/0x30 [ 25.691734][ T228] memcpy+0x25/0x60 [ 25.692000][ T228] copy_from_bpfptr+0x21/0x30 [ 25.692328][ T228] bpf_prog_load+0x604/0x9e0 [ 25.692653][ T228] ? cap_capable+0xb4/0xe0 [ 25.692956][ T228] ? security_capable+0x4f/0x70 [ 25.693324][ T228] __sys_bpf+0x3af/0x580 [ 25.693635][ T228] bpf_sys_bpf+0x45/0x240 [ 25.693937][ T228] bpf_prog_f0ec79a5a3caca46_bpf_func1+0xa2/0xbd [ 25.694394][ T228] bpf_prog_run_pin_on_cpu+0x2f/0xb0 [ 25.694756][ T228] bpf_prog_test_run_syscall+0x146/0x1c0 [ 25.695144][ T228] bpf_prog_test_run+0x172/0x190 [ 25.695487][ T228] __sys_bpf+0x2c5/0x580 [ 25.695776][ T228] __x64_sys_bpf+0x3a/0x50 [ 25.696084][ T228] do_syscall_64+0x60/0x90 [ 25.696393][ T228] ? fpregs_assert_state_consistent+0x50/0x60 [ 25.696815][ T228] ? exit_to_user_mode_prepare+0x36/0xa0 [ 25.697202][ T228] ? syscall_exit_to_user_mode+0x20/0x40 [ 25.697586][ T228] ? do_syscall_64+0x6e/0x90 [ 25.697899][ T228] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 25.698312][ T228] RIP: 0033:0x7f6d543fb759 [ 25.698624][ T228] Code: 08 5b 89 e8 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 97 a6 0e 00 f7 d8 64 89 01 48 [ 25.699946][ T228] RSP: 002b:00007ffc3df78468 EFLAGS: 00000287 ORIG_RAX: 0000000000000141 [ 25.700526][ T228] RAX: ffffffffffffffda RBX: 00007ffc3df78628 RCX: 00007f6d543fb759 [ 25.701071][ T228] RDX: 0000000000000090 RSI: 00007ffc3df78478 RDI: 000000000000000a [ 25.701636][ T228] RBP: 00007ffc3df78510 R08: 0000000000000000 R09: 0000000000300000 [ 25.702191][ T228] R10: 0000000000000005 R11: 0000000000000287 R12: 0000000000000000 [ 25.702736][ T228] R13: 00007ffc3df78638 R14: 000055a1584aca68 R15: 00007f6d5456a000 [ 25.703282][ T228] [ 25.703490][ T228] ================================================================== [ 25.704050][ T228] Disabling lock debugging due to kernel taint Update copy_from_bpfptr() and strncpy_from_bpfptr() so that: - for a kernel pointer, it uses the safe copy_from_kernel_nofault() and strncpy_from_kernel_nofault() functions. - for a userspace pointer, it performs copy_from_user() and strncpy_from_user(). Fixes: af2ac3e13e45 ("bpf: Prepare bpf syscall to be used from kernel and user space.") Link: https://lore.kernel.org/bpf/20220727132905.45166-1-jinghao@linux.ibm.com/ Signed-off-by: Jinghao Jia Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20220729201713.88688-1-jinghao@linux.ibm.com Signed-off-by: Alexei Starovoitov --- include/linux/bpfptr.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h index 46e1757d06a3..79b2f78eec1a 100644 --- a/include/linux/bpfptr.h +++ b/include/linux/bpfptr.h @@ -49,7 +49,9 @@ static inline void bpfptr_add(bpfptr_t *bpfptr, size_t val) static inline int copy_from_bpfptr_offset(void *dst, bpfptr_t src, size_t offset, size_t size) { - return copy_from_sockptr_offset(dst, (sockptr_t) src, offset, size); + if (!bpfptr_is_kernel(src)) + return copy_from_user(dst, src.user + offset, size); + return copy_from_kernel_nofault(dst, src.kernel + offset, size); } static inline int copy_from_bpfptr(void *dst, bpfptr_t src, size_t size) @@ -78,7 +80,9 @@ static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len) static inline long strncpy_from_bpfptr(char *dst, bpfptr_t src, size_t count) { - return strncpy_from_sockptr(dst, (sockptr_t) src, count); + if (bpfptr_is_kernel(src)) + return strncpy_from_kernel_nofault(dst, src.kernel, count); + return strncpy_from_user(dst, src.user, count); } #endif /* _LINUX_BPFPTR_H */ -- cgit v1.2.3 From f1d41f7720c89705c20e4335a807b1c518c2e7be Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 2 Aug 2022 18:33:24 +0200 Subject: mptcp, btf: Add struct mptcp_sock definition when CONFIG_MPTCP is disabled The btf_sock_ids array needs struct mptcp_sock BTF ID for the bpf_skc_to_mptcp_sock helper. When CONFIG_MPTCP is disabled, the 'struct mptcp_sock' is not defined and resolve_btfids will complain with: [...] BTFIDS vmlinux WARN: resolve_btfids: unresolved symbol mptcp_sock [...] Add an empty definition for struct mptcp_sock when CONFIG_MPTCP is disabled. Fixes: 3bc253c2e652 ("bpf: Add bpf_skc_to_mptcp_sock_proto") Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Reviewed-by: Mat Martineau Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220802163324.1873044-1-jolsa@kernel.org --- include/net/mptcp.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index ac9cf7271d46..412479ebf5ad 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -291,4 +291,8 @@ struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk); static inline struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { return NULL; } #endif +#if !IS_ENABLED(CONFIG_MPTCP) +struct mptcp_sock { }; +#endif + #endif /* __NET_MPTCP_H */ -- cgit v1.2.3 From 34aae2c2fb1e3d88a5aeee16715cb6bf0336cdce Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Aug 2022 11:25:43 +0200 Subject: netfilter: nf_tables: validate variable length element extension Update template to validate variable length extensions. This patch adds a new .ext_len[id] field to the template to store the expected extension length. This is used to sanity check the initialization of the variable length extension. Use PTR_ERR() in nft_set_elem_init() to report errors since, after this update, there are two reason why this might fail, either because of ENOMEM or insufficient room in the extension field (EINVAL). Kernels up until 7e6bc1f6cabc ("netfilter: nf_tables: stricter validation of element data") allowed to copy more data to the extension than was allocated. This ext_len field allows to validate if the destination has the correct size as additional check. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 8bfb9c74afbf..7ece4fd0cf66 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -651,6 +651,7 @@ extern const struct nft_set_ext_type nft_set_ext_types[]; struct nft_set_ext_tmpl { u16 len; u8 offset[NFT_SET_EXT_NUM]; + u8 ext_len[NFT_SET_EXT_NUM]; }; /** @@ -680,7 +681,8 @@ static inline int nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id, return -EINVAL; tmpl->offset[id] = tmpl->len; - tmpl->len += nft_set_ext_types[id].len + len; + tmpl->ext_len[id] = nft_set_ext_types[id].len + len; + tmpl->len += tmpl->ext_len[id]; return 0; } -- cgit v1.2.3 From 134941683b89d05b5e5c28c817c95049ba409d01 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 6 Aug 2022 17:39:20 +0200 Subject: netfilter: ip6t_LOG: Fix a typo in a comment s/_IPT_LOG_H/_IP6T_LOG_H/ While at it add some surrounding space to ease reading. Signed-off-by: Christophe JAILLET Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_ipv6/ip6t_LOG.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h b/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h index 23e91a9c2583..0b7b16dbdec2 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h +++ b/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h @@ -17,4 +17,4 @@ struct ip6t_log_info { char prefix[30]; }; -#endif /*_IPT_LOG_H*/ +#endif /* _IP6T_LOG_H */ -- cgit v1.2.3 From 341b6941608762d8235f3fd1e45e4d7114ed8c2c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 8 Aug 2022 19:30:06 +0200 Subject: netfilter: nf_tables: upfront validation of data via nft_data_init() Instead of parsing the data and then validate that type and length are correct, pass a description of the expected data so it can be validated upfront before parsing it to bail out earlier. This patch adds a new .size field to specify the maximum size of the data area. The .len field is optional and it is used as an input/output field, it provides the specific length of the expected data in the input path. If then .len field is not specified, then obtained length from the netlink attribute is stored. This is required by cmp, bitwise, range and immediate, which provide no netlink attribute that describes the data length. The immediate expression uses the destination register type to infer the expected data type. Relying on opencoded validation of the expected data might lead to subtle bugs as described in 7e6bc1f6cabc ("netfilter: nf_tables: stricter validation of element data"). Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 7ece4fd0cf66..1554f1e7215b 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -223,11 +223,11 @@ struct nft_ctx { struct nft_data_desc { enum nft_data_types type; + unsigned int size; unsigned int len; }; -int nft_data_init(const struct nft_ctx *ctx, - struct nft_data *data, unsigned int size, +int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla); void nft_data_hold(const struct nft_data *data, enum nft_data_types type); void nft_data_release(const struct nft_data *data, enum nft_data_types type); -- cgit v1.2.3 From f323ef3a0d49e147365284bc1f02212e617b7f09 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 8 Aug 2022 19:30:07 +0200 Subject: netfilter: nf_tables: disallow jump to implicit chain from set element Extend struct nft_data_desc to add a flag field that specifies nft_data_init() is being called for set element data. Use it to disallow jump to implicit chain from set element, only jump to chain via immediate expression is allowed. Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 1554f1e7215b..99aae36c04b9 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -221,10 +221,15 @@ struct nft_ctx { bool report; }; +enum nft_data_desc_flags { + NFT_DATA_DESC_SETELEM = (1 << 0), +}; + struct nft_data_desc { enum nft_data_types type; unsigned int size; unsigned int len; + unsigned int flags; }; int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, -- cgit v1.2.3 From 84b709d31063bacaabaaad1c5d85143150edd695 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 6 Aug 2022 18:02:36 +0200 Subject: ax88796: Fix some typo in a comment s/by caused/be caused/ s/ax88786/ax88796/ Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/7db4b622d2c3e5af58c1d1f32b81836f4af71f18.1659801746.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski --- include/net/ax88796.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ax88796.h b/include/net/ax88796.h index b658471f97f0..303100f08ab8 100644 --- a/include/net/ax88796.h +++ b/include/net/ax88796.h @@ -34,8 +34,8 @@ struct ax_plat_data { const unsigned char *buf, int star_page); void (*block_input)(struct net_device *dev, int count, struct sk_buff *skb, int ring_offset); - /* returns nonzero if a pending interrupt request might by caused by - * the ax88786. Handles all interrupts if set to NULL + /* returns nonzero if a pending interrupt request might be caused by + * the ax88796. Handles all interrupts if set to NULL */ int (*check_irq)(struct platform_device *pdev); }; -- cgit v1.2.3 From f329a0ebeaba4ffe91d431e0ac1ca7f9165872a4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Aug 2022 16:27:40 -0700 Subject: genetlink: correct uAPI defines Commit 50a896cf2d6f ("genetlink: properly support per-op policy dumping") seems to have copy'n'pasted things a little incorrectly. The #define CTRL_ATTR_MCAST_GRP_MAX should have stayed right after the previous enum. The new CTRL_ATTR_POLICY_* needs its own define for MAX and that max should not contain the superfluous _DUMP in the name. We probably can't do anything about the CTRL_ATTR_POLICY_DUMP_MAX any more, there's likely code which uses it. For consistency (*cough* codegen *cough*) let's add the correctly name define nonetheless. Signed-off-by: Jakub Kicinski Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- include/uapi/linux/genetlink.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index d83f214b4134..ddba3ca01e39 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -87,6 +87,8 @@ enum { __CTRL_ATTR_MCAST_GRP_MAX, }; +#define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1) + enum { CTRL_ATTR_POLICY_UNSPEC, CTRL_ATTR_POLICY_DO, @@ -96,7 +98,6 @@ enum { CTRL_ATTR_POLICY_DUMP_MAX = __CTRL_ATTR_POLICY_DUMP_MAX - 1 }; -#define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1) - +#define CTRL_ATTR_POLICY_MAX (__CTRL_ATTR_POLICY_DUMP_MAX - 1) #endif /* _UAPI__LINUX_GENERIC_NETLINK_H */ -- cgit v1.2.3 From 2a0133723f9ebeb751cfce19f74ec07e108bef1f Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Fri, 5 Aug 2022 15:48:34 +0800 Subject: net: fix refcount bug in sk_psock_get (2) Syzkaller reports refcount bug as follows: ------------[ cut here ]------------ refcount_t: saturated; leaking memory. WARNING: CPU: 1 PID: 3605 at lib/refcount.c:19 refcount_warn_saturate+0xf4/0x1e0 lib/refcount.c:19 Modules linked in: CPU: 1 PID: 3605 Comm: syz-executor208 Not tainted 5.18.0-syzkaller-03023-g7e062cda7d90 #0 __refcount_add_not_zero include/linux/refcount.h:163 [inline] __refcount_inc_not_zero include/linux/refcount.h:227 [inline] refcount_inc_not_zero include/linux/refcount.h:245 [inline] sk_psock_get+0x3bc/0x410 include/linux/skmsg.h:439 tls_data_ready+0x6d/0x1b0 net/tls/tls_sw.c:2091 tcp_data_ready+0x106/0x520 net/ipv4/tcp_input.c:4983 tcp_data_queue+0x25f2/0x4c90 net/ipv4/tcp_input.c:5057 tcp_rcv_state_process+0x1774/0x4e80 net/ipv4/tcp_input.c:6659 tcp_v4_do_rcv+0x339/0x980 net/ipv4/tcp_ipv4.c:1682 sk_backlog_rcv include/net/sock.h:1061 [inline] __release_sock+0x134/0x3b0 net/core/sock.c:2849 release_sock+0x54/0x1b0 net/core/sock.c:3404 inet_shutdown+0x1e0/0x430 net/ipv4/af_inet.c:909 __sys_shutdown_sock net/socket.c:2331 [inline] __sys_shutdown_sock net/socket.c:2325 [inline] __sys_shutdown+0xf1/0x1b0 net/socket.c:2343 __do_sys_shutdown net/socket.c:2351 [inline] __se_sys_shutdown net/socket.c:2349 [inline] __x64_sys_shutdown+0x50/0x70 net/socket.c:2349 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 During SMC fallback process in connect syscall, kernel will replaces TCP with SMC. In order to forward wakeup smc socket waitqueue after fallback, kernel will sets clcsk->sk_user_data to origin smc socket in smc_fback_replace_callbacks(). Later, in shutdown syscall, kernel will calls sk_psock_get(), which treats the clcsk->sk_user_data as psock type, triggering the refcnt warning. So, the root cause is that smc and psock, both will use sk_user_data field. So they will mismatch this field easily. This patch solves it by using another bit(defined as SK_USER_DATA_PSOCK) in PTRMASK, to mark whether sk_user_data points to a psock object or not. This patch depends on a PTRMASK introduced in commit f1ff5ce2cd5e ("net, sk_msg: Clear sk_user_data pointer on clone if tagged"). For there will possibly be more flags in the sk_user_data field, this patch also refactor sk_user_data flags code to be more generic to improve its maintainability. Reported-and-tested-by: syzbot+5f26f85569bd179c18ce@syzkaller.appspotmail.com Suggested-by: Jakub Kicinski Acked-by: Wen Gu Signed-off-by: Hawkins Jiawei Reviewed-by: Jakub Sitnicki Signed-off-by: Jakub Kicinski --- include/linux/skmsg.h | 3 ++- include/net/sock.h | 68 ++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 50 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 153b6dec9b6a..48f4b645193b 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -278,7 +278,8 @@ static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start) static inline struct sk_psock *sk_psock(const struct sock *sk) { - return rcu_dereference_sk_user_data(sk); + return __rcu_dereference_sk_user_data_with_flags(sk, + SK_USER_DATA_PSOCK); } static inline void sk_psock_set_state(struct sk_psock *psock, diff --git a/include/net/sock.h b/include/net/sock.h index a7273b289188..05a1bbdf5805 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -545,14 +545,26 @@ enum sk_pacing { SK_PACING_FQ = 2, }; -/* Pointer stored in sk_user_data might not be suitable for copying - * when cloning the socket. For instance, it can point to a reference - * counted object. sk_user_data bottom bit is set if pointer must not - * be copied. +/* flag bits in sk_user_data + * + * - SK_USER_DATA_NOCOPY: Pointer stored in sk_user_data might + * not be suitable for copying when cloning the socket. For instance, + * it can point to a reference counted object. sk_user_data bottom + * bit is set if pointer must not be copied. + * + * - SK_USER_DATA_BPF: Mark whether sk_user_data field is + * managed/owned by a BPF reuseport array. This bit should be set + * when sk_user_data's sk is added to the bpf's reuseport_array. + * + * - SK_USER_DATA_PSOCK: Mark whether pointer stored in + * sk_user_data points to psock type. This bit should be set + * when sk_user_data is assigned to a psock object. */ #define SK_USER_DATA_NOCOPY 1UL -#define SK_USER_DATA_BPF 2UL /* Managed by BPF */ -#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF) +#define SK_USER_DATA_BPF 2UL +#define SK_USER_DATA_PSOCK 4UL +#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\ + SK_USER_DATA_PSOCK) /** * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied @@ -565,24 +577,40 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk) #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) +/** + * __rcu_dereference_sk_user_data_with_flags - return the pointer + * only if argument flags all has been set in sk_user_data. Otherwise + * return NULL + * + * @sk: socket + * @flags: flag bits + */ +static inline void * +__rcu_dereference_sk_user_data_with_flags(const struct sock *sk, + uintptr_t flags) +{ + uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk)); + + WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK); + + if ((sk_user_data & flags) == flags) + return (void *)(sk_user_data & SK_USER_DATA_PTRMASK); + return NULL; +} + #define rcu_dereference_sk_user_data(sk) \ + __rcu_dereference_sk_user_data_with_flags(sk, 0) +#define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags) \ ({ \ - void *__tmp = rcu_dereference(__sk_user_data((sk))); \ - (void *)((uintptr_t)__tmp & SK_USER_DATA_PTRMASK); \ -}) -#define rcu_assign_sk_user_data(sk, ptr) \ -({ \ - uintptr_t __tmp = (uintptr_t)(ptr); \ - WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK); \ - rcu_assign_pointer(__sk_user_data((sk)), __tmp); \ -}) -#define rcu_assign_sk_user_data_nocopy(sk, ptr) \ -({ \ - uintptr_t __tmp = (uintptr_t)(ptr); \ - WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK); \ + uintptr_t __tmp1 = (uintptr_t)(ptr), \ + __tmp2 = (uintptr_t)(flags); \ + WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK); \ + WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK); \ rcu_assign_pointer(__sk_user_data((sk)), \ - __tmp | SK_USER_DATA_NOCOPY); \ + __tmp1 | __tmp2); \ }) +#define rcu_assign_sk_user_data(sk, ptr) \ + __rcu_assign_sk_user_data_with_flags(sk, ptr, 0) static inline struct net *sock_net(const struct sock *sk) -- cgit v1.2.3 From 94ce3b64c62d4b628cf85cd0d9a370aca8f7e43a Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 10 Aug 2022 11:16:02 +0300 Subject: net/tls: Use RCU API to access tls_ctx->netdev Currently, tls_device_down synchronizes with tls_device_resync_rx using RCU, however, the pointer to netdev is stored using WRITE_ONCE and loaded using READ_ONCE. Although such approach is technically correct (rcu_dereference is essentially a READ_ONCE, and rcu_assign_pointer uses WRITE_ONCE to store NULL), using special RCU helpers for pointers is more valid, as it includes additional checks and might change the implementation transparently to the callers. Mark the netdev pointer as __rcu and use the correct RCU helpers to access it. For non-concurrent access pass the right conditions that guarantee safe access (locks taken, refcount value). Also use the correct helper in mlx5e, where even READ_ONCE was missing. The transition to RCU exposes existing issues, fixed by this commit: 1. bond_tls_device_xmit could read netdev twice, and it could become NULL the second time, after the NULL check passed. 2. Drivers shouldn't stop processing the last packet if tls_device_down just set netdev to NULL, before tls_dev_del was called. This prevents a possible packet drop when transitioning to the fallback software mode. Fixes: 89df6a810470 ("net/bonding: Implement TLS TX device offload") Fixes: c55dcdd435aa ("net/tls: Fix use-after-free after the TLS device goes down and up") Signed-off-by: Maxim Mikityanskiy Link: https://lore.kernel.org/r/20220810081602.1435800-1-maximmi@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/tls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index b75b5727abdb..cb205f9d9473 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -237,7 +237,7 @@ struct tls_context { void *priv_ctx_tx; void *priv_ctx_rx; - struct net_device *netdev; + struct net_device __rcu *netdev; /* rw cache line */ struct cipher_context tx; -- cgit v1.2.3 From 5c221f0af68cfa9edcffd26ba6dbbc4b7ddb1700 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Aug 2022 16:20:12 -0700 Subject: net: add missing kdoc for struct genl_multicast_group::flags Multicast group flags were added in commit 4d54cc32112d ("mptcp: avoid lock_fast usage in accept path"), but it missed adding the kdoc. Mention which flags go into that field, and do the same for op structs. Link: https://lore.kernel.org/r/20220809232012.403730-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/genetlink.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 7cb3fa8310ed..56a50e1c51b9 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -11,6 +11,7 @@ /** * struct genl_multicast_group - generic netlink multicast group * @name: name of the multicast group, names are per-family + * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) */ struct genl_multicast_group { char name[GENL_NAMSIZ]; @@ -116,7 +117,7 @@ enum genl_validate_flags { * struct genl_small_ops - generic netlink operations (small version) * @cmd: command identifier * @internal_flags: flags used by the family - * @flags: flags + * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) * @validate: validation flags from enum genl_validate_flags * @doit: standard command callback * @dumpit: callback for dumpers @@ -137,7 +138,7 @@ struct genl_small_ops { * struct genl_ops - generic netlink operations * @cmd: command identifier * @internal_flags: flags used by the family - * @flags: flags + * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) * @maxattr: maximum number of attributes supported * @policy: netlink policy (takes precedence over family policy) * @validate: validation flags from enum genl_validate_flags -- cgit v1.2.3 From c2e75634cbe368065f140dd30bf8b1a0355158fd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 10 Aug 2022 09:45:47 -0700 Subject: net: atm: bring back zatm uAPI Jiri reports that linux-atm does not build without this header. Bring it back. It's completely dead code but we can't break the build for user space :( Reported-by: Jiri Slaby Fixes: 052e1f01bfae ("net: atm: remove support for ZeitNet ZN122x ATM devices") Link: https://lore.kernel.org/all/8576aef3-37e4-8bae-bab5-08f82a78efd3@kernel.org/ Link: https://lore.kernel.org/r/20220810164547.484378-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/atm_zatm.h | 47 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 include/uapi/linux/atm_zatm.h (limited to 'include') diff --git a/include/uapi/linux/atm_zatm.h b/include/uapi/linux/atm_zatm.h new file mode 100644 index 000000000000..5135027b93c1 --- /dev/null +++ b/include/uapi/linux/atm_zatm.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* atm_zatm.h - Driver-specific declarations of the ZATM driver (for use by + driver-specific utilities) */ + +/* Written 1995-1999 by Werner Almesberger, EPFL LRC/ICA */ + + +#ifndef LINUX_ATM_ZATM_H +#define LINUX_ATM_ZATM_H + +/* + * Note: non-kernel programs including this file must also include + * sys/types.h for struct timeval + */ + +#include +#include + +#define ZATM_GETPOOL _IOW('a',ATMIOC_SARPRV+1,struct atmif_sioc) + /* get pool statistics */ +#define ZATM_GETPOOLZ _IOW('a',ATMIOC_SARPRV+2,struct atmif_sioc) + /* get statistics and zero */ +#define ZATM_SETPOOL _IOW('a',ATMIOC_SARPRV+3,struct atmif_sioc) + /* set pool parameters */ + +struct zatm_pool_info { + int ref_count; /* free buffer pool usage counters */ + int low_water,high_water; /* refill parameters */ + int rqa_count,rqu_count; /* queue condition counters */ + int offset,next_off; /* alignment optimizations: offset */ + int next_cnt,next_thres; /* repetition counter and threshold */ +}; + +struct zatm_pool_req { + int pool_num; /* pool number */ + struct zatm_pool_info info; /* actual information */ +}; + +#define ZATM_OAM_POOL 0 /* free buffer pool for OAM cells */ +#define ZATM_AAL0_POOL 1 /* free buffer pool for AAL0 cells */ +#define ZATM_AAL5_POOL_BASE 2 /* first AAL5 free buffer pool */ +#define ZATM_LAST_POOL ZATM_AAL5_POOL_BASE+10 /* max. 64 kB */ + +#define ZATM_TIMER_HISTORY_SIZE 16 /* number of timer adjustments to + record; must be 2^n */ + +#endif -- cgit v1.2.3