From 06799a9085e12a778fe2851db550ab5911ad28fe Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 31 Jul 2022 15:41:05 +0300
Subject: net: bonding: replace dev_trans_start() with the jiffies of the last
 ARP/NS

The bonding driver piggybacks on time stamps kept by the network stack
for the purpose of the netdev TX watchdog, and this is problematic
because it does not work with NETIF_F_LLTX devices.

It is hard to say why the driver looks at dev_trans_start() of the
slave->dev, considering that this is updated even by non-ARP/NS probes
sent by us, and even by traffic not sent by us at all (for example PTP
on physical slave devices). ARP monitoring in active-backup mode appears
to still work even if we track only the last TX time of actual ARP
probes.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/bonding.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/bonding.h b/include/net/bonding.h
index 6e78d657aa05..afd606df149a 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -161,8 +161,9 @@ struct slave {
 	struct net_device *dev; /* first - useful for panic debug */
 	struct bonding *bond; /* our master */
 	int    delay;
-	/* all three in jiffies */
+	/* all 4 in jiffies */
 	unsigned long last_link_up;
+	unsigned long last_tx;
 	unsigned long last_rx;
 	unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS];
 	s8     link;		/* one of BOND_LINK_XXXX */
@@ -540,6 +541,16 @@ static inline unsigned long slave_last_rx(struct bonding *bond,
 	return slave->last_rx;
 }
 
+static inline void slave_update_last_tx(struct slave *slave)
+{
+	WRITE_ONCE(slave->last_tx, jiffies);
+}
+
+static inline unsigned long slave_last_tx(struct slave *slave)
+{
+	return READ_ONCE(slave->last_tx);
+}
+
 #ifdef CONFIG_NET_POLL_CONTROLLER
 static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave,
 					 struct sk_buff *skb)
-- 
cgit v1.2.3


From e2dcac2f58f5a95ab092d1da237ffdc0da1832cf Mon Sep 17 00:00:00 2001
From: Jinghao Jia <jinghao@linux.ibm.com>
Date: Fri, 29 Jul 2022 20:17:13 +0000
Subject: BPF: Fix potential bad pointer dereference in bpf_sys_bpf()

The bpf_sys_bpf() helper function allows an eBPF program to load another
eBPF program from within the kernel. In this case the argument union
bpf_attr pointer (as well as the insns and license pointers inside) is a
kernel address instead of a userspace address (which is the case of a
usual bpf() syscall). To make the memory copying process in the syscall
work in both cases, bpfptr_t was introduced to wrap around the pointer
and distinguish its origin. Specifically, when copying memory contents
from a bpfptr_t, a copy_from_user() is performed in case of a userspace
address and a memcpy() is performed for a kernel address.

This can lead to problems because the in-kernel pointer is never checked
for validity. The problem happens when an eBPF syscall program tries to
call bpf_sys_bpf() to load a program but provides a bad insns pointer --
say 0xdeadbeef -- in the bpf_attr union. The helper calls __sys_bpf()
which would then call bpf_prog_load() to load the program.
bpf_prog_load() is responsible for copying the eBPF instructions to the
newly allocated memory for the program; it creates a kernel bpfptr_t for
insns and invokes copy_from_bpfptr(). Internally, all bpfptr_t
operations are backed by the corresponding sockptr_t operations, which
performs direct memcpy() on kernel pointers for copy_from/strncpy_from
operations. Therefore, the code is always happy to dereference the bad
pointer to trigger a un-handle-able page fault and in turn an oops.
However, this is not supposed to happen because at that point the eBPF
program is already verified and should not cause a memory error.

Sample KASAN trace:

[   25.685056][  T228] ==================================================================
[   25.685680][  T228] BUG: KASAN: user-memory-access in copy_from_bpfptr+0x21/0x30
[   25.686210][  T228] Read of size 80 at addr 00000000deadbeef by task poc/228
[   25.686732][  T228]
[   25.686893][  T228] CPU: 3 PID: 228 Comm: poc Not tainted 5.19.0-rc7 #7
[   25.687375][  T228] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS d55cb5a 04/01/2014
[   25.687991][  T228] Call Trace:
[   25.688223][  T228]  <TASK>
[   25.688429][  T228]  dump_stack_lvl+0x73/0x9e
[   25.688747][  T228]  print_report+0xea/0x200
[   25.689061][  T228]  ? copy_from_bpfptr+0x21/0x30
[   25.689401][  T228]  ? _printk+0x54/0x6e
[   25.689693][  T228]  ? _raw_spin_lock_irqsave+0x70/0xd0
[   25.690071][  T228]  ? copy_from_bpfptr+0x21/0x30
[   25.690412][  T228]  kasan_report+0xb5/0xe0
[   25.690716][  T228]  ? copy_from_bpfptr+0x21/0x30
[   25.691059][  T228]  kasan_check_range+0x2bd/0x2e0
[   25.691405][  T228]  ? copy_from_bpfptr+0x21/0x30
[   25.691734][  T228]  memcpy+0x25/0x60
[   25.692000][  T228]  copy_from_bpfptr+0x21/0x30
[   25.692328][  T228]  bpf_prog_load+0x604/0x9e0
[   25.692653][  T228]  ? cap_capable+0xb4/0xe0
[   25.692956][  T228]  ? security_capable+0x4f/0x70
[   25.693324][  T228]  __sys_bpf+0x3af/0x580
[   25.693635][  T228]  bpf_sys_bpf+0x45/0x240
[   25.693937][  T228]  bpf_prog_f0ec79a5a3caca46_bpf_func1+0xa2/0xbd
[   25.694394][  T228]  bpf_prog_run_pin_on_cpu+0x2f/0xb0
[   25.694756][  T228]  bpf_prog_test_run_syscall+0x146/0x1c0
[   25.695144][  T228]  bpf_prog_test_run+0x172/0x190
[   25.695487][  T228]  __sys_bpf+0x2c5/0x580
[   25.695776][  T228]  __x64_sys_bpf+0x3a/0x50
[   25.696084][  T228]  do_syscall_64+0x60/0x90
[   25.696393][  T228]  ? fpregs_assert_state_consistent+0x50/0x60
[   25.696815][  T228]  ? exit_to_user_mode_prepare+0x36/0xa0
[   25.697202][  T228]  ? syscall_exit_to_user_mode+0x20/0x40
[   25.697586][  T228]  ? do_syscall_64+0x6e/0x90
[   25.697899][  T228]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[   25.698312][  T228] RIP: 0033:0x7f6d543fb759
[   25.698624][  T228] Code: 08 5b 89 e8 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 97 a6 0e 00 f7 d8 64 89 01 48
[   25.699946][  T228] RSP: 002b:00007ffc3df78468 EFLAGS: 00000287 ORIG_RAX: 0000000000000141
[   25.700526][  T228] RAX: ffffffffffffffda RBX: 00007ffc3df78628 RCX: 00007f6d543fb759
[   25.701071][  T228] RDX: 0000000000000090 RSI: 00007ffc3df78478 RDI: 000000000000000a
[   25.701636][  T228] RBP: 00007ffc3df78510 R08: 0000000000000000 R09: 0000000000300000
[   25.702191][  T228] R10: 0000000000000005 R11: 0000000000000287 R12: 0000000000000000
[   25.702736][  T228] R13: 00007ffc3df78638 R14: 000055a1584aca68 R15: 00007f6d5456a000
[   25.703282][  T228]  </TASK>
[   25.703490][  T228] ==================================================================
[   25.704050][  T228] Disabling lock debugging due to kernel taint

Update copy_from_bpfptr() and strncpy_from_bpfptr() so that:
 - for a kernel pointer, it uses the safe copy_from_kernel_nofault() and
   strncpy_from_kernel_nofault() functions.
 - for a userspace pointer, it performs copy_from_user() and
   strncpy_from_user().

Fixes: af2ac3e13e45 ("bpf: Prepare bpf syscall to be used from kernel and user space.")
Link: https://lore.kernel.org/bpf/20220727132905.45166-1-jinghao@linux.ibm.com/
Signed-off-by: Jinghao Jia <jinghao@linux.ibm.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20220729201713.88688-1-jinghao@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpfptr.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h
index 46e1757d06a3..79b2f78eec1a 100644
--- a/include/linux/bpfptr.h
+++ b/include/linux/bpfptr.h
@@ -49,7 +49,9 @@ static inline void bpfptr_add(bpfptr_t *bpfptr, size_t val)
 static inline int copy_from_bpfptr_offset(void *dst, bpfptr_t src,
 					  size_t offset, size_t size)
 {
-	return copy_from_sockptr_offset(dst, (sockptr_t) src, offset, size);
+	if (!bpfptr_is_kernel(src))
+		return copy_from_user(dst, src.user + offset, size);
+	return copy_from_kernel_nofault(dst, src.kernel + offset, size);
 }
 
 static inline int copy_from_bpfptr(void *dst, bpfptr_t src, size_t size)
@@ -78,7 +80,9 @@ static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len)
 
 static inline long strncpy_from_bpfptr(char *dst, bpfptr_t src, size_t count)
 {
-	return strncpy_from_sockptr(dst, (sockptr_t) src, count);
+	if (bpfptr_is_kernel(src))
+		return strncpy_from_kernel_nofault(dst, src.kernel, count);
+	return strncpy_from_user(dst, src.user, count);
 }
 
 #endif /* _LINUX_BPFPTR_H */
-- 
cgit v1.2.3


From f1d41f7720c89705c20e4335a807b1c518c2e7be Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Tue, 2 Aug 2022 18:33:24 +0200
Subject: mptcp, btf: Add struct mptcp_sock definition when CONFIG_MPTCP is
 disabled

The btf_sock_ids array needs struct mptcp_sock BTF ID for the
bpf_skc_to_mptcp_sock helper.

When CONFIG_MPTCP is disabled, the 'struct mptcp_sock' is not
defined and resolve_btfids will complain with:

  [...]
  BTFIDS  vmlinux
  WARN: resolve_btfids: unresolved symbol mptcp_sock
  [...]

Add an empty definition for struct mptcp_sock when CONFIG_MPTCP
is disabled.

Fixes: 3bc253c2e652 ("bpf: Add bpf_skc_to_mptcp_sock_proto")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220802163324.1873044-1-jolsa@kernel.org
---
 include/net/mptcp.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index ac9cf7271d46..412479ebf5ad 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -291,4 +291,8 @@ struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk);
 static inline struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { return NULL; }
 #endif
 
+#if !IS_ENABLED(CONFIG_MPTCP)
+struct mptcp_sock { };
+#endif
+
 #endif /* __NET_MPTCP_H */
-- 
cgit v1.2.3


From 34aae2c2fb1e3d88a5aeee16715cb6bf0336cdce Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Aug 2022 11:25:43 +0200
Subject: netfilter: nf_tables: validate variable length element extension

Update template to validate variable length extensions. This patch adds
a new .ext_len[id] field to the template to store the expected extension
length. This is used to sanity check the initialization of the variable
length extension.

Use PTR_ERR() in nft_set_elem_init() to report errors since, after this
update, there are two reason why this might fail, either because of
ENOMEM or insufficient room in the extension field (EINVAL).

Kernels up until 7e6bc1f6cabc ("netfilter: nf_tables: stricter
validation of element data") allowed to copy more data to the extension
than was allocated. This ext_len field allows to validate if the
destination has the correct size as additional check.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 8bfb9c74afbf..7ece4fd0cf66 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -651,6 +651,7 @@ extern const struct nft_set_ext_type nft_set_ext_types[];
 struct nft_set_ext_tmpl {
 	u16	len;
 	u8	offset[NFT_SET_EXT_NUM];
+	u8	ext_len[NFT_SET_EXT_NUM];
 };
 
 /**
@@ -680,7 +681,8 @@ static inline int nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id,
 		return -EINVAL;
 
 	tmpl->offset[id] = tmpl->len;
-	tmpl->len	+= nft_set_ext_types[id].len + len;
+	tmpl->ext_len[id] = nft_set_ext_types[id].len + len;
+	tmpl->len	+= tmpl->ext_len[id];
 
 	return 0;
 }
-- 
cgit v1.2.3


From 134941683b89d05b5e5c28c817c95049ba409d01 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 6 Aug 2022 17:39:20 +0200
Subject: netfilter: ip6t_LOG: Fix a typo in a comment

s/_IPT_LOG_H/_IP6T_LOG_H/

While at it add some surrounding space to ease reading.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter_ipv6/ip6t_LOG.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h b/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h
index 23e91a9c2583..0b7b16dbdec2 100644
--- a/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h
+++ b/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h
@@ -17,4 +17,4 @@ struct ip6t_log_info {
 	char prefix[30];
 };
 
-#endif /*_IPT_LOG_H*/
+#endif /* _IP6T_LOG_H */
-- 
cgit v1.2.3


From 341b6941608762d8235f3fd1e45e4d7114ed8c2c Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 8 Aug 2022 19:30:06 +0200
Subject: netfilter: nf_tables: upfront validation of data via nft_data_init()

Instead of parsing the data and then validate that type and length are
correct, pass a description of the expected data so it can be validated
upfront before parsing it to bail out earlier.

This patch adds a new .size field to specify the maximum size of the
data area. The .len field is optional and it is used as an input/output
field, it provides the specific length of the expected data in the input
path. If then .len field is not specified, then obtained length from the
netlink attribute is stored. This is required by cmp, bitwise, range and
immediate, which provide no netlink attribute that describes the data
length. The immediate expression uses the destination register type to
infer the expected data type.

Relying on opencoded validation of the expected data might lead to
subtle bugs as described in 7e6bc1f6cabc ("netfilter: nf_tables:
stricter validation of element data").

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 7ece4fd0cf66..1554f1e7215b 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -223,11 +223,11 @@ struct nft_ctx {
 
 struct nft_data_desc {
 	enum nft_data_types		type;
+	unsigned int			size;
 	unsigned int			len;
 };
 
-int nft_data_init(const struct nft_ctx *ctx,
-		  struct nft_data *data, unsigned int size,
+int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
 		  struct nft_data_desc *desc, const struct nlattr *nla);
 void nft_data_hold(const struct nft_data *data, enum nft_data_types type);
 void nft_data_release(const struct nft_data *data, enum nft_data_types type);
-- 
cgit v1.2.3


From f323ef3a0d49e147365284bc1f02212e617b7f09 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 8 Aug 2022 19:30:07 +0200
Subject: netfilter: nf_tables: disallow jump to implicit chain from set
 element

Extend struct nft_data_desc to add a flag field that specifies
nft_data_init() is being called for set element data.

Use it to disallow jump to implicit chain from set element, only jump
to chain via immediate expression is allowed.

Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 1554f1e7215b..99aae36c04b9 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -221,10 +221,15 @@ struct nft_ctx {
 	bool				report;
 };
 
+enum nft_data_desc_flags {
+	NFT_DATA_DESC_SETELEM	= (1 << 0),
+};
+
 struct nft_data_desc {
 	enum nft_data_types		type;
 	unsigned int			size;
 	unsigned int			len;
+	unsigned int			flags;
 };
 
 int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
-- 
cgit v1.2.3


From 84b709d31063bacaabaaad1c5d85143150edd695 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 6 Aug 2022 18:02:36 +0200
Subject: ax88796: Fix some typo in a comment

s/by caused/be caused/
s/ax88786/ax88796/

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://lore.kernel.org/r/7db4b622d2c3e5af58c1d1f32b81836f4af71f18.1659801746.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ax88796.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ax88796.h b/include/net/ax88796.h
index b658471f97f0..303100f08ab8 100644
--- a/include/net/ax88796.h
+++ b/include/net/ax88796.h
@@ -34,8 +34,8 @@ struct ax_plat_data {
 			const unsigned char *buf, int star_page);
 	void (*block_input)(struct net_device *dev, int count,
 			struct sk_buff *skb, int ring_offset);
-	/* returns nonzero if a pending interrupt request might by caused by
-	 * the ax88786. Handles all interrupts if set to NULL
+	/* returns nonzero if a pending interrupt request might be caused by
+	 * the ax88796. Handles all interrupts if set to NULL
 	 */
 	int (*check_irq)(struct platform_device *pdev);
 };
-- 
cgit v1.2.3


From f329a0ebeaba4ffe91d431e0ac1ca7f9165872a4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 9 Aug 2022 16:27:40 -0700
Subject: genetlink: correct uAPI defines

Commit 50a896cf2d6f ("genetlink: properly support per-op policy dumping")
seems to have copy'n'pasted things a little incorrectly.

The #define CTRL_ATTR_MCAST_GRP_MAX should have stayed right
after the previous enum. The new CTRL_ATTR_POLICY_* needs
its own define for MAX and that max should not contain the
superfluous _DUMP in the name.

We probably can't do anything about the CTRL_ATTR_POLICY_DUMP_MAX
any more, there's likely code which uses it. For consistency
(*cough* codegen *cough*) let's add the correctly name define
nonetheless.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/genetlink.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h
index d83f214b4134..ddba3ca01e39 100644
--- a/include/uapi/linux/genetlink.h
+++ b/include/uapi/linux/genetlink.h
@@ -87,6 +87,8 @@ enum {
 	__CTRL_ATTR_MCAST_GRP_MAX,
 };
 
+#define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1)
+
 enum {
 	CTRL_ATTR_POLICY_UNSPEC,
 	CTRL_ATTR_POLICY_DO,
@@ -96,7 +98,6 @@ enum {
 	CTRL_ATTR_POLICY_DUMP_MAX = __CTRL_ATTR_POLICY_DUMP_MAX - 1
 };
 
-#define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1)
-
+#define CTRL_ATTR_POLICY_MAX (__CTRL_ATTR_POLICY_DUMP_MAX - 1)
 
 #endif /* _UAPI__LINUX_GENERIC_NETLINK_H */
-- 
cgit v1.2.3


From 2a0133723f9ebeb751cfce19f74ec07e108bef1f Mon Sep 17 00:00:00 2001
From: Hawkins Jiawei <yin31149@gmail.com>
Date: Fri, 5 Aug 2022 15:48:34 +0800
Subject: net: fix refcount bug in sk_psock_get (2)

Syzkaller reports refcount bug as follows:
------------[ cut here ]------------
refcount_t: saturated; leaking memory.
WARNING: CPU: 1 PID: 3605 at lib/refcount.c:19 refcount_warn_saturate+0xf4/0x1e0 lib/refcount.c:19
Modules linked in:
CPU: 1 PID: 3605 Comm: syz-executor208 Not tainted 5.18.0-syzkaller-03023-g7e062cda7d90 #0
 <TASK>
 __refcount_add_not_zero include/linux/refcount.h:163 [inline]
 __refcount_inc_not_zero include/linux/refcount.h:227 [inline]
 refcount_inc_not_zero include/linux/refcount.h:245 [inline]
 sk_psock_get+0x3bc/0x410 include/linux/skmsg.h:439
 tls_data_ready+0x6d/0x1b0 net/tls/tls_sw.c:2091
 tcp_data_ready+0x106/0x520 net/ipv4/tcp_input.c:4983
 tcp_data_queue+0x25f2/0x4c90 net/ipv4/tcp_input.c:5057
 tcp_rcv_state_process+0x1774/0x4e80 net/ipv4/tcp_input.c:6659
 tcp_v4_do_rcv+0x339/0x980 net/ipv4/tcp_ipv4.c:1682
 sk_backlog_rcv include/net/sock.h:1061 [inline]
 __release_sock+0x134/0x3b0 net/core/sock.c:2849
 release_sock+0x54/0x1b0 net/core/sock.c:3404
 inet_shutdown+0x1e0/0x430 net/ipv4/af_inet.c:909
 __sys_shutdown_sock net/socket.c:2331 [inline]
 __sys_shutdown_sock net/socket.c:2325 [inline]
 __sys_shutdown+0xf1/0x1b0 net/socket.c:2343
 __do_sys_shutdown net/socket.c:2351 [inline]
 __se_sys_shutdown net/socket.c:2349 [inline]
 __x64_sys_shutdown+0x50/0x70 net/socket.c:2349
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x46/0xb0
 </TASK>

During SMC fallback process in connect syscall, kernel will
replaces TCP with SMC. In order to forward wakeup
smc socket waitqueue after fallback, kernel will sets
clcsk->sk_user_data to origin smc socket in
smc_fback_replace_callbacks().

Later, in shutdown syscall, kernel will calls
sk_psock_get(), which treats the clcsk->sk_user_data
as psock type, triggering the refcnt warning.

So, the root cause is that smc and psock, both will use
sk_user_data field. So they will mismatch this field
easily.

This patch solves it by using another bit(defined as
SK_USER_DATA_PSOCK) in PTRMASK, to mark whether
sk_user_data points to a psock object or not.
This patch depends on a PTRMASK introduced in commit f1ff5ce2cd5e
("net, sk_msg: Clear sk_user_data pointer on clone if tagged").

For there will possibly be more flags in the sk_user_data field,
this patch also refactor sk_user_data flags code to be more generic
to improve its maintainability.

Reported-and-tested-by: syzbot+5f26f85569bd179c18ce@syzkaller.appspotmail.com
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Hawkins Jiawei <yin31149@gmail.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skmsg.h |  3 ++-
 include/net/sock.h    | 68 ++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 50 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 153b6dec9b6a..48f4b645193b 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -278,7 +278,8 @@ static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start)
 
 static inline struct sk_psock *sk_psock(const struct sock *sk)
 {
-	return rcu_dereference_sk_user_data(sk);
+	return __rcu_dereference_sk_user_data_with_flags(sk,
+							 SK_USER_DATA_PSOCK);
 }
 
 static inline void sk_psock_set_state(struct sk_psock *psock,
diff --git a/include/net/sock.h b/include/net/sock.h
index a7273b289188..05a1bbdf5805 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -545,14 +545,26 @@ enum sk_pacing {
 	SK_PACING_FQ		= 2,
 };
 
-/* Pointer stored in sk_user_data might not be suitable for copying
- * when cloning the socket. For instance, it can point to a reference
- * counted object. sk_user_data bottom bit is set if pointer must not
- * be copied.
+/* flag bits in sk_user_data
+ *
+ * - SK_USER_DATA_NOCOPY:      Pointer stored in sk_user_data might
+ *   not be suitable for copying when cloning the socket. For instance,
+ *   it can point to a reference counted object. sk_user_data bottom
+ *   bit is set if pointer must not be copied.
+ *
+ * - SK_USER_DATA_BPF:         Mark whether sk_user_data field is
+ *   managed/owned by a BPF reuseport array. This bit should be set
+ *   when sk_user_data's sk is added to the bpf's reuseport_array.
+ *
+ * - SK_USER_DATA_PSOCK:       Mark whether pointer stored in
+ *   sk_user_data points to psock type. This bit should be set
+ *   when sk_user_data is assigned to a psock object.
  */
 #define SK_USER_DATA_NOCOPY	1UL
-#define SK_USER_DATA_BPF	2UL	/* Managed by BPF */
-#define SK_USER_DATA_PTRMASK	~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF)
+#define SK_USER_DATA_BPF	2UL
+#define SK_USER_DATA_PSOCK	4UL
+#define SK_USER_DATA_PTRMASK	~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\
+				  SK_USER_DATA_PSOCK)
 
 /**
  * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied
@@ -565,24 +577,40 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk)
 
 #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
 
+/**
+ * __rcu_dereference_sk_user_data_with_flags - return the pointer
+ * only if argument flags all has been set in sk_user_data. Otherwise
+ * return NULL
+ *
+ * @sk: socket
+ * @flags: flag bits
+ */
+static inline void *
+__rcu_dereference_sk_user_data_with_flags(const struct sock *sk,
+					  uintptr_t flags)
+{
+	uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk));
+
+	WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);
+
+	if ((sk_user_data & flags) == flags)
+		return (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
+	return NULL;
+}
+
 #define rcu_dereference_sk_user_data(sk)				\
+	__rcu_dereference_sk_user_data_with_flags(sk, 0)
+#define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags)		\
 ({									\
-	void *__tmp = rcu_dereference(__sk_user_data((sk)));		\
-	(void *)((uintptr_t)__tmp & SK_USER_DATA_PTRMASK);		\
-})
-#define rcu_assign_sk_user_data(sk, ptr)				\
-({									\
-	uintptr_t __tmp = (uintptr_t)(ptr);				\
-	WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK);			\
-	rcu_assign_pointer(__sk_user_data((sk)), __tmp);		\
-})
-#define rcu_assign_sk_user_data_nocopy(sk, ptr)				\
-({									\
-	uintptr_t __tmp = (uintptr_t)(ptr);				\
-	WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK);			\
+	uintptr_t __tmp1 = (uintptr_t)(ptr),				\
+		  __tmp2 = (uintptr_t)(flags);				\
+	WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK);			\
+	WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK);			\
 	rcu_assign_pointer(__sk_user_data((sk)),			\
-			   __tmp | SK_USER_DATA_NOCOPY);		\
+			   __tmp1 | __tmp2);				\
 })
+#define rcu_assign_sk_user_data(sk, ptr)				\
+	__rcu_assign_sk_user_data_with_flags(sk, ptr, 0)
 
 static inline
 struct net *sock_net(const struct sock *sk)
-- 
cgit v1.2.3


From 94ce3b64c62d4b628cf85cd0d9a370aca8f7e43a Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Wed, 10 Aug 2022 11:16:02 +0300
Subject: net/tls: Use RCU API to access tls_ctx->netdev

Currently, tls_device_down synchronizes with tls_device_resync_rx using
RCU, however, the pointer to netdev is stored using WRITE_ONCE and
loaded using READ_ONCE.

Although such approach is technically correct (rcu_dereference is
essentially a READ_ONCE, and rcu_assign_pointer uses WRITE_ONCE to store
NULL), using special RCU helpers for pointers is more valid, as it
includes additional checks and might change the implementation
transparently to the callers.

Mark the netdev pointer as __rcu and use the correct RCU helpers to
access it. For non-concurrent access pass the right conditions that
guarantee safe access (locks taken, refcount value). Also use the
correct helper in mlx5e, where even READ_ONCE was missing.

The transition to RCU exposes existing issues, fixed by this commit:

1. bond_tls_device_xmit could read netdev twice, and it could become
NULL the second time, after the NULL check passed.

2. Drivers shouldn't stop processing the last packet if tls_device_down
just set netdev to NULL, before tls_dev_del was called. This prevents a
possible packet drop when transitioning to the fallback software mode.

Fixes: 89df6a810470 ("net/bonding: Implement TLS TX device offload")
Fixes: c55dcdd435aa ("net/tls: Fix use-after-free after the TLS device goes down and up")
Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Link: https://lore.kernel.org/r/20220810081602.1435800-1-maximmi@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tls.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index b75b5727abdb..cb205f9d9473 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -237,7 +237,7 @@ struct tls_context {
 	void *priv_ctx_tx;
 	void *priv_ctx_rx;
 
-	struct net_device *netdev;
+	struct net_device __rcu *netdev;
 
 	/* rw cache line */
 	struct cipher_context tx;
-- 
cgit v1.2.3


From 5c221f0af68cfa9edcffd26ba6dbbc4b7ddb1700 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 9 Aug 2022 16:20:12 -0700
Subject: net: add missing kdoc for struct genl_multicast_group::flags

Multicast group flags were added in commit 4d54cc32112d ("mptcp: avoid
lock_fast usage in accept path"), but it missed adding the kdoc.

Mention which flags go into that field, and do the same for
op structs.

Link: https://lore.kernel.org/r/20220809232012.403730-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/genetlink.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 7cb3fa8310ed..56a50e1c51b9 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -11,6 +11,7 @@
 /**
  * struct genl_multicast_group - generic netlink multicast group
  * @name: name of the multicast group, names are per-family
+ * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM)
  */
 struct genl_multicast_group {
 	char			name[GENL_NAMSIZ];
@@ -116,7 +117,7 @@ enum genl_validate_flags {
  * struct genl_small_ops - generic netlink operations (small version)
  * @cmd: command identifier
  * @internal_flags: flags used by the family
- * @flags: flags
+ * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM)
  * @validate: validation flags from enum genl_validate_flags
  * @doit: standard command callback
  * @dumpit: callback for dumpers
@@ -137,7 +138,7 @@ struct genl_small_ops {
  * struct genl_ops - generic netlink operations
  * @cmd: command identifier
  * @internal_flags: flags used by the family
- * @flags: flags
+ * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM)
  * @maxattr: maximum number of attributes supported
  * @policy: netlink policy (takes precedence over family policy)
  * @validate: validation flags from enum genl_validate_flags
-- 
cgit v1.2.3


From c2e75634cbe368065f140dd30bf8b1a0355158fd Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 10 Aug 2022 09:45:47 -0700
Subject: net: atm: bring back zatm uAPI

Jiri reports that linux-atm does not build without this header.
Bring it back. It's completely dead code but we can't break
the build for user space :(

Reported-by: Jiri Slaby <jirislaby@kernel.org>
Fixes: 052e1f01bfae ("net: atm: remove support for ZeitNet ZN122x ATM devices")
Link: https://lore.kernel.org/all/8576aef3-37e4-8bae-bab5-08f82a78efd3@kernel.org/
Link: https://lore.kernel.org/r/20220810164547.484378-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/atm_zatm.h | 47 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 include/uapi/linux/atm_zatm.h

(limited to 'include')

diff --git a/include/uapi/linux/atm_zatm.h b/include/uapi/linux/atm_zatm.h
new file mode 100644
index 000000000000..5135027b93c1
--- /dev/null
+++ b/include/uapi/linux/atm_zatm.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* atm_zatm.h - Driver-specific declarations of the ZATM driver (for use by
+		driver-specific utilities) */
+
+/* Written 1995-1999 by Werner Almesberger, EPFL LRC/ICA */
+
+
+#ifndef LINUX_ATM_ZATM_H
+#define LINUX_ATM_ZATM_H
+
+/*
+ * Note: non-kernel programs including this file must also include
+ * sys/types.h for struct timeval
+ */
+
+#include <linux/atmapi.h>
+#include <linux/atmioc.h>
+
+#define ZATM_GETPOOL	_IOW('a',ATMIOC_SARPRV+1,struct atmif_sioc)
+						/* get pool statistics */
+#define ZATM_GETPOOLZ	_IOW('a',ATMIOC_SARPRV+2,struct atmif_sioc)
+						/* get statistics and zero */
+#define ZATM_SETPOOL	_IOW('a',ATMIOC_SARPRV+3,struct atmif_sioc)
+						/* set pool parameters */
+
+struct zatm_pool_info {
+	int ref_count;			/* free buffer pool usage counters */
+	int low_water,high_water;	/* refill parameters */
+	int rqa_count,rqu_count;	/* queue condition counters */
+	int offset,next_off;		/* alignment optimizations: offset */
+	int next_cnt,next_thres;	/* repetition counter and threshold */
+};
+
+struct zatm_pool_req {
+	int pool_num;			/* pool number */
+	struct zatm_pool_info info;	/* actual information */
+};
+
+#define ZATM_OAM_POOL		0	/* free buffer pool for OAM cells */
+#define ZATM_AAL0_POOL		1	/* free buffer pool for AAL0 cells */
+#define ZATM_AAL5_POOL_BASE	2	/* first AAL5 free buffer pool */
+#define ZATM_LAST_POOL	ZATM_AAL5_POOL_BASE+10 /* max. 64 kB */
+
+#define ZATM_TIMER_HISTORY_SIZE	16	/* number of timer adjustments to
+					   record; must be 2^n */
+
+#endif
-- 
cgit v1.2.3