From f16214c102f0f64b2f3546e989498525bd7b7708 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Mon, 11 Jul 2022 10:12:00 +0200 Subject: bpf: Fix 'dubious one-bit signed bitfield' warnings Our CI[1] reported these warnings when using Sparse: $ touch net/mptcp/bpf.c $ make C=1 net/mptcp/bpf.o net/mptcp/bpf.c: note: in included file: include/linux/bpf_verifier.h:348:26: error: dubious one-bit signed bitfield include/linux/bpf_verifier.h:349:29: error: dubious one-bit signed bitfield Set them as 'unsigned' to avoid warnings. [1] https://github.com/multipath-tcp/mptcp_net-next/actions/runs/2643588487 Fixes: 1ade23711971 ("bpf: Inline calls to bpf_loop when callback is known") Signed-off-by: Matthieu Baerts Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220711081200.2081262-1-matthieu.baerts@tessares.net --- include/linux/bpf_verifier.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 81b19669efba..2e3bad8640dc 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -345,10 +345,10 @@ struct bpf_verifier_state_list { }; struct bpf_loop_inline_state { - int initialized:1; /* set to true upon first entry */ - int fit_for_inline:1; /* true if callback function is the same - * at each call and flags are always zero - */ + unsigned int initialized:1; /* set to true upon first entry */ + unsigned int fit_for_inline:1; /* true if callback function is the same + * at each call and flags are always zero + */ u32 callback_subprogno; /* valid when fit_for_inline is true */ }; -- cgit v1.2.3 From 4201d9ab3e42d9e2a20320b751a931e6239c0df2 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 11 Jul 2022 09:28:27 -0700 Subject: bpf: reparent bpf maps on memcg offlining The memory consumed by a bpf map is always accounted to the memory cgroup of the process which created the map. The map can outlive the memory cgroup if it's used by processes in other cgroups or is pinned on bpffs. In this case the map pins the original cgroup in the dying state. For other types of objects (slab objects, non-slab kernel allocations, percpu objects and recently LRU pages) there is a reparenting process implemented: on cgroup offlining charged objects are getting reassigned to the parent cgroup. Because all charges and statistics are fully recursive it's a fairly cheap operation. For efficiency and consistency with other types of objects, let's do the same for bpf maps. Fortunately thanks to the objcg API, the required changes are minimal. Please, note that individual allocations (slabs, percpu and large kmallocs) already have the reparenting mechanism. This commit adds it to the saved map->memcg pointer by replacing it to map->objcg. Because dying cgroups are not visible for a user and all charges are recursive, this commit doesn't bring any behavior changes for a user. v2: added a missing const qualifier Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Link: https://lore.kernel.org/r/20220711162827.184743-1-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2b21f2a3452f..85a4db3e0536 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -221,7 +221,7 @@ struct bpf_map { u32 btf_vmlinux_value_type_id; struct btf *btf; #ifdef CONFIG_MEMCG_KMEM - struct mem_cgroup *memcg; + struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; struct bpf_map_off_arr *off_arr; -- cgit v1.2.3 From 1d5f82d9dd477d5c66e0214a68c3e4f308eadd6d Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 5 Jul 2022 17:26:12 -0700 Subject: bpf, x86: fix freeing of not-finalized bpf_prog_pack syzbot reported a few issues with bpf_prog_pack [1], [2]. This only happens with multiple subprogs. In jit_subprogs(), we first call bpf_int_jit_compile() on each sub program. And then, we call it on each sub program again. jit_data is not freed in the first call of bpf_int_jit_compile(). Similarly we don't call bpf_jit_binary_pack_finalize() in the first call of bpf_int_jit_compile(). If bpf_int_jit_compile() failed for one sub program, we will call bpf_jit_binary_pack_finalize() for this sub program. However, we don't have a chance to call it for other sub programs. Then we will hit "goto out_free" in jit_subprogs(), and call bpf_jit_free on some subprograms that haven't got bpf_jit_binary_pack_finalize() yet. At this point, bpf_jit_binary_pack_free() is called and the whole 2MB page is freed erroneously. Fix this with a custom bpf_jit_free() for x86_64, which calls bpf_jit_binary_pack_finalize() if necessary. Also, with custom bpf_jit_free(), bpf_prog_aux->use_bpf_prog_pack is not needed any more, remove it. Fixes: 1022a5498f6f ("bpf, x86_64: Use bpf_jit_binary_pack_alloc") [1] https://syzkaller.appspot.com/bug?extid=2f649ec6d2eea1495a8f [2] https://syzkaller.appspot.com/bug?extid=87f65c75f4a72db05445 Reported-by: syzbot+2f649ec6d2eea1495a8f@syzkaller.appspotmail.com Reported-by: syzbot+87f65c75f4a72db05445@syzkaller.appspotmail.com Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20220706002612.4013790-1-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 - include/linux/filter.h | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 85a4db3e0536..a5bf00649995 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1044,7 +1044,6 @@ struct bpf_prog_aux { bool sleepable; bool tail_call_reachable; bool xdp_has_frags; - bool use_bpf_prog_pack; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 4c1a8b247545..a5f21dc3c432 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1027,6 +1027,14 @@ u64 bpf_jit_alloc_exec_limit(void); void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); +struct bpf_binary_header * +bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); + +static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) +{ + return list_empty(&fp->aux->ksym.lnode) || + fp->aux->ksym.lnode.prev == LIST_POISON2; +} struct bpf_binary_header * bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image, -- cgit v1.2.3 From ca2e1a627035002cd33d9667431e80bad90c25fa Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 7 Jul 2022 15:08:42 +0200 Subject: xsk: Mark napi_id on sendmsg() When application runs in busy poll mode and does not receive a single packet but only sends them, it is currently impossible to get into napi_busy_loop() as napi_id is only marked on Rx side in xsk_rcv_check(). In there, napi_id is being taken from xdp_rxq_info carried by xdp_buff. From Tx perspective, we do not have access to it. What we have handy is the xsk pool. Xsk pool works on a pool of internal xdp_buff wrappers called xdp_buff_xsk. AF_XDP ZC enabled drivers call xp_set_rxq_info() so each of xdp_buff_xsk has a valid pointer to xdp_rxq_info of underlying queue. Therefore, on Tx side, napi_id can be pulled from xs->pool->heads[0].xdp.rxq->napi_id. Hide this pointer chase under helper function, xsk_pool_get_napi_id(). Do this only for sockets working in ZC mode as otherwise rxq pointers would not be initialized. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220707130842.49408-1-maciej.fijalkowski@intel.com --- include/net/xdp_sock_drv.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 4aa031849668..4277b0dcee05 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -44,6 +44,15 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, xp_set_rxq_info(pool, rxq); } +static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + return pool->heads[0].xdp.rxq->napi_id; +#else + return 0; +#endif +} + static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { @@ -198,6 +207,11 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, { } +static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) +{ + return 0; +} + static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { -- cgit v1.2.3 From fd1894224407c484f652ad456e1ce423e89bb3eb Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Fri, 15 Jul 2022 19:55:59 +0800 Subject: bpf: Don't redirect packets with invalid pkt_len Syzbot found an issue [1]: fq_codel_drop() try to drop a flow whitout any skbs, that is, the flow->head is null. The root cause, as the [2] says, is because that bpf_prog_test_run_skb() run a bpf prog which redirects empty skbs. So we should determine whether the length of the packet modified by bpf prog or others like bpf_prog_test is valid before forwarding it directly. LINK: [1] https://syzkaller.appspot.com/bug?id=0b84da80c2917757915afa89f7738a9d16ec96c5 LINK: [2] https://www.spinics.net/lists/netdev/msg777503.html Reported-by: syzbot+7a12909485b94426aceb@syzkaller.appspotmail.com Signed-off-by: Zhengchao Shao Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220715115559.139691-1-shaozhengchao@huawei.com Signed-off-by: Alexei Starovoitov --- include/linux/skbuff.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f6a27ab19202..82e8368ba6e6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2459,6 +2459,14 @@ static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) #endif /* NET_SKBUFF_DATA_USES_OFFSET */ +static inline void skb_assert_len(struct sk_buff *skb) +{ +#ifdef CONFIG_DEBUG_NET + if (WARN_ONCE(!skb->len, "%s\n", __func__)) + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); +#endif /* CONFIG_DEBUG_NET */ +} + /* * Add data to an sk_buff */ -- cgit v1.2.3 From bdb2bc7599298ebb677e40fc92b1fa9e69e05098 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 15 Jul 2022 12:38:00 -0700 Subject: bpf: fix bpf_skb_pull_data documentation Fix documentation for bpf_skb_pull_data() helper for when len == 0. Fixes: fa15601ab31e ("bpf: add documentation for eBPF helpers (33-41)") Signed-off-by: Joanne Koong Acked-by: Quentin Monnet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220715193800.3940070-1-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 379e68fb866f..ffcbf79a556b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2361,7 +2361,8 @@ union bpf_attr { * Pull in non-linear data in case the *skb* is non-linear and not * all of *len* are part of the linear section. Make *len* bytes * from *skb* readable and writable. If a zero value is passed for - * *len*, then the whole length of the *skb* is pulled. + * *len*, then all bytes in the linear part of *skb* will be made + * readable and writable. * * This helper is only needed for reading and writing with direct * packet access. -- cgit v1.2.3 From 9cb61fda8c71baaff423fe5be2e609100860fa78 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 20 Jul 2022 08:52:20 -0700 Subject: bpf: Fix bpf_trampoline_{,un}link_cgroup_shim ifdef guards They were updated in kernel/bpf/trampoline.c to fix another build issue. We should to do the same for include/linux/bpf.h header. Fixes: 3908fcddc65d ("bpf: fix lsm_cgroup build errors on esoteric configs") Reported-by: Stephen Rothwell Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220720155220.4087433-1-sdf@google.com --- include/linux/bpf.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a5bf00649995..11950029284f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1254,9 +1254,6 @@ struct bpf_dummy_ops { int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); #endif -int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, - int cgroup_atype); -void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog); #else static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) { @@ -1280,6 +1277,13 @@ static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, { return -EINVAL; } +#endif + +#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) +int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, + int cgroup_atype); +void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog); +#else static inline int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, int cgroup_atype) { -- cgit v1.2.3 From ab21d6063c01180a8e9b22a37b847e5819525d9f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 21 Jul 2022 15:42:33 +0200 Subject: bpf: Introduce 8-byte BTF set Introduce support for defining flags for kfuncs using a new set of macros, BTF_SET8_START/BTF_SET8_END, which define a set which contains 8 byte elements (each of which consists of a pair of BTF ID and flags), using a new BTF_ID_FLAGS macro. This will be used to tag kfuncs registered for a certain program type as acquire, release, sleepable, ret_null, etc. without having to create more and more sets which was proving to be an unscalable solution. Now, when looking up whether a kfunc is allowed for a certain program, we can also obtain its kfunc flags in the same call and avoid further lookups. The resolve_btfids change is split into a separate patch. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220721134245.2450-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 68 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 252a4befeab1..3cb0741e71d7 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -8,6 +8,15 @@ struct btf_id_set { u32 ids[]; }; +struct btf_id_set8 { + u32 cnt; + u32 flags; + struct { + u32 id; + u32 flags; + } pairs[]; +}; + #ifdef CONFIG_DEBUG_INFO_BTF #include /* for __PASTE */ @@ -25,7 +34,7 @@ struct btf_id_set { #define BTF_IDS_SECTION ".BTF_ids" -#define ____BTF_ID(symbol) \ +#define ____BTF_ID(symbol, word) \ asm( \ ".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ ".local " #symbol " ; \n" \ @@ -33,10 +42,11 @@ asm( \ ".size " #symbol ", 4; \n" \ #symbol ": \n" \ ".zero 4 \n" \ +word \ ".popsection; \n"); -#define __BTF_ID(symbol) \ - ____BTF_ID(symbol) +#define __BTF_ID(symbol, word) \ + ____BTF_ID(symbol, word) #define __ID(prefix) \ __PASTE(prefix, __COUNTER__) @@ -46,7 +56,14 @@ asm( \ * to 4 zero bytes. */ #define BTF_ID(prefix, name) \ - __BTF_ID(__ID(__BTF_ID__##prefix##__##name##__)) + __BTF_ID(__ID(__BTF_ID__##prefix##__##name##__), "") + +#define ____BTF_ID_FLAGS(prefix, name, flags) \ + __BTF_ID(__ID(__BTF_ID__##prefix##__##name##__), ".long " #flags "\n") +#define __BTF_ID_FLAGS(prefix, name, flags, ...) \ + ____BTF_ID_FLAGS(prefix, name, flags) +#define BTF_ID_FLAGS(prefix, name, ...) \ + __BTF_ID_FLAGS(prefix, name, ##__VA_ARGS__, 0) /* * The BTF_ID_LIST macro defines pure (unsorted) list @@ -145,10 +162,51 @@ asm( \ ".popsection; \n"); \ extern struct btf_id_set name; +/* + * The BTF_SET8_START/END macros pair defines sorted list of + * BTF IDs and their flags plus its members count, with the + * following layout: + * + * BTF_SET8_START(list) + * BTF_ID_FLAGS(type1, name1, flags) + * BTF_ID_FLAGS(type2, name2, flags) + * BTF_SET8_END(list) + * + * __BTF_ID__set8__list: + * .zero 8 + * list: + * __BTF_ID__type1__name1__3: + * .zero 4 + * .word (1 << 0) | (1 << 2) + * __BTF_ID__type2__name2__5: + * .zero 4 + * .word (1 << 3) | (1 << 1) | (1 << 2) + * + */ +#define __BTF_SET8_START(name, scope) \ +asm( \ +".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ +"." #scope " __BTF_ID__set8__" #name "; \n" \ +"__BTF_ID__set8__" #name ":; \n" \ +".zero 8 \n" \ +".popsection; \n"); + +#define BTF_SET8_START(name) \ +__BTF_ID_LIST(name, local) \ +__BTF_SET8_START(name, local) + +#define BTF_SET8_END(name) \ +asm( \ +".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ +".size __BTF_ID__set8__" #name ", .-" #name " \n" \ +".popsection; \n"); \ +extern struct btf_id_set8 name; + #else #define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; #define BTF_ID(prefix, name) +#define BTF_ID_FLAGS(prefix, name, flags) #define BTF_ID_UNUSED #define BTF_ID_LIST_GLOBAL(name, n) u32 __maybe_unused name[n]; #define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 __maybe_unused name[1]; @@ -156,6 +214,8 @@ extern struct btf_id_set name; #define BTF_SET_START(name) static struct btf_id_set __maybe_unused name = { 0 }; #define BTF_SET_START_GLOBAL(name) static struct btf_id_set __maybe_unused name = { 0 }; #define BTF_SET_END(name) +#define BTF_SET8_START(name) static struct btf_id_set8 __maybe_unused name = { 0 }; +#define BTF_SET8_END(name) static struct btf_id_set8 __maybe_unused name = { 0 }; #endif /* CONFIG_DEBUG_INFO_BTF */ -- cgit v1.2.3 From a4703e3184320d6e15e2bc81d2ccf1c8c883f9d1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 21 Jul 2022 15:42:35 +0200 Subject: bpf: Switch to new kfunc flags infrastructure Instead of populating multiple sets to indicate some attribute and then researching the same BTF ID in them, prepare a single unified BTF set which indicates whether a kfunc is allowed to be called, and also its attributes if any at the same time. Now, only one call is needed to perform the lookup for both kfunc availability and its attributes. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220721134245.2450-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- include/linux/btf.h | 33 ++++++++++----------------------- 2 files changed, 12 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 11950029284f..a97751d845c9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1924,7 +1924,8 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, - struct bpf_reg_state *regs); + struct bpf_reg_state *regs, + u32 kfunc_flags); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, diff --git a/include/linux/btf.h b/include/linux/btf.h index 1bfed7fa0428..6dfc6eaf7f8c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -12,14 +12,11 @@ #define BTF_TYPE_EMIT(type) ((void)(type *)0) #define BTF_TYPE_EMIT_ENUM(enum_val) ((void)enum_val) -enum btf_kfunc_type { - BTF_KFUNC_TYPE_CHECK, - BTF_KFUNC_TYPE_ACQUIRE, - BTF_KFUNC_TYPE_RELEASE, - BTF_KFUNC_TYPE_RET_NULL, - BTF_KFUNC_TYPE_KPTR_ACQUIRE, - BTF_KFUNC_TYPE_MAX, -}; +/* These need to be macros, as the expressions are used in assembler input */ +#define KF_ACQUIRE (1 << 0) /* kfunc is an acquire function */ +#define KF_RELEASE (1 << 1) /* kfunc is a release function */ +#define KF_RET_NULL (1 << 2) /* kfunc returns a pointer that may be NULL */ +#define KF_KPTR_GET (1 << 3) /* kfunc returns reference to a kptr */ struct btf; struct btf_member; @@ -30,16 +27,7 @@ struct btf_id_set; struct btf_kfunc_id_set { struct module *owner; - union { - struct { - struct btf_id_set *check_set; - struct btf_id_set *acquire_set; - struct btf_id_set *release_set; - struct btf_id_set *ret_null_set; - struct btf_id_set *kptr_acquire_set; - }; - struct btf_id_set *sets[BTF_KFUNC_TYPE_MAX]; - }; + struct btf_id_set8 *set; }; struct btf_id_dtor_kfunc { @@ -378,9 +366,9 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); -bool btf_kfunc_id_set_contains(const struct btf *btf, +u32 *btf_kfunc_id_set_contains(const struct btf *btf, enum bpf_prog_type prog_type, - enum btf_kfunc_type type, u32 kfunc_btf_id); + u32 kfunc_btf_id); int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *s); s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); @@ -397,12 +385,11 @@ static inline const char *btf_name_by_offset(const struct btf *btf, { return NULL; } -static inline bool btf_kfunc_id_set_contains(const struct btf *btf, +static inline u32 *btf_kfunc_id_set_contains(const struct btf *btf, enum bpf_prog_type prog_type, - enum btf_kfunc_type type, u32 kfunc_btf_id) { - return false; + return NULL; } static inline int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *s) -- cgit v1.2.3 From 56e948ffc098a780fefb6c1784a3a2c7b81100a1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 21 Jul 2022 15:42:36 +0200 Subject: bpf: Add support for forcing kfunc args to be trusted Teach the verifier to detect a new KF_TRUSTED_ARGS kfunc flag, which means each pointer argument must be trusted, which we define as a pointer that is referenced (has non-zero ref_obj_id) and also needs to have its offset unchanged, similar to how release functions expect their argument. This allows a kfunc to receive pointer arguments unchanged from the result of the acquire kfunc. This is required to ensure that kfunc that operate on some object only work on acquired pointers and not normal PTR_TO_BTF_ID with same type which can be obtained by pointer walking. The restrictions applied to release arguments also apply to trusted arguments. This implies that strict type matching (not deducing type by recursively following members at offset) and OBJ_RELEASE offset checks (ensuring they are zero) are used for trusted pointer arguments. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220721134245.2450-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 6dfc6eaf7f8c..cdb376d53238 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -17,6 +17,38 @@ #define KF_RELEASE (1 << 1) /* kfunc is a release function */ #define KF_RET_NULL (1 << 2) /* kfunc returns a pointer that may be NULL */ #define KF_KPTR_GET (1 << 3) /* kfunc returns reference to a kptr */ +/* Trusted arguments are those which are meant to be referenced arguments with + * unchanged offset. It is used to enforce that pointers obtained from acquire + * kfuncs remain unmodified when being passed to helpers taking trusted args. + * + * Consider + * struct foo { + * int data; + * struct foo *next; + * }; + * + * struct bar { + * int data; + * struct foo f; + * }; + * + * struct foo *f = alloc_foo(); // Acquire kfunc + * struct bar *b = alloc_bar(); // Acquire kfunc + * + * If a kfunc set_foo_data() wants to operate only on the allocated object, it + * will set the KF_TRUSTED_ARGS flag, which will prevent unsafe usage like: + * + * set_foo_data(f, 42); // Allowed + * set_foo_data(f->next, 42); // Rejected, non-referenced pointer + * set_foo_data(&f->next, 42);// Rejected, referenced, but wrong type + * set_foo_data(&b->f, 42); // Rejected, referenced, but bad offset + * + * In the final case, usually for the purposes of type matching, it is deduced + * by looking at the type of the member at the offset, but due to the + * requirement of trusted argument, this deduction will be strict and not done + * for this case. + */ +#define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ struct btf; struct btf_member; -- cgit v1.2.3 From d7e79c97c00ca82dace0e3b645d4b3b02fa273c2 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 21 Jul 2022 15:42:39 +0200 Subject: net: netfilter: Add kfuncs to allocate and insert CT Introduce bpf_xdp_ct_alloc, bpf_skb_ct_alloc and bpf_ct_insert_entry kfuncs in order to insert a new entry from XDP and TC programs. Introduce bpf_nf_ct_tuple_parse utility routine to consolidate common code. We extract out a helper __nf_ct_set_timeout, used by the ctnetlink and nf_conntrack_bpf code, extract it out to nf_conntrack_core, so that nf_conntrack_bpf doesn't need a dependency on CONFIG_NF_CT_NETLINK. Later this helper will be reused as a helper to set timeout of allocated but not yet inserted CT entry. The allocation functions return struct nf_conn___init instead of nf_conn, to distinguish allocated CT from an already inserted or looked up CT. This is later used to enforce restrictions on what kfuncs allocated CT can be used with. Signed-off-by: Lorenzo Bianconi Co-developed-by: Kumar Kartikeya Dwivedi Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220721134245.2450-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/net/netfilter/nf_conntrack_core.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 37866c8386e2..83a60c684e6c 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -84,4 +84,19 @@ void nf_conntrack_lock(spinlock_t *lock); extern spinlock_t nf_conntrack_expect_lock; +/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ + +#if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) || \ + IS_ENABLED(CONFIG_NF_CT_NETLINK)) + +static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout) +{ + if (timeout > INT_MAX) + timeout = INT_MAX; + WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout); +} + +#endif + #endif /* _NF_CONNTRACK_CORE_H */ -- cgit v1.2.3 From 0b3892364431684e883682b85d008979e08d4ce6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 21 Jul 2022 15:42:40 +0200 Subject: net: netfilter: Add kfuncs to set and change CT timeout Introduce bpf_ct_set_timeout and bpf_ct_change_timeout kfunc helpers in order to change nf_conn timeout. This is same as ctnetlink_change_timeout, hence code is shared between both by extracting it out to __nf_ct_change_timeout. It is also updated to return an error when it sees IPS_FIXED_TIMEOUT_BIT bit in ct->status, as that check was missing. It is required to introduce two kfuncs taking nf_conn___init and nf_conn instead of sharing one because KF_TRUSTED_ARGS flag causes strict type checking. This would disallow passing nf_conn___init to kfunc taking nf_conn, and vice versa. We cannot remove the KF_TRUSTED_ARGS flag as we only want to accept refcounted pointers and not e.g. ct->master. Apart from this, bpf_ct_set_timeout is only called for newly allocated CT so it doesn't need to inspect the status field just yet. Sharing the helpers even if it was possible would make timeout setting helper sensitive to order of setting status and timeout after allocation. Hence, bpf_ct_set_* kfuncs are meant to be used on allocated CT, and bpf_ct_change_* kfuncs are meant to be used on inserted or looked up CT entry. Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220721134245.2450-9-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/net/netfilter/nf_conntrack_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 83a60c684e6c..3b0f7d0eebae 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -97,6 +97,8 @@ static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout) WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout); } +int __nf_ct_change_timeout(struct nf_conn *ct, u64 cta_timeout); + #endif #endif /* _NF_CONNTRACK_CORE_H */ -- cgit v1.2.3 From ef69aa3a986ef94f01ce8b5b619f550db54432fe Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 21 Jul 2022 15:42:41 +0200 Subject: net: netfilter: Add kfuncs to set and change CT status Introduce bpf_ct_set_status and bpf_ct_change_status kfunc helpers in order to set nf_conn field of allocated entry or update nf_conn status field of existing inserted entry. Use nf_ct_change_status_common to share the permitted status field changes between netlink and BPF side by refactoring ctnetlink_change_status. It is required to introduce two kfuncs taking nf_conn___init and nf_conn instead of sharing one because KF_TRUSTED_ARGS flag causes strict type checking. This would disallow passing nf_conn___init to kfunc taking nf_conn, and vice versa. We cannot remove the KF_TRUSTED_ARGS flag as we only want to accept refcounted pointers and not e.g. ct->master. Hence, bpf_ct_set_* kfuncs are meant to be used on allocated CT, and bpf_ct_change_* kfuncs are meant to be used on inserted or looked up CT entry. Signed-off-by: Lorenzo Bianconi Co-developed-by: Kumar Kartikeya Dwivedi Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220721134245.2450-10-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/net/netfilter/nf_conntrack_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 3b0f7d0eebae..3cd3a6e631aa 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -98,6 +98,8 @@ static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout) } int __nf_ct_change_timeout(struct nf_conn *ct, u64 cta_timeout); +void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off); +int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status); #endif -- cgit v1.2.3 From e423414375866a399ebbe55ed044acb39846e8bf Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 22 Jul 2022 13:36:05 +0200 Subject: bpf: Fix build error in case of !CONFIG_DEBUG_INFO_BTF BTF_ID_FLAGS macro needs to be able to take 0 or 1 args, so make it a variable argument. BTF_SET8_END is incorrect, it should just be empty. Reported-by: kernel test robot Fixes: ab21d6063c01 ("bpf: Introduce 8-byte BTF set") Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20220722113605.6513-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 3cb0741e71d7..2aea877d644f 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -206,7 +206,7 @@ extern struct btf_id_set8 name; #define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; #define BTF_ID(prefix, name) -#define BTF_ID_FLAGS(prefix, name, flags) +#define BTF_ID_FLAGS(prefix, name, ...) #define BTF_ID_UNUSED #define BTF_ID_LIST_GLOBAL(name, n) u32 __maybe_unused name[n]; #define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 __maybe_unused name[1]; @@ -215,7 +215,7 @@ extern struct btf_id_set8 name; #define BTF_SET_START_GLOBAL(name) static struct btf_id_set __maybe_unused name = { 0 }; #define BTF_SET_END(name) #define BTF_SET8_START(name) static struct btf_id_set8 __maybe_unused name = { 0 }; -#define BTF_SET8_END(name) static struct btf_id_set8 __maybe_unused name = { 0 }; +#define BTF_SET8_END(name) #endif /* CONFIG_DEBUG_INFO_BTF */ -- cgit v1.2.3 From f96f644ab97abeed3f7007c953836a574ce928cc Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 19 Jul 2022 17:21:23 -0700 Subject: ftrace: Add modify_ftrace_direct_multi_nolock This is similar to modify_ftrace_direct_multi, but does not acquire direct_mutex. This is useful when direct_mutex is already locked by the user. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20220720002126.803253-2-song@kernel.org --- include/linux/ftrace.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 979f6bfa2c25..acb35243ce5d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -340,6 +340,7 @@ unsigned long ftrace_find_rec_direct(unsigned long ip); int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); +int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr); #else struct ftrace_ops; @@ -384,6 +385,10 @@ static inline int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned lo { return -ENODEV; } +static inline int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr) +{ + return -ENODEV; +} #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -- cgit v1.2.3 From 53cd885bc5c3ea283cc9c00ca6446c778f00bfba Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 19 Jul 2022 17:21:24 -0700 Subject: ftrace: Allow IPMODIFY and DIRECT ops on the same function IPMODIFY (livepatch) and DIRECT (bpf trampoline) ops are both important users of ftrace. It is necessary to allow them work on the same function at the same time. First, DIRECT ops no longer specify IPMODIFY flag. Instead, DIRECT flag is handled together with IPMODIFY flag in __ftrace_hash_update_ipmodify(). Then, a callback function, ops_func, is added to ftrace_ops. This is used by ftrace core code to understand whether the DIRECT ops can share with an IPMODIFY ops. To share with IPMODIFY ops, the DIRECT ops need to implement the callback function and adjust the direct trampoline accordingly. If DIRECT ops is attached before the IPMODIFY ops, ftrace core code calls ENABLE_SHARE_IPMODIFY_PEER on the DIRECT ops before registering the IPMODIFY ops. If IPMODIFY ops is attached before the DIRECT ops, ftrace core code calls ENABLE_SHARE_IPMODIFY_SELF in __ftrace_hash_update_ipmodify. Owner of the DIRECT ops may return 0 if the DIRECT trampoline can share with IPMODIFY, so error code otherwise. The error code is propagated to register_ftrace_direct_multi so that onwer of the DIRECT trampoline can handle it properly. For more details, please refer to comment before enum ftrace_ops_cmd. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/all/20220602193706.2607681-2-song@kernel.org/ Link: https://lore.kernel.org/all/20220718055449.3960512-1-song@kernel.org/ Link: https://lore.kernel.org/bpf/20220720002126.803253-3-song@kernel.org --- include/linux/ftrace.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index acb35243ce5d..0b61371e287b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -208,6 +208,43 @@ enum { FTRACE_OPS_FL_DIRECT = BIT(17), }; +/* + * FTRACE_OPS_CMD_* commands allow the ftrace core logic to request changes + * to a ftrace_ops. Note, the requests may fail. + * + * ENABLE_SHARE_IPMODIFY_SELF - enable a DIRECT ops to work on the same + * function as an ops with IPMODIFY. Called + * when the DIRECT ops is being registered. + * This is called with both direct_mutex and + * ftrace_lock are locked. + * + * ENABLE_SHARE_IPMODIFY_PEER - enable a DIRECT ops to work on the same + * function as an ops with IPMODIFY. Called + * when the other ops (the one with IPMODIFY) + * is being registered. + * This is called with direct_mutex locked. + * + * DISABLE_SHARE_IPMODIFY_PEER - disable a DIRECT ops to work on the same + * function as an ops with IPMODIFY. Called + * when the other ops (the one with IPMODIFY) + * is being unregistered. + * This is called with direct_mutex locked. + */ +enum ftrace_ops_cmd { + FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF, + FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER, + FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER, +}; + +/* + * For most ftrace_ops_cmd, + * Returns: + * 0 - Success. + * Negative on failure. The return value is dependent on the + * callback. + */ +typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd); + #ifdef CONFIG_DYNAMIC_FTRACE /* The hash used to know what functions callbacks trace */ struct ftrace_ops_hash { @@ -250,6 +287,7 @@ struct ftrace_ops { unsigned long trampoline; unsigned long trampoline_size; struct list_head list; + ftrace_ops_func_t ops_func; #endif }; -- cgit v1.2.3 From 316cba62dfb7878b7353177e6a7da9cc0c979cde Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 19 Jul 2022 17:21:25 -0700 Subject: bpf, x64: Allow to use caller address from stack Currently we call the original function by using the absolute address given at the JIT generation. That's not usable when having trampoline attached to multiple functions, or the target address changes dynamically (in case of live patch). In such cases we need to take the return address from the stack. Adding support to retrieve the original function address from the stack by adding new BPF_TRAMP_F_ORIG_STACK flag for arch_prepare_bpf_trampoline function. Basically we take the return address of the 'fentry' call: function + 0: call fentry # stores 'function + 5' address on stack function + 5: ... The 'function + 5' address will be used as the address for the original function to call. Signed-off-by: Jiri Olsa Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220720002126.803253-4-song@kernel.org --- include/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a97751d845c9..1d22df8bf306 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -751,6 +751,11 @@ struct btf_func_model { /* Return the return value of fentry prog. Only used by bpf_struct_ops. */ #define BPF_TRAMP_F_RET_FENTRY_RET BIT(4) +/* Get original function from stack instead of from provided direct address. + * Makes sense for trampolines with fexit or fmod_ret programs. + */ +#define BPF_TRAMP_F_ORIG_STACK BIT(5) + /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. */ -- cgit v1.2.3 From 00963a2e75a872e5fce4d0115ac2786ec86b57a6 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 19 Jul 2022 17:21:26 -0700 Subject: bpf: Support bpf_trampoline on functions with IPMODIFY (e.g. livepatch) When tracing a function with IPMODIFY ftrace_ops (livepatch), the bpf trampoline must follow the instruction pointer saved on stack. This needs extra handling for bpf trampolines with BPF_TRAMP_F_CALL_ORIG flag. Implement bpf_tramp_ftrace_ops_func and use it for the ftrace_ops used by BPF trampoline. This enables tracing functions with livepatch. This also requires moving bpf trampoline to *_ftrace_direct_mult APIs. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/all/20220602193706.2607681-2-song@kernel.org/ Link: https://lore.kernel.org/bpf/20220720002126.803253-5-song@kernel.org --- include/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1d22df8bf306..20c26aed7896 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -47,6 +47,7 @@ struct kobject; struct mem_cgroup; struct module; struct bpf_func_state; +struct ftrace_ops; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -756,6 +757,11 @@ struct btf_func_model { */ #define BPF_TRAMP_F_ORIG_STACK BIT(5) +/* This trampoline is on a function with another ftrace_ops with IPMODIFY, + * e.g., a live patch. This flag is set and cleared by ftrace call backs, + */ +#define BPF_TRAMP_F_SHARE_IPMODIFY BIT(6) + /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. */ @@ -838,9 +844,11 @@ struct bpf_tramp_image { struct bpf_trampoline { /* hlist for trampoline_table */ struct hlist_node hlist; + struct ftrace_ops *fops; /* serializes access to fields of this trampoline */ struct mutex mutex; refcount_t refcnt; + u32 flags; u64 key; struct { struct btf_func_model model; -- cgit v1.2.3