From 2cd008522707a59bf38c1f45d5c654eddbb86c20 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Mon, 30 May 2022 17:28:10 +0800 Subject: bpf: Unify data extension operation of jited_ksyms and jited_linfo We found that 32-bit environment can not print BPF line info due to a data inconsistency between jited_ksyms[0] and jited_linfo[0]. For example: jited_kyms[0] = 0xb800067c, jited_linfo[0] = 0xffffffffb800067c We know that both of them store BPF func address, but due to the different data extension operations when extended to u64, they may not be the same. We need to unify the data extension operations of them. Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/CAEf4BzZ-eDcdJZgJ+Np7Y=V-TVjDDvOMqPwzKjyWrh=i5juv4w@mail.gmail.com Link: https://lore.kernel.org/bpf/20220530092815.1112406-2-pulehui@huawei.com --- kernel/bpf/syscall.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2b69306d3c6e..aeb31137b2ed 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4090,14 +4090,15 @@ static int bpf_prog_get_info_by_fd(struct file *file, info.nr_jited_line_info = 0; if (info.nr_jited_line_info && ulen) { if (bpf_dump_raw_ok(file->f_cred)) { + unsigned long line_addr; __u64 __user *user_linfo; u32 i; user_linfo = u64_to_user_ptr(info.jited_line_info); ulen = min_t(u32, info.nr_jited_line_info, ulen); for (i = 0; i < ulen; i++) { - if (put_user((__u64)(long)prog->aux->jited_linfo[i], - &user_linfo[i])) + line_addr = (unsigned long)prog->aux->jited_linfo[i]; + if (put_user((__u64)line_addr, &user_linfo[i])) return -EFAULT; } } else { -- cgit v1.2.3 From cc1685546df87d9872e1ccef5bf56ac5262be0b1 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Mon, 30 May 2022 17:28:12 +0800 Subject: bpf: Correct the comment about insn_to_jit_off The insn_to_jit_off passed to bpf_prog_fill_jited_linfo should be the first byte of the next instruction, or the byte off to the end of the current instruction. Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220530092815.1112406-4-pulehui@huawei.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5f6f3f829b36..e78cc5eea4a5 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -176,7 +176,7 @@ void bpf_prog_jit_attempt_done(struct bpf_prog *prog) * here is relative to the prog itself instead of the main prog. * This array has one entry for each xlated bpf insn. * - * jited_off is the byte off to the last byte of the jited insn. + * jited_off is the byte off to the end of the jited insn. * * Hence, with * insn_start: -- cgit v1.2.3 From 4c46091ee985ae84c60c5e95055d779fcd291d87 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Tue, 17 May 2022 11:04:20 -0700 Subject: bpf: Fix KASAN use-after-free Read in compute_effective_progs Syzbot found a Use After Free bug in compute_effective_progs(). The reproducer creates a number of BPF links, and causes a fault injected alloc to fail, while calling bpf_link_detach on them. Link detach triggers the link to be freed by bpf_link_free(), which calls __cgroup_bpf_detach() and update_effective_progs(). If the memory allocation in this function fails, the function restores the pointer to the bpf_cgroup_link on the cgroup list, but the memory gets freed just after it returns. After this, every subsequent call to update_effective_progs() causes this already deallocated pointer to be dereferenced in prog_list_length(), and triggers KASAN UAF error. To fix this issue don't preserve the pointer to the prog or link in the list, but remove it and replace it with a dummy prog without shrinking the table. The subsequent call to __cgroup_bpf_detach() or __cgroup_bpf_detach() will correct it. Fixes: af6eea57437a ("bpf: Implement bpf_link-based cgroup BPF program attachment") Reported-by: Signed-off-by: Tadeusz Struk Signed-off-by: Andrii Nakryiko Cc: Link: https://syzkaller.appspot.com/bug?id=8ebf179a95c2a2670f7cf1ba62429ec044369db4 Link: https://lore.kernel.org/bpf/20220517180420.87954-1-tadeusz.struk@linaro.org --- kernel/bpf/cgroup.c | 70 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 10 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index afb414b26d01..7a394f7c205c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -720,6 +720,60 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, return ERR_PTR(-ENOENT); } +/** + * purge_effective_progs() - After compute_effective_progs fails to alloc new + * cgrp->bpf.inactive table we can recover by + * recomputing the array in place. + * + * @cgrp: The cgroup which descendants to travers + * @prog: A program to detach or NULL + * @link: A link to detach or NULL + * @atype: Type of detach operation + */ +static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, + struct bpf_cgroup_link *link, + enum cgroup_bpf_attach_type atype) +{ + struct cgroup_subsys_state *css; + struct bpf_prog_array *progs; + struct bpf_prog_list *pl; + struct list_head *head; + struct cgroup *cg; + int pos; + + /* recompute effective prog array in place */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + if (percpu_ref_is_zero(&desc->bpf.refcnt)) + continue; + + /* find position of link or prog in effective progs array */ + for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { + if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) + continue; + + head = &cg->bpf.progs[atype]; + list_for_each_entry(pl, head, node) { + if (!prog_list_prog(pl)) + continue; + if (pl->prog == prog && pl->link == link) + goto found; + pos++; + } + } +found: + BUG_ON(!cg); + progs = rcu_dereference_protected( + desc->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + + /* Remove the program from the array */ + WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos), + "Failed to purge a prog from array at index %d", pos); + } +} + /** * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and * propagate the change to descendants @@ -739,7 +793,6 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog_list *pl; struct list_head *progs; u32 flags; - int err; atype = to_cgroup_bpf_attach_type(type); if (atype < 0) @@ -761,9 +814,12 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = NULL; pl->link = NULL; - err = update_effective_progs(cgrp, atype); - if (err) - goto cleanup; + if (update_effective_progs(cgrp, atype)) { + /* if update effective array failed replace the prog with a dummy prog*/ + pl->prog = old_prog; + pl->link = link; + purge_effective_progs(cgrp, old_prog, link, atype); + } /* now can actually delete it from this cgroup list */ list_del(&pl->node); @@ -775,12 +831,6 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0; - -cleanup: - /* restore back prog or link */ - pl->prog = old_prog; - pl->link = link; - return err; } static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, -- cgit v1.2.3 From 6089fb325cf737eeb2c4d236c94697112ca860da Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:26:00 -0700 Subject: bpf: Add btf enum64 support Currently, BTF only supports upto 32bit enum value with BTF_KIND_ENUM. But in kernel, some enum indeed has 64bit values, e.g., in uapi bpf.h, we have enum { BPF_F_INDEX_MASK = 0xffffffffULL, BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, BPF_F_CTXLEN_MASK = (0xfffffULL << 32), }; In this case, BTF_KIND_ENUM will encode the value of BPF_F_CTXLEN_MASK as 0, which certainly is incorrect. This patch added a new btf kind, BTF_KIND_ENUM64, which permits 64bit value to cover the above use case. The BTF_KIND_ENUM64 has the following three fields followed by the common type: struct bpf_enum64 { __u32 nume_off; __u32 val_lo32; __u32 val_hi32; }; Currently, btf type section has an alignment of 4 as all element types are u32. Representing the value with __u64 will introduce a pad for bpf_enum64 and may also introduce misalignment for the 64bit value. Hence, two members of val_hi32 and val_lo32 are chosen to avoid these issues. The kflag is also introduced for BTF_KIND_ENUM and BTF_KIND_ENUM64 to indicate whether the value is signed or unsigned. The kflag intends to provide consistent output of BTF C fortmat with the original source code. For example, the original BTF_KIND_ENUM bit value is 0xffffffff. The format C has two choices, printing out 0xffffffff or -1 and current libbpf prints out as unsigned value. But if the signedness is preserved in btf, the value can be printed the same as the original source code. The kflag value 0 means unsigned values, which is consistent to the default by libbpf and should also cover most cases as well. The new BTF_KIND_ENUM64 is intended to support the enum value represented as 64bit value. But it can represent all BTF_KIND_ENUM values as well. The compiler ([1]) and pahole will generate BTF_KIND_ENUM64 only if the value has to be represented with 64 bits. In addition, a static inline function btf_kind_core_compat() is introduced which will be used later when libbpf relo_core.c changed. Here the kernel shares the same relo_core.c with libbpf. [1] https://reviews.llvm.org/D124641 Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062600.3716578-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 142 +++++++++++++++++++++++++++++++++++++++++++++----- kernel/bpf/verifier.c | 2 +- 2 files changed, 129 insertions(+), 15 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7bccaa4646e5..6c0d8480e15c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -309,6 +309,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FLOAT] = "FLOAT", [BTF_KIND_DECL_TAG] = "DECL_TAG", [BTF_KIND_TYPE_TAG] = "TYPE_TAG", + [BTF_KIND_ENUM64] = "ENUM64", }; const char *btf_type_str(const struct btf_type *t) @@ -666,6 +667,7 @@ static bool btf_type_has_size(const struct btf_type *t) case BTF_KIND_ENUM: case BTF_KIND_DATASEC: case BTF_KIND_FLOAT: + case BTF_KIND_ENUM64: return true; } @@ -711,6 +713,11 @@ static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t) return (const struct btf_decl_tag *)(t + 1); } +static const struct btf_enum64 *btf_type_enum64(const struct btf_type *t) +{ + return (const struct btf_enum64 *)(t + 1); +} + static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; @@ -1019,6 +1026,7 @@ static const char *btf_show_name(struct btf_show *show) parens = "{"; break; case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: prefix = "enum"; break; default: @@ -1834,6 +1842,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type, case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_FLOAT: + case BTF_KIND_ENUM64: size = type->size; goto resolved; @@ -3670,6 +3679,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, { const struct btf_enum *enums = btf_type_enum(t); struct btf *btf = env->btf; + const char *fmt_str; u16 i, nr_enums; u32 meta_needed; @@ -3683,11 +3693,6 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_kflag(t)) { - btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); - return -EINVAL; - } - if (t->size > 8 || !is_power_of_2(t->size)) { btf_verifier_log_type(env, t, "Unexpected size"); return -EINVAL; @@ -3718,7 +3723,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, if (env->log.level == BPF_LOG_KERNEL) continue; - btf_verifier_log(env, "\t%s val=%d\n", + fmt_str = btf_type_kflag(t) ? "\t%s val=%d\n" : "\t%s val=%u\n"; + btf_verifier_log(env, fmt_str, __btf_name_by_offset(btf, enums[i].name_off), enums[i].val); } @@ -3759,7 +3765,10 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t, return; } - btf_show_type_value(show, "%d", v); + if (btf_type_kflag(t)) + btf_show_type_value(show, "%d", v); + else + btf_show_type_value(show, "%u", v); btf_show_end_type(show); } @@ -3772,6 +3781,109 @@ static struct btf_kind_operations enum_ops = { .show = btf_enum_show, }; +static s32 btf_enum64_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_enum64 *enums = btf_type_enum64(t); + struct btf *btf = env->btf; + const char *fmt_str; + u16 i, nr_enums; + u32 meta_needed; + + nr_enums = btf_type_vlen(t); + meta_needed = nr_enums * sizeof(*enums); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (t->size > 8 || !is_power_of_2(t->size)) { + btf_verifier_log_type(env, t, "Unexpected size"); + return -EINVAL; + } + + /* enum type either no name or a valid one */ + if (t->name_off && + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for (i = 0; i < nr_enums; i++) { + if (!btf_name_offset_valid(btf, enums[i].name_off)) { + btf_verifier_log(env, "\tInvalid name_offset:%u", + enums[i].name_off); + return -EINVAL; + } + + /* enum member must have a valid name */ + if (!enums[i].name_off || + !btf_name_valid_identifier(btf, enums[i].name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + if (env->log.level == BPF_LOG_KERNEL) + continue; + + fmt_str = btf_type_kflag(t) ? "\t%s val=%lld\n" : "\t%s val=%llu\n"; + btf_verifier_log(env, fmt_str, + __btf_name_by_offset(btf, enums[i].name_off), + btf_enum64_value(enums + i)); + } + + return meta_needed; +} + +static void btf_enum64_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) +{ + const struct btf_enum64 *enums = btf_type_enum64(t); + u32 i, nr_enums = btf_type_vlen(t); + void *safe_data; + s64 v; + + safe_data = btf_show_start_type(show, t, type_id, data); + if (!safe_data) + return; + + v = *(u64 *)safe_data; + + for (i = 0; i < nr_enums; i++) { + if (v != btf_enum64_value(enums + i)) + continue; + + btf_show_type_value(show, "%s", + __btf_name_by_offset(btf, + enums[i].name_off)); + + btf_show_end_type(show); + return; + } + + if (btf_type_kflag(t)) + btf_show_type_value(show, "%lld", v); + else + btf_show_type_value(show, "%llu", v); + btf_show_end_type(show); +} + +static struct btf_kind_operations enum64_ops = { + .check_meta = btf_enum64_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_enum_check_member, + .check_kflag_member = btf_enum_check_kflag_member, + .log_details = btf_enum_log, + .show = btf_enum64_show, +}; + static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -4438,6 +4550,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_FLOAT] = &float_ops, [BTF_KIND_DECL_TAG] = &decl_tag_ops, [BTF_KIND_TYPE_TAG] = &modifier_ops, + [BTF_KIND_ENUM64] = &enum64_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -5299,7 +5412,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_small_int(t) || btf_type_is_enum(t)) + if (btf_type_is_small_int(t) || btf_is_any_enum(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { @@ -5763,7 +5876,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id, if (btf_type_is_ptr(t)) /* kernel size of pointer. Not BPF's size of pointer*/ return sizeof(void *); - if (btf_type_is_int(t) || btf_type_is_enum(t)) + if (btf_type_is_int(t) || btf_is_any_enum(t)) return t->size; *bad_type = t; return -EINVAL; @@ -5911,7 +6024,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log, * to context only. And only global functions can be replaced. * Hence type check only those types. */ - if (btf_type_is_int(t1) || btf_type_is_enum(t1)) + if (btf_type_is_int(t1) || btf_is_any_enum(t1)) continue; if (!btf_type_is_ptr(t1)) { bpf_log(log, @@ -6408,7 +6521,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (!btf_type_is_int(t) && !btf_type_is_enum(t)) { + if (!btf_type_is_int(t) && !btf_is_any_enum(t)) { bpf_log(log, "Global function %s() doesn't return scalar. Only those are supported.\n", tname); @@ -6423,7 +6536,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t) || btf_type_is_enum(t)) { + if (btf_type_is_int(t) || btf_is_any_enum(t)) { reg->type = SCALAR_VALUE; continue; } @@ -7335,6 +7448,7 @@ recur: case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_FWD: + case BTF_KIND_ENUM64: return 1; case BTF_KIND_INT: /* just reject deprecated bitfield-like integers; all other @@ -7387,10 +7501,10 @@ recur: * field-based relocations. This function assumes that root types were already * checked for name match. Beyond that initial root-level name check, names * are completely ignored. Compatibility rules are as follows: - * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but + * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs/ENUM64s are considered compatible, but * kind should match for local and target types (i.e., STRUCT is not * compatible with UNION); - * - for ENUMs, the size is ignored; + * - for ENUMs/ENUM64s, the size is ignored; * - for INT, size and signedness are ignored; * - for ARRAY, dimensionality is ignored, element types are checked for * compatibility recursively; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aedac2ac02b9..2d2872682278 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10901,7 +10901,7 @@ static int check_btf_func(struct bpf_verifier_env *env, goto err_free; ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); scalar_return = - btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type); + btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type); if (i && !scalar_return && env->subprog_info[i].has_ld_abs) { verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n"); goto err_free; -- cgit v1.2.3 From 54a9c3a42d92d2b0d4e0f64214ebbbfcf7fbfda8 Mon Sep 17 00:00:00 2001 From: Feng Zhou Date: Fri, 10 Jun 2022 10:33:07 +0800 Subject: bpf: avoid grabbing spin_locks of all cpus when no free elems This patch use head->first in pcpu_freelist_head to check freelist having free or not. If having, grab spin_lock, or check next cpu's freelist. Before patch: hash_map performance ./map_perf_test 1 0:hash_map_perf pre-alloc 1043397 events per sec ... The average of the test results is around 1050000 events per sec. hash_map the worst: no free ./run_bench_bpf_hashmap_full_update.sh Setting up benchmark 'bpf-hashmap-ful-update'... Benchmark 'bpf-hashmap-ful-update' started. 1:hash_map_full_perf 15687 events per sec ... The average of the test results is around 16000 events per sec. ftrace trace: 0) | htab_map_update_elem() { 0) | __pcpu_freelist_pop() { 0) | _raw_spin_lock() 0) | _raw_spin_unlock() 0) | ... 0) + 25.188 us | } 0) + 28.439 us | } The test machine is 16C, trying to get spin_lock 17 times, in addition to 16c, there is an extralist. after patch: hash_map performance ./map_perf_test 1 0:hash_map_perf pre-alloc 1053298 events per sec ... The average of the test results is around 1050000 events per sec. hash_map worst: no free ./run_bench_bpf_hashmap_full_update.sh Setting up benchmark 'bpf-hashmap-ful-update'... Benchmark 'bpf-hashmap-ful-update' started. 1:hash_map_full_perf 555830 events per sec ... The average of the test results is around 550000 events per sec. ftrace trace: 0) | htab_map_update_elem() { 0) | alloc_htab_elem() { 0) 0.586 us | __pcpu_freelist_pop(); 0) 0.945 us | } 0) 8.669 us | } It can be seen that after adding this patch, the map performance is almost not degraded, and when free=0, first check head->first instead of directly acquiring spin_lock. Co-developed-by: Chengming Zhou Signed-off-by: Chengming Zhou Signed-off-by: Feng Zhou Link: https://lore.kernel.org/r/20220610023308.93798-2-zhoufeng.zf@bytedance.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/percpu_freelist.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 3d897de89061..00b874c8e889 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -31,7 +31,7 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { node->next = head->first; - head->first = node; + WRITE_ONCE(head->first, node); } static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, @@ -130,14 +130,17 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) orig_cpu = cpu = raw_smp_processor_id(); while (1) { head = per_cpu_ptr(s->freelist, cpu); + if (!READ_ONCE(head->first)) + goto next_cpu; raw_spin_lock(&head->lock); node = head->first; if (node) { - head->first = node->next; + WRITE_ONCE(head->first, node->next); raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); +next_cpu: cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = 0; @@ -146,10 +149,12 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) } /* per cpu lists are all empty, try extralist */ + if (!READ_ONCE(s->extralist.first)) + return NULL; raw_spin_lock(&s->extralist.lock); node = s->extralist.first; if (node) - s->extralist.first = node->next; + WRITE_ONCE(s->extralist.first, node->next); raw_spin_unlock(&s->extralist.lock); return node; } @@ -164,15 +169,18 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) orig_cpu = cpu = raw_smp_processor_id(); while (1) { head = per_cpu_ptr(s->freelist, cpu); + if (!READ_ONCE(head->first)) + goto next_cpu; if (raw_spin_trylock(&head->lock)) { node = head->first; if (node) { - head->first = node->next; + WRITE_ONCE(head->first, node->next); raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); } +next_cpu: cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = 0; @@ -181,11 +189,11 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) } /* cannot pop from per cpu lists, try extralist */ - if (!raw_spin_trylock(&s->extralist.lock)) + if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock)) return NULL; node = s->extralist.first; if (node) - s->extralist.first = node->next; + WRITE_ONCE(s->extralist.first, node->next); raw_spin_unlock(&s->extralist.lock); return node; } -- cgit v1.2.3 From 8c7dcb84e3b744b2b70baa7a44a9b1881c33a9c9 Mon Sep 17 00:00:00 2001 From: Delyan Kratunov Date: Tue, 14 Jun 2022 23:10:46 +0000 Subject: bpf: implement sleepable uprobes by chaining gps uprobes work by raising a trap, setting a task flag from within the interrupt handler, and processing the actual work for the uprobe on the way back to userspace. As a result, uprobe handlers already execute in a might_fault/_sleep context. The primary obstacle to sleepable bpf uprobe programs is therefore on the bpf side. Namely, the bpf_prog_array attached to the uprobe is protected by normal rcu. In order for uprobe bpf programs to become sleepable, it has to be protected by the tasks_trace rcu flavor instead (and kfree() called after a corresponding grace period). Therefore, the free path for bpf_prog_array now chains a tasks_trace and normal grace periods one after the other. Users who iterate under tasks_trace read section would be safe, as would users who iterate under normal read sections (from non-sleepable locations). The downside is that the tasks_trace latency affects all perf_event-attached bpf programs (and not just uprobe ones). This is deemed safe given the possible attach rates for kprobe/uprobe/tp programs. Separately, non-sleepable programs need access to dynamically sized rcu-protected maps, so bpf_run_prog_array_sleepables now conditionally takes an rcu read section, in addition to the overarching tasks_trace section. Signed-off-by: Delyan Kratunov Link: https://lore.kernel.org/r/ce844d62a2fd0443b08c5ab02e95bc7149f9aeb1.1655248076.git.delyank@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e78cc5eea4a5..b5ffebcce6cc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2279,6 +2279,21 @@ void bpf_prog_array_free(struct bpf_prog_array *progs) kfree_rcu(progs, rcu); } +static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu) +{ + struct bpf_prog_array *progs; + + progs = container_of(rcu, struct bpf_prog_array, rcu); + kfree_rcu(progs, rcu); +} + +void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs) +{ + if (!progs || progs == &bpf_empty_prog_array.hdr) + return; + call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb); +} + int bpf_prog_array_length(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; -- cgit v1.2.3 From 64ad7556c75ea102eec2f5bcd60fe2d66ce70308 Mon Sep 17 00:00:00 2001 From: Delyan Kratunov Date: Tue, 14 Jun 2022 23:10:43 +0000 Subject: bpf: allow sleepable uprobe programs to attach uprobe and kprobe programs have the same program type, KPROBE, which is currently not allowed to load sleepable programs. To avoid adding a new UPROBE type, instead allow sleepable KPROBE programs to load and defer the is-it-actually-a-uprobe-program check to attachment time, where there's already validation of the corresponding perf_event. A corollary of this patch is that you can now load a sleepable kprobe program but cannot attach it. Acked-by: Andrii Nakryiko Signed-off-by: Delyan Kratunov Link: https://lore.kernel.org/r/fcd44a7cd204f372f6bb03ef794e829adeaef299.1655248076.git.delyank@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2d2872682278..eadc23a8452c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14829,8 +14829,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && - prog->type != BPF_PROG_TYPE_LSM) { - verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n"); + prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_KPROBE) { + verbose(env, "Only fentry/fexit/fmod_ret, lsm, and kprobe/uprobe programs can be sleepable\n"); return -EINVAL; } -- cgit v1.2.3 From 508362ac66b0478affb4e52cb8da98478312d72d Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 15 Jun 2022 16:48:43 +0300 Subject: bpf: Allow helpers to accept pointers with a fixed size Before this commit, the BPF verifier required ARG_PTR_TO_MEM arguments to be followed by ARG_CONST_SIZE holding the size of the memory region. The helpers had to check that size in runtime. There are cases where the size expected by a helper is a compile-time constant. Checking it in runtime is an unnecessary overhead and waste of BPF registers. This commit allows helpers to accept pointers to memory without the corresponding ARG_CONST_SIZE, given that they define the memory region size in struct bpf_func_proto and use ARG_PTR_TO_FIXED_SIZE_MEM type. arg_size is unionized with arg_btf_id to reduce the kernel image size, and it's valid because they are used by different argument types. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20220615134847.3753567-3-maximmi@nvidia.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eadc23a8452c..2859901ffbe3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5848,6 +5848,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_arg_type arg_type = fn->arg_type[arg]; enum bpf_reg_type type = reg->type; + u32 *arg_btf_id = NULL; int err = 0; if (arg_type == ARG_DONTCARE) @@ -5884,7 +5885,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, */ goto skip_type_check; - err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg], meta); + /* arg_btf_id and arg_size are in a union. */ + if (base_type(arg_type) == ARG_PTR_TO_BTF_ID) + arg_btf_id = fn->arg_btf_id[arg]; + + err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); if (err) return err; @@ -6011,6 +6016,11 @@ skip_type_check: * next is_mem_size argument below. */ meta->raw_mode = arg_type & MEM_UNINIT; + if (arg_type & MEM_FIXED_SIZE) { + err = check_helper_mem_access(env, regno, + fn->arg_size[arg], false, + meta); + } } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); @@ -6400,11 +6410,19 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn) return count <= 1; } -static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, - enum bpf_arg_type arg_next) +static bool check_args_pair_invalid(const struct bpf_func_proto *fn, int arg) { - return (base_type(arg_curr) == ARG_PTR_TO_MEM) != - arg_type_is_mem_size(arg_next); + bool is_fixed = fn->arg_type[arg] & MEM_FIXED_SIZE; + bool has_size = fn->arg_size[arg] != 0; + bool is_next_size = false; + + if (arg + 1 < ARRAY_SIZE(fn->arg_type)) + is_next_size = arg_type_is_mem_size(fn->arg_type[arg + 1]); + + if (base_type(fn->arg_type[arg]) != ARG_PTR_TO_MEM) + return is_next_size; + + return has_size == is_next_size || is_next_size == is_fixed; } static bool check_arg_pair_ok(const struct bpf_func_proto *fn) @@ -6415,11 +6433,11 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn) * helper function specification. */ if (arg_type_is_mem_size(fn->arg1_type) || - base_type(fn->arg5_type) == ARG_PTR_TO_MEM || - check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || - check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || - check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || - check_args_pair_invalid(fn->arg4_type, fn->arg5_type)) + check_args_pair_invalid(fn, 0) || + check_args_pair_invalid(fn, 1) || + check_args_pair_invalid(fn, 2) || + check_args_pair_invalid(fn, 3) || + check_args_pair_invalid(fn, 4)) return false; return true; @@ -6460,7 +6478,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) return false; - if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] && + /* arg_btf_id and arg_size are in a union. */ + (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM || + !(fn->arg_type[i] & MEM_FIXED_SIZE))) return false; } -- cgit v1.2.3 From dc368e1c658e4f478a45e8d1d5b0c8392ca87506 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 16 Jun 2022 15:54:07 -0700 Subject: bpf: Fix non-static bpf_func_proto struct definitions This patch does two things: 1) Marks the dynptr bpf_func_proto structs that were added in [1] as static, as pointed out by the kernel test robot in [2]. 2) There are some bpf_func_proto structs marked as extern which can instead be statically defined. [1] https://lore.kernel.org/bpf/20220523210712.3641569-1-joannelkoong@gmail.com/ [2] https://lore.kernel.org/bpf/62ab89f2.Pko7sI08RAKdF8R6%25lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Joanne Koong Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220616225407.1878436-1-joannelkoong@gmail.com --- kernel/bpf/helpers.c | 12 ++++++------ kernel/bpf/syscall.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 225806a02efb..a1c84d256f83 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -584,7 +584,7 @@ BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2) return strncmp(s1, s2, s1_sz); } -const struct bpf_func_proto bpf_strncmp_proto = { +static const struct bpf_func_proto bpf_strncmp_proto = { .func = bpf_strncmp, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1402,7 +1402,7 @@ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) */ #define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) -const struct bpf_func_proto bpf_kptr_xchg_proto = { +static const struct bpf_func_proto bpf_kptr_xchg_proto = { .func = bpf_kptr_xchg, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, @@ -1487,7 +1487,7 @@ error: return err; } -const struct bpf_func_proto bpf_dynptr_from_mem_proto = { +static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .func = bpf_dynptr_from_mem, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1513,7 +1513,7 @@ BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src return 0; } -const struct bpf_func_proto bpf_dynptr_read_proto = { +static const struct bpf_func_proto bpf_dynptr_read_proto = { .func = bpf_dynptr_read, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1539,7 +1539,7 @@ BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, return 0; } -const struct bpf_func_proto bpf_dynptr_write_proto = { +static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1566,7 +1566,7 @@ BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len return (unsigned long)(ptr->data + ptr->offset + offset); } -const struct bpf_func_proto bpf_dynptr_data_proto = { +static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index aeb31137b2ed..7d5af5b99f0d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5131,7 +5131,7 @@ BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flag return *res ? 0 : -ENOENT; } -const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { +static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .func = bpf_kallsyms_lookup_name, .gpl_only = false, .ret_type = RET_INTEGER, -- cgit v1.2.3 From 1ade23711971b0eececf0d7fedc29d3c1d2fce01 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Jun 2022 02:53:42 +0300 Subject: bpf: Inline calls to bpf_loop when callback is known Calls to `bpf_loop` are replaced with direct loops to avoid indirection. E.g. the following: bpf_loop(10, foo, NULL, 0); Is replaced by equivalent of the following: for (int i = 0; i < 10; ++i) foo(i, NULL); This transformation could be applied when: - callback is known and does not change during program execution; - flags passed to `bpf_loop` are always zero. Inlining logic works as follows: - During execution simulation function `update_loop_inline_state` tracks the following information for each `bpf_loop` call instruction: - is callback known and constant? - are flags constant and zero? - Function `optimize_bpf_loop` increases stack depth for functions where `bpf_loop` calls can be inlined and invokes `inline_bpf_loop` to apply the inlining. The additional stack space is used to spill registers R6, R7 and R8. These registers are used as loop counter, loop maximal bound and callback context parameter; Measurements using `benchs/run_bench_bpf_loop.sh` inside QEMU / KVM on i7-4710HQ CPU show a drop in latency from 14 ns/op to 2 ns/op. Signed-off-by: Eduard Zingerman Acked-by: Song Liu Link: https://lore.kernel.org/r/20220620235344.569325-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_iter.c | 9 +-- kernel/bpf/verifier.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 180 insertions(+), 9 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index d5d96ceca105..7e8fd49406f6 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -723,9 +723,6 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = { .arg4_type = ARG_ANYTHING, }; -/* maximum number of loops */ -#define MAX_LOOPS BIT(23) - BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, u64, flags) { @@ -733,9 +730,13 @@ BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, u64 ret; u32 i; + /* Note: these safety checks are also verified when bpf_loop + * is inlined, be careful to modify this code in sync. See + * function verifier.c:inline_bpf_loop. + */ if (flags) return -EINVAL; - if (nr_loops > MAX_LOOPS) + if (nr_loops > BPF_MAX_LOOPS) return -E2BIG; for (i = 0; i < nr_loops; i++) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2859901ffbe3..bf72dc511df6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7124,6 +7124,41 @@ static int check_get_func_ip(struct bpf_verifier_env *env) return -ENOTSUPP; } +static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +{ + return &env->insn_aux_data[env->insn_idx]; +} + +static bool loop_flag_is_zero(struct bpf_verifier_env *env) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = ®s[BPF_REG_4]; + bool reg_is_null = register_is_null(reg); + + if (reg_is_null) + mark_chain_precision(env, BPF_REG_4); + + return reg_is_null; +} + +static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno) +{ + struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state; + + if (!state->initialized) { + state->initialized = 1; + state->fit_for_inline = loop_flag_is_zero(env); + state->callback_subprogno = subprogno; + return; + } + + if (!state->fit_for_inline) + return; + + state->fit_for_inline = (loop_flag_is_zero(env) && + state->callback_subprogno == subprogno); +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -7276,6 +7311,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = check_bpf_snprintf_call(env, regs); break; case BPF_FUNC_loop: + update_loop_inline_state(env, meta.subprogno); err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_loop_callback_state); break; @@ -7682,11 +7718,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return true; } -static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) -{ - return &env->insn_aux_data[env->insn_idx]; -} - enum { REASON_BOUNDS = -1, REASON_TYPE = -2, @@ -14315,6 +14346,142 @@ patch_call_imm: return 0; } +static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, + int position, + s32 stack_base, + u32 callback_subprogno, + u32 *cnt) +{ + s32 r6_offset = stack_base + 0 * BPF_REG_SIZE; + s32 r7_offset = stack_base + 1 * BPF_REG_SIZE; + s32 r8_offset = stack_base + 2 * BPF_REG_SIZE; + int reg_loop_max = BPF_REG_6; + int reg_loop_cnt = BPF_REG_7; + int reg_loop_ctx = BPF_REG_8; + + struct bpf_prog *new_prog; + u32 callback_start; + u32 call_insn_offset; + s32 callback_offset; + + /* This represents an inlined version of bpf_iter.c:bpf_loop, + * be careful to modify this code in sync. + */ + struct bpf_insn insn_buf[] = { + /* Return error and jump to the end of the patch if + * expected number of iterations is too big. + */ + BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2), + BPF_MOV32_IMM(BPF_REG_0, -E2BIG), + BPF_JMP_IMM(BPF_JA, 0, 0, 16), + /* spill R6, R7, R8 to use these as loop vars */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset), + /* initialize loop vars */ + BPF_MOV64_REG(reg_loop_max, BPF_REG_1), + BPF_MOV32_IMM(reg_loop_cnt, 0), + BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3), + /* loop header, + * if reg_loop_cnt >= reg_loop_max skip the loop body + */ + BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5), + /* callback call, + * correct callback offset would be set after patching + */ + BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt), + BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx), + BPF_CALL_REL(0), + /* increment loop counter */ + BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1), + /* jump to loop header if callback returned 0 */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6), + /* return value of bpf_loop, + * set R0 to the number of iterations + */ + BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt), + /* restore original values of R6, R7, R8 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset), + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset), + BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset), + }; + + *cnt = ARRAY_SIZE(insn_buf); + new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt); + if (!new_prog) + return new_prog; + + /* callback start is known only after patching */ + callback_start = env->subprog_info[callback_subprogno].start; + /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */ + call_insn_offset = position + 12; + callback_offset = callback_start - call_insn_offset - 1; + env->prog->insnsi[call_insn_offset].imm = callback_offset; + + return new_prog; +} + +static bool is_bpf_loop_call(struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0 && + insn->imm == BPF_FUNC_loop; +} + +/* For all sub-programs in the program (including main) check + * insn_aux_data to see if there are bpf_loop calls that require + * inlining. If such calls are found the calls are replaced with a + * sequence of instructions produced by `inline_bpf_loop` function and + * subprog stack_depth is increased by the size of 3 registers. + * This stack space is used to spill values of the R6, R7, R8. These + * registers are used to store the loop bound, counter and context + * variables. + */ +static int optimize_bpf_loop(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *subprogs = env->subprog_info; + int i, cur_subprog = 0, cnt, delta = 0; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + u16 stack_depth = subprogs[cur_subprog].stack_depth; + u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; + u16 stack_depth_extra = 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + struct bpf_loop_inline_state *inline_state = + &env->insn_aux_data[i + delta].loop_inline_state; + + if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) { + struct bpf_prog *new_prog; + + stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup; + new_prog = inline_bpf_loop(env, + i + delta, + -(stack_depth + stack_depth_extra), + inline_state->callback_subprogno, + &cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + } + + if (subprogs[cur_subprog + 1].start == i + delta + 1) { + subprogs[cur_subprog].stack_depth += stack_depth_extra; + cur_subprog++; + stack_depth = subprogs[cur_subprog].stack_depth; + stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; + stack_depth_extra = 0; + } + } + + env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; + + return 0; +} + static void free_states(struct bpf_verifier_env *env) { struct bpf_verifier_state_list *sl, *sln; @@ -15052,6 +15219,9 @@ skip_full_check: ret = check_max_stack_depth(env); /* instruction rewrites happen after this point */ + if (ret == 0) + ret = optimize_bpf_loop(env); + if (is_priv) { if (ret == 0) opt_hard_wire_dead_code_branches(env); -- cgit v1.2.3 From 95acd8817e66d031d2e6ee7def3f1e1874819317 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Fri, 17 Jun 2022 12:57:34 +0200 Subject: bpf, x64: Add predicate for bpf2bpf with tailcalls support in JIT The BPF core/verifier is hard-coded to permit mixing bpf2bpf and tail calls for only x86-64. Change the logic to instead rely on a new weak function 'bool bpf_jit_supports_subprog_tailcalls(void)', which a capable JIT backend can override. Update the x86-64 eBPF JIT to reflect this. Signed-off-by: Tony Ambardar [jakub: drop MIPS bits and tweak patch subject] Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220617105735.733938-2-jakub@cloudflare.com --- kernel/bpf/core.c | 6 ++++++ kernel/bpf/verifier.c | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b5ffebcce6cc..f023cb399e3f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2729,6 +2729,12 @@ bool __weak bpf_jit_needs_zext(void) return false; } +/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */ +bool __weak bpf_jit_supports_subprog_tailcalls(void) +{ + return false; +} + bool __weak bpf_jit_supports_kfunc_call(void) { return false; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bf72dc511df6..a20d7736a5b2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6154,7 +6154,8 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env) { - return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64); + return env->prog->jit_requested && + bpf_jit_supports_subprog_tailcalls(); } static int check_map_func_compatibility(struct bpf_verifier_env *env, -- cgit v1.2.3 From 9f0265e921dee14096943ee11f793fa076aa7a72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn-Thorben=20Hinz?= Date: Wed, 22 Jun 2022 21:12:24 +0200 Subject: bpf: Require only one of cong_avoid() and cong_control() from a TCP CC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the check for required and optional functions in a struct tcp_congestion_ops from bpf_tcp_ca.c. Rely on tcp_register_congestion_control() to reject a BPF CC that does not implement all required functions, as it will do for a non-BPF CC. When a CC implements tcp_congestion_ops.cong_control(), the alternate cong_avoid() is not in use in the TCP stack. Previously, a BPF CC was still forced to implement cong_avoid() as a no-op since it was non-optional in bpf_tcp_ca.c. Signed-off-by: Jörn-Thorben Hinz Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220622191227.898118-3-jthinz@mailbox.tu-berlin.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_struct_ops.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index d9a3c9207240..7e0068c3399c 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -503,10 +503,9 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto unlock; } - /* Error during st_ops->reg(). It is very unlikely since - * the above init_member() should have caught it earlier - * before reg(). The only possibility is if there was a race - * in registering the struct_ops (under the same name) to + /* Error during st_ops->reg(). Can happen if this struct_ops needs to be + * verified as a whole, after all init_member() calls. Can also happen if + * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. */ set_memory_nx((long)st_map->image, 1); -- cgit v1.2.3 From 395e942d34a25824457da379baf434b5d6da4dcc Mon Sep 17 00:00:00 2001 From: Simon Wang Date: Tue, 21 Jun 2022 23:19:23 -0400 Subject: bpf: Replace hard-coded 0 with BPF_K in check_alu_op Enhance readability a bit. Signed-off-by: Simon Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220622031923.65692-1-wangchuanguo@inspur.com --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a20d7736a5b2..f228141c01c5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9096,7 +9096,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (opcode == BPF_END || opcode == BPF_NEG) { if (opcode == BPF_NEG) { - if (BPF_SRC(insn->code) != 0 || + if (BPF_SRC(insn->code) != BPF_K || insn->src_reg != BPF_REG_0 || insn->off != 0 || insn->imm != 0) { verbose(env, "BPF_NEG uses reserved fields\n"); -- cgit v1.2.3 From fb4e3b33e3e7f13befdf9ee232e34818c6cc5fb9 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 24 Jun 2022 05:06:12 +0300 Subject: bpf: Fix for use-after-free bug in inline_bpf_loop As reported by Dan Carpenter, the following statements in inline_bpf_loop() might cause a use-after-free bug: struct bpf_prog *new_prog; // ... new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt); // ... env->prog->insnsi[call_insn_offset].imm = callback_offset; The bpf_patch_insn_data() might free the memory used by env->prog. Fixes: 1ade23711971 ("bpf: Inline calls to bpf_loop when callback is known") Reported-by: Dan Carpenter Signed-off-by: Eduard Zingerman Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220624020613.548108-2-eddyz87@gmail.com --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f228141c01c5..4938477912cd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14417,7 +14417,7 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */ call_insn_offset = position + 12; callback_offset = callback_start - call_insn_offset - 1; - env->prog->insnsi[call_insn_offset].imm = callback_offset; + new_prog->insnsi[call_insn_offset].imm = callback_offset; return new_prog; } -- cgit v1.2.3 From fd75733da2f376c0c8c6513c3cb2ac227082ec5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20M=C3=BCller?= Date: Thu, 23 Jun 2022 18:29:34 +0000 Subject: bpf: Merge "types_are_compat" logic into relo_core.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPF type compatibility checks (bpf_core_types_are_compat()) are currently duplicated between kernel and user space. That's a historical artifact more than intentional doing and can lead to subtle bugs where one implementation is adjusted but another is forgotten. That happened with the enum64 work, for example, where the libbpf side was changed (commit 23b2a3a8f63a ("libbpf: Add enum64 relocation support")) to use the btf_kind_core_compat() helper function but the kernel side was not (commit 6089fb325cf7 ("bpf: Add btf enum64 support")). This patch addresses both the duplication issue, by merging both implementations and moving them into relo_core.c, and fixes the alluded to kind check (by giving preference to libbpf's already adjusted logic). For discussion of the topic, please refer to: https://lore.kernel.org/bpf/CAADnVQKbWR7oarBdewgOBZUPzryhRYvEbkhyPJQHHuxq=0K1gw@mail.gmail.com/T/#mcc99f4a33ad9a322afaf1b9276fb1f0b7add9665 Changelog: v1 -> v2: - limited libbpf recursion limit to 32 - changed name to __bpf_core_types_are_compat - included warning previously present in libbpf version - merged kernel and user space changes into a single patch Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220623182934.2582827-1-deso@posteo.net --- kernel/bpf/btf.c | 84 +------------------------------------------------------- 1 file changed, 1 insertion(+), 83 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f08037c31dd7..2e2066d6af94 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7416,87 +7416,6 @@ EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs); #define MAX_TYPES_ARE_COMPAT_DEPTH 2 -static -int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, - const struct btf *targ_btf, __u32 targ_id, - int level) -{ - const struct btf_type *local_type, *targ_type; - int depth = 32; /* max recursion depth */ - - /* caller made sure that names match (ignoring flavor suffix) */ - local_type = btf_type_by_id(local_btf, local_id); - targ_type = btf_type_by_id(targ_btf, targ_id); - if (btf_kind(local_type) != btf_kind(targ_type)) - return 0; - -recur: - depth--; - if (depth < 0) - return -EINVAL; - - local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id); - targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id); - if (!local_type || !targ_type) - return -EINVAL; - - if (btf_kind(local_type) != btf_kind(targ_type)) - return 0; - - switch (btf_kind(local_type)) { - case BTF_KIND_UNKN: - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: - case BTF_KIND_ENUM: - case BTF_KIND_FWD: - case BTF_KIND_ENUM64: - return 1; - case BTF_KIND_INT: - /* just reject deprecated bitfield-like integers; all other - * integers are by default compatible between each other - */ - return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; - case BTF_KIND_PTR: - local_id = local_type->type; - targ_id = targ_type->type; - goto recur; - case BTF_KIND_ARRAY: - local_id = btf_array(local_type)->type; - targ_id = btf_array(targ_type)->type; - goto recur; - case BTF_KIND_FUNC_PROTO: { - struct btf_param *local_p = btf_params(local_type); - struct btf_param *targ_p = btf_params(targ_type); - __u16 local_vlen = btf_vlen(local_type); - __u16 targ_vlen = btf_vlen(targ_type); - int i, err; - - if (local_vlen != targ_vlen) - return 0; - - for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { - if (level <= 0) - return -EINVAL; - - btf_type_skip_modifiers(local_btf, local_p->type, &local_id); - btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id); - err = __bpf_core_types_are_compat(local_btf, local_id, - targ_btf, targ_id, - level - 1); - if (err <= 0) - return err; - } - - /* tail recurse for return type check */ - btf_type_skip_modifiers(local_btf, local_type->type, &local_id); - btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id); - goto recur; - } - default: - return 0; - } -} - /* Check local and target types for compatibility. This check is used for * type-based CO-RE relocations and follow slightly different rules than * field-based relocations. This function assumes that root types were already @@ -7519,8 +7438,7 @@ recur: int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { - return __bpf_core_types_are_compat(local_btf, local_id, - targ_btf, targ_id, + return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, MAX_TYPES_ARE_COMPAT_DEPTH); } -- cgit v1.2.3 From af3f4134006bf3bf894179c08aaf98ed5938cf4e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:04 -0700 Subject: bpf: add bpf_func_t and trampoline helpers I'll be adding lsm cgroup specific helpers that grab trampoline mutex. No functional changes. Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-2-sdf@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 63 ++++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 93c7675f0c9e..5466e15be61f 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -410,7 +410,7 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) } } -int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; @@ -418,44 +418,33 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline int cnt = 0, i; kind = bpf_attach_type_to_tramp(link->link.prog); - mutex_lock(&tr->mutex); - if (tr->extension_prog) { + if (tr->extension_prog) /* cannot attach fentry/fexit if extension prog is attached. * cannot overwrite extension prog either. */ - err = -EBUSY; - goto out; - } + return -EBUSY; for (i = 0; i < BPF_TRAMP_MAX; i++) cnt += tr->progs_cnt[i]; if (kind == BPF_TRAMP_REPLACE) { /* Cannot attach extension if fentry/fexit are in use. */ - if (cnt) { - err = -EBUSY; - goto out; - } + if (cnt) + return -EBUSY; tr->extension_prog = link->link.prog; - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, - link->link.prog->bpf_func); - goto out; - } - if (cnt >= BPF_MAX_TRAMP_LINKS) { - err = -E2BIG; - goto out; + return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, + link->link.prog->bpf_func); } - if (!hlist_unhashed(&link->tramp_hlist)) { + if (cnt >= BPF_MAX_TRAMP_LINKS) + return -E2BIG; + if (!hlist_unhashed(&link->tramp_hlist)) /* prog already linked */ - err = -EBUSY; - goto out; - } + return -EBUSY; hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { if (link_exiting->link.prog != link->link.prog) continue; /* prog already linked */ - err = -EBUSY; - goto out; + return -EBUSY; } hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); @@ -465,30 +454,44 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; } -out: + return err; +} + +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +{ + int err; + + mutex_lock(&tr->mutex); + err = __bpf_trampoline_link_prog(link, tr); mutex_unlock(&tr->mutex); return err; } -/* bpf_trampoline_unlink_prog() should never fail. */ -int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; int err; kind = bpf_attach_type_to_tramp(link->link.prog); - mutex_lock(&tr->mutex); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, tr->extension_prog->bpf_func, NULL); tr->extension_prog = NULL; - goto out; + return err; } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; - err = bpf_trampoline_update(tr); -out: + return bpf_trampoline_update(tr); +} + +/* bpf_trampoline_unlink_prog() should never fail. */ +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +{ + int err; + + mutex_lock(&tr->mutex); + err = __bpf_trampoline_unlink_prog(link, tr); mutex_unlock(&tr->mutex); return err; } -- cgit v1.2.3 From 00442143a2ab7f1da46fbf4d2a99c85df767d49a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:05 -0700 Subject: bpf: convert cgroup_bpf.progs to hlist This lets us reclaim some space to be used by new cgroup lsm slots. Before: struct cgroup_bpf { struct bpf_prog_array * effective[23]; /* 0 184 */ /* --- cacheline 2 boundary (128 bytes) was 56 bytes ago --- */ struct list_head progs[23]; /* 184 368 */ /* --- cacheline 8 boundary (512 bytes) was 40 bytes ago --- */ u32 flags[23]; /* 552 92 */ /* XXX 4 bytes hole, try to pack */ /* --- cacheline 10 boundary (640 bytes) was 8 bytes ago --- */ struct list_head storages; /* 648 16 */ struct bpf_prog_array * inactive; /* 664 8 */ struct percpu_ref refcnt; /* 672 16 */ struct work_struct release_work; /* 688 32 */ /* size: 720, cachelines: 12, members: 7 */ /* sum members: 716, holes: 1, sum holes: 4 */ /* last cacheline: 16 bytes */ }; After: struct cgroup_bpf { struct bpf_prog_array * effective[23]; /* 0 184 */ /* --- cacheline 2 boundary (128 bytes) was 56 bytes ago --- */ struct hlist_head progs[23]; /* 184 184 */ /* --- cacheline 5 boundary (320 bytes) was 48 bytes ago --- */ u8 flags[23]; /* 368 23 */ /* XXX 1 byte hole, try to pack */ /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */ struct list_head storages; /* 392 16 */ struct bpf_prog_array * inactive; /* 408 8 */ struct percpu_ref refcnt; /* 416 16 */ struct work_struct release_work; /* 432 72 */ /* size: 504, cachelines: 8, members: 7 */ /* sum members: 503, holes: 1, sum holes: 1 */ /* last cacheline: 56 bytes */ }; Suggested-by: Jakub Sitnicki Reviewed-by: Jakub Sitnicki Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-3-sdf@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 76 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 32 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 7a394f7c205c..4adb4f3ecb7f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -157,11 +157,12 @@ static void cgroup_bpf_release(struct work_struct *work) mutex_lock(&cgroup_mutex); for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) { - struct list_head *progs = &cgrp->bpf.progs[atype]; - struct bpf_prog_list *pl, *pltmp; + struct hlist_head *progs = &cgrp->bpf.progs[atype]; + struct bpf_prog_list *pl; + struct hlist_node *pltmp; - list_for_each_entry_safe(pl, pltmp, progs, node) { - list_del(&pl->node); + hlist_for_each_entry_safe(pl, pltmp, progs, node) { + hlist_del(&pl->node); if (pl->prog) bpf_prog_put(pl->prog); if (pl->link) @@ -217,12 +218,12 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) /* count number of elements in the list. * it's slow but the list cannot be long */ -static u32 prog_list_length(struct list_head *head) +static u32 prog_list_length(struct hlist_head *head) { struct bpf_prog_list *pl; u32 cnt = 0; - list_for_each_entry(pl, head, node) { + hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; cnt++; @@ -291,7 +292,7 @@ static int compute_effective_progs(struct cgroup *cgrp, if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; - list_for_each_entry(pl, &p->bpf.progs[atype], node) { + hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; @@ -342,7 +343,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) cgroup_bpf_get(p); for (i = 0; i < NR; i++) - INIT_LIST_HEAD(&cgrp->bpf.progs[i]); + INIT_HLIST_HEAD(&cgrp->bpf.progs[i]); INIT_LIST_HEAD(&cgrp->bpf.storages); @@ -418,7 +419,7 @@ cleanup: #define BPF_CGROUP_MAX_PROGS 64 -static struct bpf_prog_list *find_attach_entry(struct list_head *progs, +static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, struct bpf_prog *replace_prog, @@ -428,12 +429,12 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs, /* single-attach case */ if (!allow_multi) { - if (list_empty(progs)) + if (hlist_empty(progs)) return NULL; - return list_first_entry(progs, typeof(*pl), node); + return hlist_entry(progs->first, typeof(*pl), node); } - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (prog && pl->prog == prog && prog != replace_prog) /* disallow attaching the same prog twice */ return ERR_PTR(-EINVAL); @@ -444,7 +445,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs, /* direct prog multi-attach w/ replacement case */ if (replace_prog) { - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (pl->prog == replace_prog) /* a match found */ return pl; @@ -480,7 +481,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; - struct list_head *progs; + struct hlist_head *progs; int err; if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || @@ -503,7 +504,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (!hierarchy_allows_attach(cgrp, atype)) return -EPERM; - if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) + if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. * Disallow attaching multi-prog if overridable or none @@ -525,12 +526,22 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (pl) { old_prog = pl->prog; } else { + struct hlist_node *last = NULL; + pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { bpf_cgroup_storages_free(new_storage); return -ENOMEM; } - list_add_tail(&pl->node, progs); + if (hlist_empty(progs)) + hlist_add_head(&pl->node, progs); + else + hlist_for_each(last, progs) { + if (last->next) + continue; + hlist_add_behind(&pl->node, last); + break; + } } pl->prog = prog; @@ -556,7 +567,7 @@ cleanup: } bpf_cgroup_storages_free(new_storage); if (!old_prog) { - list_del(&pl->node); + hlist_del(&pl->node); kfree(pl); } return err; @@ -587,7 +598,7 @@ static void replace_effective_prog(struct cgroup *cgrp, struct cgroup_subsys_state *css; struct bpf_prog_array *progs; struct bpf_prog_list *pl; - struct list_head *head; + struct hlist_head *head; struct cgroup *cg; int pos; @@ -603,7 +614,7 @@ static void replace_effective_prog(struct cgroup *cgrp, continue; head = &cg->bpf.progs[atype]; - list_for_each_entry(pl, head, node) { + hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; if (pl->link == link) @@ -637,7 +648,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; - struct list_head *progs; + struct hlist_head *progs; bool found = false; atype = to_cgroup_bpf_attach_type(link->type); @@ -649,7 +660,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, if (link->link.prog->type != new_prog->type) return -EINVAL; - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (pl->link == link) { found = true; break; @@ -688,7 +699,7 @@ out_unlock: return ret; } -static struct bpf_prog_list *find_detach_entry(struct list_head *progs, +static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, bool allow_multi) @@ -696,14 +707,14 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, struct bpf_prog_list *pl; if (!allow_multi) { - if (list_empty(progs)) + if (hlist_empty(progs)) /* report error when trying to detach and nothing is attached */ return ERR_PTR(-ENOENT); /* to maintain backward compatibility NONE and OVERRIDE cgroups * allow detaching with invalid FD (prog==NULL) in legacy mode */ - return list_first_entry(progs, typeof(*pl), node); + return hlist_entry(progs->first, typeof(*pl), node); } if (!prog && !link) @@ -713,7 +724,7 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, return ERR_PTR(-EINVAL); /* find the prog or link and detach it */ - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (pl->prog == prog && pl->link == link) return pl; } @@ -737,7 +748,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, struct cgroup_subsys_state *css; struct bpf_prog_array *progs; struct bpf_prog_list *pl; - struct list_head *head; + struct hlist_head *head; struct cgroup *cg; int pos; @@ -754,7 +765,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, continue; head = &cg->bpf.progs[atype]; - list_for_each_entry(pl, head, node) { + hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; if (pl->prog == prog && pl->link == link) @@ -791,7 +802,7 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; - struct list_head *progs; + struct hlist_head *progs; u32 flags; atype = to_cgroup_bpf_attach_type(type); @@ -822,9 +833,10 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, } /* now can actually delete it from this cgroup list */ - list_del(&pl->node); + hlist_del(&pl->node); + kfree(pl); - if (list_empty(progs)) + if (hlist_empty(progs)) /* last program was detached, reset flags to zero */ cgrp->bpf.flags[atype] = 0; if (old_prog) @@ -852,7 +864,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, enum bpf_attach_type type = attr->query.attach_type; enum cgroup_bpf_attach_type atype; struct bpf_prog_array *effective; - struct list_head *progs; + struct hlist_head *progs; struct bpf_prog *prog; int cnt, ret = 0, i; u32 flags; @@ -891,7 +903,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, u32 id; i = 0; - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { prog = prog_list_prog(pl); id = prog->aux->id; if (copy_to_user(prog_ids + i, &id, sizeof(id))) -- cgit v1.2.3 From 69fd337a975c7e690dfe49d9cb4fe5ba1e6db44e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:06 -0700 Subject: bpf: per-cgroup lsm flavor Allow attaching to lsm hooks in the cgroup context. Attaching to per-cgroup LSM works exactly like attaching to other per-cgroup hooks. New BPF_LSM_CGROUP is added to trigger new mode; the actual lsm hook we attach to is signaled via existing attach_btf_id. For the hooks that have 'struct socket' or 'struct sock' as its first argument, we use the cgroup associated with that socket. For the rest, we use 'current' cgroup (this is all on default hierarchy == v2 only). Note that for some hooks that work on 'struct sock' we still take the cgroup from 'current' because some of them work on the socket that hasn't been properly initialized yet. Behind the scenes, we allocate a shim program that is attached to the trampoline and runs cgroup effective BPF programs array. This shim has some rudimentary ref counting and can be shared between several programs attaching to the same lsm hook from different cgroups. Note that this patch bloats cgroup size because we add 211 cgroup_bpf_attach_type(s) for simplicity sake. This will be addressed in the subsequent patch. Also note that we only add non-sleepable flavor for now. To enable sleepable use-cases, bpf_prog_run_array_cg has to grab trace rcu, shim programs have to be freed via trace rcu, cgroup_bpf.effective should be also trace-rcu-managed + maybe some other changes that I'm not aware of. Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-4-sdf@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lsm.c | 48 ++++++++++++ kernel/bpf/btf.c | 11 +++ kernel/bpf/cgroup.c | 136 ++++++++++++++++++++++++++++++--- kernel/bpf/core.c | 2 + kernel/bpf/syscall.c | 10 +++ kernel/bpf/trampoline.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 32 ++++++++ 7 files changed, 426 insertions(+), 11 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index c1351df9f7ee..0f72020bfdcf 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -16,6 +16,7 @@ #include #include #include +#include /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -35,6 +36,44 @@ BTF_SET_START(bpf_lsm_hooks) #undef LSM_HOOK BTF_SET_END(bpf_lsm_hooks) +/* List of LSM hooks that should operate on 'current' cgroup regardless + * of function signature. + */ +BTF_SET_START(bpf_lsm_current_hooks) +/* operate on freshly allocated sk without any cgroup association */ +BTF_ID(func, bpf_lsm_sk_alloc_security) +BTF_ID(func, bpf_lsm_sk_free_security) +BTF_SET_END(bpf_lsm_current_hooks) + +void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, + bpf_func_t *bpf_func) +{ + const struct btf_param *args; + + if (btf_type_vlen(prog->aux->attach_func_proto) < 1 || + btf_id_set_contains(&bpf_lsm_current_hooks, + prog->aux->attach_btf_id)) { + *bpf_func = __cgroup_bpf_run_lsm_current; + return; + } + + args = btf_params(prog->aux->attach_func_proto); + +#ifdef CONFIG_NET + if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET]) + *bpf_func = __cgroup_bpf_run_lsm_socket; + else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK]) + *bpf_func = __cgroup_bpf_run_lsm_sock; + else +#endif + *bpf_func = __cgroup_bpf_run_lsm_current; +} + +int bpf_lsm_hook_idx(u32 btf_id) +{ + return btf_id_set_index(&bpf_lsm_hooks, btf_id); +} + int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { @@ -158,6 +197,15 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; case BPF_FUNC_get_attach_cookie: return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; + case BPF_FUNC_get_local_storage: + return prog->expected_attach_type == BPF_LSM_CGROUP ? + &bpf_get_local_storage_proto : NULL; + case BPF_FUNC_set_retval: + return prog->expected_attach_type == BPF_LSM_CGROUP ? + &bpf_set_retval_proto : NULL; + case BPF_FUNC_get_retval: + return prog->expected_attach_type == BPF_LSM_CGROUP ? + &bpf_get_retval_proto : NULL; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2e2066d6af94..7c1fe422ed3f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5363,6 +5363,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (arg == nr_args) { switch (prog->expected_attach_type) { + case BPF_LSM_CGROUP: case BPF_LSM_MAC: case BPF_TRACE_FEXIT: /* When LSM programs are attached to void LSM hooks @@ -6842,6 +6843,16 @@ static int btf_id_cmp_func(const void *a, const void *b) return *pa - *pb; } +int btf_id_set_index(const struct btf_id_set *set, u32 id) +{ + const u32 *p; + + p = bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func); + if (!p) + return -1; + return p - set->ids; +} + bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4adb4f3ecb7f..9cf41dd4f96f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include @@ -61,6 +63,87 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, return run_ctx.retval; } +unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct sock *sk; + struct cgroup *cgrp; + int ret = 0; + u64 *args; + + args = (u64 *)ctx; + sk = (void *)(unsigned long)args[0]; + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct socket *sock; + struct cgroup *cgrp; + int ret = 0; + u64 *args; + + args = (u64 *)ctx; + sock = (void *)(unsigned long)args[0]; + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct cgroup *cgrp; + int ret = 0; + + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */ + cgrp = task_dfl_cgroup(current); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +#ifdef CONFIG_BPF_LSM +static enum cgroup_bpf_attach_type +bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) +{ + if (attach_type != BPF_LSM_CGROUP) + return to_cgroup_bpf_attach_type(attach_type); + return CGROUP_LSM_START + bpf_lsm_hook_idx(attach_btf_id); +} +#else +static enum cgroup_bpf_attach_type +bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) +{ + if (attach_type != BPF_LSM_CGROUP) + return to_cgroup_bpf_attach_type(attach_type); + return -EOPNOTSUPP; +} +#endif /* CONFIG_BPF_LSM */ + void cgroup_bpf_offline(struct cgroup *cgrp) { cgroup_get(cgrp); @@ -163,10 +246,16 @@ static void cgroup_bpf_release(struct work_struct *work) hlist_for_each_entry_safe(pl, pltmp, progs, node) { hlist_del(&pl->node); - if (pl->prog) + if (pl->prog) { + if (pl->prog->expected_attach_type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(pl->prog); bpf_prog_put(pl->prog); - if (pl->link) + } + if (pl->link) { + if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog); bpf_cgroup_link_auto_detach(pl->link); + } kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key[atype]); } @@ -479,6 +568,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_prog *new_prog = prog ? : link->link.prog; enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; struct hlist_head *progs; @@ -495,7 +585,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, /* replace_prog implies BPF_F_REPLACE, and vice versa */ return -EINVAL; - atype = to_cgroup_bpf_attach_type(type); + atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -549,17 +639,30 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[atype] = saved_flags; + if (type == BPF_LSM_CGROUP) { + err = bpf_trampoline_link_cgroup_shim(new_prog, atype); + if (err) + goto cleanup; + } + err = update_effective_progs(cgrp, atype); if (err) - goto cleanup; + goto cleanup_trampoline; - if (old_prog) + if (old_prog) { + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(old_prog); bpf_prog_put(old_prog); - else + } else { static_branch_inc(&cgroup_bpf_enabled_key[atype]); + } bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; +cleanup_trampoline: + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(new_prog); + cleanup: if (old_prog) { pl->prog = old_prog; @@ -651,7 +754,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, struct hlist_head *progs; bool found = false; - atype = to_cgroup_bpf_attach_type(link->type); + atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -803,9 +906,15 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *old_prog; struct bpf_prog_list *pl; struct hlist_head *progs; + u32 attach_btf_id = 0; u32 flags; - atype = to_cgroup_bpf_attach_type(type); + if (prog) + attach_btf_id = prog->aux->attach_btf_id; + if (link) + attach_btf_id = link->link.prog->aux->attach_btf_id; + + atype = bpf_cgroup_atype_find(type, attach_btf_id); if (atype < 0) return -EINVAL; @@ -839,8 +948,11 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, if (hlist_empty(progs)) /* last program was detached, reset flags to zero */ cgrp->bpf.flags[atype] = 0; - if (old_prog) + if (old_prog) { + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(old_prog); bpf_prog_put(old_prog); + } static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0; } @@ -999,6 +1111,8 @@ static void bpf_cgroup_link_release(struct bpf_link *link) WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, cg_link->type)); + if (cg_link->type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog); cg = cg_link->cgroup; cg_link->cgroup = NULL; @@ -1343,7 +1457,7 @@ BPF_CALL_0(bpf_get_retval) return ctx->retval; } -static const struct bpf_func_proto bpf_get_retval_proto = { +const struct bpf_func_proto bpf_get_retval_proto = { .func = bpf_get_retval, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1358,7 +1472,7 @@ BPF_CALL_1(bpf_set_retval, int, retval) return 0; } -static const struct bpf_func_proto bpf_set_retval_proto = { +const struct bpf_func_proto bpf_set_retval_proto = { .func = bpf_set_retval, .gpl_only = false, .ret_type = RET_INTEGER, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f023cb399e3f..4cc10b942a3c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2666,6 +2666,8 @@ const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_snprintf_btf_proto __weak; const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; +const struct bpf_func_proto bpf_set_retval_proto __weak; +const struct bpf_func_proto bpf_get_retval_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7d5af5b99f0d..626b8f7d237b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3416,6 +3416,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_SK_LOOKUP; case BPF_XDP: return BPF_PROG_TYPE_XDP; + case BPF_LSM_CGROUP: + return BPF_PROG_TYPE_LSM; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3469,6 +3471,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_LSM: + if (ptype == BPF_PROG_TYPE_LSM && + prog->expected_attach_type != BPF_LSM_CGROUP) + return -EINVAL; + ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; default: @@ -3506,6 +3513,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_LSM: return cgroup_bpf_prog_detach(attr, ptype); default: return -EINVAL; @@ -4540,6 +4548,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_raw_tp_link_attach(prog, NULL); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); + else if (prog->expected_attach_type == BPF_LSM_CGROUP) + ret = cgroup_bpf_link_attach(attr, prog); else ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 5466e15be61f..d7c251d7fbcd 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -496,6 +498,177 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin return err; } +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) +static void bpf_shim_tramp_link_release(struct bpf_link *link) +{ + struct bpf_shim_tramp_link *shim_link = + container_of(link, struct bpf_shim_tramp_link, link.link); + + /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */ + if (!shim_link->trampoline) + return; + + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline)); + bpf_trampoline_put(shim_link->trampoline); +} + +static void bpf_shim_tramp_link_dealloc(struct bpf_link *link) +{ + struct bpf_shim_tramp_link *shim_link = + container_of(link, struct bpf_shim_tramp_link, link.link); + + kfree(shim_link); +} + +static const struct bpf_link_ops bpf_shim_tramp_link_lops = { + .release = bpf_shim_tramp_link_release, + .dealloc = bpf_shim_tramp_link_dealloc, +}; + +static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog, + bpf_func_t bpf_func, + int cgroup_atype) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_prog *p; + + shim_link = kzalloc(sizeof(*shim_link), GFP_USER); + if (!shim_link) + return NULL; + + p = bpf_prog_alloc(1, 0); + if (!p) { + kfree(shim_link); + return NULL; + } + + p->jited = false; + p->bpf_func = bpf_func; + + p->aux->cgroup_atype = cgroup_atype; + p->aux->attach_func_proto = prog->aux->attach_func_proto; + p->aux->attach_btf_id = prog->aux->attach_btf_id; + p->aux->attach_btf = prog->aux->attach_btf; + btf_get(p->aux->attach_btf); + p->type = BPF_PROG_TYPE_LSM; + p->expected_attach_type = BPF_LSM_MAC; + bpf_prog_inc(p); + bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, + &bpf_shim_tramp_link_lops, p); + + return shim_link; +} + +static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, + bpf_func_t bpf_func) +{ + struct bpf_tramp_link *link; + int kind; + + for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { + hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { + struct bpf_prog *p = link->link.prog; + + if (p->bpf_func == bpf_func) + return container_of(link, struct bpf_shim_tramp_link, link); + } + } + + return NULL; +} + +int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, + int cgroup_atype) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_attach_target_info tgt_info = {}; + struct bpf_trampoline *tr; + bpf_func_t bpf_func; + u64 key; + int err; + + err = bpf_check_attach_target(NULL, prog, NULL, + prog->aux->attach_btf_id, + &tgt_info); + if (err) + return err; + + key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, + prog->aux->attach_btf_id); + + bpf_lsm_find_cgroup_shim(prog, &bpf_func); + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) + return -ENOMEM; + + mutex_lock(&tr->mutex); + + shim_link = cgroup_shim_find(tr, bpf_func); + if (shim_link) { + /* Reusing existing shim attached by the other program. */ + bpf_link_inc(&shim_link->link.link); + + mutex_unlock(&tr->mutex); + bpf_trampoline_put(tr); /* bpf_trampoline_get above */ + return 0; + } + + /* Allocate and install new shim. */ + + shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype); + if (!shim_link) { + err = -ENOMEM; + goto err; + } + + err = __bpf_trampoline_link_prog(&shim_link->link, tr); + if (err) + goto err; + + shim_link->trampoline = tr; + /* note, we're still holding tr refcnt from above */ + + mutex_unlock(&tr->mutex); + + return 0; +err: + mutex_unlock(&tr->mutex); + + if (shim_link) + bpf_link_put(&shim_link->link.link); + + /* have to release tr while _not_ holding its mutex */ + bpf_trampoline_put(tr); /* bpf_trampoline_get above */ + + return err; +} + +void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_trampoline *tr; + bpf_func_t bpf_func; + u64 key; + + key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, + prog->aux->attach_btf_id); + + bpf_lsm_find_cgroup_shim(prog, &bpf_func); + tr = bpf_trampoline_lookup(key); + if (WARN_ON_ONCE(!tr)) + return; + + mutex_lock(&tr->mutex); + shim_link = cgroup_shim_find(tr, bpf_func); + mutex_unlock(&tr->mutex); + + if (shim_link) + bpf_link_put(&shim_link->link.link); + + bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */ +} +#endif + struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { @@ -628,6 +801,31 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_ rcu_read_unlock(); } +u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) + __acquires(RCU) +{ + /* Runtime stats are exported via actual BPF_LSM_CGROUP + * programs, not the shims. + */ + rcu_read_lock(); + migrate_disable(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + + return NO_START_TIME; +} + +void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) + __releases(RCU) +{ + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + + migrate_enable(); + rcu_read_unlock(); +} + u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4938477912cd..df3ec6b05f05 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7322,6 +7322,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn reg_type_str(env, regs[BPF_REG_1].type)); return -EACCES; } + break; + case BPF_FUNC_set_retval: + if (env->prog->expected_attach_type == BPF_LSM_CGROUP) { + if (!env->prog->aux->attach_func_proto->type) { + /* Make sure programs that attach to void + * hooks don't try to modify return value. + */ + verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); + return -EINVAL; + } + } + break; } if (err) @@ -10527,6 +10539,22 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_SK_LOOKUP: range = tnum_range(SK_DROP, SK_PASS); break; + + case BPF_PROG_TYPE_LSM: + if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { + /* Regular BPF_PROG_TYPE_LSM programs can return + * any value. + */ + return 0; + } + if (!env->prog->aux->attach_func_proto->type) { + /* Make sure programs that attach to void + * hooks don't try to modify return value. + */ + range = tnum_range(1, 1); + } + break; + case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. @@ -10543,6 +10571,9 @@ static int check_return_code(struct bpf_verifier_env *env) if (!tnum_in(range, reg->var_off)) { verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); + if (prog->expected_attach_type == BPF_LSM_CGROUP && + !prog->aux->attach_func_proto->type) + verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; } @@ -14902,6 +14933,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, fallthrough; case BPF_MODIFY_RETURN: case BPF_LSM_MAC: + case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (!btf_type_is_func(t)) { -- cgit v1.2.3 From c0e19f2c9a3edd38e4b1bdae98eb44555d02bc31 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:07 -0700 Subject: bpf: minimize number of allocated lsm slots per program Previous patch adds 1:1 mapping between all 211 LSM hooks and bpf_cgroup program array. Instead of reserving a slot per possible hook, reserve 10 slots per cgroup for lsm programs. Those slots are dynamically allocated on demand and reclaimed. struct cgroup_bpf { struct bpf_prog_array * effective[33]; /* 0 264 */ /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */ struct hlist_head progs[33]; /* 264 264 */ /* --- cacheline 8 boundary (512 bytes) was 16 bytes ago --- */ u8 flags[33]; /* 528 33 */ /* XXX 7 bytes hole, try to pack */ struct list_head storages; /* 568 16 */ /* --- cacheline 9 boundary (576 bytes) was 8 bytes ago --- */ struct bpf_prog_array * inactive; /* 584 8 */ struct percpu_ref refcnt; /* 592 16 */ struct work_struct release_work; /* 608 72 */ /* size: 680, cachelines: 11, members: 7 */ /* sum members: 673, holes: 1, sum holes: 7 */ /* last cacheline: 40 bytes */ }; Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-5-sdf@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lsm.c | 5 ----- kernel/bpf/btf.c | 10 ---------- kernel/bpf/cgroup.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/core.c | 7 +++++++ kernel/bpf/trampoline.c | 1 + 5 files changed, 54 insertions(+), 16 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 0f72020bfdcf..83aa431dd52e 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -69,11 +69,6 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, *bpf_func = __cgroup_bpf_run_lsm_current; } -int bpf_lsm_hook_idx(u32 btf_id) -{ - return btf_id_set_index(&bpf_lsm_hooks, btf_id); -} - int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7c1fe422ed3f..8d3c7ab8af46 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6843,16 +6843,6 @@ static int btf_id_cmp_func(const void *a, const void *b) return *pa - *pb; } -int btf_id_set_index(const struct btf_id_set *set, u32 id) -{ - const u32 *p; - - p = bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func); - if (!p) - return -1; - return p - set->ids; -} - bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9cf41dd4f96f..169cbd0de797 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -127,12 +127,57 @@ unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, } #ifdef CONFIG_BPF_LSM +struct cgroup_lsm_atype { + u32 attach_btf_id; + int refcnt; +}; + +static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM]; + static enum cgroup_bpf_attach_type bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) { + int i; + + lockdep_assert_held(&cgroup_mutex); + if (attach_type != BPF_LSM_CGROUP) return to_cgroup_bpf_attach_type(attach_type); - return CGROUP_LSM_START + bpf_lsm_hook_idx(attach_btf_id); + + for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++) + if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id) + return CGROUP_LSM_START + i; + + for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++) + if (cgroup_lsm_atype[i].attach_btf_id == 0) + return CGROUP_LSM_START + i; + + return -E2BIG; + +} + +void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) +{ + int i = cgroup_atype - CGROUP_LSM_START; + + lockdep_assert_held(&cgroup_mutex); + + WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id && + cgroup_lsm_atype[i].attach_btf_id != attach_btf_id); + + cgroup_lsm_atype[i].attach_btf_id = attach_btf_id; + cgroup_lsm_atype[i].refcnt++; +} + +void bpf_cgroup_atype_put(int cgroup_atype) +{ + int i = cgroup_atype - CGROUP_LSM_START; + + mutex_lock(&cgroup_mutex); + if (--cgroup_lsm_atype[i].refcnt <= 0) + cgroup_lsm_atype[i].attach_btf_id = 0; + WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0); + mutex_unlock(&cgroup_mutex); } #else static enum cgroup_bpf_attach_type diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4cc10b942a3c..805c2ad5c793 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -107,6 +107,9 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); +#ifdef CONFIG_CGROUP_BPF + aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID; +#endif INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); mutex_init(&fp->aux->used_maps_mutex); @@ -2569,6 +2572,10 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); #ifdef CONFIG_BPF_SYSCALL bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab); +#endif +#ifdef CONFIG_CGROUP_BPF + if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID) + bpf_cgroup_atype_put(aux->cgroup_atype); #endif bpf_free_used_maps(aux); bpf_free_used_btfs(aux); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index d7c251d7fbcd..6cd226584c33 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -555,6 +555,7 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog bpf_prog_inc(p); bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, &bpf_shim_tramp_link_lops, p); + bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype); return shim_link; } -- cgit v1.2.3 From b79c9fc9551b45953a94abf550b7bd3b00e3a0f9 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:08 -0700 Subject: bpf: implement BPF_PROG_QUERY for BPF_LSM_CGROUP We have two options: 1. Treat all BPF_LSM_CGROUP the same, regardless of attach_btf_id 2. Treat BPF_LSM_CGROUP+attach_btf_id as a separate hook point I was doing (2) in the original patch, but switching to (1) here: * bpf_prog_query returns all attached BPF_LSM_CGROUP programs regardless of attach_btf_id * attach_btf_id is exported via bpf_prog_info Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-6-sdf@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 95 +++++++++++++++++++++++++++++++++++----------------- kernel/bpf/syscall.c | 8 ++++- 2 files changed, 71 insertions(+), 32 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 169cbd0de797..59b7eb60d5b4 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1017,57 +1017,90 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { + __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; + enum cgroup_bpf_attach_type from_atype, to_atype; enum cgroup_bpf_attach_type atype; struct bpf_prog_array *effective; - struct hlist_head *progs; - struct bpf_prog *prog; int cnt, ret = 0, i; + int total_cnt = 0; u32 flags; - atype = to_cgroup_bpf_attach_type(type); - if (atype < 0) - return -EINVAL; - - progs = &cgrp->bpf.progs[atype]; - flags = cgrp->bpf.flags[atype]; + if (type == BPF_LSM_CGROUP) { + if (attr->query.prog_cnt && prog_ids && !prog_attach_flags) + return -EINVAL; - effective = rcu_dereference_protected(cgrp->bpf.effective[atype], - lockdep_is_held(&cgroup_mutex)); + from_atype = CGROUP_LSM_START; + to_atype = CGROUP_LSM_END; + flags = 0; + } else { + from_atype = to_cgroup_bpf_attach_type(type); + if (from_atype < 0) + return -EINVAL; + to_atype = from_atype; + flags = cgrp->bpf.flags[from_atype]; + } - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) - cnt = bpf_prog_array_length(effective); - else - cnt = prog_list_length(progs); + for (atype = from_atype; atype <= to_atype; atype++) { + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + total_cnt += bpf_prog_array_length(effective); + } else { + total_cnt += prog_list_length(&cgrp->bpf.progs[atype]); + } + } if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) + if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) return -EFAULT; - if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) + if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) /* return early if user requested only program count + flags */ return 0; - if (attr->query.prog_cnt < cnt) { - cnt = attr->query.prog_cnt; + + if (attr->query.prog_cnt < total_cnt) { + total_cnt = attr->query.prog_cnt; ret = -ENOSPC; } - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { - return bpf_prog_array_copy_to_user(effective, prog_ids, cnt); - } else { - struct bpf_prog_list *pl; - u32 id; + for (atype = from_atype; atype <= to_atype && total_cnt; atype++) { + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + cnt = min_t(int, bpf_prog_array_length(effective), total_cnt); + ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt); + } else { + struct hlist_head *progs; + struct bpf_prog_list *pl; + struct bpf_prog *prog; + u32 id; + + progs = &cgrp->bpf.progs[atype]; + cnt = min_t(int, prog_list_length(progs), total_cnt); + i = 0; + hlist_for_each_entry(pl, progs, node) { + prog = prog_list_prog(pl); + id = prog->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) + return -EFAULT; + if (++i == cnt) + break; + } + } - i = 0; - hlist_for_each_entry(pl, progs, node) { - prog = prog_list_prog(pl); - id = prog->aux->id; - if (copy_to_user(prog_ids + i, &id, sizeof(id))) - return -EFAULT; - if (++i == cnt) - break; + if (prog_attach_flags) { + flags = cgrp->bpf.flags[atype]; + + for (i = 0; i < cnt; i++) + if (copy_to_user(prog_attach_flags + i, &flags, sizeof(flags))) + return -EFAULT; + prog_attach_flags += cnt; } + + prog_ids += cnt; + total_cnt -= cnt; } return ret; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 626b8f7d237b..ab688d85b2c6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3520,7 +3520,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) } } -#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt +#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -3556,6 +3556,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_SYSCTL: case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: + case BPF_LSM_CGROUP: return cgroup_bpf_prog_query(attr, uattr); case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); @@ -4066,6 +4067,11 @@ static int bpf_prog_get_info_by_fd(struct file *file, if (prog->aux->btf) info.btf_id = btf_obj_id(prog->aux->btf); + info.attach_btf_id = prog->aux->attach_btf_id; + if (prog->aux->attach_btf) + info.attach_btf_obj_id = btf_obj_id(prog->aux->attach_btf); + else if (prog->aux->dst_prog) + info.attach_btf_obj_id = btf_obj_id(prog->aux->dst_prog->aux->attach_btf); ulen = info.nr_func_info; info.nr_func_info = prog->aux->func_info_cnt; -- cgit v1.2.3 From 9113d7e48e9128522b9f5a54dfd30dff10509a92 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:09 -0700 Subject: bpf: expose bpf_{g,s}etsockopt to lsm cgroup I don't see how to make it nice without introducing btf id lists for the hooks where these helpers are allowed. Some LSM hooks work on the locked sockets, some are triggering early and don't grab any locks, so have two lists for now: 1. LSM hooks which trigger under socket lock - minority of the hooks, but ideal case for us, we can expose existing BTF-based helpers 2. LSM hooks which trigger without socket lock, but they trigger early in the socket creation path where it should be safe to do setsockopt without any locks 3. The rest are prohibited. I'm thinking that this use-case might be a good gateway to sleeping lsm cgroup hooks in the future. We can either expose lock/unlock operations (and add tracking to the verifier) or have another set of bpf_setsockopt wrapper that grab the locks and might sleep. Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-7-sdf@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lsm.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 83aa431dd52e..d469b7f3deef 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -45,6 +45,24 @@ BTF_ID(func, bpf_lsm_sk_alloc_security) BTF_ID(func, bpf_lsm_sk_free_security) BTF_SET_END(bpf_lsm_current_hooks) +/* List of LSM hooks that trigger while the socket is properly locked. + */ +BTF_SET_START(bpf_lsm_locked_sockopt_hooks) +BTF_ID(func, bpf_lsm_socket_sock_rcv_skb) +BTF_ID(func, bpf_lsm_sock_graft) +BTF_ID(func, bpf_lsm_inet_csk_clone) +BTF_ID(func, bpf_lsm_inet_conn_established) +BTF_SET_END(bpf_lsm_locked_sockopt_hooks) + +/* List of LSM hooks that trigger while the socket is _not_ locked, + * but it's ok to call bpf_{g,s}etsockopt because the socket is still + * in the early init phase. + */ +BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks) +BTF_ID(func, bpf_lsm_socket_post_create) +BTF_ID(func, bpf_lsm_socket_socketpair) +BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks) + void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func) { @@ -201,6 +219,26 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_retval: return prog->expected_attach_type == BPF_LSM_CGROUP ? &bpf_get_retval_proto : NULL; + case BPF_FUNC_setsockopt: + if (prog->expected_attach_type != BPF_LSM_CGROUP) + return NULL; + if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_sk_setsockopt_proto; + if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_unlocked_sk_setsockopt_proto; + return NULL; + case BPF_FUNC_getsockopt: + if (prog->expected_attach_type != BPF_LSM_CGROUP) + return NULL; + if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_sk_getsockopt_proto; + if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_unlocked_sk_getsockopt_proto; + return NULL; default: return tracing_prog_func_proto(func_id, prog); } -- cgit v1.2.3 From ec6209c8d42f815bc3bef10934637ca92114cd1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20M=C3=BCller?= Date: Tue, 28 Jun 2022 16:01:21 +0000 Subject: bpf, libbpf: Add type match support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds support for the proposed type match relation to relo_core where it is shared between userspace and kernel. It plumbs through both kernel-side and libbpf-side support. The matching relation is defined as follows (copy from source): - modifiers and typedefs are stripped (and, hence, effectively ignored) - generally speaking types need to be of same kind (struct vs. struct, union vs. union, etc.) - exceptions are struct/union behind a pointer which could also match a forward declaration of a struct or union, respectively, and enum vs. enum64 (see below) Then, depending on type: - integers: - match if size and signedness match - arrays & pointers: - target types are recursively matched - structs & unions: - local members need to exist in target with the same name - for each member we recursively check match unless it is already behind a pointer, in which case we only check matching names and compatible kind - enums: - local variants have to have a match in target by symbolic name (but not numeric value) - size has to match (but enum may match enum64 and vice versa) - function pointers: - number and position of arguments in local type has to match target - for each argument and the return value we recursively check match Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220628160127.607834-5-deso@posteo.net --- kernel/bpf/btf.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8d3c7ab8af46..4f2408a4df08 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7443,6 +7443,15 @@ int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, MAX_TYPES_ARE_COMPAT_DEPTH); } +#define MAX_TYPES_MATCH_DEPTH 2 + +int bpf_core_types_match(const struct btf *local_btf, u32 local_id, + const struct btf *targ_btf, u32 targ_id) +{ + return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false, + MAX_TYPES_MATCH_DEPTH); +} + static bool bpf_core_is_flavor_sep(const char *s) { /* check X___Y name pattern, where X and Y are not underscores */ -- cgit v1.2.3 From d1a6edecc1fddfb6ef92c8f720631d2c02bf2744 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 Jul 2022 10:50:00 -0700 Subject: bpf: Check attach_func_proto more carefully in check_return_code Syzkaller reports the following crash: RIP: 0010:check_return_code kernel/bpf/verifier.c:10575 [inline] RIP: 0010:do_check kernel/bpf/verifier.c:12346 [inline] RIP: 0010:do_check_common+0xb3d2/0xd250 kernel/bpf/verifier.c:14610 With the following reproducer: bpf$PROG_LOAD_XDP(0x5, &(0x7f00000004c0)={0xd, 0x3, &(0x7f0000000000)=ANY=[@ANYBLOB="1800000000000019000000000000000095"], &(0x7f0000000300)='GPL\x00', 0x0, 0x0, 0x0, 0x0, 0x0, '\x00', 0x0, 0x2b, 0xffffffffffffffff, 0x8, 0x0, 0x0, 0x10, 0x0}, 0x80) Because we don't enforce expected_attach_type for XDP programs, we end up in hitting 'if (prog->expected_attach_type == BPF_LSM_CGROUP' part in check_return_code and follow up with testing `prog->aux->attach_func_proto->type`, but `prog->aux->attach_func_proto` is NULL. Add explicit prog_type check for the "Note, BPF_LSM_CGROUP that attach ..." condition. Also, don't skip return code check for LSM/STRUCT_OPS. The above actually brings an issue with existing selftest which tries to return EPERM from void inet_csk_clone. Fix the test (and move called_socket_clone to make sure it's not incremented in case of an error) and add a new one to explicitly verify this condition. Fixes: 69fd337a975c ("bpf: per-cgroup lsm flavor") Reported-by: syzbot+5cc0730bd4b4d2c5f152@syzkaller.appspotmail.com Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220708175000.2603078-1-sdf@google.com --- kernel/bpf/verifier.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df3ec6b05f05..e3cf6194c24f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10444,11 +10444,21 @@ static int check_return_code(struct bpf_verifier_env *env) const bool is_subprog = frame->subprogno; /* LSM and struct_ops func-ptr's return type could be "void" */ - if (!is_subprog && - (prog_type == BPF_PROG_TYPE_STRUCT_OPS || - prog_type == BPF_PROG_TYPE_LSM) && - !prog->aux->attach_func_proto->type) - return 0; + if (!is_subprog) { + switch (prog_type) { + case BPF_PROG_TYPE_LSM: + if (prog->expected_attach_type == BPF_LSM_CGROUP) + /* See below, can be 0 or 0-1 depending on hook. */ + break; + fallthrough; + case BPF_PROG_TYPE_STRUCT_OPS: + if (!prog->aux->attach_func_proto->type) + return 0; + break; + default: + break; + } + } /* eBPF calling convention is such that R0 is used * to return the value from eBPF program. @@ -10572,6 +10582,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (!tnum_in(range, reg->var_off)) { verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); if (prog->expected_attach_type == BPF_LSM_CGROUP && + prog_type == BPF_PROG_TYPE_LSM && !prog->aux->attach_func_proto->type) verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; -- cgit v1.2.3 From 535a57a7ffc04932ad83c1a5649b09ba6c93ce83 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Mon, 11 Jul 2022 11:08:20 -0400 Subject: bpf: Remove is_valid_bpf_tramp_flags() Before generating bpf trampoline, x86 calls is_valid_bpf_tramp_flags() to check the input flags. This check is architecture independent. So, to be consistent with x86, arm64 should also do this check before generating bpf trampoline. However, the BPF_TRAMP_F_XXX flags are not used by user code and the flags argument is almost constant at compile time, so this run time check is a bit redundant. Remove is_valid_bpf_tramp_flags() and add some comments to the usage of BPF_TRAMP_F_XXX flags, as suggested by Alexei. Signed-off-by: Xu Kuohai Signed-off-by: Daniel Borkmann Reviewed-by: Jean-Philippe Brucker Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20220711150823.2128542-2-xukuohai@huawei.com --- kernel/bpf/bpf_struct_ops.c | 3 +++ kernel/bpf/trampoline.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 7e0068c3399c..84b2d9dba79a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -341,6 +341,9 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, tlinks[BPF_TRAMP_FENTRY].links[0] = link; tlinks[BPF_TRAMP_FENTRY].nr_links = 1; + /* BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops, + * and it must be used alone. + */ flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0; return arch_prepare_bpf_trampoline(NULL, image, image_end, model, flags, tlinks, NULL); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 6cd226584c33..fd69812412ca 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -360,6 +360,9 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) if (tlinks[BPF_TRAMP_FEXIT].nr_links || tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) + /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME + * should not be set together. + */ flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; if (ip_arg) -- cgit v1.2.3 From 4201d9ab3e42d9e2a20320b751a931e6239c0df2 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 11 Jul 2022 09:28:27 -0700 Subject: bpf: reparent bpf maps on memcg offlining The memory consumed by a bpf map is always accounted to the memory cgroup of the process which created the map. The map can outlive the memory cgroup if it's used by processes in other cgroups or is pinned on bpffs. In this case the map pins the original cgroup in the dying state. For other types of objects (slab objects, non-slab kernel allocations, percpu objects and recently LRU pages) there is a reparenting process implemented: on cgroup offlining charged objects are getting reassigned to the parent cgroup. Because all charges and statistics are fully recursive it's a fairly cheap operation. For efficiency and consistency with other types of objects, let's do the same for bpf maps. Fortunately thanks to the objcg API, the required changes are minimal. Please, note that individual allocations (slabs, percpu and large kmallocs) already have the reparenting mechanism. This commit adds it to the saved map->memcg pointer by replacing it to map->objcg. Because dying cgroups are not visible for a user and all charges are recursive, this commit doesn't bring any behavior changes for a user. v2: added a missing const qualifier Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Link: https://lore.kernel.org/r/20220711162827.184743-1-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ab688d85b2c6..83c7136c5788 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -419,35 +419,53 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) #ifdef CONFIG_MEMCG_KMEM static void bpf_map_save_memcg(struct bpf_map *map) { - map->memcg = get_mem_cgroup_from_mm(current->mm); + /* Currently if a map is created by a process belonging to the root + * memory cgroup, get_obj_cgroup_from_current() will return NULL. + * So we have to check map->objcg for being NULL each time it's + * being used. + */ + map->objcg = get_obj_cgroup_from_current(); } static void bpf_map_release_memcg(struct bpf_map *map) { - mem_cgroup_put(map->memcg); + if (map->objcg) + obj_cgroup_put(map->objcg); +} + +static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) +{ + if (map->objcg) + return get_mem_cgroup_from_objcg(map->objcg); + + return root_mem_cgroup; } void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node) { - struct mem_cgroup *old_memcg; + struct mem_cgroup *memcg, *old_memcg; void *ptr; - old_memcg = set_active_memcg(map->memcg); + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); set_active_memcg(old_memcg); + mem_cgroup_put(memcg); return ptr; } void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) { - struct mem_cgroup *old_memcg; + struct mem_cgroup *memcg, *old_memcg; void *ptr; - old_memcg = set_active_memcg(map->memcg); + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); ptr = kzalloc(size, flags | __GFP_ACCOUNT); set_active_memcg(old_memcg); + mem_cgroup_put(memcg); return ptr; } @@ -455,12 +473,14 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags) { - struct mem_cgroup *old_memcg; + struct mem_cgroup *memcg, *old_memcg; void __percpu *ptr; - old_memcg = set_active_memcg(map->memcg); + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); set_active_memcg(old_memcg); + mem_cgroup_put(memcg); return ptr; } -- cgit v1.2.3 From 1d5f82d9dd477d5c66e0214a68c3e4f308eadd6d Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 5 Jul 2022 17:26:12 -0700 Subject: bpf, x86: fix freeing of not-finalized bpf_prog_pack syzbot reported a few issues with bpf_prog_pack [1], [2]. This only happens with multiple subprogs. In jit_subprogs(), we first call bpf_int_jit_compile() on each sub program. And then, we call it on each sub program again. jit_data is not freed in the first call of bpf_int_jit_compile(). Similarly we don't call bpf_jit_binary_pack_finalize() in the first call of bpf_int_jit_compile(). If bpf_int_jit_compile() failed for one sub program, we will call bpf_jit_binary_pack_finalize() for this sub program. However, we don't have a chance to call it for other sub programs. Then we will hit "goto out_free" in jit_subprogs(), and call bpf_jit_free on some subprograms that haven't got bpf_jit_binary_pack_finalize() yet. At this point, bpf_jit_binary_pack_free() is called and the whole 2MB page is freed erroneously. Fix this with a custom bpf_jit_free() for x86_64, which calls bpf_jit_binary_pack_finalize() if necessary. Also, with custom bpf_jit_free(), bpf_prog_aux->use_bpf_prog_pack is not needed any more, remove it. Fixes: 1022a5498f6f ("bpf, x86_64: Use bpf_jit_binary_pack_alloc") [1] https://syzkaller.appspot.com/bug?extid=2f649ec6d2eea1495a8f [2] https://syzkaller.appspot.com/bug?extid=87f65c75f4a72db05445 Reported-by: syzbot+2f649ec6d2eea1495a8f@syzkaller.appspotmail.com Reported-by: syzbot+87f65c75f4a72db05445@syzkaller.appspotmail.com Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20220706002612.4013790-1-song@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 805c2ad5c793..cfb8a50a9f12 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -650,12 +650,6 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) return fp->jited && !bpf_prog_was_classic(fp); } -static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) -{ - return list_empty(&fp->aux->ksym.lnode) || - fp->aux->ksym.lnode.prev == LIST_POISON2; -} - void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || @@ -1153,7 +1147,6 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, bpf_prog_pack_free(ro_header); return PTR_ERR(ptr); } - prog->aux->use_bpf_prog_pack = true; return 0; } @@ -1177,17 +1170,23 @@ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, bpf_jit_uncharge_modmem(size); } +struct bpf_binary_header * +bpf_jit_binary_pack_hdr(const struct bpf_prog *fp) +{ + unsigned long real_start = (unsigned long)fp->bpf_func; + unsigned long addr; + + addr = real_start & BPF_PROG_CHUNK_MASK; + return (void *)addr; +} + static inline struct bpf_binary_header * bpf_jit_binary_hdr(const struct bpf_prog *fp) { unsigned long real_start = (unsigned long)fp->bpf_func; unsigned long addr; - if (fp->aux->use_bpf_prog_pack) - addr = real_start & BPF_PROG_CHUNK_MASK; - else - addr = real_start & PAGE_MASK; - + addr = real_start & PAGE_MASK; return (void *)addr; } @@ -1200,11 +1199,7 @@ void __weak bpf_jit_free(struct bpf_prog *fp) if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - if (fp->aux->use_bpf_prog_pack) - bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */); - else - bpf_jit_binary_free(hdr); - + bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); } -- cgit v1.2.3 From ace2bee839e08df324cb320763258dfd72e6120e Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sat, 9 Jul 2022 15:44:56 +0000 Subject: bpf: Make non-preallocated allocation low priority GFP_ATOMIC doesn't cooperate well with memcg pressure so far, especially if we allocate too much GFP_ATOMIC memory. For example, when we set the memcg limit to limit a non-preallocated bpf memory, the GFP_ATOMIC can easily break the memcg limit by force charge. So it is very dangerous to use GFP_ATOMIC in non-preallocated case. One way to make it safe is to remove __GFP_HIGH from GFP_ATOMIC, IOW, use (__GFP_ATOMIC | __GFP_KSWAPD_RECLAIM) instead, then it will be limited if we allocate too much memory. There's a plan to completely remove __GFP_ATOMIC in the mm side[1], so let's use GFP_NOWAIT instead. We introduced BPF_F_NO_PREALLOC is because full map pre-allocation is too memory expensive for some cases. That means removing __GFP_HIGH doesn't break the rule of BPF_F_NO_PREALLOC, but has the same goal with it-avoiding issues caused by too much memory. So let's remove it. This fix can also apply to other run-time allocations, for example, the allocation in lpm trie, local storage and devmap. So let fix it consistently over the bpf code It also fixes a typo in the comment. [1]. https://lore.kernel.org/linux-mm/163712397076.13692.4727608274002939094@noble.neil.brown.name/ Cc: Roman Gushchin Cc: Shakeel Butt Cc: NeilBrown Signed-off-by: Yafang Shao Reviewed-by: Shakeel Butt Link: https://lore.kernel.org/r/20220709154457.57379-2-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 2 +- kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/local_storage.c | 2 +- kernel/bpf/lpm_trie.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index c2867068e5bd..1400561efb15 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -845,7 +845,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab_netdev *dev; dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), - GFP_ATOMIC | __GFP_NOWARN, + GFP_NOWAIT | __GFP_NOWARN, dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 17fb69c0e0dc..da7578426a46 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -61,7 +61,7 @@ * * As regular device interrupt handlers and soft interrupts are forced into * thread context, the existing code which does - * spin_lock*(); alloc(GPF_ATOMIC); spin_unlock*(); + * spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*(); * just works. * * In theory the BPF locks could be converted to regular spinlocks as well, @@ -978,7 +978,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, goto dec_count; } l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, - GFP_ATOMIC | __GFP_NOWARN, + GFP_NOWAIT | __GFP_NOWARN, htab->map.numa_node); if (!l_new) { l_new = ERR_PTR(-ENOMEM); @@ -996,7 +996,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } else { /* alloc_percpu zero-fills */ pptr = bpf_map_alloc_percpu(&htab->map, size, 8, - GFP_ATOMIC | __GFP_NOWARN); + GFP_NOWAIT | __GFP_NOWARN); if (!pptr) { kfree(l_new); l_new = ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 8654fc97f5fe..49ef0ce040c7 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -165,7 +165,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, } new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size), - __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, + __GFP_ZERO | GFP_NOWAIT | __GFP_NOWARN, map->numa_node); if (!new) return -ENOMEM; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index f0d05a3cc4b9..d789e3b831ad 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -285,7 +285,7 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, if (value) size += trie->map.value_size; - node = bpf_map_kmalloc_node(&trie->map, size, GFP_ATOMIC | __GFP_NOWARN, + node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN, trie->map.numa_node); if (!node) return NULL; -- cgit v1.2.3 From 8ab4cdcf03d0b060fbf73f76460f199bbd759ff7 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 12 Jul 2022 14:06:03 -0700 Subject: bpf: Tidy up verifier check_func_arg() This patch does two things: 1. For matching against the arg type, the match should be against the base type of the arg type, since the arg type can have different bpf_type_flags set on it. 2. Uses switch casing to improve readability + efficiency. Signed-off-by: Joanne Koong Acked-by: Hao Luo Link: https://lore.kernel.org/r/20220712210603.123791-1-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 66 +++++++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 28 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 328cfab3af60..26e7e787c20a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5533,17 +5533,6 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } -static bool arg_type_is_alloc_size(enum bpf_arg_type type) -{ - return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; -} - -static bool arg_type_is_int_ptr(enum bpf_arg_type type) -{ - return type == ARG_PTR_TO_INT || - type == ARG_PTR_TO_LONG; -} - static bool arg_type_is_release(enum bpf_arg_type type) { return type & OBJ_RELEASE; @@ -5929,7 +5918,8 @@ skip_type_check: meta->ref_obj_id = reg->ref_obj_id; } - if (arg_type == ARG_CONST_MAP_PTR) { + switch (base_type(arg_type)) { + case ARG_CONST_MAP_PTR: /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ if (meta->map_ptr) { /* Use map_uid (which is unique id of inner map) to reject: @@ -5954,7 +5944,8 @@ skip_type_check: } meta->map_ptr = reg->map_ptr; meta->map_uid = reg->map_uid; - } else if (arg_type == ARG_PTR_TO_MAP_KEY) { + break; + case ARG_PTR_TO_MAP_KEY: /* bpf_map_xxx(..., map_ptr, ..., key) call: * check that [key, key + map->key_size) are within * stack limits and initialized @@ -5971,7 +5962,8 @@ skip_type_check: err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, false, NULL); - } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) { + break; + case ARG_PTR_TO_MAP_VALUE: if (type_may_be_null(arg_type) && register_is_null(reg)) return 0; @@ -5987,14 +5979,16 @@ skip_type_check: err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); - } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) { + break; + case ARG_PTR_TO_PERCPU_BTF_ID: if (!reg->btf_id) { verbose(env, "Helper has invalid btf_id in R%d\n", regno); return -EACCES; } meta->ret_btf = reg->btf; meta->ret_btf_id = reg->btf_id; - } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { + break; + case ARG_PTR_TO_SPIN_LOCK: if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) return -EACCES; @@ -6005,12 +5999,15 @@ skip_type_check: verbose(env, "verifier internal error\n"); return -EFAULT; } - } else if (arg_type == ARG_PTR_TO_TIMER) { + break; + case ARG_PTR_TO_TIMER: if (process_timer_func(env, regno, meta)) return -EACCES; - } else if (arg_type == ARG_PTR_TO_FUNC) { + break; + case ARG_PTR_TO_FUNC: meta->subprogno = reg->subprogno; - } else if (base_type(arg_type) == ARG_PTR_TO_MEM) { + break; + case ARG_PTR_TO_MEM: /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. */ @@ -6020,11 +6017,14 @@ skip_type_check: fn->arg_size[arg], false, meta); } - } else if (arg_type_is_mem_size(arg_type)) { - bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - - err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta); - } else if (arg_type_is_dynptr(arg_type)) { + break; + case ARG_CONST_SIZE: + err = check_mem_size_reg(env, reg, regno, false, meta); + break; + case ARG_CONST_SIZE_OR_ZERO: + err = check_mem_size_reg(env, reg, regno, true, meta); + break; + case ARG_PTR_TO_DYNPTR: if (arg_type & MEM_UNINIT) { if (!is_dynptr_reg_valid_uninit(env, reg)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); @@ -6058,21 +6058,28 @@ skip_type_check: err_extra, arg + 1); return -EINVAL; } - } else if (arg_type_is_alloc_size(arg_type)) { + break; + case ARG_CONST_ALLOC_SIZE_OR_ZERO: if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d is not a known constant'\n", regno); return -EACCES; } meta->mem_size = reg->var_off.value; - } else if (arg_type_is_int_ptr(arg_type)) { + break; + case ARG_PTR_TO_INT: + case ARG_PTR_TO_LONG: + { int size = int_ptr_type_to_size(arg_type); err = check_helper_mem_access(env, regno, size, false, meta); if (err) return err; err = check_ptr_alignment(env, reg, 0, size, true); - } else if (arg_type == ARG_PTR_TO_CONST_STR) { + break; + } + case ARG_PTR_TO_CONST_STR: + { struct bpf_map *map = reg->map_ptr; int map_off; u64 map_addr; @@ -6111,9 +6118,12 @@ skip_type_check: verbose(env, "string is not zero-terminated\n"); return -EINVAL; } - } else if (arg_type == ARG_PTR_TO_KPTR) { + break; + } + case ARG_PTR_TO_KPTR: if (process_kptr_func(env, regno, meta)) return -EACCES; + break; } return err; -- cgit v1.2.3 From 5002615a37b1e23a4b51c386ee22c8f90a70b4dd Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 13 Jul 2022 16:09:36 +0000 Subject: bpf: Warn on non-preallocated case for BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE is also tracing type, which may cause unexpected memory allocation if we set BPF_F_NO_PREALLOC. Let's also warn on it similar as we do in case of BPF_PROG_TYPE_RAW_TRACEPOINT. Signed-off-by: Yafang Shao Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220713160936.57488-1-laoar.shao@gmail.com --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 26e7e787c20a..47fd6efa102a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12572,6 +12572,7 @@ static bool is_tracing_prog_type(enum bpf_prog_type type) case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: return true; default: return false; -- cgit v1.2.3 From 9c7c48d6a1e2eb5192ad5294c1c4dbd42a88e88b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Jul 2022 14:16:37 -0700 Subject: bpf: Fix subprog names in stack traces. The commit 7337224fc150 ("bpf: Improve the info.func_info and info.func_info_rec_size behavior") accidently made bpf_prog_ksym_set_name() conservative for bpf subprograms. Fixed it so instead of "bpf_prog_tag_F" the stack traces print "bpf_prog_tag_full_subprog_name". Fixes: 7337224fc150 ("bpf: Improve the info.func_info and info.func_info_rec_size behavior") Reported-by: Tejun Heo Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220714211637.17150-1-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 47fd6efa102a..c59c3df0fea6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13631,6 +13631,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; func[i]->aux->func_info = prog->aux->func_info; + func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; func[i]->aux->poke_tab = prog->aux->poke_tab; func[i]->aux->size_poke_tab = prog->aux->size_poke_tab; @@ -13643,9 +13644,6 @@ static int jit_subprogs(struct bpf_verifier_env *env) poke->aux = func[i]->aux; } - /* Use bpf_prog_F_tag to indicate functions in stack traces. - * Long term would need debug info to populate names - */ func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; -- cgit v1.2.3 From a2a5580fcbf808e7c2310e4959b62f9d2157fdb6 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 14 Jul 2022 11:03:22 +0100 Subject: bpf: Fix check against plain integer v 'NULL' When checking with sparse, btf_show_type_value() is causing a warning about checking integer vs NULL when the macro is passed a pointer, due to the 'value != 0' check. Stop sparse complaining about any type-casting by adding a cast to the typeof(value). This fixes the following sparse warnings: kernel/bpf/btf.c:2579:17: warning: Using plain integer as NULL pointer kernel/bpf/btf.c:2581:17: warning: Using plain integer as NULL pointer kernel/bpf/btf.c:3407:17: warning: Using plain integer as NULL pointer kernel/bpf/btf.c:3758:9: warning: Using plain integer as NULL pointer Signed-off-by: Ben Dooks Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220714100322.260467-1-ben.dooks@sifive.com --- kernel/bpf/btf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4423045b8ff3..5869f03bcb6e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1116,7 +1116,8 @@ __printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...) */ #define btf_show_type_value(show, fmt, value) \ do { \ - if ((value) != 0 || (show->flags & BTF_SHOW_ZERO) || \ + if ((value) != (__typeof__(value))0 || \ + (show->flags & BTF_SHOW_ZERO) || \ show->state.depth == 0) { \ btf_show(show, "%s%s" fmt "%s%s", \ btf_show_indent(show), \ -- cgit v1.2.3 From 3848636b4a88f0706f9ce48d532163244abadd43 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 14 Jul 2022 10:46:12 +0800 Subject: bpf: iterators: Build and use lightweight bootstrap version of bpftool kernel/bpf/preload/iterators use bpftool for vmlinux.h, skeleton, and static linking only. So we can use lightweight bootstrap version of bpftool to handle these, and it will be faster. Suggested-by: Andrii Nakryiko Signed-off-by: Pu Lehui Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220714024612.944071-4-pulehui@huawei.com --- kernel/bpf/preload/iterators/Makefile | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile index bfe24f8c5a20..6762b1260f2f 100644 --- a/kernel/bpf/preload/iterators/Makefile +++ b/kernel/bpf/preload/iterators/Makefile @@ -9,7 +9,7 @@ LLVM_STRIP ?= llvm-strip TOOLS_PATH := $(abspath ../../../../tools) BPFTOOL_SRC := $(TOOLS_PATH)/bpf/bpftool BPFTOOL_OUTPUT := $(abs_out)/bpftool -DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool +DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)/bootstrap/bpftool BPFTOOL ?= $(DEFAULT_BPFTOOL) LIBBPF_SRC := $(TOOLS_PATH)/lib/bpf @@ -61,9 +61,5 @@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OU OUTPUT=$(abspath $(dir $@))/ prefix= \ DESTDIR=$(LIBBPF_DESTDIR) $(abspath $@) install_headers -$(DEFAULT_BPFTOOL): $(BPFOBJ) | $(BPFTOOL_OUTPUT) - $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) \ - OUTPUT=$(BPFTOOL_OUTPUT)/ \ - LIBBPF_OUTPUT=$(LIBBPF_OUTPUT)/ \ - LIBBPF_DESTDIR=$(LIBBPF_DESTDIR)/ \ - prefix= DESTDIR=$(abs_out)/ install-bin +$(DEFAULT_BPFTOOL): | $(BPFTOOL_OUTPUT) + $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) OUTPUT=$(BPFTOOL_OUTPUT)/ bootstrap -- cgit v1.2.3 From 3908fcddc65d04e069b03be49b33fae90e424631 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 14 Jul 2022 11:54:04 -0700 Subject: bpf: fix lsm_cgroup build errors on esoteric configs This particular ones is about having the following: CONFIG_BPF_LSM=y # CONFIG_CGROUP_BPF is not set Also, add __maybe_unused to the args for the !CONFIG_NET cases. Reported-by: kernel test robot Signed-off-by: Stanislav Fomichev Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20220714185404.3647772-1-sdf@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lsm.c | 8 ++++++-- kernel/bpf/trampoline.c | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index d469b7f3deef..fa71d58b7ded 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -63,10 +63,11 @@ BTF_ID(func, bpf_lsm_socket_post_create) BTF_ID(func, bpf_lsm_socket_socketpair) BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks) +#ifdef CONFIG_CGROUP_BPF void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func) { - const struct btf_param *args; + const struct btf_param *args __maybe_unused; if (btf_type_vlen(prog->aux->attach_func_proto) < 1 || btf_id_set_contains(&bpf_lsm_current_hooks, @@ -75,9 +76,9 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, return; } +#ifdef CONFIG_NET args = btf_params(prog->aux->attach_func_proto); -#ifdef CONFIG_NET if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET]) *bpf_func = __cgroup_bpf_run_lsm_socket; else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK]) @@ -86,6 +87,7 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, #endif *bpf_func = __cgroup_bpf_run_lsm_current; } +#endif int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) @@ -219,6 +221,7 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_retval: return prog->expected_attach_type == BPF_LSM_CGROUP ? &bpf_get_retval_proto : NULL; +#ifdef CONFIG_NET case BPF_FUNC_setsockopt: if (prog->expected_attach_type != BPF_LSM_CGROUP) return NULL; @@ -239,6 +242,7 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) prog->aux->attach_btf_id)) return &bpf_unlocked_sk_getsockopt_proto; return NULL; +#endif default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index fd69812412ca..6691dbf9e467 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -501,7 +501,7 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin return err; } -#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) +#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) static void bpf_shim_tramp_link_release(struct bpf_link *link) { struct bpf_shim_tramp_link *shim_link = -- cgit v1.2.3 From 87ac0d600943994444e24382a87aa19acc4cd3d4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Jul 2022 22:31:43 -0700 Subject: bpf: fix potential 32-bit overflow when accessing ARRAY map element If BPF array map is bigger than 4GB, element pointer calculation can overflow because both index and elem_size are u32. Fix this everywhere by forcing 64-bit multiplication. Extract this formula into separate small helper and use it consistently in various places. Speculative-preventing formula utilizing index_mask trick is left as is, but explicit u64 casts are added in both places. Fixes: c85d69135a91 ("bpf: move memory size checks to bpf_map_charge_init()") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220715053146.1291891-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index fe40d3b9458f..1d05d63e6fa5 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -156,6 +156,11 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return &array->map; } +static void *array_map_elem_ptr(struct bpf_array* array, u32 index) +{ + return array->value + (u64)array->elem_size * index; +} + /* Called from syscall or from eBPF program */ static void *array_map_lookup_elem(struct bpf_map *map, void *key) { @@ -165,7 +170,7 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return array->value + array->elem_size * (index & array->index_mask); + return array->value + (u64)array->elem_size * (index & array->index_mask); } static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, @@ -339,7 +344,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, value, map->value_size); } else { val = array->value + - array->elem_size * (index & array->index_mask); + (u64)array->elem_size * (index & array->index_mask); if (map_flags & BPF_F_LOCK) copy_map_value_locked(map, val, value, false); else @@ -408,8 +413,7 @@ static void array_map_free_timers(struct bpf_map *map) return; for (i = 0; i < array->map.max_entries; i++) - bpf_timer_cancel_and_free(array->value + array->elem_size * i + - map->timer_off); + bpf_timer_cancel_and_free(array_map_elem_ptr(array, i) + map->timer_off); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -420,7 +424,7 @@ static void array_map_free(struct bpf_map *map) if (map_value_has_kptrs(map)) { for (i = 0; i < array->map.max_entries; i++) - bpf_map_free_kptrs(map, array->value + array->elem_size * i); + bpf_map_free_kptrs(map, array_map_elem_ptr(array, i)); bpf_map_free_kptr_off_tab(map); } @@ -556,7 +560,7 @@ static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos) index = info->index & array->index_mask; if (info->percpu_value_buf) return array->pptrs[index]; - return array->value + array->elem_size * index; + return array_map_elem_ptr(array, index); } static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -575,7 +579,7 @@ static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) index = info->index & array->index_mask; if (info->percpu_value_buf) return array->pptrs[index]; - return array->value + array->elem_size * index; + return array_map_elem_ptr(array, index); } static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) @@ -690,7 +694,7 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_ if (is_percpu) val = this_cpu_ptr(array->pptrs[i]); else - val = array->value + array->elem_size * i; + val = array_map_elem_ptr(array, i); num_elems++; key = i; ret = callback_fn((u64)(long)map, (u64)(long)&key, -- cgit v1.2.3 From d937bc3449fa868cbeaf5c87576f9929b765c1e0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Jul 2022 22:31:44 -0700 Subject: bpf: make uniform use of array->elem_size everywhere in arraymap.c BPF_MAP_TYPE_ARRAY is rounding value_size to closest multiple of 8 and stores that as array->elem_size for various memory allocations and accesses. But the code tends to re-calculate round_up(map->value_size, 8) in multiple places instead of using array->elem_size. Cleaning this up and making sure we always use array->size to avoid duplication of this (admittedly simple) logic for consistency. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220715053146.1291891-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1d05d63e6fa5..98ee09155151 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -208,7 +208,7 @@ static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; - u32 elem_size = round_up(map->value_size, 8); + u32 elem_size = array->elem_size; const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; @@ -277,7 +277,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) * access 'value_size' of them, so copying rounded areas * will not leak any kernel data */ - size = round_up(map->value_size, 8); + size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { @@ -381,7 +381,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, * returned or zeros which were zero-filled by percpu_alloc, * so no kernel data leaks possible */ - size = round_up(map->value_size, 8); + size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { @@ -587,6 +587,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) struct bpf_iter_seq_array_map_info *info = seq->private; struct bpf_iter__bpf_map_elem ctx = {}; struct bpf_map *map = info->map; + struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_iter_meta meta; struct bpf_prog *prog; int off = 0, cpu = 0; @@ -607,7 +608,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) ctx.value = v; } else { pptr = v; - size = round_up(map->value_size, 8); + size = array->elem_size; for_each_possible_cpu(cpu) { bpf_long_memcpy(info->percpu_value_buf + off, per_cpu_ptr(pptr, cpu), @@ -637,11 +638,12 @@ static int bpf_iter_init_array_map(void *priv_data, { struct bpf_iter_seq_array_map_info *seq_info = priv_data; struct bpf_map *map = aux->map; + struct bpf_array *array = container_of(map, struct bpf_array, map); void *value_buf; u32 buf_size; if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - buf_size = round_up(map->value_size, 8) * num_possible_cpus(); + buf_size = array->elem_size * num_possible_cpus(); value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); if (!value_buf) return -ENOMEM; @@ -1326,7 +1328,7 @@ static int array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); - u32 elem_size = round_up(map->value_size, 8); + u32 elem_size = array->elem_size; struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; -- cgit v1.2.3 From 63b8ce77b15ebf69c4b0ef4b87451e2626aa3c43 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Jul 2022 22:31:45 -0700 Subject: bpf: remove obsolete KMALLOC_MAX_SIZE restriction on array map value size Syscall-side map_lookup_elem() and map_update_elem() used to use kmalloc() to allocate temporary buffers of value_size, so KMALLOC_MAX_SIZE limit on value_size made sense to prevent creation of array map that won't be accessible through syscall interface. But this limitation since has been lifted by relying on kvmalloc() in syscall handling code. So remove KMALLOC_MAX_SIZE, which among other things means that it's possible to have BPF global variable sections (.bss, .data, .rodata) bigger than 8MB now. Keep the sanity check to prevent trivial overflows like round_up(map->value_size, 8) and restrict value size to <= INT_MAX (2GB). Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220715053146.1291891-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 98ee09155151..d3e734bf8056 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -70,10 +70,8 @@ int array_map_alloc_check(union bpf_attr *attr) attr->map_flags & BPF_F_PRESERVE_ELEMS) return -EINVAL; - if (attr->value_size > KMALLOC_MAX_SIZE) - /* if value_size is bigger, the user space won't be able to - * access the elements. - */ + /* avoid overflow on round_up(map->value_size) */ + if (attr->value_size > INT_MAX) return -E2BIG; return 0; -- cgit v1.2.3 From aef9d4a34a51f0a50b4cc04c635955b37972fc90 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 20 Jul 2022 09:47:29 -0700 Subject: bpf: Check attach_func_proto more carefully in check_helper_call Syzkaller found a problem similar to d1a6edecc1fd ("bpf: Check attach_func_proto more carefully in check_return_code") where attach_func_proto might be NULL: RIP: 0010:check_helper_call+0x3dcb/0x8d50 kernel/bpf/verifier.c:7330 do_check kernel/bpf/verifier.c:12302 [inline] do_check_common+0x6e1e/0xb980 kernel/bpf/verifier.c:14610 do_check_main kernel/bpf/verifier.c:14673 [inline] bpf_check+0x661e/0xc520 kernel/bpf/verifier.c:15243 bpf_prog_load+0x11ae/0x1f80 kernel/bpf/syscall.c:2620 With the following reproducer: bpf$BPF_PROG_RAW_TRACEPOINT_LOAD(0x5, &(0x7f0000000780)={0xf, 0x4, &(0x7f0000000040)=@framed={{}, [@call={0x85, 0x0, 0x0, 0xbb}]}, &(0x7f0000000000)='GPL\x00', 0x0, 0x0, 0x0, 0x0, 0x0, '\x00', 0x0, 0x2b, 0xffffffffffffffff, 0x8, 0x0, 0x0, 0x10, 0x0}, 0x80) Let's do the same here, only check attach_func_proto for the prog types where we are certain that attach_func_proto is defined. Fixes: 69fd337a975c ("bpf: per-cgroup lsm flavor") Reported-by: syzbot+0f8d989b1fba1addc5e0@syzkaller.appspotmail.com Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220720164729.147544-1-sdf@google.com --- kernel/bpf/verifier.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c59c3df0fea6..7c1e056624f9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7170,6 +7170,7 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); const struct bpf_func_proto *fn = NULL; enum bpf_return_type ret_type; enum bpf_type_flag ret_flag; @@ -7331,7 +7332,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } break; case BPF_FUNC_set_retval: - if (env->prog->expected_attach_type == BPF_LSM_CGROUP) { + if (prog_type == BPF_PROG_TYPE_LSM && + env->prog->expected_attach_type == BPF_LSM_CGROUP) { if (!env->prog->aux->attach_func_proto->type) { /* Make sure programs that attach to void * hooks don't try to modify return value. -- cgit v1.2.3 From a4703e3184320d6e15e2bc81d2ccf1c8c883f9d1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 21 Jul 2022 15:42:35 +0200 Subject: bpf: Switch to new kfunc flags infrastructure Instead of populating multiple sets to indicate some attribute and then researching the same BTF ID in them, prepare a single unified BTF set which indicates whether a kfunc is allowed to be called, and also its attributes if any at the same time. Now, only one call is needed to perform the lookup for both kfunc availability and its attributes. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220721134245.2450-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 106 +++++++++++++++++++++++--------------------------- kernel/bpf/verifier.c | 14 +++---- 2 files changed, 54 insertions(+), 66 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 5869f03bcb6e..4d9c2d88720f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -213,7 +213,7 @@ enum { }; struct btf_kfunc_set_tab { - struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX]; + struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX]; }; struct btf_id_dtor_kfunc_tab { @@ -1616,7 +1616,7 @@ static void btf_free_id(struct btf *btf) static void btf_free_kfunc_set_tab(struct btf *btf) { struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab; - int hook, type; + int hook; if (!tab) return; @@ -1625,10 +1625,8 @@ static void btf_free_kfunc_set_tab(struct btf *btf) */ if (btf_is_module(btf)) goto free_tab; - for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) { - for (type = 0; type < ARRAY_SIZE(tab->sets[0]); type++) - kfree(tab->sets[hook][type]); - } + for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) + kfree(tab->sets[hook]); free_tab: kfree(tab); btf->kfunc_set_tab = NULL; @@ -6172,7 +6170,8 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf, static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, - bool ptr_to_mem_ok) + bool ptr_to_mem_ok, + u32 kfunc_flags) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); struct bpf_verifier_log *log = &env->log; @@ -6210,10 +6209,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (is_kfunc) { /* Only kfunc can be release func */ - rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), - BTF_KFUNC_TYPE_RELEASE, func_id); - kptr_get = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), - BTF_KFUNC_TYPE_KPTR_ACQUIRE, func_id); + rel = kfunc_flags & KF_RELEASE; + kptr_get = kfunc_flags & KF_KPTR_GET; } /* check that BTF function arguments match actual types that the @@ -6442,7 +6439,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. @@ -6455,9 +6452,10 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, - struct bpf_reg_state *regs) + struct bpf_reg_state *regs, + u32 kfunc_flags) { - return btf_check_func_arg_match(env, btf, func_id, regs, true); + return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags); } /* Convert BTF of a function into bpf_reg_state if possible @@ -6854,6 +6852,11 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id) return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } +static void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) +{ + return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); +} + enum { BTF_MODULE_F_LIVE = (1 << 0), }; @@ -7102,16 +7105,16 @@ BTF_TRACING_TYPE_xxx /* Kernel Function (kfunc) BTF ID set registration API */ -static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, - enum btf_kfunc_type type, - struct btf_id_set *add_set, bool vmlinux_set) +static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, + struct btf_id_set8 *add_set) { + bool vmlinux_set = !btf_is_module(btf); struct btf_kfunc_set_tab *tab; - struct btf_id_set *set; + struct btf_id_set8 *set; u32 set_cnt; int ret; - if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) { + if (hook >= BTF_KFUNC_HOOK_MAX) { ret = -EINVAL; goto end; } @@ -7127,7 +7130,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, btf->kfunc_set_tab = tab; } - set = tab->sets[hook][type]; + set = tab->sets[hook]; /* Warn when register_btf_kfunc_id_set is called twice for the same hook * for module sets. */ @@ -7141,7 +7144,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, * pointer and return. */ if (!vmlinux_set) { - tab->sets[hook][type] = add_set; + tab->sets[hook] = add_set; return 0; } @@ -7150,7 +7153,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, * and concatenate all individual sets being registered. While each set * is individually sorted, they may become unsorted when concatenated, * hence re-sorting the final set again is required to make binary - * searching the set using btf_id_set_contains function work. + * searching the set using btf_id_set8_contains function work. */ set_cnt = set ? set->cnt : 0; @@ -7165,8 +7168,8 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, } /* Grow set */ - set = krealloc(tab->sets[hook][type], - offsetof(struct btf_id_set, ids[set_cnt + add_set->cnt]), + set = krealloc(tab->sets[hook], + offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]), GFP_KERNEL | __GFP_NOWARN); if (!set) { ret = -ENOMEM; @@ -7174,15 +7177,15 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, } /* For newly allocated set, initialize set->cnt to 0 */ - if (!tab->sets[hook][type]) + if (!tab->sets[hook]) set->cnt = 0; - tab->sets[hook][type] = set; + tab->sets[hook] = set; /* Concatenate the two sets */ - memcpy(set->ids + set->cnt, add_set->ids, add_set->cnt * sizeof(set->ids[0])); + memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0])); set->cnt += add_set->cnt; - sort(set->ids, set->cnt, sizeof(set->ids[0]), btf_id_cmp_func, NULL); + sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL); return 0; end: @@ -7190,38 +7193,25 @@ end: return ret; } -static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, - const struct btf_kfunc_id_set *kset) -{ - bool vmlinux_set = !btf_is_module(btf); - int type, ret = 0; - - for (type = 0; type < ARRAY_SIZE(kset->sets); type++) { - if (!kset->sets[type]) - continue; - - ret = __btf_populate_kfunc_set(btf, hook, type, kset->sets[type], vmlinux_set); - if (ret) - break; - } - return ret; -} - -static bool __btf_kfunc_id_set_contains(const struct btf *btf, +static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, enum btf_kfunc_hook hook, - enum btf_kfunc_type type, u32 kfunc_btf_id) { - struct btf_id_set *set; + struct btf_id_set8 *set; + u32 *id; - if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) - return false; + if (hook >= BTF_KFUNC_HOOK_MAX) + return NULL; if (!btf->kfunc_set_tab) - return false; - set = btf->kfunc_set_tab->sets[hook][type]; + return NULL; + set = btf->kfunc_set_tab->sets[hook]; if (!set) - return false; - return btf_id_set_contains(set, kfunc_btf_id); + return NULL; + id = btf_id_set8_contains(set, kfunc_btf_id); + if (!id) + return NULL; + /* The flags for BTF ID are located next to it */ + return id + 1; } static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) @@ -7249,14 +7239,14 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) * keeping the reference for the duration of the call provides the necessary * protection for looking up a well-formed btf->kfunc_set_tab. */ -bool btf_kfunc_id_set_contains(const struct btf *btf, +u32 *btf_kfunc_id_set_contains(const struct btf *btf, enum bpf_prog_type prog_type, - enum btf_kfunc_type type, u32 kfunc_btf_id) + u32 kfunc_btf_id) { enum btf_kfunc_hook hook; hook = bpf_prog_type_to_kfunc_hook(prog_type); - return __btf_kfunc_id_set_contains(btf, hook, type, kfunc_btf_id); + return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); } /* This function must be invoked only from initcalls/module init functions */ @@ -7283,7 +7273,7 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, return PTR_ERR(btf); hook = bpf_prog_type_to_kfunc_hook(prog_type); - ret = btf_populate_kfunc_set(btf, hook, kset); + ret = btf_populate_kfunc_set(btf, hook, kset->set); btf_put(btf); return ret; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7c1e056624f9..096fdac70165 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7562,6 +7562,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int err, insn_idx = *insn_idx_p; const struct btf_param *args; struct btf *desc_btf; + u32 *kfunc_flags; bool acq; /* skip for now, but return error when we find this in fixup_kfunc_call */ @@ -7577,18 +7578,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, func_name = btf_name_by_offset(desc_btf, func->name_off); func_proto = btf_type_by_id(desc_btf, func->type); - if (!btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), - BTF_KFUNC_TYPE_CHECK, func_id)) { + kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id); + if (!kfunc_flags) { verbose(env, "calling kernel function %s is not allowed\n", func_name); return -EACCES; } - - acq = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), - BTF_KFUNC_TYPE_ACQUIRE, func_id); + acq = *kfunc_flags & KF_ACQUIRE; /* Check the arguments */ - err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs); + err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, *kfunc_flags); if (err < 0) return err; /* In case of release function, we get register number of refcounted @@ -7632,8 +7631,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].btf = desc_btf; regs[BPF_REG_0].type = PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = ptr_type_id; - if (btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), - BTF_KFUNC_TYPE_RET_NULL, func_id)) { + if (*kfunc_flags & KF_RET_NULL) { regs[BPF_REG_0].type |= PTR_MAYBE_NULL; /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */ regs[BPF_REG_0].id = ++env->id_gen; -- cgit v1.2.3 From 56e948ffc098a780fefb6c1784a3a2c7b81100a1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 21 Jul 2022 15:42:36 +0200 Subject: bpf: Add support for forcing kfunc args to be trusted Teach the verifier to detect a new KF_TRUSTED_ARGS kfunc flag, which means each pointer argument must be trusted, which we define as a pointer that is referenced (has non-zero ref_obj_id) and also needs to have its offset unchanged, similar to how release functions expect their argument. This allows a kfunc to receive pointer arguments unchanged from the result of the acquire kfunc. This is required to ensure that kfunc that operate on some object only work on acquired pointers and not normal PTR_TO_BTF_ID with same type which can be obtained by pointer walking. The restrictions applied to release arguments also apply to trusted arguments. This implies that strict type matching (not deducing type by recursively following members at offset) and OBJ_RELEASE offset checks (ensuring they are zero) are used for trusted pointer arguments. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220721134245.2450-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4d9c2d88720f..7ac971ea98d1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6174,10 +6174,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, u32 kfunc_flags) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + bool rel = false, kptr_get = false, trusted_arg = false; struct bpf_verifier_log *log = &env->log; u32 i, nargs, ref_id, ref_obj_id = 0; bool is_kfunc = btf_is_kernel(btf); - bool rel = false, kptr_get = false; const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; @@ -6211,6 +6211,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, /* Only kfunc can be release func */ rel = kfunc_flags & KF_RELEASE; kptr_get = kfunc_flags & KF_KPTR_GET; + trusted_arg = kfunc_flags & KF_TRUSTED_ARGS; } /* check that BTF function arguments match actual types that the @@ -6235,10 +6236,19 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + /* Check if argument must be a referenced pointer, args + i has + * been verified to be a pointer (after skipping modifiers). + */ + if (is_kfunc && trusted_arg && !reg->ref_obj_id) { + bpf_log(log, "R%d must be referenced\n", regno); + return -EINVAL; + } + ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - if (rel && reg->ref_obj_id) + /* Trusted args have the same offset checks as release arguments */ + if (trusted_arg || (rel && reg->ref_obj_id)) arg_type |= OBJ_RELEASE; ret = check_func_arg_reg_off(env, reg, regno, arg_type); if (ret < 0) @@ -6336,7 +6346,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); if (!btf_struct_ids_match(log, reg_btf, reg_ref_id, - reg->off, btf, ref_id, rel && reg->ref_obj_id)) { + reg->off, btf, ref_id, + trusted_arg || (rel && reg->ref_obj_id))) { bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", func_name, i, btf_type_str(ref_t), ref_tname, -- cgit v1.2.3 From 00963a2e75a872e5fce4d0115ac2786ec86b57a6 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 19 Jul 2022 17:21:26 -0700 Subject: bpf: Support bpf_trampoline on functions with IPMODIFY (e.g. livepatch) When tracing a function with IPMODIFY ftrace_ops (livepatch), the bpf trampoline must follow the instruction pointer saved on stack. This needs extra handling for bpf trampolines with BPF_TRAMP_F_CALL_ORIG flag. Implement bpf_tramp_ftrace_ops_func and use it for the ftrace_ops used by BPF trampoline. This enables tracing functions with livepatch. This also requires moving bpf trampoline to *_ftrace_direct_mult APIs. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/all/20220602193706.2607681-2-song@kernel.org/ Link: https://lore.kernel.org/bpf/20220720002126.803253-5-song@kernel.org --- kernel/bpf/trampoline.c | 158 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 141 insertions(+), 17 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 6691dbf9e467..42e387a12694 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -13,6 +13,7 @@ #include #include #include +#include /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -29,6 +30,81 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); + +static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd) +{ + struct bpf_trampoline *tr = ops->private; + int ret = 0; + + if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) { + /* This is called inside register_ftrace_direct_multi(), so + * tr->mutex is already locked. + */ + lockdep_assert_held_once(&tr->mutex); + + /* Instead of updating the trampoline here, we propagate + * -EAGAIN to register_ftrace_direct_multi(). Then we can + * retry register_ftrace_direct_multi() after updating the + * trampoline. + */ + if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && + !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) { + if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY)) + return -EBUSY; + + tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY; + return -EAGAIN; + } + + return 0; + } + + /* The normal locking order is + * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) + * + * The following two commands are called from + * + * prepare_direct_functions_for_ipmodify + * cleanup_direct_functions_after_ipmodify + * + * In both cases, direct_mutex is already locked. Use + * mutex_trylock(&tr->mutex) to avoid deadlock in race condition + * (something else is making changes to this same trampoline). + */ + if (!mutex_trylock(&tr->mutex)) { + /* sleep 1 ms to make sure whatever holding tr->mutex makes + * some progress. + */ + msleep(1); + return -EAGAIN; + } + + switch (cmd) { + case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER: + tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY; + + if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && + !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); + break; + case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER: + tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY; + + if (tr->flags & BPF_TRAMP_F_ORIG_STACK) + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); + break; + default: + ret = -EINVAL; + break; + }; + + mutex_unlock(&tr->mutex); + return ret; +} +#endif + bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { enum bpf_attach_type eatype = prog->expected_attach_type; @@ -89,6 +165,16 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) goto out; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL); + if (!tr->fops) { + kfree(tr); + tr = NULL; + goto out; + } + tr->fops->private = tr; + tr->fops->ops_func = bpf_tramp_ftrace_ops_func; +#endif tr->key = key; INIT_HLIST_NODE(&tr->hlist); @@ -128,7 +214,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) int ret; if (tr->func.ftrace_managed) - ret = unregister_ftrace_direct((long)ip, (long)old_addr); + ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr); else ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); @@ -137,15 +223,20 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) return ret; } -static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr) +static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr, + bool lock_direct_mutex) { void *ip = tr->func.addr; int ret; - if (tr->func.ftrace_managed) - ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr); - else + if (tr->func.ftrace_managed) { + if (lock_direct_mutex) + ret = modify_ftrace_direct_multi(tr->fops, (long)new_addr); + else + ret = modify_ftrace_direct_multi_nolock(tr->fops, (long)new_addr); + } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr); + } return ret; } @@ -163,10 +254,12 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) if (bpf_trampoline_module_get(tr)) return -ENOENT; - if (tr->func.ftrace_managed) - ret = register_ftrace_direct((long)ip, (long)new_addr); - else + if (tr->func.ftrace_managed) { + ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 0); + ret = register_ftrace_direct_multi(tr->fops, (long)new_addr); + } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); + } if (ret) bpf_trampoline_module_put(tr); @@ -332,11 +425,11 @@ out: return ERR_PTR(err); } -static int bpf_trampoline_update(struct bpf_trampoline *tr) +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex) { struct bpf_tramp_image *im; struct bpf_tramp_links *tlinks; - u32 flags = BPF_TRAMP_F_RESTORE_REGS; + u32 orig_flags = tr->flags; bool ip_arg = false; int err, total; @@ -358,18 +451,31 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) goto out; } + /* clear all bits except SHARE_IPMODIFY */ + tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY; + if (tlinks[BPF_TRAMP_FEXIT].nr_links || - tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) + tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) { /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME * should not be set together. */ - flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; + tr->flags |= BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; + } else { + tr->flags |= BPF_TRAMP_F_RESTORE_REGS; + } if (ip_arg) - flags |= BPF_TRAMP_F_IP_ARG; + tr->flags |= BPF_TRAMP_F_IP_ARG; + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +again: + if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) && + (tr->flags & BPF_TRAMP_F_CALL_ORIG)) + tr->flags |= BPF_TRAMP_F_ORIG_STACK; +#endif err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, - &tr->func.model, flags, tlinks, + &tr->func.model, tr->flags, tlinks, tr->func.addr); if (err < 0) goto out; @@ -378,17 +484,34 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) WARN_ON(!tr->cur_image && tr->selector); if (tr->cur_image) /* progs already running at this address */ - err = modify_fentry(tr, tr->cur_image->image, im->image); + err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex); else /* first time registering */ err = register_fentry(tr, im->image); + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + if (err == -EAGAIN) { + /* -EAGAIN from bpf_tramp_ftrace_ops_func. Now + * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the + * trampoline again, and retry register. + */ + /* reset fops->func and fops->trampoline for re-register */ + tr->fops->func = NULL; + tr->fops->trampoline = 0; + goto again; + } +#endif if (err) goto out; + if (tr->cur_image) bpf_tramp_image_put(tr->cur_image); tr->cur_image = im; tr->selector++; out: + /* If any error happens, restore previous flags */ + if (err) + tr->flags = orig_flags; kfree(tlinks); return err; } @@ -454,7 +577,7 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); tr->progs_cnt[kind]++; - err = bpf_trampoline_update(tr); + err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); if (err) { hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; @@ -487,7 +610,7 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_ } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; - return bpf_trampoline_update(tr); + return bpf_trampoline_update(tr, true /* lock_direct_mutex */); } /* bpf_trampoline_unlink_prog() should never fail. */ @@ -715,6 +838,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) * multiple rcu callbacks. */ hlist_del(&tr->hlist); + kfree(tr->fops); kfree(tr); out: mutex_unlock(&trampoline_mutex); -- cgit v1.2.3 From ea2babac63d40e59926dc5de4550dac94cc3c6d2 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 13 Jul 2022 13:49:50 -0700 Subject: bpf: Simplify bpf_prog_pack_[size|mask] Simplify the logic that selects bpf_prog_pack_size, and always use (PMD_SIZE * num_possible_nodes()). This is a good tradeoff, as most of the performance benefit observed is from less direct map fragmentation [0]. Also, module_alloc(4MB) may not allocate 4MB aligned memory. Therefore, we cannot use (ptr & bpf_prog_pack_mask) to find the correct address of bpf_prog_pack. Fix this by checking the header address falls in the range of pack->ptr and (pack->ptr + bpf_prog_pack_size). [0] https://lore.kernel.org/bpf/20220707223546.4124919-1-song@kernel.org/ Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20220713204950.3015201-1-song@kernel.org --- kernel/bpf/core.c | 71 +++++++++++++------------------------------------------ 1 file changed, 17 insertions(+), 54 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index cfb8a50a9f12..72d0721318e1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -825,15 +825,6 @@ struct bpf_prog_pack { #define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) -static size_t bpf_prog_pack_size = -1; -static size_t bpf_prog_pack_mask = -1; - -static int bpf_prog_chunk_count(void) -{ - WARN_ON_ONCE(bpf_prog_pack_size == -1); - return bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE; -} - static DEFINE_MUTEX(pack_mutex); static LIST_HEAD(pack_list); @@ -841,55 +832,33 @@ static LIST_HEAD(pack_list); * CONFIG_MMU=n. Use PAGE_SIZE in these cases. */ #ifdef PMD_SIZE -#define BPF_HPAGE_SIZE PMD_SIZE -#define BPF_HPAGE_MASK PMD_MASK +#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes()) #else -#define BPF_HPAGE_SIZE PAGE_SIZE -#define BPF_HPAGE_MASK PAGE_MASK +#define BPF_PROG_PACK_SIZE PAGE_SIZE #endif -static size_t select_bpf_prog_pack_size(void) -{ - size_t size; - void *ptr; - - size = BPF_HPAGE_SIZE * num_online_nodes(); - ptr = module_alloc(size); - - /* Test whether we can get huge pages. If not just use PAGE_SIZE - * packs. - */ - if (!ptr || !is_vm_area_hugepages(ptr)) { - size = PAGE_SIZE; - bpf_prog_pack_mask = PAGE_MASK; - } else { - bpf_prog_pack_mask = BPF_HPAGE_MASK; - } - - vfree(ptr); - return size; -} +#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE) static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_prog_pack *pack; - pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(bpf_prog_chunk_count())), + pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)), GFP_KERNEL); if (!pack) return NULL; - pack->ptr = module_alloc(bpf_prog_pack_size); + pack->ptr = module_alloc(BPF_PROG_PACK_SIZE); if (!pack->ptr) { kfree(pack); return NULL; } - bpf_fill_ill_insns(pack->ptr, bpf_prog_pack_size); - bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE); + bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE); + bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE); list_add_tail(&pack->list, &pack_list); set_vm_flush_reset_perms(pack->ptr); - set_memory_ro((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE); - set_memory_x((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE); + set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); return pack; } @@ -901,10 +870,7 @@ static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insn void *ptr = NULL; mutex_lock(&pack_mutex); - if (bpf_prog_pack_size == -1) - bpf_prog_pack_size = select_bpf_prog_pack_size(); - - if (size > bpf_prog_pack_size) { + if (size > BPF_PROG_PACK_SIZE) { size = round_up(size, PAGE_SIZE); ptr = module_alloc(size); if (ptr) { @@ -916,9 +882,9 @@ static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insn goto out; } list_for_each_entry(pack, &pack_list, list) { - pos = bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0, + pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, nbits, 0); - if (pos < bpf_prog_chunk_count()) + if (pos < BPF_PROG_CHUNK_COUNT) goto found_free_area; } @@ -942,18 +908,15 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr) struct bpf_prog_pack *pack = NULL, *tmp; unsigned int nbits; unsigned long pos; - void *pack_ptr; mutex_lock(&pack_mutex); - if (hdr->size > bpf_prog_pack_size) { + if (hdr->size > BPF_PROG_PACK_SIZE) { module_memfree(hdr); goto out; } - pack_ptr = (void *)((unsigned long)hdr & bpf_prog_pack_mask); - list_for_each_entry(tmp, &pack_list, list) { - if (tmp->ptr == pack_ptr) { + if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) { pack = tmp; break; } @@ -963,14 +926,14 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr) goto out; nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size); - pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT; + pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT; WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size), "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n"); bitmap_clear(pack->bitmap, pos, nbits); - if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0, - bpf_prog_chunk_count(), 0) == 0) { + if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, + BPF_PROG_CHUNK_COUNT, 0) == 0) { list_del(&pack->list); module_memfree(pack->ptr); kfree(pack); -- cgit v1.2.3 From bd82ea52f0ee2890b698155a6d7bf9ea5bd8930d Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 23 Jul 2022 19:17:10 +0200 Subject: bpf, devmap: Compute proper xdp_frame len redirecting frames Even if it is currently forbidden to XDP_REDIRECT a multi-frag xdp_frame into a devmap, compute proper xdp_frame length in __xdp_enqueue and is_valid_dst routines running xdp_get_frame_len(). Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/894d99c01139e921bdb6868158ff8e67f661c072.1658596075.git.lorenzo@kernel.org --- kernel/bpf/devmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1400561efb15..a0e02b009487 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -477,7 +477,7 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; - err = xdp_ok_fwd_dev(dev, xdpf->len); + err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf)); if (unlikely(err)) return err; @@ -536,7 +536,7 @@ static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) !obj->dev->netdev_ops->ndo_xdp_xmit) return false; - if (xdp_ok_fwd_dev(obj->dev, xdpf->len)) + if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf))) return false; return true; -- cgit v1.2.3 From 58250ae350de8d28ce91ade4605d32c9e7f062a8 Mon Sep 17 00:00:00 2001 From: Fedor Tokarev Date: Mon, 11 Jul 2022 23:13:17 +0200 Subject: bpf: btf: Fix vsnprintf return value check vsnprintf returns the number of characters which would have been written if enough space had been available, excluding the terminating null byte. Thus, the return value of 'len_left' means that the last character has been dropped. Signed-off-by: Fedor Tokarev Signed-off-by: Andrii Nakryiko Acked-by: Alan Maguire Link: https://lore.kernel.org/bpf/20220711211317.GA1143610@laptop --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7ac971ea98d1..7e64447659f3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6643,7 +6643,7 @@ static void btf_snprintf_show(struct btf_show *show, const char *fmt, if (len < 0) { ssnprintf->len_left = 0; ssnprintf->len = len; - } else if (len > ssnprintf->len_left) { + } else if (len >= ssnprintf->len_left) { /* no space, drive on to get length we would have written */ ssnprintf->len_left = 0; ssnprintf->len += len; -- cgit v1.2.3 From dc81f8d1e8ea3f5dfa88919cb834a135a6a536b8 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 29 Jul 2022 12:41:06 -0700 Subject: bpf: Fix test_progs -j error with fentry/fexit tests When multiple threads are attaching/detaching fentry/fexit programs to the same trampoline, we may call register_fentry on the same trampoline twice: register_fentry(), unregister_fentry(), then register_fentry again. This causes ftrace_set_filter_ip() for the same ip on tr->fops twice, which leaves duplicated ip in tr->fops. The extra ip is not cleaned up properly on unregister and thus causes failures with further register in register_ftrace_direct_multi(): register_ftrace_direct_multi() { ... for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { if (ftrace_find_rec_direct(entry->ip)) goto out_unlock; } } ... } This can be triggered with parallel fentry/fexit tests with test_progs: ./test_progs -t fentry,fexit -j Fix this by resetting tr->fops in ftrace_set_filter_ip(), so that there will never be duplicated entries in tr->fops. Fixes: 00963a2e75a8 ("bpf: Support bpf_trampoline on functions with IPMODIFY (e.g. livepatch)") Reported-by: Andrii Nakryiko Signed-off-by: Song Liu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220729194106.1207472-1-song@kernel.org --- kernel/bpf/trampoline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 42e387a12694..7ec7e23559ad 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -255,7 +255,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) return -ENOENT; if (tr->func.ftrace_managed) { - ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 0); + ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); ret = register_ftrace_direct_multi(tr->fops, (long)new_addr); } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); -- cgit v1.2.3 From 3b317abc71598bda8ff9a9c483ad8ae167b18382 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Thu, 28 Jul 2022 07:40:48 -0400 Subject: bpf: Fix NULL pointer dereference when registering bpf trampoline A panic was reported on arm64: [ 44.517109] audit: type=1334 audit(1658859870.268:59): prog-id=19 op=LOAD [ 44.622031] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010 [ 44.624321] Mem abort info: [ 44.625049] ESR = 0x0000000096000004 [ 44.625935] EC = 0x25: DABT (current EL), IL = 32 bits [ 44.627182] SET = 0, FnV = 0 [ 44.627930] EA = 0, S1PTW = 0 [ 44.628684] FSC = 0x04: level 0 translation fault [ 44.629788] Data abort info: [ 44.630474] ISV = 0, ISS = 0x00000004 [ 44.631362] CM = 0, WnR = 0 [ 44.632041] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000100ab5000 [ 44.633494] [0000000000000010] pgd=0000000000000000, p4d=0000000000000000 [ 44.635202] Internal error: Oops: 96000004 [#1] SMP [ 44.636452] Modules linked in: xfs crct10dif_ce ghash_ce virtio_blk virtio_console virtio_mmio qemu_fw_cfg [ 44.638713] CPU: 2 PID: 1 Comm: systemd Not tainted 5.19.0-rc7 #1 [ 44.640164] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 [ 44.641799] pstate: 00400005 (nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 44.643404] pc : ftrace_set_filter_ip+0x24/0xa0 [ 44.644659] lr : bpf_trampoline_update.constprop.0+0x428/0x4a0 [ 44.646118] sp : ffff80000803b9f0 [ 44.646950] x29: ffff80000803b9f0 x28: ffff0b5d80364400 x27: ffff80000803bb48 [ 44.648721] x26: ffff8000085ad000 x25: ffff0b5d809d2400 x24: 0000000000000000 [ 44.650493] x23: 00000000ffffffed x22: ffff0b5dd7ea0900 x21: 0000000000000000 [ 44.652279] x20: 0000000000000000 x19: 0000000000000000 x18: ffffffffffffffff [ 44.654067] x17: 0000000000000000 x16: 0000000000000000 x15: ffffffffffffffff [ 44.655787] x14: ffff0b5d809d2498 x13: ffff0b5d809d2432 x12: 0000000005f5e100 [ 44.657535] x11: abcc77118461cefd x10: 000000000000005f x9 : ffffa7219cb5b190 [ 44.659254] x8 : ffffa7219c8e0000 x7 : 0000000000000000 x6 : ffffa7219db075e0 [ 44.661066] x5 : ffffa7219d3130e0 x4 : ffffa7219cab9da0 x3 : 0000000000000000 [ 44.662837] x2 : 0000000000000000 x1 : ffffa7219cb7a5c0 x0 : 0000000000000000 [ 44.664675] Call trace: [ 44.665274] ftrace_set_filter_ip+0x24/0xa0 [ 44.666327] bpf_trampoline_update.constprop.0+0x428/0x4a0 [ 44.667696] __bpf_trampoline_link_prog+0xcc/0x1c0 [ 44.668834] bpf_trampoline_link_prog+0x40/0x64 [ 44.669919] bpf_tracing_prog_attach+0x120/0x490 [ 44.671011] link_create+0xe0/0x2b0 [ 44.671869] __sys_bpf+0x484/0xd30 [ 44.672706] __arm64_sys_bpf+0x30/0x40 [ 44.673678] invoke_syscall+0x78/0x100 [ 44.674623] el0_svc_common.constprop.0+0x4c/0xf4 [ 44.675783] do_el0_svc+0x38/0x4c [ 44.676624] el0_svc+0x34/0x100 [ 44.677429] el0t_64_sync_handler+0x11c/0x150 [ 44.678532] el0t_64_sync+0x190/0x194 [ 44.679439] Code: 2a0203f4 f90013f5 2a0303f5 f9001fe1 (f9400800) [ 44.680959] ---[ end trace 0000000000000000 ]--- [ 44.682111] Kernel panic - not syncing: Oops: Fatal exception [ 44.683488] SMP: stopping secondary CPUs [ 44.684551] Kernel Offset: 0x2721948e0000 from 0xffff800008000000 [ 44.686095] PHYS_OFFSET: 0xfffff4a380000000 [ 44.687144] CPU features: 0x010,00022811,19001080 [ 44.688308] Memory Limit: none [ 44.689082] ---[ end Kernel panic - not syncing: Oops: Fatal exception ]--- It's caused by a NULL tr->fops passed to ftrace_set_filter_ip(). tr->fops is initialized to NULL and is assigned to an allocated memory address if CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS is enabled. Since there is no direct call on arm64 yet, the config can't be enabled. To fix it, call ftrace_set_filter_ip() only if tr->fops is not NULL. Fixes: 00963a2e75a8 ("bpf: Support bpf_trampoline on functions with IPMODIFY (e.g. livepatch)") Reported-by: Bruno Goncalves Signed-off-by: Xu Kuohai Signed-off-by: Andrii Nakryiko Tested-by: Bruno Goncalves Acked-by: Song Liu Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20220728114048.3540461-1-xukuohai@huaweicloud.com --- kernel/bpf/trampoline.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 7ec7e23559ad..c122d8b3ddc9 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -248,8 +248,11 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) int ret; faddr = ftrace_location((unsigned long)ip); - if (faddr) + if (faddr) { + if (!tr->fops) + return -ENOTSUPP; tr->func.ftrace_managed = true; + } if (bpf_trampoline_module_get(tr)) return -ENOENT; -- cgit v1.2.3 From 14250fa4839b3a48c979e7faaf4cbcce619d02bd Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 26 Jul 2022 06:27:33 +0800 Subject: bpf: Remove unneeded semicolon Eliminate the following coccicheck warning: /kernel/bpf/trampoline.c:101:2-3: Unneeded semicolon Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220725222733.55613-1-yang.lee@linux.alibaba.com --- kernel/bpf/trampoline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index c122d8b3ddc9..0f532e6a717f 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -98,7 +98,7 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd default: ret = -EINVAL; break; - }; + } mutex_unlock(&tr->mutex); return ret; -- cgit v1.2.3