diff options
Diffstat (limited to 'net')
114 files changed, 3967 insertions, 1179 deletions
diff --git a/net/Kconfig b/net/Kconfig index 3101bfcbdd7a..bd191f978a23 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -258,7 +258,7 @@ config XPS default y config HWBM - bool + bool config CGROUP_NET_PRIO bool "Network priority cgroup" @@ -309,12 +309,12 @@ config BPF_STREAM_PARSER select STREAM_PARSER select NET_SOCK_MSG ---help--- - Enabling this allows a stream parser to be used with - BPF_MAP_TYPE_SOCKMAP. + Enabling this allows a stream parser to be used with + BPF_MAP_TYPE_SOCKMAP. - BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets. - It can be used to enforce socket policy, implement socket redirects, - etc. + BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets. + It can be used to enforce socket policy, implement socket redirects, + etc. config NET_FLOW_LIMIT bool @@ -349,12 +349,12 @@ config NET_DROP_MONITOR tristate "Network packet drop alerting service" depends on INET && TRACEPOINTS ---help--- - This feature provides an alerting service to userspace in the - event that packets are discarded in the network stack. Alerts - are broadcast via netlink socket to any listening user space - process. If you don't need network drop alerts, or if you are ok - just checking the various proc files and other utilities for - drop statistics, say N here. + This feature provides an alerting service to userspace in the + event that packets are discarded in the network stack. Alerts + are broadcast via netlink socket to any listening user space + process. If you don't need network drop alerts, or if you are ok + just checking the various proc files and other utilities for + drop statistics, say N here. endmenu @@ -433,7 +433,7 @@ config NET_DEVLINK imply NET_DROP_MONITOR config PAGE_POOL - bool + bool config FAILOVER tristate "Generic failover module" diff --git a/net/atm/clip.c b/net/atm/clip.c index a7972da7235d..294cb9efe3d3 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -89,7 +89,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc) struct clip_vcc **walk; if (!entry) { - pr_crit("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc); + pr_err("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc); return; } netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */ @@ -109,10 +109,10 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc) error = neigh_update(entry->neigh, NULL, NUD_NONE, NEIGH_UPDATE_F_ADMIN, 0); if (error) - pr_crit("neigh_update failed with %d\n", error); + pr_err("neigh_update failed with %d\n", error); goto out; } - pr_crit("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc); + pr_err("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc); out: netif_tx_unlock_bh(entry->neigh->dev); } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 0be4497cb832..915c2d6f7fb9 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -105,6 +105,40 @@ out: return err; } +/* Integer types of various sizes and pointer combinations cover variety of + * architecture dependent calling conventions. 7+ can be supported in the + * future. + */ +int noinline bpf_fentry_test1(int a) +{ + return a + 1; +} + +int noinline bpf_fentry_test2(int a, u64 b) +{ + return a + b; +} + +int noinline bpf_fentry_test3(char a, int b, u64 c) +{ + return a + b + c; +} + +int noinline bpf_fentry_test4(void *a, char b, int c, u64 d) +{ + return (long)a + b + c + d; +} + +int noinline bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e) +{ + return a + (long)b + c + d + e; +} + +int noinline bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) +{ + return a + (long)b + c + d + (long)e + f; +} + static void *bpf_test_init(const union bpf_attr *kattr, u32 size, u32 headroom, u32 tailroom) { @@ -122,6 +156,15 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, kfree(data); return ERR_PTR(-EFAULT); } + if (bpf_fentry_test1(1) != 2 || + bpf_fentry_test2(2, 3) != 5 || + bpf_fentry_test3(4, 5, 6) != 15 || + bpf_fentry_test4((void *)7, 8, 9, 10) != 34 || + bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 || + bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111) { + kfree(data); + return ERR_PTR(-EFAULT); + } return data; } diff --git a/net/can/af_can.c b/net/can/af_can.c index 5518a7d9eed9..128d37a4c2e0 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -86,11 +86,12 @@ static atomic_t skbcounter = ATOMIC_INIT(0); /* af_can socket functions */ -static void can_sock_destruct(struct sock *sk) +void can_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_error_queue); } +EXPORT_SYMBOL(can_sock_destruct); static const struct can_proto *can_get_proto(int protocol) { diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index def2f813ffce..137054bff9ec 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -51,6 +51,7 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data) if (!skb) return; + j1939_priv_get(priv); can_skb_set_owner(skb, iskb->sk); /* get a pointer to the header of the skb @@ -104,6 +105,7 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data) j1939_simple_recv(priv, skb); j1939_sk_recv(priv, skb); done: + j1939_priv_put(priv); kfree_skb(skb); } @@ -150,6 +152,10 @@ static void __j1939_priv_release(struct kref *kref) netdev_dbg(priv->ndev, "%s: 0x%p\n", __func__, priv); + WARN_ON_ONCE(!list_empty(&priv->active_session_list)); + WARN_ON_ONCE(!list_empty(&priv->ecus)); + WARN_ON_ONCE(!list_empty(&priv->j1939_socks)); + dev_put(ndev); kfree(priv); } @@ -207,6 +213,9 @@ static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev) { struct can_ml_priv *can_ml_priv = ndev->ml_priv; + if (!can_ml_priv) + return NULL; + return can_ml_priv->j1939_priv; } diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 4d8ba701e15d..de09b0a65791 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -78,7 +78,6 @@ static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk) { jsk->state |= J1939_SOCK_BOUND; j1939_priv_get(priv); - jsk->priv = priv; spin_lock_bh(&priv->j1939_socks_lock); list_add_tail(&jsk->list, &priv->j1939_socks); @@ -91,7 +90,6 @@ static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk) list_del_init(&jsk->list); spin_unlock_bh(&priv->j1939_socks_lock); - jsk->priv = NULL; j1939_priv_put(priv); jsk->state &= ~J1939_SOCK_BOUND; } @@ -349,6 +347,34 @@ void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb) spin_unlock_bh(&priv->j1939_socks_lock); } +static void j1939_sk_sock_destruct(struct sock *sk) +{ + struct j1939_sock *jsk = j1939_sk(sk); + + /* This function will be call by the generic networking code, when then + * the socket is ultimately closed (sk->sk_destruct). + * + * The race between + * - processing a received CAN frame + * (can_receive -> j1939_can_recv) + * and accessing j1939_priv + * ... and ... + * - closing a socket + * (j1939_can_rx_unregister -> can_rx_unregister) + * and calling the final j1939_priv_put() + * + * is avoided by calling the final j1939_priv_put() from this + * RCU deferred cleanup call. + */ + if (jsk->priv) { + j1939_priv_put(jsk->priv); + jsk->priv = NULL; + } + + /* call generic CAN sock destruct */ + can_sock_destruct(sk); +} + static int j1939_sk_init(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); @@ -371,6 +397,7 @@ static int j1939_sk_init(struct sock *sk) atomic_set(&jsk->skb_pending, 0); spin_lock_init(&jsk->sk_session_queue_lock); INIT_LIST_HEAD(&jsk->sk_session_queue); + sk->sk_destruct = j1939_sk_sock_destruct; return 0; } @@ -443,6 +470,12 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) } jsk->ifindex = addr->can_ifindex; + + /* the corresponding j1939_priv_put() is called via + * sk->sk_destruct, which points to j1939_sk_sock_destruct() + */ + j1939_priv_get(priv); + jsk->priv = priv; } /* set default transmit pgn */ @@ -560,8 +593,8 @@ static int j1939_sk_release(struct socket *sock) if (!sk) return 0; - jsk = j1939_sk(sk); lock_sock(sk); + jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_BOUND) { struct j1939_priv *priv = jsk->priv; @@ -1059,51 +1092,72 @@ static int j1939_sk_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); - struct j1939_priv *priv = jsk->priv; + struct j1939_priv *priv; int ifindex; int ret; + lock_sock(sock->sk); /* various socket state tests */ - if (!(jsk->state & J1939_SOCK_BOUND)) - return -EBADFD; + if (!(jsk->state & J1939_SOCK_BOUND)) { + ret = -EBADFD; + goto sendmsg_done; + } + priv = jsk->priv; ifindex = jsk->ifindex; - if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR) + if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR) { /* no source address assigned yet */ - return -EBADFD; + ret = -EBADFD; + goto sendmsg_done; + } /* deal with provided destination address info */ if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; - if (msg->msg_namelen < J1939_MIN_NAMELEN) - return -EINVAL; + if (msg->msg_namelen < J1939_MIN_NAMELEN) { + ret = -EINVAL; + goto sendmsg_done; + } - if (addr->can_family != AF_CAN) - return -EINVAL; + if (addr->can_family != AF_CAN) { + ret = -EINVAL; + goto sendmsg_done; + } - if (addr->can_ifindex && addr->can_ifindex != ifindex) - return -EBADFD; + if (addr->can_ifindex && addr->can_ifindex != ifindex) { + ret = -EBADFD; + goto sendmsg_done; + } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && - !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) - return -EINVAL; + !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) { + ret = -EINVAL; + goto sendmsg_done; + } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && - !sock_flag(sk, SOCK_BROADCAST)) + !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ - return -EACCES; + ret = -EACCES; + goto sendmsg_done; + } } else { if (!jsk->addr.dst_name && jsk->addr.da == J1939_NO_ADDR && - !sock_flag(sk, SOCK_BROADCAST)) + !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ - return -EACCES; + ret = -EACCES; + goto sendmsg_done; + } } ret = j1939_sk_send_loop(priv, sk, msg, size); +sendmsg_done: + release_sock(sock->sk); + return ret; } diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index e5f1a56994c6..9f99af5b0b11 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -255,6 +255,7 @@ static void __j1939_session_drop(struct j1939_session *session) return; j1939_sock_pending_del(session->sk); + sock_put(session->sk); } static void j1939_session_destroy(struct j1939_session *session) @@ -266,6 +267,9 @@ static void j1939_session_destroy(struct j1939_session *session) netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + WARN_ON_ONCE(!list_empty(&session->sk_session_queue_entry)); + WARN_ON_ONCE(!list_empty(&session->active_session_list_entry)); + skb_queue_purge(&session->skb_queue); __j1939_session_drop(session); j1939_priv_put(session->priv); @@ -1042,12 +1046,13 @@ j1939_session_deactivate_activate_next(struct j1939_session *session) j1939_sk_queue_activate_next(session); } -static void j1939_session_cancel(struct j1939_session *session, +static void __j1939_session_cancel(struct j1939_session *session, enum j1939_xtp_abort err) { struct j1939_priv *priv = session->priv; WARN_ON_ONCE(!err); + lockdep_assert_held(&session->priv->active_session_list_lock); session->err = j1939_xtp_abort_to_errno(priv, err); /* do not send aborts on incoming broadcasts */ @@ -1062,6 +1067,20 @@ static void j1939_session_cancel(struct j1939_session *session, j1939_sk_send_loop_abort(session->sk, session->err); } +static void j1939_session_cancel(struct j1939_session *session, + enum j1939_xtp_abort err) +{ + j1939_session_list_lock(session->priv); + + if (session->state >= J1939_SESSION_ACTIVE && + session->state < J1939_SESSION_WAITING_ABORT) { + j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS); + __j1939_session_cancel(session, err); + } + + j1939_session_list_unlock(session->priv); +} + static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer) { struct j1939_session *session = @@ -1108,8 +1127,6 @@ static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer) netdev_alert(priv->ndev, "%s: 0x%p: tx aborted with unknown reason: %i\n", __func__, session, ret); if (session->skcb.addr.type != J1939_SIMPLE) { - j1939_tp_set_rxtimeout(session, - J1939_XTP_ABORT_TIMEOUT_MS); j1939_session_cancel(session, J1939_XTP_ABORT_OTHER); } else { session->err = ret; @@ -1169,7 +1186,7 @@ static enum hrtimer_restart j1939_tp_rxtimer(struct hrtimer *hrtimer) hrtimer_start(&session->rxtimer, ms_to_ktime(J1939_XTP_ABORT_TIMEOUT_MS), HRTIMER_MODE_REL_SOFT); - j1939_session_cancel(session, J1939_XTP_ABORT_TIMEOUT); + __j1939_session_cancel(session, J1939_XTP_ABORT_TIMEOUT); } j1939_session_list_unlock(session->priv); } @@ -1375,7 +1392,6 @@ j1939_xtp_rx_cts_one(struct j1939_session *session, struct sk_buff *skb) out_session_cancel: j1939_session_timers_cancel(session); - j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS); j1939_session_cancel(session, err); } @@ -1572,7 +1588,6 @@ static int j1939_xtp_rx_rts_session_active(struct j1939_session *session, /* RTS on active session */ j1939_session_timers_cancel(session); - j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS); j1939_session_cancel(session, J1939_XTP_ABORT_BUSY); } @@ -1583,7 +1598,6 @@ static int j1939_xtp_rx_rts_session_active(struct j1939_session *session, session->last_cmd); j1939_session_timers_cancel(session); - j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS); j1939_session_cancel(session, J1939_XTP_ABORT_BUSY); return -EBUSY; @@ -1785,7 +1799,6 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, out_session_cancel: j1939_session_timers_cancel(session); - j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS); j1939_session_cancel(session, J1939_XTP_ABORT_FAULT); j1939_session_put(session); } @@ -1866,6 +1879,7 @@ struct j1939_session *j1939_tp_send(struct j1939_priv *priv, return ERR_PTR(-ENOMEM); /* skb is recounted in j1939_session_new() */ + sock_hold(skb->sk); session->sk = skb->sk; session->transmission = true; session->pkt.total = (size + 6) / 7; @@ -2028,7 +2042,11 @@ int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk) &priv->active_session_list, active_session_list_entry) { if (!sk || sk == session->sk) { - j1939_session_timers_cancel(session); + if (hrtimer_try_to_cancel(&session->txtimer) == 1) + j1939_session_put(session); + if (hrtimer_try_to_cancel(&session->rxtimer) == 1) + j1939_session_put(session); + session->err = ESHUTDOWN; j1939_session_deactivate_locked(session); } diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index da5639a5bd3b..458be6b3eda9 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -798,7 +798,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) * Try to grab map refcnt to make sure that it's still * alive and prevent concurrent removal. */ - map = bpf_map_inc_not_zero(&smap->map, false); + map = bpf_map_inc_not_zero(&smap->map); if (IS_ERR(map)) continue; diff --git a/net/core/dev.c b/net/core/dev.c index 1c799d486623..c7fc902ccbdc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5586,7 +5586,7 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); if (skb_vlan_tag_present(p)) - diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) @@ -5611,8 +5611,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->frag0 = NULL; NAPI_GRO_CB(skb)->frag0_len = 0; - if (skb_mac_header(skb) == skb_tail_pointer(skb) && - pinfo->nr_frags && + if (!skb_headlen(skb) && pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, diff --git a/net/core/devlink.c b/net/core/devlink.c index 9bad78388a07..4c63c9a4c09e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2812,7 +2812,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) struct net *dest_net = NULL; int err; - if (!devlink_reload_supported(devlink)) + if (!devlink_reload_supported(devlink) || !devlink->reload_enabled) return -EOPNOTSUPP; err = devlink_resources_validate(devlink, NULL, info); @@ -3006,6 +3006,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME, .type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, + .name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) @@ -4742,6 +4747,7 @@ struct devlink_health_reporter { bool auto_recover; u8 health_state; u64 dump_ts; + u64 dump_real_ts; u64 error_count; u64 recovery_count; u64 last_recovery_ts; @@ -4918,6 +4924,7 @@ static int devlink_health_do_dump(struct devlink_health_reporter *reporter, goto dump_err; reporter->dump_ts = jiffies; + reporter->dump_real_ts = ktime_get_real_ns(); return 0; @@ -5067,6 +5074,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, jiffies_to_msecs(reporter->dump_ts), DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; + if (reporter->dump_fmsg && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, + reporter->dump_real_ts, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; nla_nest_end(msg, reporter_attr); genlmsg_end(msg, hdr); diff --git a/net/core/filter.c b/net/core/filter.c index fc303abec8fa..49ded4a7588a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3816,7 +3816,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -static u32 bpf_skb_output_btf_ids[5]; +static int bpf_skb_output_btf_ids[5]; const struct bpf_func_proto bpf_skb_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, @@ -8684,16 +8684,6 @@ out: } #ifdef CONFIG_INET -struct sk_reuseport_kern { - struct sk_buff *skb; - struct sock *sk; - struct sock *selected_sk; - void *data_end; - u32 hash; - u32 reuseport_id; - bool bind_inany; -}; - static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock_reuseport *reuse, struct sock *sk, struct sk_buff *skb, diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 865ba6ca16eb..ae3bcb1540ec 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -923,21 +923,23 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, "rx-%u", index); if (error) - return error; + goto err; dev_hold(queue->dev); if (dev->sysfs_rx_queue_group) { error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); - if (error) { - kobject_put(kobj); - return error; - } + if (error) + goto err; } kobject_uevent(kobj, KOBJ_ADD); return error; + +err: + kobject_put(kobj); + return error; } #endif /* CONFIG_SYSFS */ @@ -1461,21 +1463,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, "tx-%u", index); if (error) - return error; + goto err; dev_hold(queue->dev); #ifdef CONFIG_BQL error = sysfs_create_group(kobj, &dql_group); - if (error) { - kobject_put(kobj); - return error; - } + if (error) + goto err; #endif kobject_uevent(kobj, KOBJ_ADD); - return 0; + +err: + kobject_put(kobj); + return error; } #endif /* CONFIG_SYSFS */ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 5bc65587f1c4..a6aefe989043 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -18,6 +18,9 @@ #include <trace/events/page_pool.h> +#define DEFER_TIME (msecs_to_jiffies(1000)) +#define DEFER_WARN_INTERVAL (60 * HZ) + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params) { @@ -44,6 +47,21 @@ static int page_pool_init(struct page_pool *pool, (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { + /* In order to request DMA-sync-for-device the page + * needs to be mapped + */ + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + return -EINVAL; + + if (!pool->p.max_len) + return -EINVAL; + + /* pool->p.offset has to be set according to the address + * offset used by the DMA engine to start copying rx data + */ + } + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM; @@ -112,6 +130,16 @@ static struct page *__page_pool_get_cached(struct page_pool *pool) return page; } +static void page_pool_dma_sync_for_device(struct page_pool *pool, + struct page *page, + unsigned int dma_sync_size) +{ + dma_sync_size = min(dma_sync_size, pool->p.max_len); + dma_sync_single_range_for_device(pool->p.dev, page->dma_addr, + pool->p.offset, dma_sync_size, + pool->p.dma_dir); +} + /* slow path */ noinline static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, @@ -156,6 +184,9 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, } page->dma_addr = dma; + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, pool->p.max_len); + skip_dma_map: /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -193,22 +224,14 @@ static s32 page_pool_inflight(struct page_pool *pool) { u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); - s32 distance; + s32 inflight; - distance = _distance(hold_cnt, release_cnt); + inflight = _distance(hold_cnt, release_cnt); - trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt); - return distance; -} - -static bool __page_pool_safe_to_destroy(struct page_pool *pool) -{ - s32 inflight = page_pool_inflight(pool); - - /* The distance should not be able to become negative */ + trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); - return (inflight == 0); + return inflight; } /* Cleanup page_pool state from page */ @@ -216,6 +239,7 @@ static void __page_pool_clean_page(struct page_pool *pool, struct page *page) { dma_addr_t dma; + int count; if (!(pool->p.flags & PP_FLAG_DMA_MAP)) goto skip_dma_unmap; @@ -227,9 +251,11 @@ static void __page_pool_clean_page(struct page_pool *pool, DMA_ATTR_SKIP_CPU_SYNC); page->dma_addr = 0; skip_dma_unmap: - atomic_inc(&pool->pages_state_release_cnt); - trace_page_pool_state_release(pool, page, - atomic_read(&pool->pages_state_release_cnt)); + /* This may be the last page returned, releasing the pool, so + * it is not safe to reference pool afterwards. + */ + count = atomic_inc_return(&pool->pages_state_release_cnt); + trace_page_pool_state_release(pool, page, count); } /* unmap the page and clean our state */ @@ -283,8 +309,19 @@ static bool __page_pool_recycle_direct(struct page *page, return true; } -void __page_pool_put_page(struct page_pool *pool, - struct page *page, bool allow_direct) +/* page is NOT reusable when: + * 1) allocated when system is under some pressure. (page_is_pfmemalloc) + * 2) belongs to a different NUMA node than pool->p.nid. + * + * To update pool->p.nid users must call page_pool_update_nid. + */ +static bool pool_page_reusable(struct page_pool *pool, struct page *page) +{ + return !page_is_pfmemalloc(page) && page_to_nid(page) == pool->p.nid; +} + +void __page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the @@ -292,9 +329,14 @@ void __page_pool_put_page(struct page_pool *pool, * * refcnt == 1 means page_pool owns page, and can recycle it. */ - if (likely(page_ref_count(page) == 1)) { + if (likely(page_ref_count(page) == 1 && + pool_page_reusable(pool, page))) { /* Read barrier done in page_ref_count / READ_ONCE */ + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, + dma_sync_size); + if (allow_direct && in_serving_softirq()) if (__page_pool_recycle_direct(page, pool)) return; @@ -338,31 +380,10 @@ static void __page_pool_empty_ring(struct page_pool *pool) } } -static void __warn_in_flight(struct page_pool *pool) +static void page_pool_free(struct page_pool *pool) { - u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); - u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); - s32 distance; - - distance = _distance(hold_cnt, release_cnt); - - /* Drivers should fix this, but only problematic when DMA is used */ - WARN(1, "Still in-flight pages:%d hold:%u released:%u", - distance, hold_cnt, release_cnt); -} - -void __page_pool_free(struct page_pool *pool) -{ - /* Only last user actually free/release resources */ - if (!page_pool_put(pool)) - return; - - WARN(pool->alloc.count, "API usage violation"); - WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty"); - - /* Can happen due to forced shutdown */ - if (!__page_pool_safe_to_destroy(pool)) - __warn_in_flight(pool); + if (pool->disconnect) + pool->disconnect(pool); ptr_ring_cleanup(&pool->ring, NULL); @@ -371,15 +392,14 @@ void __page_pool_free(struct page_pool *pool) kfree(pool); } -EXPORT_SYMBOL(__page_pool_free); -/* Request to shutdown: release pages cached by page_pool, and check - * for in-flight pages - */ -bool __page_pool_request_shutdown(struct page_pool *pool) +static void page_pool_empty_alloc_cache_once(struct page_pool *pool) { struct page *page; + if (pool->destroy_cnt) + return; + /* Empty alloc cache, assume caller made sure this is * no-longer in use, and page_pool_alloc_pages() cannot be * call concurrently. @@ -388,12 +408,83 @@ bool __page_pool_request_shutdown(struct page_pool *pool) page = pool->alloc.cache[--pool->alloc.count]; __page_pool_return_page(pool, page); } +} + +static void page_pool_scrub(struct page_pool *pool) +{ + page_pool_empty_alloc_cache_once(pool); + pool->destroy_cnt++; /* No more consumers should exist, but producers could still * be in-flight. */ __page_pool_empty_ring(pool); +} + +static int page_pool_release(struct page_pool *pool) +{ + int inflight; + + page_pool_scrub(pool); + inflight = page_pool_inflight(pool); + if (!inflight) + page_pool_free(pool); + + return inflight; +} + +static void page_pool_release_retry(struct work_struct *wq) +{ + struct delayed_work *dwq = to_delayed_work(wq); + struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); + int inflight; + + inflight = page_pool_release(pool); + if (!inflight) + return; + + /* Periodic warning */ + if (time_after_eq(jiffies, pool->defer_warn)) { + int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; + + pr_warn("%s() stalled pool shutdown %d inflight %d sec\n", + __func__, inflight, sec); + pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; + } + + /* Still not ready to be disconnected, retry later */ + schedule_delayed_work(&pool->release_dw, DEFER_TIME); +} - return __page_pool_safe_to_destroy(pool); +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)) +{ + refcount_inc(&pool->user_cnt); + pool->disconnect = disconnect; +} + +void page_pool_destroy(struct page_pool *pool) +{ + if (!pool) + return; + + if (!page_pool_put(pool)) + return; + + if (!page_pool_release(pool)) + return; + + pool->defer_start = jiffies; + pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; + + INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry); + schedule_delayed_work(&pool->release_dw, DEFER_TIME); +} +EXPORT_SYMBOL(page_pool_destroy); + +/* Caller must provide appropriate safe context, e.g. NAPI. */ +void page_pool_update_nid(struct page_pool *pool, int new_nid) +{ + trace_page_pool_update_nid(pool, new_nid); + pool->p.nid = new_nid; } -EXPORT_SYMBOL(__page_pool_request_shutdown); +EXPORT_SYMBOL(page_pool_update_nid); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 000eddb1207d..9f7aa448bd11 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2251,6 +2251,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_MAC]) { struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]); + if (ivm->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_mac) err = ops->ndo_set_vf_mac(dev, ivm->vf, @@ -2262,6 +2264,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_VLAN]) { struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]); + if (ivv->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, @@ -2294,6 +2298,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (len == 0) return -EINVAL; + if (ivvl[0]->vf >= INT_MAX) + return -EINVAL; err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) @@ -2304,6 +2310,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]); struct ifla_vf_info ivf; + if (ivt->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_get_vf_config) err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf); @@ -2322,6 +2330,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_RATE]) { struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]); + if (ivt->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_rate) err = ops->ndo_set_vf_rate(dev, ivt->vf, @@ -2334,6 +2344,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_SPOOFCHK]) { struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]); + if (ivs->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_spoofchk) err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, @@ -2345,6 +2357,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_LINK_STATE]) { struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]); + if (ivl->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_link_state) err = ops->ndo_set_vf_link_state(dev, ivl->vf, @@ -2358,6 +2372,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) err = -EOPNOTSUPP; ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]); + if (ivrssq_en->vf >= INT_MAX) + return -EINVAL; if (ops->ndo_set_vf_rss_query_en) err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf, ivrssq_en->setting); @@ -2368,6 +2384,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_TRUST]) { struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]); + if (ivt->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_trust) err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting); @@ -2378,15 +2396,18 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_IB_NODE_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]); + if (ivt->vf >= INT_MAX) + return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; - return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID); } if (tb[IFLA_VF_IB_PORT_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]); + if (ivt->vf >= INT_MAX) + return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index ad31e4e53d0a..a469d2124f3f 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -793,15 +793,18 @@ static void sk_psock_strp_data_ready(struct sock *sk) static void sk_psock_write_space(struct sock *sk) { struct sk_psock *psock; - void (*write_space)(struct sock *sk); + void (*write_space)(struct sock *sk) = NULL; rcu_read_lock(); psock = sk_psock(sk); - if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))) - schedule_work(&psock->work); - write_space = psock->saved_write_space; + if (likely(psock)) { + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_work(&psock->work); + write_space = psock->saved_write_space; + } rcu_read_unlock(); - write_space(sk); + if (write_space) + write_space(sk); } int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) diff --git a/net/core/xdp.c b/net/core/xdp.c index 20781ad5f9c3..e334fad0a6b8 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -70,77 +70,63 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) xa = container_of(rcu, struct xdp_mem_allocator, rcu); - /* Allocator have indicated safe to remove before this is called */ - if (xa->mem.type == MEM_TYPE_PAGE_POOL) - page_pool_free(xa->page_pool); - /* Allow this ID to be reused */ ida_simple_remove(&mem_id_pool, xa->mem.id); - /* Poison memory */ - xa->mem.id = 0xFFFF; - xa->mem.type = 0xF0F0; - xa->allocator = (void *)0xDEAD9001; - kfree(xa); } -static bool __mem_id_disconnect(int id, bool force) +static void mem_xa_remove(struct xdp_mem_allocator *xa) { - struct xdp_mem_allocator *xa; - bool safe_to_remove = true; + trace_mem_disconnect(xa); mutex_lock(&mem_id_lock); - xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - WARN(1, "Request remove non-existing id(%d), driver bug?", id); - return true; - } - xa->disconnect_cnt++; - - /* Detects in-flight packet-pages for page_pool */ - if (xa->mem.type == MEM_TYPE_PAGE_POOL) - safe_to_remove = page_pool_request_shutdown(xa->page_pool); - - trace_mem_disconnect(xa, safe_to_remove, force); - - if ((safe_to_remove || force) && - !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); mutex_unlock(&mem_id_lock); - return (safe_to_remove|force); } -#define DEFER_TIME (msecs_to_jiffies(1000)) -#define DEFER_WARN_INTERVAL (30 * HZ) -#define DEFER_MAX_RETRIES 120 +static void mem_allocator_disconnect(void *allocator) +{ + struct xdp_mem_allocator *xa; + struct rhashtable_iter iter; + + rhashtable_walk_enter(mem_id_ht, &iter); + do { + rhashtable_walk_start(&iter); + + while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) { + if (xa->allocator == allocator) + mem_xa_remove(xa); + } + + rhashtable_walk_stop(&iter); -static void mem_id_disconnect_defer_retry(struct work_struct *wq) + } while (xa == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); +} + +static void mem_id_disconnect(int id) { - struct delayed_work *dwq = to_delayed_work(wq); - struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq); - bool force = false; + struct xdp_mem_allocator *xa; - if (xa->disconnect_cnt > DEFER_MAX_RETRIES) - force = true; + mutex_lock(&mem_id_lock); - if (__mem_id_disconnect(xa->mem.id, force)) + xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); + if (!xa) { + mutex_unlock(&mem_id_lock); + WARN(1, "Request remove non-existing id(%d), driver bug?", id); return; + } - /* Periodic warning */ - if (time_after_eq(jiffies, xa->defer_warn)) { - int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ; + trace_mem_disconnect(xa); - pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n", - __func__, xa->mem.id, xa->disconnect_cnt, sec); - xa->defer_warn = jiffies + DEFER_WARN_INTERVAL; - } + if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); - /* Still not ready to be disconnected, retry later */ - schedule_delayed_work(&xa->defer_wq, DEFER_TIME); + mutex_unlock(&mem_id_lock); } void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) @@ -153,38 +139,21 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) return; } - if (xdp_rxq->mem.type != MEM_TYPE_PAGE_POOL && - xdp_rxq->mem.type != MEM_TYPE_ZERO_COPY) { - return; - } - if (id == 0) return; - if (__mem_id_disconnect(id, false)) - return; - - /* Could not disconnect, defer new disconnect attempt to later */ - mutex_lock(&mem_id_lock); + if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY) + return mem_id_disconnect(id); - xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - return; + if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { + rcu_read_lock(); + xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); + page_pool_destroy(xa->page_pool); + rcu_read_unlock(); } - xa->defer_start = jiffies; - xa->defer_warn = jiffies + DEFER_WARN_INTERVAL; - - INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry); - mutex_unlock(&mem_id_lock); - schedule_delayed_work(&xa->defer_wq, DEFER_TIME); } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); -/* This unregister operation will also cleanup and destroy the - * allocator. The page_pool_free() operation is first called when it's - * safe to remove, possibly deferred to a workqueue. - */ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { /* Simplify driver cleanup code paths, allow unreg "unused" */ @@ -371,7 +340,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, } if (type == MEM_TYPE_PAGE_POOL) - page_pool_get(xdp_alloc->page_pool); + page_pool_use_xdp_mem(allocator, mem_allocator_disconnect); mutex_unlock(&mem_id_lock); @@ -402,15 +371,8 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); - if (likely(xa)) { - napi_direct &= !xdp_return_frame_no_direct(); - page_pool_put_page(xa->page_pool, page, napi_direct); - } else { - /* Hopefully stack show who to blame for late return */ - WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id); - trace_mem_return_failed(mem, page); - put_page(page); - } + napi_direct &= !xdp_return_frame_no_direct(); + page_pool_put_page(xa->page_pool, page, napi_direct); rcu_read_unlock(); break; case MEM_TYPE_PAGE_SHARED: diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 136612792c08..1e6c3cac11e6 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -79,6 +79,13 @@ config NET_DSA_TAG_KSZ Say Y if you want to enable support for tagging frames for the Microchip 8795/9477/9893 families of switches. +config NET_DSA_TAG_OCELOT + tristate "Tag driver for Ocelot family of switches" + select PACKING + help + Say Y or M if you want to enable support for tagging frames for the + Ocelot switches (VSC7511, VSC7512, VSC7513, VSC7514, VSC9959). + config NET_DSA_TAG_QCA tristate "Tag driver for Qualcomm Atheros QCA8K switches" help diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 2c6d286f0511..9a482c38bdb1 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o +obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 53e7577896b6..2dd86d9bcda9 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -153,8 +153,8 @@ void dsa_port_link_unregister_of(struct dsa_port *dp); void dsa_port_phylink_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); -int dsa_port_phylink_mac_link_state(struct phylink_config *config, - struct phylink_link_state *state); +void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, + struct phylink_link_state *state); void dsa_port_phylink_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state); diff --git a/net/dsa/port.c b/net/dsa/port.c index 6e93c36bf0c0..46ac9ba21987 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -429,19 +429,22 @@ void dsa_port_phylink_validate(struct phylink_config *config, } EXPORT_SYMBOL_GPL(dsa_port_phylink_validate); -int dsa_port_phylink_mac_link_state(struct phylink_config *config, - struct phylink_link_state *state) +void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, + struct phylink_link_state *state) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; - /* Only called for SGMII and 802.3z */ - if (!ds->ops->phylink_mac_link_state) - return -EOPNOTSUPP; + /* Only called for inband modes */ + if (!ds->ops->phylink_mac_link_state) { + state->link = 0; + return; + } - return ds->ops->phylink_mac_link_state(ds, dp->index, state); + if (ds->ops->phylink_mac_link_state(ds, dp->index, state) < 0) + state->link = 0; } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_state); +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_pcs_get_state); void dsa_port_phylink_mac_config(struct phylink_config *config, unsigned int mode, @@ -510,7 +513,7 @@ EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_up); const struct phylink_mac_ops dsa_port_phylink_mac_ops = { .validate = dsa_port_phylink_validate, - .mac_link_state = dsa_port_phylink_mac_link_state, + .mac_pcs_get_state = dsa_port_phylink_mac_pcs_get_state, .mac_config = dsa_port_phylink_mac_config, .mac_an_restart = dsa_port_phylink_mac_an_restart, .mac_link_down = dsa_port_phylink_mac_link_down, diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 73632d21f1a6..2fb6c26294b5 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -105,7 +105,7 @@ static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) slave = dsa_to_port(ds, port)->slave; err = br_vlan_get_pvid(slave, &pvid); - if (err < 0) + if (!pvid || err < 0) /* There is no pvid on the bridge for this port, which is * perfectly valid. Nothing to restore, bye-bye! */ diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c new file mode 100644 index 000000000000..8e3e7283d430 --- /dev/null +++ b/net/dsa/tag_ocelot.c @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2019 NXP Semiconductors + */ +#include <soc/mscc/ocelot.h> +#include <linux/packing.h> +#include "dsa_priv.h" + +/* The CPU injection header and the CPU extraction header can have 3 types of + * prefixes: long, short and no prefix. The format of the header itself is the + * same in all 3 cases. + * + * Extraction with long prefix: + * + * +-------------------+-------------------+------+------+------------+-------+ + * | ff:ff:ff:ff:ff:ff | ff:ff:ff:ff:ff:ff | 8880 | 000a | extraction | frame | + * | | | | | header | | + * +-------------------+-------------------+------+------+------------+-------+ + * 48 bits 48 bits 16 bits 16 bits 128 bits + * + * Extraction with short prefix: + * + * +------+------+------------+-------+ + * | 8880 | 000a | extraction | frame | + * | | | header | | + * +------+------+------------+-------+ + * 16 bits 16 bits 128 bits + * + * Extraction with no prefix: + * + * +------------+-------+ + * | extraction | frame | + * | header | | + * +------------+-------+ + * 128 bits + * + * + * Injection with long prefix: + * + * +-------------------+-------------------+------+------+------------+-------+ + * | any dmac | any smac | 8880 | 000a | injection | frame | + * | | | | | header | | + * +-------------------+-------------------+------+------+------------+-------+ + * 48 bits 48 bits 16 bits 16 bits 128 bits + * + * Injection with short prefix: + * + * +------+------+------------+-------+ + * | 8880 | 000a | injection | frame | + * | | | header | | + * +------+------+------------+-------+ + * 16 bits 16 bits 128 bits + * + * Injection with no prefix: + * + * +------------+-------+ + * | injection | frame | + * | header | | + * +------------+-------+ + * 128 bits + * + * The injection header looks like this (network byte order, bit 127 + * is part of lowest address byte in memory, bit 0 is part of highest + * address byte): + * + * +------+------+------+------+------+------+------+------+ + * 127:120 |BYPASS| MASQ | MASQ_PORT |REW_OP|REW_OP| + * +------+------+------+------+------+------+------+------+ + * 119:112 | REW_OP | + * +------+------+------+------+------+------+------+------+ + * 111:104 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 103: 96 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 95: 88 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 87: 80 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 79: 72 | RSV | + * +------+------+------+------+------+------+------+------+ + * 71: 64 | RSV | DEST | + * +------+------+------+------+------+------+------+------+ + * 63: 56 | DEST | + * +------+------+------+------+------+------+------+------+ + * 55: 48 | RSV | + * +------+------+------+------+------+------+------+------+ + * 47: 40 | RSV | SRC_PORT | RSV |TFRM_TIMER| + * +------+------+------+------+------+------+------+------+ + * 39: 32 | TFRM_TIMER | RSV | + * +------+------+------+------+------+------+------+------+ + * 31: 24 | RSV | DP | POP_CNT | CPUQ | + * +------+------+------+------+------+------+------+------+ + * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE| + * +------+------+------+------+------+------+------+------+ + * 15: 8 | PCP | DEI | VID | + * +------+------+------+------+------+------+------+------+ + * 7: 0 | VID | + * +------+------+------+------+------+------+------+------+ + * + * And the extraction header looks like this: + * + * +------+------+------+------+------+------+------+------+ + * 127:120 | RSV | REW_OP | + * +------+------+------+------+------+------+------+------+ + * 119:112 | REW_OP | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 111:104 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 103: 96 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 95: 88 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 87: 80 | REW_VAL | LLEN | + * +------+------+------+------+------+------+------+------+ + * 79: 72 | LLEN | WLEN | + * +------+------+------+------+------+------+------+------+ + * 71: 64 | WLEN | RSV | + * +------+------+------+------+------+------+------+------+ + * 63: 56 | RSV | + * +------+------+------+------+------+------+------+------+ + * 55: 48 | RSV | + * +------+------+------+------+------+------+------+------+ + * 47: 40 | RSV | SRC_PORT | ACL_ID | + * +------+------+------+------+------+------+------+------+ + * 39: 32 | ACL_ID | RSV | SFLOW_ID | + * +------+------+------+------+------+------+------+------+ + * 31: 24 |ACL_HIT| DP | LRN_FLAGS | CPUQ | + * +------+------+------+------+------+------+------+------+ + * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE| + * +------+------+------+------+------+------+------+------+ + * 15: 8 | PCP | DEI | VID | + * +------+------+------+------+------+------+------+------+ + * 7: 0 | VID | + * +------+------+------+------+------+------+------+------+ + */ + +static struct sk_buff *ocelot_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct dsa_port *dp = dsa_slave_to_port(netdev); + u64 bypass, dest, src, qos_class, rew_op; + struct dsa_switch *ds = dp->ds; + int port = dp->index; + struct ocelot *ocelot = ds->priv; + struct ocelot_port *ocelot_port = ocelot->ports[port]; + u8 *injection; + + if (unlikely(skb_cow_head(skb, OCELOT_TAG_LEN) < 0)) { + netdev_err(netdev, "Cannot make room for tag.\n"); + return NULL; + } + + injection = skb_push(skb, OCELOT_TAG_LEN); + + memset(injection, 0, OCELOT_TAG_LEN); + + src = dsa_upstream_port(ds, port); + dest = BIT(port); + bypass = true; + qos_class = skb->priority; + + packing(injection, &bypass, 127, 127, OCELOT_TAG_LEN, PACK, 0); + packing(injection, &dest, 68, 56, OCELOT_TAG_LEN, PACK, 0); + packing(injection, &src, 46, 43, OCELOT_TAG_LEN, PACK, 0); + packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0); + + if (ocelot->ptp && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { + rew_op = ocelot_port->ptp_cmd; + if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { + rew_op |= (ocelot_port->ts_id % 4) << 3; + ocelot_port->ts_id++; + } + + packing(injection, &rew_op, 125, 117, OCELOT_TAG_LEN, PACK, 0); + } + + return skb; +} + +static struct sk_buff *ocelot_rcv(struct sk_buff *skb, + struct net_device *netdev, + struct packet_type *pt) +{ + u64 src_port, qos_class; + u8 *start = skb->data; + u8 *extraction; + + /* Revert skb->data by the amount consumed by the DSA master, + * so it points to the beginning of the frame. + */ + skb_push(skb, ETH_HLEN); + /* We don't care about the long prefix, it is just for easy entrance + * into the DSA master's RX filter. Discard it now by moving it into + * the headroom. + */ + skb_pull(skb, OCELOT_LONG_PREFIX_LEN); + /* And skb->data now points to the extraction frame header. + * Keep a pointer to it. + */ + extraction = skb->data; + /* Now the EFH is part of the headroom as well */ + skb_pull(skb, OCELOT_TAG_LEN); + /* Reset the pointer to the real MAC header */ + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + /* And move skb->data to the correct location again */ + skb_pull(skb, ETH_HLEN); + + /* Remove from inet csum the extraction header */ + skb_postpull_rcsum(skb, start, OCELOT_LONG_PREFIX_LEN + OCELOT_TAG_LEN); + + packing(extraction, &src_port, 46, 43, OCELOT_TAG_LEN, UNPACK, 0); + packing(extraction, &qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0); + + skb->dev = dsa_master_find_slave(netdev, 0, src_port); + if (!skb->dev) + /* The switch will reflect back some frames sent through + * sockets opened on the bare DSA master. These will come back + * with src_port equal to the index of the CPU port, for which + * there is no slave registered. So don't print any error + * message here (ignore and drop those frames). + */ + return NULL; + + skb->offload_fwd_mark = 1; + skb->priority = qos_class; + + return skb; +} + +static struct dsa_device_ops ocelot_netdev_ops = { + .name = "ocelot", + .proto = DSA_TAG_PROTO_OCELOT, + .xmit = ocelot_xmit, + .rcv = ocelot_rcv, + .overhead = OCELOT_TAG_LEN + OCELOT_LONG_PREFIX_LEN, +}; + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT); + +module_dsa_tag_driver(ocelot_netdev_ops); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 03381f3e12ba..fc816b187170 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -180,8 +180,8 @@ config NET_IPIP config NET_IPGRE_DEMUX tristate "IP: GRE demultiplexer" help - This is helper module to demultiplex GRE packets on GRE version field criteria. - Required by ip_gre and pptp modules. + This is helper module to demultiplex GRE packets on GRE version field criteria. + Required by ip_gre and pptp modules. config NET_IP_TUNNEL tristate @@ -459,200 +459,200 @@ config TCP_CONG_BIC tristate "Binary Increase Congestion (BIC) control" default m ---help--- - BIC-TCP is a sender-side only change that ensures a linear RTT - fairness under large windows while offering both scalability and - bounded TCP-friendliness. The protocol combines two schemes - called additive increase and binary search increase. When the - congestion window is large, additive increase with a large - increment ensures linear RTT fairness as well as good - scalability. Under small congestion windows, binary search - increase provides TCP friendliness. - See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ + BIC-TCP is a sender-side only change that ensures a linear RTT + fairness under large windows while offering both scalability and + bounded TCP-friendliness. The protocol combines two schemes + called additive increase and binary search increase. When the + congestion window is large, additive increase with a large + increment ensures linear RTT fairness as well as good + scalability. Under small congestion windows, binary search + increase provides TCP friendliness. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ config TCP_CONG_CUBIC tristate "CUBIC TCP" default y ---help--- - This is version 2.0 of BIC-TCP which uses a cubic growth function - among other techniques. - See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf + This is version 2.0 of BIC-TCP which uses a cubic growth function + among other techniques. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf config TCP_CONG_WESTWOOD tristate "TCP Westwood+" default m ---help--- - TCP Westwood+ is a sender-side only modification of the TCP Reno - protocol stack that optimizes the performance of TCP congestion - control. It is based on end-to-end bandwidth estimation to set - congestion window and slow start threshold after a congestion - episode. Using this estimation, TCP Westwood+ adaptively sets a - slow start threshold and a congestion window which takes into - account the bandwidth used at the time congestion is experienced. - TCP Westwood+ significantly increases fairness wrt TCP Reno in - wired networks and throughput over wireless links. + TCP Westwood+ is a sender-side only modification of the TCP Reno + protocol stack that optimizes the performance of TCP congestion + control. It is based on end-to-end bandwidth estimation to set + congestion window and slow start threshold after a congestion + episode. Using this estimation, TCP Westwood+ adaptively sets a + slow start threshold and a congestion window which takes into + account the bandwidth used at the time congestion is experienced. + TCP Westwood+ significantly increases fairness wrt TCP Reno in + wired networks and throughput over wireless links. config TCP_CONG_HTCP tristate "H-TCP" default m ---help--- - H-TCP is a send-side only modifications of the TCP Reno - protocol stack that optimizes the performance of TCP - congestion control for high speed network links. It uses a - modeswitch to change the alpha and beta parameters of TCP Reno - based on network conditions and in a way so as to be fair with - other Reno and H-TCP flows. + H-TCP is a send-side only modifications of the TCP Reno + protocol stack that optimizes the performance of TCP + congestion control for high speed network links. It uses a + modeswitch to change the alpha and beta parameters of TCP Reno + based on network conditions and in a way so as to be fair with + other Reno and H-TCP flows. config TCP_CONG_HSTCP tristate "High Speed TCP" default n ---help--- - Sally Floyd's High Speed TCP (RFC 3649) congestion control. - A modification to TCP's congestion control mechanism for use - with large congestion windows. A table indicates how much to - increase the congestion window by when an ACK is received. - For more detail see http://www.icir.org/floyd/hstcp.html + Sally Floyd's High Speed TCP (RFC 3649) congestion control. + A modification to TCP's congestion control mechanism for use + with large congestion windows. A table indicates how much to + increase the congestion window by when an ACK is received. + For more detail see http://www.icir.org/floyd/hstcp.html config TCP_CONG_HYBLA tristate "TCP-Hybla congestion control algorithm" default n ---help--- - TCP-Hybla is a sender-side only change that eliminates penalization of - long-RTT, large-bandwidth connections, like when satellite legs are - involved, especially when sharing a common bottleneck with normal - terrestrial connections. + TCP-Hybla is a sender-side only change that eliminates penalization of + long-RTT, large-bandwidth connections, like when satellite legs are + involved, especially when sharing a common bottleneck with normal + terrestrial connections. config TCP_CONG_VEGAS tristate "TCP Vegas" default n ---help--- - TCP Vegas is a sender-side only change to TCP that anticipates - the onset of congestion by estimating the bandwidth. TCP Vegas - adjusts the sending rate by modifying the congestion - window. TCP Vegas should provide less packet loss, but it is - not as aggressive as TCP Reno. + TCP Vegas is a sender-side only change to TCP that anticipates + the onset of congestion by estimating the bandwidth. TCP Vegas + adjusts the sending rate by modifying the congestion + window. TCP Vegas should provide less packet loss, but it is + not as aggressive as TCP Reno. config TCP_CONG_NV - tristate "TCP NV" - default n - ---help--- - TCP NV is a follow up to TCP Vegas. It has been modified to deal with - 10G networks, measurement noise introduced by LRO, GRO and interrupt - coalescence. In addition, it will decrease its cwnd multiplicatively - instead of linearly. + tristate "TCP NV" + default n + ---help--- + TCP NV is a follow up to TCP Vegas. It has been modified to deal with + 10G networks, measurement noise introduced by LRO, GRO and interrupt + coalescence. In addition, it will decrease its cwnd multiplicatively + instead of linearly. - Note that in general congestion avoidance (cwnd decreased when # packets - queued grows) cannot coexist with congestion control (cwnd decreased only - when there is packet loss) due to fairness issues. One scenario when they - can coexist safely is when the CA flows have RTTs << CC flows RTTs. + Note that in general congestion avoidance (cwnd decreased when # packets + queued grows) cannot coexist with congestion control (cwnd decreased only + when there is packet loss) due to fairness issues. One scenario when they + can coexist safely is when the CA flows have RTTs << CC flows RTTs. - For further details see http://www.brakmo.org/networking/tcp-nv/ + For further details see http://www.brakmo.org/networking/tcp-nv/ config TCP_CONG_SCALABLE tristate "Scalable TCP" default n ---help--- - Scalable TCP is a sender-side only change to TCP which uses a - MIMD congestion control algorithm which has some nice scaling - properties, though is known to have fairness issues. - See http://www.deneholme.net/tom/scalable/ + Scalable TCP is a sender-side only change to TCP which uses a + MIMD congestion control algorithm which has some nice scaling + properties, though is known to have fairness issues. + See http://www.deneholme.net/tom/scalable/ config TCP_CONG_LP tristate "TCP Low Priority" default n ---help--- - TCP Low Priority (TCP-LP), a distributed algorithm whose goal is - to utilize only the excess network bandwidth as compared to the - ``fair share`` of bandwidth as targeted by TCP. - See http://www-ece.rice.edu/networks/TCP-LP/ + TCP Low Priority (TCP-LP), a distributed algorithm whose goal is + to utilize only the excess network bandwidth as compared to the + ``fair share`` of bandwidth as targeted by TCP. + See http://www-ece.rice.edu/networks/TCP-LP/ config TCP_CONG_VENO tristate "TCP Veno" default n ---help--- - TCP Veno is a sender-side only enhancement of TCP to obtain better - throughput over wireless networks. TCP Veno makes use of state - distinguishing to circumvent the difficult judgment of the packet loss - type. TCP Veno cuts down less congestion window in response to random - loss packets. - See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> + TCP Veno is a sender-side only enhancement of TCP to obtain better + throughput over wireless networks. TCP Veno makes use of state + distinguishing to circumvent the difficult judgment of the packet loss + type. TCP Veno cuts down less congestion window in response to random + loss packets. + See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> config TCP_CONG_YEAH tristate "YeAH TCP" select TCP_CONG_VEGAS default n ---help--- - YeAH-TCP is a sender-side high-speed enabled TCP congestion control - algorithm, which uses a mixed loss/delay approach to compute the - congestion window. It's design goals target high efficiency, - internal, RTT and Reno fairness, resilience to link loss while - keeping network elements load as low as possible. + YeAH-TCP is a sender-side high-speed enabled TCP congestion control + algorithm, which uses a mixed loss/delay approach to compute the + congestion window. It's design goals target high efficiency, + internal, RTT and Reno fairness, resilience to link loss while + keeping network elements load as low as possible. - For further details look here: - http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf + For further details look here: + http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf config TCP_CONG_ILLINOIS tristate "TCP Illinois" default n ---help--- - TCP-Illinois is a sender-side modification of TCP Reno for - high speed long delay links. It uses round-trip-time to - adjust the alpha and beta parameters to achieve a higher average - throughput and maintain fairness. + TCP-Illinois is a sender-side modification of TCP Reno for + high speed long delay links. It uses round-trip-time to + adjust the alpha and beta parameters to achieve a higher average + throughput and maintain fairness. - For further details see: - http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html + For further details see: + http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html config TCP_CONG_DCTCP tristate "DataCenter TCP (DCTCP)" default n ---help--- - DCTCP leverages Explicit Congestion Notification (ECN) in the network to - provide multi-bit feedback to the end hosts. It is designed to provide: + DCTCP leverages Explicit Congestion Notification (ECN) in the network to + provide multi-bit feedback to the end hosts. It is designed to provide: - - High burst tolerance (incast due to partition/aggregate), - - Low latency (short flows, queries), - - High throughput (continuous data updates, large file transfers) with - commodity, shallow-buffered switches. + - High burst tolerance (incast due to partition/aggregate), + - Low latency (short flows, queries), + - High throughput (continuous data updates, large file transfers) with + commodity, shallow-buffered switches. - All switches in the data center network running DCTCP must support - ECN marking and be configured for marking when reaching defined switch - buffer thresholds. The default ECN marking threshold heuristic for - DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets - (~100KB) at 10Gbps, but might need further careful tweaking. + All switches in the data center network running DCTCP must support + ECN marking and be configured for marking when reaching defined switch + buffer thresholds. The default ECN marking threshold heuristic for + DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets + (~100KB) at 10Gbps, but might need further careful tweaking. - For further details see: - http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + For further details see: + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf config TCP_CONG_CDG tristate "CAIA Delay-Gradient (CDG)" default n ---help--- - CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies - the TCP sender in order to: + CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies + the TCP sender in order to: o Use the delay gradient as a congestion signal. o Back off with an average probability that is independent of the RTT. o Coexist with flows that use loss-based congestion control. o Tolerate packet loss unrelated to congestion. - For further details see: - D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using - delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg + For further details see: + D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using + delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg config TCP_CONG_BBR tristate "BBR TCP" default n ---help--- - BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to - maximize network utilization and minimize queues. It builds an explicit - model of the the bottleneck delivery rate and path round-trip - propagation delay. It tolerates packet loss and delay unrelated to - congestion. It can operate over LAN, WAN, cellular, wifi, or cable - modem links. It can coexist with flows that use loss-based congestion - control, and can operate with shallow buffers, deep buffers, - bufferbloat, policers, or AQM schemes that do not provide a delay - signal. It requires the fq ("Fair Queue") pacing packet scheduler. + BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to + maximize network utilization and minimize queues. It builds an explicit + model of the the bottleneck delivery rate and path round-trip + propagation delay. It tolerates packet loss and delay unrelated to + congestion. It can operate over LAN, WAN, cellular, wifi, or cable + modem links. It can coexist with flows that use loss-based congestion + control, and can operate with shallow buffers, deep buffers, + bufferbloat, policers, or AQM schemes that do not provide a delay + signal. It requires the fq ("Fair Queue") pacing packet scheduler. choice prompt "Default TCP congestion control" diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 71c78d223dfd..577db1d50a24 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -70,11 +70,6 @@ fail: fib_free_table(main_table); return -ENOMEM; } - -static bool fib4_has_custom_rules(struct net *net) -{ - return false; -} #else struct fib_table *fib_new_table(struct net *net, u32 id) @@ -131,11 +126,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id) } return NULL; } - -static bool fib4_has_custom_rules(struct net *net) -{ - return net->ipv4.fib_has_custom_rules; -} #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 10636fb6093e..572b6307a2df 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -340,6 +340,8 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, iph->saddr, iph->daddr, tpi->key); if (tunnel) { + const struct iphdr *tnl_params; + if (__iptunnel_pull_header(skb, hdr_len, tpi->proto, raw_proto, false) < 0) goto drop; @@ -348,7 +350,9 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, skb_pop_mac_header(skb); else skb_reset_mac_header(skb); - if (tunnel->collect_md) { + + tnl_params = &tunnel->parms.iph; + if (tunnel->collect_md || tnl_params->daddr == 0) { __be16 flags; __be64 tun_id; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 24a95126e698..aa438c6758a7 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -302,16 +302,31 @@ drop: return true; } +static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, + const struct sk_buff *hint) +{ + return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr && + ip_hdr(hint)->tos == iph->tos; +} + INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *)); static int ip_rcv_finish_core(struct net *net, struct sock *sk, - struct sk_buff *skb, struct net_device *dev) + struct sk_buff *skb, struct net_device *dev, + const struct sk_buff *hint) { const struct iphdr *iph = ip_hdr(skb); int (*edemux)(struct sk_buff *skb); struct rtable *rt; int err; + if (ip_can_use_hint(skb, iph, hint)) { + err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos, + dev, hint); + if (unlikely(err)) + goto drop_error; + } + if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk && @@ -408,7 +423,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) if (!skb) return NET_RX_SUCCESS; - ret = ip_rcv_finish_core(net, sk, skb, dev); + ret = ip_rcv_finish_core(net, sk, skb, dev, NULL); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; @@ -535,11 +550,20 @@ static void ip_sublist_rcv_finish(struct list_head *head) } } +static struct sk_buff *ip_extract_route_hint(const struct net *net, + struct sk_buff *skb, int rt_type) +{ + if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST) + return NULL; + + return skb; +} + static void ip_list_rcv_finish(struct net *net, struct sock *sk, struct list_head *head) { + struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; - struct sk_buff *skb, *next; struct list_head sublist; INIT_LIST_HEAD(&sublist); @@ -554,11 +578,14 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, skb = l3mdev_ip_rcv(skb); if (!skb) continue; - if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP) + if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP) continue; dst = skb_dst(skb); if (curr_dst != dst) { + hint = ip_extract_route_hint(net, skb, + ((struct rtable *)dst)->rt_type); + /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv_finish(&sublist); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 3d8baaaf7086..9d83cb320dcb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -422,7 +422,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev; + struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); @@ -430,7 +430,7 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) skb->protocol = htons(ETH_P_IP); return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, NULL, dev, + net, sk, skb, indev, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index c724fb30d048..47f8b947eef1 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -215,6 +215,7 @@ void ip_tunnel_get_stats64(struct net_device *dev, EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { + [LWTUNNEL_IP_UNSPEC] = { .strict_start_type = LWTUNNEL_IP_OPTS }, [LWTUNNEL_IP_ID] = { .type = NLA_U64 }, [LWTUNNEL_IP_DST] = { .type = NLA_U32 }, [LWTUNNEL_IP_SRC] = { .type = NLA_U32 }, @@ -251,7 +252,7 @@ erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = { }; static int ip_tun_parse_opts_geneve(struct nlattr *attr, - struct ip_tunnel_info *info, + struct ip_tunnel_info *info, int opts_len, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1]; @@ -273,7 +274,7 @@ static int ip_tun_parse_opts_geneve(struct nlattr *attr, return -EINVAL; if (info) { - struct geneve_opt *opt = ip_tunnel_info_opts(info); + struct geneve_opt *opt = ip_tunnel_info_opts(info) + opts_len; memcpy(opt->opt_data, nla_data(attr), data_len); opt->length = data_len / 4; @@ -288,7 +289,7 @@ static int ip_tun_parse_opts_geneve(struct nlattr *attr, } static int ip_tun_parse_opts_vxlan(struct nlattr *attr, - struct ip_tunnel_info *info, + struct ip_tunnel_info *info, int opts_len, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1]; @@ -303,7 +304,8 @@ static int ip_tun_parse_opts_vxlan(struct nlattr *attr, return -EINVAL; if (info) { - struct vxlan_metadata *md = ip_tunnel_info_opts(info); + struct vxlan_metadata *md = + ip_tunnel_info_opts(info) + opts_len; attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP]; md->gbp = nla_get_u32(attr); @@ -314,11 +316,12 @@ static int ip_tun_parse_opts_vxlan(struct nlattr *attr, } static int ip_tun_parse_opts_erspan(struct nlattr *attr, - struct ip_tunnel_info *info, + struct ip_tunnel_info *info, int opts_len, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1]; int err; + u8 ver; err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr, erspan_opt_policy, extack); @@ -328,23 +331,31 @@ static int ip_tun_parse_opts_erspan(struct nlattr *attr, if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER]) return -EINVAL; - if (info) { - struct erspan_metadata *md = ip_tunnel_info_opts(info); + ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]); + if (ver == 1) { + if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]) + return -EINVAL; + } else if (ver == 2) { + if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] || + !tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]) + return -EINVAL; + } else { + return -EINVAL; + } - attr = tb[LWTUNNEL_IP_OPT_ERSPAN_VER]; - md->version = nla_get_u8(attr); + if (info) { + struct erspan_metadata *md = + ip_tunnel_info_opts(info) + opts_len; - if (md->version == 1 && tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]) { + md->version = ver; + if (ver == 1) { attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]; md->u.index = nla_get_be32(attr); - } else if (md->version == 2 && tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] && - tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]) { + } else { attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR]; md->u.md2.dir = nla_get_u8(attr); attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]; set_hwid(&md->u.md2, nla_get_u8(attr)); - } else { - return -EINVAL; } info->key.tun_flags |= TUNNEL_ERSPAN_OPT; @@ -356,30 +367,57 @@ static int ip_tun_parse_opts_erspan(struct nlattr *attr, static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, struct netlink_ext_ack *extack) { - struct nlattr *tb[LWTUNNEL_IP_OPTS_MAX + 1]; - int err; + int err, rem, opt_len, opts_len = 0, type = 0; + struct nlattr *nla; if (!attr) return 0; - err = nla_parse_nested(tb, LWTUNNEL_IP_OPTS_MAX, attr, - ip_opts_policy, extack); + err = nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX, + ip_opts_policy, extack); if (err) return err; - if (tb[LWTUNNEL_IP_OPTS_GENEVE]) - err = ip_tun_parse_opts_geneve(tb[LWTUNNEL_IP_OPTS_GENEVE], - info, extack); - else if (tb[LWTUNNEL_IP_OPTS_VXLAN]) - err = ip_tun_parse_opts_vxlan(tb[LWTUNNEL_IP_OPTS_VXLAN], - info, extack); - else if (tb[LWTUNNEL_IP_OPTS_ERSPAN]) - err = ip_tun_parse_opts_erspan(tb[LWTUNNEL_IP_OPTS_ERSPAN], - info, extack); - else - err = -EINVAL; + nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) { + switch (nla_type(nla)) { + case LWTUNNEL_IP_OPTS_GENEVE: + if (type && type != TUNNEL_GENEVE_OPT) + return -EINVAL; + opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len, + extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + if (opts_len > IP_TUNNEL_OPTS_MAX) + return -EINVAL; + type = TUNNEL_GENEVE_OPT; + break; + case LWTUNNEL_IP_OPTS_VXLAN: + if (type) + return -EINVAL; + opt_len = ip_tun_parse_opts_vxlan(nla, info, opts_len, + extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + type = TUNNEL_VXLAN_OPT; + break; + case LWTUNNEL_IP_OPTS_ERSPAN: + if (type) + return -EINVAL; + opt_len = ip_tun_parse_opts_erspan(nla, info, opts_len, + extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + type = TUNNEL_ERSPAN_OPT; + break; + default: + return -EINVAL; + } + } - return err; + return opts_len; } static int ip_tun_get_optlen(struct nlattr *attr, @@ -477,18 +515,23 @@ static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb, { struct geneve_opt *opt; struct nlattr *nest; + int offset = 0; nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE); if (!nest) return -ENOMEM; - opt = ip_tunnel_info_opts(tun_info); - if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS, opt->opt_class) || - nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) || - nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4, - opt->opt_data)) { - nla_nest_cancel(skb, nest); - return -ENOMEM; + while (tun_info->options_len > offset) { + opt = ip_tunnel_info_opts(tun_info) + offset; + if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS, + opt->opt_class) || + nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) || + nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4, + opt->opt_data)) { + nla_nest_cancel(skb, nest); + return -ENOMEM; + } + offset += sizeof(*opt) + opt->length * 4; } nla_nest_end(skb, nest); @@ -526,7 +569,7 @@ static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb, return -ENOMEM; md = ip_tunnel_info_opts(tun_info); - if (nla_put_u32(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version)) + if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version)) goto err; if (md->version == 1 && @@ -602,13 +645,18 @@ static int ip_tun_opts_nlsize(struct ip_tunnel_info *info) opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */ if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { - struct geneve_opt *opt = ip_tunnel_info_opts(info); - - opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_GENEVE */ - + nla_total_size(2) /* OPT_GENEVE_CLASS */ - + nla_total_size(1) /* OPT_GENEVE_TYPE */ - + nla_total_size(opt->length * 4); - /* OPT_GENEVE_DATA */ + struct geneve_opt *opt; + int offset = 0; + + opt_len += nla_total_size(0); /* LWTUNNEL_IP_OPTS_GENEVE */ + while (info->options_len > offset) { + opt = ip_tunnel_info_opts(info) + offset; + opt_len += nla_total_size(2) /* OPT_GENEVE_CLASS */ + + nla_total_size(1) /* OPT_GENEVE_TYPE */ + + nla_total_size(opt->length * 4); + /* OPT_GENEVE_DATA */ + offset += sizeof(*opt) + opt->length * 4; + } } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */ + nla_total_size(4); /* OPT_VXLAN_GBP */ @@ -661,12 +709,14 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { }; static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { + [LWTUNNEL_IP6_UNSPEC] = { .strict_start_type = LWTUNNEL_IP6_OPTS }, [LWTUNNEL_IP6_ID] = { .type = NLA_U64 }, [LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, + [LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED }, }; static int ip6_tun_build_state(struct nlattr *attr, diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 32e20b758b68..f35308ff84c3 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1412,6 +1412,9 @@ static int __init wait_for_devices(void) struct net_device *dev; int found = 0; + /* make sure deferred device probes are finished */ + wait_for_device_probe(); + rtnl_lock(); for_each_netdev(&init_net, dev) { if (ic_is_init_dev(dev)) { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 440294bdb752..6e68def66822 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2291,7 +2291,8 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, rcu_read_unlock(); return -ENODEV; } - skb2 = skb_clone(skb, GFP_ATOMIC); + + skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr)); if (!skb2) { read_unlock(&mrt_lock); rcu_read_unlock(); diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index 168b72e18be0..e32e41b99f0f 100644 --- a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -10,7 +10,7 @@ static struct nf_flowtable_type flowtable_ipv4 = { .family = NFPROTO_IPV4, .init = nf_flow_table_init, .setup = nf_flow_table_offload_setup, - .action = nf_flow_rule_route, + .action = nf_flow_rule_route_ipv4, .free = nf_flow_table_free, .hook = nf_flow_offload_ip_hook, .owner = THIS_MODULE, diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index fc34fd1668d6..511eaa94e2d1 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -23,7 +23,6 @@ static void remove_nexthop(struct net *net, struct nexthop *nh, #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { - [NHA_UNSPEC] = { .strict_start_type = NHA_UNSPEC + 1 }, [NHA_ID] = { .type = NLA_U32 }, [NHA_GROUP] = { .type = NLA_BINARY }, [NHA_GROUP_TYPE] = { .type = NLA_U16 }, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index dcc4fa10138d..f88c93c38f11 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2019,10 +2019,52 @@ static int ip_mkroute_input(struct sk_buff *skb, return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); } +/* Implements all the saddr-related checks as ip_route_input_slow(), + * assuming daddr is valid and the destination is not a local broadcast one. + * Uses the provided hint instead of performing a route lookup. + */ +int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + const struct sk_buff *hint) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + struct rtable *rt = (struct rtable *)hint; + struct net *net = dev_net(dev); + int err = -EINVAL; + u32 tag = 0; + + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + goto martian_source; + + if (ipv4_is_zeronet(saddr)) + goto martian_source; + + if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + goto martian_source; + + if (rt->rt_type != RTN_LOCAL) + goto skip_validate_source; + + tos &= IPTOS_RT_MASK; + err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag); + if (err < 0) + goto martian_source; + +skip_validate_source: + skb_dst_copy(skb, hint); + return 0; + +martian_source: + ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); + return err; +} + /* * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet * must have correct destination already attached by output routine. + * Changes in the enforced policies must be applied also to + * ip_route_use_hint(). * * Such approach solves two big problems: * 1. Not simplex devices are handled properly. diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 59ded25acd04..fcb2cd167f64 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -340,6 +340,10 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, user_key[i * 4 + 1], user_key[i * 4 + 2], user_key[i * 4 + 3]); + + if (WARN_ON_ONCE(off >= tbl.maxlen - 1)) + break; + if (i + 1 < n_keys) off += snprintf(tbl.data + off, tbl.maxlen - off, ","); } @@ -1037,7 +1041,7 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = &two, }, #endif { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index c445a81d144e..3737ec096650 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -256,6 +256,9 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; } rcu_read_unlock(); } @@ -285,6 +288,9 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; } rcu_read_unlock(); } diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 4849edb62d52..12ab5db2b71c 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -92,6 +92,9 @@ void tcp_get_available_ulp(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ulp_ops->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; } rcu_read_unlock(); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1d58ce829dca..4da5758cc718 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1297,6 +1297,27 @@ out: #define UDP_SKB_IS_STATELESS 0x80000000 +/* all head states (dst, sk, nf conntrack) except skb extensions are + * cleared by udp_rcv(). + * + * We need to preserve secpath, if present, to eventually process + * IP_CMSG_PASSSEC at recvmsg() time. + * + * Other extensions can be cleared. + */ +static bool udp_try_make_stateless(struct sk_buff *skb) +{ + if (!skb_has_extensions(skb)) + return true; + + if (!secpath_exists(skb)) { + skb_ext_reset(skb); + return true; + } + + return false; +} + static void udp_set_dev_scratch(struct sk_buff *skb) { struct udp_dev_scratch *scratch = udp_skb_scratch(skb); @@ -1308,11 +1329,7 @@ static void udp_set_dev_scratch(struct sk_buff *skb) scratch->csum_unnecessary = !!skb_csum_unnecessary(skb); scratch->is_linear = !skb_is_nonlinear(skb); #endif - /* all head states execept sp (dst, sk, nf) are always cleared by - * udp_rcv() and we need to preserve secpath, if present, to eventually - * process IP_CMSG_PASSSEC at recvmsg() time - */ - if (likely(!skb_sec_path(skb))) + if (udp_try_make_stateless(skb)) scratch->_tsize_state |= UDP_SKB_IS_STATELESS; } @@ -2534,9 +2551,11 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, case UDP_ENCAP: switch (val) { case 0: +#ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: up->encap_rcv = xfrm4_udp_encap_rcv; +#endif /* FALLTHROUGH */ case UDP_ENCAP_L2TPINUDP: up->encap_type = val; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index ecff3fce9807..89ba7c87de5d 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -92,7 +92,7 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, NULL, skb_dst(skb)->dev, + net, sk, skb, skb->dev, skb_dst(skb)->dev, __xfrm4_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index f66bc2af4e9d..7bae6a91b487 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1461,6 +1461,8 @@ out: } #endif goto failure; + } else if (fib6_requires_src(rt)) { + fib6_routes_require_src_inc(info->nl_net); } return err; @@ -1933,6 +1935,8 @@ int fib6_del(struct fib6_info *rt, struct nl_info *info) struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { + if (fib6_requires_src(cur)) + fib6_routes_require_src_dec(info->nl_net); fib6_del_route(table, fn, rtp, info); return 0; } diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index ef7f707d9ae3..7b089d0ac8cd 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -86,11 +86,27 @@ static void ip6_sublist_rcv_finish(struct list_head *head) } } +static bool ip6_can_use_hint(const struct sk_buff *skb, + const struct sk_buff *hint) +{ + return hint && !skb_dst(skb) && + ipv6_addr_equal(&ipv6_hdr(hint)->daddr, &ipv6_hdr(skb)->daddr); +} + +static struct sk_buff *ip6_extract_route_hint(const struct net *net, + struct sk_buff *skb) +{ + if (fib6_routes_require_src(net) || fib6_has_custom_rules(net)) + return NULL; + + return skb; +} + static void ip6_list_rcv_finish(struct net *net, struct sock *sk, struct list_head *head) { + struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; - struct sk_buff *skb, *next; struct list_head sublist; INIT_LIST_HEAD(&sublist); @@ -104,9 +120,15 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk, skb = l3mdev_ip6_rcv(skb); if (!skb) continue; - ip6_rcv_finish_core(net, sk, skb); + + if (ip6_can_use_hint(skb, hint)) + skb_dst_copy(skb, hint); + else + ip6_rcv_finish_core(net, sk, skb); dst = skb_dst(skb); if (curr_dst != dst) { + hint = ip6_extract_route_hint(net, skb); + /* dispatch old sublist */ if (!list_empty(&sublist)) ip6_sublist_rcv_finish(&sublist); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 71827b56c006..945508a7cb0f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -160,7 +160,7 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev; + struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); skb->protocol = htons(ETH_P_IPV6); @@ -173,7 +173,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) } return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, NULL, dev, + net, sk, skb, indev, dev, ip6_finish_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 264c292e7dcc..79fc012dd2ca 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -363,8 +363,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, break; case IPV6_TRANSPARENT: - if (valbool && !ns_capable(net->user_ns, CAP_NET_ADMIN) && - !ns_capable(net->user_ns, CAP_NET_RAW)) { + if (valbool && !ns_capable(net->user_ns, CAP_NET_RAW) && + !ns_capable(net->user_ns, CAP_NET_ADMIN)) { retv = -EPERM; break; } diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 69443e9a3aa5..0594131fa46d 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -128,9 +128,9 @@ config IP6_NF_MATCH_HL depends on NETFILTER_ADVANCED select NETFILTER_XT_MATCH_HL ---help--- - This is a backwards-compat option for the user's convenience - (e.g. when running oldconfig). It selects - CONFIG_NETFILTER_XT_MATCH_HL. + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_MATCH_HL. config IP6_NF_MATCH_IPV6HEADER tristate '"ipv6header" IPv6 Extension Headers Match' @@ -184,9 +184,9 @@ config IP6_NF_TARGET_HL depends on NETFILTER_ADVANCED && IP6_NF_MANGLE select NETFILTER_XT_TARGET_HL ---help--- - This is a backwards-compatible option for the user's convenience - (e.g. when running oldconfig). It selects - CONFIG_NETFILTER_XT_TARGET_HL. + This is a backwards-compatible option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_TARGET_HL. config IP6_NF_FILTER tristate "Packet filtering" @@ -245,14 +245,14 @@ config IP6_NF_RAW # security table for MAC policy config IP6_NF_SECURITY - tristate "Security table" - depends on SECURITY - depends on NETFILTER_ADVANCED - help - This option adds a `security' table to iptables, for use - with Mandatory Access Control (MAC) policy. - - If unsure, say N. + tristate "Security table" + depends on SECURITY + depends on NETFILTER_ADVANCED + help + This option adds a `security' table to iptables, for use + with Mandatory Access Control (MAC) policy. + + If unsure, say N. config IP6_NF_NAT tristate "ip6tables NAT support" diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c index f069bc0dc056..a8566ee12e83 100644 --- a/net/ipv6/netfilter/nf_flow_table_ipv6.c +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -11,7 +11,7 @@ static struct nf_flowtable_type flowtable_ipv6 = { .family = NFPROTO_IPV6, .init = nf_flow_table_init, .setup = nf_flow_table_offload_setup, - .action = nf_flow_rule_route, + .action = nf_flow_rule_route_ipv6, .free = nf_flow_table_free, .hook = nf_flow_offload_ipv6_hook, .owner = THIS_MODULE, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index edcb52543518..b59940416cb5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -634,7 +634,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh) * Router Reachability Probe MUST be rate-limited * to no more than one per minute. */ - if (fib6_nh->fib_nh_gw_family) + if (!fib6_nh->fib_nh_gw_family) return; nh_gw = &fib6_nh->fib_nh_gw6; @@ -6199,6 +6199,9 @@ static int __net_init ip6_route_net_init(struct net *net) dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached); +#ifdef CONFIG_IPV6_SUBTREES + net->ipv6.fib6_routes_require_src = 0; +#endif #endif net->ipv6.sysctl.flush_delay = 0; diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 9d4f75e0d33a..85a5447a3e8d 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -81,6 +81,11 @@ static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb) if (!pskb_may_pull(skb, srhoff + len)) return NULL; + /* note that pskb_may_pull may change pointers in header; + * for this reason it is necessary to reload them when needed. + */ + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + if (!seg6_validate_srh(srh, len)) return NULL; @@ -144,8 +149,9 @@ static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr) *daddr = *addr; } -int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, - u32 tbl_id) +static int +seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, + u32 tbl_id, bool local_delivery) { struct net *net = dev_net(skb->dev); struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -153,6 +159,7 @@ int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, struct dst_entry *dst = NULL; struct rt6_info *rt; struct flowi6 fl6; + int dev_flags = 0; fl6.flowi6_iif = skb->dev->ifindex; fl6.daddr = nhaddr ? *nhaddr : hdr->daddr; @@ -177,7 +184,13 @@ int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, dst = &rt->dst; } - if (dst && dst->dev->flags & IFF_LOOPBACK && !dst->error) { + /* we want to discard traffic destined for local packet processing, + * if @local_delivery is set to false. + */ + if (!local_delivery) + dev_flags |= IFF_LOOPBACK; + + if (dst && (dst->dev->flags & dev_flags) && !dst->error) { dst_release(dst); dst = NULL; } @@ -194,6 +207,12 @@ out: return dst->error; } +int seg6_lookup_nexthop(struct sk_buff *skb, + struct in6_addr *nhaddr, u32 tbl_id) +{ + return seg6_lookup_any_nexthop(skb, nhaddr, tbl_id, false); +} + /* regular endpoint function */ static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) { @@ -336,6 +355,8 @@ static int input_action_end_dx6(struct sk_buff *skb, if (!ipv6_addr_any(&slwt->nh6)) nhaddr = &slwt->nh6; + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + seg6_lookup_nexthop(skb, nhaddr, 0); return dst_input(skb); @@ -365,6 +386,8 @@ static int input_action_end_dx4(struct sk_buff *skb, skb_dst_drop(skb); + skb_set_transport_header(skb, sizeof(struct iphdr)); + err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev); if (err) goto drop; @@ -385,7 +408,9 @@ static int input_action_end_dt6(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - seg6_lookup_nexthop(skb, NULL, slwt->table); + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + + seg6_lookup_any_nexthop(skb, NULL, slwt->table, true); return dst_input(skb); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index eecac1b7148e..fbe51d40bd7e 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -187,7 +187,7 @@ skip_frag: int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, NULL, skb_dst(skb)->dev, + net, sk, skb, skb->dev, skb_dst(skb)->dev, __xfrm6_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index 4f03ebe732fa..6cbb1286d6c0 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -32,7 +32,8 @@ mac80211-y := \ chan.o \ trace.o mlme.o \ tdls.o \ - ocb.o + ocb.o \ + airtime.o mac80211-$(CONFIG_MAC80211_LEDS) += led.o mac80211-$(CONFIG_MAC80211_DEBUGFS) += \ diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c new file mode 100644 index 000000000000..63cb0028b02d --- /dev/null +++ b/net/mac80211/airtime.c @@ -0,0 +1,597 @@ +// SPDX-License-Identifier: ISC +/* + * Copyright (C) 2019 Felix Fietkau <nbd@nbd.name> + */ + +#include <net/mac80211.h> +#include "ieee80211_i.h" +#include "sta_info.h" + +#define AVG_PKT_SIZE 1024 + +/* Number of bits for an average sized packet */ +#define MCS_NBITS (AVG_PKT_SIZE << 3) + +/* Number of kilo-symbols (symbols * 1024) for a packet with (bps) bits per + * symbol. We use k-symbols to avoid rounding in the _TIME macros below. + */ +#define MCS_N_KSYMS(bps) DIV_ROUND_UP(MCS_NBITS << 10, (bps)) + +/* Transmission time (in 1024 * usec) for a packet containing (ksyms) * 1024 + * symbols. + */ +#define MCS_SYMBOL_TIME(sgi, ksyms) \ + (sgi ? \ + ((ksyms) * 4 * 18) / 20 : /* 3.6 us per sym */ \ + ((ksyms) * 4) /* 4.0 us per sym */ \ + ) + +/* Transmit duration for the raw data part of an average sized packet */ +#define MCS_DURATION(streams, sgi, bps) \ + ((u32)MCS_SYMBOL_TIME(sgi, MCS_N_KSYMS((streams) * (bps)))) + +#define MCS_DURATION_S(shift, streams, sgi, bps) \ + ((u16)((MCS_DURATION(streams, sgi, bps) >> shift))) + +/* These should match the values in enum nl80211_he_gi */ +#define HE_GI_08 0 +#define HE_GI_16 1 +#define HE_GI_32 2 + +/* Transmission time (1024 usec) for a packet containing (ksyms) * k-symbols */ +#define HE_SYMBOL_TIME(gi, ksyms) \ + (gi == HE_GI_08 ? \ + ((ksyms) * 16 * 17) / 20 : /* 13.6 us per sym */ \ + (gi == HE_GI_16 ? \ + ((ksyms) * 16 * 18) / 20 : /* 14.4 us per sym */ \ + ((ksyms) * 16) /* 16.0 us per sym */ \ + )) + +/* Transmit duration for the raw data part of an average sized packet */ +#define HE_DURATION(streams, gi, bps) \ + ((u32)HE_SYMBOL_TIME(gi, MCS_N_KSYMS((streams) * (bps)))) + +#define HE_DURATION_S(shift, streams, gi, bps) \ + (HE_DURATION(streams, gi, bps) >> shift) + +#define BW_20 0 +#define BW_40 1 +#define BW_80 2 +#define BW_160 3 + +/* + * Define group sort order: HT40 -> SGI -> #streams + */ +#define IEEE80211_MAX_STREAMS 4 +#define IEEE80211_HT_STREAM_GROUPS 4 /* BW(=2) * SGI(=2) */ +#define IEEE80211_VHT_STREAM_GROUPS 8 /* BW(=4) * SGI(=2) */ + +#define IEEE80211_HE_MAX_STREAMS 8 +#define IEEE80211_HE_STREAM_GROUPS 12 /* BW(=4) * GI(=3) */ + +#define IEEE80211_HT_GROUPS_NB (IEEE80211_MAX_STREAMS * \ + IEEE80211_HT_STREAM_GROUPS) +#define IEEE80211_VHT_GROUPS_NB (IEEE80211_MAX_STREAMS * \ + IEEE80211_VHT_STREAM_GROUPS) +#define IEEE80211_HE_GROUPS_NB (IEEE80211_HE_MAX_STREAMS * \ + IEEE80211_HE_STREAM_GROUPS) +#define IEEE80211_GROUPS_NB (IEEE80211_HT_GROUPS_NB + \ + IEEE80211_VHT_GROUPS_NB + \ + IEEE80211_HE_GROUPS_NB) + +#define IEEE80211_HT_GROUP_0 0 +#define IEEE80211_VHT_GROUP_0 (IEEE80211_HT_GROUP_0 + IEEE80211_HT_GROUPS_NB) +#define IEEE80211_HE_GROUP_0 (IEEE80211_VHT_GROUP_0 + IEEE80211_VHT_GROUPS_NB) + +#define MCS_GROUP_RATES 12 + +#define HT_GROUP_IDX(_streams, _sgi, _ht40) \ + IEEE80211_HT_GROUP_0 + \ + IEEE80211_MAX_STREAMS * 2 * _ht40 + \ + IEEE80211_MAX_STREAMS * _sgi + \ + _streams - 1 + +#define _MAX(a, b) (((a)>(b))?(a):(b)) + +#define GROUP_SHIFT(duration) \ + _MAX(0, 16 - __builtin_clz(duration)) + +/* MCS rate information for an MCS group */ +#define __MCS_GROUP(_streams, _sgi, _ht40, _s) \ + [HT_GROUP_IDX(_streams, _sgi, _ht40)] = { \ + .shift = _s, \ + .duration = { \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 54 : 26), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 108 : 52), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 162 : 78), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 216 : 104), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 324 : 156), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 432 : 208), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 486 : 234), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 540 : 260) \ + } \ +} + +#define MCS_GROUP_SHIFT(_streams, _sgi, _ht40) \ + GROUP_SHIFT(MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26)) + +#define MCS_GROUP(_streams, _sgi, _ht40) \ + __MCS_GROUP(_streams, _sgi, _ht40, \ + MCS_GROUP_SHIFT(_streams, _sgi, _ht40)) + +#define VHT_GROUP_IDX(_streams, _sgi, _bw) \ + (IEEE80211_VHT_GROUP_0 + \ + IEEE80211_MAX_STREAMS * 2 * (_bw) + \ + IEEE80211_MAX_STREAMS * (_sgi) + \ + (_streams) - 1) + +#define BW2VBPS(_bw, r4, r3, r2, r1) \ + (_bw == BW_160 ? r4 : _bw == BW_80 ? r3 : _bw == BW_40 ? r2 : r1) + +#define __VHT_GROUP(_streams, _sgi, _bw, _s) \ + [VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \ + .shift = _s, \ + .duration = { \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 234, 117, 54, 26)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 468, 234, 108, 52)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 702, 351, 162, 78)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 936, 468, 216, 104)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 1404, 702, 324, 156)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 1872, 936, 432, 208)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 2106, 1053, 486, 234)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 2340, 1170, 540, 260)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 2808, 1404, 648, 312)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 3120, 1560, 720, 346)) \ + } \ +} + +#define VHT_GROUP_SHIFT(_streams, _sgi, _bw) \ + GROUP_SHIFT(MCS_DURATION(_streams, _sgi, \ + BW2VBPS(_bw, 243, 117, 54, 26))) + +#define VHT_GROUP(_streams, _sgi, _bw) \ + __VHT_GROUP(_streams, _sgi, _bw, \ + VHT_GROUP_SHIFT(_streams, _sgi, _bw)) + + +#define HE_GROUP_IDX(_streams, _gi, _bw) \ + (IEEE80211_HE_GROUP_0 + \ + IEEE80211_HE_MAX_STREAMS * 3 * (_bw) + \ + IEEE80211_HE_MAX_STREAMS * (_gi) + \ + (_streams) - 1) + +#define __HE_GROUP(_streams, _gi, _bw, _s) \ + [HE_GROUP_IDX(_streams, _gi, _bw)] = { \ + .shift = _s, \ + .duration = { \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 979, 489, 230, 115)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 1958, 979, 475, 230)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 2937, 1468, 705, 345)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 3916, 1958, 936, 475)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 5875, 2937, 1411, 705)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 7833, 3916, 1872, 936)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 8827, 4406, 2102, 1051)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 9806, 4896, 2347, 1166)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 11764, 5875, 2808, 1411)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 13060, 6523, 3124, 1555)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 14702, 7344, 3513, 1756)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 16329, 8164, 3902, 1944)) \ + } \ +} + +#define HE_GROUP_SHIFT(_streams, _gi, _bw) \ + GROUP_SHIFT(HE_DURATION(_streams, _gi, \ + BW2VBPS(_bw, 979, 489, 230, 115))) + +#define HE_GROUP(_streams, _gi, _bw) \ + __HE_GROUP(_streams, _gi, _bw, \ + HE_GROUP_SHIFT(_streams, _gi, _bw)) +struct mcs_group { + u8 shift; + u16 duration[MCS_GROUP_RATES]; +}; + +static const struct mcs_group airtime_mcs_groups[] = { + MCS_GROUP(1, 0, BW_20), + MCS_GROUP(2, 0, BW_20), + MCS_GROUP(3, 0, BW_20), + MCS_GROUP(4, 0, BW_20), + + MCS_GROUP(1, 1, BW_20), + MCS_GROUP(2, 1, BW_20), + MCS_GROUP(3, 1, BW_20), + MCS_GROUP(4, 1, BW_20), + + MCS_GROUP(1, 0, BW_40), + MCS_GROUP(2, 0, BW_40), + MCS_GROUP(3, 0, BW_40), + MCS_GROUP(4, 0, BW_40), + + MCS_GROUP(1, 1, BW_40), + MCS_GROUP(2, 1, BW_40), + MCS_GROUP(3, 1, BW_40), + MCS_GROUP(4, 1, BW_40), + + VHT_GROUP(1, 0, BW_20), + VHT_GROUP(2, 0, BW_20), + VHT_GROUP(3, 0, BW_20), + VHT_GROUP(4, 0, BW_20), + + VHT_GROUP(1, 1, BW_20), + VHT_GROUP(2, 1, BW_20), + VHT_GROUP(3, 1, BW_20), + VHT_GROUP(4, 1, BW_20), + + VHT_GROUP(1, 0, BW_40), + VHT_GROUP(2, 0, BW_40), + VHT_GROUP(3, 0, BW_40), + VHT_GROUP(4, 0, BW_40), + + VHT_GROUP(1, 1, BW_40), + VHT_GROUP(2, 1, BW_40), + VHT_GROUP(3, 1, BW_40), + VHT_GROUP(4, 1, BW_40), + + VHT_GROUP(1, 0, BW_80), + VHT_GROUP(2, 0, BW_80), + VHT_GROUP(3, 0, BW_80), + VHT_GROUP(4, 0, BW_80), + + VHT_GROUP(1, 1, BW_80), + VHT_GROUP(2, 1, BW_80), + VHT_GROUP(3, 1, BW_80), + VHT_GROUP(4, 1, BW_80), + + VHT_GROUP(1, 0, BW_160), + VHT_GROUP(2, 0, BW_160), + VHT_GROUP(3, 0, BW_160), + VHT_GROUP(4, 0, BW_160), + + VHT_GROUP(1, 1, BW_160), + VHT_GROUP(2, 1, BW_160), + VHT_GROUP(3, 1, BW_160), + VHT_GROUP(4, 1, BW_160), + + HE_GROUP(1, HE_GI_08, BW_20), + HE_GROUP(2, HE_GI_08, BW_20), + HE_GROUP(3, HE_GI_08, BW_20), + HE_GROUP(4, HE_GI_08, BW_20), + HE_GROUP(5, HE_GI_08, BW_20), + HE_GROUP(6, HE_GI_08, BW_20), + HE_GROUP(7, HE_GI_08, BW_20), + HE_GROUP(8, HE_GI_08, BW_20), + + HE_GROUP(1, HE_GI_16, BW_20), + HE_GROUP(2, HE_GI_16, BW_20), + HE_GROUP(3, HE_GI_16, BW_20), + HE_GROUP(4, HE_GI_16, BW_20), + HE_GROUP(5, HE_GI_16, BW_20), + HE_GROUP(6, HE_GI_16, BW_20), + HE_GROUP(7, HE_GI_16, BW_20), + HE_GROUP(8, HE_GI_16, BW_20), + + HE_GROUP(1, HE_GI_32, BW_20), + HE_GROUP(2, HE_GI_32, BW_20), + HE_GROUP(3, HE_GI_32, BW_20), + HE_GROUP(4, HE_GI_32, BW_20), + HE_GROUP(5, HE_GI_32, BW_20), + HE_GROUP(6, HE_GI_32, BW_20), + HE_GROUP(7, HE_GI_32, BW_20), + HE_GROUP(8, HE_GI_32, BW_20), + + HE_GROUP(1, HE_GI_08, BW_40), + HE_GROUP(2, HE_GI_08, BW_40), + HE_GROUP(3, HE_GI_08, BW_40), + HE_GROUP(4, HE_GI_08, BW_40), + HE_GROUP(5, HE_GI_08, BW_40), + HE_GROUP(6, HE_GI_08, BW_40), + HE_GROUP(7, HE_GI_08, BW_40), + HE_GROUP(8, HE_GI_08, BW_40), + + HE_GROUP(1, HE_GI_16, BW_40), + HE_GROUP(2, HE_GI_16, BW_40), + HE_GROUP(3, HE_GI_16, BW_40), + HE_GROUP(4, HE_GI_16, BW_40), + HE_GROUP(5, HE_GI_16, BW_40), + HE_GROUP(6, HE_GI_16, BW_40), + HE_GROUP(7, HE_GI_16, BW_40), + HE_GROUP(8, HE_GI_16, BW_40), + + HE_GROUP(1, HE_GI_32, BW_40), + HE_GROUP(2, HE_GI_32, BW_40), + HE_GROUP(3, HE_GI_32, BW_40), + HE_GROUP(4, HE_GI_32, BW_40), + HE_GROUP(5, HE_GI_32, BW_40), + HE_GROUP(6, HE_GI_32, BW_40), + HE_GROUP(7, HE_GI_32, BW_40), + HE_GROUP(8, HE_GI_32, BW_40), + + HE_GROUP(1, HE_GI_08, BW_80), + HE_GROUP(2, HE_GI_08, BW_80), + HE_GROUP(3, HE_GI_08, BW_80), + HE_GROUP(4, HE_GI_08, BW_80), + HE_GROUP(5, HE_GI_08, BW_80), + HE_GROUP(6, HE_GI_08, BW_80), + HE_GROUP(7, HE_GI_08, BW_80), + HE_GROUP(8, HE_GI_08, BW_80), + + HE_GROUP(1, HE_GI_16, BW_80), + HE_GROUP(2, HE_GI_16, BW_80), + HE_GROUP(3, HE_GI_16, BW_80), + HE_GROUP(4, HE_GI_16, BW_80), + HE_GROUP(5, HE_GI_16, BW_80), + HE_GROUP(6, HE_GI_16, BW_80), + HE_GROUP(7, HE_GI_16, BW_80), + HE_GROUP(8, HE_GI_16, BW_80), + + HE_GROUP(1, HE_GI_32, BW_80), + HE_GROUP(2, HE_GI_32, BW_80), + HE_GROUP(3, HE_GI_32, BW_80), + HE_GROUP(4, HE_GI_32, BW_80), + HE_GROUP(5, HE_GI_32, BW_80), + HE_GROUP(6, HE_GI_32, BW_80), + HE_GROUP(7, HE_GI_32, BW_80), + HE_GROUP(8, HE_GI_32, BW_80), + + HE_GROUP(1, HE_GI_08, BW_160), + HE_GROUP(2, HE_GI_08, BW_160), + HE_GROUP(3, HE_GI_08, BW_160), + HE_GROUP(4, HE_GI_08, BW_160), + HE_GROUP(5, HE_GI_08, BW_160), + HE_GROUP(6, HE_GI_08, BW_160), + HE_GROUP(7, HE_GI_08, BW_160), + HE_GROUP(8, HE_GI_08, BW_160), + + HE_GROUP(1, HE_GI_16, BW_160), + HE_GROUP(2, HE_GI_16, BW_160), + HE_GROUP(3, HE_GI_16, BW_160), + HE_GROUP(4, HE_GI_16, BW_160), + HE_GROUP(5, HE_GI_16, BW_160), + HE_GROUP(6, HE_GI_16, BW_160), + HE_GROUP(7, HE_GI_16, BW_160), + HE_GROUP(8, HE_GI_16, BW_160), + + HE_GROUP(1, HE_GI_32, BW_160), + HE_GROUP(2, HE_GI_32, BW_160), + HE_GROUP(3, HE_GI_32, BW_160), + HE_GROUP(4, HE_GI_32, BW_160), + HE_GROUP(5, HE_GI_32, BW_160), + HE_GROUP(6, HE_GI_32, BW_160), + HE_GROUP(7, HE_GI_32, BW_160), + HE_GROUP(8, HE_GI_32, BW_160), +}; + +static u32 +ieee80211_calc_legacy_rate_duration(u16 bitrate, bool short_pre, + bool cck, int len) +{ + u32 duration; + + if (cck) { + duration = 144 + 48; /* preamble + PLCP */ + if (short_pre) + duration >>= 1; + + duration += 10; /* SIFS */ + } else { + duration = 20 + 16; /* premable + SIFS */ + } + + len <<= 3; + duration += (len * 10) / bitrate; + + return duration; +} + +u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, + struct ieee80211_rx_status *status, + int len) +{ + struct ieee80211_supported_band *sband; + const struct ieee80211_rate *rate; + bool sgi = status->enc_flags & RX_ENC_FLAG_SHORT_GI; + bool sp = status->enc_flags & RX_ENC_FLAG_SHORTPRE; + int bw, streams; + int group, idx; + u32 duration; + bool cck; + + switch (status->bw) { + case RATE_INFO_BW_20: + bw = BW_20; + break; + case RATE_INFO_BW_40: + bw = BW_40; + break; + case RATE_INFO_BW_80: + bw = BW_80; + break; + case RATE_INFO_BW_160: + bw = BW_160; + break; + default: + WARN_ON_ONCE(1); + return 0; + } + + switch (status->encoding) { + case RX_ENC_LEGACY: + if (WARN_ON_ONCE(status->band > NL80211_BAND_5GHZ)) + return 0; + + sband = hw->wiphy->bands[status->band]; + if (!sband || status->rate_idx > sband->n_bitrates) + return 0; + + rate = &sband->bitrates[status->rate_idx]; + cck = rate->flags & IEEE80211_RATE_MANDATORY_B; + + return ieee80211_calc_legacy_rate_duration(rate->bitrate, sp, + cck, len); + + case RX_ENC_VHT: + streams = status->nss; + idx = status->rate_idx; + group = VHT_GROUP_IDX(streams, sgi, bw); + break; + case RX_ENC_HT: + streams = ((status->rate_idx >> 3) & 3) + 1; + idx = status->rate_idx & 7; + group = HT_GROUP_IDX(streams, sgi, bw); + break; + case RX_ENC_HE: + streams = status->nss; + idx = status->rate_idx; + group = HE_GROUP_IDX(streams, status->he_gi, bw); + break; + default: + WARN_ON_ONCE(1); + return 0; + } + + if (WARN_ON_ONCE((status->encoding != RX_ENC_HE && streams > 4) || + (status->encoding == RX_ENC_HE && streams > 8))) + return 0; + + duration = airtime_mcs_groups[group].duration[idx]; + duration <<= airtime_mcs_groups[group].shift; + duration *= len; + duration /= AVG_PKT_SIZE; + duration /= 1024; + + duration += 36 + (streams << 2); + + return duration; +} +EXPORT_SYMBOL_GPL(ieee80211_calc_rx_airtime); + +static u32 ieee80211_calc_tx_airtime_rate(struct ieee80211_hw *hw, + struct ieee80211_tx_rate *rate, + u8 band, int len) +{ + struct ieee80211_rx_status stat = { + .band = band, + }; + + if (rate->idx < 0 || !rate->count) + return 0; + + if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH) + stat.bw = RATE_INFO_BW_80; + else if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH) + stat.bw = RATE_INFO_BW_40; + else + stat.bw = RATE_INFO_BW_20; + + stat.enc_flags = 0; + if (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE) + stat.enc_flags |= RX_ENC_FLAG_SHORTPRE; + if (rate->flags & IEEE80211_TX_RC_SHORT_GI) + stat.enc_flags |= RX_ENC_FLAG_SHORT_GI; + + stat.rate_idx = rate->idx; + if (rate->flags & IEEE80211_TX_RC_VHT_MCS) { + stat.encoding = RX_ENC_VHT; + stat.rate_idx = ieee80211_rate_get_vht_mcs(rate); + stat.nss = ieee80211_rate_get_vht_nss(rate); + } else if (rate->flags & IEEE80211_TX_RC_MCS) { + stat.encoding = RX_ENC_HT; + } else { + stat.encoding = RX_ENC_LEGACY; + } + + return ieee80211_calc_rx_airtime(hw, &stat, len); +} + +u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw, + struct ieee80211_tx_info *info, + int len) +{ + u32 duration = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(info->status.rates); i++) { + struct ieee80211_tx_rate *rate = &info->status.rates[i]; + u32 cur_duration; + + cur_duration = ieee80211_calc_tx_airtime_rate(hw, rate, + info->band, len); + if (!cur_duration) + break; + + duration += cur_duration * rate->count; + } + + return duration; +} +EXPORT_SYMBOL_GPL(ieee80211_calc_tx_airtime); + +u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *pubsta, + int len) +{ + struct ieee80211_supported_band *sband; + struct ieee80211_chanctx_conf *conf; + int rateidx, shift = 0; + bool cck, short_pream; + u32 basic_rates; + u8 band = 0; + u16 rate; + + len += 38; /* Ethernet header length */ + + conf = rcu_dereference(vif->chanctx_conf); + if (conf) { + band = conf->def.chan->band; + shift = ieee80211_chandef_get_shift(&conf->def); + } + + if (pubsta) { + struct sta_info *sta = container_of(pubsta, struct sta_info, + sta); + + return ieee80211_calc_tx_airtime_rate(hw, + &sta->tx_stats.last_rate, + band, len); + } + + if (!conf) + return 0; + + /* No station to get latest rate from, so calculate the worst-case + * duration using the lowest configured basic rate. + */ + sband = hw->wiphy->bands[band]; + + basic_rates = vif->bss_conf.basic_rates; + short_pream = vif->bss_conf.use_short_preamble; + + rateidx = basic_rates ? ffs(basic_rates) - 1 : 0; + rate = sband->bitrates[rateidx].bitrate << shift; + cck = sband->bitrates[rateidx].flags & IEEE80211_RATE_MANDATORY_B; + + return ieee80211_calc_legacy_rate_duration(rate, short_pream, cck, len); +} diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 568b3b276931..ad41d74530c6 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -59,6 +59,8 @@ static const struct file_operations name## _ops = { \ debugfs_create_file(#name, mode, phyd, local, &name## _ops); +DEBUGFS_READONLY_FILE(hw_conf, "%x", + local->hw.conf.flags); DEBUGFS_READONLY_FILE(user_power, "%d", local->user_power_level); DEBUGFS_READONLY_FILE(power, "%d", @@ -148,6 +150,87 @@ static const struct file_operations aqm_ops = { .llseek = default_llseek, }; +static ssize_t aql_txq_limit_read(struct file *file, + char __user *user_buf, + size_t count, + loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + char buf[400]; + int len = 0; + + len = scnprintf(buf, sizeof(buf), + "AC AQL limit low AQL limit high\n" + "VO %u %u\n" + "VI %u %u\n" + "BE %u %u\n" + "BK %u %u\n", + local->aql_txq_limit_low[IEEE80211_AC_VO], + local->aql_txq_limit_high[IEEE80211_AC_VO], + local->aql_txq_limit_low[IEEE80211_AC_VI], + local->aql_txq_limit_high[IEEE80211_AC_VI], + local->aql_txq_limit_low[IEEE80211_AC_BE], + local->aql_txq_limit_high[IEEE80211_AC_BE], + local->aql_txq_limit_low[IEEE80211_AC_BK], + local->aql_txq_limit_high[IEEE80211_AC_BK]); + return simple_read_from_buffer(user_buf, count, ppos, + buf, len); +} + +static ssize_t aql_txq_limit_write(struct file *file, + const char __user *user_buf, + size_t count, + loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + char buf[100]; + size_t len; + u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old; + struct sta_info *sta; + + if (count > sizeof(buf)) + return -EINVAL; + + if (copy_from_user(buf, user_buf, count)) + return -EFAULT; + + buf[sizeof(buf) - 1] = 0; + len = strlen(buf); + if (len > 0 && buf[len - 1] == '\n') + buf[len - 1] = 0; + + if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3) + return -EINVAL; + + if (ac >= IEEE80211_NUM_ACS) + return -EINVAL; + + q_limit_low_old = local->aql_txq_limit_low[ac]; + q_limit_high_old = local->aql_txq_limit_high[ac]; + + local->aql_txq_limit_low[ac] = q_limit_low; + local->aql_txq_limit_high[ac] = q_limit_high; + + mutex_lock(&local->sta_mtx); + list_for_each_entry(sta, &local->sta_list, list) { + /* If a sta has customized queue limits, keep it */ + if (sta->airtime[ac].aql_limit_low == q_limit_low_old && + sta->airtime[ac].aql_limit_high == q_limit_high_old) { + sta->airtime[ac].aql_limit_low = q_limit_low; + sta->airtime[ac].aql_limit_high = q_limit_high; + } + } + mutex_unlock(&local->sta_mtx); + return count; +} + +static const struct file_operations aql_txq_limit_ops = { + .write = aql_txq_limit_write, + .read = aql_txq_limit_read, + .open = simple_open, + .llseek = default_llseek, +}; + static ssize_t force_tx_status_read(struct file *file, char __user *user_buf, size_t count, @@ -433,6 +516,7 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_ADD(hwflags); DEBUGFS_ADD(user_power); DEBUGFS_ADD(power); + DEBUGFS_ADD(hw_conf); DEBUGFS_ADD_MODE(force_tx_status, 0600); if (local->ops->wake_tx_queue) @@ -441,6 +525,10 @@ void debugfs_hw_add(struct ieee80211_local *local) debugfs_create_u16("airtime_flags", 0600, phyd, &local->airtime_flags); + DEBUGFS_ADD(aql_txq_limit); + debugfs_create_u32("aql_threshold", 0600, + phyd, &local->aql_threshold); + statsd = debugfs_create_dir("statistics", phyd); /* if the dir failed, don't put all the other things into the root! */ diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index c8ad20c28c43..0185e6e5e5d1 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -197,10 +197,12 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf, { struct sta_info *sta = file->private_data; struct ieee80211_local *local = sta->sdata->local; - size_t bufsz = 200; + size_t bufsz = 400; char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; u64 rx_airtime = 0, tx_airtime = 0; s64 deficit[IEEE80211_NUM_ACS]; + u32 q_depth[IEEE80211_NUM_ACS]; + u32 q_limit_l[IEEE80211_NUM_ACS], q_limit_h[IEEE80211_NUM_ACS]; ssize_t rv; int ac; @@ -212,19 +214,22 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf, rx_airtime += sta->airtime[ac].rx_airtime; tx_airtime += sta->airtime[ac].tx_airtime; deficit[ac] = sta->airtime[ac].deficit; + q_limit_l[ac] = sta->airtime[ac].aql_limit_low; + q_limit_h[ac] = sta->airtime[ac].aql_limit_high; spin_unlock_bh(&local->active_txq_lock[ac]); + q_depth[ac] = atomic_read(&sta->airtime[ac].aql_tx_pending); } p += scnprintf(p, bufsz + buf - p, "RX: %llu us\nTX: %llu us\nWeight: %u\n" - "Deficit: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n", - rx_airtime, - tx_airtime, - sta->airtime_weight, - deficit[0], - deficit[1], - deficit[2], - deficit[3]); + "Deficit: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n" + "Q depth: VO: %u us VI: %u us BE: %u us BK: %u us\n" + "Q limit[low/high]: VO: %u/%u VI: %u/%u BE: %u/%u BK: %u/%u\n", + rx_airtime, tx_airtime, sta->airtime_weight, + deficit[0], deficit[1], deficit[2], deficit[3], + q_depth[0], q_depth[1], q_depth[2], q_depth[3], + q_limit_l[0], q_limit_h[0], q_limit_l[1], q_limit_h[1], + q_limit_l[2], q_limit_h[2], q_limit_l[3], q_limit_h[3]), rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); @@ -236,7 +241,25 @@ static ssize_t sta_airtime_write(struct file *file, const char __user *userbuf, { struct sta_info *sta = file->private_data; struct ieee80211_local *local = sta->sdata->local; - int ac; + u32 ac, q_limit_l, q_limit_h; + char _buf[100] = {}, *buf = _buf; + + if (count > sizeof(_buf)) + return -EINVAL; + + if (copy_from_user(buf, userbuf, count)) + return -EFAULT; + + buf[sizeof(_buf) - 1] = '\0'; + if (sscanf(buf, "queue limit %u %u %u", &ac, &q_limit_l, &q_limit_h) + != 3) + return -EINVAL; + + if (ac >= IEEE80211_NUM_ACS) + return -EINVAL; + + sta->airtime[ac].aql_limit_low = q_limit_l; + sta->airtime[ac].aql_limit_high = q_limit_h; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { spin_lock_bh(&local->active_txq_lock[ac]); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 05406e9c05b3..ad15b3be8bb3 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1142,6 +1142,10 @@ struct ieee80211_local { u16 schedule_round[IEEE80211_NUM_ACS]; u16 airtime_flags; + u32 aql_txq_limit_low[IEEE80211_NUM_ACS]; + u32 aql_txq_limit_high[IEEE80211_NUM_ACS]; + u32 aql_threshold; + atomic_t aql_total_pending_airtime; const struct ieee80211_ops *ops; @@ -2249,6 +2253,10 @@ const char *ieee80211_get_reason_code_string(u16 reason_code); extern const struct ethtool_ops ieee80211_ethtool_ops; +u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *pubsta, + int len); #ifdef CONFIG_MAC80211_NOINLINE #define debug_noinline noinline #else diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 2d05c4cfaf6d..6cca0853f183 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -667,8 +667,16 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, for (i = 0; i < IEEE80211_NUM_ACS; i++) { INIT_LIST_HEAD(&local->active_txqs[i]); spin_lock_init(&local->active_txq_lock[i]); + local->aql_txq_limit_low[i] = IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L; + local->aql_txq_limit_high[i] = + IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H; } - local->airtime_flags = AIRTIME_USE_TX | AIRTIME_USE_RX; + + local->airtime_flags = AIRTIME_USE_TX | + AIRTIME_USE_RX | + AIRTIME_USE_AQL; + local->aql_threshold = IEEE80211_AQL_THRESHOLD; + atomic_set(&local->aql_total_pending_airtime, 0); INIT_LIST_HEAD(&local->chanctx_list); mutex_init(&local->chanctx_mtx); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 8d3a2389b055..8eafd81e97b4 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -210,6 +210,20 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, return NULL; } +struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local, + const u8 *sta_addr, const u8 *vif_addr) +{ + struct rhlist_head *tmp; + struct sta_info *sta; + + for_each_sta_info(local, sta_addr, sta, tmp) { + if (ether_addr_equal(vif_addr, sta->sdata->vif.addr)) + return sta; + } + + return NULL; +} + struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, int idx) { @@ -396,6 +410,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, skb_queue_head_init(&sta->ps_tx_buf[i]); skb_queue_head_init(&sta->tx_filtered[i]); sta->airtime[i].deficit = sta->airtime_weight; + atomic_set(&sta->airtime[i].aql_tx_pending, 0); + sta->airtime[i].aql_limit_low = local->aql_txq_limit_low[i]; + sta->airtime[i].aql_limit_high = local->aql_txq_limit_high[i]; } for (i = 0; i < IEEE80211_NUM_TIDS; i++) @@ -1893,6 +1910,41 @@ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, } EXPORT_SYMBOL(ieee80211_sta_register_airtime); +void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, + struct sta_info *sta, u8 ac, + u16 tx_airtime, bool tx_completed) +{ + int tx_pending; + + if (!tx_completed) { + if (sta) + atomic_add(tx_airtime, + &sta->airtime[ac].aql_tx_pending); + + atomic_add(tx_airtime, &local->aql_total_pending_airtime); + return; + } + + if (sta) { + tx_pending = atomic_sub_return(tx_airtime, + &sta->airtime[ac].aql_tx_pending); + if (WARN_ONCE(tx_pending < 0, + "STA %pM AC %d txq pending airtime underflow: %u, %u", + sta->addr, ac, tx_pending, tx_airtime)) + atomic_cmpxchg(&sta->airtime[ac].aql_tx_pending, + tx_pending, 0); + } + + tx_pending = atomic_sub_return(tx_airtime, + &local->aql_total_pending_airtime); + if (WARN_ONCE(tx_pending < 0, + "Device %s AC %d pending airtime underflow: %u, %u", + wiphy_name(local->hw.wiphy), ac, tx_pending, + tx_airtime)) + atomic_cmpxchg(&local->aql_total_pending_airtime, + tx_pending, 0); +} + int sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state) { diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 369c2dddce52..ad5d8a4ae56d 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -127,13 +127,21 @@ enum ieee80211_agg_stop_reason { /* Debugfs flags to enable/disable use of RX/TX airtime in scheduler */ #define AIRTIME_USE_TX BIT(0) #define AIRTIME_USE_RX BIT(1) +#define AIRTIME_USE_AQL BIT(2) struct airtime_info { u64 rx_airtime; u64 tx_airtime; s64 deficit; + atomic_t aql_tx_pending; /* Estimated airtime for frames pending */ + u32 aql_limit_low; + u32 aql_limit_high; }; +void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, + struct sta_info *sta, u8 ac, + u16 tx_airtime, bool tx_completed); + struct sta_info; /** @@ -725,6 +733,10 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr); +/* user must hold sta_mtx or be in RCU critical section */ +struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local, + const u8 *sta_addr, const u8 *vif_addr); + #define for_each_sta_info(local, _addr, _sta, _tmp) \ rhl_for_each_entry_rcu(_sta, _tmp, \ sta_info_hash_lookup(local, _addr), hash_node) diff --git a/net/mac80211/status.c b/net/mac80211/status.c index ab8ba5835ca0..b720feaf9a74 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -670,12 +670,26 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, struct sk_buff *skb, bool dropped) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + u16 tx_time_est = ieee80211_info_get_tx_time_est(info); struct ieee80211_hdr *hdr = (void *)skb->data; bool acked = info->flags & IEEE80211_TX_STAT_ACK; if (dropped) acked = false; + if (tx_time_est) { + struct sta_info *sta; + + rcu_read_lock(); + + sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); + ieee80211_sta_update_pending_airtime(local, sta, + skb_get_queue_mapping(skb), + tx_time_est, + true); + rcu_read_unlock(); + } + if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) { struct ieee80211_sub_if_data *sdata; @@ -877,6 +891,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, struct ieee80211_bar *bar; int shift = 0; int tid = IEEE80211_NUM_TIDS; + u16 tx_time_est; rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); @@ -986,6 +1001,17 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, ieee80211_sta_register_airtime(&sta->sta, tid, info->status.tx_time, 0); + if ((tx_time_est = ieee80211_info_get_tx_time_est(info)) > 0) { + /* Do this here to avoid the expensive lookup of the sta + * in ieee80211_report_used_skb(). + */ + ieee80211_sta_update_pending_airtime(local, sta, + skb_get_queue_mapping(skb), + tx_time_est, + true); + ieee80211_info_set_tx_time_est(info, 0); + } + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { if (info->flags & IEEE80211_TX_STAT_ACK) { if (sta->status_stats.lost_packets) @@ -1030,7 +1056,8 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, I802_DEBUG_INC(local->dot11FailedCount); } - if (ieee80211_is_nullfunc(fc) && ieee80211_has_pm(fc) && + if ((ieee80211_is_nullfunc(fc) || ieee80211_is_qos_nullfunc(fc)) && + ieee80211_has_pm(fc) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && local->ps_sdata && !(local->scanning)) { @@ -1073,19 +1100,13 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) .skb = skb, .info = IEEE80211_SKB_CB(skb), }; - struct rhlist_head *tmp; struct sta_info *sta; rcu_read_lock(); - for_each_sta_info(local, hdr->addr1, sta, tmp) { - /* skip wrong virtual interface */ - if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr)) - continue; - + sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); + if (sta) status.sta = &sta->sta; - break; - } __ieee80211_tx_status(hw, &status); rcu_read_unlock(); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index db38be1b75fa..b696b9136f4c 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2270,6 +2270,9 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, * isn't always enough to find the interface to use; for proper * VLAN/WDS support we will need a different mechanism (which * likely isn't going to be monitor interfaces). + * + * This is necessary, for example, for old hostapd versions that + * don't use nl80211-based management TX/RX. */ sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -3551,6 +3554,9 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, WARN_ON_ONCE(softirq_count() == 0); + if (!ieee80211_txq_airtime_check(hw, txq)) + return NULL; + begin: spin_lock_bh(&fq->lock); @@ -3661,6 +3667,21 @@ begin: } IEEE80211_SKB_CB(skb)->control.vif = vif; + + if (local->airtime_flags & AIRTIME_USE_AQL) { + u32 airtime; + + airtime = ieee80211_calc_expected_tx_airtime(hw, vif, txq->sta, + skb->len); + if (airtime) { + airtime = ieee80211_info_set_tx_time_est(info, airtime); + ieee80211_sta_update_pending_airtime(local, tx.sta, + txq->ac, + airtime, + false); + } + } + return skb; out: @@ -3674,7 +3695,8 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_txq *ret = NULL; - struct txq_info *txqi = NULL; + struct txq_info *txqi = NULL, *head = NULL; + bool found_eligible_txq = false; spin_lock_bh(&local->active_txq_lock[ac]); @@ -3685,13 +3707,30 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac) if (!txqi) goto out; + if (txqi == head) { + if (!found_eligible_txq) + goto out; + else + found_eligible_txq = false; + } + + if (!head) + head = txqi; + if (txqi->txq.sta) { struct sta_info *sta = container_of(txqi->txq.sta, - struct sta_info, sta); + struct sta_info, sta); + bool aql_check = ieee80211_txq_airtime_check(hw, &txqi->txq); + s64 deficit = sta->airtime[txqi->txq.ac].deficit; + + if (aql_check) + found_eligible_txq = true; - if (sta->airtime[txqi->txq.ac].deficit < 0) { + if (deficit < 0) sta->airtime[txqi->txq.ac].deficit += sta->airtime_weight; + + if (deficit < 0 || !aql_check) { list_move_tail(&txqi->schedule_order, &local->active_txqs[txqi->txq.ac]); goto begin; @@ -3745,6 +3784,33 @@ void __ieee80211_schedule_txq(struct ieee80211_hw *hw, } EXPORT_SYMBOL(__ieee80211_schedule_txq); +bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, + struct ieee80211_txq *txq) +{ + struct sta_info *sta; + struct ieee80211_local *local = hw_to_local(hw); + + if (!(local->airtime_flags & AIRTIME_USE_AQL)) + return true; + + if (!txq->sta) + return true; + + sta = container_of(txq->sta, struct sta_info, sta); + if (atomic_read(&sta->airtime[txq->ac].aql_tx_pending) < + sta->airtime[txq->ac].aql_limit_low) + return true; + + if (atomic_read(&local->aql_total_pending_airtime) < + local->aql_threshold && + atomic_read(&sta->airtime[txq->ac].aql_tx_pending) < + sta->airtime[txq->ac].aql_limit_high) + return true; + + return false; +} +EXPORT_SYMBOL(ieee80211_txq_airtime_check); + bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 1a04e0929738..be5e95a0d876 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -25,7 +25,8 @@ /* 3 Counters support added */ /* 4 Comments support added */ /* 5 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */ +/* 6 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 7 /* interface wildcard support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -57,6 +58,7 @@ struct hash_netiface4_elem { u8 cidr; u8 nomatch; u8 elem; + u8 wildcard; char iface[IFNAMSIZ]; }; @@ -71,7 +73,9 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && - strcmp(ip1->iface, ip2->iface) == 0; + (ip1->wildcard ? + strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 : + strcmp(ip1->iface, ip2->iface) == 0); } static int @@ -103,7 +107,8 @@ static bool hash_netiface4_data_list(struct sk_buff *skb, const struct hash_netiface4_elem *data) { - u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; + u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) | + (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0); if (data->nomatch) flags |= IPSET_FLAG_NOMATCH; @@ -229,6 +234,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); + if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD) + e.wildcard = 1; } if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { e.ip = htonl(ip & ip_set_hostmask(e.cidr)); @@ -280,6 +287,7 @@ struct hash_netiface6_elem { u8 cidr; u8 nomatch; u8 elem; + u8 wildcard; char iface[IFNAMSIZ]; }; @@ -294,7 +302,9 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && - strcmp(ip1->iface, ip2->iface) == 0; + (ip1->wildcard ? + strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 : + strcmp(ip1->iface, ip2->iface) == 0); } static int @@ -326,7 +336,8 @@ static bool hash_netiface6_data_list(struct sk_buff *skb, const struct hash_netiface6_elem *data) { - u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; + u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) | + (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0); if (data->nomatch) flags |= IPSET_FLAG_NOMATCH; @@ -440,6 +451,8 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); + if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD) + e.wildcard = 1; } ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 8468d2d02284..9889d52eda82 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -18,11 +18,11 @@ static DEFINE_MUTEX(flowtable_lock); static LIST_HEAD(flowtables); static void -flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, +flow_offload_fill_dir(struct flow_offload *flow, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; - struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple; + struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple; ft->dir = dir; @@ -57,8 +57,8 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct) flow->ct = ct; - flow_offload_fill_dir(flow, ct, FLOW_OFFLOAD_DIR_ORIGINAL); - flow_offload_fill_dir(flow, ct, FLOW_OFFLOAD_DIR_REPLY); + flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY); if (ct->status & IPS_SRC_NAT) flow->flags |= FLOW_OFFLOAD_SNAT; diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index bfb910b874ce..88bedf1ff1ae 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -21,11 +21,34 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, return NF_ACCEPT; } +static int nf_flow_rule_route_inet(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + const struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; + int err; + + switch (flow_tuple->l3proto) { + case NFPROTO_IPV4: + err = nf_flow_rule_route_ipv4(net, flow, dir, flow_rule); + break; + case NFPROTO_IPV6: + err = nf_flow_rule_route_ipv6(net, flow, dir, flow_rule); + break; + default: + err = -1; + break; + } + + return err; +} + static struct nf_flowtable_type flowtable_inet = { .family = NFPROTO_INET, .init = nf_flow_table_init, .setup = nf_flow_table_offload_setup, - .action = nf_flow_rule_route, + .action = nf_flow_rule_route_inet, .free = nf_flow_table_free, .hook = nf_flow_offload_inet_hook, .owner = THIS_MODULE, diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 9be61f47303a..c54c9a6cc981 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -112,13 +112,22 @@ static void flow_offload_mangle(struct flow_action_entry *entry, memcpy(&entry->mangle.val, value, sizeof(u32)); } +static inline struct flow_action_entry * +flow_action_entry_next(struct nf_flow_rule *flow_rule) +{ + int i = flow_rule->rule->action.num_entries++; + + return &flow_rule->rule->action.entries[i]; +} + static int flow_offload_eth_src(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, - struct flow_action_entry *entry0, - struct flow_action_entry *entry1) + struct nf_flow_rule *flow_rule) { const struct flow_offload_tuple *tuple = &flow->tuplehash[!dir].tuple; + struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); + struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); struct net_device *dev; u32 mask, val; u16 val16; @@ -145,10 +154,11 @@ static int flow_offload_eth_src(struct net *net, static int flow_offload_eth_dst(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, - struct flow_action_entry *entry0, - struct flow_action_entry *entry1) + struct nf_flow_rule *flow_rule) { const struct flow_offload_tuple *tuple = &flow->tuplehash[dir].tuple; + struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); + struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); struct neighbour *n; u32 mask, val; u16 val16; @@ -175,8 +185,9 @@ static int flow_offload_eth_dst(struct net *net, static void flow_offload_ipv4_snat(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, - struct flow_action_entry *entry) + struct nf_flow_rule *flow_rule) { + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); u32 mask = ~htonl(0xffffffff); __be32 addr; u32 offset; @@ -201,8 +212,9 @@ static void flow_offload_ipv4_snat(struct net *net, static void flow_offload_ipv4_dnat(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, - struct flow_action_entry *entry) + struct nf_flow_rule *flow_rule) { + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); u32 mask = ~htonl(0xffffffff); __be32 addr; u32 offset; @@ -224,6 +236,71 @@ static void flow_offload_ipv4_dnat(struct net *net, (u8 *)&addr, (u8 *)&mask); } +static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, + unsigned int offset, + u8 *addr, u8 *mask) +{ + struct flow_action_entry *entry; + int i; + + for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i += sizeof(u32)) { + entry = flow_action_entry_next(flow_rule); + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6, + offset + i, + &addr[i], mask); + } +} + +static void flow_offload_ipv6_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + u32 mask = ~htonl(0xffffffff); + const u8 *addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6.s6_addr; + offset = offsetof(struct ipv6hdr, saddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6.s6_addr; + offset = offsetof(struct ipv6hdr, daddr); + break; + default: + return; + } + + flow_offload_ipv6_mangle(flow_rule, offset, (u8 *)addr, (u8 *)&mask); +} + +static void flow_offload_ipv6_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + u32 mask = ~htonl(0xffffffff); + const u8 *addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6.s6_addr; + offset = offsetof(struct ipv6hdr, daddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6.s6_addr; + offset = offsetof(struct ipv6hdr, saddr); + break; + default: + return; + } + + flow_offload_ipv6_mangle(flow_rule, offset, (u8 *)addr, (u8 *)&mask); +} + static int flow_offload_l4proto(const struct flow_offload *flow) { u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto; @@ -246,8 +323,9 @@ static int flow_offload_l4proto(const struct flow_offload *flow) static void flow_offload_port_snat(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, - struct flow_action_entry *entry) + struct nf_flow_rule *flow_rule) { + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); u32 mask = ~htonl(0xffff0000); __be16 port; u32 offset; @@ -272,8 +350,9 @@ static void flow_offload_port_snat(struct net *net, static void flow_offload_port_dnat(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, - struct flow_action_entry *entry) + struct nf_flow_rule *flow_rule) { + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); u32 mask = ~htonl(0xffff); __be16 port; u32 offset; @@ -297,9 +376,10 @@ static void flow_offload_port_dnat(struct net *net, static void flow_offload_ipv4_checksum(struct net *net, const struct flow_offload *flow, - struct flow_action_entry *entry) + struct nf_flow_rule *flow_rule) { u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto; + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); entry->id = FLOW_ACTION_CSUM; entry->csum_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR; @@ -316,8 +396,9 @@ static void flow_offload_ipv4_checksum(struct net *net, static void flow_offload_redirect(const struct flow_offload *flow, enum flow_offload_tuple_dir dir, - struct flow_action_entry *entry) + struct nf_flow_rule *flow_rule) { + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); struct rtable *rt; rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; @@ -326,45 +407,56 @@ static void flow_offload_redirect(const struct flow_offload *flow, dev_hold(rt->dst.dev); } -int nf_flow_rule_route(struct net *net, const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { - int i; - - if (flow_offload_eth_src(net, flow, dir, - &flow_rule->rule->action.entries[0], - &flow_rule->rule->action.entries[1]) < 0) + if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || + flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) return -1; - if (flow_offload_eth_dst(net, flow, dir, - &flow_rule->rule->action.entries[2], - &flow_rule->rule->action.entries[3]) < 0) - return -1; - - i = 4; if (flow->flags & FLOW_OFFLOAD_SNAT) { - flow_offload_ipv4_snat(net, flow, dir, - &flow_rule->rule->action.entries[i++]); - flow_offload_port_snat(net, flow, dir, - &flow_rule->rule->action.entries[i++]); + flow_offload_ipv4_snat(net, flow, dir, flow_rule); + flow_offload_port_snat(net, flow, dir, flow_rule); } if (flow->flags & FLOW_OFFLOAD_DNAT) { - flow_offload_ipv4_dnat(net, flow, dir, - &flow_rule->rule->action.entries[i++]); - flow_offload_port_dnat(net, flow, dir, - &flow_rule->rule->action.entries[i++]); + flow_offload_ipv4_dnat(net, flow, dir, flow_rule); + flow_offload_port_dnat(net, flow, dir, flow_rule); } if (flow->flags & FLOW_OFFLOAD_SNAT || flow->flags & FLOW_OFFLOAD_DNAT) - flow_offload_ipv4_checksum(net, flow, - &flow_rule->rule->action.entries[i++]); + flow_offload_ipv4_checksum(net, flow, flow_rule); - flow_offload_redirect(flow, dir, &flow_rule->rule->action.entries[i++]); + flow_offload_redirect(flow, dir, flow_rule); - return i; + return 0; } -EXPORT_SYMBOL_GPL(nf_flow_rule_route); +EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4); + +int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || + flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) + return -1; + + if (flow->flags & FLOW_OFFLOAD_SNAT) { + flow_offload_ipv6_snat(net, flow, dir, flow_rule); + flow_offload_port_snat(net, flow, dir, flow_rule); + } + if (flow->flags & FLOW_OFFLOAD_DNAT) { + flow_offload_ipv6_dnat(net, flow, dir, flow_rule); + flow_offload_port_dnat(net, flow, dir, flow_rule); + } + + flow_offload_redirect(flow, dir, flow_rule); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv6); + +#define NF_FLOW_RULE_ACTION_MAX 16 static struct nf_flow_rule * nf_flow_offload_rule_alloc(struct net *net, @@ -375,13 +467,13 @@ nf_flow_offload_rule_alloc(struct net *net, const struct flow_offload *flow = offload->flow; const struct flow_offload_tuple *tuple; struct nf_flow_rule *flow_rule; - int err = -ENOMEM, num_actions; + int err = -ENOMEM; flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL); if (!flow_rule) goto err_flow; - flow_rule->rule = flow_rule_alloc(10); + flow_rule->rule = flow_rule_alloc(NF_FLOW_RULE_ACTION_MAX); if (!flow_rule->rule) goto err_flow_rule; @@ -394,12 +486,10 @@ nf_flow_offload_rule_alloc(struct net *net, if (err < 0) goto err_flow_match; - num_actions = flowtable->type->action(net, flow, dir, flow_rule); - if (num_actions < 0) + flow_rule->rule->action.num_entries = 0; + if (flowtable->type->action(net, flow, dir, flow_rule) < 0) goto err_flow_match; - flow_rule->rule->action.num_entries = num_actions; - return flow_rule; err_flow_match: @@ -722,6 +812,9 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD)) return 0; + if (!dev->netdev_ops->ndo_setup_tc) + return -EOPNOTSUPP; + bo.net = dev_net(dev); bo.block = &flowtable->flow_block; bo.command = cmd; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2dc636faa322..ff04cdc87f76 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -361,6 +361,7 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) { + struct nft_flow_rule *flow; struct nft_trans *trans; int err; @@ -368,6 +369,16 @@ static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) if (trans == NULL) return -ENOMEM; + if (ctx->chain->flags & NFT_CHAIN_HW_OFFLOAD) { + flow = nft_flow_rule_create(ctx->net, rule); + if (IS_ERR(flow)) { + nft_trans_destroy(trans); + return PTR_ERR(flow); + } + + nft_trans_flow_rule(trans) = flow; + } + err = nf_tables_delrule_deactivate(ctx, rule); if (err < 0) { nft_trans_destroy(trans); @@ -5964,16 +5975,22 @@ nft_flowtable_type_get(struct net *net, u8 family) return ERR_PTR(-ENOENT); } +static void nft_unregister_flowtable_hook(struct net *net, + struct nft_flowtable *flowtable, + struct nft_hook *hook) +{ + nf_unregister_net_hook(net, &hook->ops); + flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + FLOW_BLOCK_UNBIND); +} + static void nft_unregister_flowtable_net_hooks(struct net *net, struct nft_flowtable *flowtable) { struct nft_hook *hook; - list_for_each_entry(hook, &flowtable->hook_list, list) { - nf_unregister_net_hook(net, &hook->ops); - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, - FLOW_BLOCK_UNBIND); - } + list_for_each_entry(hook, &flowtable->hook_list, list) + nft_unregister_flowtable_hook(net, flowtable, hook); } static int nft_register_flowtable_net_hooks(struct net *net, @@ -5995,12 +6012,20 @@ static int nft_register_flowtable_net_hooks(struct net *net, } } - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, - FLOW_BLOCK_BIND); - err = nf_register_net_hook(net, &hook->ops); + err = flowtable->data.type->setup(&flowtable->data, + hook->ops.dev, + FLOW_BLOCK_BIND); if (err < 0) goto err_unregister_net_hooks; + err = nf_register_net_hook(net, &hook->ops); + if (err < 0) { + flowtable->data.type->setup(&flowtable->data, + hook->ops.dev, + FLOW_BLOCK_UNBIND); + goto err_unregister_net_hooks; + } + i++; } @@ -6011,9 +6036,7 @@ err_unregister_net_hooks: if (i-- <= 0) break; - nf_unregister_net_hook(net, &hook->ops); - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, - FLOW_BLOCK_UNBIND); + nft_unregister_flowtable_hook(net, flowtable, hook); list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } @@ -6120,7 +6143,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, return 0; err5: list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - nf_unregister_net_hook(net, &hook->ops); + nft_unregister_flowtable_hook(net, flowtable, hook); list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } @@ -6465,7 +6488,7 @@ static void nft_flowtable_event(unsigned long event, struct net_device *dev, if (hook->ops.dev != dev) continue; - nf_unregister_net_hook(dev_net(dev), &hook->ops); + nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook); list_del_rcu(&hook->list); kfree_rcu(hook, rcu); break; diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index cdea3010c7a0..68f17a6921d8 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -159,9 +159,9 @@ static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, const struct nft_base_chain *basechain, const struct nft_rule *rule, const struct nft_flow_rule *flow, + struct netlink_ext_ack *extack, enum flow_cls_command command) { - struct netlink_ext_ack extack; __be16 proto = ETH_P_ALL; memset(cls_flow, 0, sizeof(*cls_flow)); @@ -170,7 +170,7 @@ static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, proto = flow->proto; nft_flow_offload_common_init(&cls_flow->common, proto, - basechain->ops.priority, &extack); + basechain->ops.priority, extack); cls_flow->command = command; cls_flow->cookie = (unsigned long) rule; if (flow) @@ -182,6 +182,7 @@ static int nft_flow_offload_rule(struct nft_chain *chain, struct nft_flow_rule *flow, enum flow_cls_command command) { + struct netlink_ext_ack extack = {}; struct flow_cls_offload cls_flow; struct nft_base_chain *basechain; @@ -189,7 +190,8 @@ static int nft_flow_offload_rule(struct nft_chain *chain, return -EOPNOTSUPP; basechain = nft_base_chain(chain); - nft_flow_cls_offload_setup(&cls_flow, basechain, rule, flow, command); + nft_flow_cls_offload_setup(&cls_flow, basechain, rule, flow, &extack, + command); return nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &basechain->flow_block.cb_list); @@ -207,13 +209,15 @@ static int nft_flow_offload_unbind(struct flow_block_offload *bo, { struct flow_block_cb *block_cb, *next; struct flow_cls_offload cls_flow; + struct netlink_ext_ack extack; struct nft_chain *chain; struct nft_rule *rule; chain = &basechain->chain; list_for_each_entry(rule, &chain->rules, list) { + memset(&extack, 0, sizeof(extack)); nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL, - FLOW_CLS_DESTROY); + &extack, FLOW_CLS_DESTROY); nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list); } @@ -385,6 +389,55 @@ static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, return nft_flow_block_chain(basechain, NULL, cmd); } +static void nft_flow_rule_offload_abort(struct net *net, + struct nft_trans *trans) +{ + int err = 0; + + list_for_each_entry_continue_reverse(trans, &net->nft.commit_list, list) { + if (trans->ctx.family != NFPROTO_NETDEV) + continue; + + switch (trans->msg_type) { + case NFT_MSG_NEWCHAIN: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || + nft_trans_chain_update(trans)) + continue; + + err = nft_flow_offload_chain(trans->ctx.chain, NULL, + FLOW_BLOCK_UNBIND); + break; + case NFT_MSG_DELCHAIN: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_chain(trans->ctx.chain, NULL, + FLOW_BLOCK_BIND); + break; + case NFT_MSG_NEWRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_rule(trans->ctx.chain, + nft_trans_rule(trans), + NULL, FLOW_CLS_DESTROY); + break; + case NFT_MSG_DELRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_rule(trans->ctx.chain, + nft_trans_rule(trans), + nft_trans_flow_rule(trans), + FLOW_CLS_REPLACE); + break; + } + + if (WARN_ON_ONCE(err)) + break; + } +} + int nft_flow_rule_offload_commit(struct net *net) { struct nft_trans *trans; @@ -418,14 +471,14 @@ int nft_flow_rule_offload_commit(struct net *net) continue; if (trans->ctx.flags & NLM_F_REPLACE || - !(trans->ctx.flags & NLM_F_APPEND)) - return -EOPNOTSUPP; - + !(trans->ctx.flags & NLM_F_APPEND)) { + err = -EOPNOTSUPP; + break; + } err = nft_flow_offload_rule(trans->ctx.chain, nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); - nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_DELRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) @@ -433,13 +486,31 @@ int nft_flow_rule_offload_commit(struct net *net) err = nft_flow_offload_rule(trans->ctx.chain, nft_trans_rule(trans), - nft_trans_flow_rule(trans), - FLOW_CLS_DESTROY); + NULL, FLOW_CLS_DESTROY); break; } - if (err) - return err; + if (err) { + nft_flow_rule_offload_abort(net, trans); + break; + } + } + + list_for_each_entry(trans, &net->nft.commit_list, list) { + if (trans->ctx.family != NFPROTO_NETDEV) + continue; + + switch (trans->msg_type) { + case NFT_MSG_NEWRULE: + case NFT_MSG_DELRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); + break; + default: + break; + } } return err; diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 0744b2bb46da..b8092069f868 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> +#include <linux/if_arp.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables_offload.h> @@ -125,6 +126,11 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx, flow->match.dissector.used_keys |= BIT(reg->key); flow->match.dissector.offset[reg->key] = reg->base_offset; + if (reg->key == FLOW_DISSECTOR_KEY_META && + reg->offset == offsetof(struct nft_flow_key, meta.ingress_iftype) && + nft_reg_load16(priv->data.data) != ARPHRD_ETHER) + return -EOPNOTSUPP; + nft_offload_update_dependency(ctx, &priv->data, priv->len); return 0; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 317e3a9e8c5b..9740b554fdb3 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -33,19 +33,19 @@ static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); -static u8 nft_meta_weekday(unsigned long secs) +static u8 nft_meta_weekday(time64_t secs) { unsigned int dse; u8 wday; secs -= NFT_META_SECS_PER_MINUTE * sys_tz.tz_minuteswest; - dse = secs / NFT_META_SECS_PER_DAY; + dse = div_u64(secs, NFT_META_SECS_PER_DAY); wday = (4 + dse) % NFT_META_DAYS_PER_WEEK; return wday; } -static u32 nft_meta_hour(unsigned long secs) +static u32 nft_meta_hour(time64_t secs) { struct tm tm; @@ -250,10 +250,10 @@ void nft_meta_get_eval(const struct nft_expr *expr, nft_reg_store64(dest, ktime_get_real_ns()); break; case NFT_META_TIME_DAY: - nft_reg_store8(dest, nft_meta_weekday(get_seconds())); + nft_reg_store8(dest, nft_meta_weekday(ktime_get_real_seconds())); break; case NFT_META_TIME_HOUR: - *dest = nft_meta_hour(get_seconds()); + *dest = nft_meta_hour(ktime_get_real_seconds()); break; default: WARN_ON(1); @@ -547,6 +547,14 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx, sizeof(__u8), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); break; + case NFT_META_IIF: + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta, + ingress_ifindex, sizeof(__u32), reg); + break; + case NFT_META_IIFTYPE: + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta, + ingress_iftype, sizeof(__u16), reg); + break; default: return -EOPNOTSUPP; } diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 5cb2d8908d2a..1993af3a2979 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -23,50 +23,58 @@ #include <linux/ip.h> #include <linux/ipv6.h> +static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, + struct vlan_ethhdr *veth) +{ + if (skb_copy_bits(skb, mac_off, veth, ETH_HLEN)) + return false; + + veth->h_vlan_proto = skb->vlan_proto; + veth->h_vlan_TCI = htons(skb_vlan_tag_get(skb)); + veth->h_vlan_encapsulated_proto = skb->protocol; + + return true; +} + /* add vlan header into the user buffer for if tag was removed by offloads */ static bool nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) { int mac_off = skb_mac_header(skb) - skb->data; - u8 vlan_len, *vlanh, *dst_u8 = (u8 *) d; + u8 *vlanh, *dst_u8 = (u8 *) d; struct vlan_ethhdr veth; + u8 vlan_hlen = 0; + + if ((skb->protocol == htons(ETH_P_8021AD) || + skb->protocol == htons(ETH_P_8021Q)) && + offset >= VLAN_ETH_HLEN && offset < VLAN_ETH_HLEN + VLAN_HLEN) + vlan_hlen += VLAN_HLEN; vlanh = (u8 *) &veth; - if (offset < ETH_HLEN) { - u8 ethlen = min_t(u8, len, ETH_HLEN - offset); + if (offset < VLAN_ETH_HLEN + vlan_hlen) { + u8 ethlen = len; - if (skb_copy_bits(skb, mac_off, &veth, ETH_HLEN)) + if (vlan_hlen && + skb_copy_bits(skb, mac_off, &veth, VLAN_ETH_HLEN) < 0) + return false; + else if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) return false; - veth.h_vlan_proto = skb->vlan_proto; + if (offset + len > VLAN_ETH_HLEN + vlan_hlen) + ethlen -= offset + len - VLAN_ETH_HLEN + vlan_hlen; - memcpy(dst_u8, vlanh + offset, ethlen); + memcpy(dst_u8, vlanh + offset - vlan_hlen, ethlen); len -= ethlen; if (len == 0) return true; dst_u8 += ethlen; - offset = ETH_HLEN; - } else if (offset >= VLAN_ETH_HLEN) { - offset -= VLAN_HLEN; - goto skip; + offset = ETH_HLEN + vlan_hlen; + } else { + offset -= VLAN_HLEN + vlan_hlen; } - veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); - veth.h_vlan_encapsulated_proto = skb->protocol; - - vlanh += offset; - - vlan_len = min_t(u8, len, VLAN_ETH_HLEN - offset); - memcpy(dst_u8, vlanh, vlan_len); - - len -= vlan_len; - if (!len) - return true; - - dst_u8 += vlan_len; - skip: return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0; } @@ -174,6 +182,44 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, dst, ETH_ALEN, reg); break; + case offsetof(struct ethhdr, h_proto): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, + n_proto, sizeof(__be16), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); + break; + case offsetof(struct vlan_ethhdr, h_vlan_TCI): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, + vlan_tci, sizeof(__be16), reg); + break; + case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, + vlan_tpid, sizeof(__be16), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); + break; + case offsetof(struct vlan_ethhdr, h_vlan_TCI) + sizeof(struct vlan_hdr): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, + vlan_tci, sizeof(__be16), reg); + break; + case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) + + sizeof(struct vlan_hdr): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, + vlan_tpid, sizeof(__be16), reg); + break; default: return -EOPNOTSUPP; } diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 8dbb4d48f2ed..67cb98489415 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -77,12 +77,12 @@ static inline bool is_leap(unsigned int y) * This is done in three separate functions so that the most expensive * calculations are done last, in case a "simple match" can be found earlier. */ -static inline unsigned int localtime_1(struct xtm *r, time_t time) +static inline unsigned int localtime_1(struct xtm *r, time64_t time) { unsigned int v, w; /* Each day has 86400s, so finding the hour/minute is actually easy. */ - v = time % SECONDS_PER_DAY; + div_u64_rem(time, SECONDS_PER_DAY, &v); r->second = v % 60; w = v / 60; r->minute = w % 60; @@ -90,13 +90,13 @@ static inline unsigned int localtime_1(struct xtm *r, time_t time) return v; } -static inline void localtime_2(struct xtm *r, time_t time) +static inline void localtime_2(struct xtm *r, time64_t time) { /* * Here comes the rest (weekday, monthday). First, divide the SSTE * by seconds-per-day to get the number of _days_ since the epoch. */ - r->dse = time / 86400; + r->dse = div_u64(time, SECONDS_PER_DAY); /* * 1970-01-01 (w=0) was a Thursday (4). @@ -105,7 +105,7 @@ static inline void localtime_2(struct xtm *r, time_t time) r->weekday = (4 + r->dse - 1) % 7 + 1; } -static void localtime_3(struct xtm *r, time_t time) +static void localtime_3(struct xtm *r, time64_t time) { unsigned int year, i, w = r->dse; @@ -160,7 +160,7 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct xt_time_info *info = par->matchinfo; unsigned int packet_time; struct xtm current_time; - s64 stamp; + time64_t stamp; /* * We need real time here, but we can neither use skb->tstamp @@ -173,14 +173,14 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) * 1. match before 13:00 * 2. match after 13:00 * - * If you match against processing time (get_seconds) it + * If you match against processing time (ktime_get_real_seconds) it * may happen that the same packet matches both rules if * it arrived at the right moment before 13:00, so it would be * better to check skb->tstamp and set it via __net_timestamp() * if needed. This however breaks outgoing packets tx timestamp, * and causes them to get delayed forever by fq packet scheduler. */ - stamp = get_seconds(); + stamp = ktime_get_real_seconds(); if (info->flags & XT_TIME_LOCAL_TZ) /* Adjust for local timezone */ @@ -193,6 +193,9 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) * - 'now' is in the weekday mask * - 'now' is in the daytime range time_start..time_end * (and by default, libxt_time will set these so as to match) + * + * note: info->date_start/stop are unsigned 32-bit values that + * can hold values beyond y2038, but not after y2106. */ if (stamp < info->date_start || stamp > info->date_stop) diff --git a/net/nfc/hci/Kconfig b/net/nfc/hci/Kconfig index 97bd3a2c5c98..4822d6f46947 100644 --- a/net/nfc/hci/Kconfig +++ b/net/nfc/hci/Kconfig @@ -1,12 +1,12 @@ # SPDX-License-Identifier: GPL-2.0-only config NFC_HCI - depends on NFC - tristate "NFC HCI implementation" - default n - help - Say Y here if you want to build support for a kernel NFC HCI - implementation. This is mostly needed for devices that only process - HCI frames, like for example the NXP pn544. + depends on NFC + tristate "NFC HCI implementation" + default n + help + Say Y here if you want to build support for a kernel NFC HCI + implementation. This is mostly needed for devices that only process + HCI frames, like for example the NXP pn544. config NFC_SHDLC depends on NFC_HCI diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 2088619c03f0..93d4991ddc1f 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -350,7 +350,8 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */ - + nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */ + + nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */ + + nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */ /* OVS_PACKET_ATTR_USERDATA */ if (upcall_info->userdata) @@ -393,6 +394,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, size_t len; unsigned int hlen; int err, dp_ifindex; + u64 hash; dp_ifindex = get_dpifindex(dp); if (!dp_ifindex) @@ -485,23 +487,30 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, } /* Add OVS_PACKET_ATTR_MRU */ - if (upcall_info->mru) { - if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, - upcall_info->mru)) { - err = -ENOBUFS; - goto out; - } - pad_packet(dp, user_skb); + if (upcall_info->mru && + nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, upcall_info->mru)) { + err = -ENOBUFS; + goto out; } /* Add OVS_PACKET_ATTR_LEN when packet is truncated */ - if (cutlen > 0) { - if (nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, - skb->len)) { - err = -ENOBUFS; - goto out; - } - pad_packet(dp, user_skb); + if (cutlen > 0 && + nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, skb->len)) { + err = -ENOBUFS; + goto out; + } + + /* Add OVS_PACKET_ATTR_HASH */ + hash = skb_get_hash_raw(skb); + if (skb->sw_hash) + hash |= OVS_PACKET_HASH_SW_BIT; + + if (skb->l4_hash) + hash |= OVS_PACKET_HASH_L4_BIT; + + if (nla_put(user_skb, OVS_PACKET_ATTR_HASH, sizeof (u64), &hash)) { + err = -ENOBUFS; + goto out; } /* Only reserve room for attribute header, packet data is added @@ -543,6 +552,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct vport *input_vport; u16 mru = 0; + u64 hash; int len; int err; bool log = !a[OVS_PACKET_ATTR_PROBE]; @@ -568,6 +578,14 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) } OVS_CB(packet)->mru = mru; + if (a[OVS_PACKET_ATTR_HASH]) { + hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]); + + __skb_set_hash(packet, hash & 0xFFFFFFFFULL, + !!(hash & OVS_PACKET_HASH_SW_BIT), + !!(hash & OVS_PACKET_HASH_L4_BIT)); + } + /* Build an sw_flow for sending this packet. */ flow = ovs_flow_alloc(); err = PTR_ERR(flow); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 81e85dde8217..e239a46c2f94 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -139,6 +139,18 @@ struct ovs_net { bool xt_label; }; +/** + * enum ovs_pkt_hash_types - hash info to include with a packet + * to send to userspace. + * @OVS_PACKET_HASH_SW_BIT: indicates hash was computed in software stack. + * @OVS_PACKET_HASH_L4_BIT: indicates hash is a canonical 4-tuple hash + * over transport ports. + */ +enum ovs_pkt_hash_types { + OVS_PACKET_HASH_SW_BIT = (1ULL << 32), + OVS_PACKET_HASH_L4_BIT = (1ULL << 33), +}; + extern unsigned int ovs_net_id; void ovs_lock(void); void ovs_unlock(void); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 6b345c858dba..c71f4328d138 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -513,6 +513,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) struct ib_qp_init_attr attr; struct ib_cq_init_attr cq_attr = {}; struct rds_ib_device *rds_ibdev; + unsigned long max_wrs; int ret, fr_queue_space; struct dma_pool *pool; @@ -533,10 +534,15 @@ static int rds_ib_setup_qp(struct rds_connection *conn) /* add the conn now so that connection establishment has the dev */ rds_ib_add_conn(rds_ibdev, conn); - if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) - rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); - if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1) - rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1); + max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_send_wr + 1 ? + rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_send_wr; + if (ic->i_send_ring.w_nr != max_wrs) + rds_ib_ring_resize(&ic->i_send_ring, max_wrs); + + max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_recv_wr + 1 ? + rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_recv_wr; + if (ic->i_recv_ring.w_nr != max_wrs) + rds_ib_ring_resize(&ic->i_recv_ring, max_wrs); /* Protection domain and memory range */ ic->i_pd = rds_ibdev->pd; @@ -1176,8 +1182,9 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ic->i_flowctl = 0; atomic_set(&ic->i_credits, 0); - rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); - rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + /* Re-init rings, but retain sizes. */ + rds_ib_ring_init(&ic->i_send_ring, ic->i_send_ring.w_nr); + rds_ib_ring_init(&ic->i_recv_ring, ic->i_recv_ring.w_nr); if (ic->i_ibinc) { rds_inc_put(&ic->i_ibinc->ii_inc); @@ -1224,8 +1231,8 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) * rds_ib_conn_shutdown() waits for these to be emptied so they * must be initialized before it can be called. */ - rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); - rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + rds_ib_ring_init(&ic->i_send_ring, 0); + rds_ib_ring_init(&ic->i_recv_ring, 0); ic->conn = conn; conn->c_transport_data = ic; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 68d6af56b243..c13638aeef46 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -474,7 +474,6 @@ drop: } static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = { - [TCA_CT_UNSPEC] = { .strict_start_type = TCA_CT_UNSPEC + 1 }, [TCA_CT_ACTION] = { .type = NLA_U16 }, [TCA_CT_PARMS] = { .type = NLA_EXACT_LEN, .len = sizeof(struct tc_ct) }, [TCA_CT_ZONE] = { .type = NLA_U16 }, diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 4d8c822b6aca..c7d5e12ee919 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -119,7 +119,6 @@ static int valid_label(const struct nlattr *attr, } static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { - [TCA_MPLS_UNSPEC] = { .strict_start_type = TCA_MPLS_UNSPEC + 1 }, [TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)), [TCA_MPLS_PROTO] = { .type = NLA_U16 }, [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label), diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index d5eff6ac17a9..3ad718576304 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -43,7 +43,7 @@ static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla, int err = -EINVAL; int rem; - if (!nla || !n) + if (!nla) return NULL; keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL); @@ -171,6 +171,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } parm = nla_data(pattr); + if (!parm->nkeys) { + NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); + return -EINVAL; + } ksize = parm->nkeys * sizeof(struct tc_pedit_key); if (nla_len(pattr) < sizeof(*parm) + ksize) { NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid"); @@ -184,12 +188,6 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - if (!parm->nkeys) { - tcf_idr_cleanup(tn, index); - NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); - ret = -EINVAL; - goto out_free; - } ret = tcf_idr_create(tn, index, est, a, &act_pedit_ops, bind, false, 0); if (ret) { diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index cb34e5d57aaa..6379f9568ab8 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -10,6 +10,8 @@ #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <net/geneve.h> +#include <net/vxlan.h> +#include <net/erspan.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/dst.h> @@ -53,7 +55,11 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, static const struct nla_policy enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPTS_UNSPEC] = { + .strict_start_type = TCA_TUNNEL_KEY_ENC_OPTS_VXLAN }, [TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED }, + [TCA_TUNNEL_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED }, + [TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED }, }; static const struct nla_policy @@ -64,6 +70,19 @@ geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = { .len = 128 }, }; +static const struct nla_policy +vxlan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 }, +}; + +static const struct nla_policy +erspan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 }, + [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, +}; + static int tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len, struct netlink_ext_ack *extack) @@ -116,10 +135,89 @@ tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len, return opt_len; } +static int +tunnel_key_copy_vxlan_opt(const struct nlattr *nla, void *dst, int dst_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1]; + int err; + + err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, nla, + vxlan_opt_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp"); + return -EINVAL; + } + + if (dst) { + struct vxlan_metadata *md = dst; + + md->gbp = nla_get_u32(tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]); + } + + return sizeof(struct vxlan_metadata); +} + +static int +tunnel_key_copy_erspan_opt(const struct nlattr *nla, void *dst, int dst_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1]; + int err; + u8 ver; + + err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX, nla, + erspan_opt_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver"); + return -EINVAL; + } + + ver = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]); + if (ver == 1) { + if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index"); + return -EINVAL; + } + } else if (ver == 2) { + if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] || + !tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid"); + return -EINVAL; + } + } else { + NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect"); + return -EINVAL; + } + + if (dst) { + struct erspan_metadata *md = dst; + + md->version = ver; + if (ver == 1) { + nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]; + md->u.index = nla_get_be32(nla); + } else { + nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR]; + md->u.md2.dir = nla_get_u8(nla); + nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]; + set_hwid(&md->u.md2, nla_get_u8(nla)); + } + } + + return sizeof(struct erspan_metadata); +} + static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, int dst_len, struct netlink_ext_ack *extack) { - int err, rem, opt_len, len = nla_len(nla), opts_len = 0; + int err, rem, opt_len, len = nla_len(nla), opts_len = 0, type = 0; const struct nlattr *attr, *head = nla_data(nla); err = nla_validate_deprecated(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX, @@ -130,15 +228,48 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, nla_for_each_attr(attr, head, len, rem) { switch (nla_type(attr)) { case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: + if (type && type != TUNNEL_GENEVE_OPT) { + NL_SET_ERR_MSG(extack, "Duplicate type for geneve options"); + return -EINVAL; + } opt_len = tunnel_key_copy_geneve_opt(attr, dst, dst_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; + if (opts_len > IP_TUNNEL_OPTS_MAX) { + NL_SET_ERR_MSG(extack, "Tunnel options exceeds max size"); + return -EINVAL; + } if (dst) { dst_len -= opt_len; dst += opt_len; } + type = TUNNEL_GENEVE_OPT; + break; + case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: + if (type) { + NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options"); + return -EINVAL; + } + opt_len = tunnel_key_copy_vxlan_opt(attr, dst, + dst_len, extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + type = TUNNEL_VXLAN_OPT; + break; + case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN: + if (type) { + NL_SET_ERR_MSG(extack, "Duplicate type for erspan options"); + return -EINVAL; + } + opt_len = tunnel_key_copy_erspan_opt(attr, dst, + dst_len, extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + type = TUNNEL_ERSPAN_OPT; break; } } @@ -175,6 +306,22 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, #else return -EAFNOSUPPORT; #endif + case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: +#if IS_ENABLED(CONFIG_INET) + info->key.tun_flags |= TUNNEL_VXLAN_OPT; + return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), + opts_len, extack); +#else + return -EAFNOSUPPORT; +#endif + case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN: +#if IS_ENABLED(CONFIG_INET) + info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), + opts_len, extack); +#else + return -EAFNOSUPPORT; +#endif default: NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type"); return -EINVAL; @@ -451,6 +598,56 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, return 0; } +static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + struct vxlan_metadata *md = (struct vxlan_metadata *)(info + 1); + struct nlattr *start; + + start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_VXLAN); + if (!start) + return -EMSGSIZE; + + if (nla_put_u32(skb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, md->gbp)) { + nla_nest_cancel(skb, start); + return -EMSGSIZE; + } + + nla_nest_end(skb, start); + return 0; +} + +static int tunnel_key_erspan_opts_dump(struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + struct erspan_metadata *md = (struct erspan_metadata *)(info + 1); + struct nlattr *start; + + start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN); + if (!start) + return -EMSGSIZE; + + if (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER, md->version)) + goto err; + + if (md->version == 1 && + nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index)) + goto err; + + if (md->version == 2 && + (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR, + md->u.md2.dir) || + nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID, + get_hwid(&md->u.md2)))) + goto err; + + nla_nest_end(skb, start); + return 0; +err: + nla_nest_cancel(skb, start); + return -EMSGSIZE; +} + static int tunnel_key_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { @@ -468,6 +665,14 @@ static int tunnel_key_opts_dump(struct sk_buff *skb, err = tunnel_key_geneve_opts_dump(skb, info); if (err) goto err_out; + } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { + err = tunnel_key_vxlan_opts_dump(skb, info); + if (err) + goto err_out; + } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) { + err = tunnel_key_erspan_opts_dump(skb, info); + if (err) + goto err_out; } else { err_out: nla_nest_cancel(skb, start); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 74221e3351c3..c307ee1d6ca6 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -22,6 +22,8 @@ #include <net/ip.h> #include <net/flow_dissector.h> #include <net/geneve.h> +#include <net/vxlan.h> +#include <net/erspan.h> #include <net/dst.h> #include <net/dst_metadata.h> @@ -688,7 +690,11 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { static const struct nla_policy enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPTS_UNSPEC] = { + .strict_start_type = TCA_FLOWER_KEY_ENC_OPTS_VXLAN }, [TCA_FLOWER_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED }, + [TCA_FLOWER_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED }, + [TCA_FLOWER_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED }, }; static const struct nla_policy @@ -699,6 +705,19 @@ geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = { .len = 128 }, }; +static const struct nla_policy +vxlan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 }, +}; + +static const struct nla_policy +erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, +}; + static void fl_set_key_val(struct nlattr **tb, void *val, int val_type, void *mask, int mask_type, int len) @@ -928,6 +947,105 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key, return sizeof(struct geneve_opt) + data_len; } +static int fl_set_vxlan_opt(const struct nlattr *nla, struct fl_flow_key *key, + int depth, int option_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1]; + struct vxlan_metadata *md; + int err; + + md = (struct vxlan_metadata *)&key->enc_opts.data[key->enc_opts.len]; + memset(md, 0xff, sizeof(*md)); + + if (!depth) + return sizeof(*md); + + if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_VXLAN) { + NL_SET_ERR_MSG(extack, "Non-vxlan option type for mask"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, nla, + vxlan_opt_policy, extack); + if (err < 0) + return err; + + if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp"); + return -EINVAL; + } + + if (tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) + md->gbp = nla_get_u32(tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]); + + return sizeof(*md); +} + +static int fl_set_erspan_opt(const struct nlattr *nla, struct fl_flow_key *key, + int depth, int option_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1]; + struct erspan_metadata *md; + int err; + + md = (struct erspan_metadata *)&key->enc_opts.data[key->enc_opts.len]; + memset(md, 0xff, sizeof(*md)); + md->version = 1; + + if (!depth) + return sizeof(*md); + + if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_ERSPAN) { + NL_SET_ERR_MSG(extack, "Non-erspan option type for mask"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX, nla, + erspan_opt_policy, extack); + if (err < 0) + return err; + + if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver"); + return -EINVAL; + } + + if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]) + md->version = nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]); + + if (md->version == 1) { + if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index"); + return -EINVAL; + } + if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) { + nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]; + md->u.index = nla_get_be32(nla); + } + } else if (md->version == 2) { + if (!option_len && (!tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] || + !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID])) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid"); + return -EINVAL; + } + if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]) { + nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]; + md->u.md2.dir = nla_get_u8(nla); + } + if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]) { + nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]; + set_hwid(&md->u.md2, nla_get_u8(nla)); + } + } else { + NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect"); + return -EINVAL; + } + + return sizeof(*md); +} + static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) @@ -958,6 +1076,11 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS]), key_depth) { switch (nla_type(nla_opt_key)) { case TCA_FLOWER_KEY_ENC_OPTS_GENEVE: + if (key->enc_opts.dst_opt_type && + key->enc_opts.dst_opt_type != TUNNEL_GENEVE_OPT) { + NL_SET_ERR_MSG(extack, "Duplicate type for geneve options"); + return -EINVAL; + } option_len = 0; key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT; option_len = fl_set_geneve_opt(nla_opt_key, key, @@ -986,6 +1109,72 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, if (msk_depth) nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; + case TCA_FLOWER_KEY_ENC_OPTS_VXLAN: + if (key->enc_opts.dst_opt_type) { + NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options"); + return -EINVAL; + } + option_len = 0; + key->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT; + option_len = fl_set_vxlan_opt(nla_opt_key, key, + key_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + key->enc_opts.len += option_len; + /* At the same time we need to parse through the mask + * in order to verify exact and mask attribute lengths. + */ + mask->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT; + option_len = fl_set_vxlan_opt(nla_opt_msk, mask, + msk_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + mask->enc_opts.len += option_len; + if (key->enc_opts.len != mask->enc_opts.len) { + NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); + return -EINVAL; + } + + if (msk_depth) + nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); + break; + case TCA_FLOWER_KEY_ENC_OPTS_ERSPAN: + if (key->enc_opts.dst_opt_type) { + NL_SET_ERR_MSG(extack, "Duplicate type for erspan options"); + return -EINVAL; + } + option_len = 0; + key->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT; + option_len = fl_set_erspan_opt(nla_opt_key, key, + key_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + key->enc_opts.len += option_len; + /* At the same time we need to parse through the mask + * in order to verify exact and mask attribute lengths. + */ + mask->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT; + option_len = fl_set_erspan_opt(nla_opt_msk, mask, + msk_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + mask->enc_opts.len += option_len; + if (key->enc_opts.len != mask->enc_opts.len) { + NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); + return -EINVAL; + } + + if (msk_depth) + nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); + break; default: NL_SET_ERR_MSG(extack, "Unknown tunnel option type"); return -EINVAL; @@ -2135,6 +2324,61 @@ nla_put_failure: return -EMSGSIZE; } +static int fl_dump_key_vxlan_opt(struct sk_buff *skb, + struct flow_dissector_key_enc_opts *enc_opts) +{ + struct vxlan_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_VXLAN); + if (!nest) + goto nla_put_failure; + + md = (struct vxlan_metadata *)&enc_opts->data[0]; + if (nla_put_u32(skb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, md->gbp)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int fl_dump_key_erspan_opt(struct sk_buff *skb, + struct flow_dissector_key_enc_opts *enc_opts) +{ + struct erspan_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_ERSPAN); + if (!nest) + goto nla_put_failure; + + md = (struct erspan_metadata *)&enc_opts->data[0]; + if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER, md->version)) + goto nla_put_failure; + + if (md->version == 1 && + nla_put_be32(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index)) + goto nla_put_failure; + + if (md->version == 2 && + (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR, + md->u.md2.dir) || + nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID, + get_hwid(&md->u.md2)))) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int fl_dump_key_ct(struct sk_buff *skb, struct flow_dissector_key_ct *key, struct flow_dissector_key_ct *mask) @@ -2188,6 +2432,16 @@ static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type, if (err) goto nla_put_failure; break; + case TUNNEL_VXLAN_OPT: + err = fl_dump_key_vxlan_opt(skb, enc_opts); + if (err) + goto nla_put_failure; + break; + case TUNNEL_ERSPAN_OPT: + err = fl_dump_key_erspan_opt(skb, enc_opts); + if (err) + goto nla_put_failure; + break; default: goto nla_put_failure; } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index df98a887eb89..b0b0dc46af61 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -22,6 +22,7 @@ #define QUEUE_THRESHOLD 16384 #define DQCOUNT_INVALID -1 +#define DTIME_INVALID 0xffffffffffffffff #define MAX_PROB 0xffffffffffffffff #define PIE_SCALE 8 @@ -34,6 +35,7 @@ struct pie_params { u32 beta; /* and are used for shift relative to 1 */ bool ecn; /* true if ecn is enabled */ bool bytemode; /* to scale drop early prob based on pkt size */ + u8 dq_rate_estimator; /* to calculate delay using Little's law */ }; /* variables used */ @@ -77,11 +79,34 @@ static void pie_params_init(struct pie_params *params) params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ params->ecn = false; params->bytemode = false; + params->dq_rate_estimator = false; +} + +/* private skb vars */ +struct pie_skb_cb { + psched_time_t enqueue_time; +}; + +static struct pie_skb_cb *get_pie_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct pie_skb_cb)); + return (struct pie_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static psched_time_t pie_get_enqueue_time(const struct sk_buff *skb) +{ + return get_pie_cb(skb)->enqueue_time; +} + +static void pie_set_enqueue_time(struct sk_buff *skb) +{ + get_pie_cb(skb)->enqueue_time = psched_get_time(); } static void pie_vars_init(struct pie_vars *vars) { vars->dq_count = DQCOUNT_INVALID; + vars->dq_tstamp = DTIME_INVALID; vars->accu_prob = 0; vars->avg_dq_rate = 0; /* default of 150 ms in pschedtime */ @@ -172,6 +197,10 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* we can enqueue the packet */ if (enqueue) { + /* Set enqueue time only when dq_rate_estimator is disabled. */ + if (!q->params.dq_rate_estimator) + pie_set_enqueue_time(skb); + q->stats.packets_in++; if (qdisc_qlen(sch) > q->stats.maxq) q->stats.maxq = qdisc_qlen(sch); @@ -194,6 +223,7 @@ static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { [TCA_PIE_BETA] = {.type = NLA_U32}, [TCA_PIE_ECN] = {.type = NLA_U32}, [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, + [TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32}, }; static int pie_change(struct Qdisc *sch, struct nlattr *opt, @@ -247,6 +277,10 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_PIE_BYTEMODE]) q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]); + if (tb[TCA_PIE_DQ_RATE_ESTIMATOR]) + q->params.dq_rate_estimator = + nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR]); + /* Drop excess packets if new limit is lower */ qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { @@ -266,6 +300,28 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) { struct pie_sched_data *q = qdisc_priv(sch); int qlen = sch->qstats.backlog; /* current queue size in bytes */ + psched_time_t now = psched_get_time(); + u32 dtime = 0; + + /* If dq_rate_estimator is disabled, calculate qdelay using the + * packet timestamp. + */ + if (!q->params.dq_rate_estimator) { + q->vars.qdelay = now - pie_get_enqueue_time(skb); + + if (q->vars.dq_tstamp != DTIME_INVALID) + dtime = now - q->vars.dq_tstamp; + + q->vars.dq_tstamp = now; + + if (qlen == 0) + q->vars.qdelay = 0; + + if (dtime == 0) + return; + + goto burst_allowance_reduction; + } /* If current queue is about 10 packets or more and dq_count is unset * we have enough packets to calculate the drain rate. Save @@ -289,10 +345,10 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) q->vars.dq_count += skb->len; if (q->vars.dq_count >= QUEUE_THRESHOLD) { - psched_time_t now = psched_get_time(); - u32 dtime = now - q->vars.dq_tstamp; u32 count = q->vars.dq_count << PIE_SCALE; + dtime = now - q->vars.dq_tstamp; + if (dtime == 0) return; @@ -317,14 +373,19 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) q->vars.dq_tstamp = psched_get_time(); } - if (q->vars.burst_time > 0) { - if (q->vars.burst_time > dtime) - q->vars.burst_time -= dtime; - else - q->vars.burst_time = 0; - } + goto burst_allowance_reduction; } } + + return; + +burst_allowance_reduction: + if (q->vars.burst_time > 0) { + if (q->vars.burst_time > dtime) + q->vars.burst_time -= dtime; + else + q->vars.burst_time = 0; + } } static void calculate_probability(struct Qdisc *sch) @@ -332,19 +393,25 @@ static void calculate_probability(struct Qdisc *sch) struct pie_sched_data *q = qdisc_priv(sch); u32 qlen = sch->qstats.backlog; /* queue size in bytes */ psched_time_t qdelay = 0; /* in pschedtime */ - psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */ + psched_time_t qdelay_old = 0; /* in pschedtime */ s64 delta = 0; /* determines the change in probability */ u64 oldprob; u64 alpha, beta; u32 power; bool update_prob = true; - q->vars.qdelay_old = q->vars.qdelay; + if (q->params.dq_rate_estimator) { + qdelay_old = q->vars.qdelay; + q->vars.qdelay_old = q->vars.qdelay; - if (q->vars.avg_dq_rate > 0) - qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; - else - qdelay = 0; + if (q->vars.avg_dq_rate > 0) + qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; + else + qdelay = 0; + } else { + qdelay = q->vars.qdelay; + qdelay_old = q->vars.qdelay_old; + } /* If qdelay is zero and qlen is not, it means qlen is very small, less * than dequeue_rate, so we do not update probabilty in this round @@ -430,14 +497,18 @@ static void calculate_probability(struct Qdisc *sch) /* We restart the measurement cycle if the following conditions are met * 1. If the delay has been low for 2 consecutive Tupdate periods * 2. Calculated drop probability is zero - * 3. We have atleast one estimate for the avg_dq_rate ie., - * is a non-zero value + * 3. If average dq_rate_estimator is enabled, we have atleast one + * estimate for the avg_dq_rate ie., is a non-zero value */ if ((q->vars.qdelay < q->params.target / 2) && (q->vars.qdelay_old < q->params.target / 2) && q->vars.prob == 0 && - q->vars.avg_dq_rate > 0) + (!q->params.dq_rate_estimator || q->vars.avg_dq_rate > 0)) { pie_vars_init(&q->vars); + } + + if (!q->params.dq_rate_estimator) + q->vars.qdelay_old = qdelay; } static void pie_timer(struct timer_list *t) @@ -497,7 +568,9 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || - nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode)) + nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode) || + nla_put_u32(skb, TCA_PIE_DQ_RATE_ESTIMATOR, + q->params.dq_rate_estimator)) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -514,9 +587,6 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .prob = q->vars.prob, .delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) / NSEC_PER_USEC, - /* unscale and return dq_rate in bytes per sec */ - .avg_dq_rate = q->vars.avg_dq_rate * - (PSCHED_TICKS_PER_SEC) >> PIE_SCALE, .packets_in = q->stats.packets_in, .overlimit = q->stats.overlimit, .maxq = q->stats.maxq, @@ -524,6 +594,14 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .ecn_mark = q->stats.ecn_mark, }; + /* avg_dq_rate is only valid if dq_rate_estimator is enabled */ + st.dq_rate_estimating = q->params.dq_rate_estimator; + + /* unscale and return dq_rate in bytes per sec */ + if (q->params.dq_rate_estimator) + st.avg_dq_rate = q->vars.avg_dq_rate * + (PSCHED_TICKS_PER_SEC) >> PIE_SCALE; + return gnet_stats_copy_app(d, &st, sizeof(st)); } diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 7cd68628c637..c609373c8661 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -922,7 +922,7 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, } /* Verify priority mapping uses valid tcs */ - for (i = 0; i < TC_BITMASK + 1; i++) { + for (i = 0; i <= TC_BITMASK; i++) { if (qopt->prio_tc_map[i] >= qopt->num_tc) { NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping"); return -EINVAL; @@ -1347,6 +1347,26 @@ out: return err; } +static int taprio_mqprio_cmp(const struct net_device *dev, + const struct tc_mqprio_qopt *mqprio) +{ + int i; + + if (!mqprio || mqprio->num_tc != dev->num_tc) + return -1; + + for (i = 0; i < mqprio->num_tc; i++) + if (dev->tc_to_txq[i].count != mqprio->count[i] || + dev->tc_to_txq[i].offset != mqprio->offset[i]) + return -1; + + for (i = 0; i <= TC_BITMASK; i++) + if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i]) + return -1; + + return 0; +} + static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -1398,6 +1418,10 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, admin = rcu_dereference(q->admin_sched); rcu_read_unlock(); + /* no changes - no new mqprio settings */ + if (!taprio_mqprio_cmp(dev, mqprio)) + mqprio = NULL; + if (mqprio && (oper || admin)) { NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported"); err = -ENOTSUPP; @@ -1455,7 +1479,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, mqprio->offset[i]); /* Always use supplied priority mappings */ - for (i = 0; i < TC_BITMASK + 1; i++) + for (i = 0; i <= TC_BITMASK; i++) netdev_set_prio_tc_map(dev, i, mqprio->prio_tc_map[i]); } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b7d9fd285c71..b997072c72e5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -25,6 +25,7 @@ #include <linux/in.h> #include <linux/sched/signal.h> #include <linux/if_vlan.h> +#include <linux/rcupdate_wait.h> #include <net/sock.h> #include <net/tcp.h> @@ -798,6 +799,7 @@ static void smc_connect_work(struct work_struct *work) smc->sk.sk_err = EPIPE; else if (signal_pending(current)) smc->sk.sk_err = -sock_intr_errno(timeo); + sock_put(&smc->sk); /* passive closing */ goto out; } @@ -1735,7 +1737,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ - if (sk->sk_state == SMC_INIT) { + if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { smc_switch_to_fallback(smc); smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { @@ -2038,22 +2040,28 @@ static int __init smc_init(void) if (rc) goto out_pernet_subsys; + rc = smc_core_init(); + if (rc) { + pr_err("%s: smc_core_init fails with %d\n", __func__, rc); + goto out_pnet; + } + rc = smc_llc_init(); if (rc) { pr_err("%s: smc_llc_init fails with %d\n", __func__, rc); - goto out_pnet; + goto out_core; } rc = smc_cdc_init(); if (rc) { pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc); - goto out_pnet; + goto out_core; } rc = proto_register(&smc_proto, 1); if (rc) { pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc); - goto out_pnet; + goto out_core; } rc = proto_register(&smc_proto6, 1); @@ -2085,6 +2093,8 @@ out_proto6: proto_unregister(&smc_proto6); out_proto: proto_unregister(&smc_proto); +out_core: + smc_core_exit(); out_pnet: smc_pnet_exit(); out_pernet_subsys: @@ -2095,14 +2105,15 @@ out_pernet_subsys: static void __exit smc_exit(void) { - smc_core_exit(); static_branch_disable(&tcp_have_smc); - smc_ib_unregister_client(); sock_unregister(PF_SMC); + smc_core_exit(); + smc_ib_unregister_client(); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); unregister_pernet_subsys(&smc_net_ops); + rcu_barrier(); } module_init(smc_init); diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 7dc07ec2379b..164f1584861b 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -131,6 +131,9 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) { int rc; + if (!conn->lgr || (conn->lgr->is_smcd && conn->lgr->peer_shutdown)) + return -EPIPE; + if (conn->lgr->is_smcd) { spin_lock_bh(&conn->send_lock); rc = smcd_cdc_msg_send(conn); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 49bcebff6378..0879f7bed967 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -349,7 +349,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, smc->peer_diagnosis = ntohl(dclc->peer_diagnosis); if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { smc->conn.lgr->sync_err = 1; - smc_lgr_terminate(smc->conn.lgr); + smc_lgr_terminate(smc->conn.lgr, true); } } diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index d34e5adce2eb..290270c821ca 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -20,8 +20,6 @@ #include "smc_cdc.h" #include "smc_close.h" -#define SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME (5 * HZ) - /* release the clcsock that is assigned to the smc_sock */ void smc_clcsock_release(struct smc_sock *smc) { @@ -110,6 +108,17 @@ int smc_close_abort(struct smc_connection *conn) return smc_cdc_get_slot_and_msg_send(conn); } +static void smc_close_cancel_work(struct smc_sock *smc) +{ + struct sock *sk = &smc->sk; + + release_sock(sk); + cancel_work_sync(&smc->conn.close_work); + cancel_delayed_work_sync(&smc->conn.tx_work); + lock_sock(sk); + sk->sk_state = SMC_CLOSED; +} + /* terminate smc socket abnormally - active abort * link group is terminated, i.e. RDMA communication no longer possible */ @@ -126,23 +135,21 @@ void smc_close_active_abort(struct smc_sock *smc) switch (sk->sk_state) { case SMC_ACTIVE: sk->sk_state = SMC_PEERABORTWAIT; - release_sock(sk); - cancel_delayed_work_sync(&smc->conn.tx_work); - lock_sock(sk); + smc_close_cancel_work(smc); sk->sk_state = SMC_CLOSED; sock_put(sk); /* passive closing */ break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: - release_sock(sk); - cancel_delayed_work_sync(&smc->conn.tx_work); - lock_sock(sk); + smc_close_cancel_work(smc); sk->sk_state = SMC_CLOSED; sock_put(sk); /* postponed passive closing */ break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: case SMC_PEERFINCLOSEWAIT: + sk->sk_state = SMC_PEERABORTWAIT; + smc_close_cancel_work(smc); sk->sk_state = SMC_CLOSED; smc_conn_free(&smc->conn); release_clcsock = true; @@ -150,7 +157,11 @@ void smc_close_active_abort(struct smc_sock *smc) break; case SMC_PROCESSABORT: case SMC_APPFINCLOSEWAIT: + sk->sk_state = SMC_PEERABORTWAIT; + smc_close_cancel_work(smc); sk->sk_state = SMC_CLOSED; + smc_conn_free(&smc->conn); + release_clcsock = true; break; case SMC_INIT: case SMC_PEERABORTWAIT: diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 0d92456729ab..bb92c7c6214c 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -13,6 +13,8 @@ #include <linux/if_vlan.h> #include <linux/random.h> #include <linux/workqueue.h> +#include <linux/wait.h> +#include <linux/reboot.h> #include <net/tcp.h> #include <net/sock.h> #include <rdma/ib_verbs.h> @@ -39,6 +41,9 @@ static struct smc_lgr_list smc_lgr_list = { /* established link groups */ .num = 0, }; +static atomic_t lgr_cnt; /* number of existing link groups */ +static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); + static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc); @@ -161,10 +166,10 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) * of the DELETE LINK sequence from server; or as server to * initiate the delete processing. See smc_llc_rx_delete_link(). */ -static int smc_link_send_delete(struct smc_link *lnk) +static int smc_link_send_delete(struct smc_link *lnk, bool orderly) { if (lnk->state == SMC_LNK_ACTIVE && - !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) { + !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) { smc_llc_link_deleting(lnk); return 0; } @@ -201,7 +206,7 @@ static void smc_lgr_free_work(struct work_struct *work) if (!lgr->is_smcd && !lgr->terminating) { /* try to send del link msg, on error free lgr immediately */ if (lnk->state == SMC_LNK_ACTIVE && - !smc_link_send_delete(lnk)) { + !smc_link_send_delete(lnk, true)) { /* reschedule in case we never receive a response */ smc_lgr_schedule_free_work(lgr); spin_unlock_bh(lgr_lock); @@ -214,7 +219,7 @@ static void smc_lgr_free_work(struct work_struct *work) if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) smc_llc_link_inactive(lnk); - if (lgr->is_smcd) + if (lgr->is_smcd && !lgr->terminating) smc_ism_signal_shutdown(lgr); smc_lgr_free(lgr); } @@ -224,7 +229,7 @@ static void smc_lgr_terminate_work(struct work_struct *work) struct smc_link_group *lgr = container_of(work, struct smc_link_group, terminate_work); - smc_lgr_terminate(lgr); + smc_lgr_terminate(lgr, true); } /* create a new SMC link group */ @@ -275,6 +280,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->smcd = ini->ism_dev; lgr_list = &ini->ism_dev->lgr_list; lgr_lock = &lgr->smcd->lgr_lock; + lgr->peer_shutdown = 0; + atomic_inc(&ini->ism_dev->lgr_cnt); } else { /* SMC-R specific settings */ get_device(&ini->ib_dev->ibdev->dev); @@ -317,6 +324,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) rc = smc_wr_create_link(lnk); if (rc) goto destroy_qp; + atomic_inc(&lgr_cnt); + atomic_inc(&ini->ib_dev->lnk_cnt); } smc->conn.lgr = lgr; spin_lock_bh(lgr_lock); @@ -380,7 +389,8 @@ void smc_conn_free(struct smc_connection *conn) if (!lgr) return; if (lgr->is_smcd) { - smc_ism_unset_conn(conn); + if (!list_empty(&lgr->list)) + smc_ism_unset_conn(conn); tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_tx_dismiss_slots(conn); @@ -403,6 +413,8 @@ static void smc_link_clear(struct smc_link *lnk) smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); + if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt)) + wake_up(&lnk->smcibdev->lnks_deleted); } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, @@ -480,11 +492,17 @@ static void smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); if (lgr->is_smcd) { - smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - put_device(&lgr->smcd->dev); + if (!lgr->terminating) { + smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); + put_device(&lgr->smcd->dev); + } + if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) + wake_up(&lgr->smcd->lgrs_deleted); } else { smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev); + if (!atomic_dec_return(&lgr_cnt)) + wake_up(&lgrs_deleted); } kfree(lgr); } @@ -502,6 +520,20 @@ void smc_lgr_forget(struct smc_link_group *lgr) spin_unlock_bh(lgr_lock); } +static void smcd_unregister_all_dmbs(struct smc_link_group *lgr) +{ + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + struct smc_buf_desc *buf_desc; + + list_for_each_entry(buf_desc, &lgr->rmbs[i], list) { + buf_desc->len += sizeof(struct smcd_cdc_msg); + smc_ism_unregister_dmb(lgr->smcd, buf_desc); + } + } +} + static void smc_sk_wake_ups(struct smc_sock *smc) { smc->sk.sk_write_space(&smc->sk); @@ -510,20 +542,50 @@ static void smc_sk_wake_ups(struct smc_sock *smc) } /* kill a connection */ -static void smc_conn_kill(struct smc_connection *conn) +static void smc_conn_kill(struct smc_connection *conn, bool soft) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); - smc_close_abort(conn); + if (conn->lgr->is_smcd && conn->lgr->peer_shutdown) + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + else + smc_close_abort(conn); conn->killed = 1; + smc->sk.sk_err = ECONNABORTED; smc_sk_wake_ups(smc); + if (conn->lgr->is_smcd) { + smc_ism_unset_conn(conn); + if (soft) + tasklet_kill(&conn->rx_tsklet); + else + tasklet_unlock_wait(&conn->rx_tsklet); + } else { + smc_cdc_tx_dismiss_slots(conn); + } smc_lgr_unregister_conn(conn); - smc->sk.sk_err = ECONNABORTED; smc_close_active_abort(smc); } +static void smc_lgr_cleanup(struct smc_link_group *lgr) +{ + if (lgr->is_smcd) { + smc_ism_signal_shutdown(lgr); + smcd_unregister_all_dmbs(lgr); + smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); + put_device(&lgr->smcd->dev); + } else { + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + + wake_up(&lnk->wr_reg_wait); + if (lnk->state != SMC_LNK_INACTIVE) { + smc_link_send_delete(lnk, false); + smc_llc_link_inactive(lnk); + } + } +} + /* terminate link group */ -static void __smc_lgr_terminate(struct smc_link_group *lgr) +static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) { struct smc_connection *conn; struct smc_sock *smc; @@ -531,6 +593,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr) if (lgr->terminating) return; /* lgr already terminating */ + if (!soft) + cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; if (!lgr->is_smcd) smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); @@ -544,20 +608,25 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr) smc = container_of(conn, struct smc_sock, conn); sock_hold(&smc->sk); /* sock_put below */ lock_sock(&smc->sk); - smc_conn_kill(conn); + smc_conn_kill(conn, soft); release_sock(&smc->sk); sock_put(&smc->sk); /* sock_hold above */ read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); } read_unlock_bh(&lgr->conns_lock); - if (!lgr->is_smcd) - wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); - smc_lgr_schedule_free_work_fast(lgr); + smc_lgr_cleanup(lgr); + if (soft) + smc_lgr_schedule_free_work_fast(lgr); + else + smc_lgr_free(lgr); } -/* unlink and terminate link group */ -void smc_lgr_terminate(struct smc_link_group *lgr) +/* unlink and terminate link group + * @soft: true if link group shutdown can take its time + * false if immediate link group shutdown is required + */ +void smc_lgr_terminate(struct smc_link_group *lgr, bool soft) { spinlock_t *lgr_lock; @@ -567,9 +636,11 @@ void smc_lgr_terminate(struct smc_link_group *lgr) spin_unlock_bh(lgr_lock); return; /* lgr already terminating */ } + if (!soft) + lgr->freeing = 1; list_del_init(&lgr->list); spin_unlock_bh(lgr_lock); - __smc_lgr_terminate(lgr); + __smc_lgr_terminate(lgr, soft); } /* Called when IB port is terminated */ @@ -582,18 +653,20 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { if (!lgr->is_smcd && lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && - lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) + lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) { list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + } } spin_unlock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { list_del_init(&lgr->list); - __smc_lgr_terminate(lgr); + __smc_lgr_terminate(lgr, false); } } -/* Called when SMC-D device is terminated or peer is lost */ +/* Called when peer lgr shutdown (regularly or abnormally) is received */ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) { struct smc_link_group *lgr, *l; @@ -604,6 +677,8 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) { if ((!peer_gid || lgr->peer_gid == peer_gid) && (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) { + if (peer_gid) /* peer triggered termination */ + lgr->peer_shutdown = 1; list_move(&lgr->list, &lgr_free_list); } } @@ -612,11 +687,67 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) /* cancel the regular free workers and actually free lgrs */ list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { list_del_init(&lgr->list); - __smc_lgr_terminate(lgr); - cancel_delayed_work_sync(&lgr->free_work); - if (!peer_gid && vlan == VLAN_VID_MASK) /* dev terminated? */ - smc_ism_signal_shutdown(lgr); - smc_lgr_free(lgr); + schedule_work(&lgr->terminate_work); + } +} + +/* Called when an SMCD device is removed or the smc module is unloaded */ +void smc_smcd_terminate_all(struct smcd_dev *smcd) +{ + struct smc_link_group *lgr, *lg; + LIST_HEAD(lgr_free_list); + + spin_lock_bh(&smcd->lgr_lock); + list_splice_init(&smcd->lgr_list, &lgr_free_list); + list_for_each_entry(lgr, &lgr_free_list, list) + lgr->freeing = 1; + spin_unlock_bh(&smcd->lgr_lock); + + list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { + list_del_init(&lgr->list); + __smc_lgr_terminate(lgr, false); + } + + if (atomic_read(&smcd->lgr_cnt)) + wait_event(smcd->lgrs_deleted, !atomic_read(&smcd->lgr_cnt)); +} + +/* Called when an SMCR device is removed or the smc module is unloaded. + * If smcibdev is given, all SMCR link groups using this device are terminated. + * If smcibdev is NULL, all SMCR link groups are terminated. + */ +void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) +{ + struct smc_link_group *lgr, *lg; + LIST_HEAD(lgr_free_list); + + spin_lock_bh(&smc_lgr_list.lock); + if (!smcibdev) { + list_splice_init(&smc_lgr_list.list, &lgr_free_list); + list_for_each_entry(lgr, &lgr_free_list, list) + lgr->freeing = 1; + } else { + list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { + if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev) { + list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + } + } + } + spin_unlock_bh(&smc_lgr_list.lock); + + list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { + list_del_init(&lgr->list); + __smc_lgr_terminate(lgr, false); + } + + if (smcibdev) { + if (atomic_read(&smcibdev->lnk_cnt)) + wait_event(smcibdev->lnks_deleted, + !atomic_read(&smcibdev->lnk_cnt)); + } else { + if (atomic_read(&lgr_cnt)) + wait_event(lgrs_deleted, !atomic_read(&lgr_cnt)); } } @@ -1137,37 +1268,42 @@ static void smc_core_going_away(void) spin_unlock(&smcd_dev_list.lock); } -/* Called (from smc_exit) when module is removed */ -void smc_core_exit(void) +/* Clean up all SMC link groups */ +static void smc_lgrs_shutdown(void) { - struct smc_link_group *lgr, *lg; - LIST_HEAD(lgr_freeing_list); struct smcd_dev *smcd; smc_core_going_away(); - spin_lock_bh(&smc_lgr_list.lock); - list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); - spin_unlock_bh(&smc_lgr_list.lock); + smc_smcr_terminate_all(NULL); spin_lock(&smcd_dev_list.lock); list_for_each_entry(smcd, &smcd_dev_list.list, list) - list_splice_init(&smcd->lgr_list, &lgr_freeing_list); + smc_smcd_terminate_all(smcd); spin_unlock(&smcd_dev_list.lock); +} - list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { - list_del_init(&lgr->list); - if (!lgr->is_smcd) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; +static int smc_core_reboot_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + smc_lgrs_shutdown(); - if (lnk->state == SMC_LNK_ACTIVE) - smc_llc_send_delete_link(lnk, SMC_LLC_REQ, - false); - smc_llc_link_inactive(lnk); - } - cancel_delayed_work_sync(&lgr->free_work); - if (lgr->is_smcd) - smc_ism_signal_shutdown(lgr); - smc_lgr_free(lgr); /* free link group */ - } + return 0; +} + +static struct notifier_block smc_reboot_notifier = { + .notifier_call = smc_core_reboot_event, +}; + +int __init smc_core_init(void) +{ + atomic_set(&lgr_cnt, 0); + return register_reboot_notifier(&smc_reboot_notifier); +} + +/* Called (from smc_exit) when module is removed */ +void smc_core_exit(void) +{ + unregister_reboot_notifier(&smc_reboot_notifier); + smc_lgrs_shutdown(); } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index e6fd1ed42064..c472e12951d1 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -228,6 +228,8 @@ struct smc_link_group { /* Peer GID (remote) */ struct smcd_dev *smcd; /* ISM device for VLAN reg. */ + u8 peer_shutdown : 1; + /* peer triggered shutdownn */ }; }; }; @@ -285,7 +287,7 @@ static inline struct smc_connection *smc_lgr_find_conn( static inline void smc_lgr_terminate_sched(struct smc_link_group *lgr) { - if (!lgr->terminating) + if (!lgr->terminating && !lgr->freeing) schedule_work(&lgr->terminate_work); } @@ -294,10 +296,12 @@ struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; void smc_lgr_forget(struct smc_link_group *lgr); -void smc_lgr_terminate(struct smc_link_group *lgr); +void smc_lgr_terminate(struct smc_link_group *lgr, bool soft); void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan); +void smc_smcd_terminate_all(struct smcd_dev *dev); +void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, @@ -314,6 +318,7 @@ void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); void smcd_conn_free(struct smc_connection *conn); void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); +int smc_core_init(void); void smc_core_exit(void); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index af05daeb0538..548632621f4b 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -15,6 +15,7 @@ #include <linux/random.h> #include <linux/workqueue.h> #include <linux/scatterlist.h> +#include <linux/wait.h> #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> @@ -520,9 +521,9 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) if (!smcibdev->initialized) return; smcibdev->initialized = 0; - smc_wr_remove_dev(smcibdev); ib_destroy_cq(smcibdev->roce_cq_recv); ib_destroy_cq(smcibdev->roce_cq_send); + smc_wr_remove_dev(smcibdev); } static struct ib_client smc_ib_client; @@ -543,7 +544,8 @@ static void smc_ib_add_dev(struct ib_device *ibdev) smcibdev->ibdev = ibdev; INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work); - + atomic_set(&smcibdev->lnk_cnt, 0); + init_waitqueue_head(&smcibdev->lnks_deleted); spin_lock(&smc_ib_devices.lock); list_add_tail(&smcibdev->list, &smc_ib_devices.list); spin_unlock(&smc_ib_devices.lock); @@ -565,7 +567,7 @@ static void smc_ib_add_dev(struct ib_device *ibdev) schedule_work(&smcibdev->port_event_work); } -/* callback function for ib_register_client() */ +/* callback function for ib_unregister_client() */ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) { struct smc_ib_device *smcibdev; @@ -575,6 +577,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) spin_lock(&smc_ib_devices.lock); list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ spin_unlock(&smc_ib_devices.lock); + smc_smcr_terminate_all(smcibdev); smc_ib_cleanup_per_ibdev(smcibdev); ib_unregister_event_handler(&smcibdev->event_handler); kfree(smcibdev); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 6a0069db6cae..255db87547d3 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -14,6 +14,7 @@ #include <linux/interrupt.h> #include <linux/if_ether.h> +#include <linux/wait.h> #include <rdma/ib_verbs.h> #include <net/smc.h> @@ -48,6 +49,8 @@ struct smc_ib_device { /* ib-device infos for smc */ struct work_struct port_event_work; unsigned long port_event_mask; DECLARE_BITMAP(ports_going_away, SMC_MAX_PORTS); + atomic_t lnk_cnt; /* number of links on ibdev */ + wait_queue_head_t lnks_deleted; /* wait 4 removal of all links*/ }; struct smc_buf_desc; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index ee7340898cb4..5c4727d5066e 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -146,6 +146,10 @@ out: int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) { struct smcd_dmb dmb; + int rc = 0; + + if (!dmb_desc->dma_addr) + return rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; @@ -153,7 +157,13 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; - return smcd->ops->unregister_dmb(smcd, &dmb); + rc = smcd->ops->unregister_dmb(smcd, &dmb); + if (!rc || rc == ISM_ERROR) { + dmb_desc->cpu_addr = NULL; + dmb_desc->dma_addr = 0; + } + + return rc; } int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, @@ -226,6 +236,9 @@ int smc_ism_signal_shutdown(struct smc_link_group *lgr) int rc; union smcd_sw_event_info ev_info; + if (lgr->peer_shutdown) + return 0; + memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; ev_info.code = ISM_EVENT_REQUEST; @@ -289,6 +302,7 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, spin_lock_init(&smcd->lgr_lock); INIT_LIST_HEAD(&smcd->vlan); INIT_LIST_HEAD(&smcd->lgr_list); + init_waitqueue_head(&smcd->lgrs_deleted); smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); if (!smcd->event_wq) { @@ -313,12 +327,12 @@ EXPORT_SYMBOL_GPL(smcd_register_dev); void smcd_unregister_dev(struct smcd_dev *smcd) { spin_lock(&smcd_dev_list.lock); - list_del(&smcd->list); + list_del_init(&smcd->list); spin_unlock(&smcd_dev_list.lock); smcd->going_away = 1; + smc_smcd_terminate_all(smcd); flush_workqueue(smcd->event_wq); destroy_workqueue(smcd->event_wq); - smc_smcd_terminate(smcd, 0, VLAN_VID_MASK); device_del(&smcd->dev); } @@ -372,7 +386,7 @@ void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno) spin_lock_irqsave(&smcd->lock, flags); conn = smcd->conn[dmbno]; - if (conn) + if (conn && !conn->killed) tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index e1918ffaf125..a9f6431dd69a 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -614,7 +614,7 @@ static void smc_llc_testlink_work(struct work_struct *work) rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, SMC_LLC_WAIT_TIME); if (rc <= 0) { - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate(smc_get_lgr(link), true); return; } next_interval = link->llc_testlink_time; @@ -656,6 +656,7 @@ void smc_llc_link_active(struct smc_link *link, int testlink_time) void smc_llc_link_deleting(struct smc_link *link) { link->state = SMC_LNK_DELETING; + smc_wr_wakeup_tx_wait(link); } /* called in tasklet context */ @@ -663,6 +664,8 @@ void smc_llc_link_inactive(struct smc_link *link) { link->state = SMC_LNK_INACTIVE; cancel_delayed_work(&link->llc_testlink_wrk); + smc_wr_wakeup_reg_wait(link); + smc_wr_wakeup_tx_wait(link); } /* called in worker context */ @@ -695,9 +698,11 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc) { - int rc; + int rc = 0; mutex_lock(&link->llc_delete_rkey_mutex); + if (link->state != SMC_LNK_ACTIVE) + goto out; reinit_completion(&link->llc_delete_rkey); rc = smc_llc_send_delete_rkey(link, rmb_desc); if (rc) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 824f096ee7de..0d42e7716b91 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -284,7 +284,7 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) - smc_lgr_terminate(lgr); + smc_lgr_terminate(lgr, true); return rc; } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 50743dc56c86..337ee52ad3d3 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -50,6 +50,26 @@ struct smc_wr_tx_pend { /* control data for a pending send request */ /*------------------------------- completion --------------------------------*/ +/* returns true if at least one tx work request is pending on the given link */ +static inline bool smc_wr_is_tx_pend(struct smc_link *link) +{ + if (find_first_bit(link->wr_tx_mask, link->wr_tx_cnt) != + link->wr_tx_cnt) { + return true; + } + return false; +} + +/* wait till all pending tx work requests on the given link are completed */ +static inline int smc_wr_tx_wait_no_pending_sends(struct smc_link *link) +{ + if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link), + SMC_WR_TX_WAIT_PENDING_TIME)) + return 0; + else /* timeout */ + return -EPIPE; +} + static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id) { u32 i; @@ -75,7 +95,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) link->wr_reg_state = FAILED; else link->wr_reg_state = CONFIRMED; - wake_up(&link->wr_reg_wait); + smc_wr_wakeup_reg_wait(link); return; } @@ -171,6 +191,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, struct smc_rdma_wr **wr_rdma_buf, struct smc_wr_tx_pend_priv **wr_pend_priv) { + struct smc_link_group *lgr = smc_get_lgr(link); struct smc_wr_tx_pend *wr_pend; u32 idx = link->wr_tx_cnt; struct ib_send_wr *wr_ib; @@ -179,19 +200,20 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, *wr_buf = NULL; *wr_pend_priv = NULL; - if (in_softirq()) { + if (in_softirq() || lgr->terminating) { rc = smc_wr_tx_get_free_slot_index(link, &idx); if (rc) return rc; } else { - rc = wait_event_timeout( + rc = wait_event_interruptible_timeout( link->wr_tx_wait, link->state == SMC_LNK_INACTIVE || + lgr->terminating || (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { /* timeout - terminate connections */ - smc_lgr_terminate_sched(smc_get_lgr(link)); + smc_lgr_terminate_sched(lgr); return -EPIPE; } if (idx == link->wr_tx_cnt) @@ -227,6 +249,7 @@ int smc_wr_tx_put_slot(struct smc_link *link, memset(&link->wr_tx_bufs[idx], 0, sizeof(link->wr_tx_bufs[idx])); test_and_clear_bit(idx, link->wr_tx_mask); + wake_up(&link->wr_tx_wait); return 1; } @@ -510,8 +533,10 @@ void smc_wr_free_link(struct smc_link *lnk) { struct ib_device *ibdev; - memset(lnk->wr_tx_mask, 0, - BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); + if (smc_wr_tx_wait_no_pending_sends(lnk)) + memset(lnk->wr_tx_mask, 0, + BITS_TO_LONGS(SMC_WR_BUF_CNT) * + sizeof(*lnk->wr_tx_mask)); if (!lnk->smcibdev) return; diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 09bf32fd3959..3ac99c898418 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -60,6 +60,16 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) atomic_long_set(wr_tx_id, val); } +static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk) +{ + wake_up_all(&lnk->wr_tx_wait); +} + +static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk) +{ + wake_up(&lnk->wr_reg_wait); +} + /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index f41096a759fa..55aeba681cf4 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -87,9 +87,9 @@ int tipc_bcast_get_mtu(struct net *net) return tipc_link_mss(tipc_bc_sndlink(net)); } -void tipc_bcast_disable_rcast(struct net *net) +void tipc_bcast_toggle_rcast(struct net *net, bool supp) { - tipc_bc_base(net)->rcast_support = false; + tipc_bc_base(net)->rcast_support = supp; } static void tipc_bcbase_calc_bc_threshold(struct net *net) diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index dadad953e2be..9e847d9617d3 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -85,7 +85,7 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl); void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id); void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id); int tipc_bcast_get_mtu(struct net *net); -void tipc_bcast_disable_rcast(struct net *net); +void tipc_bcast_toggle_rcast(struct net *net, bool supp); int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, struct tipc_mc_method *method, struct tipc_nlist *dests, u16 *cong_link_cnt); diff --git a/net/tipc/core.c b/net/tipc/core.c index fc01a13d7462..7532a00ac73d 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -34,8 +34,6 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include "core.h" #include "name_table.h" #include "subscr.h" diff --git a/net/tipc/core.h b/net/tipc/core.h index 775848a5f27e..631d83c9705f 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -61,6 +61,12 @@ #include <net/genetlink.h> #include <net/netns/hash.h> +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + struct tipc_node; struct tipc_bearer; struct tipc_bc_base; diff --git a/net/tipc/link.c b/net/tipc/link.c index fb72031228c9..24d4d10756d3 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -550,7 +550,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, /* Disable replicast if even a single peer doesn't support it */ if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST)) - tipc_bcast_disable_rcast(net); + tipc_bcast_toggle_rcast(net, false); return true; } diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 66a65c2cdb23..92d04dc2a44b 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -35,6 +35,7 @@ */ #include <net/sock.h> +#include <linux/list_sort.h> #include "core.h" #include "netlink.h" #include "name_table.h" @@ -66,6 +67,7 @@ struct service_range { /** * struct tipc_service - container for all published instances of a service type * @type: 32 bit 'type' value for service + * @publ_cnt: increasing counter for publications in this service * @ranges: rb tree containing all service ranges for this service * @service_list: links to adjacent name ranges in hash chain * @subscriptions: list of subscriptions for this service type @@ -74,6 +76,7 @@ struct service_range { */ struct tipc_service { u32 type; + u32 publ_cnt; struct rb_root ranges; struct hlist_node service_list; struct list_head subscriptions; @@ -109,6 +112,7 @@ static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper, INIT_LIST_HEAD(&publ->binding_node); INIT_LIST_HEAD(&publ->local_publ); INIT_LIST_HEAD(&publ->all_publ); + INIT_LIST_HEAD(&publ->list); return publ; } @@ -244,6 +248,8 @@ static struct publication *tipc_service_insert_publ(struct net *net, p = tipc_publ_create(type, lower, upper, scope, node, port, key); if (!p) goto err; + /* Suppose there shouldn't be a huge gap btw publs i.e. >INT_MAX */ + p->id = sc->publ_cnt++; if (in_own_node(net, node)) list_add(&p->local_publ, &sr->local_publ); list_add(&p->all_publ, &sr->all_publ); @@ -278,6 +284,20 @@ static struct publication *tipc_service_remove_publ(struct service_range *sr, } /** + * Code reused: time_after32() for the same purpose + */ +#define publication_after(pa, pb) time_after32((pa)->id, (pb)->id) +static int tipc_publ_sort(void *priv, struct list_head *a, + struct list_head *b) +{ + struct publication *pa, *pb; + + pa = container_of(a, struct publication, list); + pb = container_of(b, struct publication, list); + return publication_after(pa, pb); +} + +/** * tipc_service_subscribe - attach a subscription, and optionally * issue the prescribed number of events if there is any service * range overlapping with the requested range @@ -286,36 +306,51 @@ static void tipc_service_subscribe(struct tipc_service *service, struct tipc_subscription *sub) { struct tipc_subscr *sb = &sub->evt.s; + struct publication *p, *first, *tmp; + struct list_head publ_list; struct service_range *sr; struct tipc_name_seq ns; - struct publication *p; struct rb_node *n; - bool first; + u32 filter; ns.type = tipc_sub_read(sb, seq.type); ns.lower = tipc_sub_read(sb, seq.lower); ns.upper = tipc_sub_read(sb, seq.upper); + filter = tipc_sub_read(sb, filter); tipc_sub_get(sub); list_add(&sub->service_list, &service->subscriptions); - if (tipc_sub_read(sb, filter) & TIPC_SUB_NO_STATUS) + if (filter & TIPC_SUB_NO_STATUS) return; + INIT_LIST_HEAD(&publ_list); for (n = rb_first(&service->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); if (sr->lower > ns.upper) break; if (!tipc_sub_check_overlap(&ns, sr->lower, sr->upper)) continue; - first = true; + first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { - tipc_sub_report_overlap(sub, sr->lower, sr->upper, - TIPC_PUBLISHED, p->port, - p->node, p->scope, first); - first = false; + if (filter & TIPC_SUB_PORTS) + list_add_tail(&p->list, &publ_list); + else if (!first || publication_after(first, p)) + /* Pick this range's *first* publication */ + first = p; } + if (first) + list_add_tail(&first->list, &publ_list); + } + + /* Sort the publications before reporting */ + list_sort(NULL, &publ_list, tipc_publ_sort); + list_for_each_entry_safe(p, tmp, &publ_list, list) { + tipc_sub_report_overlap(sub, p->lower, p->upper, + TIPC_PUBLISHED, p->port, p->node, + p->scope, true); + list_del_init(&p->list); } } diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index f79066334cc8..728bc7016c38 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -58,6 +58,7 @@ struct tipc_group; * @node: network address of publishing socket's node * @port: publishing port * @key: publication key, unique across the cluster + * @id: publication id * @binding_node: all publications from the same node which bound this one * - Remote publications: in node->publ_list * Used by node/name distr to withdraw publications when node is lost @@ -69,6 +70,7 @@ struct tipc_group; * Used by closest_first and multicast receive lookup algorithms * @all_publ: all publications identical to this one, whatever node and scope * Used by round-robin lookup algorithm + * @list: to form a list of publications in temporal order * @rcu: RCU callback head used for deferred freeing */ struct publication { @@ -79,10 +81,12 @@ struct publication { u32 node; u32 port; u32 key; + u32 id; struct list_head binding_node; struct list_head binding_sock; struct list_head local_publ; struct list_head all_publ; + struct list_head list; struct rcu_head rcu; }; diff --git a/net/tipc/node.c b/net/tipc/node.c index aaf595613e6e..ab04e00cb95b 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -496,6 +496,9 @@ update: tn->capabilities &= temp_node->capabilities; } + tipc_bcast_toggle_rcast(net, + (tn->capabilities & TIPC_BCAST_RCAST)); + goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); @@ -557,6 +560,7 @@ update: list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } + tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); trace_tipc_node_create(n, true, " "); exit: spin_unlock_bh(&tn->node_list_lock); @@ -740,7 +744,8 @@ static bool tipc_node_cleanup(struct tipc_node *peer) list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } - + tipc_bcast_toggle_rcast(peer->net, + (tn->capabilities & TIPC_BCAST_RCAST)); spin_unlock_bh(&tn->node_list_lock); return deleted; } @@ -2198,6 +2203,7 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } + tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); err = 0; err_out: tipc_node_put(peer); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 5d7859aac78e..a1c8d722ca20 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2880,7 +2880,7 @@ static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid) struct tipc_sock *tsk; rcu_read_lock(); - tsk = rhashtable_lookup_fast(&tn->sk_rht, &portid, tsk_rht_params); + tsk = rhashtable_lookup(&tn->sk_rht, &portid, tsk_rht_params); if (tsk) sock_hold(&tsk->sk); rcu_read_unlock(); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 5bb93dd5762b..bdca31ffe6da 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -861,6 +861,7 @@ static int __init tls_register(void) tls_sw_proto_ops = inet_stream_ops; tls_sw_proto_ops.splice_read = tls_sw_splice_read; + tls_sw_proto_ops.sendpage_locked = tls_sw_sendpage_locked, tls_device_init(); tcp_register_ulp(&tcp_tls_ulp_ops); diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c index 83d9c80a684e..3a5dd1e07233 100644 --- a/net/tls/tls_proc.c +++ b/net/tls/tls_proc.c @@ -6,6 +6,7 @@ #include <net/snmp.h> #include <net/tls.h> +#ifdef CONFIG_PROC_FS static const struct snmp_mib tls_mib_list[] = { SNMP_MIB_ITEM("TlsCurrTxSw", LINUX_MIB_TLSCURRTXSW), SNMP_MIB_ITEM("TlsCurrRxSw", LINUX_MIB_TLSCURRRXSW), @@ -32,6 +33,7 @@ static int tls_statistics_seq_show(struct seq_file *seq, void *v) return 0; } +#endif int __net_init tls_proc_init(struct net *net) { diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 141da093ff04..da9f9ce51e7b 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1209,6 +1209,17 @@ sendpage_end: return copied ? copied : ret; } +int tls_sw_sendpage_locked(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY | + MSG_NO_SHARED_FRAGS)) + return -ENOTSUPP; + + return tls_sw_do_sendpage(sk, page, offset, size, flags); +} + int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 1f4fde4711b6..74db4cd637a7 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -126,19 +126,18 @@ static struct proto vsock_proto = { */ #define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) -static const struct vsock_transport *transport; +#define VSOCK_DEFAULT_BUFFER_SIZE (1024 * 256) +#define VSOCK_DEFAULT_BUFFER_MAX_SIZE (1024 * 256) +#define VSOCK_DEFAULT_BUFFER_MIN_SIZE 128 + +/* Transport used for host->guest communication */ +static const struct vsock_transport *transport_h2g; +/* Transport used for guest->host communication */ +static const struct vsock_transport *transport_g2h; +/* Transport used for DGRAM communication */ +static const struct vsock_transport *transport_dgram; static DEFINE_MUTEX(vsock_register_mutex); -/**** EXPORTS ****/ - -/* Get the ID of the local context. This is transport dependent. */ - -int vm_sockets_get_local_cid(void) -{ - return transport->get_local_cid(); -} -EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid); - /**** UTILS ****/ /* Each bound VSocket is stored in the bind hash table and each connected @@ -188,7 +187,7 @@ static int vsock_auto_bind(struct vsock_sock *vsk) return __vsock_bind(sk, &local_addr); } -static int __init vsock_init_tables(void) +static void vsock_init_tables(void) { int i; @@ -197,7 +196,6 @@ static int __init vsock_init_tables(void) for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) INIT_LIST_HEAD(&vsock_connected_table[i]); - return 0; } static void __vsock_insert_bound(struct list_head *list, @@ -230,9 +228,15 @@ static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr) { struct vsock_sock *vsk; - list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) - if (addr->svm_port == vsk->local_addr.svm_port) + list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) { + if (vsock_addr_equals_addr(addr, &vsk->local_addr)) + return sk_vsock(vsk); + + if (addr->svm_port == vsk->local_addr.svm_port && + (vsk->local_addr.svm_cid == VMADDR_CID_ANY || + addr->svm_cid == VMADDR_CID_ANY)) return sk_vsock(vsk); + } return NULL; } @@ -382,6 +386,88 @@ void vsock_enqueue_accept(struct sock *listener, struct sock *connected) } EXPORT_SYMBOL_GPL(vsock_enqueue_accept); +static void vsock_deassign_transport(struct vsock_sock *vsk) +{ + if (!vsk->transport) + return; + + vsk->transport->destruct(vsk); + module_put(vsk->transport->module); + vsk->transport = NULL; +} + +/* Assign a transport to a socket and call the .init transport callback. + * + * Note: for stream socket this must be called when vsk->remote_addr is set + * (e.g. during the connect() or when a connection request on a listener + * socket is received). + * The vsk->remote_addr is used to decide which transport to use: + * - remote CID <= VMADDR_CID_HOST will use guest->host transport; + * - remote CID == local_cid (guest->host transport) will use guest->host + * transport for loopback (host->guest transports don't support loopback); + * - remote CID > VMADDR_CID_HOST will use host->guest transport; + */ +int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) +{ + const struct vsock_transport *new_transport; + struct sock *sk = sk_vsock(vsk); + unsigned int remote_cid = vsk->remote_addr.svm_cid; + int ret; + + switch (sk->sk_type) { + case SOCK_DGRAM: + new_transport = transport_dgram; + break; + case SOCK_STREAM: + if (remote_cid <= VMADDR_CID_HOST || + (transport_g2h && + remote_cid == transport_g2h->get_local_cid())) + new_transport = transport_g2h; + else + new_transport = transport_h2g; + break; + default: + return -ESOCKTNOSUPPORT; + } + + if (vsk->transport) { + if (vsk->transport == new_transport) + return 0; + + vsk->transport->release(vsk); + vsock_deassign_transport(vsk); + } + + /* We increase the module refcnt to prevent the transport unloading + * while there are open sockets assigned to it. + */ + if (!new_transport || !try_module_get(new_transport->module)) + return -ENODEV; + + ret = new_transport->init(vsk, psk); + if (ret) { + module_put(new_transport->module); + return ret; + } + + vsk->transport = new_transport; + + return 0; +} +EXPORT_SYMBOL_GPL(vsock_assign_transport); + +bool vsock_find_cid(unsigned int cid) +{ + if (transport_g2h && cid == transport_g2h->get_local_cid()) + return true; + + if (transport_h2g && cid == VMADDR_CID_HOST) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(vsock_find_cid); + static struct sock *vsock_dequeue_accept(struct sock *listener) { struct vsock_sock *vlistener; @@ -418,7 +504,12 @@ static bool vsock_is_pending(struct sock *sk) static int vsock_send_shutdown(struct sock *sk, int mode) { - return transport->shutdown(vsock_sk(sk), mode); + struct vsock_sock *vsk = vsock_sk(sk); + + if (!vsk->transport) + return -ENODEV; + + return vsk->transport->shutdown(vsk, mode); } static void vsock_pending_work(struct work_struct *work) @@ -528,13 +619,12 @@ static int __vsock_bind_stream(struct vsock_sock *vsk, static int __vsock_bind_dgram(struct vsock_sock *vsk, struct sockaddr_vm *addr) { - return transport->dgram_bind(vsk, addr); + return vsk->transport->dgram_bind(vsk, addr); } static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) { struct vsock_sock *vsk = vsock_sk(sk); - u32 cid; int retval; /* First ensure this socket isn't already bound. */ @@ -544,10 +634,9 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) /* Now bind to the provided address or select appropriate values if * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that * like AF_INET prevents binding to a non-local IP address (in most - * cases), we only allow binding to the local CID. + * cases), we only allow binding to a local CID. */ - cid = transport->get_local_cid(); - if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY) + if (addr->svm_cid != VMADDR_CID_ANY && !vsock_find_cid(addr->svm_cid)) return -EADDRNOTAVAIL; switch (sk->sk_socket->type) { @@ -571,12 +660,12 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) static void vsock_connect_timeout(struct work_struct *work); -struct sock *__vsock_create(struct net *net, - struct socket *sock, - struct sock *parent, - gfp_t priority, - unsigned short type, - int kern) +static struct sock *__vsock_create(struct net *net, + struct socket *sock, + struct sock *parent, + gfp_t priority, + unsigned short type, + int kern) { struct sock *sk; struct vsock_sock *psk; @@ -620,23 +709,20 @@ struct sock *__vsock_create(struct net *net, vsk->trusted = psk->trusted; vsk->owner = get_cred(psk->owner); vsk->connect_timeout = psk->connect_timeout; + vsk->buffer_size = psk->buffer_size; + vsk->buffer_min_size = psk->buffer_min_size; + vsk->buffer_max_size = psk->buffer_max_size; } else { vsk->trusted = capable(CAP_NET_ADMIN); vsk->owner = get_current_cred(); vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; + vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE; + vsk->buffer_min_size = VSOCK_DEFAULT_BUFFER_MIN_SIZE; + vsk->buffer_max_size = VSOCK_DEFAULT_BUFFER_MAX_SIZE; } - if (transport->init(vsk, psk) < 0) { - sk_free(sk); - return NULL; - } - - if (sock) - vsock_insert_unbound(vsk); - return sk; } -EXPORT_SYMBOL_GPL(__vsock_create); static void __vsock_release(struct sock *sk, int level) { @@ -650,7 +736,10 @@ static void __vsock_release(struct sock *sk, int level) /* The release call is supposed to use lock_sock_nested() * rather than lock_sock(), if a sock lock should be acquired. */ - transport->release(vsk); + if (vsk->transport) + vsk->transport->release(vsk); + else if (sk->sk_type == SOCK_STREAM) + vsock_remove_sock(vsk); /* When "level" is SINGLE_DEPTH_NESTING, use the nested * version to avoid the warning "possible recursive locking @@ -678,7 +767,7 @@ static void vsock_sk_destruct(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); - transport->destruct(vsk); + vsock_deassign_transport(vsk); /* When clearing these addresses, there's no need to set the family and * possibly register the address family with the kernel. @@ -700,15 +789,22 @@ static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return err; } +struct sock *vsock_create_connected(struct sock *parent) +{ + return __vsock_create(sock_net(parent), NULL, parent, GFP_KERNEL, + parent->sk_type, 0); +} +EXPORT_SYMBOL_GPL(vsock_create_connected); + s64 vsock_stream_has_data(struct vsock_sock *vsk) { - return transport->stream_has_data(vsk); + return vsk->transport->stream_has_data(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_data); s64 vsock_stream_has_space(struct vsock_sock *vsk) { - return transport->stream_has_space(vsk); + return vsk->transport->stream_has_space(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_space); @@ -877,6 +973,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; } else if (sock->type == SOCK_STREAM) { + const struct vsock_transport *transport = vsk->transport; lock_sock(sk); /* Listening sockets that have connections in their accept @@ -887,7 +984,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, mask |= EPOLLIN | EPOLLRDNORM; /* If there is something in the queue then we can read. */ - if (transport->stream_is_active(vsk) && + if (transport && transport->stream_is_active(vsk) && !(sk->sk_shutdown & RCV_SHUTDOWN)) { bool data_ready_now = false; int ret = transport->notify_poll_in( @@ -952,6 +1049,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, struct sock *sk; struct vsock_sock *vsk; struct sockaddr_vm *remote_addr; + const struct vsock_transport *transport; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; @@ -960,6 +1058,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, err = 0; sk = sock->sk; vsk = vsock_sk(sk); + transport = vsk->transport; lock_sock(sk); @@ -1044,8 +1143,8 @@ static int vsock_dgram_connect(struct socket *sock, if (err) goto out; - if (!transport->dgram_allow(remote_addr->svm_cid, - remote_addr->svm_port)) { + if (!vsk->transport->dgram_allow(remote_addr->svm_cid, + remote_addr->svm_port)) { err = -EINVAL; goto out; } @@ -1061,7 +1160,9 @@ out: static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { - return transport->dgram_dequeue(vsock_sk(sock->sk), msg, len, flags); + struct vsock_sock *vsk = vsock_sk(sock->sk); + + return vsk->transport->dgram_dequeue(vsk, msg, len, flags); } static const struct proto_ops vsock_dgram_ops = { @@ -1087,6 +1188,8 @@ static const struct proto_ops vsock_dgram_ops = { static int vsock_transport_cancel_pkt(struct vsock_sock *vsk) { + const struct vsock_transport *transport = vsk->transport; + if (!transport->cancel_pkt) return -EOPNOTSUPP; @@ -1123,6 +1226,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, int err; struct sock *sk; struct vsock_sock *vsk; + const struct vsock_transport *transport; struct sockaddr_vm *remote_addr; long timeout; DEFINE_WAIT(wait); @@ -1157,19 +1261,26 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, goto out; } + /* Set the remote address that we are connecting to. */ + memcpy(&vsk->remote_addr, remote_addr, + sizeof(vsk->remote_addr)); + + err = vsock_assign_transport(vsk, NULL); + if (err) + goto out; + + transport = vsk->transport; + /* The hypervisor and well-known contexts do not have socket * endpoints. */ - if (!transport->stream_allow(remote_addr->svm_cid, + if (!transport || + !transport->stream_allow(remote_addr->svm_cid, remote_addr->svm_port)) { err = -ENETUNREACH; goto out; } - /* Set the remote address that we are connecting to. */ - memcpy(&vsk->remote_addr, remote_addr, - sizeof(vsk->remote_addr)); - err = vsock_auto_bind(vsk); if (err) goto out; @@ -1364,6 +1475,23 @@ out: return err; } +static void vsock_update_buffer_size(struct vsock_sock *vsk, + const struct vsock_transport *transport, + u64 val) +{ + if (val > vsk->buffer_max_size) + val = vsk->buffer_max_size; + + if (val < vsk->buffer_min_size) + val = vsk->buffer_min_size; + + if (val != vsk->buffer_size && + transport && transport->notify_buffer_size) + transport->notify_buffer_size(vsk, &val); + + vsk->buffer_size = val; +} + static int vsock_stream_setsockopt(struct socket *sock, int level, int optname, @@ -1373,6 +1501,7 @@ static int vsock_stream_setsockopt(struct socket *sock, int err; struct sock *sk; struct vsock_sock *vsk; + const struct vsock_transport *transport; u64 val; if (level != AF_VSOCK) @@ -1393,23 +1522,26 @@ static int vsock_stream_setsockopt(struct socket *sock, err = 0; sk = sock->sk; vsk = vsock_sk(sk); + transport = vsk->transport; lock_sock(sk); switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: COPY_IN(val); - transport->set_buffer_size(vsk, val); + vsock_update_buffer_size(vsk, transport, val); break; case SO_VM_SOCKETS_BUFFER_MAX_SIZE: COPY_IN(val); - transport->set_max_buffer_size(vsk, val); + vsk->buffer_max_size = val; + vsock_update_buffer_size(vsk, transport, vsk->buffer_size); break; case SO_VM_SOCKETS_BUFFER_MIN_SIZE: COPY_IN(val); - transport->set_min_buffer_size(vsk, val); + vsk->buffer_min_size = val; + vsock_update_buffer_size(vsk, transport, vsk->buffer_size); break; case SO_VM_SOCKETS_CONNECT_TIMEOUT: { @@ -1476,17 +1608,17 @@ static int vsock_stream_getsockopt(struct socket *sock, switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: - val = transport->get_buffer_size(vsk); + val = vsk->buffer_size; COPY_OUT(val); break; case SO_VM_SOCKETS_BUFFER_MAX_SIZE: - val = transport->get_max_buffer_size(vsk); + val = vsk->buffer_max_size; COPY_OUT(val); break; case SO_VM_SOCKETS_BUFFER_MIN_SIZE: - val = transport->get_min_buffer_size(vsk); + val = vsk->buffer_min_size; COPY_OUT(val); break; @@ -1517,6 +1649,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk; struct vsock_sock *vsk; + const struct vsock_transport *transport; ssize_t total_written; long timeout; int err; @@ -1525,6 +1658,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, sk = sock->sk; vsk = vsock_sk(sk); + transport = vsk->transport; total_written = 0; err = 0; @@ -1546,7 +1680,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if (sk->sk_state != TCP_ESTABLISHED || + if (!transport || sk->sk_state != TCP_ESTABLISHED || !vsock_addr_bound(&vsk->local_addr)) { err = -ENOTCONN; goto out; @@ -1656,6 +1790,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, { struct sock *sk; struct vsock_sock *vsk; + const struct vsock_transport *transport; int err; size_t target; ssize_t copied; @@ -1666,11 +1801,12 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, sk = sock->sk; vsk = vsock_sk(sk); + transport = vsk->transport; err = 0; lock_sock(sk); - if (sk->sk_state != TCP_ESTABLISHED) { + if (!transport || sk->sk_state != TCP_ESTABLISHED) { /* Recvmsg is supposed to return 0 if a peer performs an * orderly shutdown. Differentiate between that case and when a * peer has not connected or a local shutdown occured with the @@ -1844,6 +1980,10 @@ static const struct proto_ops vsock_stream_ops = { static int vsock_create(struct net *net, struct socket *sock, int protocol, int kern) { + struct vsock_sock *vsk; + struct sock *sk; + int ret; + if (!sock) return -EINVAL; @@ -1863,7 +2003,23 @@ static int vsock_create(struct net *net, struct socket *sock, sock->state = SS_UNCONNECTED; - return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ? 0 : -ENOMEM; + sk = __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern); + if (!sk) + return -ENOMEM; + + vsk = vsock_sk(sk); + + if (sock->type == SOCK_DGRAM) { + ret = vsock_assign_transport(vsk, NULL); + if (ret < 0) { + sock_put(sk); + return ret; + } + } + + vsock_insert_unbound(vsk); + + return 0; } static const struct net_proto_family vsock_family_ops = { @@ -1876,11 +2032,20 @@ static long vsock_dev_do_ioctl(struct file *filp, unsigned int cmd, void __user *ptr) { u32 __user *p = ptr; + u32 cid = VMADDR_CID_ANY; int retval = 0; switch (cmd) { case IOCTL_VM_SOCKETS_GET_LOCAL_CID: - if (put_user(transport->get_local_cid(), p) != 0) + /* To be compatible with the VMCI behavior, we prioritize the + * guest CID instead of well-know host CID (VMADDR_CID_HOST). + */ + if (transport_g2h) + cid = transport_g2h->get_local_cid(); + else if (transport_h2g) + cid = transport_h2g->get_local_cid(); + + if (put_user(cid, p) != 0) retval = -EFAULT; break; @@ -1920,24 +2085,13 @@ static struct miscdevice vsock_device = { .fops = &vsock_device_ops, }; -int __vsock_core_init(const struct vsock_transport *t, struct module *owner) +static int __init vsock_init(void) { - int err = mutex_lock_interruptible(&vsock_register_mutex); + int err = 0; - if (err) - return err; - - if (transport) { - err = -EBUSY; - goto err_busy; - } - - /* Transport must be the owner of the protocol so that it can't - * unload while there are open sockets. - */ - vsock_proto.owner = owner; - transport = t; + vsock_init_tables(); + vsock_proto.owner = THIS_MODULE; vsock_device.minor = MISC_DYNAMIC_MINOR; err = misc_register(&vsock_device); if (err) { @@ -1958,7 +2112,6 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner) goto err_unregister_proto; } - mutex_unlock(&vsock_register_mutex); return 0; err_unregister_proto: @@ -1966,44 +2119,86 @@ err_unregister_proto: err_deregister_misc: misc_deregister(&vsock_device); err_reset_transport: - transport = NULL; -err_busy: - mutex_unlock(&vsock_register_mutex); return err; } -EXPORT_SYMBOL_GPL(__vsock_core_init); -void vsock_core_exit(void) +static void __exit vsock_exit(void) { - mutex_lock(&vsock_register_mutex); - misc_deregister(&vsock_device); sock_unregister(AF_VSOCK); proto_unregister(&vsock_proto); - - /* We do not want the assignment below re-ordered. */ - mb(); - transport = NULL; - - mutex_unlock(&vsock_register_mutex); } -EXPORT_SYMBOL_GPL(vsock_core_exit); -const struct vsock_transport *vsock_core_get_transport(void) +const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk) { - /* vsock_register_mutex not taken since only the transport uses this - * function and only while registered. - */ - return transport; + return vsk->transport; } EXPORT_SYMBOL_GPL(vsock_core_get_transport); -static void __exit vsock_exit(void) +int vsock_core_register(const struct vsock_transport *t, int features) +{ + const struct vsock_transport *t_h2g, *t_g2h, *t_dgram; + int err = mutex_lock_interruptible(&vsock_register_mutex); + + if (err) + return err; + + t_h2g = transport_h2g; + t_g2h = transport_g2h; + t_dgram = transport_dgram; + + if (features & VSOCK_TRANSPORT_F_H2G) { + if (t_h2g) { + err = -EBUSY; + goto err_busy; + } + t_h2g = t; + } + + if (features & VSOCK_TRANSPORT_F_G2H) { + if (t_g2h) { + err = -EBUSY; + goto err_busy; + } + t_g2h = t; + } + + if (features & VSOCK_TRANSPORT_F_DGRAM) { + if (t_dgram) { + err = -EBUSY; + goto err_busy; + } + t_dgram = t; + } + + transport_h2g = t_h2g; + transport_g2h = t_g2h; + transport_dgram = t_dgram; + +err_busy: + mutex_unlock(&vsock_register_mutex); + return err; +} +EXPORT_SYMBOL_GPL(vsock_core_register); + +void vsock_core_unregister(const struct vsock_transport *t) { - /* Do nothing. This function makes this module removable. */ + mutex_lock(&vsock_register_mutex); + + if (transport_h2g == t) + transport_h2g = NULL; + + if (transport_g2h == t) + transport_g2h = NULL; + + if (transport_dgram == t) + transport_dgram = NULL; + + mutex_unlock(&vsock_register_mutex); } +EXPORT_SYMBOL_GPL(vsock_core_unregister); -module_init(vsock_init_tables); +module_init(vsock_init); module_exit(vsock_exit); MODULE_AUTHOR("VMware, Inc."); diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 7fa09c5e4625..3c7d07a99fc5 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -165,6 +165,8 @@ static const guid_t srv_id_template = GUID_INIT(0x00000000, 0xfacb, 0x11e6, 0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3); +static bool hvs_check_transport(struct vsock_sock *vsk); + static bool is_valid_srv_id(const guid_t *id) { return !memcmp(&id->b[4], &srv_id_template.b[4], sizeof(guid_t) - 4); @@ -188,7 +190,8 @@ static void hvs_remote_addr_init(struct sockaddr_vm *remote, static u32 host_ephemeral_port = MIN_HOST_EPHEMERAL_PORT; struct sock *sk; - vsock_addr_init(remote, VMADDR_CID_ANY, VMADDR_PORT_ANY); + /* Remote peer is always the host */ + vsock_addr_init(remote, VMADDR_CID_HOST, VMADDR_PORT_ANY); while (1) { /* Wrap around ? */ @@ -360,13 +363,24 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog) goto out; - new = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, - sk->sk_type, 0); + new = vsock_create_connected(sk); if (!new) goto out; new->sk_state = TCP_SYN_SENT; vnew = vsock_sk(new); + + hvs_addr_init(&vnew->local_addr, if_type); + hvs_remote_addr_init(&vnew->remote_addr, &vnew->local_addr); + + ret = vsock_assign_transport(vnew, vsock_sk(sk)); + /* Transport assigned (looking at remote_addr) must be the + * same where we received the request. + */ + if (ret || !hvs_check_transport(vnew)) { + sock_put(new); + goto out; + } hvs_new = vnew->trans; hvs_new->chan = chan; } else { @@ -430,9 +444,6 @@ static void hvs_open_connection(struct vmbus_channel *chan) new->sk_state = TCP_ESTABLISHED; sk_acceptq_added(sk); - hvs_addr_init(&vnew->local_addr, if_type); - hvs_remote_addr_init(&vnew->remote_addr, &vnew->local_addr); - hvs_new->vm_srv_id = *if_type; hvs_new->host_srv_id = *if_instance; @@ -845,37 +856,9 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, ssize_t written, return 0; } -static void hvs_set_buffer_size(struct vsock_sock *vsk, u64 val) -{ - /* Ignored. */ -} - -static void hvs_set_min_buffer_size(struct vsock_sock *vsk, u64 val) -{ - /* Ignored. */ -} - -static void hvs_set_max_buffer_size(struct vsock_sock *vsk, u64 val) -{ - /* Ignored. */ -} - -static u64 hvs_get_buffer_size(struct vsock_sock *vsk) -{ - return -ENOPROTOOPT; -} - -static u64 hvs_get_min_buffer_size(struct vsock_sock *vsk) -{ - return -ENOPROTOOPT; -} - -static u64 hvs_get_max_buffer_size(struct vsock_sock *vsk) -{ - return -ENOPROTOOPT; -} - static struct vsock_transport hvs_transport = { + .module = THIS_MODULE, + .get_local_cid = hvs_get_local_cid, .init = hvs_sock_init, @@ -908,14 +891,13 @@ static struct vsock_transport hvs_transport = { .notify_send_pre_enqueue = hvs_notify_send_pre_enqueue, .notify_send_post_enqueue = hvs_notify_send_post_enqueue, - .set_buffer_size = hvs_set_buffer_size, - .set_min_buffer_size = hvs_set_min_buffer_size, - .set_max_buffer_size = hvs_set_max_buffer_size, - .get_buffer_size = hvs_get_buffer_size, - .get_min_buffer_size = hvs_get_min_buffer_size, - .get_max_buffer_size = hvs_get_max_buffer_size, }; +static bool hvs_check_transport(struct vsock_sock *vsk) +{ + return vsk->transport == &hvs_transport; +} + static int hvs_probe(struct hv_device *hdev, const struct hv_vmbus_device_id *dev_id) { @@ -964,7 +946,7 @@ static int __init hvs_init(void) if (ret != 0) return ret; - ret = vsock_core_init(&hvs_transport); + ret = vsock_core_register(&hvs_transport, VSOCK_TRANSPORT_F_G2H); if (ret) { vmbus_driver_unregister(&hvs_drv); return ret; @@ -975,7 +957,7 @@ static int __init hvs_init(void) static void __exit hvs_exit(void) { - vsock_core_exit(); + vsock_core_unregister(&hvs_transport); vmbus_driver_unregister(&hvs_drv); } diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 082a30936690..1458c5c8b64d 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -86,33 +86,6 @@ out_rcu: return ret; } -static void virtio_transport_loopback_work(struct work_struct *work) -{ - struct virtio_vsock *vsock = - container_of(work, struct virtio_vsock, loopback_work); - LIST_HEAD(pkts); - - spin_lock_bh(&vsock->loopback_list_lock); - list_splice_init(&vsock->loopback_list, &pkts); - spin_unlock_bh(&vsock->loopback_list_lock); - - mutex_lock(&vsock->rx_lock); - - if (!vsock->rx_run) - goto out; - - while (!list_empty(&pkts)) { - struct virtio_vsock_pkt *pkt; - - pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list); - list_del_init(&pkt->list); - - virtio_transport_recv_pkt(pkt); - } -out: - mutex_unlock(&vsock->rx_lock); -} - static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock, struct virtio_vsock_pkt *pkt) { @@ -370,59 +343,6 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock) return val < virtqueue_get_vring_size(vq); } -static void virtio_transport_rx_work(struct work_struct *work) -{ - struct virtio_vsock *vsock = - container_of(work, struct virtio_vsock, rx_work); - struct virtqueue *vq; - - vq = vsock->vqs[VSOCK_VQ_RX]; - - mutex_lock(&vsock->rx_lock); - - if (!vsock->rx_run) - goto out; - - do { - virtqueue_disable_cb(vq); - for (;;) { - struct virtio_vsock_pkt *pkt; - unsigned int len; - - if (!virtio_transport_more_replies(vsock)) { - /* Stop rx until the device processes already - * pending replies. Leave rx virtqueue - * callbacks disabled. - */ - goto out; - } - - pkt = virtqueue_get_buf(vq, &len); - if (!pkt) { - break; - } - - vsock->rx_buf_nr--; - - /* Drop short/long packets */ - if (unlikely(len < sizeof(pkt->hdr) || - len > sizeof(pkt->hdr) + pkt->len)) { - virtio_transport_free_pkt(pkt); - continue; - } - - pkt->len = len - sizeof(pkt->hdr); - virtio_transport_deliver_tap_pkt(pkt); - virtio_transport_recv_pkt(pkt); - } - } while (!virtqueue_enable_cb(vq)); - -out: - if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2) - virtio_vsock_rx_fill(vsock); - mutex_unlock(&vsock->rx_lock); -} - /* event_lock must be held */ static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock, struct virtio_vsock_event *event) @@ -542,6 +462,8 @@ static void virtio_vsock_rx_done(struct virtqueue *vq) static struct virtio_transport virtio_transport = { .transport = { + .module = THIS_MODULE, + .get_local_cid = virtio_transport_get_local_cid, .init = virtio_transport_do_socket_init, @@ -574,18 +496,92 @@ static struct virtio_transport virtio_transport = { .notify_send_pre_block = virtio_transport_notify_send_pre_block, .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, - - .set_buffer_size = virtio_transport_set_buffer_size, - .set_min_buffer_size = virtio_transport_set_min_buffer_size, - .set_max_buffer_size = virtio_transport_set_max_buffer_size, - .get_buffer_size = virtio_transport_get_buffer_size, - .get_min_buffer_size = virtio_transport_get_min_buffer_size, - .get_max_buffer_size = virtio_transport_get_max_buffer_size, + .notify_buffer_size = virtio_transport_notify_buffer_size, }, .send_pkt = virtio_transport_send_pkt, }; +static void virtio_transport_loopback_work(struct work_struct *work) +{ + struct virtio_vsock *vsock = + container_of(work, struct virtio_vsock, loopback_work); + LIST_HEAD(pkts); + + spin_lock_bh(&vsock->loopback_list_lock); + list_splice_init(&vsock->loopback_list, &pkts); + spin_unlock_bh(&vsock->loopback_list_lock); + + mutex_lock(&vsock->rx_lock); + + if (!vsock->rx_run) + goto out; + + while (!list_empty(&pkts)) { + struct virtio_vsock_pkt *pkt; + + pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list); + list_del_init(&pkt->list); + + virtio_transport_recv_pkt(&virtio_transport, pkt); + } +out: + mutex_unlock(&vsock->rx_lock); +} + +static void virtio_transport_rx_work(struct work_struct *work) +{ + struct virtio_vsock *vsock = + container_of(work, struct virtio_vsock, rx_work); + struct virtqueue *vq; + + vq = vsock->vqs[VSOCK_VQ_RX]; + + mutex_lock(&vsock->rx_lock); + + if (!vsock->rx_run) + goto out; + + do { + virtqueue_disable_cb(vq); + for (;;) { + struct virtio_vsock_pkt *pkt; + unsigned int len; + + if (!virtio_transport_more_replies(vsock)) { + /* Stop rx until the device processes already + * pending replies. Leave rx virtqueue + * callbacks disabled. + */ + goto out; + } + + pkt = virtqueue_get_buf(vq, &len); + if (!pkt) { + break; + } + + vsock->rx_buf_nr--; + + /* Drop short/long packets */ + if (unlikely(len < sizeof(pkt->hdr) || + len > sizeof(pkt->hdr) + pkt->len)) { + virtio_transport_free_pkt(pkt); + continue; + } + + pkt->len = len - sizeof(pkt->hdr); + virtio_transport_deliver_tap_pkt(pkt); + virtio_transport_recv_pkt(&virtio_transport, pkt); + } + } while (!virtqueue_enable_cb(vq)); + +out: + if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2) + virtio_vsock_rx_fill(vsock); + mutex_unlock(&vsock->rx_lock); +} + static int virtio_vsock_probe(struct virtio_device *vdev) { vq_callback_t *callbacks[] = { @@ -776,7 +772,8 @@ static int __init virtio_vsock_init(void) if (!virtio_vsock_workqueue) return -ENOMEM; - ret = vsock_core_init(&virtio_transport.transport); + ret = vsock_core_register(&virtio_transport.transport, + VSOCK_TRANSPORT_F_G2H); if (ret) goto out_wq; @@ -787,7 +784,7 @@ static int __init virtio_vsock_init(void) return 0; out_vci: - vsock_core_exit(); + vsock_core_unregister(&virtio_transport.transport); out_wq: destroy_workqueue(virtio_vsock_workqueue); return ret; @@ -796,7 +793,7 @@ out_wq: static void __exit virtio_vsock_exit(void) { unregister_virtio_driver(&virtio_vsock_driver); - vsock_core_exit(); + vsock_core_unregister(&virtio_transport.transport); destroy_workqueue(virtio_vsock_workqueue); } diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 828edd88488c..e5ea29c6bca7 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -29,9 +29,10 @@ /* Threshold for detecting small packets to copy */ #define GOOD_COPY_LEN 128 -static const struct virtio_transport *virtio_transport_get_ops(void) +static const struct virtio_transport * +virtio_transport_get_ops(struct vsock_sock *vsk) { - const struct vsock_transport *t = vsock_core_get_transport(); + const struct vsock_transport *t = vsock_core_get_transport(vsk); return container_of(t, struct virtio_transport, transport); } @@ -168,7 +169,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, struct virtio_vsock_pkt *pkt; u32 pkt_len = info->pkt_len; - src_cid = vm_sockets_get_local_cid(); + src_cid = virtio_transport_get_ops(vsk)->transport.get_local_cid(); src_port = vsk->local_addr.svm_port; if (!info->remote_cid) { dst_cid = vsk->remote_addr.svm_cid; @@ -201,7 +202,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, virtio_transport_inc_tx_pkt(vvs, pkt); - return virtio_transport_get_ops()->send_pkt(pkt); + return virtio_transport_get_ops(vsk)->send_pkt(pkt); } static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, @@ -452,20 +453,16 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk, vsk->trans = vvs; vvs->vsk = vsk; - if (psk) { + if (psk && psk->trans) { struct virtio_vsock_sock *ptrans = psk->trans; - vvs->buf_size = ptrans->buf_size; - vvs->buf_size_min = ptrans->buf_size_min; - vvs->buf_size_max = ptrans->buf_size_max; vvs->peer_buf_alloc = ptrans->peer_buf_alloc; - } else { - vvs->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE; - vvs->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE; - vvs->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE; } - vvs->buf_alloc = vvs->buf_size; + if (vsk->buffer_size > VIRTIO_VSOCK_MAX_BUF_SIZE) + vsk->buffer_size = VIRTIO_VSOCK_MAX_BUF_SIZE; + + vvs->buf_alloc = vsk->buffer_size; spin_lock_init(&vvs->rx_lock); spin_lock_init(&vvs->tx_lock); @@ -475,71 +472,20 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk, } EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init); -u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk) -{ - struct virtio_vsock_sock *vvs = vsk->trans; - - return vvs->buf_size; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size); - -u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk) -{ - struct virtio_vsock_sock *vvs = vsk->trans; - - return vvs->buf_size_min; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size); - -u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk) +/* sk_lock held by the caller */ +void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val) { struct virtio_vsock_sock *vvs = vsk->trans; - return vvs->buf_size_max; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size); + if (*val > VIRTIO_VSOCK_MAX_BUF_SIZE) + *val = VIRTIO_VSOCK_MAX_BUF_SIZE; -void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) -{ - struct virtio_vsock_sock *vvs = vsk->trans; - - if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) - val = VIRTIO_VSOCK_MAX_BUF_SIZE; - if (val < vvs->buf_size_min) - vvs->buf_size_min = val; - if (val > vvs->buf_size_max) - vvs->buf_size_max = val; - vvs->buf_size = val; - vvs->buf_alloc = val; + vvs->buf_alloc = *val; virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, NULL); } -EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size); - -void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val) -{ - struct virtio_vsock_sock *vvs = vsk->trans; - - if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) - val = VIRTIO_VSOCK_MAX_BUF_SIZE; - if (val > vvs->buf_size) - vvs->buf_size = val; - vvs->buf_size_min = val; -} -EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size); - -void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val) -{ - struct virtio_vsock_sock *vvs = vsk->trans; - - if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) - val = VIRTIO_VSOCK_MAX_BUF_SIZE; - if (val < vvs->buf_size) - vvs->buf_size = val; - vvs->buf_size_max = val; -} -EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size); +EXPORT_SYMBOL_GPL(virtio_transport_notify_buffer_size); int virtio_transport_notify_poll_in(struct vsock_sock *vsk, @@ -631,9 +577,7 @@ EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue); u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk) { - struct virtio_vsock_sock *vvs = vsk->trans; - - return vvs->buf_size; + return vsk->buffer_size; } EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat); @@ -745,9 +689,9 @@ static int virtio_transport_reset(struct vsock_sock *vsk, /* Normally packets are associated with a socket. There may be no socket if an * attempt was made to connect to a socket that does not exist. */ -static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt) +static int virtio_transport_reset_no_sock(const struct virtio_transport *t, + struct virtio_vsock_pkt *pkt) { - const struct virtio_transport *t; struct virtio_vsock_pkt *reply; struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RST, @@ -767,7 +711,6 @@ static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt) if (!reply) return -ENOMEM; - t = virtio_transport_get_ops(); if (!t) { virtio_transport_free_pkt(reply); return -ENOTCONN; @@ -1043,13 +986,39 @@ virtio_transport_send_response(struct vsock_sock *vsk, return virtio_transport_send_pkt_info(vsk, &info); } +static bool virtio_transport_space_update(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_vsock_sock *vvs = vsk->trans; + bool space_available; + + /* Listener sockets are not associated with any transport, so we are + * not able to take the state to see if there is space available in the + * remote peer, but since they are only used to receive requests, we + * can assume that there is always space available in the other peer. + */ + if (!vvs) + return true; + + /* buf_alloc and fwd_cnt is always included in the hdr */ + spin_lock_bh(&vvs->tx_lock); + vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); + vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); + space_available = virtio_transport_has_space(vsk); + spin_unlock_bh(&vvs->tx_lock); + return space_available; +} + /* Handle server socket */ static int -virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) +virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, + struct virtio_transport *t) { struct vsock_sock *vsk = vsock_sk(sk); struct vsock_sock *vchild; struct sock *child; + int ret; if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) { virtio_transport_reset(vsk, pkt); @@ -1061,8 +1030,7 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) return -ENOMEM; } - child = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, - sk->sk_type, 0); + child = vsock_create_connected(sk); if (!child) { virtio_transport_reset(vsk, pkt); return -ENOMEM; @@ -1080,6 +1048,20 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid), le32_to_cpu(pkt->hdr.src_port)); + ret = vsock_assign_transport(vchild, vsk); + /* Transport assigned (looking at remote_addr) must be the same + * where we received the request. + */ + if (ret || vchild->transport != &t->transport) { + release_sock(child); + virtio_transport_reset(vsk, pkt); + sock_put(child); + return ret; + } + + if (virtio_transport_space_update(child, pkt)) + child->sk_write_space(child); + vsock_insert_connected(vchild); vsock_enqueue_accept(sk, child); virtio_transport_send_response(vchild, pkt); @@ -1090,26 +1072,11 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) return 0; } -static bool virtio_transport_space_update(struct sock *sk, - struct virtio_vsock_pkt *pkt) -{ - struct vsock_sock *vsk = vsock_sk(sk); - struct virtio_vsock_sock *vvs = vsk->trans; - bool space_available; - - /* buf_alloc and fwd_cnt is always included in the hdr */ - spin_lock_bh(&vvs->tx_lock); - vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); - vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); - space_available = virtio_transport_has_space(vsk); - spin_unlock_bh(&vvs->tx_lock); - return space_available; -} - /* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex * lock. */ -void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) +void virtio_transport_recv_pkt(struct virtio_transport *t, + struct virtio_vsock_pkt *pkt) { struct sockaddr_vm src, dst; struct vsock_sock *vsk; @@ -1131,7 +1098,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) le32_to_cpu(pkt->hdr.fwd_cnt)); if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) { - (void)virtio_transport_reset_no_sock(pkt); + (void)virtio_transport_reset_no_sock(t, pkt); goto free_pkt; } @@ -1142,7 +1109,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) if (!sk) { sk = vsock_find_bound_socket(&dst); if (!sk) { - (void)virtio_transport_reset_no_sock(pkt); + (void)virtio_transport_reset_no_sock(t, pkt); goto free_pkt; } } @@ -1161,7 +1128,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) switch (sk->sk_state) { case TCP_LISTEN: - virtio_transport_recv_listen(sk, pkt); + virtio_transport_recv_listen(sk, pkt, t); virtio_transport_free_pkt(pkt); break; case TCP_SYN_SENT: @@ -1179,6 +1146,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) virtio_transport_free_pkt(pkt); break; } + release_sock(sk); /* Release refcnt obtained when we fetched this socket out of the diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 6ba98a1efe2e..644d32e43d23 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -57,6 +57,7 @@ static bool vmci_transport_old_proto_override(bool *old_pkt_proto); static u16 vmci_transport_new_proto_supported_versions(void); static bool vmci_transport_proto_to_notify_struct(struct sock *sk, u16 *proto, bool old_pkt_proto); +static bool vmci_check_transport(struct vsock_sock *vsk); struct vmci_transport_recv_pkt_info { struct work_struct work; @@ -74,15 +75,6 @@ static u32 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID; static int PROTOCOL_OVERRIDE = -1; -#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN 128 -#define VMCI_TRANSPORT_DEFAULT_QP_SIZE 262144 -#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX 262144 - -/* The default peer timeout indicates how long we will wait for a peer response - * to a control message. - */ -#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) - /* Helper function to convert from a VMCI error code to a VSock error code. */ static s32 vmci_transport_error_to_vsock_error(s32 vmci_error) @@ -1013,8 +1005,7 @@ static int vmci_transport_recv_listen(struct sock *sk, return -ECONNREFUSED; } - pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, - sk->sk_type, 0); + pending = vsock_create_connected(sk); if (!pending) { vmci_transport_send_reset(sk, pkt); return -ENOMEM; @@ -1027,14 +1018,24 @@ static int vmci_transport_recv_listen(struct sock *sk, vsock_addr_init(&vpending->remote_addr, pkt->dg.src.context, pkt->src_port); + err = vsock_assign_transport(vpending, vsock_sk(sk)); + /* Transport assigned (looking at remote_addr) must be the same + * where we received the request. + */ + if (err || !vmci_check_transport(vpending)) { + vmci_transport_send_reset(sk, pkt); + sock_put(pending); + return err; + } + /* If the proposed size fits within our min/max, accept it. Otherwise * propose our own size. */ - if (pkt->u.size >= vmci_trans(vpending)->queue_pair_min_size && - pkt->u.size <= vmci_trans(vpending)->queue_pair_max_size) { + if (pkt->u.size >= vpending->buffer_min_size && + pkt->u.size <= vpending->buffer_max_size) { qp_size = pkt->u.size; } else { - qp_size = vmci_trans(vpending)->queue_pair_size; + qp_size = vpending->buffer_size; } /* Figure out if we are using old or new requests based on the @@ -1103,7 +1104,7 @@ static int vmci_transport_recv_listen(struct sock *sk, pending->sk_state = TCP_SYN_SENT; vmci_trans(vpending)->produce_size = vmci_trans(vpending)->consume_size = qp_size; - vmci_trans(vpending)->queue_pair_size = qp_size; + vpending->buffer_size = qp_size; vmci_trans(vpending)->notify_ops->process_request(pending); @@ -1397,8 +1398,8 @@ static int vmci_transport_recv_connecting_client_negotiate( vsk->ignore_connecting_rst = false; /* Verify that we're OK with the proposed queue pair size */ - if (pkt->u.size < vmci_trans(vsk)->queue_pair_min_size || - pkt->u.size > vmci_trans(vsk)->queue_pair_max_size) { + if (pkt->u.size < vsk->buffer_min_size || + pkt->u.size > vsk->buffer_max_size) { err = -EINVAL; goto destroy; } @@ -1503,8 +1504,7 @@ vmci_transport_recv_connecting_client_invalid(struct sock *sk, vsk->sent_request = false; vsk->ignore_connecting_rst = true; - err = vmci_transport_send_conn_request( - sk, vmci_trans(vsk)->queue_pair_size); + err = vmci_transport_send_conn_request(sk, vsk->buffer_size); if (err < 0) err = vmci_transport_error_to_vsock_error(err); else @@ -1588,21 +1588,6 @@ static int vmci_transport_socket_init(struct vsock_sock *vsk, INIT_LIST_HEAD(&vmci_trans(vsk)->elem); vmci_trans(vsk)->sk = &vsk->sk; spin_lock_init(&vmci_trans(vsk)->lock); - if (psk) { - vmci_trans(vsk)->queue_pair_size = - vmci_trans(psk)->queue_pair_size; - vmci_trans(vsk)->queue_pair_min_size = - vmci_trans(psk)->queue_pair_min_size; - vmci_trans(vsk)->queue_pair_max_size = - vmci_trans(psk)->queue_pair_max_size; - } else { - vmci_trans(vsk)->queue_pair_size = - VMCI_TRANSPORT_DEFAULT_QP_SIZE; - vmci_trans(vsk)->queue_pair_min_size = - VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN; - vmci_trans(vsk)->queue_pair_max_size = - VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX; - } return 0; } @@ -1818,8 +1803,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk) if (vmci_transport_old_proto_override(&old_pkt_proto) && old_pkt_proto) { - err = vmci_transport_send_conn_request( - sk, vmci_trans(vsk)->queue_pair_size); + err = vmci_transport_send_conn_request(sk, vsk->buffer_size); if (err < 0) { sk->sk_state = TCP_CLOSE; return err; @@ -1827,8 +1811,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk) } else { int supported_proto_versions = vmci_transport_new_proto_supported_versions(); - err = vmci_transport_send_conn_request2( - sk, vmci_trans(vsk)->queue_pair_size, + err = vmci_transport_send_conn_request2(sk, vsk->buffer_size, supported_proto_versions); if (err < 0) { sk->sk_state = TCP_CLOSE; @@ -1881,46 +1864,6 @@ static bool vmci_transport_stream_is_active(struct vsock_sock *vsk) return !vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle); } -static u64 vmci_transport_get_buffer_size(struct vsock_sock *vsk) -{ - return vmci_trans(vsk)->queue_pair_size; -} - -static u64 vmci_transport_get_min_buffer_size(struct vsock_sock *vsk) -{ - return vmci_trans(vsk)->queue_pair_min_size; -} - -static u64 vmci_transport_get_max_buffer_size(struct vsock_sock *vsk) -{ - return vmci_trans(vsk)->queue_pair_max_size; -} - -static void vmci_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) -{ - if (val < vmci_trans(vsk)->queue_pair_min_size) - vmci_trans(vsk)->queue_pair_min_size = val; - if (val > vmci_trans(vsk)->queue_pair_max_size) - vmci_trans(vsk)->queue_pair_max_size = val; - vmci_trans(vsk)->queue_pair_size = val; -} - -static void vmci_transport_set_min_buffer_size(struct vsock_sock *vsk, - u64 val) -{ - if (val > vmci_trans(vsk)->queue_pair_size) - vmci_trans(vsk)->queue_pair_size = val; - vmci_trans(vsk)->queue_pair_min_size = val; -} - -static void vmci_transport_set_max_buffer_size(struct vsock_sock *vsk, - u64 val) -{ - if (val < vmci_trans(vsk)->queue_pair_size) - vmci_trans(vsk)->queue_pair_size = val; - vmci_trans(vsk)->queue_pair_max_size = val; -} - static int vmci_transport_notify_poll_in( struct vsock_sock *vsk, size_t target, @@ -2076,7 +2019,8 @@ static u32 vmci_transport_get_local_cid(void) return vmci_get_context_id(); } -static const struct vsock_transport vmci_transport = { +static struct vsock_transport vmci_transport = { + .module = THIS_MODULE, .init = vmci_transport_socket_init, .destruct = vmci_transport_destruct, .release = vmci_transport_release, @@ -2103,15 +2047,26 @@ static const struct vsock_transport vmci_transport = { .notify_send_pre_enqueue = vmci_transport_notify_send_pre_enqueue, .notify_send_post_enqueue = vmci_transport_notify_send_post_enqueue, .shutdown = vmci_transport_shutdown, - .set_buffer_size = vmci_transport_set_buffer_size, - .set_min_buffer_size = vmci_transport_set_min_buffer_size, - .set_max_buffer_size = vmci_transport_set_max_buffer_size, - .get_buffer_size = vmci_transport_get_buffer_size, - .get_min_buffer_size = vmci_transport_get_min_buffer_size, - .get_max_buffer_size = vmci_transport_get_max_buffer_size, .get_local_cid = vmci_transport_get_local_cid, }; +static bool vmci_check_transport(struct vsock_sock *vsk) +{ + return vsk->transport == &vmci_transport; +} + +void vmci_vsock_transport_cb(bool is_host) +{ + int features; + + if (is_host) + features = VSOCK_TRANSPORT_F_H2G; + else + features = VSOCK_TRANSPORT_F_G2H; + + vsock_core_register(&vmci_transport, features); +} + static int __init vmci_transport_init(void) { int err; @@ -2128,7 +2083,6 @@ static int __init vmci_transport_init(void) pr_err("Unable to create datagram handle. (%d)\n", err); return vmci_transport_error_to_vsock_error(err); } - err = vmci_event_subscribe(VMCI_EVENT_QP_RESUMED, vmci_transport_qp_resumed_cb, NULL, &vmci_transport_qp_resumed_sub_id); @@ -2139,12 +2093,21 @@ static int __init vmci_transport_init(void) goto err_destroy_stream_handle; } - err = vsock_core_init(&vmci_transport); + /* Register only with dgram feature, other features (H2G, G2H) will be + * registered when the first host or guest becomes active. + */ + err = vsock_core_register(&vmci_transport, VSOCK_TRANSPORT_F_DGRAM); if (err < 0) goto err_unsubscribe; + err = vmci_register_vsock_callback(vmci_vsock_transport_cb); + if (err < 0) + goto err_unregister; + return 0; +err_unregister: + vsock_core_unregister(&vmci_transport); err_unsubscribe: vmci_event_unsubscribe(vmci_transport_qp_resumed_sub_id); err_destroy_stream_handle: @@ -2170,7 +2133,8 @@ static void __exit vmci_transport_exit(void) vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID; } - vsock_core_exit(); + vmci_register_vsock_callback(NULL); + vsock_core_unregister(&vmci_transport); } module_exit(vmci_transport_exit); diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h index 1ca1e8640b31..b7b072194282 100644 --- a/net/vmw_vsock/vmci_transport.h +++ b/net/vmw_vsock/vmci_transport.h @@ -108,9 +108,6 @@ struct vmci_transport { struct vmci_qp *qpair; u64 produce_size; u64 consume_size; - u64 queue_pair_size; - u64 queue_pair_min_size; - u64 queue_pair_max_size; u32 detach_sub_id; union vmci_transport_notify notify; const struct vmci_transport_notify_ops *notify_ops; diff --git a/net/vmw_vsock/vmci_transport_notify.h b/net/vmw_vsock/vmci_transport_notify.h index 7843f08d4290..a1aa5a998c0e 100644 --- a/net/vmw_vsock/vmci_transport_notify.h +++ b/net/vmw_vsock/vmci_transport_notify.h @@ -11,7 +11,6 @@ #include <linux/types.h> #include <linux/vmw_vmci_defs.h> #include <linux/vmw_vmci_api.h> -#include <linux/vm_sockets.h> #include "vmci_transport.h" diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 51bb6018f3bf..17b8a7d4b71b 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -3,13 +3,13 @@ # XFRM configuration # config XFRM - bool - depends on INET - select GRO_CELLS - select SKB_EXTENSIONS + bool + depends on INET + select GRO_CELLS + select SKB_EXTENSIONS config XFRM_OFFLOAD - bool + bool config XFRM_ALGO tristate diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 9b599ed66d97..2c86a2fc3915 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -480,6 +480,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) else XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID); + + if (encap_type == -1) + dev_put(skb->dev); goto drop; } diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 0f5131bc3342..7ac1542feaf8 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -732,30 +732,7 @@ static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .get_link_net = xfrmi_get_link_net, }; -static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn) -{ - struct xfrm_if *xi; - LIST_HEAD(list); - - xi = rtnl_dereference(xfrmn->xfrmi[0]); - if (!xi) - return; - - unregister_netdevice_queue(xi->dev, &list); - unregister_netdevice_many(&list); -} - -static void __net_exit xfrmi_exit_net(struct net *net) -{ - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - - rtnl_lock(); - xfrmi_destroy_interfaces(xfrmn); - rtnl_unlock(); -} - static struct pernet_operations xfrmi_net_ops = { - .exit = xfrmi_exit_net, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index c6f3c4a1bd99..f3423562d933 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -495,6 +495,8 @@ static void ___xfrm_state_destroy(struct xfrm_state *x) x->type->destructor(x); xfrm_put_type(x->type); } + if (x->xfrag.page) + put_page(x->xfrag.page); xfrm_dev_state_free(x); security_xfrm_state_free(x); xfrm_state_free(x); |