From e16edc99d658cd41c60a44cc14d170697aa3271f Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 16 Apr 2021 12:44:16 +0200 Subject: vsock/vmci: log once the failed queue pair allocation VMCI feature is not supported in conjunction with the vSphere Fault Tolerance (FT) feature. VMware Tools can repeatedly try to create a vsock connection. If FT is enabled the kernel logs is flooded with the following messages: qp_alloc_hypercall result = -20 Could not attach to queue pair with -20 "qp_alloc_hypercall result = -20" was hidden by commit e8266c4c3307 ("VMCI: Stop log spew when qp allocation isn't possible"), but "Could not attach to queue pair with -20" is still there flooding the log. Since the error message can be useful in some cases, print it only once. Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Signed-off-by: Stefano Garzarella Reviewed-by: Jorgen Hansen Signed-off-by: David S. Miller --- net/vmw_vsock/vmci_transport.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 8b65323207db..1c9ecb18b8e6 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -568,8 +568,7 @@ vmci_transport_queue_pair_alloc(struct vmci_qp **qpair, peer, flags, VMCI_NO_PRIVILEGE_FLAGS); out: if (err < 0) { - pr_err("Could not attach to queue pair with %d\n", - err); + pr_err_once("Could not attach to queue pair with %d\n", err); err = vmci_transport_error_to_vsock_error(err); } -- cgit v1.2.3 From ed8157f1ebf1ae81a8fa2653e3f20d2076fad1c9 Mon Sep 17 00:00:00 2001 From: Du Cheng Date: Sat, 17 Apr 2021 07:30:46 +0800 Subject: net: sched: tapr: prevent cycle_time == 0 in parse_taprio_schedule There is a reproducible sequence from the userland that will trigger a WARN_ON() condition in taprio_get_start_time, which causes kernel to panic if configured as "panic_on_warn". Catch this condition in parse_taprio_schedule to prevent this condition. Reported as bug on syzkaller: https://syzkaller.appspot.com/bug?extid=d50710fd0873a9c6b40c Reported-by: syzbot+d50710fd0873a9c6b40c@syzkaller.appspotmail.com Signed-off-by: Du Cheng Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 8287894541e3..909c798b7403 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -901,6 +901,12 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, list_for_each_entry(entry, &new->entries, list) cycle = ktime_add_ns(cycle, entry->interval); + + if (!cycle) { + NL_SET_ERR_MSG(extack, "'cycle_time' can never be 0"); + return -EINVAL; + } + new->cycle_time = cycle; } -- cgit v1.2.3 From 7ad18ff6449cbd6beb26b53128ddf56d2685aa93 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Mon, 19 Apr 2021 12:53:06 +0000 Subject: gro: fix napi_gro_frags() Fast GRO breakage due to IP alignment check Commit 38ec4944b593 ("gro: ensure frag0 meets IP header alignment") did the right thing, but missed the fact that napi_gro_frags() logics calls for skb_gro_reset_offset() *before* pulling Ethernet header to the skb linear space. That said, the introduced check for frag0 address being aligned to 4 always fails for it as Ethernet header is obviously 14 bytes long, and in case with NET_IP_ALIGN its start is not aligned to 4. Fix this by adding @nhoff argument to skb_gro_reset_offset() which tells if an IP header is placed right at the start of frag0 or not. This restores Fast GRO for napi_gro_frags() that became very slow after the mentioned commit, and preserves the introduced check to avoid silent unaligned accesses. From v1 [0]: - inline tiny skb_gro_reset_offset() to let the code be optimized more efficively (esp. for the !NET_IP_ALIGN case) (Eric); - pull in Reviewed-by from Eric. [0] https://lore.kernel.org/netdev/20210418114200.5839-1-alobakin@pm.me Fixes: 38ec4944b593 ("gro: ensure frag0 meets IP header alignment") Reviewed-by: Eric Dumazet Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 1f79b9aa9a3f..15fe36332fb8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5914,7 +5914,7 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, return head; } -static void skb_gro_reset_offset(struct sk_buff *skb) +static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) { const struct skb_shared_info *pinfo = skb_shinfo(skb); const skb_frag_t *frag0 = &pinfo->frags[0]; @@ -5925,7 +5925,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb) if (!skb_headlen(skb) && pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0)) && - (!NET_IP_ALIGN || !(skb_frag_off(frag0) & 3))) { + (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, skb_frag_size(frag0), @@ -6143,7 +6143,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); - skb_gro_reset_offset(skb); + skb_gro_reset_offset(skb, 0); ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_receive_exit(ret); @@ -6232,7 +6232,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) napi->skb = NULL; skb_reset_mac_header(skb); - skb_gro_reset_offset(skb); + skb_gro_reset_offset(skb, hlen); if (unlikely(skb_gro_header_hard(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); -- cgit v1.2.3 From c1102e9d49eb36c0be18cb3e16f6e46ffb717964 Mon Sep 17 00:00:00 2001 From: Di Zhu Date: Mon, 19 Apr 2021 21:56:41 +0800 Subject: net: fix a data race when get vlan device We encountered a crash: in the packet receiving process, we got an illegal VLAN device address, but the VLAN device address saved in vmcore is correct. After checking the code, we found a possible data competition: CPU 0: CPU 1: (RCU read lock) (RTNL lock) vlan_do_receive() register_vlan_dev() vlan_find_dev() ->__vlan_group_get_device() ->vlan_group_prealloc_vid() In vlan_group_prealloc_vid(), We need to make sure that memset() in kzalloc() is executed before assigning value to vlan devices array: ================================= kzalloc() ->memset(object, 0, size) smp_wmb() vg->vlan_devices_arrays[pidx][vidx] = array; ================================== Because __vlan_group_get_device() function depends on this order. otherwise we may get a wrong address from the hardware cache on another cpu. So fix it by adding memory barrier instruction to ensure the order of memory operations. Signed-off-by: Di Zhu Signed-off-by: David S. Miller --- net/8021q/vlan.c | 3 +++ net/8021q/vlan.h | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 8b644113715e..fb3d3262dc1a 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -71,6 +71,9 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg, if (array == NULL) return -ENOBUFS; + /* paired with smp_rmb() in __vlan_group_get_device() */ + smp_wmb(); + vg->vlan_devices_arrays[pidx][vidx] = array; return 0; } diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 953405362795..fa3ad3d4d58c 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -57,6 +57,10 @@ static inline struct net_device *__vlan_group_get_device(struct vlan_group *vg, array = vg->vlan_devices_arrays[pidx] [vlan_id / VLAN_GROUP_ARRAY_PART_LEN]; + + /* paired with smp_wmb() in vlan_group_prealloc_vid() */ + smp_rmb(); + return array ? array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] : NULL; } -- cgit v1.2.3 From 8432b8114957235f42e070a16118a7f750de9d39 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 20 Apr 2021 13:07:27 +0200 Subject: vsock/virtio: free queued packets when closing socket As reported by syzbot [1], there is a memory leak while closing the socket. We partially solved this issue with commit ac03046ece2b ("vsock/virtio: free packets during the socket release"), but we forgot to drain the RX queue when the socket is definitely closed by the scheduled work. To avoid future issues, let's use the new virtio_transport_remove_sock() to drain the RX queue before removing the socket from the af_vsock lists calling vsock_remove_sock(). [1] https://syzkaller.appspot.com/bug?extid=24452624fc4c571eedd9 Fixes: ac03046ece2b ("vsock/virtio: free packets during the socket release") Reported-and-tested-by: syzbot+24452624fc4c571eedd9@syzkaller.appspotmail.com Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport_common.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index e4370b1b7494..902cb6dd710b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -733,6 +733,23 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, return t->send_pkt(reply); } +/* This function should be called with sk_lock held and SOCK_DONE set */ +static void virtio_transport_remove_sock(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct virtio_vsock_pkt *pkt, *tmp; + + /* We don't need to take rx_lock, as the socket is closing and we are + * removing it. + */ + list_for_each_entry_safe(pkt, tmp, &vvs->rx_queue, list) { + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + + vsock_remove_sock(vsk); +} + static void virtio_transport_wait_close(struct sock *sk, long timeout) { if (timeout) { @@ -765,7 +782,7 @@ static void virtio_transport_do_close(struct vsock_sock *vsk, (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) { vsk->close_work_scheduled = false; - vsock_remove_sock(vsk); + virtio_transport_remove_sock(vsk); /* Release refcnt obtained when we scheduled the timeout */ sock_put(sk); @@ -828,22 +845,15 @@ static bool virtio_transport_close(struct vsock_sock *vsk) void virtio_transport_release(struct vsock_sock *vsk) { - struct virtio_vsock_sock *vvs = vsk->trans; - struct virtio_vsock_pkt *pkt, *tmp; struct sock *sk = &vsk->sk; bool remove_sock = true; if (sk->sk_type == SOCK_STREAM) remove_sock = virtio_transport_close(vsk); - list_for_each_entry_safe(pkt, tmp, &vvs->rx_queue, list) { - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } - if (remove_sock) { sock_set_flag(sk, SOCK_DONE); - vsock_remove_sock(vsk); + virtio_transport_remove_sock(vsk); } } EXPORT_SYMBOL_GPL(virtio_transport_release); -- cgit v1.2.3 From 47a017f33943278570c072bc71681809b2567b3a Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 21 Apr 2021 10:40:07 -0700 Subject: net: qrtr: Avoid potential use after free in MHI send It is possible that the MHI ul_callback will be invoked immediately following the queueing of the skb for transmission, leading to the callback decrementing the refcount of the associated sk and freeing the skb. As such the dereference of skb and the increment of the sk refcount must happen before the skb is queued, to avoid the skb to be used after free and potentially the sk to drop its last refcount.. Fixes: 6e728f321393 ("net: qrtr: Add MHI transport layer") Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/mhi.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/qrtr/mhi.c b/net/qrtr/mhi.c index 2bf2b1943e61..fa611678af05 100644 --- a/net/qrtr/mhi.c +++ b/net/qrtr/mhi.c @@ -50,6 +50,9 @@ static int qcom_mhi_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb) struct qrtr_mhi_dev *qdev = container_of(ep, struct qrtr_mhi_dev, ep); int rc; + if (skb->sk) + sock_hold(skb->sk); + rc = skb_linearize(skb); if (rc) goto free_skb; @@ -59,12 +62,11 @@ static int qcom_mhi_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb) if (rc) goto free_skb; - if (skb->sk) - sock_hold(skb->sk); - return rc; free_skb: + if (skb->sk) + sock_put(skb->sk); kfree_skb(skb); return rc; -- cgit v1.2.3 From eefb45eef5c4c425e87667af8f5e904fbdd47abf Mon Sep 17 00:00:00 2001 From: Chinmay Agarwal Date: Thu, 22 Apr 2021 01:12:22 +0530 Subject: neighbour: Prevent Race condition in neighbour subsytem Following Race Condition was detected: : Executing: __netif_receive_skb() ->__netif_receive_skb_core() -> arp_rcv() -> arp_process().arp_process() calls __neigh_lookup() which takes a reference on neighbour entry 'n'. Moves further along, arp_process() and calls neigh_update()-> __neigh_update(). Neighbour entry is unlocked just before a call to neigh_update_gc_list. This unlocking paves way for another thread that may take a reference on the same and mark it dead and remove it from gc_list. - neigh_flush_dev() is under execution and calls neigh_mark_dead(n) marking the neighbour entry 'n' as dead. Also n will be removed from gc_list. Moves further along neigh_flush_dev() and calls neigh_cleanup_and_release(n), but since reference count increased in t1, 'n' couldn't be destroyed. - Code hits neigh_update_gc_list, with neighbour entry set as dead. - arp_process() finally calls neigh_release(n), destroying the neighbour entry and we have a destroyed ntry still part of gc_list. Fixes: eb4e8fac00d1("neighbour: Prevent a dead entry from updating gc_list") Signed-off-by: Chinmay Agarwal Signed-off-by: David S. Miller --- net/core/neighbour.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8379719d1dce..98f20efbfadf 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -131,6 +131,9 @@ static void neigh_update_gc_list(struct neighbour *n) write_lock_bh(&n->tbl->lock); write_lock(&n->lock); + if (n->dead) + goto out; + /* remove from the gc list if new state is permanent or if neighbor * is externally learned; otherwise entry should be on the gc list */ @@ -147,6 +150,7 @@ static void neigh_update_gc_list(struct neighbour *n) atomic_inc(&n->tbl->gc_entries); } +out: write_unlock(&n->lock); write_unlock_bh(&n->tbl->lock); } -- cgit v1.2.3 From 7d742b509dd773f6ae2f32ffe3d2c0f3ea598a6d Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 21 Apr 2021 15:57:47 +0200 Subject: openvswitch: meter: remove rate from the bucket size calculation Implementation of meters supposed to be a classic token bucket with 2 typical parameters: rate and burst size. Burst size in this schema is the maximum number of bytes/packets that could pass without being rate limited. Recent changes to userspace datapath made meter implementation to be in line with the kernel one, and this uncovered several issues. The main problem is that maximum bucket size for unknown reason accounts not only burst size, but also the numerical value of rate. This creates a lot of confusion around behavior of meters. For example, if rate is configured as 1000 pps and burst size set to 1, this should mean that meter will tolerate bursts of 1 packet at most, i.e. not a single packet above the rate should pass the meter. However, current implementation calculates maximum bucket size as (rate + burst size), so the effective bucket size will be 1001. This means that first 1000 packets will not be rate limited and average rate might be twice as high as the configured rate. This also makes it practically impossible to configure meter that will have burst size lower than the rate, which might be a desirable configuration if the rate is high. Inability to configure low values of a burst size and overall inability for a user to predict what will be a maximum and average rate from the configured parameters of a meter without looking at the OVS and kernel code might be also classified as a security issue, because drop meters are frequently used as a way of protection from DoS attacks. This change removes rate from the calculation of a bucket size, making it in line with the classic token bucket algorithm and essentially making the rate and burst tolerance being predictable from a users' perspective. Same change proposed for the userspace implementation. Fixes: 96fbc13d7e77 ("openvswitch: Add meter infrastructure") Signed-off-by: Ilya Maximets Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 15424d26e85d..96b524ceabca 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -392,7 +392,7 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) * * Start with a full bucket. */ - band->bucket = (band->burst_size + band->rate) * 1000ULL; + band->bucket = band->burst_size * 1000ULL; band_max_delta_t = div_u64(band->bucket, band->rate); if (band_max_delta_t > meter->max_delta_t) meter->max_delta_t = band_max_delta_t; @@ -641,7 +641,7 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, long long int max_bucket_size; band = &meter->bands[i]; - max_bucket_size = (band->burst_size + band->rate) * 1000LL; + max_bucket_size = band->burst_size * 1000LL; band->bucket += delta_ms * band->rate; if (band->bucket > max_bucket_size) -- cgit v1.2.3 From 6477dd39e62c3a67cfa368ddc127410b4ae424c6 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Fri, 23 Apr 2021 09:40:33 -0700 Subject: mptcp: Retransmit DATA_FIN With this change, the MPTCP-level retransmission timer is used to resend DATA_FIN. The retranmit timer is not stopped while waiting for a MPTCP-level ACK of DATA_FIN, and retransmitted DATA_FINs are sent on all subflows. The retry interval starts at TCP_RTO_MIN and then doubles on each attempt, up to TCP_RTO_MAX. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/146 Fixes: 43b54c6ee382 ("mptcp: Use full MPTCP-level disconnect state machine") Acked-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 4bde960e19dc..61329b8181ea 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -399,6 +399,14 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) return false; } +static void mptcp_set_datafin_timeout(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + mptcp_sk(sk)->timer_ival = min(TCP_RTO_MAX, + TCP_RTO_MIN << icsk->icsk_retransmits); +} + static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) { long tout = ssk && inet_csk(ssk)->icsk_pending ? @@ -1052,7 +1060,7 @@ out: } if (snd_una == READ_ONCE(msk->snd_nxt)) { - if (msk->timer_ival) + if (msk->timer_ival && !mptcp_data_fin_enabled(msk)) mptcp_stop_timer(sk); } else { mptcp_reset_timer(sk); @@ -2276,8 +2284,19 @@ static void __mptcp_retrans(struct sock *sk) __mptcp_clean_una_wakeup(sk); dfrag = mptcp_rtx_head(sk); - if (!dfrag) + if (!dfrag) { + if (mptcp_data_fin_enabled(msk)) { + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_retransmits++; + mptcp_set_datafin_timeout(sk); + mptcp_send_ack(msk); + + goto reset_timer; + } + return; + } ssk = mptcp_subflow_get_retrans(msk); if (!ssk) @@ -2460,6 +2479,8 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) pr_debug("Sending DATA_FIN on subflow %p", ssk); mptcp_set_timeout(sk, ssk); tcp_send_ack(ssk); + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); } break; } -- cgit v1.2.3