From 64aa42338e9a88c139b89797163714f0f95f3c6b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 13 May 2015 15:26:10 +0800 Subject: esp4: Use high-order sequence number bits for IV generation I noticed we were only using the low-order bits for IV generation when ESN is enabled. This is very bad because it means that the IV can repeat. We must use the full 64 bits. Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 421a80b09b62..30b544f025ac 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -256,7 +256,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) aead_givcrypt_set_crypt(req, sg, sg, clen, iv); aead_givcrypt_set_assoc(req, asg, assoclen); aead_givcrypt_set_giv(req, esph->enc_data, - XFRM_SKB_CB(skb)->seq.output.low); + XFRM_SKB_CB(skb)->seq.output.low + + ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); ESP_SKB_CB(skb)->tmp = tmp; err = crypto_aead_givencrypt(req); -- cgit v1.2.3 From cd5279c194f89c9b97c294af4aaf4ea8c5e3c704 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 27 May 2015 07:16:43 -0700 Subject: ip_vti/ip6_vti: Do not touch skb->mark on xmit Instead of modifying skb->mark we can simply modify the flowi_mark that is generated as a result of the xfrm_decode_session. By doing this we don't need to actually touch the skb->mark and it can be preserved as it passes out through the tunnel. Signed-off-by: Alexander Duyck Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 5 +++-- net/ipv6/ip6_vti.c | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 9f7269f3c54a..4c318e1c13c8 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -216,8 +216,6 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(&fl, 0, sizeof(fl)); - skb->mark = be32_to_cpu(tunnel->parms.o_key); - switch (skb->protocol) { case htons(ETH_P_IP): xfrm_decode_session(skb, &fl, AF_INET); @@ -233,6 +231,9 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } + /* override mark with tunnel output key */ + fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key); + return vti_xmit(skb, dev, &fl); } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index ed9d681207fa..104de4da3ff3 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -495,7 +495,6 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) int ret; memset(&fl, 0, sizeof(fl)); - skb->mark = be32_to_cpu(t->parms.o_key); switch (skb->protocol) { case htons(ETH_P_IPV6): @@ -516,6 +515,9 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_err; } + /* override mark with tunnel output key */ + fl.flowi_mark = be32_to_cpu(t->parms.o_key); + ret = vti6_xmit(skb, dev, &fl); if (ret < 0) goto tx_err; -- cgit v1.2.3 From d55c670cbc54b2270a465cdc382ce71adae45785 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 27 May 2015 07:16:54 -0700 Subject: ip_vti/ip6_vti: Preserve skb->mark after rcv_cb call The vti6_rcv_cb and vti_rcv_cb calls were leaving the skb->mark modified after completing the function. This resulted in the original skb->mark value being lost. Since we only need skb->mark to be set for xfrm_policy_check we can pull the assignment into the rcv_cb calls and then just restore the original mark after xfrm_policy_check has been completed. Signed-off-by: Alexander Duyck Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 9 +++++++-- net/ipv6/ip6_vti.c | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 4c318e1c13c8..0c152087ca15 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -65,7 +65,6 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, goto drop; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; - skb->mark = be32_to_cpu(tunnel->parms.i_key); return xfrm_input(skb, nexthdr, spi, encap_type); } @@ -91,6 +90,8 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) struct pcpu_sw_netstats *tstats; struct xfrm_state *x; struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; + u32 orig_mark = skb->mark; + int ret; if (!tunnel) return 1; @@ -107,7 +108,11 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) x = xfrm_input_state(skb); family = x->inner_mode->afinfo->family; - if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + skb->mark = be32_to_cpu(tunnel->parms.i_key); + ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); + skb->mark = orig_mark; + + if (!ret) return -EPERM; skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev))); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 104de4da3ff3..ff3bd863fa03 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -322,7 +322,6 @@ static int vti6_rcv(struct sk_buff *skb) } XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; - skb->mark = be32_to_cpu(t->parms.i_key); rcu_read_unlock(); @@ -342,6 +341,8 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) struct pcpu_sw_netstats *tstats; struct xfrm_state *x; struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6; + u32 orig_mark = skb->mark; + int ret; if (!t) return 1; @@ -358,7 +359,11 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) x = xfrm_input_state(skb); family = x->inner_mode->afinfo->family; - if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + skb->mark = be32_to_cpu(t->parms.i_key); + ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); + skb->mark = orig_mark; + + if (!ret) return -EPERM; skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev))); -- cgit v1.2.3 From beb39db59d14990e401e235faf66a6b9b31240b0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 30 May 2015 09:16:53 -0700 Subject: udp: fix behavior of wrong checksums We have two problems in UDP stack related to bogus checksums : 1) We return -EAGAIN to application even if receive queue is not empty. This breaks applications using edge trigger epoll() 2) Under UDP flood, we can loop forever without yielding to other processes, potentially hanging the host, especially on non SMP. This patch is an attempt to make things better. We might in the future add extra support for rt applications wanting to better control time spent doing a recv() in a hostile environment. For example we could validate checksums before queuing packets in socket receive queue. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 6 ++---- net/ipv6/udp.c | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d10b7e0112eb..1c92ea67baef 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1345,10 +1345,8 @@ csum_copy_err: } unlock_sock_fast(sk, slow); - if (noblock) - return -EAGAIN; - - /* starting over for a new packet */ + /* starting over for a new packet, but check if we need to yield */ + cond_resched(); msg->msg_flags &= ~MSG_TRUNC; goto try_again; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c2ec41617a35..e51fc3eee6db 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -525,10 +525,8 @@ csum_copy_err: } unlock_sock_fast(sk, slow); - if (noblock) - return -EAGAIN; - - /* starting over for a new packet */ + /* starting over for a new packet, but check if we need to yield */ + cond_resched(); msg->msg_flags &= ~MSG_TRUNC; goto try_again; } -- cgit v1.2.3 From 9f950415e4e28e7cfae2e416b43e862e8101d996 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 29 May 2015 13:47:07 -0400 Subject: tcp: fix child sockets to use system default congestion control if not set Linux 3.17 and earlier are explicitly engineered so that if the app doesn't specifically request a CC module on a listener before the SYN arrives, then the child gets the system default CC when the connection is established. See tcp_init_congestion_control() in 3.17 or earlier, which says "if no choice made yet assign the current value set as default". The change ("net: tcp: assign tcp cong_ops when tcp sk is created") altered these semantics, so that children got their parent listener's congestion control even if the system default had changed after the listener was created. This commit returns to those original semantics from 3.17 and earlier, since they are the original semantics from 2007 in 4d4d3d1e8 ("[TCP]: Congestion control initialization."), and some Linux congestion control workflows depend on that. In summary, if a listener socket specifically sets TCP_CONGESTION to "x", or the route locks the CC module to "x", then the child gets "x". Otherwise the child gets current system default from net.ipv4.tcp_congestion_control. That's the behavior in 3.17 and earlier, and this commit restores that. Fixes: 55d8694fa82c ("net: tcp: assign tcp cong_ops when tcp sk is created") Cc: Florian Westphal Cc: Daniel Borkmann Cc: Glenn Judd Cc: Stephen Hemminger Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 3 ++- net/ipv4/tcp_cong.c | 5 ++++- net/ipv4/tcp_minisocks.c | 5 ++++- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 497bc14cdb85..0320bbb7d7b5 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -98,7 +98,8 @@ struct inet_connection_sock { const struct tcp_congestion_ops *icsk_ca_ops; const struct inet_connection_sock_af_ops *icsk_af_ops; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); - __u8 icsk_ca_state:7, + __u8 icsk_ca_state:6, + icsk_ca_setsockopt:1, icsk_ca_dst_locked:1; __u8 icsk_retransmits; __u8 icsk_pending; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 7a5ae50c80c8..84be008c945c 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -187,6 +187,7 @@ static void tcp_reinit_congestion_control(struct sock *sk, tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = ca; + icsk->icsk_ca_setsockopt = 1; if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); @@ -335,8 +336,10 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) rcu_read_lock(); ca = __tcp_ca_find_autoload(name); /* No change asking for existing value */ - if (ca == icsk->icsk_ca_ops) + if (ca == icsk->icsk_ca_ops) { + icsk->icsk_ca_setsockopt = 1; goto out; + } if (!ca) err = -ENOENT; else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b5732a54f2ad..17e7339ee5ca 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -420,7 +420,10 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) rcu_read_unlock(); } - if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner)) + /* If no valid choice made yet, assign current system default ca. */ + if (!ca_got_dst && + (!icsk->icsk_ca_setsockopt || + !try_module_get(icsk->icsk_ca_ops->owner))) tcp_assign_congestion_control(sk); tcp_set_ca_state(sk, TCP_CA_Open); -- cgit v1.2.3