From b357a364c57c940ddb932224542494363df37378 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 23 Apr 2015 18:03:44 -0700
Subject: inet: fix possible panic in reqsk_queue_unlink()

[ 3897.923145] BUG: unable to handle kernel NULL pointer dereference at
 0000000000000080
[ 3897.931025] IP: [<ffffffffa9f27686>] reqsk_timer_handler+0x1a6/0x243

There is a race when reqsk_timer_handler() and tcp_check_req() call
inet_csk_reqsk_queue_unlink() on the same req at the same time.

Before commit fa76ce7328b2 ("inet: get rid of central tcp/dccp listener
timer"), listener spinlock was held and race could not happen.

To solve this bug, we change reqsk_queue_unlink() to not assume req
must be found, and we return a status, to conditionally release a
refcount on the request sock.

This also means tcp_check_req() in non fastopen case might or not
consume req refcount, so tcp_v6_hnd_req() & tcp_v4_hnd_req() have
to properly handle this.

(Same remark for dccp_check_req() and its callers)

inet_csk_reqsk_queue_drop() is now too big to be inlined, as it is
called 4 times in tcp and 3 times in dccp.

Fixes: fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h | 20 +-------------------
 include/net/request_sock.h         | 18 ------------------
 2 files changed, 1 insertion(+), 37 deletions(-)

(limited to 'include/net')
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 7b5887cd1172..48a815823587 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -279,12 +279,6 @@ static inline void inet_csk_reqsk_queue_add(struct sock *sk,
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
 				   unsigned long timeout);
 
-static inline void inet_csk_reqsk_queue_removed(struct sock *sk,
-						struct request_sock *req)
-{
-	reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
-}
-
 static inline void inet_csk_reqsk_queue_added(struct sock *sk,
 					      const unsigned long timeout)
 {
@@ -306,19 +300,7 @@ static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
 	return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue);
 }
 
-static inline void inet_csk_reqsk_queue_unlink(struct sock *sk,
-					       struct request_sock *req)
-{
-	reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req);
-}
-
-static inline void inet_csk_reqsk_queue_drop(struct sock *sk,
-					     struct request_sock *req)
-{
-	inet_csk_reqsk_queue_unlink(sk, req);
-	inet_csk_reqsk_queue_removed(sk, req);
-	reqsk_put(req);
-}
+void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req);
 
 void inet_csk_destroy_sock(struct sock *sk);
 void inet_csk_prepare_forced_close(struct sock *sk);
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index fe41f3ceb008..9f4265ce8892 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -212,24 +212,6 @@ static inline int reqsk_queue_empty(struct request_sock_queue *queue)
 	return queue->rskq_accept_head == NULL;
 }
 
-static inline void reqsk_queue_unlink(struct request_sock_queue *queue,
-				      struct request_sock *req)
-{
-	struct listen_sock *lopt = queue->listen_opt;
-	struct request_sock **prev;
-
-	spin_lock(&queue->syn_wait_lock);
-
-	prev = &lopt->syn_table[req->rsk_hash];
-	while (*prev != req)
-		prev = &(*prev)->dl_next;
-	*prev = req->dl_next;
-
-	spin_unlock(&queue->syn_wait_lock);
-	if (del_timer(&req->rsk_timer))
-		reqsk_put(req);
-}
-
 static inline void reqsk_queue_add(struct request_sock_queue *queue,
 				   struct request_sock *req,
 				   struct sock *parent,
-- 
cgit v1.2.3


From 73b5a6f2a7a1cb78ccdec3900afc8657e11bc6bf Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Sun, 26 Apr 2015 15:55:57 +0300
Subject: net/bonding: Make DRV macros private

The bonding modules currently defines four macros with
general names that pollute the global namespace:
DRV_VERSION
DRV_RELDATE
DRV_NAME
DRV_DESCRIPTION

Fixing that by defining a private bonding_priv.h
header files which includes those defines.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c    |  2 ++
 drivers/net/bonding/bond_procfs.c  |  1 +
 drivers/net/bonding/bonding_priv.h | 25 +++++++++++++++++++++++++
 include/net/bonding.h              |  7 -------
 4 files changed, 28 insertions(+), 7 deletions(-)
 create mode 100644 drivers/net/bonding/bonding_priv.h

(limited to 'include/net')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 78dde56ae6e6..3a10551d64cf 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -82,6 +82,8 @@
 #include <net/bond_3ad.h>
 #include <net/bond_alb.h>
 
+#include "bonding_priv.h"
+
 /*---------------------------- Module parameters ----------------------------*/
 
 /* monitor all links that often (in milliseconds). <=0 disables monitoring */
diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c
index 62694cfc05b6..b20b35acb47d 100644
--- a/drivers/net/bonding/bond_procfs.c
+++ b/drivers/net/bonding/bond_procfs.c
@@ -4,6 +4,7 @@
 #include <net/netns/generic.h>
 #include <net/bonding.h>
 
+#include "bonding_priv.h"
 
 static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(RCU)
diff --git a/drivers/net/bonding/bonding_priv.h b/drivers/net/bonding/bonding_priv.h
new file mode 100644
index 000000000000..5a4d81a9437c
--- /dev/null
+++ b/drivers/net/bonding/bonding_priv.h
@@ -0,0 +1,25 @@
+/*
+ * Bond several ethernet interfaces into a Cisco, running 'Etherchannel'.
+ *
+ * Portions are (c) Copyright 1995 Simon "Guru Aleph-Null" Janes
+ * NCM: Network and Communications Management, Inc.
+ *
+ * BUT, I'm the one who modified it for ethernet, so:
+ * (c) Copyright 1999, Thomas Davis, tadavis@lbl.gov
+ *
+ *	This software may be used and distributed according to the terms
+ *	of the GNU Public License, incorporated herein by reference.
+ *
+ */
+
+#ifndef _BONDING_PRIV_H
+#define _BONDING_PRIV_H
+
+#define DRV_VERSION	"3.7.1"
+#define DRV_RELDATE	"April 27, 2011"
+#define DRV_NAME	"bonding"
+#define DRV_DESCRIPTION	"Ethernet Channel Bonding Driver"
+
+#define bond_version DRV_DESCRIPTION ": v" DRV_VERSION " (" DRV_RELDATE ")\n"
+
+#endif
diff --git a/include/net/bonding.h b/include/net/bonding.h
index fda6feeb6c1f..78ed135e9dea 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -30,13 +30,6 @@
 #include <net/bond_alb.h>
 #include <net/bond_options.h>
 
-#define DRV_VERSION	"3.7.1"
-#define DRV_RELDATE	"April 27, 2011"
-#define DRV_NAME	"bonding"
-#define DRV_DESCRIPTION	"Ethernet Channel Bonding Driver"
-
-#define bond_version DRV_DESCRIPTION ": v" DRV_VERSION " (" DRV_RELDATE ")\n"
-
 #define BOND_MAX_ARP_TARGETS	16
 
 #define BOND_DEFAULT_MIIMON	100
-- 
cgit v1.2.3


From 0df48c26d8418c5c9fba63fac15b660d70ca2f1c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 28 Apr 2015 15:28:17 -0700
Subject: tcp: add tcpi_bytes_acked to tcp_info

This patch tracks total number of bytes acked for a TCP socket.
This is the sum of all changes done to tp->snd_una, and allows
for precise tracking of delivered data.

RFC4898 named this : tcpEStatsAppHCThruOctetsAcked

This is a 64bit field, and can be fetched both from TCP_INFO
getsockopt() if one has a handle on a TCP socket, or from inet_diag
netlink facility (iproute2/ss patch will follow)

Note that tp->bytes_acked was placed near tp->snd_una for
best data locality and minimal performance impact.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Eric Salo <salo@google.com>
Cc: Martin Lau <kafai@fb.com>
Cc: Chris Rapier <rapier@psc.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  4 ++++
 include/net/tcp.h        |  2 +-
 include/uapi/linux/tcp.h |  1 +
 net/ipv4/tcp.c           |  6 +++++-
 net/ipv4/tcp_input.c     | 13 +++++++++++--
 5 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 0caa3a2d4106..0f73b43171da 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -150,6 +150,10 @@ struct tcp_sock {
 	u32	rcv_wup;	/* rcv_nxt on last window update sent	*/
  	u32	snd_nxt;	/* Next sequence we send		*/
 
+	u64	bytes_acked;	/* RFC4898 tcpEStatsAppHCThruOctetsAcked
+				 * sum(delta(snd_una)), or how many bytes
+				 * were acked.
+				 */
  	u32	snd_una;	/* First byte we want an ack for	*/
  	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
 	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 051dc5c2802d..dd7b4ea6a10c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -576,7 +576,7 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
 }
 
 /* tcp.c */
-void tcp_get_info(const struct sock *, struct tcp_info *);
+void tcp_get_info(struct sock *, struct tcp_info *);
 
 /* Read 'sendfile()'-style from a TCP socket */
 typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 3b9718328d8b..6666e98a0af9 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -189,6 +189,7 @@ struct tcp_info {
 
 	__u64	tcpi_pacing_rate;
 	__u64	tcpi_max_pacing_rate;
+	__u64	tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8c5cd9efebbc..4bf0e8ca7b5b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2592,7 +2592,7 @@ EXPORT_SYMBOL(compat_tcp_setsockopt);
 #endif
 
 /* Return information about state of tcp endpoint in API format. */
-void tcp_get_info(const struct sock *sk, struct tcp_info *info)
+void tcp_get_info(struct sock *sk, struct tcp_info *info)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2663,6 +2663,10 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 
 	rate = READ_ONCE(sk->sk_max_pacing_rate);
 	info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL;
+
+	spin_lock_bh(&sk->sk_lock.slock);
+	info->tcpi_bytes_acked = tp->bytes_acked;
+	spin_unlock_bh(&sk->sk_lock.slock);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3a4d9b34bed4..378d3f4d4dc3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3280,6 +3280,15 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp,
 		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
 }
 
+/* If we update tp->snd_una, also update tp->bytes_acked */
+static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
+{
+	u32 delta = ack - tp->snd_una;
+
+	tp->bytes_acked += delta;
+	tp->snd_una = ack;
+}
+
 /* Update our send window.
  *
  * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
@@ -3315,7 +3324,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
 		}
 	}
 
-	tp->snd_una = ack;
+	tcp_snd_una_update(tp, ack);
 
 	return flag;
 }
@@ -3497,7 +3506,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		 * Note, we use the fact that SND.UNA>=SND.WL2.
 		 */
 		tcp_update_wl(tp, ack_seq);
-		tp->snd_una = ack;
+		tcp_snd_una_update(tp, ack);
 		flag |= FLAG_WIN_UPDATE;
 
 		tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
-- 
cgit v1.2.3


From 64f40ff5bbdb1b679fb3c4dbc8230d6517d2b8dc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 28 Apr 2015 16:23:48 -0700
Subject: tcp: prepare CC get_info() access from getsockopt()

We would like that optional info provided by Congestion Control
modules using netlink can also be read using getsockopt()

This patch changes get_info() to put this information in a buffer,
instead of skb, like tcp_get_info(), so that following patch
can reuse this common infrastructure.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h              |  5 ++++-
 include/uapi/linux/inet_diag.h |  4 ++++
 net/ipv4/inet_diag.c           |  8 +++++---
 net/ipv4/tcp_dctcp.c           | 20 ++++++++++----------
 net/ipv4/tcp_illinois.c        | 21 +++++++++++----------
 net/ipv4/tcp_vegas.c           | 19 ++++++++++---------
 net/ipv4/tcp_vegas.h           |  3 ++-
 7 files changed, 46 insertions(+), 34 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index dd7b4ea6a10c..6d204f3f9df8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -804,6 +804,8 @@ enum tcp_ca_ack_event_flags {
 /* Requires ECN/ECT set on all packets */
 #define TCP_CONG_NEEDS_ECN	0x2
 
+union tcp_cc_info;
+
 struct tcp_congestion_ops {
 	struct list_head	list;
 	u32 key;
@@ -829,7 +831,8 @@ struct tcp_congestion_ops {
 	/* hook for packet ack accounting (optional) */
 	void (*pkts_acked)(struct sock *sk, u32 num_acked, s32 rtt_us);
 	/* get info for inet_diag (optional) */
-	int (*get_info)(struct sock *sk, u32 ext, struct sk_buff *skb);
+	size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
+			   union tcp_cc_info *info);
 
 	char 		name[TCP_CA_NAME_MAX];
 	struct module 	*owner;
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index d65c0a09efd3..c7093c75bdd6 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -143,4 +143,8 @@ struct tcp_dctcp_info {
 	__u32	dctcp_ab_tot;
 };
 
+union tcp_cc_info {
+	struct tcpvegas_info	vegas;
+	struct tcp_dctcp_info	dctcp;
+};
 #endif /* _UAPI_INET_DIAG_H_ */
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index bb77ebdae3b3..4d32262c7502 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -224,14 +224,16 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	handler->idiag_get_info(sk, r, info);
 
 	if (sk->sk_state < TCP_TIME_WAIT) {
-		int err = 0;
+		union tcp_cc_info info;
+		size_t sz = 0;
+		int attr;
 
 		rcu_read_lock();
 		ca_ops = READ_ONCE(icsk->icsk_ca_ops);
 		if (ca_ops && ca_ops->get_info)
-			err = ca_ops->get_info(sk, ext, skb);
+			sz = ca_ops->get_info(sk, ext, &attr, &info);
 		rcu_read_unlock();
-		if (err < 0)
+		if (sz && nla_put(skb, attr, sz, &info) < 0)
 			goto errout;
 	}
 
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 4376016f7fa5..4c41c1287197 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -277,7 +277,8 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
 	}
 }
 
-static int dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
+			     union tcp_cc_info *info)
 {
 	const struct dctcp *ca = inet_csk_ca(sk);
 
@@ -286,18 +287,17 @@ static int dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
 	 */
 	if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
 	    ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
-		struct tcp_dctcp_info info;
-
-		memset(&info, 0, sizeof(info));
+		memset(info, 0, sizeof(struct tcp_dctcp_info));
 		if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
-			info.dctcp_enabled = 1;
-			info.dctcp_ce_state = (u16) ca->ce_state;
-			info.dctcp_alpha = ca->dctcp_alpha;
-			info.dctcp_ab_ecn = ca->acked_bytes_ecn;
-			info.dctcp_ab_tot = ca->acked_bytes_total;
+			info->dctcp.dctcp_enabled = 1;
+			info->dctcp.dctcp_ce_state = (u16) ca->ce_state;
+			info->dctcp.dctcp_alpha = ca->dctcp_alpha;
+			info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn;
+			info->dctcp.dctcp_ab_tot = ca->acked_bytes_total;
 		}
 
-		return nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info);
+		*attr = INET_DIAG_DCTCPINFO;
+		return sizeof(*info);
 	}
 	return 0;
 }
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 67476f085e48..f71002e4db0b 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -300,24 +300,25 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
 }
 
 /* Extract info for Tcp socket info provided via netlink. */
-static int tcp_illinois_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
+				union tcp_cc_info *info)
 {
 	const struct illinois *ca = inet_csk_ca(sk);
 
 	if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
-		struct tcpvegas_info info = {
-			.tcpv_enabled = 1,
-			.tcpv_rttcnt = ca->cnt_rtt,
-			.tcpv_minrtt = ca->base_rtt,
-		};
+		info->vegas.tcpv_enabled = 1;
+		info->vegas.tcpv_rttcnt = ca->cnt_rtt;
+		info->vegas.tcpv_minrtt = ca->base_rtt;
+		info->vegas.tcpv_rtt = 0;
 
-		if (info.tcpv_rttcnt > 0) {
+		if (info->vegas.tcpv_rttcnt > 0) {
 			u64 t = ca->sum_rtt;
 
-			do_div(t, info.tcpv_rttcnt);
-			info.tcpv_rtt = t;
+			do_div(t, info->vegas.tcpv_rttcnt);
+			info->vegas.tcpv_rtt = t;
 		}
-		return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
+		*attr = INET_DIAG_VEGASINFO;
+		return sizeof(struct tcpvegas_info);
 	}
 	return 0;
 }
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index c71a1b8f7bde..a6cea1d5e20d 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -286,18 +286,19 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 }
 
 /* Extract info for Tcp socket info provided via netlink. */
-int tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
+			  union tcp_cc_info *info)
 {
 	const struct vegas *ca = inet_csk_ca(sk);
+
 	if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
-		struct tcpvegas_info info = {
-			.tcpv_enabled = ca->doing_vegas_now,
-			.tcpv_rttcnt = ca->cntRTT,
-			.tcpv_rtt = ca->baseRTT,
-			.tcpv_minrtt = ca->minRTT,
-		};
-
-		return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
+		info->vegas.tcpv_enabled = ca->doing_vegas_now,
+		info->vegas.tcpv_rttcnt = ca->cntRTT,
+		info->vegas.tcpv_rtt = ca->baseRTT,
+		info->vegas.tcpv_minrtt = ca->minRTT,
+
+		*attr = INET_DIAG_VEGASINFO;
+		return sizeof(struct tcpvegas_info);
 	}
 	return 0;
 }
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
index e8a6b33cc61d..ef9da5306c68 100644
--- a/net/ipv4/tcp_vegas.h
+++ b/net/ipv4/tcp_vegas.h
@@ -19,6 +19,7 @@ void tcp_vegas_init(struct sock *sk);
 void tcp_vegas_state(struct sock *sk, u8 ca_state);
 void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
 void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
-int tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
+size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
+			  union tcp_cc_info *info);
 
 #endif	/* __TCP_VEGAS_H */
-- 
cgit v1.2.3


From 42fb23e2f57accbed69804e6d72ea3a88b8a84c4 Mon Sep 17 00:00:00 2001
From: Varka Bhadram <varkabhadram@gmail.com>
Date: Thu, 30 Apr 2015 17:44:52 +0200
Subject: mac802154: add description to mac802154 APIs

This patch adds the proper description to the mac802154 core APIs.

Signed-off-by: Varka Bhadram <varkab@cdac.in>
Acked-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/mac802154.h | 94 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 92 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index e18e7fd43f47..7df28a4c23f9 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -247,19 +247,109 @@ static inline void ieee802154_le64_to_be64(void *be64_dst, const void *le64_src)
 	__put_unaligned_memmove64(swab64p(le64_src), be64_dst);
 }
 
-/* Basic interface to register ieee802154 device */
+/**
+ * ieee802154_alloc_hw - Allocate a new hardware device
+ *
+ * This must be called once for each hardware device. The returned pointer
+ * must be used to refer to this device when calling other functions.
+ * mac802154 allocates a private data area for the driver pointed to by
+ * @priv in &struct ieee802154_hw, the size of this area is given as
+ * @priv_data_len.
+ *
+ * @priv_data_len: length of private data
+ * @ops: callbacks for this device
+ *
+ * Return: A pointer to the new hardware device, or %NULL on error.
+ */
 struct ieee802154_hw *
 ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops);
+
+/**
+ * ieee802154_free_hw - free hardware descriptor
+ *
+ * This function frees everything that was allocated, including the
+ * private data for the driver. You must call ieee802154_unregister_hw()
+ * before calling this function.
+ *
+ * @hw: the hardware to free
+ */
 void ieee802154_free_hw(struct ieee802154_hw *hw);
+
+/**
+ * ieee802154_register_hw - Register hardware device
+ *
+ * You must call this function before any other functions in
+ * mac802154. Note that before a hardware can be registered, you
+ * need to fill the contained wpan_phy's information.
+ *
+ * @hw: the device to register as returned by ieee802154_alloc_hw()
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
 int ieee802154_register_hw(struct ieee802154_hw *hw);
+
+/**
+ * ieee802154_unregister_hw - Unregister a hardware device
+ *
+ * This function instructs mac802154 to free allocated resources
+ * and unregister netdevices from the networking subsystem.
+ *
+ * @hw: the hardware to unregister
+ */
 void ieee802154_unregister_hw(struct ieee802154_hw *hw);
 
+/**
+ * ieee802154_rx - receive frame
+ *
+ * Use this function to hand received frames to mac802154. The receive
+ * buffer in @skb must start with an IEEE 802.15.4 header. In case of a
+ * paged @skb is used, the driver is recommended to put the ieee802154
+ * header of the frame on the linear part of the @skb to avoid memory
+ * allocation and/or memcpy by the stack.
+ *
+ * This function may not be called in IRQ context. Calls to this function
+ * for a single hardware must be synchronized against each other.
+ *
+ * @hw: the hardware this frame came in on
+ * @skb: the buffer to receive, owned by mac802154 after this call
+ */
 void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb);
+
+/**
+ * ieee802154_rx_irqsafe - receive frame
+ *
+ * Like ieee802154_rx() but can be called in IRQ context
+ * (internally defers to a tasklet.)
+ *
+ * @hw: the hardware this frame came in on
+ * @skb: the buffer to receive, owned by mac802154 after this call
+ * @lqi: link quality indicator
+ */
 void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb,
 			   u8 lqi);
-
+/**
+ * ieee802154_wake_queue - wake ieee802154 queue
+ * @hw: pointer as obtained from ieee802154_alloc_hw().
+ *
+ * Drivers should use this function instead of netif_wake_queue.
+ */
 void ieee802154_wake_queue(struct ieee802154_hw *hw);
+
+/**
+ * ieee802154_stop_queue - stop ieee802154 queue
+ * @hw: pointer as obtained from ieee802154_alloc_hw().
+ *
+ * Drivers should use this function instead of netif_stop_queue.
+ */
 void ieee802154_stop_queue(struct ieee802154_hw *hw);
+
+/**
+ * ieee802154_xmit_complete - frame transmission complete
+ *
+ * @hw: pointer as obtained from ieee802154_alloc_hw().
+ * @skb: buffer for transmission
+ * @ifs_handling: indicate interframe space handling
+ */
 void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb,
 			      bool ifs_handling);
 
-- 
cgit v1.2.3


From 5b4a10390460cccf17a9fac739e153d68cf25ef5 Mon Sep 17 00:00:00 2001
From: Varka Bhadram <varkabhadram@gmail.com>
Date: Thu, 30 Apr 2015 17:44:57 +0200
Subject: cfg802154: pass name_assign_type to rdev_add_virtual_intf()

This code is based on commit 6bab2e19c5ffd
("cfg80211: pass name_assign_type to rdev_add_virtual_intf()")

This will expose in sysfs whether the ifname of a IEEE-802.15.4
device is set by userspace or generated by the kernel.
We are using two types of name_assign_types
 o NET_NAME_ENUM: Default interface name provided by kernel
 o NET_NAME_USER: Interface name provided by user.

Signed-off-by: Varka Bhadram <varkab@cdac.in>
Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/cfg802154.h      |  2 ++
 net/ieee802154/nl-phy.c      |  5 ++++-
 net/ieee802154/nl802154.c    |  2 +-
 net/ieee802154/rdev-ops.h    | 10 +++++++---
 net/mac802154/cfg.c          |  9 ++++++---
 net/mac802154/ieee802154_i.h |  3 ++-
 net/mac802154/iface.c        |  5 +++--
 net/mac802154/main.c         |  3 ++-
 8 files changed, 27 insertions(+), 12 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index eeda67652766..6ea16c84293b 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -30,11 +30,13 @@ struct wpan_phy_cca;
 struct cfg802154_ops {
 	struct net_device * (*add_virtual_intf_deprecated)(struct wpan_phy *wpan_phy,
 							   const char *name,
+							   unsigned char name_assign_type,
 							   int type);
 	void	(*del_virtual_intf_deprecated)(struct wpan_phy *wpan_phy,
 					       struct net_device *dev);
 	int	(*add_virtual_intf)(struct wpan_phy *wpan_phy,
 				    const char *name,
+				    unsigned char name_assign_type,
 				    enum nl802154_iftype type,
 				    __le64 extended_addr);
 	int	(*del_virtual_intf)(struct wpan_phy *wpan_phy,
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index 1b9d25f6e898..346c6665d25e 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -175,6 +175,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info)
 	int rc = -ENOBUFS;
 	struct net_device *dev;
 	int type = __IEEE802154_DEV_INVALID;
+	unsigned char name_assign_type;
 
 	pr_debug("%s\n", __func__);
 
@@ -190,8 +191,10 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info)
 		if (devname[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1]
 				!= '\0')
 			return -EINVAL; /* phy name should be null-terminated */
+		name_assign_type = NET_NAME_USER;
 	} else  {
 		devname = "wpan%d";
+		name_assign_type = NET_NAME_ENUM;
 	}
 
 	if (strlen(devname) >= IFNAMSIZ)
@@ -221,7 +224,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	dev = rdev_add_virtual_intf_deprecated(wpan_phy_to_rdev(phy), devname,
-					       type);
+					       name_assign_type, type);
 	if (IS_ERR(dev)) {
 		rc = PTR_ERR(dev);
 		goto nla_put_failure;
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index a4daf91b8d0a..f3c12f6a4a39 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -589,7 +589,7 @@ static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info)
 
 	return rdev_add_virtual_intf(rdev,
 				     nla_data(info->attrs[NL802154_ATTR_IFNAME]),
-				     type, extended_addr);
+				     NET_NAME_USER, type, extended_addr);
 }
 
 static int nl802154_del_interface(struct sk_buff *skb, struct genl_info *info)
diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h
index 624413ee095f..7b5a9dd94fe5 100644
--- a/net/ieee802154/rdev-ops.h
+++ b/net/ieee802154/rdev-ops.h
@@ -8,10 +8,12 @@
 
 static inline struct net_device *
 rdev_add_virtual_intf_deprecated(struct cfg802154_registered_device *rdev,
-				 const char *name, int type)
+				 const char *name,
+				 unsigned char name_assign_type,
+				 int type)
 {
 	return rdev->ops->add_virtual_intf_deprecated(&rdev->wpan_phy, name,
-						      type);
+						      name_assign_type, type);
 }
 
 static inline void
@@ -23,13 +25,15 @@ rdev_del_virtual_intf_deprecated(struct cfg802154_registered_device *rdev,
 
 static inline int
 rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name,
+		      unsigned char name_assign_type,
 		      enum nl802154_iftype type, __le64 extended_addr)
 {
 	int ret;
 
 	trace_802154_rdev_add_virtual_intf(&rdev->wpan_phy, name, type,
 					   extended_addr);
-	ret = rdev->ops->add_virtual_intf(&rdev->wpan_phy, name, type,
+	ret = rdev->ops->add_virtual_intf(&rdev->wpan_phy, name,
+					  name_assign_type, type,
 					  extended_addr);
 	trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
 	return ret;
diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c
index 5d9f68c75e5f..70be9c799f8a 100644
--- a/net/mac802154/cfg.c
+++ b/net/mac802154/cfg.c
@@ -22,13 +22,14 @@
 
 static struct net_device *
 ieee802154_add_iface_deprecated(struct wpan_phy *wpan_phy,
-				const char *name, int type)
+				const char *name,
+				unsigned char name_assign_type, int type)
 {
 	struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
 	struct net_device *dev;
 
 	rtnl_lock();
-	dev = ieee802154_if_add(local, name, type,
+	dev = ieee802154_if_add(local, name, name_assign_type, type,
 				cpu_to_le64(0x0000000000000000ULL));
 	rtnl_unlock();
 
@@ -45,12 +46,14 @@ static void ieee802154_del_iface_deprecated(struct wpan_phy *wpan_phy,
 
 static int
 ieee802154_add_iface(struct wpan_phy *phy, const char *name,
+		     unsigned char name_assign_type,
 		     enum nl802154_iftype type, __le64 extended_addr)
 {
 	struct ieee802154_local *local = wpan_phy_priv(phy);
 	struct net_device *err;
 
-	err = ieee802154_if_add(local, name, type, extended_addr);
+	err = ieee802154_if_add(local, name, name_assign_type, type,
+				extended_addr);
 	return PTR_ERR_OR_ZERO(err);
 }
 
diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h
index bebd70ffc7a3..127ba18386fc 100644
--- a/net/mac802154/ieee802154_i.h
+++ b/net/mac802154/ieee802154_i.h
@@ -182,7 +182,8 @@ void ieee802154_iface_exit(void);
 void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata);
 struct net_device *
 ieee802154_if_add(struct ieee802154_local *local, const char *name,
-		  enum nl802154_iftype type, __le64 extended_addr);
+		  unsigned char name_assign_type, enum nl802154_iftype type,
+		  __le64 extended_addr);
 void ieee802154_remove_interfaces(struct ieee802154_local *local);
 
 #endif /* __IEEE802154_I_H */
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index 38b56f9d9386..91b75abbd1a1 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -522,7 +522,8 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
 
 struct net_device *
 ieee802154_if_add(struct ieee802154_local *local, const char *name,
-		  enum nl802154_iftype type, __le64 extended_addr)
+		  unsigned char name_assign_type, enum nl802154_iftype type,
+		  __le64 extended_addr)
 {
 	struct net_device *ndev = NULL;
 	struct ieee802154_sub_if_data *sdata = NULL;
@@ -531,7 +532,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name,
 	ASSERT_RTNL();
 
 	ndev = alloc_netdev(sizeof(*sdata) + local->hw.vif_data_size, name,
-			    NET_NAME_UNKNOWN, ieee802154_if_setup);
+			    name_assign_type, ieee802154_if_setup);
 	if (!ndev)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/net/mac802154/main.c b/net/mac802154/main.c
index 8500378c8318..68b9667323ec 100644
--- a/net/mac802154/main.c
+++ b/net/mac802154/main.c
@@ -161,7 +161,8 @@ int ieee802154_register_hw(struct ieee802154_hw *hw)
 
 	rtnl_lock();
 
-	dev = ieee802154_if_add(local, "wpan%d", NL802154_IFTYPE_NODE,
+	dev = ieee802154_if_add(local, "wpan%d", NET_NAME_ENUM,
+				NL802154_IFTYPE_NODE,
 				cpu_to_le64(0x0000000000000000ULL));
 	if (IS_ERR(dev)) {
 		rtnl_unlock();
-- 
cgit v1.2.3


From 4b32b5ad31a68a661f761c76dfd0d076636d3ae9 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 28 Apr 2015 13:03:06 -0700
Subject: ipv6: Stop rt6_info from using inet_peer's metrics

inet_peer is indexed by the dst address alone.  However, the fib6 tree
could have multiple routing entries (rt6_info) for the same dst. For
example,
1. A /128 dst via multiple gateways.
2. A RTF_CACHE route cloned from a /128 route.

In the above cases, all of them will share the same metrics and
step on each other.

This patch will steer away from inet_peer's metrics and use
dst_cow_metrics_generic() for everything.

Change Highlights:
1. Remove rt6_cow_metrics() which currently acquires metrics from
   inet_peer for DST_HOST route (i.e. /128 route).
2. Add rt6i_pmtu to take care of the pmtu update to avoid creating a
   full size metrics just to override the RTAX_MTU.
3. After (2), the RTF_CACHE route can also share the metrics with its
   dst.from route, by:
   dst_init_metrics(&cache_rt->dst, dst_metrics_ptr(cache_rt->dst.from), true);
4. Stop creating RTF_CACHE route by cloning another RTF_CACHE route.  Instead,
   directly clone from rt->dst.

   [ Currently, cloning from another RTF_CACHE is only possible during
     rt6_do_redirect().  Also, the old clone is removed from the tree
     immediately after the new clone is added. ]

   In case of cloning from an older redirect RTF_CACHE, it should work as
   before.

   In case of cloning from an older pmtu RTF_CACHE, this patch will forget
   the pmtu and re-learn it (if there is any) from the redirected route.

The _rt6i_peer and DST_METRICS_FORCE_OVERWRITE will be removed
in the next cleanup patch.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  10 +----
 net/ipv6/route.c      | 102 +++++++++++++++++++++++++++++---------------------
 2 files changed, 60 insertions(+), 52 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 20e80fa7bbdd..7383a8cc8f84 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -124,6 +124,7 @@ struct rt6_info {
 	unsigned long			_rt6i_peer;
 
 	u32				rt6i_metric;
+	u32				rt6i_pmtu;
 	/* more non-fragment space at head required */
 	unsigned short			rt6i_nfheader_len;
 	u8				rt6i_protocol;
@@ -189,15 +190,6 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout)
 	rt0->rt6i_flags |= RTF_EXPIRES;
 }
 
-static inline void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
-{
-	struct dst_entry *new = (struct dst_entry *) from;
-
-	rt->rt6i_flags &= ~RTF_EXPIRES;
-	dst_hold(new);
-	rt->dst.from = new;
-}
-
 static inline void ip6_rt_put(struct rt6_info *rt)
 {
 	/* dst_release() accepts a NULL parameter.
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index aa4cfdd13f28..4d6eb5d90cb4 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -92,6 +92,7 @@ static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 					   struct sk_buff *skb, u32 mtu);
 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
 					struct sk_buff *skb);
+static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
@@ -136,33 +137,12 @@ static struct inet_peer *rt6_get_peer_create(struct rt6_info *rt)
 
 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
 {
-	struct rt6_info *rt = (struct rt6_info *) dst;
-	struct inet_peer *peer;
-	u32 *p = NULL;
+	struct rt6_info *rt = (struct rt6_info *)dst;
 
-	if (!(rt->dst.flags & DST_HOST))
+	if (rt->rt6i_flags & RTF_CACHE)
+		return NULL;
+	else
 		return dst_cow_metrics_generic(dst, old);
-
-	peer = rt6_get_peer_create(rt);
-	if (peer) {
-		u32 *old_p = __DST_METRICS_PTR(old);
-		unsigned long prev, new;
-
-		p = peer->metrics;
-		if (inet_metrics_new(peer) ||
-		    (old & DST_METRICS_FORCE_OVERWRITE))
-			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
-
-		new = (unsigned long) p;
-		prev = cmpxchg(&dst->_metrics, old, new);
-
-		if (prev != old) {
-			p = __DST_METRICS_PTR(prev);
-			if (prev & DST_METRICS_READ_ONLY)
-				p = NULL;
-		}
-	}
-	return p;
 }
 
 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
@@ -323,8 +303,7 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 	struct inet6_dev *idev = rt->rt6i_idev;
 	struct dst_entry *from = dst->from;
 
-	if (!(rt->dst.flags & DST_HOST))
-		dst_destroy_metrics_generic(dst);
+	dst_destroy_metrics_generic(dst);
 
 	if (idev) {
 		rt->rt6i_idev = NULL;
@@ -333,11 +312,6 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 
 	dst->from = NULL;
 	dst_release(from);
-
-	if (rt6_has_peer(rt)) {
-		struct inet_peer *peer = rt6_peer_ptr(rt);
-		inet_putpeer(peer);
-	}
 }
 
 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -1003,6 +977,7 @@ redo_rt6_select:
 	goto redo_fib6_lookup_lock;
 
 out2:
+	rt6_dst_from_metrics_check(rt);
 	rt->dst.lastuse = jiffies;
 	rt->dst.__use++;
 
@@ -1111,6 +1086,13 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
  *	Destination cache support functions
  */
 
+static void rt6_dst_from_metrics_check(struct rt6_info *rt)
+{
+	if (rt->dst.from &&
+	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
+		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
+}
+
 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 {
 	struct rt6_info *rt;
@@ -1127,6 +1109,8 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 	if (rt6_check_expired(rt))
 		return NULL;
 
+	rt6_dst_from_metrics_check(rt);
+
 	return dst;
 }
 
@@ -1179,7 +1163,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 		if (mtu < IPV6_MIN_MTU)
 			mtu = IPV6_MIN_MTU;
 
-		dst_metric_set(dst, RTAX_MTU, mtu);
+		rt6->rt6i_pmtu = mtu;
 		rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
 	}
 }
@@ -1359,9 +1343,14 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
 
 static unsigned int ip6_mtu(const struct dst_entry *dst)
 {
+	const struct rt6_info *rt = (const struct rt6_info *)dst;
+	unsigned int mtu = rt->rt6i_pmtu;
 	struct inet6_dev *idev;
-	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
 
+	if (mtu)
+		goto out;
+
+	mtu = dst_metric_raw(dst, RTAX_MTU);
 	if (mtu)
 		goto out;
 
@@ -1947,12 +1936,27 @@ out:
  *	Misc support functions
  */
 
+static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
+{
+	BUG_ON(from->dst.from);
+
+	rt->rt6i_flags &= ~RTF_EXPIRES;
+	dst_hold(&from->dst);
+	rt->dst.from = &from->dst;
+	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
+}
+
 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
 				    const struct in6_addr *dest)
 {
 	struct net *net = dev_net(ort->dst.dev);
-	struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
-					    ort->rt6i_table);
+	struct rt6_info *rt;
+
+	if (ort->rt6i_flags & RTF_CACHE)
+		ort = (struct rt6_info *)ort->dst.from;
+
+	rt = ip6_dst_alloc(net, ort->dst.dev, 0,
+			   ort->rt6i_table);
 
 	if (rt) {
 		rt->dst.input = ort->dst.input;
@@ -1961,7 +1965,6 @@ static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
 
 		rt->rt6i_dst.addr = *dest;
 		rt->rt6i_dst.plen = 128;
-		dst_copy_metrics(&rt->dst, &ort->dst);
 		rt->dst.error = ort->dst.error;
 		rt->rt6i_idev = ort->rt6i_idev;
 		if (rt->rt6i_idev)
@@ -2393,11 +2396,20 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
 	   PMTU discouvery.
 	 */
 	if (rt->dst.dev == arg->dev &&
-	    !dst_metric_locked(&rt->dst, RTAX_MTU) &&
-	    (dst_mtu(&rt->dst) >= arg->mtu ||
-	     (dst_mtu(&rt->dst) < arg->mtu &&
-	      dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
-		dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
+	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
+		if (rt->rt6i_flags & RTF_CACHE) {
+			/* For RTF_CACHE with rt6i_pmtu == 0
+			 * (i.e. a redirected route),
+			 * the metrics of its rt->dst.from has already
+			 * been updated.
+			 */
+			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
+				rt->rt6i_pmtu = arg->mtu;
+		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
+			   (dst_mtu(&rt->dst) < arg->mtu &&
+			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
+			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
+		}
 	}
 	return 0;
 }
@@ -2627,6 +2639,7 @@ static int rt6_fill_node(struct net *net,
 			 int iif, int type, u32 portid, u32 seq,
 			 int prefix, int nowait, unsigned int flags)
 {
+	u32 metrics[RTAX_MAX];
 	struct rtmsg *rtm;
 	struct nlmsghdr *nlh;
 	long expires;
@@ -2740,7 +2753,10 @@ static int rt6_fill_node(struct net *net,
 			goto nla_put_failure;
 	}
 
-	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
+	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
+	if (rt->rt6i_pmtu)
+		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
+	if (rtnetlink_put_metrics(skb, metrics) < 0)
 		goto nla_put_failure;
 
 	if (rt->rt6i_flags & RTF_GATEWAY) {
-- 
cgit v1.2.3


From afc4eef80c92b199357db3570d3c9c7631d699ff Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 28 Apr 2015 13:03:07 -0700
Subject: ipv6: Remove DST_METRICS_FORCE_OVERWRITE and _rt6i_peer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_rt6i_peer is no longer needed after the last patch,
'ipv6: Stop rt6_info from using inet_peer's metrics'.

DST_METRICS_FORCE_OVERWRITE is added by
commit e5fd387ad5b3 ("ipv6: do not overwrite inetpeer metrics prematurely").
Since inetpeer is no longer used for metrics, this bit is also not needed.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Michal Kubeček <mkubecek@suse.cz>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h       |  6 ------
 include/net/ip6_fib.h   | 31 -------------------------------
 net/ipv6/route.c        | 36 +-----------------------------------
 net/ipv6/xfrm6_policy.c | 14 --------------
 4 files changed, 1 insertion(+), 86 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dst.h b/include/net/dst.h
index 0fb99a26e973..22aa93f165f1 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -109,7 +109,6 @@ u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old);
 extern const u32 dst_default_metrics[];
 
 #define DST_METRICS_READ_ONLY		0x1UL
-#define DST_METRICS_FORCE_OVERWRITE	0x2UL
 #define DST_METRICS_FLAGS		0x3UL
 #define __DST_METRICS_PTR(Y)	\
 	((u32 *)((Y) & ~DST_METRICS_FLAGS))
@@ -120,11 +119,6 @@ static inline bool dst_metrics_read_only(const struct dst_entry *dst)
 	return dst->_metrics & DST_METRICS_READ_ONLY;
 }
 
-static inline void dst_metrics_set_force_overwrite(struct dst_entry *dst)
-{
-	dst->_metrics |= DST_METRICS_FORCE_OVERWRITE;
-}
-
 void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old);
 
 static inline void dst_destroy_metrics_generic(struct dst_entry *dst)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 7383a8cc8f84..e00018011028 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -121,7 +121,6 @@ struct rt6_info {
 	struct rt6key			rt6i_prefsrc;
 
 	struct inet6_dev		*rt6i_idev;
-	unsigned long			_rt6i_peer;
 
 	u32				rt6i_metric;
 	u32				rt6i_pmtu;
@@ -130,36 +129,6 @@ struct rt6_info {
 	u8				rt6i_protocol;
 };
 
-static inline struct inet_peer *rt6_peer_ptr(struct rt6_info *rt)
-{
-	return inetpeer_ptr(rt->_rt6i_peer);
-}
-
-static inline bool rt6_has_peer(struct rt6_info *rt)
-{
-	return inetpeer_ptr_is_peer(rt->_rt6i_peer);
-}
-
-static inline void __rt6_set_peer(struct rt6_info *rt, struct inet_peer *peer)
-{
-	__inetpeer_ptr_set_peer(&rt->_rt6i_peer, peer);
-}
-
-static inline bool rt6_set_peer(struct rt6_info *rt, struct inet_peer *peer)
-{
-	return inetpeer_ptr_set_peer(&rt->_rt6i_peer, peer);
-}
-
-static inline void rt6_init_peer(struct rt6_info *rt, struct inet_peer_base *base)
-{
-	inetpeer_init_ptr(&rt->_rt6i_peer, base);
-}
-
-static inline void rt6_transfer_peer(struct rt6_info *rt, struct rt6_info *ort)
-{
-	inetpeer_transfer_peer(&rt->_rt6i_peer, &ort->_rt6i_peer);
-}
-
 static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
 {
 	return ((struct rt6_info *)dst)->rt6i_idev;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4d6eb5d90cb4..352271154ddb 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -105,36 +105,6 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
 					   const struct in6_addr *gwaddr, int ifindex);
 #endif
 
-static void rt6_bind_peer(struct rt6_info *rt, int create)
-{
-	struct inet_peer_base *base;
-	struct inet_peer *peer;
-
-	base = inetpeer_base_ptr(rt->_rt6i_peer);
-	if (!base)
-		return;
-
-	peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
-	if (peer) {
-		if (!rt6_set_peer(rt, peer))
-			inet_putpeer(peer);
-	}
-}
-
-static struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create)
-{
-	if (rt6_has_peer(rt))
-		return rt6_peer_ptr(rt);
-
-	rt6_bind_peer(rt, create);
-	return (rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL);
-}
-
-static struct inet_peer *rt6_get_peer_create(struct rt6_info *rt)
-{
-	return __rt6_get_peer(rt, 1);
-}
-
 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
 {
 	struct rt6_info *rt = (struct rt6_info *)dst;
@@ -291,7 +261,6 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
 		struct dst_entry *dst = &rt->dst;
 
 		memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
-		rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
 		INIT_LIST_HEAD(&rt->rt6i_siblings);
 	}
 	return rt;
@@ -1052,7 +1021,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
 		new = &rt->dst;
 
 		memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
-		rt6_init_peer(rt, net->ipv6.peers);
 
 		new->__use = 1;
 		new->input = dst_discard;
@@ -1597,10 +1565,8 @@ int ip6_route_add(struct fib6_config *cfg)
 
 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
 	rt->rt6i_dst.plen = cfg->fc_dst_len;
-	if (rt->rt6i_dst.plen == 128) {
+	if (rt->rt6i_dst.plen == 128)
 		rt->dst.flags |= DST_HOST;
-		dst_metrics_set_force_overwrite(&rt->dst);
-	}
 
 #ifdef CONFIG_IPV6_SUBTREES
 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index f337a908a76a..6ae256b4c55a 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -71,13 +71,6 @@ static int xfrm6_get_tos(const struct flowi *fl)
 	return 0;
 }
 
-static void xfrm6_init_dst(struct net *net, struct xfrm_dst *xdst)
-{
-	struct rt6_info *rt = (struct rt6_info *)xdst;
-
-	rt6_init_peer(rt, net->ipv6.peers);
-}
-
 static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst,
 			   int nfheader_len)
 {
@@ -106,8 +99,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 		return -ENODEV;
 	}
 
-	rt6_transfer_peer(&xdst->u.rt6, rt);
-
 	/* Sheit... I remember I did this right. Apparently,
 	 * it was magically lost, so this code needs audit */
 	xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST |
@@ -255,10 +246,6 @@ static void xfrm6_dst_destroy(struct dst_entry *dst)
 	if (likely(xdst->u.rt6.rt6i_idev))
 		in6_dev_put(xdst->u.rt6.rt6i_idev);
 	dst_destroy_metrics_generic(dst);
-	if (rt6_has_peer(&xdst->u.rt6)) {
-		struct inet_peer *peer = rt6_peer_ptr(&xdst->u.rt6);
-		inet_putpeer(peer);
-	}
 	xfrm_dst_destroy(xdst);
 }
 
@@ -308,7 +295,6 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
 	.get_saddr =		xfrm6_get_saddr,
 	.decode_session =	_decode_session6,
 	.get_tos =		xfrm6_get_tos,
-	.init_dst =		xfrm6_init_dst,
 	.init_path =		xfrm6_init_path,
 	.fill_dst =		xfrm6_fill_dst,
 	.blackhole_route =	ip6_blackhole_route,
-- 
cgit v1.2.3


From 4749c3ef854e3a5d3dd3cc0ccd2dcb7e05d583bd Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 30 Apr 2015 12:12:00 +0200
Subject: net: sched: remove TC_MUNGED bits

Not used.

pedit sets TC_MUNGED when packet content was altered, but all the core
does is unset MUNGED again and then set OK2MUNGE.

And the latter isn't tested anywhere. So lets remove both
TC_MUNGED and TC_OK2MUNGE.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/tc-actions-env-rules.txt | 2 --
 include/net/sch_generic.h                         | 2 --
 include/uapi/linux/pkt_cls.h                      | 3 +++
 net/sched/act_api.c                               | 5 -----
 net/sched/act_pedit.c                             | 5 +----
 5 files changed, 4 insertions(+), 13 deletions(-)

(limited to 'include/net')

diff --git a/Documentation/networking/tc-actions-env-rules.txt b/Documentation/networking/tc-actions-env-rules.txt
index 70d6cf608251..95c71716b2e2 100644
--- a/Documentation/networking/tc-actions-env-rules.txt
+++ b/Documentation/networking/tc-actions-env-rules.txt
@@ -14,8 +14,6 @@ resets them for you, so invoke skb_act_clone() rather than skb_clone().
 
 2) If you munge any packet thou shalt call pskb_expand_head in the case
 someone else is referencing the skb. After that you "own" the skb.
-You must also tell us if it is ok to munge the packet (TC_OK2MUNGE),
-this way any action downstream can stomp on the packet.
 
 3) Dropping packets you don't own is a no-no. You simply return
 TC_ACT_SHOT to the caller and they will drop it.
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 6d778efcfdfd..994b5a092f33 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -755,8 +755,6 @@ static inline struct sk_buff *skb_act_clone(struct sk_buff *skb, gfp_t gfp_mask,
 
 	if (n) {
 		n->tc_verd = SET_TC_VERD(n->tc_verd, 0);
-		n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
-		n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
 	}
 	return n;
 }
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index bf08e76bf505..6810ca43a80a 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -35,6 +35,8 @@ bits 9,10,11: redirect counter -  redirect TTL. Loop avoidance
  *
  * */
 
+#ifndef __KERNEL__
+/* backwards compat for userspace only */
 #define TC_MUNGED          _TC_MAKEMASK1(0)
 #define SET_TC_MUNGED(v)   ( TC_MUNGED | (v & ~TC_MUNGED))
 #define CLR_TC_MUNGED(v)   ( v & ~TC_MUNGED)
@@ -42,6 +44,7 @@ bits 9,10,11: redirect counter -  redirect TTL. Loop avoidance
 #define TC_OK2MUNGE        _TC_MAKEMASK1(1)
 #define SET_TC_OK2MUNGE(v)   ( TC_OK2MUNGE | (v & ~TC_OK2MUNGE))
 #define CLR_TC_OK2MUNGE(v)   ( v & ~TC_OK2MUNGE)
+#endif
 
 #define S_TC_VERD          _TC_MAKE32(2)
 #define M_TC_VERD          _TC_MAKEMASK(4,S_TC_VERD)
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 3d43e4979f27..af427a3dbcba 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -392,11 +392,6 @@ int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions,
 	list_for_each_entry(a, actions, list) {
 repeat:
 		ret = a->ops->act(skb, a, res);
-		if (TC_MUNGED & skb->tc_verd) {
-			/* copied already, allow trampling */
-			skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
-			skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd);
-		}
 		if (ret == TC_ACT_REPEAT)
 			goto repeat;	/* we need a ttl - JHS */
 		if (ret != TC_ACT_PIPE)
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 59649d588d79..17e6d6669c7f 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -108,7 +108,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 		     struct tcf_result *res)
 {
 	struct tcf_pedit *p = a->priv;
-	int i, munged = 0;
+	int i;
 	unsigned int off;
 
 	if (skb_unclone(skb, GFP_ATOMIC))
@@ -156,11 +156,8 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 			*ptr = ((*ptr & tkey->mask) ^ tkey->val);
 			if (ptr == &_data)
 				skb_store_bits(skb, off + offset, ptr, 4);
-			munged++;
 		}
 
-		if (munged)
-			skb->tc_verd = SET_TC_MUNGED(skb->tc_verd);
 		goto done;
 	} else
 		WARN(1, "pedit BUG: index %d\n", p->tcf_index);
-- 
cgit v1.2.3


From 82a584b7cd366511a22e37675b029cf2fb58e291 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Wed, 29 Apr 2015 15:33:21 -0700
Subject: ipv6: Flow label state ranges

This patch divides the IPv6 flow label space into two ranges:
0-7ffff is reserved for flow label manager, 80000-fffff will be
used for creating auto flow labels (per RFC6438). This only affects how
labels are set on transmit, it does not affect receive. This range split
can be disbaled by systcl.

Background:

IPv6 flow labels have been an unmitigated disappointment thus far
in the lifetime of IPv6. Support in HW devices to use them for ECMP
is lacking, and OSes don't turn them on by default. If we had these
we could get much better hashing in IPv6 networks without resorting
to DPI, possibly eliminating some of the motivations to to define new
encaps in UDP just for getting ECMP.

Unfortunately, the initial specfications of IPv6 did not clarify
how they are to be used. There has always been a vague concept that
these can be used for ECMP, flow hashing, etc. and we do now have a
good standard how to this in RFC6438. The problem is that flow labels
can be either stateful or stateless (as in RFC6438), and we are
presented with the possibility that a stateless label may collide
with a stateful one.  Attempts to split the flow label space were
rejected in IETF. When we added support in Linux for RFC6438, we
could not turn on flow labels by default due to this conflict.

This patch splits the flow label space and should give us
a path to enabling auto flow labels by default for all IPv6 packets.
This is an API change so we need to consider compatibility with
existing deployment. The stateful range is chosen to be the lower
values in hopes that most uses would have chosen small numbers.

Once we resolve the stateless/stateful issue, we can proceed to
look at enabling RFC6438 flow labels by default (starting with
scaled testing).

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 8 ++++++++
 include/net/ipv6.h                     | 9 +++++++--
 include/net/netns/ipv6.h               | 1 +
 net/ipv6/af_inet6.c                    | 1 +
 net/ipv6/ip6_flowlabel.c               | 4 ++++
 net/ipv6/sysctl_net_ipv6.c             | 8 ++++++++
 6 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 071fb18dc57c..5095c63e50ed 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1213,6 +1213,14 @@ auto_flowlabels - BOOLEAN
 	FALSE: disabled
 	Default: false
 
+flowlabel_state_ranges - BOOLEAN
+	Split the flow label number space into two ranges. 0-0x7FFFF is
+	reserved for the IPv6 flow manager facility, 0x80000-0xFFFFF
+	is reserved for stateless flow labels as described in RFC6437.
+	TRUE: enabled
+	FALSE: disabled
+	Default: true
+
 anycast_src_echo_reply - BOOLEAN
 	Controls the use of anycast addresses as source addresses for ICMPv6
 	echo reply
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index eec8ad3c9843..53d25ef1699a 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -239,8 +239,10 @@ struct ip6_flowlabel {
 	struct net		*fl_net;
 };
 
-#define IPV6_FLOWINFO_MASK	cpu_to_be32(0x0FFFFFFF)
-#define IPV6_FLOWLABEL_MASK	cpu_to_be32(0x000FFFFF)
+#define IPV6_FLOWINFO_MASK		cpu_to_be32(0x0FFFFFFF)
+#define IPV6_FLOWLABEL_MASK		cpu_to_be32(0x000FFFFF)
+#define IPV6_FLOWLABEL_STATELESS_FLAG	cpu_to_be32(0x00080000)
+
 #define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK)
 #define IPV6_TCLASS_SHIFT	20
 
@@ -719,6 +721,9 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
 		hash ^= hash >> 12;
 
 		flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
+
+		if (net->ipv6.sysctl.flowlabel_state_ranges)
+			flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG;
 	}
 
 	return flowlabel;
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index d2527bf81142..8d93544a2d2b 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -34,6 +34,7 @@ struct netns_sysctl_ipv6 {
 	int fwmark_reflect;
 	int idgen_retries;
 	int idgen_delay;
+	int flowlabel_state_ranges;
 };
 
 struct netns_ipv6 {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index eef63b394c5a..4632afa57e05 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -768,6 +768,7 @@ static int __net_init inet6_net_init(struct net *net)
 	net->ipv6.sysctl.auto_flowlabels = 0;
 	net->ipv6.sysctl.idgen_retries = 3;
 	net->ipv6.sysctl.idgen_delay = 1 * HZ;
+	net->ipv6.sysctl.flowlabel_state_ranges = 1;
 	atomic_set(&net->ipv6.fib6_sernum, 1);
 
 	err = ipv6_init_mibs(net);
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index d491125011c4..1f9ebe3cbb4a 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -595,6 +595,10 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
 		if (freq.flr_label & ~IPV6_FLOWLABEL_MASK)
 			return -EINVAL;
 
+		if (net->ipv6.sysctl.flowlabel_state_ranges &&
+		    (freq.flr_label & IPV6_FLOWLABEL_STATELESS_FLAG))
+			return -ERANGE;
+
 		fl = fl_create(net, sk, &freq, optval, optlen, &err);
 		if (!fl)
 			return err;
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index abcc79f649b3..4e705add4f18 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -68,6 +68,13 @@ static struct ctl_table ipv6_table_template[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
+	{
+		.procname	= "flowlabel_state_ranges",
+		.data		= &init_net.ipv6.sysctl.flowlabel_state_ranges,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
@@ -109,6 +116,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
 	ipv6_table[4].data = &net->ipv6.sysctl.fwmark_reflect;
 	ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries;
 	ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay;
+	ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges;
 
 	ipv6_route_table = ipv6_route_sysctl_init(net);
 	if (!ipv6_route_table)
-- 
cgit v1.2.3


From a5d28090405038ca1f40c13f38d6d4285456efee Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 30 Apr 2015 09:40:40 -0700
Subject: codel: fix maxpacket/mtu confusion

Under presence of TSO/GSO/GRO packets, codel at low rates can be quite
useless. In following example, not a single packet was ever dropped,
while average delay in codel queue is ~100 ms !

qdisc codel 0: parent 1:12 limit 16000p target 5.0ms interval 100.0ms
 Sent 134376498 bytes 88797 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 13626b 3p requeues 0
  count 0 lastcount 0 ldelay 96.9ms drop_next 0us
  maxpacket 9084 ecn_mark 0 drop_overlimit 0

This comes from a confusion of what should be the minimal backlog. It is
pretty clear it is not 64KB or whatever max GSO packet ever reached the
qdisc.

codel intent was to use MTU of the device.

After the fix, we finally drop some packets, and rtt/cwnd of my single
TCP flow are meeting our expectations.

qdisc codel 0: parent 1:12 limit 16000p target 5.0ms interval 100.0ms
 Sent 102798497 bytes 67912 pkt (dropped 1365, overlimits 0 requeues 0)
 backlog 6056b 3p requeues 0
  count 1 lastcount 1 ldelay 36.3ms drop_next 0us
  maxpacket 10598 ecn_mark 0 drop_overlimit 0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Kathleen Nichols <nichols@pollere.com>
Cc: Dave Taht <dave.taht@gmail.com>
Cc: Van Jacobson <vanj@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/codel.h      | 10 +++++++---
 net/sched/sch_codel.c    |  2 +-
 net/sched/sch_fq_codel.c |  2 +-
 3 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/codel.h b/include/net/codel.h
index aeee28081245..1e18005f7f65 100644
--- a/include/net/codel.h
+++ b/include/net/codel.h
@@ -120,11 +120,13 @@ static inline u32 codel_time_to_us(codel_time_t val)
  * struct codel_params - contains codel parameters
  * @target:	target queue size (in time units)
  * @interval:	width of moving time window
+ * @mtu:	device mtu, or minimal queue backlog in bytes.
  * @ecn:	is Explicit Congestion Notification enabled
  */
 struct codel_params {
 	codel_time_t	target;
 	codel_time_t	interval;
+	u32		mtu;
 	bool		ecn;
 };
 
@@ -166,10 +168,12 @@ struct codel_stats {
 	u32		ecn_mark;
 };
 
-static void codel_params_init(struct codel_params *params)
+static void codel_params_init(struct codel_params *params,
+			      const struct Qdisc *sch)
 {
 	params->interval = MS2TIME(100);
 	params->target = MS2TIME(5);
+	params->mtu = psched_mtu(qdisc_dev(sch));
 	params->ecn = false;
 }
 
@@ -180,7 +184,7 @@ static void codel_vars_init(struct codel_vars *vars)
 
 static void codel_stats_init(struct codel_stats *stats)
 {
-	stats->maxpacket = 256;
+	stats->maxpacket = 0;
 }
 
 /*
@@ -234,7 +238,7 @@ static bool codel_should_drop(const struct sk_buff *skb,
 		stats->maxpacket = qdisc_pkt_len(skb);
 
 	if (codel_time_before(vars->ldelay, params->target) ||
-	    sch->qstats.backlog <= stats->maxpacket) {
+	    sch->qstats.backlog <= params->mtu) {
 		/* went below - stay below for at least interval */
 		vars->first_above_time = 0;
 		return false;
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index de28f8e968e8..7a0bdb16ac92 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -164,7 +164,7 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt)
 
 	sch->limit = DEFAULT_CODEL_LIMIT;
 
-	codel_params_init(&q->params);
+	codel_params_init(&q->params, sch);
 	codel_vars_init(&q->vars);
 	codel_stats_init(&q->stats);
 
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 1e52decb7b59..c244c45b78d7 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -391,7 +391,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
 	q->perturbation = prandom_u32();
 	INIT_LIST_HEAD(&q->new_flows);
 	INIT_LIST_HEAD(&q->old_flows);
-	codel_params_init(&q->cparams);
+	codel_params_init(&q->cparams, sch);
 	codel_stats_init(&q->cstats);
 	q->cparams.ecn = true;
 
-- 
cgit v1.2.3


From 2f59e1ebaa7f762c8825871b5486b5f5b4fa952f Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Fri, 1 May 2015 11:30:17 -0700
Subject: net: Add flow_keys digest

Some users of flow keys (well just sch_choke now) need to pass
flow_keys in skbuff cb, and use them for exact comparisons of flows
so that skb->hash is not sufficient. In order to increase size of
the flow_keys structure, we introduce another structure for
the purpose of passing flow keys in skbuff cb. We limit this structure
to sixteen bytes, and we will technically treat this as a digest of
flow_keys struct hence its name flow_keys_digest. In the first
incaranation we just copy the flow_keys structure up to 16 bytes--
this is the same information previously passed in the cb. In the
future, we'll adapt this for larger flow_keys and could use something
like SHA-1 over the whole flow_keys to improve the quality of the
digest.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_keys.h   | 16 ++++++++++++++++
 net/core/flow_dissector.c | 27 +++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h
index dc8fd81412bf..6d6ef626811a 100644
--- a/include/net/flow_keys.h
+++ b/include/net/flow_keys.h
@@ -42,4 +42,20 @@ static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8
 u32 flow_hash_from_keys(struct flow_keys *keys);
 unsigned int flow_get_hlen(const unsigned char *data, unsigned int max_len,
 			   __be16 protocol);
+
+/* struct flow_keys_digest:
+ *
+ * This structure is used to hold a digest of the full flow keys. This is a
+ * larger "hash" of a flow to allow definitively matching specific flows where
+ * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so
+ * that it can by used in CB of skb (see sch_choke for an example).
+ */
+#define FLOW_KEYS_DIGEST_LEN	16
+struct flow_keys_digest {
+	u8	data[FLOW_KEYS_DIGEST_LEN];
+};
+
+void make_flow_keys_digest(struct flow_keys_digest *digest,
+			   const struct flow_keys *flow);
+
 #endif
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index b1df995ef06c..d3acc4dff4ae 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -310,6 +310,33 @@ static inline u32 ___skb_get_hash(const struct sk_buff *skb,
 	return __flow_hash_from_keys(keys, keyval);
 }
 
+struct _flow_keys_digest_data {
+	__be16	n_proto;
+	u8	ip_proto;
+	u8	padding;
+	__be32	ports;
+	__be32	src;
+	__be32	dst;
+};
+
+void make_flow_keys_digest(struct flow_keys_digest *digest,
+			   const struct flow_keys *flow)
+{
+	struct _flow_keys_digest_data *data =
+	    (struct _flow_keys_digest_data *)digest;
+
+	BUILD_BUG_ON(sizeof(*data) > sizeof(*digest));
+
+	memset(digest, 0, sizeof(*digest));
+
+	data->n_proto = flow->n_proto;
+	data->ip_proto = flow->ip_proto;
+	data->ports = flow->ports;
+	data->src = flow->src;
+	data->dst = flow->dst;
+}
+EXPORT_SYMBOL(make_flow_keys_digest);
+
 /*
  * __skb_get_hash: calculate a flow hash based on src/dst addresses
  * and src/dst port numbers.  Sets hash in skb to non-zero hash value
-- 
cgit v1.2.3


From ff419b3f95ab7a97c5f72876b53f12a249dacc2a Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 26 Apr 2015 20:13:34 -0700
Subject: mac80211: fix 90 kernel-doc warnings

Eliminate 90 of these warnings:

Warning(..//include/net/mac80211.h:1682): No description found for parameter 'drv_priv[0]'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index b4bef1152c05..8e3668b44c29 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1666,6 +1666,8 @@ struct ieee80211_tx_control {
  * @sta: station table entry, %NULL for per-vif queue
  * @tid: the TID for this queue (unused for per-vif queue)
  * @ac: the AC for this queue
+ * @drv_priv: data area for driver use, will always be aligned to
+ *	sizeof(void *).
  *
  * The driver can obtain packets from this queue by calling
  * ieee80211_tx_dequeue().
-- 
cgit v1.2.3


From 9afd85c9e4552b276e2f4cfefd622bdeeffbbf26 Mon Sep 17 00:00:00 2001
From: Linus Lüssing <linus.luessing@c0d3.blue>
Date: Sat, 2 May 2015 14:01:07 +0200
Subject: net: Export IGMP/MLD message validation code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With this patch, the IGMP and MLD message validation functions are moved
from the bridge code to IPv4/IPv6 multicast files. Some small
refactoring was done to enhance readibility and to iron out some
differences in behaviour between the IGMP and MLD parsing code (e.g. the
skb-cloning of MLD messages is now only done if necessary, just like the
IGMP part always did).

Finally, these IGMP and MLD message validation functions are exported so
that not only the bridge can use it but batman-adv later, too.

Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h      |   1 +
 include/linux/skbuff.h    |   3 +
 include/net/addrconf.h    |   1 +
 net/bridge/br_multicast.c | 218 +++++++---------------------------------------
 net/core/skbuff.c         |  87 ++++++++++++++++++
 net/ipv4/igmp.c           | 162 ++++++++++++++++++++++++++++++++++
 net/ipv6/Makefile         |   1 +
 net/ipv6/mcast_snoop.c    | 213 ++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 498 insertions(+), 188 deletions(-)
 create mode 100644 net/ipv6/mcast_snoop.c

(limited to 'include/net')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 2c677afeea47..193ad488d3e2 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -130,5 +130,6 @@ extern void ip_mc_unmap(struct in_device *);
 extern void ip_mc_remap(struct in_device *);
 extern void ip_mc_dec_group(struct in_device *in_dev, __be32 addr);
 extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr);
+int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed);
 
 #endif
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index acb83e249e3f..9c2f793573fa 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3419,6 +3419,9 @@ static inline void skb_checksum_none_assert(const struct sk_buff *skb)
 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
 
 int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
+struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
+				     unsigned int transport_len,
+				     __sum16(*skb_chkf)(struct sk_buff *skb));
 
 u32 skb_get_poff(const struct sk_buff *skb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 80456f72d70a..def59d3a34d5 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -142,6 +142,7 @@ void ipv6_mc_unmap(struct inet6_dev *idev);
 void ipv6_mc_remap(struct inet6_dev *idev);
 void ipv6_mc_init_dev(struct inet6_dev *idev);
 void ipv6_mc_destroy_dev(struct inet6_dev *idev);
+int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed);
 void addrconf_dad_failure(struct inet6_ifaddr *ifp);
 
 bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index b52f4cb8aee9..2d69d5cab52f 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -975,9 +975,6 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
 	int err = 0;
 	__be32 group;
 
-	if (!pskb_may_pull(skb, sizeof(*ih)))
-		return -EINVAL;
-
 	ih = igmpv3_report_hdr(skb);
 	num = ntohs(ih->ngrec);
 	len = sizeof(*ih);
@@ -1248,25 +1245,14 @@ static int br_ip4_multicast_query(struct net_bridge *br,
 			max_delay = 10 * HZ;
 			group = 0;
 		}
-	} else {
-		if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) {
-			err = -EINVAL;
-			goto out;
-		}
-
+	} else if (skb->len >= sizeof(*ih3)) {
 		ih3 = igmpv3_query_hdr(skb);
 		if (ih3->nsrcs)
 			goto out;
 
 		max_delay = ih3->code ?
 			    IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1;
-	}
-
-	/* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer
-	 * all-systems destination addresses (224.0.0.1) for general queries
-	 */
-	if (!group && iph->daddr != htonl(INADDR_ALLHOSTS_GROUP)) {
-		err = -EINVAL;
+	} else {
 		goto out;
 	}
 
@@ -1329,12 +1315,6 @@ static int br_ip6_multicast_query(struct net_bridge *br,
 	    (port && port->state == BR_STATE_DISABLED))
 		goto out;
 
-	/* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */
-	if (!(ipv6_addr_type(&ip6h->saddr) & IPV6_ADDR_LINKLOCAL)) {
-		err = -EINVAL;
-		goto out;
-	}
-
 	if (skb->len == sizeof(*mld)) {
 		if (!pskb_may_pull(skb, sizeof(*mld))) {
 			err = -EINVAL;
@@ -1358,14 +1338,6 @@ static int br_ip6_multicast_query(struct net_bridge *br,
 
 	is_general_query = group && ipv6_addr_any(group);
 
-	/* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer
-	 * all-nodes destination address (ff02::1) for general queries
-	 */
-	if (is_general_query && !ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) {
-		err = -EINVAL;
-		goto out;
-	}
-
 	if (is_general_query) {
 		saddr.proto = htons(ETH_P_IPV6);
 		saddr.u.ip6 = ip6h->saddr;
@@ -1557,66 +1529,22 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 				 struct sk_buff *skb,
 				 u16 vid)
 {
-	struct sk_buff *skb2 = skb;
-	const struct iphdr *iph;
+	struct sk_buff *skb_trimmed = NULL;
 	struct igmphdr *ih;
-	unsigned int len;
-	unsigned int offset;
 	int err;
 
-	/* We treat OOM as packet loss for now. */
-	if (!pskb_may_pull(skb, sizeof(*iph)))
-		return -EINVAL;
-
-	iph = ip_hdr(skb);
-
-	if (iph->ihl < 5 || iph->version != 4)
-		return -EINVAL;
-
-	if (!pskb_may_pull(skb, ip_hdrlen(skb)))
-		return -EINVAL;
-
-	iph = ip_hdr(skb);
+	err = ip_mc_check_igmp(skb, &skb_trimmed);
 
-	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
-		return -EINVAL;
-
-	if (iph->protocol != IPPROTO_IGMP) {
-		if (!ipv4_is_local_multicast(iph->daddr))
+	if (err == -ENOMSG) {
+		if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr))
 			BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
 		return 0;
+	} else if (err < 0) {
+		return err;
 	}
 
-	len = ntohs(iph->tot_len);
-	if (skb->len < len || len < ip_hdrlen(skb))
-		return -EINVAL;
-
-	if (skb->len > len) {
-		skb2 = skb_clone(skb, GFP_ATOMIC);
-		if (!skb2)
-			return -ENOMEM;
-
-		err = pskb_trim_rcsum(skb2, len);
-		if (err)
-			goto err_out;
-	}
-
-	len -= ip_hdrlen(skb2);
-	offset = skb_network_offset(skb2) + ip_hdrlen(skb2);
-	__skb_pull(skb2, offset);
-	skb_reset_transport_header(skb2);
-
-	err = -EINVAL;
-	if (!pskb_may_pull(skb2, sizeof(*ih)))
-		goto out;
-
-	if (skb_checksum_simple_validate(skb2))
-		goto out;
-
-	err = 0;
-
 	BR_INPUT_SKB_CB(skb)->igmp = 1;
-	ih = igmp_hdr(skb2);
+	ih = igmp_hdr(skb);
 
 	switch (ih->type) {
 	case IGMP_HOST_MEMBERSHIP_REPORT:
@@ -1625,21 +1553,19 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 		err = br_ip4_multicast_add_group(br, port, ih->group, vid);
 		break;
 	case IGMPV3_HOST_MEMBERSHIP_REPORT:
-		err = br_ip4_multicast_igmp3_report(br, port, skb2, vid);
+		err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid);
 		break;
 	case IGMP_HOST_MEMBERSHIP_QUERY:
-		err = br_ip4_multicast_query(br, port, skb2, vid);
+		err = br_ip4_multicast_query(br, port, skb_trimmed, vid);
 		break;
 	case IGMP_HOST_LEAVE_MESSAGE:
 		br_ip4_multicast_leave_group(br, port, ih->group, vid);
 		break;
 	}
 
-out:
-	__skb_push(skb2, offset);
-err_out:
-	if (skb2 != skb)
-		kfree_skb(skb2);
+	if (skb_trimmed)
+		kfree_skb(skb_trimmed);
+
 	return err;
 }
 
@@ -1649,126 +1575,42 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
 				 struct sk_buff *skb,
 				 u16 vid)
 {
-	struct sk_buff *skb2;
-	const struct ipv6hdr *ip6h;
-	u8 icmp6_type;
-	u8 nexthdr;
-	__be16 frag_off;
-	unsigned int len;
-	int offset;
+	struct sk_buff *skb_trimmed = NULL;
+	struct mld_msg *mld;
 	int err;
 
-	if (!pskb_may_pull(skb, sizeof(*ip6h)))
-		return -EINVAL;
+	err = ipv6_mc_check_mld(skb, &skb_trimmed);
 
-	ip6h = ipv6_hdr(skb);
-
-	/*
-	 * We're interested in MLD messages only.
-	 *  - Version is 6
-	 *  - MLD has always Router Alert hop-by-hop option
-	 *  - But we do not support jumbrograms.
-	 */
-	if (ip6h->version != 6)
-		return 0;
-
-	/* Prevent flooding this packet if there is no listener present */
-	if (!ipv6_addr_is_ll_all_nodes(&ip6h->daddr))
-		BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
-
-	if (ip6h->nexthdr != IPPROTO_HOPOPTS ||
-	    ip6h->payload_len == 0)
-		return 0;
-
-	len = ntohs(ip6h->payload_len) + sizeof(*ip6h);
-	if (skb->len < len)
-		return -EINVAL;
-
-	nexthdr = ip6h->nexthdr;
-	offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr, &frag_off);
-
-	if (offset < 0 || nexthdr != IPPROTO_ICMPV6)
+	if (err == -ENOMSG) {
+		if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr))
+			BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
 		return 0;
-
-	/* Okay, we found ICMPv6 header */
-	skb2 = skb_clone(skb, GFP_ATOMIC);
-	if (!skb2)
-		return -ENOMEM;
-
-	err = -EINVAL;
-	if (!pskb_may_pull(skb2, offset + sizeof(struct icmp6hdr)))
-		goto out;
-
-	len -= offset - skb_network_offset(skb2);
-
-	__skb_pull(skb2, offset);
-	skb_reset_transport_header(skb2);
-	skb_postpull_rcsum(skb2, skb_network_header(skb2),
-			   skb_network_header_len(skb2));
-
-	icmp6_type = icmp6_hdr(skb2)->icmp6_type;
-
-	switch (icmp6_type) {
-	case ICMPV6_MGM_QUERY:
-	case ICMPV6_MGM_REPORT:
-	case ICMPV6_MGM_REDUCTION:
-	case ICMPV6_MLD2_REPORT:
-		break;
-	default:
-		err = 0;
-		goto out;
-	}
-
-	/* Okay, we found MLD message. Check further. */
-	if (skb2->len > len) {
-		err = pskb_trim_rcsum(skb2, len);
-		if (err)
-			goto out;
-		err = -EINVAL;
+	} else if (err < 0) {
+		return err;
 	}
 
-	ip6h = ipv6_hdr(skb2);
-
-	if (skb_checksum_validate(skb2, IPPROTO_ICMPV6, ip6_compute_pseudo))
-		goto out;
-
-	err = 0;
-
 	BR_INPUT_SKB_CB(skb)->igmp = 1;
+	mld = (struct mld_msg *)skb_transport_header(skb);
 
-	switch (icmp6_type) {
+	switch (mld->mld_type) {
 	case ICMPV6_MGM_REPORT:
-	    {
-		struct mld_msg *mld;
-		if (!pskb_may_pull(skb2, sizeof(*mld))) {
-			err = -EINVAL;
-			goto out;
-		}
-		mld = (struct mld_msg *)skb_transport_header(skb2);
 		BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
 		err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid);
 		break;
-	    }
 	case ICMPV6_MLD2_REPORT:
-		err = br_ip6_multicast_mld2_report(br, port, skb2, vid);
+		err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid);
 		break;
 	case ICMPV6_MGM_QUERY:
-		err = br_ip6_multicast_query(br, port, skb2, vid);
+		err = br_ip6_multicast_query(br, port, skb_trimmed, vid);
 		break;
 	case ICMPV6_MGM_REDUCTION:
-	    {
-		struct mld_msg *mld;
-		if (!pskb_may_pull(skb2, sizeof(*mld))) {
-			err = -EINVAL;
-			goto out;
-		}
-		mld = (struct mld_msg *)skb_transport_header(skb2);
 		br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid);
-	    }
+		break;
 	}
 
-out:
-	kfree_skb(skb2);
+	if (skb_trimmed)
+		kfree_skb(skb_trimmed);
+
 	return err;
 }
 #endif
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3cfff2a3d651..1e4278a4dd7e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4030,6 +4030,93 @@ int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
 }
 EXPORT_SYMBOL(skb_checksum_setup);
 
+/**
+ * skb_checksum_maybe_trim - maybe trims the given skb
+ * @skb: the skb to check
+ * @transport_len: the data length beyond the network header
+ *
+ * Checks whether the given skb has data beyond the given transport length.
+ * If so, returns a cloned skb trimmed to this transport length.
+ * Otherwise returns the provided skb. Returns NULL in error cases
+ * (e.g. transport_len exceeds skb length or out-of-memory).
+ *
+ * Caller needs to set the skb transport header and release the returned skb.
+ * Provided skb is consumed.
+ */
+static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
+					       unsigned int transport_len)
+{
+	struct sk_buff *skb_chk;
+	unsigned int len = skb_transport_offset(skb) + transport_len;
+	int ret;
+
+	if (skb->len < len) {
+		kfree_skb(skb);
+		return NULL;
+	} else if (skb->len == len) {
+		return skb;
+	}
+
+	skb_chk = skb_clone(skb, GFP_ATOMIC);
+	kfree_skb(skb);
+
+	if (!skb_chk)
+		return NULL;
+
+	ret = pskb_trim_rcsum(skb_chk, len);
+	if (ret) {
+		kfree_skb(skb_chk);
+		return NULL;
+	}
+
+	return skb_chk;
+}
+
+/**
+ * skb_checksum_trimmed - validate checksum of an skb
+ * @skb: the skb to check
+ * @transport_len: the data length beyond the network header
+ * @skb_chkf: checksum function to use
+ *
+ * Applies the given checksum function skb_chkf to the provided skb.
+ * Returns a checked and maybe trimmed skb. Returns NULL on error.
+ *
+ * If the skb has data beyond the given transport length, then a
+ * trimmed & cloned skb is checked and returned.
+ *
+ * Caller needs to set the skb transport header and release the returned skb.
+ * Provided skb is consumed.
+ */
+struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
+				     unsigned int transport_len,
+				     __sum16(*skb_chkf)(struct sk_buff *skb))
+{
+	struct sk_buff *skb_chk;
+	unsigned int offset = skb_transport_offset(skb);
+	int ret;
+
+	skb_chk = skb_checksum_maybe_trim(skb, transport_len);
+	if (!skb_chk)
+		return NULL;
+
+	if (!pskb_may_pull(skb_chk, offset)) {
+		kfree_skb(skb_chk);
+		return NULL;
+	}
+
+	__skb_pull(skb_chk, offset);
+	ret = skb_chkf(skb_chk);
+	__skb_push(skb_chk, offset);
+
+	if (ret) {
+		kfree_skb(skb_chk);
+		return NULL;
+	}
+
+	return skb_chk;
+}
+EXPORT_SYMBOL(skb_checksum_trimmed);
+
 void __skb_warn_lro_forwarding(const struct sk_buff *skb)
 {
 	net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index a3a697f5ffba..651cdf648ec4 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1339,6 +1339,168 @@ out:
 }
 EXPORT_SYMBOL(ip_mc_inc_group);
 
+static int ip_mc_check_iphdr(struct sk_buff *skb)
+{
+	const struct iphdr *iph;
+	unsigned int len;
+	unsigned int offset = skb_network_offset(skb) + sizeof(*iph);
+
+	if (!pskb_may_pull(skb, offset))
+		return -EINVAL;
+
+	iph = ip_hdr(skb);
+
+	if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph))
+		return -EINVAL;
+
+	offset += ip_hdrlen(skb) - sizeof(*iph);
+
+	if (!pskb_may_pull(skb, offset))
+		return -EINVAL;
+
+	iph = ip_hdr(skb);
+
+	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+		return -EINVAL;
+
+	len = skb_network_offset(skb) + ntohs(iph->tot_len);
+	if (skb->len < len || len < offset)
+		return -EINVAL;
+
+	skb_set_transport_header(skb, offset);
+
+	return 0;
+}
+
+static int ip_mc_check_igmp_reportv3(struct sk_buff *skb)
+{
+	unsigned int len = skb_transport_offset(skb);
+
+	len += sizeof(struct igmpv3_report);
+
+	return pskb_may_pull(skb, len) ? 0 : -EINVAL;
+}
+
+static int ip_mc_check_igmp_query(struct sk_buff *skb)
+{
+	unsigned int len = skb_transport_offset(skb);
+
+	len += sizeof(struct igmphdr);
+	if (skb->len < len)
+		return -EINVAL;
+
+	/* IGMPv{1,2}? */
+	if (skb->len != len) {
+		/* or IGMPv3? */
+		len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr);
+		if (skb->len < len || !pskb_may_pull(skb, len))
+			return -EINVAL;
+	}
+
+	/* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer
+	 * all-systems destination addresses (224.0.0.1) for general queries
+	 */
+	if (!igmp_hdr(skb)->group &&
+	    ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ip_mc_check_igmp_msg(struct sk_buff *skb)
+{
+	switch (igmp_hdr(skb)->type) {
+	case IGMP_HOST_LEAVE_MESSAGE:
+	case IGMP_HOST_MEMBERSHIP_REPORT:
+	case IGMPV2_HOST_MEMBERSHIP_REPORT:
+		/* fall through */
+		return 0;
+	case IGMPV3_HOST_MEMBERSHIP_REPORT:
+		return ip_mc_check_igmp_reportv3(skb);
+	case IGMP_HOST_MEMBERSHIP_QUERY:
+		return ip_mc_check_igmp_query(skb);
+	default:
+		return -ENOMSG;
+	}
+}
+
+static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
+{
+	return skb_checksum_simple_validate(skb);
+}
+
+static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
+
+{
+	struct sk_buff *skb_chk;
+	unsigned int transport_len;
+	unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr);
+	int ret;
+
+	transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb);
+
+	skb_get(skb);
+	skb_chk = skb_checksum_trimmed(skb, transport_len,
+				       ip_mc_validate_checksum);
+	if (!skb_chk)
+		return -EINVAL;
+
+	if (!pskb_may_pull(skb_chk, len)) {
+		kfree_skb(skb_chk);
+		return -EINVAL;
+	}
+
+	ret = ip_mc_check_igmp_msg(skb_chk);
+	if (ret) {
+		kfree_skb(skb_chk);
+		return ret;
+	}
+
+	if (skb_trimmed)
+		*skb_trimmed = skb_chk;
+	else
+		kfree_skb(skb_chk);
+
+	return 0;
+}
+
+/**
+ * ip_mc_check_igmp - checks whether this is a sane IGMP packet
+ * @skb: the skb to validate
+ * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional)
+ *
+ * Checks whether an IPv4 packet is a valid IGMP packet. If so sets
+ * skb network and transport headers accordingly and returns zero.
+ *
+ * -EINVAL: A broken packet was detected, i.e. it violates some internet
+ *  standard
+ * -ENOMSG: IP header validation succeeded but it is not an IGMP packet.
+ * -ENOMEM: A memory allocation failure happened.
+ *
+ * Optionally, an skb pointer might be provided via skb_trimmed (or set it
+ * to NULL): After parsing an IGMP packet successfully it will point to
+ * an skb which has its tail aligned to the IP packet end. This might
+ * either be the originally provided skb or a trimmed, cloned version if
+ * the skb frame had data beyond the IP packet. A cloned skb allows us
+ * to leave the original skb and its full frame unchanged (which might be
+ * desirable for layer 2 frame jugglers).
+ *
+ * The caller needs to release a reference count from any returned skb_trimmed.
+ */
+int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
+{
+	int ret = ip_mc_check_iphdr(skb);
+
+	if (ret < 0)
+		return ret;
+
+	if (ip_hdr(skb)->protocol != IPPROTO_IGMP)
+		return -ENOMSG;
+
+	return __ip_mc_check_igmp(skb, skb_trimmed);
+}
+EXPORT_SYMBOL(ip_mc_check_igmp);
+
 /*
  *	Resend IGMP JOIN report; used by netdev notifier.
  */
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 2e8c06108ab9..0f3f1999719a 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -48,4 +48,5 @@ obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
 
 ifneq ($(CONFIG_IPV6),)
 obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o
+obj-y += mcast_snoop.o
 endif
diff --git a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c
new file mode 100644
index 000000000000..1a2cbc13a7d3
--- /dev/null
+++ b/net/ipv6/mcast_snoop.c
@@ -0,0 +1,213 @@
+/* Copyright (C) 2010: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
+ * Copyright (C) 2015: Linus Lüssing <linus.luessing@c0d3.blue>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Based on the MLD support added to br_multicast.c by YOSHIFUJI Hideaki.
+ */
+
+#include <linux/skbuff.h>
+#include <net/ipv6.h>
+#include <net/mld.h>
+#include <net/addrconf.h>
+#include <net/ip6_checksum.h>
+
+static int ipv6_mc_check_ip6hdr(struct sk_buff *skb)
+{
+	const struct ipv6hdr *ip6h;
+	unsigned int len;
+	unsigned int offset = skb_network_offset(skb) + sizeof(*ip6h);
+
+	if (!pskb_may_pull(skb, offset))
+		return -EINVAL;
+
+	ip6h = ipv6_hdr(skb);
+
+	if (ip6h->version != 6)
+		return -EINVAL;
+
+	len = offset + ntohs(ip6h->payload_len);
+	if (skb->len < len || len <= offset)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ipv6_mc_check_exthdrs(struct sk_buff *skb)
+{
+	const struct ipv6hdr *ip6h;
+	unsigned int offset;
+	u8 nexthdr;
+	__be16 frag_off;
+
+	ip6h = ipv6_hdr(skb);
+
+	if (ip6h->nexthdr != IPPROTO_HOPOPTS)
+		return -ENOMSG;
+
+	nexthdr = ip6h->nexthdr;
+	offset = skb_network_offset(skb) + sizeof(*ip6h);
+	offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);
+
+	if (offset < 0)
+		return -EINVAL;
+
+	if (nexthdr != IPPROTO_ICMPV6)
+		return -ENOMSG;
+
+	skb_set_transport_header(skb, offset);
+
+	return 0;
+}
+
+static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb)
+{
+	unsigned int len = skb_transport_offset(skb);
+
+	len += sizeof(struct mld2_report);
+
+	return pskb_may_pull(skb, len) ? 0 : -EINVAL;
+}
+
+static int ipv6_mc_check_mld_query(struct sk_buff *skb)
+{
+	struct mld_msg *mld;
+	unsigned int len = skb_transport_offset(skb);
+
+	/* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */
+	if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL))
+		return -EINVAL;
+
+	len += sizeof(struct mld_msg);
+	if (skb->len < len)
+		return -EINVAL;
+
+	/* MLDv1? */
+	if (skb->len != len) {
+		/* or MLDv2? */
+		len += sizeof(struct mld2_query) - sizeof(struct mld_msg);
+		if (skb->len < len || !pskb_may_pull(skb, len))
+			return -EINVAL;
+	}
+
+	mld = (struct mld_msg *)skb_transport_header(skb);
+
+	/* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer
+	 * all-nodes destination address (ff02::1) for general queries
+	 */
+	if (ipv6_addr_any(&mld->mld_mca) &&
+	    !ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ipv6_mc_check_mld_msg(struct sk_buff *skb)
+{
+	struct mld_msg *mld = (struct mld_msg *)skb_transport_header(skb);
+
+	switch (mld->mld_type) {
+	case ICMPV6_MGM_REDUCTION:
+	case ICMPV6_MGM_REPORT:
+		/* fall through */
+		return 0;
+	case ICMPV6_MLD2_REPORT:
+		return ipv6_mc_check_mld_reportv2(skb);
+	case ICMPV6_MGM_QUERY:
+		return ipv6_mc_check_mld_query(skb);
+	default:
+		return -ENOMSG;
+	}
+}
+
+static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb)
+{
+	return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo);
+}
+
+static int __ipv6_mc_check_mld(struct sk_buff *skb,
+			       struct sk_buff **skb_trimmed)
+
+{
+	struct sk_buff *skb_chk = NULL;
+	unsigned int transport_len;
+	unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg);
+	int ret;
+
+	transport_len = ntohs(ipv6_hdr(skb)->payload_len);
+	transport_len -= skb_transport_offset(skb) - sizeof(struct ipv6hdr);
+
+	skb_get(skb);
+	skb_chk = skb_checksum_trimmed(skb, transport_len,
+				       ipv6_mc_validate_checksum);
+	if (!skb_chk)
+		return -EINVAL;
+
+	if (!pskb_may_pull(skb_chk, len)) {
+		kfree_skb(skb_chk);
+		return -EINVAL;
+	}
+
+	ret = ipv6_mc_check_mld_msg(skb_chk);
+	if (ret) {
+		kfree_skb(skb_chk);
+		return ret;
+	}
+
+	if (skb_trimmed)
+		*skb_trimmed = skb_chk;
+	else
+		kfree_skb(skb_chk);
+
+	return 0;
+}
+
+/**
+ * ipv6_mc_check_mld - checks whether this is a sane MLD packet
+ * @skb: the skb to validate
+ * @skb_trimmed: to store an skb pointer trimmed to IPv6 packet tail (optional)
+ *
+ * Checks whether an IPv6 packet is a valid MLD packet. If so sets
+ * skb network and transport headers accordingly and returns zero.
+ *
+ * -EINVAL: A broken packet was detected, i.e. it violates some internet
+ *  standard
+ * -ENOMSG: IP header validation succeeded but it is not an MLD packet.
+ * -ENOMEM: A memory allocation failure happened.
+ *
+ * Optionally, an skb pointer might be provided via skb_trimmed (or set it
+ * to NULL): After parsing an MLD packet successfully it will point to
+ * an skb which has its tail aligned to the IP packet end. This might
+ * either be the originally provided skb or a trimmed, cloned version if
+ * the skb frame had data beyond the IP packet. A cloned skb allows us
+ * to leave the original skb and its full frame unchanged (which might be
+ * desirable for layer 2 frame jugglers).
+ *
+ * The caller needs to release a reference count from any returned skb_trimmed.
+ */
+int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed)
+{
+	int ret;
+
+	ret = ipv6_mc_check_ip6hdr(skb);
+	if (ret < 0)
+		return ret;
+
+	ret = ipv6_mc_check_exthdrs(skb);
+	if (ret < 0)
+		return ret;
+
+	return __ipv6_mc_check_mld(skb, skb_trimmed);
+}
+EXPORT_SYMBOL(ipv6_mc_check_mld);
-- 
cgit v1.2.3


From cd8ae85299d54155702a56811b2e035e63064d3d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 3 May 2015 21:34:46 -0700
Subject: tcp: provide SYN headers for passive connections

This patch allows a server application to get the TCP SYN headers for
its passive connections.  This is useful if the server is doing
fingerprinting of clients based on SYN packet contents.

Two socket options are added: TCP_SAVE_SYN and TCP_SAVED_SYN.

The first is used on a socket to enable saving the SYN headers
for child connections. This can be set before or after the listen()
call.

The latter is used to retrieve the SYN headers for passive connections,
if the parent listener has enabled TCP_SAVE_SYN.

TCP_SAVED_SYN is read once, it frees the saved SYN headers.

The data returned in TCP_SAVED_SYN are network (IPv4/IPv6) and TCP
headers.

Original patch was written by Tom Herbert, I changed it to not hold
a full skb (and associated dst and conntracking reference).

We have used such patch for about 3 years at Google.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Tested-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h        |  8 ++++++++
 include/net/request_sock.h |  4 +++-
 include/uapi/linux/tcp.h   |  2 ++
 net/ipv4/tcp.c             | 35 +++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_input.c       | 18 ++++++++++++++++++
 net/ipv4/tcp_ipv4.c        |  1 +
 net/ipv4/tcp_minisocks.c   |  3 +++
 7 files changed, 70 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 3b2911502a8c..e6fb5df22db1 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -199,6 +199,7 @@ struct tcp_sock {
 		syn_fastopen:1,	/* SYN includes Fast Open option */
 		syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
 		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
+		save_syn:1,	/* Save headers of SYN packet */
 		is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
 	u32	tlp_high_seq;	/* snd_nxt at the time of TLP retransmit. */
 
@@ -326,6 +327,7 @@ struct tcp_sock {
 	 * socket. Used to retransmit SYNACKs etc.
 	 */
 	struct request_sock *fastopen_rsk;
+	u32	*saved_syn;
 };
 
 enum tsq_flags {
@@ -393,4 +395,10 @@ static inline int fastopen_init_queue(struct sock *sk, int backlog)
 	return 0;
 }
 
+static inline void tcp_saved_syn_free(struct tcp_sock *tp)
+{
+	kfree(tp->saved_syn);
+	tp->saved_syn = NULL;
+}
+
 #endif	/* _LINUX_TCP_H */
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 9f4265ce8892..87935cad2f7b 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -64,6 +64,7 @@ struct request_sock {
 	struct timer_list		rsk_timer;
 	const struct request_sock_ops	*rsk_ops;
 	struct sock			*sk;
+	u32				*saved_syn;
 	u32				secid;
 	u32				peer_secid;
 };
@@ -77,7 +78,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener)
 		req->rsk_ops = ops;
 		sock_hold(sk_listener);
 		req->rsk_listener = sk_listener;
-
+		req->saved_syn = NULL;
 		/* Following is temporary. It is coupled with debugging
 		 * helpers in reqsk_put() & reqsk_free()
 		 */
@@ -104,6 +105,7 @@ static inline void reqsk_free(struct request_sock *req)
 	req->rsk_ops->destructor(req);
 	if (req->rsk_listener)
 		sock_put(req->rsk_listener);
+	kfree(req->saved_syn);
 	kmem_cache_free(req->rsk_ops->slab, req);
 }
 
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index faa72f4fa547..51ebedba577f 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -113,6 +113,8 @@ enum {
 #define TCP_TIMESTAMP		24
 #define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */
 #define TCP_CC_INFO		26	/* Get Congestion Control (optional) info */
+#define TCP_SAVE_SYN		27	/* Record SYN headers for new connections */
+#define TCP_SAVED_SYN		28	/* Get SYN headers recorded for connection */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 46efa03d2b11..ecccfdc50d76 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2482,6 +2482,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			icsk->icsk_syn_retries = val;
 		break;
 
+	case TCP_SAVE_SYN:
+		if (val < 0 || val > 1)
+			err = -EINVAL;
+		else
+			tp->save_syn = val;
+		break;
+
 	case TCP_LINGER2:
 		if (val < 0)
 			tp->linger2 = -1;
@@ -2818,6 +2825,34 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_NOTSENT_LOWAT:
 		val = tp->notsent_lowat;
 		break;
+	case TCP_SAVE_SYN:
+		val = tp->save_syn;
+		break;
+	case TCP_SAVED_SYN: {
+		if (get_user(len, optlen))
+			return -EFAULT;
+
+		lock_sock(sk);
+		if (tp->saved_syn) {
+			len = min_t(unsigned int, tp->saved_syn[0], len);
+			if (put_user(len, optlen)) {
+				release_sock(sk);
+				return -EFAULT;
+			}
+			if (copy_to_user(optval, tp->saved_syn + 1, len)) {
+				release_sock(sk);
+				return -EFAULT;
+			}
+			tcp_saved_syn_free(tp);
+			release_sock(sk);
+		} else {
+			release_sock(sk);
+			len = 0;
+			if (put_user(len, optlen))
+				return -EFAULT;
+		}
+		return 0;
+	}
 	default:
 		return -ENOPROTOOPT;
 	}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 09bdc4abfcbb..df2ca615cd0c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6060,6 +6060,23 @@ static bool tcp_syn_flood_action(struct sock *sk,
 	return want_cookie;
 }
 
+static void tcp_reqsk_record_syn(const struct sock *sk,
+				 struct request_sock *req,
+				 const struct sk_buff *skb)
+{
+	if (tcp_sk(sk)->save_syn) {
+		u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
+		u32 *copy;
+
+		copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
+		if (copy) {
+			copy[0] = len;
+			memcpy(&copy[1], skb_network_header(skb), len);
+			req->saved_syn = copy;
+		}
+	}
+}
+
 int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		     const struct tcp_request_sock_ops *af_ops,
 		     struct sock *sk, struct sk_buff *skb)
@@ -6192,6 +6209,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		tcp_rsk(req)->tfo_listener = false;
 		af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
 	}
+	tcp_reqsk_record_syn(sk, req, skb);
 
 	return 0;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fc1c658ec6c1..91cb4768a860 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1802,6 +1802,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 
 	/* If socket is aborted during connect operation */
 	tcp_free_fastopen_req(tp);
+	tcp_saved_syn_free(tp);
 
 	sk_sockets_allocated_dec(sk);
 	sock_release_memcg(sk);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index e5d7649136fc..ebe2ab2596ed 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -536,6 +536,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->fastopen_rsk = NULL;
 		newtp->syn_data_acked = 0;
 
+		newtp->saved_syn = req->saved_syn;
+		req->saved_syn = NULL;
+
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
 	}
 	return newsk;
-- 
cgit v1.2.3


From 21c8fe9915276d923f8c1e43434fd6d37a3b9aef Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 May 2015 14:26:24 -0700
Subject: tcp: adjust window probe timers to safer values

With the advent of small rto timers in datacenter TCP,
(ip route ... rto_min x), the following can happen :

1) Qdisc is full, transmit fails.

   TCP sets a timer based on icsk_rto to retry the transmit, without
   exponential backoff.
   With low icsk_rto, and lot of sockets, all cpus are servicing timer
   interrupts like crazy.
   Intent of the code was to retry with a timer between 200 (TCP_RTO_MIN)
   and 500ms (TCP_RESOURCE_PROBE_INTERVAL)

2) Receivers can send zero windows if they don't drain their receive queue.

   TCP sends zero window probes, based on icsk_rto current value, with
   exponential backoff.
   With /proc/sys/net/ipv4/tcp_retries2 being 15 (or even smaller in
   some cases), sender can abort in less than one or two minutes !
   If receiver stops the sender, it obviously doesn't care of very tight
   rto. Probability of dropping the ACK reopening the window is not
   worth the risk.

Lets change the base timer to be at least 200ms (TCP_RTO_MIN) for these
events (but not normal RTO based retransmits)

A followup patch adds a new SNMP counter, as it would have helped a lot
diagnosing this issue.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 27 ++++++++++++++++++++++-----
 net/ipv4/tcp_input.c  |  2 +-
 net/ipv4/tcp_output.c |  2 +-
 3 files changed, 24 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6d204f3f9df8..7a2248a35b13 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1043,14 +1043,31 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk)
 	return tp->is_cwnd_limited;
 }
 
-static inline void tcp_check_probe_timer(struct sock *sk)
+/* Something is really bad, we could not queue an additional packet,
+ * because qdisc is full or receiver sent a 0 window.
+ * We do not want to add fuel to the fire, or abort too early,
+ * so make sure the timer we arm now is at least 200ms in the future,
+ * regardless of current icsk_rto value (as it could be ~2ms)
+ */
+static inline unsigned long tcp_probe0_base(const struct sock *sk)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
-	const struct inet_connection_sock *icsk = inet_csk(sk);
+	return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN);
+}
 
-	if (!tp->packets_out && !icsk->icsk_pending)
+/* Variant of inet_csk_rto_backoff() used for zero window probes */
+static inline unsigned long tcp_probe0_when(const struct sock *sk,
+					    unsigned long max_when)
+{
+	u64 when = (u64)tcp_probe0_base(sk) << inet_csk(sk)->icsk_backoff;
+
+	return (unsigned long)min_t(u64, when, max_when);
+}
+
+static inline void tcp_check_probe_timer(struct sock *sk)
+{
+	if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending)
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-					  icsk->icsk_rto, TCP_RTO_MAX);
+					  tcp_probe0_base(sk), TCP_RTO_MAX);
 }
 
 static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index df2ca615cd0c..cf8b20ff6658 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3233,7 +3233,7 @@ static void tcp_ack_probe(struct sock *sk)
 		 * This function is not for random using!
 		 */
 	} else {
-		unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+		unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
 
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
 					  when, TCP_RTO_MAX);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a369e8a70b2c..b76c719e1979 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3490,7 +3490,7 @@ void tcp_send_probe0(struct sock *sk)
 		probe_max = TCP_RESOURCE_PROBE_INTERVAL;
 	}
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-				  inet_csk_rto_backoff(icsk, probe_max),
+				  tcp_probe0_when(sk, probe_max),
 				  TCP_RTO_MAX);
 }
 
-- 
cgit v1.2.3


From e520af48c7e5acae5f17f82a79ba7ab7cf156f3b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 May 2015 14:26:25 -0700
Subject: tcp: add TCPWinProbe and TCPKeepAlive SNMP counters

Diagnosing problems related to Window Probes has been hard because
we lack a counter.

TCPWinProbe counts the number of ACK packets a sender has to send
at regular intervals to make sure a reverse ACK packet opening back
a window had not been lost.

TCPKeepAlive counts the number of ACK packets sent to keep TCP
flows alive (SO_KEEPALIVE)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h         |  2 +-
 include/uapi/linux/snmp.h |  2 ++
 net/ipv4/proc.c           |  2 ++
 net/ipv4/tcp_output.c     | 13 +++++++------
 net/ipv4/tcp_timer.c      |  2 +-
 5 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7a2248a35b13..b8ea12880fd9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -527,7 +527,7 @@ int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
 
 void tcp_send_probe0(struct sock *);
 void tcp_send_partial(struct sock *);
-int tcp_write_wakeup(struct sock *);
+int tcp_write_wakeup(struct sock *, int mib);
 void tcp_send_fin(struct sock *sk);
 void tcp_send_active_reset(struct sock *sk, gfp_t priority);
 int tcp_send_synack(struct sock *);
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 6a6fb747c78d..eee8968407f0 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -276,6 +276,8 @@ enum
 	LINUX_MIB_TCPACKSKIPPEDFINWAIT2,	/* TCPACKSkippedFinWait2 */
 	LINUX_MIB_TCPACKSKIPPEDTIMEWAIT,	/* TCPACKSkippedTimeWait */
 	LINUX_MIB_TCPACKSKIPPEDCHALLENGE,	/* TCPACKSkippedChallenge */
+	LINUX_MIB_TCPWINPROBE,			/* TCPWinProbe */
+	LINUX_MIB_TCPKEEPALIVE,			/* TCPKeepAlive */
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index e1f3b911dd1e..da5d483e236a 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -298,6 +298,8 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2),
 	SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT),
 	SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
+	SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE),
+	SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b76c719e1979..7386d32cd670 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3382,7 +3382,7 @@ EXPORT_SYMBOL_GPL(tcp_send_ack);
  * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
  * out-of-date with SND.UNA-1 to probe window.
  */
-static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
+static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
@@ -3400,6 +3400,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
 	 */
 	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
 	skb_mstamp_get(&skb->skb_mstamp);
+	NET_INC_STATS_BH(sock_net(sk), mib);
 	return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
 }
 
@@ -3407,12 +3408,12 @@ void tcp_send_window_probe(struct sock *sk)
 {
 	if (sk->sk_state == TCP_ESTABLISHED) {
 		tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
-		tcp_xmit_probe_skb(sk, 0);
+		tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
 	}
 }
 
 /* Initiate keepalive or window probe from timer. */
-int tcp_write_wakeup(struct sock *sk)
+int tcp_write_wakeup(struct sock *sk, int mib)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
@@ -3449,8 +3450,8 @@ int tcp_write_wakeup(struct sock *sk)
 		return err;
 	} else {
 		if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
-			tcp_xmit_probe_skb(sk, 1);
-		return tcp_xmit_probe_skb(sk, 0);
+			tcp_xmit_probe_skb(sk, 1, mib);
+		return tcp_xmit_probe_skb(sk, 0, mib);
 	}
 }
 
@@ -3464,7 +3465,7 @@ void tcp_send_probe0(struct sock *sk)
 	unsigned long probe_max;
 	int err;
 
-	err = tcp_write_wakeup(sk);
+	err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
 
 	if (tp->packets_out || !tcp_send_head(sk)) {
 		/* Cancel probe timer, if it is not required. */
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 8c65dc147d8b..65bf670e8714 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -616,7 +616,7 @@ static void tcp_keepalive_timer (unsigned long data)
 			tcp_write_err(sk);
 			goto out;
 		}
-		if (tcp_write_wakeup(sk) <= 0) {
+		if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
 			icsk->icsk_probes_out++;
 			elapsed = keepalive_intvl_when(tp);
 		} else {
-- 
cgit v1.2.3


From 7a0877d4b438886b72be61632eaa774d13262f70 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 7 May 2015 11:02:49 +0200
Subject: netns: rename peernet2id() to peernet2id_alloc()

In a following commit, a new function will be introduced to only lookup for
a nsid (no allocation if the nsid doesn't exist). To avoid confusion, the
existing function is renamed.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c         | 2 +-
 include/net/net_namespace.h | 2 +-
 net/core/net_namespace.c    | 4 ++--
 net/core/rtnetlink.c        | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 3517ab0aa803..48341ae49012 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -336,7 +336,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 
 	if (!net_eq(dev_net(vxlan->dev), vxlan->net) &&
 	    nla_put_s32(skb, NDA_LINK_NETNSID,
-			peernet2id(dev_net(vxlan->dev), vxlan->net)))
+			peernet2id_alloc(dev_net(vxlan->dev), vxlan->net)))
 		goto nla_put_failure;
 
 	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index f733656404de..6d1e2eae32fb 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -271,7 +271,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
 #define __net_initconst	__initconst
 #endif
 
-int peernet2id(struct net *net, struct net *peer);
+int peernet2id_alloc(struct net *net, struct net *peer);
 struct net *get_net_ns_by_id(struct net *net, int id);
 
 struct pernet_operations {
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 37c68bb72db3..9c806ac569f9 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -202,13 +202,13 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc)
 /* This function returns the id of a peer netns. If no id is assigned, one will
  * be allocated and returned.
  */
-int peernet2id(struct net *net, struct net *peer)
+int peernet2id_alloc(struct net *net, struct net *peer)
 {
 	bool alloc = atomic_read(&peer->count) == 0 ? false : true;
 
 	return __peernet2id(net, peer, alloc);
 }
-EXPORT_SYMBOL(peernet2id);
+EXPORT_SYMBOL(peernet2id_alloc);
 
 struct net *get_net_ns_by_id(struct net *net, int id)
 {
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 666e0928ba40..83e08323fdcd 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1204,7 +1204,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 		struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
 
 		if (!net_eq(dev_net(dev), link_net)) {
-			int id = peernet2id(dev_net(dev), link_net);
+			int id = peernet2id_alloc(dev_net(dev), link_net);
 
 			if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
 				goto nla_put_failure;
-- 
cgit v1.2.3


From 59324cf35aba5336b611074028777838a963d03b Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 7 May 2015 11:02:53 +0200
Subject: netlink: allow to listen "all" netns

More accurately, listen all netns that have a nsid assigned into the netns
where the netlink socket is opened.
For this purpose, a netlink socket option is added:
NETLINK_LISTEN_ALL_NSID. When this option is set on a netlink socket, this
socket will receive netlink notifications from all netns that have a nsid
assigned into the netns where the socket has been opened. The nsid is sent
to userland via an anscillary data.

With this patch, a daemon needs only one socket to listen many netns. This
is useful when the number of netns is high.

Because 0 is a valid value for a nsid, the field nsid_is_set indicates if
the field nsid is valid or not. skb->cb is initialized to 0 on skb
allocation, thus we are sure that we will never send a nsid 0 by error to
the userland.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h      |  2 ++
 include/net/net_namespace.h  |  2 ++
 include/uapi/linux/netlink.h |  1 +
 net/core/net_namespace.c     | 10 ++++++++-
 net/netlink/af_netlink.c     | 52 +++++++++++++++++++++++++++++++++++++++-----
 5 files changed, 61 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 6835c1279df7..9120edb650a0 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -28,6 +28,8 @@ struct netlink_skb_parms {
 	__u32			dst_group;
 	__u32			flags;
 	struct sock		*sk;
+	bool			nsid_is_set;
+	int			nsid;
 };
 
 #define NETLINK_CB(skb)		(*(struct netlink_skb_parms*)&((skb)->cb))
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 6d1e2eae32fb..3f850acc844e 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -272,6 +272,8 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
 #endif
 
 int peernet2id_alloc(struct net *net, struct net *peer);
+int peernet2id(struct net *net, struct net *peer);
+bool peernet_has_id(struct net *net, struct net *peer);
 struct net *get_net_ns_by_id(struct net *net, int id);
 
 struct pernet_operations {
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index 1a85940f8ab7..3e34b7d702f8 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -108,6 +108,7 @@ struct nlmsgerr {
 #define NETLINK_NO_ENOBUFS	5
 #define NETLINK_RX_RING		6
 #define NETLINK_TX_RING		7
+#define NETLINK_LISTEN_ALL_NSID	8
 
 struct nl_pktinfo {
 	__u32	group;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index ae5008b097de..a665bf490c88 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -229,7 +229,7 @@ int peernet2id_alloc(struct net *net, struct net *peer)
 EXPORT_SYMBOL(peernet2id_alloc);
 
 /* This function returns, if assigned, the id of a peer netns. */
-static int peernet2id(struct net *net, struct net *peer)
+int peernet2id(struct net *net, struct net *peer)
 {
 	unsigned long flags;
 	int id;
@@ -240,6 +240,14 @@ static int peernet2id(struct net *net, struct net *peer)
 	return id;
 }
 
+/* This function returns true is the peer netns has an id assigned into the
+ * current netns.
+ */
+bool peernet_has_id(struct net *net, struct net *peer)
+{
+	return peernet2id(net, peer) >= 0;
+}
+
 struct net *get_net_ns_by_id(struct net *net, int id)
 {
 	unsigned long flags;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index bf7f56d7a9aa..a5fff75accf8 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -83,6 +83,7 @@ struct listeners {
 #define NETLINK_F_RECV_PKTINFO		0x2
 #define NETLINK_F_BROADCAST_SEND_ERROR	0x4
 #define NETLINK_F_RECV_NO_ENOBUFS	0x8
+#define NETLINK_F_LISTEN_ALL_NSID	0x10
 
 static inline int netlink_is_kernel(struct sock *sk)
 {
@@ -1932,8 +1933,17 @@ static void do_one_broadcast(struct sock *sk,
 	    !test_bit(p->group - 1, nlk->groups))
 		return;
 
-	if (!net_eq(sock_net(sk), p->net))
-		return;
+	if (!net_eq(sock_net(sk), p->net)) {
+		if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID))
+			return;
+
+		if (!peernet_has_id(sock_net(sk), p->net))
+			return;
+
+		if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns,
+				     CAP_NET_BROADCAST))
+			return;
+	}
 
 	if (p->failure) {
 		netlink_overrun(sk);
@@ -1959,13 +1969,22 @@ static void do_one_broadcast(struct sock *sk,
 		p->failure = 1;
 		if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
 			p->delivery_failure = 1;
-	} else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
+		goto out;
+	}
+	if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
 		kfree_skb(p->skb2);
 		p->skb2 = NULL;
-	} else if (sk_filter(sk, p->skb2)) {
+		goto out;
+	}
+	if (sk_filter(sk, p->skb2)) {
 		kfree_skb(p->skb2);
 		p->skb2 = NULL;
-	} else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) {
+		goto out;
+	}
+	NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net);
+	NETLINK_CB(p->skb2).nsid_is_set = true;
+	val = netlink_broadcast_deliver(sk, p->skb2);
+	if (val < 0) {
 		netlink_overrun(sk);
 		if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
 			p->delivery_failure = 1;
@@ -1974,6 +1993,7 @@ static void do_one_broadcast(struct sock *sk,
 		p->delivered = 1;
 		p->skb2 = NULL;
 	}
+out:
 	sock_put(sk);
 }
 
@@ -2202,6 +2222,16 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		break;
 	}
 #endif /* CONFIG_NETLINK_MMAP */
+	case NETLINK_LISTEN_ALL_NSID:
+		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
+			return -EPERM;
+
+		if (val)
+			nlk->flags |= NETLINK_F_LISTEN_ALL_NSID;
+		else
+			nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID;
+		err = 0;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 	}
@@ -2268,6 +2298,16 @@ static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
 	put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
 }
 
+static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg,
+					 struct sk_buff *skb)
+{
+	if (!NETLINK_CB(skb).nsid_is_set)
+		return;
+
+	put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int),
+		 &NETLINK_CB(skb).nsid);
+}
+
 static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 {
 	struct sock *sk = sock->sk;
@@ -2421,6 +2461,8 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 
 	if (nlk->flags & NETLINK_F_RECV_PKTINFO)
 		netlink_cmsg_recv_pktinfo(msg, skb);
+	if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID)
+		netlink_cmsg_listen_all_nsid(sk, msg, skb);
 
 	memset(&scm, 0, sizeof(scm));
 	scm.creds = *NETLINK_CREDS(skb);
-- 
cgit v1.2.3


From 80ba92fa1a92dea128283f69f55b02242e213650 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 8 May 2015 15:05:12 -0700
Subject: codel: add ce_threshold attribute

For DCTCP or similar ECN based deployments on fabrics with shallow
buffers, hosts are responsible for a good part of the buffering.

This patch adds an optional ce_threshold to codel & fq_codel qdiscs,
so that DCTCP can have feedback from queuing in the host.

A DCTCP enabled egress port simply have a queue occupancy threshold
above which ECT packets get CE mark.

In codel language this translates to a sojourn time, so that one doesn't
have to worry about bytes or bandwidth but delays.

This makes the host an active participant in the health of the whole
network.

This also helps experimenting DCTCP in a setup without DCTCP compliant
fabric.

On following example, ce_threshold is set to 1ms, and we can see from
'ldelay xxx us' that TCP is not trying to go around the 5ms codel
target.

Queue has more capacity to absorb inelastic bursts (say from UDP
traffic), as queues are maintained to an optimal level.

lpaa23:~# ./tc -s -d qd sh dev eth1
qdisc mq 1: dev eth1 root
 Sent 87910654696 bytes 58065331 pkt (dropped 0, overlimits 0 requeues 42961)
 backlog 3108242b 364p requeues 42961
qdisc codel 8063: dev eth1 parent 1:1 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
 Sent 7363778701 bytes 4863809 pkt (dropped 0, overlimits 0 requeues 5503)
 rate 2348Mbit 193919pps backlog 255866b 46p requeues 5503
  count 0 lastcount 0 ldelay 1.0ms drop_next 0us
  maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 72384
qdisc codel 8064: dev eth1 parent 1:2 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
 Sent 7636486190 bytes 5043942 pkt (dropped 0, overlimits 0 requeues 5186)
 rate 2319Mbit 191538pps backlog 207418b 64p requeues 5186
  count 0 lastcount 0 ldelay 694us drop_next 0us
  maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 69873
qdisc codel 8065: dev eth1 parent 1:3 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
 Sent 11569360142 bytes 7641602 pkt (dropped 0, overlimits 0 requeues 5554)
 rate 3041Mbit 251096pps backlog 210446b 59p requeues 5554
  count 0 lastcount 0 ldelay 889us drop_next 0us
  maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 37780
...

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Glenn Judd <glenn.judd@morganstanley.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/codel.h            | 12 +++++++++++-
 include/uapi/linux/pkt_sched.h |  4 ++++
 net/sched/sch_codel.c          | 15 +++++++++++++--
 net/sched/sch_fq_codel.c       | 15 ++++++++++++++-
 4 files changed, 42 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/codel.h b/include/net/codel.h
index aeee28081245..8c0f78f209e8 100644
--- a/include/net/codel.h
+++ b/include/net/codel.h
@@ -7,7 +7,7 @@
  *  Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
  *  Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
  *  Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
- *  Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
+ *  Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -119,11 +119,13 @@ static inline u32 codel_time_to_us(codel_time_t val)
 /**
  * struct codel_params - contains codel parameters
  * @target:	target queue size (in time units)
+ * @ce_threshold:  threshold for marking packets with ECN CE
  * @interval:	width of moving time window
  * @ecn:	is Explicit Congestion Notification enabled
  */
 struct codel_params {
 	codel_time_t	target;
+	codel_time_t	ce_threshold;
 	codel_time_t	interval;
 	bool		ecn;
 };
@@ -159,17 +161,22 @@ struct codel_vars {
  * @maxpacket:	largest packet we've seen so far
  * @drop_count:	temp count of dropped packets in dequeue()
  * ecn_mark:	number of packets we ECN marked instead of dropping
+ * ce_mark:	number of packets CE marked because sojourn time was above ce_threshold
  */
 struct codel_stats {
 	u32		maxpacket;
 	u32		drop_count;
 	u32		ecn_mark;
+	u32		ce_mark;
 };
 
+#define CODEL_DISABLED_THRESHOLD INT_MAX
+
 static void codel_params_init(struct codel_params *params)
 {
 	params->interval = MS2TIME(100);
 	params->target = MS2TIME(5);
+	params->ce_threshold = CODEL_DISABLED_THRESHOLD;
 	params->ecn = false;
 }
 
@@ -350,6 +357,9 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch,
 						    vars->rec_inv_sqrt);
 	}
 end:
+	if (skb && codel_time_after(vars->ldelay, params->ce_threshold) &&
+	    INET_ECN_set_ce(skb))
+		stats->ce_mark++;
 	return skb;
 }
 #endif
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 534b84710745..69d88b309cc7 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -679,6 +679,7 @@ enum {
 	TCA_CODEL_LIMIT,
 	TCA_CODEL_INTERVAL,
 	TCA_CODEL_ECN,
+	TCA_CODEL_CE_THRESHOLD,
 	__TCA_CODEL_MAX
 };
 
@@ -695,6 +696,7 @@ struct tc_codel_xstats {
 	__u32	drop_overlimit; /* number of time max qdisc packet limit was hit */
 	__u32	ecn_mark;  /* number of packets we ECN marked instead of dropped */
 	__u32	dropping;  /* are we in dropping state ? */
+	__u32	ce_mark;   /* number of CE marked packets because of ce_threshold */
 };
 
 /* FQ_CODEL */
@@ -707,6 +709,7 @@ enum {
 	TCA_FQ_CODEL_ECN,
 	TCA_FQ_CODEL_FLOWS,
 	TCA_FQ_CODEL_QUANTUM,
+	TCA_FQ_CODEL_CE_THRESHOLD,
 	__TCA_FQ_CODEL_MAX
 };
 
@@ -730,6 +733,7 @@ struct tc_fq_codel_qd_stats {
 				 */
 	__u32	new_flows_len;	/* count of flows in new list */
 	__u32	old_flows_len;	/* count of flows in old list */
+	__u32	ce_mark;	/* packets above ce_threshold */
 };
 
 struct tc_fq_codel_cl_stats {
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index de28f8e968e8..1474b6560fac 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -6,7 +6,7 @@
  *
  *  Implemented on linux by :
  *  Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
- *  Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
+ *  Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -109,6 +109,7 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
 	[TCA_CODEL_LIMIT]	= { .type = NLA_U32 },
 	[TCA_CODEL_INTERVAL]	= { .type = NLA_U32 },
 	[TCA_CODEL_ECN]		= { .type = NLA_U32 },
+	[TCA_CODEL_CE_THRESHOLD]= { .type = NLA_U32 },
 };
 
 static int codel_change(struct Qdisc *sch, struct nlattr *opt)
@@ -133,6 +134,12 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
 		q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT;
 	}
 
+	if (tb[TCA_CODEL_CE_THRESHOLD]) {
+		u64 val = nla_get_u32(tb[TCA_CODEL_CE_THRESHOLD]);
+
+		q->params.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT;
+	}
+
 	if (tb[TCA_CODEL_INTERVAL]) {
 		u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]);
 
@@ -201,7 +208,10 @@ static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_CODEL_ECN,
 			q->params.ecn))
 		goto nla_put_failure;
-
+	if (q->params.ce_threshold != CODEL_DISABLED_THRESHOLD &&
+	    nla_put_u32(skb, TCA_CODEL_CE_THRESHOLD,
+			codel_time_to_us(q->params.ce_threshold)))
+		goto nla_put_failure;
 	return nla_nest_end(skb, opts);
 
 nla_put_failure:
@@ -220,6 +230,7 @@ static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 		.ldelay		= codel_time_to_us(q->vars.ldelay),
 		.dropping	= q->vars.dropping,
 		.ecn_mark	= q->stats.ecn_mark,
+		.ce_mark	= q->stats.ce_mark,
 	};
 
 	if (q->vars.dropping) {
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index a6fc53d69513..778739786b32 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -6,7 +6,7 @@
  *	as published by the Free Software Foundation; either version
  *	2 of the License, or (at your option) any later version.
  *
- *  Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
+ *  Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com>
  */
 
 #include <linux/module.h>
@@ -292,6 +292,7 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
 	[TCA_FQ_CODEL_ECN]	= { .type = NLA_U32 },
 	[TCA_FQ_CODEL_FLOWS]	= { .type = NLA_U32 },
 	[TCA_FQ_CODEL_QUANTUM]	= { .type = NLA_U32 },
+	[TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 },
 };
 
 static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
@@ -322,6 +323,12 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
 		q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT;
 	}
 
+	if (tb[TCA_FQ_CODEL_CE_THRESHOLD]) {
+		u64 val = nla_get_u32(tb[TCA_FQ_CODEL_CE_THRESHOLD]);
+
+		q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT;
+	}
+
 	if (tb[TCA_FQ_CODEL_INTERVAL]) {
 		u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]);
 
@@ -441,6 +448,11 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
 			q->flows_cnt))
 		goto nla_put_failure;
 
+	if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD &&
+	    nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD,
+			codel_time_to_us(q->cparams.ce_threshold)))
+		goto nla_put_failure;
+
 	return nla_nest_end(skb, opts);
 
 nla_put_failure:
@@ -459,6 +471,7 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 	st.qdisc_stats.drop_overlimit = q->drop_overlimit;
 	st.qdisc_stats.ecn_mark = q->cstats.ecn_mark;
 	st.qdisc_stats.new_flow_count = q->new_flow_count;
+	st.qdisc_stats.ce_mark = q->cstats.ce_mark;
 
 	list_for_each(pos, &q->new_flows)
 		st.qdisc_stats.new_flows_len++;
-- 
cgit v1.2.3


From 11aa9c28b4209242a9de0a661a7b3405adb568a0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 8 May 2015 21:09:13 -0500
Subject: net: Pass kern from net_proto_family.create to sk_alloc

In preparation for changing how struct net is refcounted
on kernel sockets pass the knowledge that we are creating
a kernel socket from sock_create_kern through to sk_alloc.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 crypto/af_alg.c                |  4 ++--
 drivers/isdn/mISDN/socket.c    | 12 ++++++------
 drivers/net/macvtap.c          |  2 +-
 drivers/net/ppp/pppoe.c        |  4 ++--
 drivers/net/ppp/pppox.c        |  2 +-
 drivers/net/ppp/pptp.c         |  4 ++--
 drivers/net/tun.c              |  2 +-
 include/linux/if_pppox.h       |  2 +-
 include/net/af_vsock.h         |  2 +-
 include/net/llc_conn.h         |  2 +-
 include/net/sock.h             |  2 +-
 net/appletalk/ddp.c            |  2 +-
 net/atm/common.c               |  4 ++--
 net/atm/common.h               |  2 +-
 net/atm/pvc.c                  |  2 +-
 net/atm/svc.c                  |  2 +-
 net/ax25/af_ax25.c             |  4 ++--
 net/bluetooth/bnep/sock.c      |  2 +-
 net/bluetooth/cmtp/sock.c      |  2 +-
 net/bluetooth/hci_sock.c       |  2 +-
 net/bluetooth/hidp/sock.c      |  2 +-
 net/bluetooth/l2cap_sock.c     | 10 +++++-----
 net/bluetooth/rfcomm/sock.c    |  8 ++++----
 net/bluetooth/sco.c            |  8 ++++----
 net/caif/caif_socket.c         |  2 +-
 net/can/af_can.c               |  2 +-
 net/core/sock.c                |  3 ++-
 net/decnet/af_decnet.c         |  8 ++++----
 net/ieee802154/socket.c        |  2 +-
 net/ipv4/af_inet.c             |  2 +-
 net/ipv6/af_inet6.c            |  2 +-
 net/ipx/af_ipx.c               |  2 +-
 net/irda/af_irda.c             |  2 +-
 net/iucv/af_iucv.c             | 10 +++++-----
 net/key/af_key.c               |  2 +-
 net/l2tp/l2tp_ppp.c            |  4 ++--
 net/llc/af_llc.c               |  2 +-
 net/llc/llc_conn.c             |  6 +++---
 net/netlink/af_netlink.c       | 11 +++++------
 net/netrom/af_netrom.c         |  4 ++--
 net/nfc/af_nfc.c               |  2 +-
 net/nfc/llcp.h                 |  2 +-
 net/nfc/llcp_core.c            |  2 +-
 net/nfc/llcp_sock.c            |  8 ++++----
 net/nfc/nfc.h                  |  2 +-
 net/nfc/rawsock.c              |  4 ++--
 net/packet/af_packet.c         |  2 +-
 net/phonet/af_phonet.c         |  2 +-
 net/phonet/pep.c               |  2 +-
 net/rds/af_rds.c               |  2 +-
 net/rose/af_rose.c             |  4 ++--
 net/rxrpc/af_rxrpc.c           |  2 +-
 net/sctp/ipv6.c                |  2 +-
 net/sctp/protocol.c            |  2 +-
 net/tipc/socket.c              |  2 +-
 net/unix/af_unix.c             |  8 ++++----
 net/vmw_vsock/af_vsock.c       |  7 ++++---
 net/vmw_vsock/vmci_transport.c |  2 +-
 net/x25/af_x25.c               |  8 ++++----
 59 files changed, 109 insertions(+), 108 deletions(-)

(limited to 'include/net')

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index f22cc56fd1b3..5ad0d5354535 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -244,7 +244,7 @@ int af_alg_accept(struct sock *sk, struct socket *newsock)
 	if (!type)
 		goto unlock;
 
-	sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto);
+	sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto, 0);
 	err = -ENOMEM;
 	if (!sk2)
 		goto unlock;
@@ -324,7 +324,7 @@ static int alg_create(struct net *net, struct socket *sock, int protocol,
 		return -EPROTONOSUPPORT;
 
 	err = -ENOMEM;
-	sk = sk_alloc(net, PF_ALG, GFP_KERNEL, &alg_proto);
+	sk = sk_alloc(net, PF_ALG, GFP_KERNEL, &alg_proto, kern);
 	if (!sk)
 		goto out;
 
diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index 8dc7290089bb..0d29b5a6356d 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -601,14 +601,14 @@ static const struct proto_ops data_sock_ops = {
 };
 
 static int
-data_sock_create(struct net *net, struct socket *sock, int protocol)
+data_sock_create(struct net *net, struct socket *sock, int protocol, int kern)
 {
 	struct sock *sk;
 
 	if (sock->type != SOCK_DGRAM)
 		return -ESOCKTNOSUPPORT;
 
-	sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto);
+	sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
@@ -756,14 +756,14 @@ static const struct proto_ops base_sock_ops = {
 
 
 static int
-base_sock_create(struct net *net, struct socket *sock, int protocol)
+base_sock_create(struct net *net, struct socket *sock, int protocol, int kern)
 {
 	struct sock *sk;
 
 	if (sock->type != SOCK_RAW)
 		return -ESOCKTNOSUPPORT;
 
-	sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto);
+	sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
@@ -785,7 +785,7 @@ mISDN_sock_create(struct net *net, struct socket *sock, int proto, int kern)
 
 	switch (proto) {
 	case ISDN_P_BASE:
-		err = base_sock_create(net, sock, proto);
+		err = base_sock_create(net, sock, proto, kern);
 		break;
 	case ISDN_P_TE_S0:
 	case ISDN_P_NT_S0:
@@ -799,7 +799,7 @@ mISDN_sock_create(struct net *net, struct socket *sock, int proto, int kern)
 	case ISDN_P_B_L2DTMF:
 	case ISDN_P_B_L2DSP:
 	case ISDN_P_B_L2DSPHDLC:
-		err = data_sock_create(net, sock, proto);
+		err = data_sock_create(net, sock, proto, kern);
 		break;
 	default:
 		return err;
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 8c350c5d54ad..0398631a3c24 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -476,7 +476,7 @@ static int macvtap_open(struct inode *inode, struct file *file)
 
 	err = -ENOMEM;
 	q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
-					     &macvtap_proto);
+					     &macvtap_proto, 0);
 	if (!q)
 		goto out;
 
diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index aa1dd926623a..f86c5ab334aa 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -546,11 +546,11 @@ static struct proto pppoe_sk_proto __read_mostly = {
  * Initialize a new struct sock.
  *
  **********************************************************************/
-static int pppoe_create(struct net *net, struct socket *sock)
+static int pppoe_create(struct net *net, struct socket *sock, int kern)
 {
 	struct sock *sk;
 
-	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto);
+	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/drivers/net/ppp/pppox.c b/drivers/net/ppp/pppox.c
index 2940e9fe351b..0e1b30622477 100644
--- a/drivers/net/ppp/pppox.c
+++ b/drivers/net/ppp/pppox.c
@@ -118,7 +118,7 @@ static int pppox_create(struct net *net, struct socket *sock, int protocol,
 	    !try_module_get(pppox_protos[protocol]->owner))
 		goto out;
 
-	rc = pppox_protos[protocol]->create(net, sock);
+	rc = pppox_protos[protocol]->create(net, sock, kern);
 
 	module_put(pppox_protos[protocol]->owner);
 out:
diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index e3bfbd4d0136..14839bc0aaf5 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -561,14 +561,14 @@ static void pptp_sock_destruct(struct sock *sk)
 	skb_queue_purge(&sk->sk_receive_queue);
 }
 
-static int pptp_create(struct net *net, struct socket *sock)
+static int pptp_create(struct net *net, struct socket *sock, int kern)
 {
 	int error = -ENOMEM;
 	struct sock *sk;
 	struct pppox_sock *po;
 	struct pptp_opt *opt;
 
-	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pptp_sk_proto);
+	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pptp_sk_proto, kern);
 	if (!sk)
 		goto out;
 
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3262f3e2b8b2..1a1c4f7b3ec5 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2148,7 +2148,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	DBG1(KERN_INFO, "tunX: tun_chr_open\n");
 
 	tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
-					    &tun_proto);
+					    &tun_proto, 0);
 	if (!tfile)
 		return -ENOMEM;
 	RCU_INIT_POINTER(tfile->tun, NULL);
diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h
index 66a7d7600f43..b49cf923becc 100644
--- a/include/linux/if_pppox.h
+++ b/include/linux/if_pppox.h
@@ -74,7 +74,7 @@ static inline struct sock *sk_pppox(struct pppox_sock *po)
 struct module;
 
 struct pppox_proto {
-	int		(*create)(struct net *net, struct socket *sock);
+	int		(*create)(struct net *net, struct socket *sock, int kern);
 	int		(*ioctl)(struct socket *sock, unsigned int cmd,
 				 unsigned long arg);
 	struct module	*owner;
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 172632dd9930..db639a4c5ab8 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -74,7 +74,7 @@ void vsock_pending_work(struct work_struct *work);
 struct sock *__vsock_create(struct net *net,
 			    struct socket *sock,
 			    struct sock *parent,
-			    gfp_t priority, unsigned short type);
+			    gfp_t priority, unsigned short type, int kern);
 
 /**** TRANSPORT ****/
 
diff --git a/include/net/llc_conn.h b/include/net/llc_conn.h
index 0134681acc4c..fe994d2e5286 100644
--- a/include/net/llc_conn.h
+++ b/include/net/llc_conn.h
@@ -96,7 +96,7 @@ static __inline__ char llc_backlog_type(struct sk_buff *skb)
 }
 
 struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority,
-			  struct proto *prot);
+			  struct proto *prot, int kern);
 void llc_sk_free(struct sock *sk);
 
 void llc_sk_reset(struct sock *sk);
diff --git a/include/net/sock.h b/include/net/sock.h
index 3a4898ec8c67..d8dcf91732b0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1514,7 +1514,7 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow)
 
 
 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
-		      struct proto *prot);
+		      struct proto *prot, int kern);
 void sk_free(struct sock *sk);
 void sk_release_kernel(struct sock *sk);
 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 3b7ad43c7dad..d5871ac493eb 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1030,7 +1030,7 @@ static int atalk_create(struct net *net, struct socket *sock, int protocol,
 	if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
 		goto out;
 	rc = -ENOMEM;
-	sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto);
+	sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto, kern);
 	if (!sk)
 		goto out;
 	rc = 0;
diff --git a/net/atm/common.c b/net/atm/common.c
index ed0466637e13..49a872db7e42 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -141,7 +141,7 @@ static struct proto vcc_proto = {
 	.release_cb = vcc_release_cb,
 };
 
-int vcc_create(struct net *net, struct socket *sock, int protocol, int family)
+int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern)
 {
 	struct sock *sk;
 	struct atm_vcc *vcc;
@@ -149,7 +149,7 @@ int vcc_create(struct net *net, struct socket *sock, int protocol, int family)
 	sock->sk = NULL;
 	if (sock->type == SOCK_STREAM)
 		return -EINVAL;
-	sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto);
+	sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 	sock_init_data(sock, sk);
diff --git a/net/atm/common.h b/net/atm/common.h
index 4d6f5b2068ac..959436b87182 100644
--- a/net/atm/common.h
+++ b/net/atm/common.h
@@ -10,7 +10,7 @@
 #include <linux/poll.h> /* for poll_table */
 
 
-int vcc_create(struct net *net, struct socket *sock, int protocol, int family);
+int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern);
 int vcc_release(struct socket *sock);
 int vcc_connect(struct socket *sock, int itf, short vpi, int vci);
 int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index ae0324021407..040207ec399f 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -136,7 +136,7 @@ static int pvc_create(struct net *net, struct socket *sock, int protocol,
 		return -EAFNOSUPPORT;
 
 	sock->ops = &pvc_proto_ops;
-	return vcc_create(net, sock, protocol, PF_ATMPVC);
+	return vcc_create(net, sock, protocol, PF_ATMPVC, kern);
 }
 
 static const struct net_proto_family pvc_family_ops = {
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 1ba23f5018e7..3fa0a9ee98d1 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -660,7 +660,7 @@ static int svc_create(struct net *net, struct socket *sock, int protocol,
 		return -EAFNOSUPPORT;
 
 	sock->ops = &svc_proto_ops;
-	error = vcc_create(net, sock, protocol, AF_ATMSVC);
+	error = vcc_create(net, sock, protocol, AF_ATMSVC, kern);
 	if (error)
 		return error;
 	ATM_SD(sock)->local.sas_family = AF_ATMSVC;
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 330c1f4a5a0b..4273533d22b1 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -855,7 +855,7 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol,
 		return -ESOCKTNOSUPPORT;
 	}
 
-	sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto);
+	sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto, kern);
 	if (sk == NULL)
 		return -ENOMEM;
 
@@ -881,7 +881,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
 	struct sock *sk;
 	ax25_cb *ax25, *oax25;
 
-	sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC,	osk->sk_prot);
+	sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot, 0);
 	if (sk == NULL)
 		return NULL;
 
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
index bde2bdd9e929..b5116fa9835e 100644
--- a/net/bluetooth/bnep/sock.c
+++ b/net/bluetooth/bnep/sock.c
@@ -202,7 +202,7 @@ static int bnep_sock_create(struct net *net, struct socket *sock, int protocol,
 	if (sock->type != SOCK_RAW)
 		return -ESOCKTNOSUPPORT;
 
-	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto);
+	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
index d82787d417bd..ce86a7bae844 100644
--- a/net/bluetooth/cmtp/sock.c
+++ b/net/bluetooth/cmtp/sock.c
@@ -205,7 +205,7 @@ static int cmtp_sock_create(struct net *net, struct socket *sock, int protocol,
 	if (sock->type != SOCK_RAW)
 		return -ESOCKTNOSUPPORT;
 
-	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto);
+	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 56f9edbf3d05..5b14dcafcd08 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -1377,7 +1377,7 @@ static int hci_sock_create(struct net *net, struct socket *sock, int protocol,
 
 	sock->ops = &hci_sock_ops;
 
-	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto);
+	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
index cb3fdde1968a..008ba439bd62 100644
--- a/net/bluetooth/hidp/sock.c
+++ b/net/bluetooth/hidp/sock.c
@@ -235,7 +235,7 @@ static int hidp_sock_create(struct net *net, struct socket *sock, int protocol,
 	if (sock->type != SOCK_RAW)
 		return -ESOCKTNOSUPPORT;
 
-	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto);
+	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index a7278f05eafb..244287706f91 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -43,7 +43,7 @@ static struct bt_sock_list l2cap_sk_list = {
 static const struct proto_ops l2cap_sock_ops;
 static void l2cap_sock_init(struct sock *sk, struct sock *parent);
 static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock,
-				     int proto, gfp_t prio);
+				     int proto, gfp_t prio, int kern);
 
 bool l2cap_is_socket(struct socket *sock)
 {
@@ -1193,7 +1193,7 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan)
 	}
 
 	sk = l2cap_sock_alloc(sock_net(parent), NULL, BTPROTO_L2CAP,
-			      GFP_ATOMIC);
+			      GFP_ATOMIC, 0);
 	if (!sk) {
 		release_sock(parent);
 		return NULL;
@@ -1523,12 +1523,12 @@ static struct proto l2cap_proto = {
 };
 
 static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock,
-				     int proto, gfp_t prio)
+				     int proto, gfp_t prio, int kern)
 {
 	struct sock *sk;
 	struct l2cap_chan *chan;
 
-	sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto);
+	sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto, kern);
 	if (!sk)
 		return NULL;
 
@@ -1574,7 +1574,7 @@ static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol,
 
 	sock->ops = &l2cap_sock_ops;
 
-	sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC);
+	sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 825e8fb5114b..b2338e971b33 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -269,12 +269,12 @@ static struct proto rfcomm_proto = {
 	.obj_size	= sizeof(struct rfcomm_pinfo)
 };
 
-static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio)
+static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern)
 {
 	struct rfcomm_dlc *d;
 	struct sock *sk;
 
-	sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto);
+	sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto, kern);
 	if (!sk)
 		return NULL;
 
@@ -324,7 +324,7 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock,
 
 	sock->ops = &rfcomm_sock_ops;
 
-	sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC);
+	sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern);
 	if (!sk)
 		return -ENOMEM;
 
@@ -969,7 +969,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc *
 		goto done;
 	}
 
-	sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC);
+	sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC, 0);
 	if (!sk)
 		goto done;
 
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 4322c833e748..6b6e59dc54cf 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -460,11 +460,11 @@ static struct proto sco_proto = {
 	.obj_size	= sizeof(struct sco_pinfo)
 };
 
-static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio)
+static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern)
 {
 	struct sock *sk;
 
-	sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto);
+	sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto, kern);
 	if (!sk)
 		return NULL;
 
@@ -501,7 +501,7 @@ static int sco_sock_create(struct net *net, struct socket *sock, int protocol,
 
 	sock->ops = &sco_sock_ops;
 
-	sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC);
+	sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern);
 	if (!sk)
 		return -ENOMEM;
 
@@ -1026,7 +1026,7 @@ static void sco_conn_ready(struct sco_conn *conn)
 		bh_lock_sock(parent);
 
 		sk = sco_sock_alloc(sock_net(parent), NULL,
-				    BTPROTO_SCO, GFP_ATOMIC);
+				    BTPROTO_SCO, GFP_ATOMIC, 0);
 		if (!sk) {
 			bh_unlock_sock(parent);
 			sco_conn_unlock(conn);
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 4ec0c803aef1..78a04ebb113c 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -1047,7 +1047,7 @@ static int caif_create(struct net *net, struct socket *sock, int protocol,
 	 * is really not used at all in the net/core or socket.c but the
 	 * initialization makes sure that sock->state is not uninitialized.
 	 */
-	sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot);
+	sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 32d710eaf1fc..d4d404bdfc9a 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -179,7 +179,7 @@ static int can_create(struct net *net, struct socket *sock, int protocol,
 
 	sock->ops = cp->ops;
 
-	sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot);
+	sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot, kern);
 	if (!sk) {
 		err = -ENOMEM;
 		goto errout;
diff --git a/net/core/sock.c b/net/core/sock.c
index e891bcf325ca..cbc3789b830c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1396,9 +1396,10 @@ EXPORT_SYMBOL_GPL(sock_update_netprioidx);
  *	@family: protocol family
  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
  *	@prot: struct proto associated with this new sock instance
+ *	@kern: is this to be a kernel socket?
  */
 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
-		      struct proto *prot)
+		      struct proto *prot, int kern)
 {
 	struct sock *sk;
 
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 754484b3cd0e..675cf94e04f8 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -468,10 +468,10 @@ static struct proto dn_proto = {
 	.obj_size		= sizeof(struct dn_sock),
 };
 
-static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp)
+static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp, int kern)
 {
 	struct dn_scp *scp;
-	struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto);
+	struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto, kern);
 
 	if  (!sk)
 		goto out;
@@ -693,7 +693,7 @@ static int dn_create(struct net *net, struct socket *sock, int protocol,
 	}
 
 
-	if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL)) == NULL)
+	if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL, kern)) == NULL)
 		return -ENOBUFS;
 
 	sk->sk_protocol = protocol;
@@ -1096,7 +1096,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	cb = DN_SKB_CB(skb);
 	sk->sk_ack_backlog--;
-	newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation);
+	newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, 0);
 	if (newsk == NULL) {
 		release_sock(sk);
 		kfree_skb(skb);
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index b60c65f70346..7aaaf967df58 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -1014,7 +1014,7 @@ static int ieee802154_create(struct net *net, struct socket *sock,
 	}
 
 	rc = -ENOMEM;
-	sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto);
+	sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto, kern);
 	if (!sk)
 		goto out;
 	rc = 0;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 09f4d024dfe5..e2dd9cb99d61 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -317,7 +317,7 @@ lookup_protocol:
 	WARN_ON(!answer_prot->slab);
 
 	err = -ENOBUFS;
-	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
+	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
 	if (!sk)
 		goto out;
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 4632afa57e05..f3866c0b6cfe 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -167,7 +167,7 @@ lookup_protocol:
 	WARN_ON(!answer_prot->slab);
 
 	err = -ENOBUFS;
-	sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot);
+	sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot, kern);
 	if (!sk)
 		goto out;
 
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 4ea5d7497b5f..48d0dc89b58d 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1347,7 +1347,7 @@ static int ipx_create(struct net *net, struct socket *sock, int protocol,
 		goto out;
 
 	rc = -ENOMEM;
-	sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto);
+	sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto, kern);
 	if (!sk)
 		goto out;
 
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index ee0ea25c8e7a..fae6822cc367 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1100,7 +1100,7 @@ static int irda_create(struct net *net, struct socket *sock, int protocol,
 	}
 
 	/* Allocate networking socket */
-	sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto);
+	sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto, kern);
 	if (sk == NULL)
 		return -ENOMEM;
 
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 6daa52a18d40..918151c11348 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -535,12 +535,12 @@ static void iucv_sock_init(struct sock *sk, struct sock *parent)
 		sk->sk_type = parent->sk_type;
 }
 
-static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio)
+static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, int kern)
 {
 	struct sock *sk;
 	struct iucv_sock *iucv;
 
-	sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto);
+	sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto, kern);
 	if (!sk)
 		return NULL;
 	iucv = iucv_sk(sk);
@@ -602,7 +602,7 @@ static int iucv_sock_create(struct net *net, struct socket *sock, int protocol,
 		return -ESOCKTNOSUPPORT;
 	}
 
-	sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL);
+	sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL, kern);
 	if (!sk)
 		return -ENOMEM;
 
@@ -1723,7 +1723,7 @@ static int iucv_callback_connreq(struct iucv_path *path,
 	}
 
 	/* Create the new socket */
-	nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC);
+	nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0);
 	if (!nsk) {
 		err = pr_iucv->path_sever(path, user_data);
 		iucv_path_free(path);
@@ -1933,7 +1933,7 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb)
 		goto out;
 	}
 
-	nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC);
+	nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0);
 	bh_lock_sock(sk);
 	if ((sk->sk_state != IUCV_LISTEN) ||
 	    sk_acceptq_is_full(sk) ||
diff --git a/net/key/af_key.c b/net/key/af_key.c
index f0d52d721b3a..9e834ec475a9 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -149,7 +149,7 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol,
 		return -EPROTONOSUPPORT;
 
 	err = -ENOMEM;
-	sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto);
+	sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto, kern);
 	if (sk == NULL)
 		goto out;
 
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index e9b0dec56b8e..f56c9f69e9f2 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -542,12 +542,12 @@ static int pppol2tp_backlog_recv(struct sock *sk, struct sk_buff *skb)
 
 /* socket() handler. Initialize a new struct sock.
  */
-static int pppol2tp_create(struct net *net, struct socket *sock)
+static int pppol2tp_create(struct net *net, struct socket *sock, int kern)
 {
 	int error = -ENOMEM;
 	struct sock *sk;
 
-	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto);
+	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto, kern);
 	if (!sk)
 		goto out;
 
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 17a8dff06090..8fd9febaa5ba 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -168,7 +168,7 @@ static int llc_ui_create(struct net *net, struct socket *sock, int protocol,
 
 	if (likely(sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM)) {
 		rc = -ENOMEM;
-		sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto);
+		sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto, kern);
 		if (sk) {
 			rc = 0;
 			llc_ui_sk_init(sock, sk);
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 81a61fce3afb..3e821daf9dd4 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -768,7 +768,7 @@ static struct sock *llc_create_incoming_sock(struct sock *sk,
 					     struct llc_addr *daddr)
 {
 	struct sock *newsk = llc_sk_alloc(sock_net(sk), sk->sk_family, GFP_ATOMIC,
-					  sk->sk_prot);
+					  sk->sk_prot, 0);
 	struct llc_sock *newllc, *llc = llc_sk(sk);
 
 	if (!newsk)
@@ -931,9 +931,9 @@ static void llc_sk_init(struct sock *sk)
  *	Allocates a LLC sock and initializes it. Returns the new LLC sock
  *	or %NULL if there's no memory available for one
  */
-struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot)
+struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern)
 {
-	struct sock *sk = sk_alloc(net, family, priority, prot);
+	struct sock *sk = sk_alloc(net, family, priority, prot, kern);
 
 	if (!sk)
 		goto out;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index a5fff75accf8..be6665ab7f40 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1119,14 +1119,15 @@ static struct proto netlink_proto = {
 };
 
 static int __netlink_create(struct net *net, struct socket *sock,
-			    struct mutex *cb_mutex, int protocol)
+			    struct mutex *cb_mutex, int protocol,
+			    int kern)
 {
 	struct sock *sk;
 	struct netlink_sock *nlk;
 
 	sock->ops = &netlink_ops;
 
-	sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto);
+	sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
@@ -1188,7 +1189,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol,
 	if (err < 0)
 		goto out;
 
-	err = __netlink_create(net, sock, cb_mutex, protocol);
+	err = __netlink_create(net, sock, cb_mutex, protocol, kern);
 	if (err < 0)
 		goto out_module;
 
@@ -2515,14 +2516,12 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module,
 
 	if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
 		return NULL;
-
 	/*
 	 * We have to just have a reference on the net from sk, but don't
 	 * get_net it. Besides, we cannot get and then put the net here.
 	 * So we create one inside init_net and the move it to net.
 	 */
-
-	if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0)
+	if (__netlink_create(&init_net, sock, cb_mutex, unit, 0) < 0)
 		goto out_sock_release_nosk;
 
 	sk = sock->sk;
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index b987fd56c3c5..ed212ffc1d9d 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -433,7 +433,7 @@ static int nr_create(struct net *net, struct socket *sock, int protocol,
 	if (sock->type != SOCK_SEQPACKET || protocol != 0)
 		return -ESOCKTNOSUPPORT;
 
-	sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto);
+	sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto, kern);
 	if (sk  == NULL)
 		return -ENOMEM;
 
@@ -476,7 +476,7 @@ static struct sock *nr_make_new(struct sock *osk)
 	if (osk->sk_type != SOCK_SEQPACKET)
 		return NULL;
 
-	sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot);
+	sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot, 0);
 	if (sk == NULL)
 		return NULL;
 
diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c
index 2277276f52bc..54e40fa47822 100644
--- a/net/nfc/af_nfc.c
+++ b/net/nfc/af_nfc.c
@@ -40,7 +40,7 @@ static int nfc_sock_create(struct net *net, struct socket *sock, int proto,
 
 	read_lock(&proto_tab_lock);
 	if (proto_tab[proto] &&	try_module_get(proto_tab[proto]->owner)) {
-		rc = proto_tab[proto]->create(net, sock, proto_tab[proto]);
+		rc = proto_tab[proto]->create(net, sock, proto_tab[proto], kern);
 		module_put(proto_tab[proto]->owner);
 	}
 	read_unlock(&proto_tab_lock);
diff --git a/net/nfc/llcp.h b/net/nfc/llcp.h
index de1789e3cc82..1f68724d44d3 100644
--- a/net/nfc/llcp.h
+++ b/net/nfc/llcp.h
@@ -225,7 +225,7 @@ void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local,
 			       struct sk_buff *skb, u8 direction);
 
 /* Sock API */
-struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp);
+struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern);
 void nfc_llcp_sock_free(struct nfc_llcp_sock *sock);
 void nfc_llcp_accept_unlink(struct sock *sk);
 void nfc_llcp_accept_enqueue(struct sock *parent, struct sock *sk);
diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c
index b18f07ccb504..98876274a1ee 100644
--- a/net/nfc/llcp_core.c
+++ b/net/nfc/llcp_core.c
@@ -934,7 +934,7 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local,
 		sock->ssap = ssap;
 	}
 
-	new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC);
+	new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC, 0);
 	if (new_sk == NULL) {
 		reason = LLCP_DM_REJ;
 		release_sock(&sock->sk);
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index 9578bd6a4f3e..b7de0da46acd 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -942,12 +942,12 @@ static void llcp_sock_destruct(struct sock *sk)
 	}
 }
 
-struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp)
+struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern)
 {
 	struct sock *sk;
 	struct nfc_llcp_sock *llcp_sock;
 
-	sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto);
+	sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto, kern);
 	if (!sk)
 		return NULL;
 
@@ -993,7 +993,7 @@ void nfc_llcp_sock_free(struct nfc_llcp_sock *sock)
 }
 
 static int llcp_sock_create(struct net *net, struct socket *sock,
-			    const struct nfc_protocol *nfc_proto)
+			    const struct nfc_protocol *nfc_proto, int kern)
 {
 	struct sock *sk;
 
@@ -1009,7 +1009,7 @@ static int llcp_sock_create(struct net *net, struct socket *sock,
 	else
 		sock->ops = &llcp_sock_ops;
 
-	sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC);
+	sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern);
 	if (sk == NULL)
 		return -ENOMEM;
 
diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h
index a8ce80b47720..5c93e8412a26 100644
--- a/net/nfc/nfc.h
+++ b/net/nfc/nfc.h
@@ -30,7 +30,7 @@ struct nfc_protocol {
 	struct proto *proto;
 	struct module *owner;
 	int (*create)(struct net *net, struct socket *sock,
-		      const struct nfc_protocol *nfc_proto);
+		      const struct nfc_protocol *nfc_proto, int kern);
 };
 
 struct nfc_rawsock {
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index 82b4e8024778..e9a91488fe3d 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -334,7 +334,7 @@ static void rawsock_destruct(struct sock *sk)
 }
 
 static int rawsock_create(struct net *net, struct socket *sock,
-			  const struct nfc_protocol *nfc_proto)
+			  const struct nfc_protocol *nfc_proto, int kern)
 {
 	struct sock *sk;
 
@@ -348,7 +348,7 @@ static int rawsock_create(struct net *net, struct socket *sock,
 	else
 		sock->ops = &rawsock_ops;
 
-	sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto);
+	sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 5102c3cc4eec..94713276a1d9 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2832,7 +2832,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	sock->state = SS_UNCONNECTED;
 
 	err = -ENOBUFS;
-	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
+	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
 	if (sk == NULL)
 		goto out;
 
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 32ab87d34828..10d42f3220ab 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -97,7 +97,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol,
 		goto out;
 	}
 
-	sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot);
+	sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot, kern);
 	if (sk == NULL) {
 		err = -ENOMEM;
 		goto out;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 6de2aeb98a1f..850a86cde0b3 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -845,7 +845,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp)
 	}
 
 	/* Create a new to-be-accepted sock */
-	newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot);
+	newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, 0);
 	if (!newsk) {
 		pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL);
 		err = -ENOBUFS;
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 10443377fb9d..3d83641f2861 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -440,7 +440,7 @@ static int rds_create(struct net *net, struct socket *sock, int protocol,
 	if (sock->type != SOCK_SEQPACKET || protocol)
 		return -ESOCKTNOSUPPORT;
 
-	sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto);
+	sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 8ae603069a1a..36dbc2da3661 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -520,7 +520,7 @@ static int rose_create(struct net *net, struct socket *sock, int protocol,
 	if (sock->type != SOCK_SEQPACKET || protocol != 0)
 		return -ESOCKTNOSUPPORT;
 
-	sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto);
+	sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto, kern);
 	if (sk == NULL)
 		return -ENOMEM;
 
@@ -559,7 +559,7 @@ static struct sock *rose_make_new(struct sock *osk)
 	if (osk->sk_type != SOCK_SEQPACKET)
 		return NULL;
 
-	sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto);
+	sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto, 0);
 	if (sk == NULL)
 		return NULL;
 
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 0095b9a0b779..25d60ed15284 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -632,7 +632,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
 	sock->ops = &rxrpc_rpc_ops;
 	sock->state = SS_UNCONNECTED;
 
-	sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto);
+	sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 0e4198ee2370..e703ff7fed40 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -635,7 +635,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
 	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
 	struct sctp6_sock *newsctp6sk;
 
-	newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot);
+	newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, 0);
 	if (!newsk)
 		goto out;
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 53b7acde9aa3..59e80356672b 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -550,7 +550,7 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
 					     struct sctp_association *asoc)
 {
 	struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL,
-			sk->sk_prot);
+			sk->sk_prot, 0);
 	struct inet_sock *newinet;
 
 	if (!newsk)
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 9074b5cede38..8f3c8e2cef8e 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -342,7 +342,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
 	}
 
 	/* Allocate socket's protocol area */
-	sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto);
+	sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto, kern);
 	if (sk == NULL)
 		return -ENOMEM;
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 5266ea7b922b..941b3d26e3bf 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -620,7 +620,7 @@ static struct proto unix_proto = {
  */
 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 
-static struct sock *unix_create1(struct net *net, struct socket *sock)
+static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 {
 	struct sock *sk = NULL;
 	struct unix_sock *u;
@@ -629,7 +629,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 		goto out;
 
-	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
+	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
 	if (!sk)
 		goto out;
 
@@ -688,7 +688,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol,
 		return -ESOCKTNOSUPPORT;
 	}
 
-	return unix_create1(net, sock) ? 0 : -ENOMEM;
+	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
 }
 
 static int unix_release(struct socket *sock)
@@ -1088,7 +1088,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	err = -ENOMEM;
 
 	/* create new sock for complete connection */
-	newsk = unix_create1(sock_net(sk), NULL);
+	newsk = unix_create1(sock_net(sk), NULL, 0);
 	if (newsk == NULL)
 		goto out;
 
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 2ec86e652a19..df5fc6b340f1 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -581,13 +581,14 @@ struct sock *__vsock_create(struct net *net,
 			    struct socket *sock,
 			    struct sock *parent,
 			    gfp_t priority,
-			    unsigned short type)
+			    unsigned short type,
+			    int kern)
 {
 	struct sock *sk;
 	struct vsock_sock *psk;
 	struct vsock_sock *vsk;
 
-	sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto);
+	sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern);
 	if (!sk)
 		return NULL;
 
@@ -1866,7 +1867,7 @@ static int vsock_create(struct net *net, struct socket *sock,
 
 	sock->state = SS_UNCONNECTED;
 
-	return __vsock_create(net, sock, NULL, GFP_KERNEL, 0) ? 0 : -ENOMEM;
+	return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ? 0 : -ENOMEM;
 }
 
 static const struct net_proto_family vsock_family_ops = {
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index c294da095461..1f63daff3965 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1022,7 +1022,7 @@ static int vmci_transport_recv_listen(struct sock *sk,
 	}
 
 	pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
-				 sk->sk_type);
+				 sk->sk_type, 0);
 	if (!pending) {
 		vmci_transport_send_reset(sk, pkt);
 		return -ENOMEM;
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index c3ab230e4493..a750f330b8dd 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -515,10 +515,10 @@ static struct proto x25_proto = {
 	.obj_size = sizeof(struct x25_sock),
 };
 
-static struct sock *x25_alloc_socket(struct net *net)
+static struct sock *x25_alloc_socket(struct net *net, int kern)
 {
 	struct x25_sock *x25;
-	struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto);
+	struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto, kern);
 
 	if (!sk)
 		goto out;
@@ -553,7 +553,7 @@ static int x25_create(struct net *net, struct socket *sock, int protocol,
 		goto out;
 
 	rc = -ENOBUFS;
-	if ((sk = x25_alloc_socket(net)) == NULL)
+	if ((sk = x25_alloc_socket(net, kern)) == NULL)
 		goto out;
 
 	x25 = x25_sk(sk);
@@ -602,7 +602,7 @@ static struct sock *x25_make_new(struct sock *osk)
 	if (osk->sk_type != SOCK_SEQPACKET)
 		goto out;
 
-	if ((sk = x25_alloc_socket(sock_net(osk))) == NULL)
+	if ((sk = x25_alloc_socket(sock_net(osk), 0)) == NULL)
 		goto out;
 
 	x25 = x25_sk(sk);
-- 
cgit v1.2.3


From 26abe14379f8e2fa3fd1bcf97c9a7ad9364886fe Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 8 May 2015 21:10:31 -0500
Subject: net: Modify sk_alloc to not reference count the netns of kernel
 sockets.

Now that sk_alloc knows when a kernel socket is being allocated modify
it to not reference count the network namespace of kernel sockets.

Keep track of if a socket needs reference counting by adding a flag to
struct sock called sk_net_refcnt.

Update all of the callers of sock_create_kern to stop using
sk_change_net and sk_release_kernel as those hacks are no longer
needed, to avoid reference counting a kernel socket.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_common.h       |  2 +-
 include/net/sock.h              |  2 ++
 net/core/sock.c                 |  8 ++++++--
 net/ipv4/af_inet.c              |  4 +---
 net/ipv4/udp_tunnel.c           |  8 +++-----
 net/ipv6/ip6_udp_tunnel.c       |  6 ++----
 net/l2tp/l2tp_core.c            | 15 ++++++---------
 net/netfilter/ipvs/ip_vs_sync.c | 30 +++++++++---------------------
 8 files changed, 30 insertions(+), 45 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 4a92423eefa5..279f83591971 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -41,7 +41,7 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len,
 
 static inline void inet_ctl_sock_destroy(struct sock *sk)
 {
-	sk_release_kernel(sk);
+	sock_release(sk->sk_socket);
 }
 
 #endif
diff --git a/include/net/sock.h b/include/net/sock.h
index d8dcf91732b0..9e6b2c0b4741 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -184,6 +184,7 @@ struct sock_common {
 	unsigned char		skc_reuse:4;
 	unsigned char		skc_reuseport:1;
 	unsigned char		skc_ipv6only:1;
+	unsigned char		skc_net_refcnt:1;
 	int			skc_bound_dev_if;
 	union {
 		struct hlist_node	skc_bind_node;
@@ -323,6 +324,7 @@ struct sock {
 #define sk_reuse		__sk_common.skc_reuse
 #define sk_reuseport		__sk_common.skc_reuseport
 #define sk_ipv6only		__sk_common.skc_ipv6only
+#define sk_net_refcnt		__sk_common.skc_net_refcnt
 #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_prot			__sk_common.skc_prot
diff --git a/net/core/sock.c b/net/core/sock.c
index cbc3789b830c..9e8968e9ebeb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1412,7 +1412,10 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		 */
 		sk->sk_prot = sk->sk_prot_creator = prot;
 		sock_lock_init(sk);
-		sock_net_set(sk, get_net(net));
+		sk->sk_net_refcnt = kern ? 0 : 1;
+		if (likely(sk->sk_net_refcnt))
+			get_net(net);
+		sock_net_set(sk, net);
 		atomic_set(&sk->sk_wmem_alloc, 1);
 
 		sock_update_classid(sk);
@@ -1446,7 +1449,8 @@ static void __sk_free(struct sock *sk)
 	if (sk->sk_peer_cred)
 		put_cred(sk->sk_peer_cred);
 	put_pid(sk->sk_peer_pid);
-	put_net(sock_net(sk));
+	if (likely(sk->sk_net_refcnt))
+		put_net(sock_net(sk));
 	sk_prot_free(sk->sk_prot_creator, sk);
 }
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e2dd9cb99d61..235d36afece3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1430,7 +1430,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
 			 struct net *net)
 {
 	struct socket *sock;
-	int rc = sock_create_kern(&init_net, family, type, protocol, &sock);
+	int rc = sock_create_kern(net, family, type, protocol, &sock);
 
 	if (rc == 0) {
 		*sk = sock->sk;
@@ -1440,8 +1440,6 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
 		 * we do not wish this socket to see incoming packets.
 		 */
 		(*sk)->sk_prot->unhash(*sk);
-
-		sk_change_net(*sk, net);
 	}
 	return rc;
 }
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 4e2837476967..933ea903f7b8 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -15,12 +15,10 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
 	struct socket *sock = NULL;
 	struct sockaddr_in udp_addr;
 
-	err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &sock);
+	err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock);
 	if (err < 0)
 		goto error;
 
-	sk_change_net(sock->sk, net);
-
 	udp_addr.sin_family = AF_INET;
 	udp_addr.sin_addr = cfg->local_ip;
 	udp_addr.sin_port = cfg->local_udp_port;
@@ -47,7 +45,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
 error:
 	if (sock) {
 		kernel_sock_shutdown(sock, SHUT_RDWR);
-		sk_release_kernel(sock->sk);
+		sock_release(sock);
 	}
 	*sockp = NULL;
 	return err;
@@ -101,7 +99,7 @@ void udp_tunnel_sock_release(struct socket *sock)
 {
 	rcu_assign_sk_user_data(sock->sk, NULL);
 	kernel_sock_shutdown(sock, SHUT_RDWR);
-	sk_release_kernel(sock->sk);
+	sock_release(sock);
 }
 EXPORT_SYMBOL_GPL(udp_tunnel_sock_release);
 
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index 478576b61214..e1a1136bda7c 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -19,12 +19,10 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
 	int err;
 	struct socket *sock = NULL;
 
-	err = sock_create_kern(&init_net, AF_INET6, SOCK_DGRAM, 0, &sock);
+	err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, 0, &sock);
 	if (err < 0)
 		goto error;
 
-	sk_change_net(sock->sk, net);
-
 	udp6_addr.sin6_family = AF_INET6;
 	memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
 	       sizeof(udp6_addr.sin6_addr));
@@ -55,7 +53,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
 error:
 	if (sock) {
 		kernel_sock_shutdown(sock, SHUT_RDWR);
-		sk_release_kernel(sock->sk);
+		sock_release(sock);
 	}
 	*sockp = NULL;
 	return err;
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index ae513a2fe7f3..f6b090df3930 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1334,9 +1334,10 @@ static void l2tp_tunnel_del_work(struct work_struct *work)
 		if (sock)
 			inet_shutdown(sock, 2);
 	} else {
-		if (sock)
+		if (sock) {
 			kernel_sock_shutdown(sock, SHUT_RDWR);
-		sk_release_kernel(sk);
+			sock_release(sock);
+		}
 	}
 
 	l2tp_tunnel_sock_put(sk);
@@ -1399,13 +1400,11 @@ static int l2tp_tunnel_sock_create(struct net *net,
 		if (cfg->local_ip6 && cfg->peer_ip6) {
 			struct sockaddr_l2tpip6 ip6_addr = {0};
 
-			err = sock_create_kern(&init_net, AF_INET6, SOCK_DGRAM,
+			err = sock_create_kern(net, AF_INET6, SOCK_DGRAM,
 					  IPPROTO_L2TP, &sock);
 			if (err < 0)
 				goto out;
 
-			sk_change_net(sock->sk, net);
-
 			ip6_addr.l2tp_family = AF_INET6;
 			memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6,
 			       sizeof(ip6_addr.l2tp_addr));
@@ -1429,13 +1428,11 @@ static int l2tp_tunnel_sock_create(struct net *net,
 		{
 			struct sockaddr_l2tpip ip_addr = {0};
 
-			err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM,
+			err = sock_create_kern(net, AF_INET, SOCK_DGRAM,
 					  IPPROTO_L2TP, &sock);
 			if (err < 0)
 				goto out;
 
-			sk_change_net(sock->sk, net);
-
 			ip_addr.l2tp_family = AF_INET;
 			ip_addr.l2tp_addr = cfg->local_ip;
 			ip_addr.l2tp_conn_id = tunnel_id;
@@ -1462,7 +1459,7 @@ out:
 	*sockp = sock;
 	if ((err < 0) && sock) {
 		kernel_sock_shutdown(sock, SHUT_RDWR);
-		sk_release_kernel(sock->sk);
+		sock_release(sock);
 		*sockp = NULL;
 	}
 
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 2e9a5b5d1239..b08ba9538d12 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1457,18 +1457,12 @@ static struct socket *make_send_sock(struct net *net, int id)
 	struct socket *sock;
 	int result;
 
-	/* First create a socket move it to right name space later */
-	result = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	/* First create a socket */
+	result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
 	if (result < 0) {
 		pr_err("Error during creation of socket; terminating\n");
 		return ERR_PTR(result);
 	}
-	/*
-	 * Kernel sockets that are a part of a namespace, should not
-	 * hold a reference to a namespace in order to allow to stop it.
-	 * After sk_change_net should be released using sk_release_kernel.
-	 */
-	sk_change_net(sock->sk, net);
 	result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);
 	if (result < 0) {
 		pr_err("Error setting outbound mcast interface\n");
@@ -1497,7 +1491,7 @@ static struct socket *make_send_sock(struct net *net, int id)
 	return sock;
 
 error:
-	sk_release_kernel(sock->sk);
+	sock_release(sock);
 	return ERR_PTR(result);
 }
 
@@ -1518,17 +1512,11 @@ static struct socket *make_receive_sock(struct net *net, int id)
 	int result;
 
 	/* First create a socket */
-	result = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
 	if (result < 0) {
 		pr_err("Error during creation of socket; terminating\n");
 		return ERR_PTR(result);
 	}
-	/*
-	 * Kernel sockets that are a part of a namespace, should not
-	 * hold a reference to a namespace in order to allow to stop it.
-	 * After sk_change_net should be released using sk_release_kernel.
-	 */
-	sk_change_net(sock->sk, net);
 	/* it is equivalent to the REUSEADDR option in user-space */
 	sock->sk->sk_reuse = SK_CAN_REUSE;
 	result = sysctl_sync_sock_size(ipvs);
@@ -1554,7 +1542,7 @@ static struct socket *make_receive_sock(struct net *net, int id)
 	return sock;
 
 error:
-	sk_release_kernel(sock->sk);
+	sock_release(sock);
 	return ERR_PTR(result);
 }
 
@@ -1692,7 +1680,7 @@ done:
 		ip_vs_sync_buff_release(sb);
 
 	/* release the sending multicast socket */
-	sk_release_kernel(tinfo->sock->sk);
+	sock_release(tinfo->sock);
 	kfree(tinfo);
 
 	return 0;
@@ -1729,7 +1717,7 @@ static int sync_thread_backup(void *data)
 	}
 
 	/* release the sending multicast socket */
-	sk_release_kernel(tinfo->sock->sk);
+	sock_release(tinfo->sock);
 	kfree(tinfo->buf);
 	kfree(tinfo);
 
@@ -1854,11 +1842,11 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
 	return 0;
 
 outsocket:
-	sk_release_kernel(sock->sk);
+	sock_release(sock);
 
 outtinfo:
 	if (tinfo) {
-		sk_release_kernel(tinfo->sock->sk);
+		sock_release(tinfo->sock);
 		kfree(tinfo->buf);
 		kfree(tinfo);
 	}
-- 
cgit v1.2.3


From affb9792f1d99e1e4d64411e147b648d65f2576e Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 8 May 2015 21:12:13 -0500
Subject: net: kill sk_change_net and sk_release_kernel

These functions are no longer needed and no longer used kill them.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 17 -----------------
 net/core/sock.c    | 19 -------------------
 2 files changed, 36 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 9e6b2c0b4741..d882f4c8e438 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1518,7 +1518,6 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow)
 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		      struct proto *prot, int kern);
 void sk_free(struct sock *sk);
-void sk_release_kernel(struct sock *sk);
 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
 
 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
@@ -2194,22 +2193,6 @@ void sock_net_set(struct sock *sk, struct net *net)
 	write_pnet(&sk->sk_net, net);
 }
 
-/*
- * Kernel sockets, f.e. rtnl or icmp_socket, are a part of a namespace.
- * They should not hold a reference to a namespace in order to allow
- * to stop it.
- * Sockets after sk_change_net should be released using sk_release_kernel
- */
-static inline void sk_change_net(struct sock *sk, struct net *net)
-{
-	struct net *current_net = sock_net(sk);
-
-	if (!net_eq(current_net, net)) {
-		put_net(current_net);
-		sock_net_set(sk, net);
-	}
-}
-
 static inline struct sock *skb_steal_sock(struct sk_buff *skb)
 {
 	if (skb->sk) {
diff --git a/net/core/sock.c b/net/core/sock.c
index 9e8968e9ebeb..c18738a795b0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1466,25 +1466,6 @@ void sk_free(struct sock *sk)
 }
 EXPORT_SYMBOL(sk_free);
 
-/*
- * Last sock_put should drop reference to sk->sk_net. It has already
- * been dropped in sk_change_net. Taking reference to stopping namespace
- * is not an option.
- * Take reference to a socket to remove it from hash _alive_ and after that
- * destroy it in the context of init_net.
- */
-void sk_release_kernel(struct sock *sk)
-{
-	if (sk == NULL || sk->sk_socket == NULL)
-		return;
-
-	sock_hold(sk);
-	sock_net_set(sk, get_net(&init_net));
-	sock_release(sk->sk_socket);
-	sock_put(sk);
-}
-EXPORT_SYMBOL(sk_release_kernel);
-
 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
 {
 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-- 
cgit v1.2.3


From 6791e4661c4bd3e9f193a84247f2c389578a4336 Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Sat, 9 May 2015 00:01:55 -0700
Subject: bonding: Allow userspace to set actors' system_priority in AD system

This patch allows user to randomize the system-priority in an ad-system.
The allowed range is 1 - 0xFFFF while default value is 0xFFFF. If user
does not specify this value, the system defaults to 0xFFFF, which is
what it was before this patch.

Following example code could set the value -
    # modprobe bonding mode=4
    # sys_prio=$(( 1 + RANDOM + RANDOM ))
    # echo $sys_prio > /sys/class/net/bond0/bonding/ad_actor_sys_prio
    # echo +eth1 > /sys/class/net/bond0/bonding/slaves
    ...
    # ip link set bond0 up

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@redhat.com>
[jt: * fixed up style issues reported by checkpatch
     * changed how the default value is set in bond_check_params(), this
       makes the default consistent between what gets set for a new bond
       and what the default is claimed to be in the bonding options.]
Signed-off-by: Jonathan Toppins <jtoppins@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/bonding.txt |  9 +++++++++
 drivers/net/bonding/bond_3ad.c       |  5 ++++-
 drivers/net/bonding/bond_main.c      | 12 ++++++++++++
 drivers/net/bonding/bond_options.c   | 28 +++++++++++++++++++++++++++-
 drivers/net/bonding/bond_procfs.c    |  2 ++
 drivers/net/bonding/bond_sysfs.c     | 15 +++++++++++++++
 include/net/bond_options.h           |  1 +
 include/net/bonding.h                |  1 +
 8 files changed, 71 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index 83bf4986baea..34946115acec 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -178,6 +178,15 @@ active_slave
 	active slave, or the empty string if there is no active slave or
 	the current mode does not use an active slave.
 
+ad_actor_sys_prio
+
+	In an AD system, this specifies the system priority. The allowed range
+	is 1 - 65535. If the value is not specified, it takes 65535 as the
+	default value.
+
+	This parameter has effect only in 802.3ad mode and is available through
+	SysFs interface.
+
 ad_select
 
 	Specifies the 802.3ad aggregation selection logic to use.  The
diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index fbd54f0e32e8..4c003bc87d4b 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -1908,7 +1908,8 @@ void bond_3ad_initialize(struct bonding *bond, u16 tick_resolution)
 
 		BOND_AD_INFO(bond).aggregator_identifier = 0;
 
-		BOND_AD_INFO(bond).system.sys_priority = 0xFFFF;
+		BOND_AD_INFO(bond).system.sys_priority =
+			bond->params.ad_actor_sys_prio;
 		BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->dev->dev_addr);
 
 		/* initialize how many times this module is called in one
@@ -1959,6 +1960,8 @@ void bond_3ad_bind_slave(struct slave *slave)
 			port->sm_vars &= ~AD_PORT_LACP_ENABLED;
 		/* actor system is the bond's system */
 		port->actor_system = BOND_AD_INFO(bond).system.sys_mac_addr;
+		port->actor_system_priority =
+		    BOND_AD_INFO(bond).system.sys_priority;
 		/* tx timer(to verify that no more than MAX_TX_IN_SECOND
 		 * lacpdu's are sent in one second)
 		 */
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index d5fe5d5f490f..5f2f28f0e927 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4140,6 +4140,7 @@ static int bond_check_params(struct bond_params *params)
 	struct bond_opt_value newval;
 	const struct bond_opt_value *valptr;
 	int arp_all_targets_value;
+	u16 ad_actor_sys_prio = 0;
 
 	/* Convert string parameters. */
 	if (mode) {
@@ -4434,6 +4435,16 @@ static int bond_check_params(struct bond_params *params)
 		fail_over_mac_value = BOND_FOM_NONE;
 	}
 
+	bond_opt_initstr(&newval, "default");
+	valptr = bond_opt_parse(
+			bond_opt_get(BOND_OPT_AD_ACTOR_SYS_PRIO),
+				     &newval);
+	if (!valptr) {
+		pr_err("Error: No ad_actor_sys_prio default value");
+		return -EINVAL;
+	}
+	ad_actor_sys_prio = valptr->value;
+
 	if (lp_interval == 0) {
 		pr_warn("Warning: ip_interval must be between 1 and %d, so it was reset to %d\n",
 			INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL);
@@ -4462,6 +4473,7 @@ static int bond_check_params(struct bond_params *params)
 	params->lp_interval = lp_interval;
 	params->packets_per_slave = packets_per_slave;
 	params->tlb_dynamic_lb = 1; /* Default value */
+	params->ad_actor_sys_prio = ad_actor_sys_prio;
 	if (packets_per_slave > 0) {
 		params->reciprocal_packets_per_slave =
 			reciprocal_value(packets_per_slave);
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 4df28943d222..d2b47e5e99f7 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -70,6 +70,8 @@ static int bond_option_slaves_set(struct bonding *bond,
 				  const struct bond_opt_value *newval);
 static int bond_option_tlb_dynamic_lb_set(struct bonding *bond,
 				  const struct bond_opt_value *newval);
+static int bond_option_ad_actor_sys_prio_set(struct bonding *bond,
+					     const struct bond_opt_value *newval);
 
 
 static const struct bond_opt_value bond_mode_tbl[] = {
@@ -186,6 +188,12 @@ static const struct bond_opt_value bond_tlb_dynamic_lb_tbl[] = {
 	{ NULL,  -1, 0}
 };
 
+static const struct bond_opt_value bond_ad_actor_sys_prio_tbl[] = {
+	{ "minval",  1,     BOND_VALFLAG_MIN},
+	{ "maxval",  65535, BOND_VALFLAG_MAX | BOND_VALFLAG_DEFAULT},
+	{ NULL,      -1,    0},
+};
+
 static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 	[BOND_OPT_MODE] = {
 		.id = BOND_OPT_MODE,
@@ -379,7 +387,15 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 		.values = bond_tlb_dynamic_lb_tbl,
 		.flags = BOND_OPTFLAG_IFDOWN,
 		.set = bond_option_tlb_dynamic_lb_set,
-	}
+	},
+	[BOND_OPT_AD_ACTOR_SYS_PRIO] = {
+		.id = BOND_OPT_AD_ACTOR_SYS_PRIO,
+		.name = "ad_actor_sys_prio",
+		.unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)),
+		.flags = BOND_OPTFLAG_IFDOWN,
+		.values = bond_ad_actor_sys_prio_tbl,
+		.set = bond_option_ad_actor_sys_prio_set,
+	},
 };
 
 /* Searches for an option by name */
@@ -1349,3 +1365,13 @@ static int bond_option_tlb_dynamic_lb_set(struct bonding *bond,
 
 	return 0;
 }
+
+static int bond_option_ad_actor_sys_prio_set(struct bonding *bond,
+					     const struct bond_opt_value *newval)
+{
+	netdev_info(bond->dev, "Setting ad_actor_sys_prio to (%llu)\n",
+		    newval->value);
+
+	bond->params.ad_actor_sys_prio = newval->value;
+	return 0;
+}
diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c
index b20b35acb47d..11369299e7e5 100644
--- a/drivers/net/bonding/bond_procfs.c
+++ b/drivers/net/bonding/bond_procfs.c
@@ -135,6 +135,8 @@ static void bond_info_show_master(struct seq_file *seq)
 					  bond->params.ad_select);
 		seq_printf(seq, "Aggregator selection policy (ad_select): %s\n",
 			   optval->string);
+		seq_printf(seq, "System priority: %d\n",
+			   BOND_AD_INFO(bond).system.sys_priority);
 
 		if (__bond_3ad_get_active_agg_info(bond, &ad_info)) {
 			seq_printf(seq, "bond %s has no active aggregator\n",
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 7e9e151d4d61..4a7626611ca6 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -692,6 +692,20 @@ static ssize_t bonding_show_packets_per_slave(struct device *d,
 static DEVICE_ATTR(packets_per_slave, S_IRUGO | S_IWUSR,
 		   bonding_show_packets_per_slave, bonding_sysfs_store_option);
 
+static ssize_t bonding_show_ad_actor_sys_prio(struct device *d,
+					      struct device_attribute *attr,
+					      char *buf)
+{
+	struct bonding *bond = to_bond(d);
+
+	if (BOND_MODE(bond) == BOND_MODE_8023AD)
+		return sprintf(buf, "%hu\n", bond->params.ad_actor_sys_prio);
+
+	return 0;
+}
+static DEVICE_ATTR(ad_actor_sys_prio, S_IRUGO | S_IWUSR,
+		   bonding_show_ad_actor_sys_prio, bonding_sysfs_store_option);
+
 static struct attribute *per_bond_attrs[] = {
 	&dev_attr_slaves.attr,
 	&dev_attr_mode.attr,
@@ -725,6 +739,7 @@ static struct attribute *per_bond_attrs[] = {
 	&dev_attr_lp_interval.attr,
 	&dev_attr_packets_per_slave.attr,
 	&dev_attr_tlb_dynamic_lb.attr,
+	&dev_attr_ad_actor_sys_prio.attr,
 	NULL,
 };
 
diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index ea6546d2c946..894002a2620f 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -63,6 +63,7 @@ enum {
 	BOND_OPT_LP_INTERVAL,
 	BOND_OPT_SLAVES,
 	BOND_OPT_TLB_DYNAMIC_LB,
+	BOND_OPT_AD_ACTOR_SYS_PRIO,
 	BOND_OPT_LAST
 };
 
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 78ed135e9dea..405cf87a450a 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -136,6 +136,7 @@ struct bond_params {
 	int packets_per_slave;
 	int tlb_dynamic_lb;
 	struct reciprocal_value reciprocal_packets_per_slave;
+	u16 ad_actor_sys_prio;
 };
 
 struct bond_parm_tbl {
-- 
cgit v1.2.3


From 74514957552edd4661a4608618121f3c71d4e891 Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Sat, 9 May 2015 00:01:56 -0700
Subject: bonding: Allow userspace to set actors' macaddr in an AD-system.

In an AD system, the communication between actor and partner is the
business between these two entities. In the current setup anyone on the
same L2 can "guess" the LACPDU contents and then possibly send the
spoofed LACPDUs and trick the partner causing connectivity issues for
the AD system. This patch allows to use a random mac-address obscuring
it's identity making it harder for someone in the L2 is do the same thing.

This patch allows user-space to choose the mac-address for the AD-system.
This mac-address can not be NULL or a Multicast. If the mac-address is set
from user-space; kernel will honor it and will not overwrite it. In the
absence (value from user space); the logic will default to using the
masters' mac as the mac-address for the AD-system.

It can be set using example code below -

   # modprobe bonding mode=4
   # sys_mac_addr=$(printf '%02x:%02x:%02x:%02x:%02x:%02x' \
                    $(( (RANDOM & 0xFE) | 0x02 )) \
                    $(( RANDOM & 0xFF )) \
                    $(( RANDOM & 0xFF )) \
                    $(( RANDOM & 0xFF )) \
                    $(( RANDOM & 0xFF )) \
                    $(( RANDOM & 0xFF )))
   # echo $sys_mac_addr > /sys/class/net/bond0/bonding/ad_actor_system
   # echo +eth1 > /sys/class/net/bond0/bonding/slaves
   ...
   # ip link set bond0 up

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@redhat.com>
[jt: fixed up style issues reported by checkpatch]
Signed-off-by: Jonathan Toppins <jtoppins@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/bonding.txt | 12 ++++++++++++
 drivers/net/bonding/bond_3ad.c       |  7 ++++++-
 drivers/net/bonding/bond_main.c      |  1 +
 drivers/net/bonding/bond_options.c   | 27 +++++++++++++++++++++++++++
 drivers/net/bonding/bond_procfs.c    |  6 ++++++
 drivers/net/bonding/bond_sysfs.c     | 16 ++++++++++++++++
 include/net/bond_options.h           |  1 +
 include/net/bonding.h                |  1 +
 8 files changed, 70 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index 34946115acec..2c197b68baf0 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -187,6 +187,18 @@ ad_actor_sys_prio
 	This parameter has effect only in 802.3ad mode and is available through
 	SysFs interface.
 
+ad_actor_system
+
+	In an AD system, this specifies the mac-address for the actor in
+	protocol packet exchanges (LACPDUs). The value cannot be NULL or
+	multicast. It is preferred to have the local-admin bit set for this
+	mac but driver does not enforce it. If the value is not given then
+	system defaults to using the masters' mac address as actors' system
+	address.
+
+	This parameter has effect only in 802.3ad mode and is available through
+	SysFs interface.
+
 ad_select
 
 	Specifies the 802.3ad aggregation selection logic to use.  The
diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 4c003bc87d4b..012f7bc22d91 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -1910,7 +1910,12 @@ void bond_3ad_initialize(struct bonding *bond, u16 tick_resolution)
 
 		BOND_AD_INFO(bond).system.sys_priority =
 			bond->params.ad_actor_sys_prio;
-		BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->dev->dev_addr);
+		if (is_zero_ether_addr(bond->params.ad_actor_system))
+			BOND_AD_INFO(bond).system.sys_mac_addr =
+			    *((struct mac_addr *)bond->dev->dev_addr);
+		else
+			BOND_AD_INFO(bond).system.sys_mac_addr =
+			    *((struct mac_addr *)bond->params.ad_actor_system);
 
 		/* initialize how many times this module is called in one
 		 * second (should be about every 100ms)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 5f2f28f0e927..a4e2f27ef683 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4474,6 +4474,7 @@ static int bond_check_params(struct bond_params *params)
 	params->packets_per_slave = packets_per_slave;
 	params->tlb_dynamic_lb = 1; /* Default value */
 	params->ad_actor_sys_prio = ad_actor_sys_prio;
+	eth_zero_addr(params->ad_actor_system);
 	if (packets_per_slave > 0) {
 		params->reciprocal_packets_per_slave =
 			reciprocal_value(packets_per_slave);
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index d2b47e5e99f7..cdcef217ac84 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -72,6 +72,8 @@ static int bond_option_tlb_dynamic_lb_set(struct bonding *bond,
 				  const struct bond_opt_value *newval);
 static int bond_option_ad_actor_sys_prio_set(struct bonding *bond,
 					     const struct bond_opt_value *newval);
+static int bond_option_ad_actor_system_set(struct bonding *bond,
+					   const struct bond_opt_value *newval);
 
 
 static const struct bond_opt_value bond_mode_tbl[] = {
@@ -396,6 +398,13 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 		.values = bond_ad_actor_sys_prio_tbl,
 		.set = bond_option_ad_actor_sys_prio_set,
 	},
+	[BOND_OPT_AD_ACTOR_SYSTEM] = {
+		.id = BOND_OPT_AD_ACTOR_SYSTEM,
+		.name = "ad_actor_system",
+		.unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)),
+		.flags = BOND_OPTFLAG_RAWVAL | BOND_OPTFLAG_IFDOWN,
+		.set = bond_option_ad_actor_system_set,
+	},
 };
 
 /* Searches for an option by name */
@@ -1375,3 +1384,21 @@ static int bond_option_ad_actor_sys_prio_set(struct bonding *bond,
 	bond->params.ad_actor_sys_prio = newval->value;
 	return 0;
 }
+
+static int bond_option_ad_actor_system_set(struct bonding *bond,
+					   const struct bond_opt_value *newval)
+{
+	u8 macaddr[ETH_ALEN];
+	int i;
+
+	i = sscanf(newval->string, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
+		   &macaddr[0], &macaddr[1], &macaddr[2],
+		   &macaddr[3], &macaddr[4], &macaddr[5]);
+	if (i != ETH_ALEN || !is_valid_ether_addr(macaddr)) {
+		netdev_err(bond->dev, "Invalid MAC address.\n");
+		return -EINVAL;
+	}
+
+	ether_addr_copy(bond->params.ad_actor_system, macaddr);
+	return 0;
+}
diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c
index 11369299e7e5..e7f3047a26df 100644
--- a/drivers/net/bonding/bond_procfs.c
+++ b/drivers/net/bonding/bond_procfs.c
@@ -137,6 +137,8 @@ static void bond_info_show_master(struct seq_file *seq)
 			   optval->string);
 		seq_printf(seq, "System priority: %d\n",
 			   BOND_AD_INFO(bond).system.sys_priority);
+		seq_printf(seq, "System MAC address: %pM\n",
+			   &BOND_AD_INFO(bond).system.sys_mac_addr);
 
 		if (__bond_3ad_get_active_agg_info(bond, &ad_info)) {
 			seq_printf(seq, "bond %s has no active aggregator\n",
@@ -200,6 +202,8 @@ static void bond_info_show_slave(struct seq_file *seq,
 			seq_puts(seq, "details actor lacp pdu:\n");
 			seq_printf(seq, "    system priority: %d\n",
 				   port->actor_system_priority);
+			seq_printf(seq, "    system mac address: %pM\n",
+				   &port->actor_system);
 			seq_printf(seq, "    port key: %d\n",
 				   port->actor_oper_port_key);
 			seq_printf(seq, "    port priority: %d\n",
@@ -212,6 +216,8 @@ static void bond_info_show_slave(struct seq_file *seq,
 			seq_puts(seq, "details partner lacp pdu:\n");
 			seq_printf(seq, "    system priority: %d\n",
 				   port->partner_oper.system_priority);
+			seq_printf(seq, "    system mac address: %pM\n",
+				   &port->partner_oper.system);
 			seq_printf(seq, "    oper key: %d\n",
 				   port->partner_oper.key);
 			seq_printf(seq, "    port priority: %d\n",
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 4a7626611ca6..09fefa50d055 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -706,6 +706,21 @@ static ssize_t bonding_show_ad_actor_sys_prio(struct device *d,
 static DEVICE_ATTR(ad_actor_sys_prio, S_IRUGO | S_IWUSR,
 		   bonding_show_ad_actor_sys_prio, bonding_sysfs_store_option);
 
+static ssize_t bonding_show_ad_actor_system(struct device *d,
+					    struct device_attribute *attr,
+					    char *buf)
+{
+	struct bonding *bond = to_bond(d);
+
+	if (BOND_MODE(bond) == BOND_MODE_8023AD)
+		return sprintf(buf, "%pM\n", bond->params.ad_actor_system);
+
+	return 0;
+}
+
+static DEVICE_ATTR(ad_actor_system, S_IRUGO | S_IWUSR,
+		   bonding_show_ad_actor_system, bonding_sysfs_store_option);
+
 static struct attribute *per_bond_attrs[] = {
 	&dev_attr_slaves.attr,
 	&dev_attr_mode.attr,
@@ -740,6 +755,7 @@ static struct attribute *per_bond_attrs[] = {
 	&dev_attr_packets_per_slave.attr,
 	&dev_attr_tlb_dynamic_lb.attr,
 	&dev_attr_ad_actor_sys_prio.attr,
+	&dev_attr_ad_actor_system.attr,
 	NULL,
 };
 
diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index 894002a2620f..eeeefa1d3cd8 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -64,6 +64,7 @@ enum {
 	BOND_OPT_SLAVES,
 	BOND_OPT_TLB_DYNAMIC_LB,
 	BOND_OPT_AD_ACTOR_SYS_PRIO,
+	BOND_OPT_AD_ACTOR_SYSTEM,
 	BOND_OPT_LAST
 };
 
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 405cf87a450a..650f38693956 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -137,6 +137,7 @@ struct bond_params {
 	int tlb_dynamic_lb;
 	struct reciprocal_value reciprocal_packets_per_slave;
 	u16 ad_actor_sys_prio;
+	u8 ad_actor_system[ETH_ALEN];
 };
 
 struct bond_parm_tbl {
-- 
cgit v1.2.3


From d22a5fc0c32edcf5c3bb973ee8c9a2606ba500a8 Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Sat, 9 May 2015 00:01:57 -0700
Subject: bonding: Implement user key part of port_key in an AD system.

The port key has three components - user-key, speed-part, and duplex-part.
The LSBit is for the duplex-part, next 5 bits are for the speed while the
remaining 10 bits are the user defined key bits. Get these 10 bits
from the user-space (through the SysFs interface) and use it to form the
admin port-key. Allowed range for the user-key is 0 - 1023 (10 bits). If
it is not provided then use zero for the user-key-bits (default).

It can set using following example code -

   # modprobe bonding mode=4
   # usr_port_key=$(( RANDOM & 0x3FF ))
   # echo $usr_port_key > /sys/class/net/bond0/bonding/ad_user_port_key
   # echo +eth1 > /sys/class/net/bond0/bonding/slaves
   ...
   # ip link set bond0 up

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@redhat.com>
[jt: * fixed up style issues reported by checkpatch
     * fixed up context from change in ad_actor_sys_prio patch]
Signed-off-by: Jonathan Toppins <jtoppins@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/bonding.txt | 63 ++++++++++++++++++++++++++++++++++++
 drivers/net/bonding/bond_3ad.c       | 14 ++++----
 drivers/net/bonding/bond_main.c      | 10 ++++++
 drivers/net/bonding/bond_options.c   | 26 +++++++++++++++
 drivers/net/bonding/bond_sysfs.c     | 15 +++++++++
 include/net/bond_options.h           |  1 +
 include/net/bonding.h                |  1 +
 7 files changed, 123 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index 2c197b68baf0..334b49ef02d1 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -51,6 +51,7 @@ Table of Contents
 3.4	Configuring Bonding Manually via Sysfs
 3.5	Configuration with Interfaces Support
 3.6	Overriding Configuration for Special Cases
+3.7 Configuring LACP for 802.3ad mode in a more secure way
 
 4. Querying Bonding Configuration
 4.1	Bonding Configuration
@@ -241,6 +242,21 @@ ad_select
 
 	This option was added in bonding version 3.4.0.
 
+ad_user_port_key
+
+	In an AD system, the port-key has three parts as shown below -
+
+	   Bits   Use
+	   00     Duplex
+	   01-05  Speed
+	   06-15  User-defined
+
+	This defines the upper 10 bits of the port key. The values can be
+	from 0 - 1023. If not given, the system defaults to 0.
+
+	This parameter has effect only in 802.3ad mode and is available through
+	SysFs interface.
+
 all_slaves_active
 
 	Specifies that duplicate frames (received on inactive ports) should be
@@ -1643,6 +1659,53 @@ output port selection.
 This feature first appeared in bonding driver version 3.7.0 and support for
 output slave selection was limited to round-robin and active-backup modes.
 
+3.7 Configuring LACP for 802.3ad mode in a more secure way
+----------------------------------------------------------
+
+When using 802.3ad bonding mode, the Actor (host) and Partner (switch)
+exchange LACPDUs.  These LACPDUs cannot be sniffed, because they are
+destined to link local mac addresses (which switches/bridges are not
+supposed to forward).  However, most of the values are easily predictable
+or are simply the machine's MAC address (which is trivially known to all
+other hosts in the same L2).  This implies that other machines in the L2
+domain can spoof LACPDU packets from other hosts to the switch and potentially
+cause mayhem by joining (from the point of view of the switch) another
+machine's aggregate, thus receiving a portion of that hosts incoming
+traffic and / or spoofing traffic from that machine themselves (potentially
+even successfully terminating some portion of flows). Though this is not
+a likely scenario, one could avoid this possibility by simply configuring
+few bonding parameters:
+
+   (a) ad_actor_system : You can set a random mac-address that can be used for
+       these LACPDU exchanges. The value can not be either NULL or Multicast.
+       Also it's preferable to set the local-admin bit. Following shell code
+       generates a random mac-address as described above.
+
+       # sys_mac_addr=$(printf '%02x:%02x:%02x:%02x:%02x:%02x' \
+                                $(( (RANDOM & 0xFE) | 0x02 )) \
+                                $(( RANDOM & 0xFF )) \
+                                $(( RANDOM & 0xFF )) \
+                                $(( RANDOM & 0xFF )) \
+                                $(( RANDOM & 0xFF )) \
+                                $(( RANDOM & 0xFF )))
+       # echo $sys_mac_addr > /sys/class/net/bond0/bonding/ad_actor_system
+
+   (b) ad_actor_sys_prio : Randomize the system priority. The default value
+       is 65535, but system can take the value from 1 - 65535. Following shell
+       code generates random priority and sets it.
+
+       # sys_prio=$(( 1 + RANDOM + RANDOM ))
+       # echo $sys_prio > /sys/class/net/bond0/bonding/ad_actor_sys_prio
+
+   (c) ad_user_port_key : Use the user portion of the port-key. The default
+       keeps this empty. These are the upper 10 bits of the port-key and value
+       ranges from 0 - 1023. Following shell code generates these 10 bits and
+       sets it.
+
+       # usr_port_key=$(( RANDOM & 0x3FF ))
+       # echo $usr_port_key > /sys/class/net/bond0/bonding/ad_user_port_key
+
+
 4 Querying Bonding Configuration
 =================================
 
diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 012f7bc22d91..7fde4d5c2b28 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -75,10 +75,10 @@
 /* Port Key definitions
  * key is determined according to the link speed, duplex and
  * user key (which is yet not supported)
- * --------------------------------------------------------------
- * Port key :	| User key	| Speed		| Duplex	|
- * --------------------------------------------------------------
- * 16		  6		  1		  0
+ *           --------------------------------------------------------------
+ * Port key  | User key (10 bits)           | Speed (5 bits)      | Duplex|
+ *           --------------------------------------------------------------
+ *           |15                           6|5                   1|0
  */
 #define  AD_DUPLEX_KEY_MASKS    0x1
 #define  AD_SPEED_KEY_MASKS     0x3E
@@ -1951,10 +1951,10 @@ void bond_3ad_bind_slave(struct slave *slave)
 
 		port->slave = slave;
 		port->actor_port_number = SLAVE_AD_INFO(slave)->id;
-		/* key is determined according to the link speed, duplex and user key(which
-		 * is yet not supported)
+		/* key is determined according to the link speed, duplex and
+		 * user key
 		 */
-		port->actor_admin_port_key = 0;
+		port->actor_admin_port_key = bond->params.ad_user_port_key << 6;
 		port->actor_admin_port_key |= __get_duplex(port);
 		port->actor_admin_port_key |= (__get_link_speed(port) << 1);
 		port->actor_oper_port_key = port->actor_admin_port_key;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index a4e2f27ef683..2ee13be7551b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4141,6 +4141,7 @@ static int bond_check_params(struct bond_params *params)
 	const struct bond_opt_value *valptr;
 	int arp_all_targets_value;
 	u16 ad_actor_sys_prio = 0;
+	u16 ad_user_port_key = 0;
 
 	/* Convert string parameters. */
 	if (mode) {
@@ -4445,6 +4446,14 @@ static int bond_check_params(struct bond_params *params)
 	}
 	ad_actor_sys_prio = valptr->value;
 
+	valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_USER_PORT_KEY),
+				&newval);
+	if (!valptr) {
+		pr_err("Error: No ad_user_port_key default value");
+		return -EINVAL;
+	}
+	ad_user_port_key = valptr->value;
+
 	if (lp_interval == 0) {
 		pr_warn("Warning: ip_interval must be between 1 and %d, so it was reset to %d\n",
 			INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL);
@@ -4475,6 +4484,7 @@ static int bond_check_params(struct bond_params *params)
 	params->tlb_dynamic_lb = 1; /* Default value */
 	params->ad_actor_sys_prio = ad_actor_sys_prio;
 	eth_zero_addr(params->ad_actor_system);
+	params->ad_user_port_key = ad_user_port_key;
 	if (packets_per_slave > 0) {
 		params->reciprocal_packets_per_slave =
 			reciprocal_value(packets_per_slave);
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index cdcef217ac84..c85da05721e6 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -74,6 +74,8 @@ static int bond_option_ad_actor_sys_prio_set(struct bonding *bond,
 					     const struct bond_opt_value *newval);
 static int bond_option_ad_actor_system_set(struct bonding *bond,
 					   const struct bond_opt_value *newval);
+static int bond_option_ad_user_port_key_set(struct bonding *bond,
+					    const struct bond_opt_value *newval);
 
 
 static const struct bond_opt_value bond_mode_tbl[] = {
@@ -196,6 +198,12 @@ static const struct bond_opt_value bond_ad_actor_sys_prio_tbl[] = {
 	{ NULL,      -1,    0},
 };
 
+static const struct bond_opt_value bond_ad_user_port_key_tbl[] = {
+	{ "minval",  0,     BOND_VALFLAG_MIN | BOND_VALFLAG_DEFAULT},
+	{ "maxval",  1023,  BOND_VALFLAG_MAX},
+	{ NULL,      -1,    0},
+};
+
 static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 	[BOND_OPT_MODE] = {
 		.id = BOND_OPT_MODE,
@@ -405,6 +413,14 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 		.flags = BOND_OPTFLAG_RAWVAL | BOND_OPTFLAG_IFDOWN,
 		.set = bond_option_ad_actor_system_set,
 	},
+	[BOND_OPT_AD_USER_PORT_KEY] = {
+		.id = BOND_OPT_AD_USER_PORT_KEY,
+		.name = "ad_user_port_key",
+		.unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)),
+		.flags = BOND_OPTFLAG_IFDOWN,
+		.values = bond_ad_user_port_key_tbl,
+		.set = bond_option_ad_user_port_key_set,
+	}
 };
 
 /* Searches for an option by name */
@@ -1402,3 +1418,13 @@ static int bond_option_ad_actor_system_set(struct bonding *bond,
 	ether_addr_copy(bond->params.ad_actor_system, macaddr);
 	return 0;
 }
+
+static int bond_option_ad_user_port_key_set(struct bonding *bond,
+					    const struct bond_opt_value *newval)
+{
+	netdev_info(bond->dev, "Setting ad_user_port_key to (%llu)\n",
+		    newval->value);
+
+	bond->params.ad_user_port_key = newval->value;
+	return 0;
+}
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 09fefa50d055..143a2abd1c1c 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -721,6 +721,20 @@ static ssize_t bonding_show_ad_actor_system(struct device *d,
 static DEVICE_ATTR(ad_actor_system, S_IRUGO | S_IWUSR,
 		   bonding_show_ad_actor_system, bonding_sysfs_store_option);
 
+static ssize_t bonding_show_ad_user_port_key(struct device *d,
+					     struct device_attribute *attr,
+					     char *buf)
+{
+	struct bonding *bond = to_bond(d);
+
+	if (BOND_MODE(bond) == BOND_MODE_8023AD)
+		return sprintf(buf, "%hu\n", bond->params.ad_user_port_key);
+
+	return 0;
+}
+static DEVICE_ATTR(ad_user_port_key, S_IRUGO | S_IWUSR,
+		   bonding_show_ad_user_port_key, bonding_sysfs_store_option);
+
 static struct attribute *per_bond_attrs[] = {
 	&dev_attr_slaves.attr,
 	&dev_attr_mode.attr,
@@ -756,6 +770,7 @@ static struct attribute *per_bond_attrs[] = {
 	&dev_attr_tlb_dynamic_lb.attr,
 	&dev_attr_ad_actor_sys_prio.attr,
 	&dev_attr_ad_actor_system.attr,
+	&dev_attr_ad_user_port_key.attr,
 	NULL,
 };
 
diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index eeeefa1d3cd8..c28aca25320e 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -65,6 +65,7 @@ enum {
 	BOND_OPT_TLB_DYNAMIC_LB,
 	BOND_OPT_AD_ACTOR_SYS_PRIO,
 	BOND_OPT_AD_ACTOR_SYSTEM,
+	BOND_OPT_AD_USER_PORT_KEY,
 	BOND_OPT_LAST
 };
 
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 650f38693956..20defc0353d1 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -137,6 +137,7 @@ struct bond_params {
 	int tlb_dynamic_lb;
 	struct reciprocal_value reciprocal_packets_per_slave;
 	u16 ad_actor_sys_prio;
+	u16 ad_user_port_key;
 	u8 ad_actor_system[ETH_ALEN];
 };
 
-- 
cgit v1.2.3


From b396cca6fafccf16206a5d041d59c9e6b65b6f5a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 11 May 2015 09:06:56 -0700
Subject: net: sched: deprecate enqueue_root()

Only left enqueue_root() user is netem, and it looks not necessary :

qdisc_skb_cb(skb)->pkt_len is preserved after one skb_clone()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 6 ------
 net/sched/sch_netem.c     | 4 ++--
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 994b5a092f33..1b0a2e88ed2b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -501,12 +501,6 @@ static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	return sch->enqueue(skb, sch);
 }
 
-static inline int qdisc_enqueue_root(struct sk_buff *skb, struct Qdisc *sch)
-{
-	qdisc_skb_cb(skb)->pkt_len = skb->len;
-	return qdisc_enqueue(skb, sch) & NET_XMIT_MASK;
-}
-
 static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
 {
 	return q->flags & TCQ_F_CPUSTATS;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 956ead2cab9a..5abd1d9de989 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -440,9 +440,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
 		struct Qdisc *rootq = qdisc_root(sch);
 		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
-		q->duplicate = 0;
 
-		qdisc_enqueue_root(skb2, rootq);
+		q->duplicate = 0;
+		rootq->enqueue(skb2, rootq);
 		q->duplicate = dupsave;
 	}
 
-- 
cgit v1.2.3


From ebb9a03a590e2325f747be43c8db450e92509501 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Sun, 10 May 2015 09:47:46 -0700
Subject: switchdev: s/netdev_switch_/switchdev_/ and
 s/NETDEV_SWITCH_/SWITCHDEV_/

Turned out that "switchdev" sticks. So just unify all related terms to use
this prefix.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Acked-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c      |   4 +-
 drivers/net/ethernet/rocker/rocker.c |  10 +--
 drivers/net/team/team.c              |   4 +-
 include/net/switchdev.h              | 111 +++++++++++++++--------------
 net/bridge/br.c                      |  22 +++---
 net/bridge/br_netlink.c              |   6 +-
 net/bridge/br_stp.c                  |   2 +-
 net/core/net-sysfs.c                 |   2 +-
 net/core/rtnetlink.c                 |   2 +-
 net/ipv4/fib_trie.c                  |  40 +++++------
 net/switchdev/switchdev.c            | 134 +++++++++++++++++------------------
 11 files changed, 166 insertions(+), 171 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 2ee13be7551b..a475a0389cc7 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4039,8 +4039,8 @@ static const struct net_device_ops bond_netdev_ops = {
 	.ndo_add_slave		= bond_enslave,
 	.ndo_del_slave		= bond_release,
 	.ndo_fix_features	= bond_fix_features,
-	.ndo_bridge_setlink	= ndo_dflt_netdev_switch_port_bridge_setlink,
-	.ndo_bridge_dellink	= ndo_dflt_netdev_switch_port_bridge_dellink,
+	.ndo_bridge_setlink	= ndo_dflt_switchdev_port_bridge_setlink,
+	.ndo_bridge_dellink	= ndo_dflt_switchdev_port_bridge_dellink,
 	.ndo_features_check	= passthru_features_check,
 };
 
diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 0c6f0a8b42dd..418f62b19545 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -3377,17 +3377,17 @@ static void rocker_port_fdb_learn_work(struct work_struct *work)
 		container_of(work, struct rocker_fdb_learn_work, work);
 	bool removing = (lw->flags & ROCKER_OP_FLAG_REMOVE);
 	bool learned = (lw->flags & ROCKER_OP_FLAG_LEARNED);
-	struct netdev_switch_notifier_fdb_info info;
+	struct switchdev_notifier_fdb_info info;
 
 	info.addr = lw->addr;
 	info.vid = lw->vid;
 
 	if (learned && removing)
-		call_netdev_switch_notifiers(NETDEV_SWITCH_FDB_DEL,
-					     lw->dev, &info.info);
+		call_switchdev_notifiers(SWITCHDEV_FDB_DEL,
+					 lw->dev, &info.info);
 	else if (learned && !removing)
-		call_netdev_switch_notifiers(NETDEV_SWITCH_FDB_ADD,
-					     lw->dev, &info.info);
+		call_switchdev_notifiers(SWITCHDEV_FDB_ADD,
+					 lw->dev, &info.info);
 
 	kfree(work);
 }
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 6928448f6b7f..cfe84965afb6 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1977,8 +1977,8 @@ static const struct net_device_ops team_netdev_ops = {
 	.ndo_del_slave		= team_del_slave,
 	.ndo_fix_features	= team_fix_features,
 	.ndo_change_carrier     = team_change_carrier,
-	.ndo_bridge_setlink     = ndo_dflt_netdev_switch_port_bridge_setlink,
-	.ndo_bridge_dellink     = ndo_dflt_netdev_switch_port_bridge_dellink,
+	.ndo_bridge_setlink     = ndo_dflt_switchdev_port_bridge_setlink,
+	.ndo_bridge_dellink     = ndo_dflt_switchdev_port_bridge_dellink,
 	.ndo_features_check	= passthru_features_check,
 };
 
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index d2e69ee3019a..cd921fa0d63a 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -43,124 +43,125 @@ struct swdev_ops {
 				      u8 tos, u8 type, u32 tb_id);
 };
 
-enum netdev_switch_notifier_type {
-	NETDEV_SWITCH_FDB_ADD = 1,
-	NETDEV_SWITCH_FDB_DEL,
+enum switchdev_notifier_type {
+	SWITCHDEV_FDB_ADD = 1,
+	SWITCHDEV_FDB_DEL,
 };
 
-struct netdev_switch_notifier_info {
+struct switchdev_notifier_info {
 	struct net_device *dev;
 };
 
-struct netdev_switch_notifier_fdb_info {
-	struct netdev_switch_notifier_info info; /* must be first */
+struct switchdev_notifier_fdb_info {
+	struct switchdev_notifier_info info; /* must be first */
 	const unsigned char *addr;
 	u16 vid;
 };
 
 static inline struct net_device *
-netdev_switch_notifier_info_to_dev(const struct netdev_switch_notifier_info *info)
+switchdev_notifier_info_to_dev(const struct switchdev_notifier_info *info)
 {
 	return info->dev;
 }
 
 #ifdef CONFIG_NET_SWITCHDEV
 
-int netdev_switch_parent_id_get(struct net_device *dev,
-				struct netdev_phys_item_id *psid);
-int netdev_switch_port_stp_update(struct net_device *dev, u8 state);
-int register_netdev_switch_notifier(struct notifier_block *nb);
-int unregister_netdev_switch_notifier(struct notifier_block *nb);
-int call_netdev_switch_notifiers(unsigned long val, struct net_device *dev,
-				 struct netdev_switch_notifier_info *info);
-int netdev_switch_port_bridge_setlink(struct net_device *dev,
-				struct nlmsghdr *nlh, u16 flags);
-int netdev_switch_port_bridge_dellink(struct net_device *dev,
-				struct nlmsghdr *nlh, u16 flags);
-int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev,
-					       struct nlmsghdr *nlh, u16 flags);
-int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev,
-					       struct nlmsghdr *nlh, u16 flags);
-int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
-			       u8 tos, u8 type, u32 nlflags, u32 tb_id);
-int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
-			       u8 tos, u8 type, u32 tb_id);
-void netdev_switch_fib_ipv4_abort(struct fib_info *fi);
+int switchdev_parent_id_get(struct net_device *dev,
+			    struct netdev_phys_item_id *psid);
+int switchdev_port_stp_update(struct net_device *dev, u8 state);
+int register_switchdev_notifier(struct notifier_block *nb);
+int unregister_switchdev_notifier(struct notifier_block *nb);
+int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
+			     struct switchdev_notifier_info *info);
+int switchdev_port_bridge_setlink(struct net_device *dev,
+				  struct nlmsghdr *nlh, u16 flags);
+int switchdev_port_bridge_dellink(struct net_device *dev,
+				  struct nlmsghdr *nlh, u16 flags);
+int ndo_dflt_switchdev_port_bridge_dellink(struct net_device *dev,
+					   struct nlmsghdr *nlh, u16 flags);
+int ndo_dflt_switchdev_port_bridge_setlink(struct net_device *dev,
+					   struct nlmsghdr *nlh, u16 flags);
+int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
+			   u8 tos, u8 type, u32 nlflags, u32 tb_id);
+int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
+			   u8 tos, u8 type, u32 tb_id);
+void switchdev_fib_ipv4_abort(struct fib_info *fi);
 
 #else
 
-static inline int netdev_switch_parent_id_get(struct net_device *dev,
-					      struct netdev_phys_item_id *psid)
+static inline int switchdev_parent_id_get(struct net_device *dev,
+					  struct netdev_phys_item_id *psid)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline int netdev_switch_port_stp_update(struct net_device *dev,
-						u8 state)
+static inline int switchdev_port_stp_update(struct net_device *dev,
+					    u8 state)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline int register_netdev_switch_notifier(struct notifier_block *nb)
+static inline int register_switchdev_notifier(struct notifier_block *nb)
 {
 	return 0;
 }
 
-static inline int unregister_netdev_switch_notifier(struct notifier_block *nb)
+static inline int unregister_switchdev_notifier(struct notifier_block *nb)
 {
 	return 0;
 }
 
-static inline int call_netdev_switch_notifiers(unsigned long val, struct net_device *dev,
-					       struct netdev_switch_notifier_info *info)
+static inline int call_switchdev_notifiers(unsigned long val,
+					   struct net_device *dev,
+					   struct switchdev_notifier_info *info)
 {
 	return NOTIFY_DONE;
 }
 
-static inline int netdev_switch_port_bridge_setlink(struct net_device *dev,
-						    struct nlmsghdr *nlh,
-						    u16 flags)
+static inline int switchdev_port_bridge_setlink(struct net_device *dev,
+						struct nlmsghdr *nlh,
+						u16 flags)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline int netdev_switch_port_bridge_dellink(struct net_device *dev,
-						    struct nlmsghdr *nlh,
-						    u16 flags)
+static inline int switchdev_port_bridge_dellink(struct net_device *dev,
+						struct nlmsghdr *nlh,
+						u16 flags)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev,
-							struct nlmsghdr *nlh,
-							u16 flags)
+static inline int ndo_dflt_switchdev_port_bridge_dellink(struct net_device *dev,
+							 struct nlmsghdr *nlh,
+							 u16 flags)
 {
 	return 0;
 }
 
-static inline int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev,
-							struct nlmsghdr *nlh,
-							u16 flags)
+static inline int ndo_dflt_switchdev_port_bridge_setlink(struct net_device *dev,
+							 struct nlmsghdr *nlh,
+							 u16 flags)
 {
 	return 0;
 }
 
-static inline int netdev_switch_fib_ipv4_add(u32 dst, int dst_len,
-					     struct fib_info *fi,
-					     u8 tos, u8 type,
-					     u32 nlflags, u32 tb_id)
+static inline int switchdev_fib_ipv4_add(u32 dst, int dst_len,
+					 struct fib_info *fi,
+					 u8 tos, u8 type,
+					 u32 nlflags, u32 tb_id)
 {
 	return 0;
 }
 
-static inline int netdev_switch_fib_ipv4_del(u32 dst, int dst_len,
-					     struct fib_info *fi,
-					     u8 tos, u8 type, u32 tb_id)
+static inline int switchdev_fib_ipv4_del(u32 dst, int dst_len,
+					 struct fib_info *fi,
+					 u8 tos, u8 type, u32 tb_id)
 {
 	return 0;
 }
 
-static inline void netdev_switch_fib_ipv4_abort(struct fib_info *fi)
+static inline void switchdev_fib_ipv4_abort(struct fib_info *fi)
 {
 }
 
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 02c24cf63c34..a1abe4936fe1 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -121,13 +121,13 @@ static struct notifier_block br_device_notifier = {
 	.notifier_call = br_device_event
 };
 
-static int br_netdev_switch_event(struct notifier_block *unused,
-				  unsigned long event, void *ptr)
+static int br_switchdev_event(struct notifier_block *unused,
+			      unsigned long event, void *ptr)
 {
-	struct net_device *dev = netdev_switch_notifier_info_to_dev(ptr);
+	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
 	struct net_bridge_port *p;
 	struct net_bridge *br;
-	struct netdev_switch_notifier_fdb_info *fdb_info;
+	struct switchdev_notifier_fdb_info *fdb_info;
 	int err = NOTIFY_DONE;
 
 	rtnl_lock();
@@ -138,14 +138,14 @@ static int br_netdev_switch_event(struct notifier_block *unused,
 	br = p->br;
 
 	switch (event) {
-	case NETDEV_SWITCH_FDB_ADD:
+	case SWITCHDEV_FDB_ADD:
 		fdb_info = ptr;
 		err = br_fdb_external_learn_add(br, p, fdb_info->addr,
 						fdb_info->vid);
 		if (err)
 			err = notifier_from_errno(err);
 		break;
-	case NETDEV_SWITCH_FDB_DEL:
+	case SWITCHDEV_FDB_DEL:
 		fdb_info = ptr;
 		err = br_fdb_external_learn_del(br, p, fdb_info->addr,
 						fdb_info->vid);
@@ -159,8 +159,8 @@ out:
 	return err;
 }
 
-static struct notifier_block br_netdev_switch_notifier = {
-	.notifier_call = br_netdev_switch_event,
+static struct notifier_block br_switchdev_notifier = {
+	.notifier_call = br_switchdev_event,
 };
 
 static void __net_exit br_net_exit(struct net *net)
@@ -214,7 +214,7 @@ static int __init br_init(void)
 	if (err)
 		goto err_out3;
 
-	err = register_netdev_switch_notifier(&br_netdev_switch_notifier);
+	err = register_switchdev_notifier(&br_switchdev_notifier);
 	if (err)
 		goto err_out4;
 
@@ -235,7 +235,7 @@ static int __init br_init(void)
 	return 0;
 
 err_out5:
-	unregister_netdev_switch_notifier(&br_netdev_switch_notifier);
+	unregister_switchdev_notifier(&br_switchdev_notifier);
 err_out4:
 	unregister_netdevice_notifier(&br_device_notifier);
 err_out3:
@@ -253,7 +253,7 @@ static void __exit br_deinit(void)
 {
 	stp_proto_unregister(&br_stp_proto);
 	br_netlink_fini();
-	unregister_netdev_switch_notifier(&br_netdev_switch_notifier);
+	unregister_switchdev_notifier(&br_switchdev_notifier);
 	unregister_netdevice_notifier(&br_device_notifier);
 	brioctl_set(NULL);
 	unregister_pernet_subsys(&br_net_ops);
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 4b5c236998ff..dc234533e204 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -631,8 +631,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
 	if (p && !(flags & BRIDGE_FLAGS_SELF)) {
 		/* set bridge attributes in hardware if supported
 		 */
-		ret_offload = netdev_switch_port_bridge_setlink(dev, nlh,
-								flags);
+		ret_offload = switchdev_port_bridge_setlink(dev, nlh, flags);
 		if (ret_offload && ret_offload != -EOPNOTSUPP)
 			br_warn(p->br, "error setting attrs on port %u(%s)\n",
 				(unsigned int)p->port_no, p->dev->name);
@@ -671,8 +670,7 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
 	if (p && !(flags & BRIDGE_FLAGS_SELF)) {
 		/* del bridge attributes in hardware
 		 */
-		ret_offload = netdev_switch_port_bridge_dellink(dev, nlh,
-								flags);
+		ret_offload = switchdev_port_bridge_dellink(dev, nlh, flags);
 		if (ret_offload && ret_offload != -EOPNOTSUPP)
 			br_warn(p->br, "error deleting attrs on port %u (%s)\n",
 				(unsigned int)p->port_no, p->dev->name);
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index fb3ebe615513..28e3f4bc01e0 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -42,7 +42,7 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
 	int err;
 
 	p->state = state;
-	err = netdev_switch_port_stp_update(p->dev, state);
+	err = switchdev_port_stp_update(p->dev, state);
 	if (err && err != -EOPNOTSUPP)
 		br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
 				(unsigned int) p->port_no, p->dev->name);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 4238d6da5c60..be86a7ce9282 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -460,7 +460,7 @@ static ssize_t phys_switch_id_show(struct device *dev,
 	if (dev_isalive(netdev)) {
 		struct netdev_phys_item_id ppid;
 
-		ret = netdev_switch_parent_id_get(netdev, &ppid);
+		ret = switchdev_parent_id_get(netdev, &ppid);
 		if (!ret)
 			ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
 	}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 83e08323fdcd..fcd41fcd7e70 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1006,7 +1006,7 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
 	int err;
 	struct netdev_phys_item_id psid;
 
-	err = netdev_switch_parent_id_get(dev, &psid);
+	err = switchdev_parent_id_get(dev, &psid);
 	if (err) {
 		if (err == -EOPNOTSUPP)
 			return 0;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index e13fcc602da2..03444c6ae342 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1165,13 +1165,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 			new_fa->fa_state = state & ~FA_S_ACCESSED;
 			new_fa->fa_slen = fa->fa_slen;
 
-			err = netdev_switch_fib_ipv4_add(key, plen, fi,
-							 new_fa->fa_tos,
-							 cfg->fc_type,
-							 cfg->fc_nlflags,
-							 tb->tb_id);
+			err = switchdev_fib_ipv4_add(key, plen, fi,
+						     new_fa->fa_tos,
+						     cfg->fc_type,
+						     cfg->fc_nlflags,
+						     tb->tb_id);
 			if (err) {
-				netdev_switch_fib_ipv4_abort(fi);
+				switchdev_fib_ipv4_abort(fi);
 				kmem_cache_free(fn_alias_kmem, new_fa);
 				goto out;
 			}
@@ -1215,12 +1215,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 	new_fa->tb_id = tb->tb_id;
 
 	/* (Optionally) offload fib entry to switch hardware. */
-	err = netdev_switch_fib_ipv4_add(key, plen, fi, tos,
-					 cfg->fc_type,
-					 cfg->fc_nlflags,
-					 tb->tb_id);
+	err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type,
+				     cfg->fc_nlflags, tb->tb_id);
 	if (err) {
-		netdev_switch_fib_ipv4_abort(fi);
+		switchdev_fib_ipv4_abort(fi);
 		goto out_free_new_fa;
 	}
 
@@ -1239,7 +1237,7 @@ succeeded:
 	return 0;
 
 out_sw_fib_del:
-	netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
+	switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
 out_free_new_fa:
 	kmem_cache_free(fn_alias_kmem, new_fa);
 out:
@@ -1517,8 +1515,8 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 	if (!fa_to_delete)
 		return -ESRCH;
 
-	netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
-				   cfg->fc_type, tb->tb_id);
+	switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
+			       cfg->fc_type, tb->tb_id);
 
 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
@@ -1767,10 +1765,9 @@ void fib_table_flush_external(struct fib_table *tb)
 			if (!fi || !(fi->fib_flags & RTNH_F_EXTERNAL))
 				continue;
 
-			netdev_switch_fib_ipv4_del(n->key,
-						   KEYLENGTH - fa->fa_slen,
-						   fi, fa->fa_tos,
-						   fa->fa_type, tb->tb_id);
+			switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
+					       fi, fa->fa_tos, fa->fa_type,
+					       tb->tb_id);
 		}
 
 		/* update leaf slen */
@@ -1835,10 +1832,9 @@ int fib_table_flush(struct fib_table *tb)
 				continue;
 			}
 
-			netdev_switch_fib_ipv4_del(n->key,
-						   KEYLENGTH - fa->fa_slen,
-						   fi, fa->fa_tos,
-						   fa->fa_type, tb->tb_id);
+			switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
+					       fi, fa->fa_tos, fa->fa_type,
+					       tb->tb_id);
 			hlist_del_rcu(&fa->fa_list);
 			fib_release_info(fa->fa_info);
 			alias_free_mem_rcu(fa);
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 46568b85c333..52613ed49a8c 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -19,14 +19,14 @@
 #include <net/switchdev.h>
 
 /**
- *	netdev_switch_parent_id_get - Get ID of a switch
+ *	switchdev_parent_id_get - Get ID of a switch
  *	@dev: port device
  *	@psid: switch ID
  *
  *	Get ID of a switch this port is part of.
  */
-int netdev_switch_parent_id_get(struct net_device *dev,
-				struct netdev_phys_item_id *psid)
+int switchdev_parent_id_get(struct net_device *dev,
+			    struct netdev_phys_item_id *psid)
 {
 	const struct swdev_ops *ops = dev->swdev_ops;
 
@@ -34,17 +34,17 @@ int netdev_switch_parent_id_get(struct net_device *dev,
 		return -EOPNOTSUPP;
 	return ops->swdev_parent_id_get(dev, psid);
 }
-EXPORT_SYMBOL_GPL(netdev_switch_parent_id_get);
+EXPORT_SYMBOL_GPL(switchdev_parent_id_get);
 
 /**
- *	netdev_switch_port_stp_update - Notify switch device port of STP
+ *	switchdev_port_stp_update - Notify switch device port of STP
  *					state change
  *	@dev: port device
  *	@state: port STP state
  *
  *	Notify switch device port of bridge port STP state change.
  */
-int netdev_switch_port_stp_update(struct net_device *dev, u8 state)
+int switchdev_port_stp_update(struct net_device *dev, u8 state)
 {
 	const struct swdev_ops *ops = dev->swdev_ops;
 	struct net_device *lower_dev;
@@ -55,57 +55,57 @@ int netdev_switch_port_stp_update(struct net_device *dev, u8 state)
 		return ops->swdev_port_stp_update(dev, state);
 
 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		err = netdev_switch_port_stp_update(lower_dev, state);
+		err = switchdev_port_stp_update(lower_dev, state);
 		if (err && err != -EOPNOTSUPP)
 			return err;
 	}
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(netdev_switch_port_stp_update);
+EXPORT_SYMBOL_GPL(switchdev_port_stp_update);
 
-static DEFINE_MUTEX(netdev_switch_mutex);
-static RAW_NOTIFIER_HEAD(netdev_switch_notif_chain);
+static DEFINE_MUTEX(switchdev_mutex);
+static RAW_NOTIFIER_HEAD(switchdev_notif_chain);
 
 /**
- *	register_netdev_switch_notifier - Register notifier
+ *	register_switchdev_notifier - Register notifier
  *	@nb: notifier_block
  *
  *	Register switch device notifier. This should be used by code
  *	which needs to monitor events happening in particular device.
  *	Return values are same as for atomic_notifier_chain_register().
  */
-int register_netdev_switch_notifier(struct notifier_block *nb)
+int register_switchdev_notifier(struct notifier_block *nb)
 {
 	int err;
 
-	mutex_lock(&netdev_switch_mutex);
-	err = raw_notifier_chain_register(&netdev_switch_notif_chain, nb);
-	mutex_unlock(&netdev_switch_mutex);
+	mutex_lock(&switchdev_mutex);
+	err = raw_notifier_chain_register(&switchdev_notif_chain, nb);
+	mutex_unlock(&switchdev_mutex);
 	return err;
 }
-EXPORT_SYMBOL_GPL(register_netdev_switch_notifier);
+EXPORT_SYMBOL_GPL(register_switchdev_notifier);
 
 /**
- *	unregister_netdev_switch_notifier - Unregister notifier
+ *	unregister_switchdev_notifier - Unregister notifier
  *	@nb: notifier_block
  *
  *	Unregister switch device notifier.
  *	Return values are same as for atomic_notifier_chain_unregister().
  */
-int unregister_netdev_switch_notifier(struct notifier_block *nb)
+int unregister_switchdev_notifier(struct notifier_block *nb)
 {
 	int err;
 
-	mutex_lock(&netdev_switch_mutex);
-	err = raw_notifier_chain_unregister(&netdev_switch_notif_chain, nb);
-	mutex_unlock(&netdev_switch_mutex);
+	mutex_lock(&switchdev_mutex);
+	err = raw_notifier_chain_unregister(&switchdev_notif_chain, nb);
+	mutex_unlock(&switchdev_mutex);
 	return err;
 }
-EXPORT_SYMBOL_GPL(unregister_netdev_switch_notifier);
+EXPORT_SYMBOL_GPL(unregister_switchdev_notifier);
 
 /**
- *	call_netdev_switch_notifiers - Call notifiers
+ *	call_switchdev_notifiers - Call notifiers
  *	@val: value passed unmodified to notifier function
  *	@dev: port device
  *	@info: notifier information data
@@ -114,21 +114,21 @@ EXPORT_SYMBOL_GPL(unregister_netdev_switch_notifier);
  *	when it needs to propagate hardware event.
  *	Return values are same as for atomic_notifier_call_chain().
  */
-int call_netdev_switch_notifiers(unsigned long val, struct net_device *dev,
-				 struct netdev_switch_notifier_info *info)
+int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
+			     struct switchdev_notifier_info *info)
 {
 	int err;
 
 	info->dev = dev;
-	mutex_lock(&netdev_switch_mutex);
-	err = raw_notifier_call_chain(&netdev_switch_notif_chain, val, info);
-	mutex_unlock(&netdev_switch_mutex);
+	mutex_lock(&switchdev_mutex);
+	err = raw_notifier_call_chain(&switchdev_notif_chain, val, info);
+	mutex_unlock(&switchdev_mutex);
 	return err;
 }
-EXPORT_SYMBOL_GPL(call_netdev_switch_notifiers);
+EXPORT_SYMBOL_GPL(call_switchdev_notifiers);
 
 /**
- *	netdev_switch_port_bridge_setlink - Notify switch device port of bridge
+ *	switchdev_port_bridge_setlink - Notify switch device port of bridge
  *	port attributes
  *
  *	@dev: port device
@@ -137,8 +137,8 @@ EXPORT_SYMBOL_GPL(call_netdev_switch_notifiers);
  *
  *	Notify switch device port of bridge port attributes
  */
-int netdev_switch_port_bridge_setlink(struct net_device *dev,
-				      struct nlmsghdr *nlh, u16 flags)
+int switchdev_port_bridge_setlink(struct net_device *dev,
+				  struct nlmsghdr *nlh, u16 flags)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 
@@ -150,10 +150,10 @@ int netdev_switch_port_bridge_setlink(struct net_device *dev,
 
 	return ops->ndo_bridge_setlink(dev, nlh, flags);
 }
-EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_setlink);
+EXPORT_SYMBOL_GPL(switchdev_port_bridge_setlink);
 
 /**
- *	netdev_switch_port_bridge_dellink - Notify switch device port of bridge
+ *	switchdev_port_bridge_dellink - Notify switch device port of bridge
  *	port attribute delete
  *
  *	@dev: port device
@@ -162,8 +162,8 @@ EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_setlink);
  *
  *	Notify switch device port of bridge port attribute delete
  */
-int netdev_switch_port_bridge_dellink(struct net_device *dev,
-				      struct nlmsghdr *nlh, u16 flags)
+int switchdev_port_bridge_dellink(struct net_device *dev,
+				  struct nlmsghdr *nlh, u16 flags)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 
@@ -175,11 +175,11 @@ int netdev_switch_port_bridge_dellink(struct net_device *dev,
 
 	return ops->ndo_bridge_dellink(dev, nlh, flags);
 }
-EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_dellink);
+EXPORT_SYMBOL_GPL(switchdev_port_bridge_dellink);
 
 /**
- *	ndo_dflt_netdev_switch_port_bridge_setlink - default ndo bridge setlink
- *						     op for master devices
+ *	ndo_dflt_switchdev_port_bridge_setlink - default ndo bridge setlink
+ *						 op for master devices
  *
  *	@dev: port device
  *	@nlh: netlink msg with bridge port attributes
@@ -187,8 +187,8 @@ EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_dellink);
  *
  *	Notify master device slaves of bridge port attributes
  */
-int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev,
-					       struct nlmsghdr *nlh, u16 flags)
+int ndo_dflt_switchdev_port_bridge_setlink(struct net_device *dev,
+					   struct nlmsghdr *nlh, u16 flags)
 {
 	struct net_device *lower_dev;
 	struct list_head *iter;
@@ -198,18 +198,18 @@ int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev,
 		return ret;
 
 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		err = netdev_switch_port_bridge_setlink(lower_dev, nlh, flags);
+		err = switchdev_port_bridge_setlink(lower_dev, nlh, flags);
 		if (err && err != -EOPNOTSUPP)
 			ret = err;
 	}
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_setlink);
+EXPORT_SYMBOL_GPL(ndo_dflt_switchdev_port_bridge_setlink);
 
 /**
- *	ndo_dflt_netdev_switch_port_bridge_dellink - default ndo bridge dellink
- *						     op for master devices
+ *	ndo_dflt_switchdev_port_bridge_dellink - default ndo bridge dellink
+ *						 op for master devices
  *
  *	@dev: port device
  *	@nlh: netlink msg with bridge port attributes
@@ -217,8 +217,8 @@ EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_setlink);
  *
  *	Notify master device slaves of bridge port attribute deletes
  */
-int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev,
-					       struct nlmsghdr *nlh, u16 flags)
+int ndo_dflt_switchdev_port_bridge_dellink(struct net_device *dev,
+					   struct nlmsghdr *nlh, u16 flags)
 {
 	struct net_device *lower_dev;
 	struct list_head *iter;
@@ -228,16 +228,16 @@ int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev,
 		return ret;
 
 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		err = netdev_switch_port_bridge_dellink(lower_dev, nlh, flags);
+		err = switchdev_port_bridge_dellink(lower_dev, nlh, flags);
 		if (err && err != -EOPNOTSUPP)
 			ret = err;
 	}
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_dellink);
+EXPORT_SYMBOL_GPL(ndo_dflt_switchdev_port_bridge_dellink);
 
-static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev)
+static struct net_device *switchdev_get_lowest_dev(struct net_device *dev)
 {
 	const struct swdev_ops *ops = dev->swdev_ops;
 	struct net_device *lower_dev;
@@ -253,7 +253,7 @@ static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev)
 		return dev;
 
 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		port_dev = netdev_switch_get_lowest_dev(lower_dev);
+		port_dev = switchdev_get_lowest_dev(lower_dev);
 		if (port_dev)
 			return port_dev;
 	}
@@ -261,7 +261,7 @@ static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev)
 	return NULL;
 }
 
-static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi)
+static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi)
 {
 	struct netdev_phys_item_id psid;
 	struct netdev_phys_item_id prev_psid;
@@ -276,11 +276,11 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi)
 		if (!nh->nh_dev)
 			return NULL;
 
-		dev = netdev_switch_get_lowest_dev(nh->nh_dev);
+		dev = switchdev_get_lowest_dev(nh->nh_dev);
 		if (!dev)
 			return NULL;
 
-		if (netdev_switch_parent_id_get(dev, &psid))
+		if (switchdev_parent_id_get(dev, &psid))
 			return NULL;
 
 		if (nhsel > 0) {
@@ -297,7 +297,7 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi)
 }
 
 /**
- *	netdev_switch_fib_ipv4_add - Add IPv4 route entry to switch
+ *	switchdev_fib_ipv4_add - Add IPv4 route entry to switch
  *
  *	@dst: route's IPv4 destination address
  *	@dst_len: destination address length (prefix length)
@@ -309,8 +309,8 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi)
  *
  *	Add IPv4 route entry to switch device.
  */
-int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
-			       u8 tos, u8 type, u32 nlflags, u32 tb_id)
+int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
+			   u8 tos, u8 type, u32 nlflags, u32 tb_id)
 {
 	struct net_device *dev;
 	const struct swdev_ops *ops;
@@ -328,7 +328,7 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
 	if (fi->fib_net->ipv4.fib_offload_disabled)
 		return 0;
 
-	dev = netdev_switch_get_dev_by_nhs(fi);
+	dev = switchdev_get_dev_by_nhs(fi);
 	if (!dev)
 		return 0;
 	ops = dev->swdev_ops;
@@ -343,10 +343,10 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_add);
+EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add);
 
 /**
- *	netdev_switch_fib_ipv4_del - Delete IPv4 route entry from switch
+ *	switchdev_fib_ipv4_del - Delete IPv4 route entry from switch
  *
  *	@dst: route's IPv4 destination address
  *	@dst_len: destination address length (prefix length)
@@ -357,8 +357,8 @@ EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_add);
  *
  *	Delete IPv4 route entry from switch device.
  */
-int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
-			       u8 tos, u8 type, u32 tb_id)
+int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
+			   u8 tos, u8 type, u32 tb_id)
 {
 	struct net_device *dev;
 	const struct swdev_ops *ops;
@@ -367,7 +367,7 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
 	if (!(fi->fib_flags & RTNH_F_EXTERNAL))
 		return 0;
 
-	dev = netdev_switch_get_dev_by_nhs(fi);
+	dev = switchdev_get_dev_by_nhs(fi);
 	if (!dev)
 		return 0;
 	ops = dev->swdev_ops;
@@ -381,14 +381,14 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_del);
+EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_del);
 
 /**
- *	netdev_switch_fib_ipv4_abort - Abort an IPv4 FIB operation
+ *	switchdev_fib_ipv4_abort - Abort an IPv4 FIB operation
  *
  *	@fi: route FIB info structure
  */
-void netdev_switch_fib_ipv4_abort(struct fib_info *fi)
+void switchdev_fib_ipv4_abort(struct fib_info *fi)
 {
 	/* There was a problem installing this route to the offload
 	 * device.  For now, until we come up with more refined
@@ -401,4 +401,4 @@ void netdev_switch_fib_ipv4_abort(struct fib_info *fi)
 	fib_flush_external(fi->fib_net);
 	fi->fib_net->ipv4.fib_offload_disabled = true;
 }
-EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_abort);
+EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort);
-- 
cgit v1.2.3


From 9d47c0a2d958e06322c88245749278633d333cca Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Sun, 10 May 2015 09:47:47 -0700
Subject: switchdev: s/swdev_/switchdev_/

Turned out that "switchdev" sticks. So just unify all related terms to use
this prefix.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Acked-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/rocker/rocker.c | 37 +++++++++++++++++----------------
 include/linux/netdevice.h            |  2 +-
 include/net/switchdev.h              | 30 +++++++++++++--------------
 net/dsa/slave.c                      |  8 ++++----
 net/switchdev/switchdev.c            | 40 ++++++++++++++++++------------------
 5 files changed, 59 insertions(+), 58 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 418f62b19545..7473874daf1f 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -4221,8 +4221,8 @@ static const struct net_device_ops rocker_port_netdev_ops = {
  * swdev interface
  ********************/
 
-static int rocker_port_swdev_parent_id_get(struct net_device *dev,
-					   struct netdev_phys_item_id *psid)
+static int rocker_port_switchdev_parent_id_get(struct net_device *dev,
+					       struct netdev_phys_item_id *psid)
 {
 	struct rocker_port *rocker_port = netdev_priv(dev);
 	struct rocker *rocker = rocker_port->rocker;
@@ -4232,18 +4232,19 @@ static int rocker_port_swdev_parent_id_get(struct net_device *dev,
 	return 0;
 }
 
-static int rocker_port_swdev_port_stp_update(struct net_device *dev, u8 state)
+static int rocker_port_switchdev_port_stp_update(struct net_device *dev,
+						 u8 state)
 {
 	struct rocker_port *rocker_port = netdev_priv(dev);
 
 	return rocker_port_stp_update(rocker_port, state);
 }
 
-static int rocker_port_swdev_fib_ipv4_add(struct net_device *dev,
-					  __be32 dst, int dst_len,
-					  struct fib_info *fi,
-					  u8 tos, u8 type,
-					  u32 nlflags, u32 tb_id)
+static int rocker_port_switchdev_fib_ipv4_add(struct net_device *dev,
+					      __be32 dst, int dst_len,
+					      struct fib_info *fi,
+					      u8 tos, u8 type,
+					      u32 nlflags, u32 tb_id)
 {
 	struct rocker_port *rocker_port = netdev_priv(dev);
 	int flags = 0;
@@ -4252,10 +4253,10 @@ static int rocker_port_swdev_fib_ipv4_add(struct net_device *dev,
 				    fi, tb_id, flags);
 }
 
-static int rocker_port_swdev_fib_ipv4_del(struct net_device *dev,
-					  __be32 dst, int dst_len,
-					  struct fib_info *fi,
-					  u8 tos, u8 type, u32 tb_id)
+static int rocker_port_switchdev_fib_ipv4_del(struct net_device *dev,
+					      __be32 dst, int dst_len,
+					      struct fib_info *fi,
+					      u8 tos, u8 type, u32 tb_id)
 {
 	struct rocker_port *rocker_port = netdev_priv(dev);
 	int flags = ROCKER_OP_FLAG_REMOVE;
@@ -4264,11 +4265,11 @@ static int rocker_port_swdev_fib_ipv4_del(struct net_device *dev,
 				    fi, tb_id, flags);
 }
 
-static const struct swdev_ops rocker_port_swdev_ops = {
-	.swdev_parent_id_get		= rocker_port_swdev_parent_id_get,
-	.swdev_port_stp_update		= rocker_port_swdev_port_stp_update,
-	.swdev_fib_ipv4_add		= rocker_port_swdev_fib_ipv4_add,
-	.swdev_fib_ipv4_del		= rocker_port_swdev_fib_ipv4_del,
+static const struct switchdev_ops rocker_port_switchdev_ops = {
+	.switchdev_parent_id_get	= rocker_port_switchdev_parent_id_get,
+	.switchdev_port_stp_update	= rocker_port_switchdev_port_stp_update,
+	.switchdev_fib_ipv4_add		= rocker_port_switchdev_fib_ipv4_add,
+	.switchdev_fib_ipv4_del		= rocker_port_switchdev_fib_ipv4_del,
 };
 
 /********************
@@ -4623,7 +4624,7 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number)
 	rocker_port_dev_addr_init(rocker, rocker_port);
 	dev->netdev_ops = &rocker_port_netdev_ops;
 	dev->ethtool_ops = &rocker_port_ethtool_ops;
-	dev->swdev_ops = &rocker_port_swdev_ops;
+	dev->switchdev_ops = &rocker_port_switchdev_ops;
 	netif_napi_add(dev, &rocker_port->napi_tx, rocker_port_poll_tx,
 		       NAPI_POLL_WEIGHT);
 	netif_napi_add(dev, &rocker_port->napi_rx, rocker_port_poll_rx,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a6d706b2a947..2b39235b9f13 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1567,7 +1567,7 @@ struct net_device {
 	const struct net_device_ops *netdev_ops;
 	const struct ethtool_ops *ethtool_ops;
 #ifdef CONFIG_NET_SWITCHDEV
-	const struct swdev_ops *swdev_ops;
+	const struct switchdev_ops *switchdev_ops;
 #endif
 
 	const struct header_ops *header_ops;
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index cd921fa0d63a..97b556daeef7 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -19,28 +19,28 @@ struct fib_info;
 /**
  * struct switchdev_ops - switchdev operations
  *
- * @swdev_parent_id_get: Called to get an ID of the switch chip this port
+ * @switchdev_parent_id_get: Called to get an ID of the switch chip this port
  *   is part of.  If driver implements this, it indicates that it
  *   represents a port of a switch chip.
  *
- * @swdev_port_stp_update: Called to notify switch device port of bridge
+ * @switchdev_port_stp_update: Called to notify switch device port of bridge
  *   port STP state change.
  *
- * @swdev_fib_ipv4_add: Called to add/modify IPv4 route to switch device.
+ * @switchdev_fib_ipv4_add: Called to add/modify IPv4 route to switch device.
  *
- * @swdev_fib_ipv4_del: Called to delete IPv4 route from switch device.
+ * @switchdev_fib_ipv4_del: Called to delete IPv4 route from switch device.
  */
-struct swdev_ops {
-	int	(*swdev_parent_id_get)(struct net_device *dev,
-				       struct netdev_phys_item_id *psid);
-	int	(*swdev_port_stp_update)(struct net_device *dev, u8 state);
-	int	(*swdev_fib_ipv4_add)(struct net_device *dev, __be32 dst,
-				      int dst_len, struct fib_info *fi,
-				      u8 tos, u8 type, u32 nlflags,
-				      u32 tb_id);
-	int	(*swdev_fib_ipv4_del)(struct net_device *dev, __be32 dst,
-				      int dst_len, struct fib_info *fi,
-				      u8 tos, u8 type, u32 tb_id);
+struct switchdev_ops {
+	int	(*switchdev_parent_id_get)(struct net_device *dev,
+					   struct netdev_phys_item_id *psid);
+	int	(*switchdev_port_stp_update)(struct net_device *dev, u8 state);
+	int	(*switchdev_fib_ipv4_add)(struct net_device *dev, __be32 dst,
+					  int dst_len, struct fib_info *fi,
+					  u8 tos, u8 type, u32 nlflags,
+					  u32 tb_id);
+	int	(*switchdev_fib_ipv4_del)(struct net_device *dev, __be32 dst,
+					  int dst_len, struct fib_info *fi,
+					  u8 tos, u8 type, u32 tb_id);
 };
 
 enum switchdev_notifier_type {
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 03e041addea3..1546acf6ebd3 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -675,9 +675,9 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_get_iflink		= dsa_slave_get_iflink,
 };
 
-static const struct swdev_ops dsa_slave_swdev_ops = {
-	.swdev_parent_id_get = dsa_slave_parent_id_get,
-	.swdev_port_stp_update = dsa_slave_stp_update,
+static const struct switchdev_ops dsa_slave_switchdev_ops = {
+	.switchdev_parent_id_get	= dsa_slave_parent_id_get,
+	.switchdev_port_stp_update	= dsa_slave_stp_update,
 };
 
 static void dsa_slave_adjust_link(struct net_device *dev)
@@ -866,7 +866,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 	eth_hw_addr_inherit(slave_dev, master);
 	slave_dev->tx_queue_len = 0;
 	slave_dev->netdev_ops = &dsa_slave_netdev_ops;
-	slave_dev->swdev_ops = &dsa_slave_swdev_ops;
+	slave_dev->switchdev_ops = &dsa_slave_switchdev_ops;
 
 	netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one,
 				 NULL);
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 52613ed49a8c..b7f44a23def5 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -28,11 +28,11 @@
 int switchdev_parent_id_get(struct net_device *dev,
 			    struct netdev_phys_item_id *psid)
 {
-	const struct swdev_ops *ops = dev->swdev_ops;
+	const struct switchdev_ops *ops = dev->switchdev_ops;
 
-	if (!ops || !ops->swdev_parent_id_get)
+	if (!ops || !ops->switchdev_parent_id_get)
 		return -EOPNOTSUPP;
-	return ops->swdev_parent_id_get(dev, psid);
+	return ops->switchdev_parent_id_get(dev, psid);
 }
 EXPORT_SYMBOL_GPL(switchdev_parent_id_get);
 
@@ -46,13 +46,13 @@ EXPORT_SYMBOL_GPL(switchdev_parent_id_get);
  */
 int switchdev_port_stp_update(struct net_device *dev, u8 state)
 {
-	const struct swdev_ops *ops = dev->swdev_ops;
+	const struct switchdev_ops *ops = dev->switchdev_ops;
 	struct net_device *lower_dev;
 	struct list_head *iter;
 	int err = -EOPNOTSUPP;
 
-	if (ops && ops->swdev_port_stp_update)
-		return ops->swdev_port_stp_update(dev, state);
+	if (ops && ops->switchdev_port_stp_update)
+		return ops->switchdev_port_stp_update(dev, state);
 
 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
 		err = switchdev_port_stp_update(lower_dev, state);
@@ -239,17 +239,17 @@ EXPORT_SYMBOL_GPL(ndo_dflt_switchdev_port_bridge_dellink);
 
 static struct net_device *switchdev_get_lowest_dev(struct net_device *dev)
 {
-	const struct swdev_ops *ops = dev->swdev_ops;
+	const struct switchdev_ops *ops = dev->switchdev_ops;
 	struct net_device *lower_dev;
 	struct net_device *port_dev;
 	struct list_head *iter;
 
 	/* Recusively search down until we find a sw port dev.
-	 * (A sw port dev supports swdev_parent_id_get).
+	 * (A sw port dev supports switchdev_parent_id_get).
 	 */
 
 	if (dev->features & NETIF_F_HW_SWITCH_OFFLOAD &&
-	    ops && ops->swdev_parent_id_get)
+	    ops && ops->switchdev_parent_id_get)
 		return dev;
 
 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
@@ -313,7 +313,7 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
 			   u8 tos, u8 type, u32 nlflags, u32 tb_id)
 {
 	struct net_device *dev;
-	const struct swdev_ops *ops;
+	const struct switchdev_ops *ops;
 	int err = 0;
 
 	/* Don't offload route if using custom ip rules or if
@@ -331,12 +331,12 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
 	dev = switchdev_get_dev_by_nhs(fi);
 	if (!dev)
 		return 0;
-	ops = dev->swdev_ops;
+	ops = dev->switchdev_ops;
 
-	if (ops->swdev_fib_ipv4_add) {
-		err = ops->swdev_fib_ipv4_add(dev, htonl(dst), dst_len,
-					      fi, tos, type, nlflags,
-					      tb_id);
+	if (ops->switchdev_fib_ipv4_add) {
+		err = ops->switchdev_fib_ipv4_add(dev, htonl(dst), dst_len,
+						  fi, tos, type, nlflags,
+						  tb_id);
 		if (!err)
 			fi->fib_flags |= RTNH_F_EXTERNAL;
 	}
@@ -361,7 +361,7 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
 			   u8 tos, u8 type, u32 tb_id)
 {
 	struct net_device *dev;
-	const struct swdev_ops *ops;
+	const struct switchdev_ops *ops;
 	int err = 0;
 
 	if (!(fi->fib_flags & RTNH_F_EXTERNAL))
@@ -370,11 +370,11 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
 	dev = switchdev_get_dev_by_nhs(fi);
 	if (!dev)
 		return 0;
-	ops = dev->swdev_ops;
+	ops = dev->switchdev_ops;
 
-	if (ops->swdev_fib_ipv4_del) {
-		err = ops->swdev_fib_ipv4_del(dev, htonl(dst), dst_len,
-					      fi, tos, type, tb_id);
+	if (ops->switchdev_fib_ipv4_del) {
+		err = ops->switchdev_fib_ipv4_del(dev, htonl(dst), dst_len,
+						  fi, tos, type, tb_id);
 		if (!err)
 			fi->fib_flags &= ~RTNH_F_EXTERNAL;
 	}
-- 
cgit v1.2.3


From 3094333d9089d43e8b8f0418676fa6ae06c27b51 Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sun, 10 May 2015 09:47:48 -0700
Subject: switchdev: introduce get/set attrs ops

Add two new swdev ops for get/set switch port attributes.  Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.

Add the basic algorithms for get/set attr ops.  Use the same recusive algo
to walk lower devs we've used for STP updates, for example.  For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs.  For sets, set the same attr value for
all lower devs.  We'll use a two-phase prepare-commit transaction model for
sets.  In the first phase, the driver(s) are asked if attr set is OK.  If
all OK, the commit attr set in second phase.  A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control.  RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase.  If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.

If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h   |  43 ++++++++++++
 net/switchdev/switchdev.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 212 insertions(+)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 97b556daeef7..282043844bcf 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -14,6 +14,25 @@
 #include <linux/netdevice.h>
 #include <linux/notifier.h>
 
+#define SWITCHDEV_F_NO_RECURSE		BIT(0)
+
+enum switchdev_trans {
+	SWITCHDEV_TRANS_NONE,
+	SWITCHDEV_TRANS_PREPARE,
+	SWITCHDEV_TRANS_ABORT,
+	SWITCHDEV_TRANS_COMMIT,
+};
+
+enum switchdev_attr_id {
+	SWITCHDEV_ATTR_UNDEFINED,
+};
+
+struct switchdev_attr {
+	enum switchdev_attr_id id;
+	enum switchdev_trans trans;
+	u32 flags;
+};
+
 struct fib_info;
 
 /**
@@ -23,6 +42,10 @@ struct fib_info;
  *   is part of.  If driver implements this, it indicates that it
  *   represents a port of a switch chip.
  *
+ * @switchdev_port_attr_get: Get a port attribute (see switchdev_attr).
+ *
+ * @switchdev_port_attr_set: Set a port attribute (see switchdev_attr).
+ *
  * @switchdev_port_stp_update: Called to notify switch device port of bridge
  *   port STP state change.
  *
@@ -33,6 +56,10 @@ struct fib_info;
 struct switchdev_ops {
 	int	(*switchdev_parent_id_get)(struct net_device *dev,
 					   struct netdev_phys_item_id *psid);
+	int	(*switchdev_port_attr_get)(struct net_device *dev,
+					   struct switchdev_attr *attr);
+	int	(*switchdev_port_attr_set)(struct net_device *dev,
+					   struct switchdev_attr *attr);
 	int	(*switchdev_port_stp_update)(struct net_device *dev, u8 state);
 	int	(*switchdev_fib_ipv4_add)(struct net_device *dev, __be32 dst,
 					  int dst_len, struct fib_info *fi,
@@ -68,6 +95,10 @@ switchdev_notifier_info_to_dev(const struct switchdev_notifier_info *info)
 
 int switchdev_parent_id_get(struct net_device *dev,
 			    struct netdev_phys_item_id *psid);
+int switchdev_port_attr_get(struct net_device *dev,
+			    struct switchdev_attr *attr);
+int switchdev_port_attr_set(struct net_device *dev,
+			    struct switchdev_attr *attr);
 int switchdev_port_stp_update(struct net_device *dev, u8 state);
 int register_switchdev_notifier(struct notifier_block *nb);
 int unregister_switchdev_notifier(struct notifier_block *nb);
@@ -95,6 +126,18 @@ static inline int switchdev_parent_id_get(struct net_device *dev,
 	return -EOPNOTSUPP;
 }
 
+static inline int switchdev_port_attr_get(struct net_device *dev,
+					  struct switchdev_attr *attr)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int switchdev_port_attr_set(struct net_device *dev,
+					  struct switchdev_attr *attr)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int switchdev_port_stp_update(struct net_device *dev,
 					    u8 state)
 {
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index b7f44a23def5..8f47187dc185 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -36,6 +36,175 @@ int switchdev_parent_id_get(struct net_device *dev,
 }
 EXPORT_SYMBOL_GPL(switchdev_parent_id_get);
 
+/**
+ *	switchdev_port_attr_get - Get port attribute
+ *
+ *	@dev: port device
+ *	@attr: attribute to get
+ */
+int switchdev_port_attr_get(struct net_device *dev, struct switchdev_attr *attr)
+{
+	const struct switchdev_ops *ops = dev->switchdev_ops;
+	struct net_device *lower_dev;
+	struct list_head *iter;
+	struct switchdev_attr first = {
+		.id = SWITCHDEV_ATTR_UNDEFINED
+	};
+	int err = -EOPNOTSUPP;
+
+	if (ops && ops->switchdev_port_attr_get)
+		return ops->switchdev_port_attr_get(dev, attr);
+
+	if (attr->flags & SWITCHDEV_F_NO_RECURSE)
+		return err;
+
+	/* Switch device port(s) may be stacked under
+	 * bond/team/vlan dev, so recurse down to get attr on
+	 * each port.  Return -ENODATA if attr values don't
+	 * compare across ports.
+	 */
+
+	netdev_for_each_lower_dev(dev, lower_dev, iter) {
+		err = switchdev_port_attr_get(lower_dev, attr);
+		if (err)
+			break;
+		if (first.id == SWITCHDEV_ATTR_UNDEFINED)
+			first = *attr;
+		else if (memcmp(&first, attr, sizeof(*attr)))
+			return -ENODATA;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(switchdev_port_attr_get);
+
+static int __switchdev_port_attr_set(struct net_device *dev,
+				     struct switchdev_attr *attr)
+{
+	const struct switchdev_ops *ops = dev->switchdev_ops;
+	struct net_device *lower_dev;
+	struct list_head *iter;
+	int err = -EOPNOTSUPP;
+
+	if (ops && ops->switchdev_port_attr_set)
+		return ops->switchdev_port_attr_set(dev, attr);
+
+	if (attr->flags & SWITCHDEV_F_NO_RECURSE)
+		return err;
+
+	/* Switch device port(s) may be stacked under
+	 * bond/team/vlan dev, so recurse down to set attr on
+	 * each port.
+	 */
+
+	netdev_for_each_lower_dev(dev, lower_dev, iter) {
+		err = __switchdev_port_attr_set(lower_dev, attr);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+struct switchdev_attr_set_work {
+	struct work_struct work;
+	struct net_device *dev;
+	struct switchdev_attr attr;
+};
+
+static void switchdev_port_attr_set_work(struct work_struct *work)
+{
+	struct switchdev_attr_set_work *asw =
+		container_of(work, struct switchdev_attr_set_work, work);
+	int err;
+
+	rtnl_lock();
+	err = switchdev_port_attr_set(asw->dev, &asw->attr);
+	BUG_ON(err);
+	rtnl_unlock();
+
+	dev_put(asw->dev);
+	kfree(work);
+}
+
+static int switchdev_port_attr_set_defer(struct net_device *dev,
+					 struct switchdev_attr *attr)
+{
+	struct switchdev_attr_set_work *asw;
+
+	asw = kmalloc(sizeof(*asw), GFP_ATOMIC);
+	if (!asw)
+		return -ENOMEM;
+
+	INIT_WORK(&asw->work, switchdev_port_attr_set_work);
+
+	dev_hold(dev);
+	asw->dev = dev;
+	memcpy(&asw->attr, attr, sizeof(asw->attr));
+
+	schedule_work(&asw->work);
+
+	return 0;
+}
+
+/**
+ *	switchdev_port_attr_set - Set port attribute
+ *
+ *	@dev: port device
+ *	@attr: attribute to set
+ *
+ *	Use a 2-phase prepare-commit transaction model to ensure
+ *	system is not left in a partially updated state due to
+ *	failure from driver/device.
+ */
+int switchdev_port_attr_set(struct net_device *dev, struct switchdev_attr *attr)
+{
+	int err;
+
+	if (!rtnl_is_locked()) {
+		/* Running prepare-commit transaction across stacked
+		 * devices requires nothing moves, so if rtnl_lock is
+		 * not held, schedule a worker thread to hold rtnl_lock
+		 * while setting attr.
+		 */
+
+		return switchdev_port_attr_set_defer(dev, attr);
+	}
+
+	/* Phase I: prepare for attr set. Driver/device should fail
+	 * here if there are going to be issues in the commit phase,
+	 * such as lack of resources or support.  The driver/device
+	 * should reserve resources needed for the commit phase here,
+	 * but should not commit the attr.
+	 */
+
+	attr->trans = SWITCHDEV_TRANS_PREPARE;
+	err = __switchdev_port_attr_set(dev, attr);
+	if (err) {
+		/* Prepare phase failed: abort the transaction.  Any
+		 * resources reserved in the prepare phase are
+		 * released.
+		 */
+
+		attr->trans = SWITCHDEV_TRANS_ABORT;
+		__switchdev_port_attr_set(dev, attr);
+
+		return err;
+	}
+
+	/* Phase II: commit attr set.  This cannot fail as a fault
+	 * of driver/device.  If it does, it's a bug in the driver/device
+	 * because the driver said everythings was OK in phase I.
+	 */
+
+	attr->trans = SWITCHDEV_TRANS_COMMIT;
+	err = __switchdev_port_attr_set(dev, attr);
+	BUG_ON(err);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(switchdev_port_attr_set);
+
 /**
  *	switchdev_port_stp_update - Notify switch device port of STP
  *					state change
-- 
cgit v1.2.3


From f8e20a9f87d33865cc1d67f13da0db8d457fc3c9 Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sun, 10 May 2015 09:47:49 -0700
Subject: switchdev: convert parent_id_get to switchdev attr get

Switch ID is just a gettable port attribute.  Convert switchdev op
switchdev_parent_id_get to a switchdev attr.

Note: for sysfs and netlink interfaces, SWITCHDEV_ATTR_PORT_PARENT_ID is
called with SWITCHDEV_F_NO_RECUSE to limit switch ID user-visiblity to only
port netdevs.  So when a port is stacked under bond/bridge, the user can
only query switch id via the switch ports, but not via the upper devices

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/rocker/rocker.c | 17 +++++++++++-----
 include/net/switchdev.h              | 18 ++++-------------
 net/core/net-sysfs.c                 | 10 +++++++---
 net/core/rtnetlink.c                 |  9 ++++++---
 net/dsa/slave.c                      | 16 ++++++++++-----
 net/switchdev/switchdev.c            | 38 +++++++++++-------------------------
 6 files changed, 51 insertions(+), 57 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 7473874daf1f..24282991d9fd 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -4221,14 +4221,21 @@ static const struct net_device_ops rocker_port_netdev_ops = {
  * swdev interface
  ********************/
 
-static int rocker_port_switchdev_parent_id_get(struct net_device *dev,
-					       struct netdev_phys_item_id *psid)
+static int rocker_port_attr_get(struct net_device *dev,
+				struct switchdev_attr *attr)
 {
 	struct rocker_port *rocker_port = netdev_priv(dev);
 	struct rocker *rocker = rocker_port->rocker;
 
-	psid->id_len = sizeof(rocker->hw.id);
-	memcpy(&psid->id, &rocker->hw.id, psid->id_len);
+	switch (attr->id) {
+	case SWITCHDEV_ATTR_PORT_PARENT_ID:
+		attr->ppid.id_len = sizeof(rocker->hw.id);
+		memcpy(&attr->ppid.id, &rocker->hw.id, attr->ppid.id_len);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
 	return 0;
 }
 
@@ -4266,7 +4273,7 @@ static int rocker_port_switchdev_fib_ipv4_del(struct net_device *dev,
 }
 
 static const struct switchdev_ops rocker_port_switchdev_ops = {
-	.switchdev_parent_id_get	= rocker_port_switchdev_parent_id_get,
+	.switchdev_port_attr_get	= rocker_port_attr_get,
 	.switchdev_port_stp_update	= rocker_port_switchdev_port_stp_update,
 	.switchdev_fib_ipv4_add		= rocker_port_switchdev_fib_ipv4_add,
 	.switchdev_fib_ipv4_del		= rocker_port_switchdev_fib_ipv4_del,
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 282043844bcf..93316e7ab372 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -25,12 +25,16 @@ enum switchdev_trans {
 
 enum switchdev_attr_id {
 	SWITCHDEV_ATTR_UNDEFINED,
+	SWITCHDEV_ATTR_PORT_PARENT_ID,
 };
 
 struct switchdev_attr {
 	enum switchdev_attr_id id;
 	enum switchdev_trans trans;
 	u32 flags;
+	union {
+		struct netdev_phys_item_id ppid;	/* PORT_PARENT_ID */
+	};
 };
 
 struct fib_info;
@@ -38,10 +42,6 @@ struct fib_info;
 /**
  * struct switchdev_ops - switchdev operations
  *
- * @switchdev_parent_id_get: Called to get an ID of the switch chip this port
- *   is part of.  If driver implements this, it indicates that it
- *   represents a port of a switch chip.
- *
  * @switchdev_port_attr_get: Get a port attribute (see switchdev_attr).
  *
  * @switchdev_port_attr_set: Set a port attribute (see switchdev_attr).
@@ -54,8 +54,6 @@ struct fib_info;
  * @switchdev_fib_ipv4_del: Called to delete IPv4 route from switch device.
  */
 struct switchdev_ops {
-	int	(*switchdev_parent_id_get)(struct net_device *dev,
-					   struct netdev_phys_item_id *psid);
 	int	(*switchdev_port_attr_get)(struct net_device *dev,
 					   struct switchdev_attr *attr);
 	int	(*switchdev_port_attr_set)(struct net_device *dev,
@@ -93,8 +91,6 @@ switchdev_notifier_info_to_dev(const struct switchdev_notifier_info *info)
 
 #ifdef CONFIG_NET_SWITCHDEV
 
-int switchdev_parent_id_get(struct net_device *dev,
-			    struct netdev_phys_item_id *psid);
 int switchdev_port_attr_get(struct net_device *dev,
 			    struct switchdev_attr *attr);
 int switchdev_port_attr_set(struct net_device *dev,
@@ -120,12 +116,6 @@ void switchdev_fib_ipv4_abort(struct fib_info *fi);
 
 #else
 
-static inline int switchdev_parent_id_get(struct net_device *dev,
-					  struct netdev_phys_item_id *psid)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline int switchdev_port_attr_get(struct net_device *dev,
 					  struct switchdev_attr *attr)
 {
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index be86a7ce9282..5a9ce96f6d27 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -458,11 +458,15 @@ static ssize_t phys_switch_id_show(struct device *dev,
 		return restart_syscall();
 
 	if (dev_isalive(netdev)) {
-		struct netdev_phys_item_id ppid;
+		struct switchdev_attr attr = {
+			.id = SWITCHDEV_ATTR_PORT_PARENT_ID,
+			.flags = SWITCHDEV_F_NO_RECURSE,
+		};
 
-		ret = switchdev_parent_id_get(netdev, &ppid);
+		ret = switchdev_port_attr_get(netdev, &attr);
 		if (!ret)
-			ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
+			ret = sprintf(buf, "%*phN\n", attr.ppid.id_len,
+				      attr.ppid.id);
 	}
 	rtnl_unlock();
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index fcd41fcd7e70..c6c6b2c34926 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1004,16 +1004,19 @@ static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev)
 static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
 {
 	int err;
-	struct netdev_phys_item_id psid;
+	struct switchdev_attr attr = {
+		.id = SWITCHDEV_ATTR_PORT_PARENT_ID,
+		.flags = SWITCHDEV_F_NO_RECURSE,
+	};
 
-	err = switchdev_parent_id_get(dev, &psid);
+	err = switchdev_port_attr_get(dev, &attr);
 	if (err) {
 		if (err == -EOPNOTSUPP)
 			return 0;
 		return err;
 	}
 
-	if (nla_put(skb, IFLA_PHYS_SWITCH_ID, psid.id_len, psid.id))
+	if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.ppid.id_len, attr.ppid.id))
 		return -EMSGSIZE;
 
 	return 0;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 1546acf6ebd3..de705b674ac9 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -382,14 +382,20 @@ static int dsa_slave_bridge_port_leave(struct net_device *dev)
 	return ret;
 }
 
-static int dsa_slave_parent_id_get(struct net_device *dev,
-				   struct netdev_phys_item_id *psid)
+static int dsa_slave_port_attr_get(struct net_device *dev,
+				   struct switchdev_attr *attr)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_switch *ds = p->parent;
 
-	psid->id_len = sizeof(ds->index);
-	memcpy(&psid->id, &ds->index, psid->id_len);
+	switch (attr->id) {
+	case SWITCHDEV_ATTR_PORT_PARENT_ID:
+		attr->ppid.id_len = sizeof(ds->index);
+		memcpy(&attr->ppid.id, &ds->index, attr->ppid.id_len);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
 
 	return 0;
 }
@@ -676,7 +682,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
 };
 
 static const struct switchdev_ops dsa_slave_switchdev_ops = {
-	.switchdev_parent_id_get	= dsa_slave_parent_id_get,
+	.switchdev_port_attr_get	= dsa_slave_port_attr_get,
 	.switchdev_port_stp_update	= dsa_slave_stp_update,
 };
 
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 8f47187dc185..117fd0797abd 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -18,24 +18,6 @@
 #include <net/ip_fib.h>
 #include <net/switchdev.h>
 
-/**
- *	switchdev_parent_id_get - Get ID of a switch
- *	@dev: port device
- *	@psid: switch ID
- *
- *	Get ID of a switch this port is part of.
- */
-int switchdev_parent_id_get(struct net_device *dev,
-			    struct netdev_phys_item_id *psid)
-{
-	const struct switchdev_ops *ops = dev->switchdev_ops;
-
-	if (!ops || !ops->switchdev_parent_id_get)
-		return -EOPNOTSUPP;
-	return ops->switchdev_parent_id_get(dev, psid);
-}
-EXPORT_SYMBOL_GPL(switchdev_parent_id_get);
-
 /**
  *	switchdev_port_attr_get - Get port attribute
  *
@@ -414,11 +396,10 @@ static struct net_device *switchdev_get_lowest_dev(struct net_device *dev)
 	struct list_head *iter;
 
 	/* Recusively search down until we find a sw port dev.
-	 * (A sw port dev supports switchdev_parent_id_get).
+	 * (A sw port dev supports switchdev_port_attr_get).
 	 */
 
-	if (dev->features & NETIF_F_HW_SWITCH_OFFLOAD &&
-	    ops && ops->switchdev_parent_id_get)
+	if (ops && ops->switchdev_port_attr_get)
 		return dev;
 
 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
@@ -432,8 +413,10 @@ static struct net_device *switchdev_get_lowest_dev(struct net_device *dev)
 
 static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi)
 {
-	struct netdev_phys_item_id psid;
-	struct netdev_phys_item_id prev_psid;
+	struct switchdev_attr attr = {
+		.id = SWITCHDEV_ATTR_PORT_PARENT_ID,
+	};
+	struct switchdev_attr prev_attr;
 	struct net_device *dev = NULL;
 	int nhsel;
 
@@ -449,17 +432,18 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi)
 		if (!dev)
 			return NULL;
 
-		if (switchdev_parent_id_get(dev, &psid))
+		if (switchdev_port_attr_get(dev, &attr))
 			return NULL;
 
 		if (nhsel > 0) {
-			if (prev_psid.id_len != psid.id_len)
+			if (prev_attr.ppid.id_len != attr.ppid.id_len)
 				return NULL;
-			if (memcmp(prev_psid.id, psid.id, psid.id_len))
+			if (memcmp(prev_attr.ppid.id, attr.ppid.id,
+				   attr.ppid.id_len))
 				return NULL;
 		}
 
-		prev_psid = psid;
+		prev_attr = attr;
 	}
 
 	return dev;
-- 
cgit v1.2.3


From 3563606258cf3b8f02eabddb1cb45a94c44d9611 Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sun, 10 May 2015 09:47:51 -0700
Subject: switchdev: convert STP update to switchdev attr set

STP update is just a settable port attribute, so convert
switchdev_port_stp_update to an attr set.

For DSA, the prepare phase is skipped and STP updates are only done in the
commit phase.  This is because currently the DSA drivers don't need to
allocate any memory for STP updates and the STP update will not fail to HW
(unless something horrible goes wrong on the MDIO bus, in which case the
prepare phase wouldn't have been able to predict anyway).

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/rocker/rocker.c | 14 +++++---------
 include/net/switchdev.h              | 13 ++-----------
 net/bridge/br_stp.c                  |  6 +++++-
 net/dsa/slave.c                      | 20 +++++++++++++++++++-
 net/switchdev/switchdev.c            | 28 ----------------------------
 5 files changed, 31 insertions(+), 50 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index fa0fa545c0d1..36827d39a70a 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -4395,14 +4395,6 @@ static int rocker_port_attr_get(struct net_device *dev,
 	return 0;
 }
 
-static int rocker_port_switchdev_port_stp_update(struct net_device *dev,
-						 u8 state)
-{
-	struct rocker_port *rocker_port = netdev_priv(dev);
-
-	return rocker_port_stp_update(rocker_port, SWITCHDEV_TRANS_NONE, state);
-}
-
 static void rocker_port_trans_abort(struct rocker_port *rocker_port)
 {
 	struct list_head *mem, *tmp;
@@ -4431,6 +4423,10 @@ static int rocker_port_attr_set(struct net_device *dev,
 	}
 
 	switch (attr->id) {
+	case SWITCHDEV_ATTR_PORT_STP_STATE:
+		err = rocker_port_stp_update(rocker_port, attr->trans,
+					     attr->stp_state);
+		break;
 	default:
 		err = -EOPNOTSUPP;
 		break;
@@ -4466,7 +4462,7 @@ static int rocker_port_switchdev_fib_ipv4_del(struct net_device *dev,
 
 static const struct switchdev_ops rocker_port_switchdev_ops = {
 	.switchdev_port_attr_get	= rocker_port_attr_get,
-	.switchdev_port_stp_update	= rocker_port_switchdev_port_stp_update,
+	.switchdev_port_attr_set	= rocker_port_attr_set,
 	.switchdev_fib_ipv4_add		= rocker_port_switchdev_fib_ipv4_add,
 	.switchdev_fib_ipv4_del		= rocker_port_switchdev_fib_ipv4_del,
 };
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 93316e7ab372..aec5e49661a2 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -26,6 +26,7 @@ enum switchdev_trans {
 enum switchdev_attr_id {
 	SWITCHDEV_ATTR_UNDEFINED,
 	SWITCHDEV_ATTR_PORT_PARENT_ID,
+	SWITCHDEV_ATTR_PORT_STP_STATE,
 };
 
 struct switchdev_attr {
@@ -34,6 +35,7 @@ struct switchdev_attr {
 	u32 flags;
 	union {
 		struct netdev_phys_item_id ppid;	/* PORT_PARENT_ID */
+		u8 stp_state;				/* PORT_STP_STATE */
 	};
 };
 
@@ -46,9 +48,6 @@ struct fib_info;
  *
  * @switchdev_port_attr_set: Set a port attribute (see switchdev_attr).
  *
- * @switchdev_port_stp_update: Called to notify switch device port of bridge
- *   port STP state change.
- *
  * @switchdev_fib_ipv4_add: Called to add/modify IPv4 route to switch device.
  *
  * @switchdev_fib_ipv4_del: Called to delete IPv4 route from switch device.
@@ -58,7 +57,6 @@ struct switchdev_ops {
 					   struct switchdev_attr *attr);
 	int	(*switchdev_port_attr_set)(struct net_device *dev,
 					   struct switchdev_attr *attr);
-	int	(*switchdev_port_stp_update)(struct net_device *dev, u8 state);
 	int	(*switchdev_fib_ipv4_add)(struct net_device *dev, __be32 dst,
 					  int dst_len, struct fib_info *fi,
 					  u8 tos, u8 type, u32 nlflags,
@@ -95,7 +93,6 @@ int switchdev_port_attr_get(struct net_device *dev,
 			    struct switchdev_attr *attr);
 int switchdev_port_attr_set(struct net_device *dev,
 			    struct switchdev_attr *attr);
-int switchdev_port_stp_update(struct net_device *dev, u8 state);
 int register_switchdev_notifier(struct notifier_block *nb);
 int unregister_switchdev_notifier(struct notifier_block *nb);
 int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
@@ -128,12 +125,6 @@ static inline int switchdev_port_attr_set(struct net_device *dev,
 	return -EOPNOTSUPP;
 }
 
-static inline int switchdev_port_stp_update(struct net_device *dev,
-					    u8 state)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline int register_switchdev_notifier(struct notifier_block *nb)
 {
 	return 0;
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 28e3f4bc01e0..b9300da31565 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -39,10 +39,14 @@ void br_log_state(const struct net_bridge_port *p)
 
 void br_set_state(struct net_bridge_port *p, unsigned int state)
 {
+	struct switchdev_attr attr = {
+		.id = SWITCHDEV_ATTR_PORT_STP_STATE,
+		.stp_state = state,
+	};
 	int err;
 
 	p->state = state;
-	err = switchdev_port_stp_update(p->dev, state);
+	err = switchdev_port_attr_set(p->dev, &attr);
 	if (err && err != -EOPNOTSUPP)
 		br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
 				(unsigned int) p->port_no, p->dev->name);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index de705b674ac9..3fb5210e318c 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -345,6 +345,24 @@ static int dsa_slave_stp_update(struct net_device *dev, u8 state)
 	return ret;
 }
 
+static int dsa_slave_port_attr_set(struct net_device *dev,
+				   struct switchdev_attr *attr)
+{
+	int ret = 0;
+
+	switch (attr->id) {
+	case SWITCHDEV_ATTR_PORT_STP_STATE:
+		if (attr->trans == SWITCHDEV_TRANS_COMMIT)
+			ret = dsa_slave_stp_update(dev, attr->stp_state);
+		break;
+	default:
+		ret = -EOPNOTSUPP;
+		break;
+	}
+
+	return ret;
+}
+
 static int dsa_slave_bridge_port_join(struct net_device *dev,
 				      struct net_device *br)
 {
@@ -683,7 +701,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
 
 static const struct switchdev_ops dsa_slave_switchdev_ops = {
 	.switchdev_port_attr_get	= dsa_slave_port_attr_get,
-	.switchdev_port_stp_update	= dsa_slave_stp_update,
+	.switchdev_port_attr_set	= dsa_slave_port_attr_set,
 };
 
 static void dsa_slave_adjust_link(struct net_device *dev)
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 117fd0797abd..a3c359004902 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -187,34 +187,6 @@ int switchdev_port_attr_set(struct net_device *dev, struct switchdev_attr *attr)
 }
 EXPORT_SYMBOL_GPL(switchdev_port_attr_set);
 
-/**
- *	switchdev_port_stp_update - Notify switch device port of STP
- *					state change
- *	@dev: port device
- *	@state: port STP state
- *
- *	Notify switch device port of bridge port STP state change.
- */
-int switchdev_port_stp_update(struct net_device *dev, u8 state)
-{
-	const struct switchdev_ops *ops = dev->switchdev_ops;
-	struct net_device *lower_dev;
-	struct list_head *iter;
-	int err = -EOPNOTSUPP;
-
-	if (ops && ops->switchdev_port_stp_update)
-		return ops->switchdev_port_stp_update(dev, state);
-
-	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		err = switchdev_port_stp_update(lower_dev, state);
-		if (err && err != -EOPNOTSUPP)
-			return err;
-	}
-
-	return err;
-}
-EXPORT_SYMBOL_GPL(switchdev_port_stp_update);
-
 static DEFINE_MUTEX(switchdev_mutex);
 static RAW_NOTIFIER_HEAD(switchdev_notif_chain);
 
-- 
cgit v1.2.3


From 491d0f1533ac750260406dbf84cdad44fd3d8a29 Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sun, 10 May 2015 09:47:52 -0700
Subject: switchdev: introduce switchdev add/del obj ops

Like switchdev attr get/set, add new switchdev obj add/del.  switchdev objs
will be things like VLANs or FIB entries, so add/del fits better for
objects than get/set used for attributes.

Use same two-phase prepare-commit transaction model as in attr set.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h   |  31 ++++++++++++++
 net/switchdev/switchdev.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index aec5e49661a2..4f43300df72c 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -41,6 +41,15 @@ struct switchdev_attr {
 
 struct fib_info;
 
+enum switchdev_obj_id {
+	SWITCHDEV_OBJ_UNDEFINED,
+};
+
+struct switchdev_obj {
+	enum switchdev_obj_id id;
+	enum switchdev_trans trans;
+};
+
 /**
  * struct switchdev_ops - switchdev operations
  *
@@ -48,6 +57,10 @@ struct fib_info;
  *
  * @switchdev_port_attr_set: Set a port attribute (see switchdev_attr).
  *
+ * @switchdev_port_obj_add: Add an object to port (see switchdev_obj).
+ *
+ * @switchdev_port_obj_del: Delete an object from port (see switchdev_obj).
+ *
  * @switchdev_fib_ipv4_add: Called to add/modify IPv4 route to switch device.
  *
  * @switchdev_fib_ipv4_del: Called to delete IPv4 route from switch device.
@@ -57,6 +70,10 @@ struct switchdev_ops {
 					   struct switchdev_attr *attr);
 	int	(*switchdev_port_attr_set)(struct net_device *dev,
 					   struct switchdev_attr *attr);
+	int	(*switchdev_port_obj_add)(struct net_device *dev,
+					  struct switchdev_obj *obj);
+	int	(*switchdev_port_obj_del)(struct net_device *dev,
+					  struct switchdev_obj *obj);
 	int	(*switchdev_fib_ipv4_add)(struct net_device *dev, __be32 dst,
 					  int dst_len, struct fib_info *fi,
 					  u8 tos, u8 type, u32 nlflags,
@@ -93,6 +110,8 @@ int switchdev_port_attr_get(struct net_device *dev,
 			    struct switchdev_attr *attr);
 int switchdev_port_attr_set(struct net_device *dev,
 			    struct switchdev_attr *attr);
+int switchdev_port_obj_add(struct net_device *dev, struct switchdev_obj *obj);
+int switchdev_port_obj_del(struct net_device *dev, struct switchdev_obj *obj);
 int register_switchdev_notifier(struct notifier_block *nb);
 int unregister_switchdev_notifier(struct notifier_block *nb);
 int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
@@ -125,6 +144,18 @@ static inline int switchdev_port_attr_set(struct net_device *dev,
 	return -EOPNOTSUPP;
 }
 
+static inline int switchdev_port_obj_add(struct net_device *dev,
+					 struct switchdev_obj *obj)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int switchdev_port_obj_del(struct net_device *dev,
+					 struct switchdev_obj *obj)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int register_switchdev_notifier(struct notifier_block *nb)
 {
 	return 0;
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index a3c359004902..3d4d99a70b80 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -187,6 +187,113 @@ int switchdev_port_attr_set(struct net_device *dev, struct switchdev_attr *attr)
 }
 EXPORT_SYMBOL_GPL(switchdev_port_attr_set);
 
+int __switchdev_port_obj_add(struct net_device *dev, struct switchdev_obj *obj)
+{
+	const struct switchdev_ops *ops = dev->switchdev_ops;
+	struct net_device *lower_dev;
+	struct list_head *iter;
+	int err = -EOPNOTSUPP;
+
+	if (ops && ops->switchdev_port_obj_add)
+		return ops->switchdev_port_obj_add(dev, obj);
+
+	/* Switch device port(s) may be stacked under
+	 * bond/team/vlan dev, so recurse down to add object on
+	 * each port.
+	 */
+
+	netdev_for_each_lower_dev(dev, lower_dev, iter) {
+		err = __switchdev_port_obj_add(lower_dev, obj);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+/**
+ *	switchdev_port_obj_add - Add port object
+ *
+ *	@dev: port device
+ *	@obj: object to add
+ *
+ *	Use a 2-phase prepare-commit transaction model to ensure
+ *	system is not left in a partially updated state due to
+ *	failure from driver/device.
+ *
+ *	rtnl_lock must be held.
+ */
+int switchdev_port_obj_add(struct net_device *dev, struct switchdev_obj *obj)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	/* Phase I: prepare for obj add. Driver/device should fail
+	 * here if there are going to be issues in the commit phase,
+	 * such as lack of resources or support.  The driver/device
+	 * should reserve resources needed for the commit phase here,
+	 * but should not commit the obj.
+	 */
+
+	obj->trans = SWITCHDEV_TRANS_PREPARE;
+	err = __switchdev_port_obj_add(dev, obj);
+	if (err) {
+		/* Prepare phase failed: abort the transaction.  Any
+		 * resources reserved in the prepare phase are
+		 * released.
+		 */
+
+		obj->trans = SWITCHDEV_TRANS_ABORT;
+		__switchdev_port_obj_add(dev, obj);
+
+		return err;
+	}
+
+	/* Phase II: commit obj add.  This cannot fail as a fault
+	 * of driver/device.  If it does, it's a bug in the driver/device
+	 * because the driver said everythings was OK in phase I.
+	 */
+
+	obj->trans = SWITCHDEV_TRANS_COMMIT;
+	err = __switchdev_port_obj_add(dev, obj);
+	WARN(err, "%s: Commit of object (id=%d) failed.\n", dev->name, obj->id);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(switchdev_port_obj_add);
+
+/**
+ *	switchdev_port_obj_del - Delete port object
+ *
+ *	@dev: port device
+ *	@obj: object to delete
+ */
+int switchdev_port_obj_del(struct net_device *dev, struct switchdev_obj *obj)
+{
+	const struct switchdev_ops *ops = dev->switchdev_ops;
+	struct net_device *lower_dev;
+	struct list_head *iter;
+	int err = -EOPNOTSUPP;
+
+	if (ops && ops->switchdev_port_obj_del)
+		return ops->switchdev_port_obj_del(dev, obj);
+
+	/* Switch device port(s) may be stacked under
+	 * bond/team/vlan dev, so recurse down to delete object on
+	 * each port.
+	 */
+
+	netdev_for_each_lower_dev(dev, lower_dev, iter) {
+		err = switchdev_port_obj_del(lower_dev, obj);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(switchdev_port_obj_del);
+
 static DEFINE_MUTEX(switchdev_mutex);
 static RAW_NOTIFIER_HEAD(switchdev_notif_chain);
 
-- 
cgit v1.2.3


From 6fc3016da7c1587aa59e71f8c4dbc4cf1343eab2 Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sun, 10 May 2015 09:47:53 -0700
Subject: switchdev: add port vlan obj

VLAN obj has flags (PVID and untagged) as well as start and end vid ranges.
The switchdev driver can optimize programing the device using the ranges.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 4f43300df72c..e598c2d45786 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -43,11 +43,19 @@ struct fib_info;
 
 enum switchdev_obj_id {
 	SWITCHDEV_OBJ_UNDEFINED,
+	SWITCHDEV_OBJ_PORT_VLAN,
 };
 
 struct switchdev_obj {
 	enum switchdev_obj_id id;
 	enum switchdev_trans trans;
+	union {
+		struct switchdev_obj_vlan {			/* PORT_VLAN */
+			u16 flags;
+			u16 vid_start;
+			u16 vid_end;
+		} vlan;
+	};
 };
 
 /**
-- 
cgit v1.2.3


From 6004c86718998aee1337efd3b087d6e17284632d Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sun, 10 May 2015 09:47:55 -0700
Subject: switchdev: add bridge port flags attr

rocker: use switchdev get/set attr for bridge port flags

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/rocker/rocker.c | 25 +++++++++++++++++++++++++
 include/net/switchdev.h              |  2 ++
 2 files changed, 27 insertions(+)

(limited to 'include/net')

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 84084806231a..ceb6a6436144 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -4394,6 +4394,9 @@ static int rocker_port_attr_get(struct net_device *dev,
 		attr->ppid.id_len = sizeof(rocker->hw.id);
 		memcpy(&attr->ppid.id, &rocker->hw.id, attr->ppid.id_len);
 		break;
+	case SWITCHDEV_ATTR_PORT_BRIDGE_FLAGS:
+		attr->brport_flags = rocker_port->brport_flags;
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -4411,6 +4414,24 @@ static void rocker_port_trans_abort(struct rocker_port *rocker_port)
 	}
 }
 
+static int rocker_port_brport_flags_set(struct rocker_port *rocker_port,
+					enum switchdev_trans trans,
+					unsigned long brport_flags)
+{
+	unsigned long orig_flags;
+	int err = 0;
+
+	orig_flags = rocker_port->brport_flags;
+	rocker_port->brport_flags = brport_flags;
+	if ((orig_flags ^ rocker_port->brport_flags) & BR_LEARNING)
+		err = rocker_port_set_learning(rocker_port, trans);
+
+	if (trans == SWITCHDEV_TRANS_PREPARE)
+		rocker_port->brport_flags = orig_flags;
+
+	return err;
+}
+
 static int rocker_port_attr_set(struct net_device *dev,
 				struct switchdev_attr *attr)
 {
@@ -4433,6 +4454,10 @@ static int rocker_port_attr_set(struct net_device *dev,
 		err = rocker_port_stp_update(rocker_port, attr->trans,
 					     attr->stp_state);
 		break;
+	case SWITCHDEV_ATTR_PORT_BRIDGE_FLAGS:
+		err = rocker_port_brport_flags_set(rocker_port, attr->trans,
+						   attr->brport_flags);
+		break;
 	default:
 		err = -EOPNOTSUPP;
 		break;
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index e598c2d45786..6cf6de1a90b4 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -27,6 +27,7 @@ enum switchdev_attr_id {
 	SWITCHDEV_ATTR_UNDEFINED,
 	SWITCHDEV_ATTR_PORT_PARENT_ID,
 	SWITCHDEV_ATTR_PORT_STP_STATE,
+	SWITCHDEV_ATTR_PORT_BRIDGE_FLAGS,
 };
 
 struct switchdev_attr {
@@ -36,6 +37,7 @@ struct switchdev_attr {
 	union {
 		struct netdev_phys_item_id ppid;	/* PORT_PARENT_ID */
 		u8 stp_state;				/* PORT_STP_STATE */
+		unsigned long brport_flags;		/* PORT_BRIDGE_FLAGS */
 	};
 };
 
-- 
cgit v1.2.3


From e71f220b342d78cfb8ee9f1b60f1351f7183f2a5 Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sun, 10 May 2015 09:47:58 -0700
Subject: switchdev: remove old switchdev_port_bridge_setlink

New attr-based bridge_setlink can recurse lower devs and recover on err, so
remove old wrapper (including ndo_dflt_switchdev_port_bridge_setlink).

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h   |  9 ---------
 net/switchdev/switchdev.c | 30 ------------------------------
 2 files changed, 39 deletions(-)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 6cf6de1a90b4..ce5ceb2dc677 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -132,8 +132,6 @@ int switchdev_port_bridge_dellink(struct net_device *dev,
 				  struct nlmsghdr *nlh, u16 flags);
 int ndo_dflt_switchdev_port_bridge_dellink(struct net_device *dev,
 					   struct nlmsghdr *nlh, u16 flags);
-int ndo_dflt_switchdev_port_bridge_setlink(struct net_device *dev,
-					   struct nlmsghdr *nlh, u16 flags);
 int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
 			   u8 tos, u8 type, u32 nlflags, u32 tb_id);
 int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
@@ -204,13 +202,6 @@ static inline int ndo_dflt_switchdev_port_bridge_dellink(struct net_device *dev,
 	return 0;
 }
 
-static inline int ndo_dflt_switchdev_port_bridge_setlink(struct net_device *dev,
-							 struct nlmsghdr *nlh,
-							 u16 flags)
-{
-	return 0;
-}
-
 static inline int switchdev_fib_ipv4_add(u32 dst, int dst_len,
 					 struct fib_info *fi,
 					 u8 tos, u8 type,
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index b01791a9b56d..dcdec9de9137 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -536,36 +536,6 @@ int switchdev_port_bridge_dellink(struct net_device *dev,
 }
 EXPORT_SYMBOL_GPL(switchdev_port_bridge_dellink);
 
-/**
- *	ndo_dflt_switchdev_port_bridge_setlink - default ndo bridge setlink
- *						 op for master devices
- *
- *	@dev: port device
- *	@nlh: netlink msg with bridge port attributes
- *	@flags: bridge setlink flags
- *
- *	Notify master device slaves of bridge port attributes
- */
-int ndo_dflt_switchdev_port_bridge_setlink(struct net_device *dev,
-					   struct nlmsghdr *nlh, u16 flags)
-{
-	struct net_device *lower_dev;
-	struct list_head *iter;
-	int ret = 0, err = 0;
-
-	if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD))
-		return ret;
-
-	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		err = switchdev_port_bridge_setlink(lower_dev, nlh, flags);
-		if (err && err != -EOPNOTSUPP)
-			ret = err;
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(ndo_dflt_switchdev_port_bridge_setlink);
-
 /**
  *	ndo_dflt_switchdev_port_bridge_dellink - default ndo bridge dellink
  *						 op for master devices
-- 
cgit v1.2.3


From 5c34e0221423aeabc0b085adc5fccda3f91e2c49 Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sun, 10 May 2015 09:48:00 -0700
Subject: switchdev: add new switchdev_port_bridge_dellink

Same change as setlink.  Provide the wrapper op for SELF ndo_bridge_dellink
and call into the switchdev driver to delete afspec VLANs.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h   |  6 ++++++
 net/switchdev/switchdev.c | 24 ++++++++++++------------
 2 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index ce5ceb2dc677..8ffadca9e760 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -164,6 +164,12 @@ static inline int switchdev_port_obj_del(struct net_device *dev,
 	return -EOPNOTSUPP;
 }
 
+static inline int switchdev_port_bridge_dellink(struct net_device *dev,
+					    struct nlmsghdr *nlh, u16 flags)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int register_switchdev_notifier(struct notifier_block *nb)
 {
 	return 0;
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index dcdec9de9137..8ce678e397b4 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -512,27 +512,27 @@ int switchdev_port_bridge_setlink(struct net_device *dev,
 EXPORT_SYMBOL_GPL(switchdev_port_bridge_setlink);
 
 /**
- *	switchdev_port_bridge_dellink - Notify switch device port of bridge
- *	port attribute delete
+ *	switchdev_port_bridge_dellink - Set bridge port attributes
  *
  *	@dev: port device
- *	@nlh: netlink msg with bridge port attributes
- *	@flags: bridge setlink flags
+ *	@nlh: netlink header
+ *	@flags: netlink flags
  *
- *	Notify switch device port of bridge port attribute delete
+ *	Called for SELF on rtnl_bridge_dellink to set bridge port
+ *	attributes.
  */
 int switchdev_port_bridge_dellink(struct net_device *dev,
 				  struct nlmsghdr *nlh, u16 flags)
 {
-	const struct net_device_ops *ops = dev->netdev_ops;
-
-	if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD))
-		return 0;
+	struct nlattr *afspec;
 
-	if (!ops->ndo_bridge_dellink)
-		return -EOPNOTSUPP;
+	afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg),
+				 IFLA_AF_SPEC);
+	if (afspec)
+		return switchdev_port_br_afspec(dev, afspec,
+						switchdev_port_obj_del);
 
-	return ops->ndo_bridge_dellink(dev, nlh, flags);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(switchdev_port_bridge_dellink);
 
-- 
cgit v1.2.3


From 87a5dae59e7abaad911ab719caa5548dd6df5557 Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sun, 10 May 2015 09:48:02 -0700
Subject: switchdev: remove unused switchdev_port_bridge_dellink

Now we can remove old wrappers for dellink.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h   | 15 ---------------
 net/switchdev/switchdev.c | 30 ------------------------------
 2 files changed, 45 deletions(-)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 8ffadca9e760..397b1e6e04f6 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -130,8 +130,6 @@ int switchdev_port_bridge_setlink(struct net_device *dev,
 				  struct nlmsghdr *nlh, u16 flags);
 int switchdev_port_bridge_dellink(struct net_device *dev,
 				  struct nlmsghdr *nlh, u16 flags);
-int ndo_dflt_switchdev_port_bridge_dellink(struct net_device *dev,
-					   struct nlmsghdr *nlh, u16 flags);
 int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
 			   u8 tos, u8 type, u32 nlflags, u32 tb_id);
 int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
@@ -164,12 +162,6 @@ static inline int switchdev_port_obj_del(struct net_device *dev,
 	return -EOPNOTSUPP;
 }
 
-static inline int switchdev_port_bridge_dellink(struct net_device *dev,
-					    struct nlmsghdr *nlh, u16 flags)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline int register_switchdev_notifier(struct notifier_block *nb)
 {
 	return 0;
@@ -201,13 +193,6 @@ static inline int switchdev_port_bridge_dellink(struct net_device *dev,
 	return -EOPNOTSUPP;
 }
 
-static inline int ndo_dflt_switchdev_port_bridge_dellink(struct net_device *dev,
-							 struct nlmsghdr *nlh,
-							 u16 flags)
-{
-	return 0;
-}
-
 static inline int switchdev_fib_ipv4_add(u32 dst, int dst_len,
 					 struct fib_info *fi,
 					 u8 tos, u8 type,
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 8ce678e397b4..0e15b6f6bb56 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -536,36 +536,6 @@ int switchdev_port_bridge_dellink(struct net_device *dev,
 }
 EXPORT_SYMBOL_GPL(switchdev_port_bridge_dellink);
 
-/**
- *	ndo_dflt_switchdev_port_bridge_dellink - default ndo bridge dellink
- *						 op for master devices
- *
- *	@dev: port device
- *	@nlh: netlink msg with bridge port attributes
- *	@flags: bridge dellink flags
- *
- *	Notify master device slaves of bridge port attribute deletes
- */
-int ndo_dflt_switchdev_port_bridge_dellink(struct net_device *dev,
-					   struct nlmsghdr *nlh, u16 flags)
-{
-	struct net_device *lower_dev;
-	struct list_head *iter;
-	int ret = 0, err = 0;
-
-	if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD))
-		return ret;
-
-	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		err = switchdev_port_bridge_dellink(lower_dev, nlh, flags);
-		if (err && err != -EOPNOTSUPP)
-			ret = err;
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(ndo_dflt_switchdev_port_bridge_dellink);
-
 static struct net_device *switchdev_get_lowest_dev(struct net_device *dev)
 {
 	const struct switchdev_ops *ops = dev->switchdev_ops;
-- 
cgit v1.2.3


From 8793d0a664a8a2c5e18e929c1f995c784c105705 Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sun, 10 May 2015 09:48:04 -0700
Subject: switchdev: add new switchdev_port_bridge_getlink

Like bridge_setlink, add switchdev wrapper to handle bridge_getlink and
call into port driver to get port attrs.  For now, only BR_LEARNING and
BR_LEARNING_SYNC are returned.  To add more, we'll probably want to break
away from ndo_dflt_bridge_getlink() and build the netlink skb directly in
the switchdev code.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h   | 10 ++++++++++
 net/switchdev/switchdev.c | 28 ++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 397b1e6e04f6..e081d67f973a 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -126,6 +126,9 @@ int register_switchdev_notifier(struct notifier_block *nb);
 int unregister_switchdev_notifier(struct notifier_block *nb);
 int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
 			     struct switchdev_notifier_info *info);
+int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+				  struct net_device *dev, u32 filter_mask,
+				  int nlflags);
 int switchdev_port_bridge_setlink(struct net_device *dev,
 				  struct nlmsghdr *nlh, u16 flags);
 int switchdev_port_bridge_dellink(struct net_device *dev,
@@ -179,6 +182,13 @@ static inline int call_switchdev_notifiers(unsigned long val,
 	return NOTIFY_DONE;
 }
 
+static inline int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid,
+					    u32 seq, struct net_device *dev,
+					    u32 filter_mask, int nlflags)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int switchdev_port_bridge_setlink(struct net_device *dev,
 						struct nlmsghdr *nlh,
 						u16 flags)
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 0e15b6f6bb56..9210355ec965 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -358,6 +358,34 @@ int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
 }
 EXPORT_SYMBOL_GPL(call_switchdev_notifiers);
 
+/**
+ *	switchdev_port_bridge_getlink - Get bridge port attributes
+ *
+ *	@dev: port device
+ *
+ *	Called for SELF on rtnl_bridge_getlink to get bridge port
+ *	attributes.
+ */
+int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+				  struct net_device *dev, u32 filter_mask,
+				  int nlflags)
+{
+	struct switchdev_attr attr = {
+		.id = SWITCHDEV_ATTR_PORT_BRIDGE_FLAGS,
+	};
+	u16 mode = BRIDGE_MODE_UNDEF;
+	u32 mask = BR_LEARNING | BR_LEARNING_SYNC;
+	int err;
+
+	err = switchdev_port_attr_get(dev, &attr);
+	if (err)
+		return err;
+
+	return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode,
+				       attr.brport_flags, mask, nlflags);
+}
+EXPORT_SYMBOL_GPL(switchdev_port_bridge_getlink);
+
 static int switchdev_port_br_setflag(struct net_device *dev,
 				     struct nlattr *nlattr,
 				     unsigned long brport_flag)
-- 
cgit v1.2.3


From 58c2cb16b116d7feace621bd6b647bbabacfa225 Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sun, 10 May 2015 09:48:06 -0700
Subject: switchdev: convert fib_ipv4_add/del over to
 switchdev_port_obj_add/del

The IPv4 FIB ops convert nicely to the switchdev objs and we're left with
only four switchdev ops: port get/set and port add/del.  Other objs will
follow, such as FDB.  So go ahead and convert IPv4 FIB over to switchdev
obj for consistency, anticipating more objs to come.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/rocker/rocker.c | 41 +++++++++++-------------------
 include/net/switchdev.h              | 21 ++++++++--------
 net/switchdev/switchdev.c            | 49 +++++++++++++++++++++++-------------
 3 files changed, 55 insertions(+), 56 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 56ee316db1a1..1c906504cfe2 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -4449,6 +4449,7 @@ static int rocker_port_obj_add(struct net_device *dev,
 			       struct switchdev_obj *obj)
 {
 	struct rocker_port *rocker_port = netdev_priv(dev);
+	struct switchdev_obj_ipv4_fib *fib4;
 	int err = 0;
 
 	switch (obj->trans) {
@@ -4467,6 +4468,12 @@ static int rocker_port_obj_add(struct net_device *dev,
 		err = rocker_port_vlans_add(rocker_port, obj->trans,
 					    &obj->vlan);
 		break;
+	case SWITCHDEV_OBJ_IPV4_FIB:
+		fib4 = &obj->ipv4_fib;
+		err = rocker_port_fib_ipv4(rocker_port, obj->trans,
+					   fib4->dst, fib4->dst_len,
+					   fib4->fi, fib4->tb_id, 0);
+		break;
 	default:
 		err = -EOPNOTSUPP;
 		break;
@@ -4508,12 +4515,19 @@ static int rocker_port_obj_del(struct net_device *dev,
 			       struct switchdev_obj *obj)
 {
 	struct rocker_port *rocker_port = netdev_priv(dev);
+	struct switchdev_obj_ipv4_fib *fib4;
 	int err = 0;
 
 	switch (obj->id) {
 	case SWITCHDEV_OBJ_PORT_VLAN:
 		err = rocker_port_vlans_del(rocker_port, &obj->vlan);
 		break;
+	case SWITCHDEV_OBJ_IPV4_FIB:
+		fib4 = &obj->ipv4_fib;
+		err = rocker_port_fib_ipv4(rocker_port, SWITCHDEV_TRANS_NONE,
+					   fib4->dst, fib4->dst_len, fib4->fi,
+					   fib4->tb_id, ROCKER_OP_FLAG_REMOVE);
+		break;
 	default:
 		err = -EOPNOTSUPP;
 		break;
@@ -4522,38 +4536,11 @@ static int rocker_port_obj_del(struct net_device *dev,
 	return err;
 }
 
-static int rocker_port_switchdev_fib_ipv4_add(struct net_device *dev,
-					      __be32 dst, int dst_len,
-					      struct fib_info *fi,
-					      u8 tos, u8 type,
-					      u32 nlflags, u32 tb_id)
-{
-	struct rocker_port *rocker_port = netdev_priv(dev);
-	int flags = 0;
-
-	return rocker_port_fib_ipv4(rocker_port, SWITCHDEV_TRANS_NONE,
-				    dst, dst_len, fi, tb_id, flags);
-}
-
-static int rocker_port_switchdev_fib_ipv4_del(struct net_device *dev,
-					      __be32 dst, int dst_len,
-					      struct fib_info *fi,
-					      u8 tos, u8 type, u32 tb_id)
-{
-	struct rocker_port *rocker_port = netdev_priv(dev);
-	int flags = ROCKER_OP_FLAG_REMOVE;
-
-	return rocker_port_fib_ipv4(rocker_port, SWITCHDEV_TRANS_NONE,
-				    dst, dst_len, fi, tb_id, flags);
-}
-
 static const struct switchdev_ops rocker_port_switchdev_ops = {
 	.switchdev_port_attr_get	= rocker_port_attr_get,
 	.switchdev_port_attr_set	= rocker_port_attr_set,
 	.switchdev_port_obj_add		= rocker_port_obj_add,
 	.switchdev_port_obj_del		= rocker_port_obj_del,
-	.switchdev_fib_ipv4_add		= rocker_port_switchdev_fib_ipv4_add,
-	.switchdev_fib_ipv4_del		= rocker_port_switchdev_fib_ipv4_del,
 };
 
 /********************
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index e081d67f973a..3b217b4cca27 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -46,6 +46,7 @@ struct fib_info;
 enum switchdev_obj_id {
 	SWITCHDEV_OBJ_UNDEFINED,
 	SWITCHDEV_OBJ_PORT_VLAN,
+	SWITCHDEV_OBJ_IPV4_FIB,
 };
 
 struct switchdev_obj {
@@ -57,6 +58,15 @@ struct switchdev_obj {
 			u16 vid_start;
 			u16 vid_end;
 		} vlan;
+		struct switchdev_obj_ipv4_fib {		/* IPV4_FIB */
+			u32 dst;
+			int dst_len;
+			struct fib_info *fi;
+			u8 tos;
+			u8 type;
+			u32 nlflags;
+			u32 tb_id;
+		} ipv4_fib;
 	};
 };
 
@@ -70,10 +80,6 @@ struct switchdev_obj {
  * @switchdev_port_obj_add: Add an object to port (see switchdev_obj).
  *
  * @switchdev_port_obj_del: Delete an object from port (see switchdev_obj).
- *
- * @switchdev_fib_ipv4_add: Called to add/modify IPv4 route to switch device.
- *
- * @switchdev_fib_ipv4_del: Called to delete IPv4 route from switch device.
  */
 struct switchdev_ops {
 	int	(*switchdev_port_attr_get)(struct net_device *dev,
@@ -84,13 +90,6 @@ struct switchdev_ops {
 					  struct switchdev_obj *obj);
 	int	(*switchdev_port_obj_del)(struct net_device *dev,
 					  struct switchdev_obj *obj);
-	int	(*switchdev_fib_ipv4_add)(struct net_device *dev, __be32 dst,
-					  int dst_len, struct fib_info *fi,
-					  u8 tos, u8 type, u32 nlflags,
-					  u32 tb_id);
-	int	(*switchdev_fib_ipv4_del)(struct net_device *dev, __be32 dst,
-					  int dst_len, struct fib_info *fi,
-					  u8 tos, u8 type, u32 tb_id);
 };
 
 enum switchdev_notifier_type {
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 9210355ec965..65d49d4477b9 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -641,8 +641,19 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi)
 int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
 			   u8 tos, u8 type, u32 nlflags, u32 tb_id)
 {
+	struct switchdev_obj fib_obj = {
+		.id = SWITCHDEV_OBJ_IPV4_FIB,
+		.ipv4_fib = {
+			.dst = htonl(dst),
+			.dst_len = dst_len,
+			.fi = fi,
+			.tos = tos,
+			.type = type,
+			.nlflags = nlflags,
+			.tb_id = tb_id,
+		},
+	};
 	struct net_device *dev;
-	const struct switchdev_ops *ops;
 	int err = 0;
 
 	/* Don't offload route if using custom ip rules or if
@@ -660,15 +671,10 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
 	dev = switchdev_get_dev_by_nhs(fi);
 	if (!dev)
 		return 0;
-	ops = dev->switchdev_ops;
-
-	if (ops->switchdev_fib_ipv4_add) {
-		err = ops->switchdev_fib_ipv4_add(dev, htonl(dst), dst_len,
-						  fi, tos, type, nlflags,
-						  tb_id);
-		if (!err)
-			fi->fib_flags |= RTNH_F_EXTERNAL;
-	}
+
+	err = switchdev_port_obj_add(dev, &fib_obj);
+	if (!err)
+		fi->fib_flags |= RTNH_F_EXTERNAL;
 
 	return err;
 }
@@ -689,8 +695,19 @@ EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add);
 int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
 			   u8 tos, u8 type, u32 tb_id)
 {
+	struct switchdev_obj fib_obj = {
+		.id = SWITCHDEV_OBJ_IPV4_FIB,
+		.ipv4_fib = {
+			.dst = htonl(dst),
+			.dst_len = dst_len,
+			.fi = fi,
+			.tos = tos,
+			.type = type,
+			.nlflags = 0,
+			.tb_id = tb_id,
+		},
+	};
 	struct net_device *dev;
-	const struct switchdev_ops *ops;
 	int err = 0;
 
 	if (!(fi->fib_flags & RTNH_F_EXTERNAL))
@@ -699,14 +716,10 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
 	dev = switchdev_get_dev_by_nhs(fi);
 	if (!dev)
 		return 0;
-	ops = dev->switchdev_ops;
 
-	if (ops->switchdev_fib_ipv4_del) {
-		err = ops->switchdev_fib_ipv4_del(dev, htonl(dst), dst_len,
-						  fi, tos, type, tb_id);
-		if (!err)
-			fi->fib_flags &= ~RTNH_F_EXTERNAL;
-	}
+	err = switchdev_port_obj_del(dev, &fib_obj);
+	if (!err)
+		fi->fib_flags &= ~RTNH_F_EXTERNAL;
 
 	return err;
 }
-- 
cgit v1.2.3


From 9449c3cd90472141cf081af88181a56163ff7132 Mon Sep 17 00:00:00 2001
From: Ying Xue <ying.xue@windriver.com>
Date: Tue, 12 May 2015 18:29:44 +0800
Subject: net: make skb_dst_pop routine static

As xfrm_output_one() is the only caller of skb_dst_pop(), we should
make skb_dst_pop() localized.

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h      | 12 ------------
 net/xfrm/xfrm_output.c | 12 ++++++++++++
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dst.h b/include/net/dst.h
index 22aa93f165f1..2bc73f8a00a9 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -349,18 +349,6 @@ static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
 	__skb_tunnel_rx(skb, dev, net);
 }
 
-/* Children define the path of the packet through the
- * Linux networking.  Thus, destinations are stackable.
- */
-
-static inline struct dst_entry *skb_dst_pop(struct sk_buff *skb)
-{
-	struct dst_entry *child = dst_clone(skb_dst(skb)->child);
-
-	skb_dst_drop(skb);
-	return child;
-}
-
 int dst_discard_sk(struct sock *sk, struct sk_buff *skb);
 static inline int dst_discard(struct sk_buff *skb)
 {
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index fbcedbe33190..68ada2ca4b60 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -38,6 +38,18 @@ static int xfrm_skb_check_space(struct sk_buff *skb)
 	return pskb_expand_head(skb, nhead, ntail, GFP_ATOMIC);
 }
 
+/* Children define the path of the packet through the
+ * Linux networking.  Thus, destinations are stackable.
+ */
+
+static struct dst_entry *skb_dst_pop(struct sk_buff *skb)
+{
+	struct dst_entry *child = dst_clone(skb_dst(skb)->child);
+
+	skb_dst_drop(skb);
+	return child;
+}
+
 static int xfrm_output_one(struct sk_buff *skb, int err)
 {
 	struct dst_entry *dst = skb_dst(skb);
-- 
cgit v1.2.3


From 5eb764edee52e837638b8d55ceace2c68e248cd2 Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Tue, 12 May 2015 23:03:53 -0700
Subject: switchdev: align comment with other comments in block

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 3b217b4cca27..9f9a7cc573c3 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -53,7 +53,7 @@ struct switchdev_obj {
 	enum switchdev_obj_id id;
 	enum switchdev_trans trans;
 	union {
-		struct switchdev_obj_vlan {			/* PORT_VLAN */
+		struct switchdev_obj_vlan {		/* PORT_VLAN */
 			u16 flags;
 			u16 vid_start;
 			u16 vid_end;
-- 
cgit v1.2.3


From 42275bd8fcb351f951781d8882f359d25976824b Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Wed, 13 May 2015 11:16:50 -0700
Subject: switchdev: don't use anonymous union on switchdev attr/obj structs

Older gcc versions (e.g.  gcc version 4.4.6) don't like anonymous unions
which was causing build issues on the newly added switchdev attr/obj
structs.  Fix this by using named union on structs.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Reported-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/rocker/rocker.c | 18 ++++++++---------
 include/net/switchdev.h              |  4 ++--
 net/bridge/br_stp.c                  |  2 +-
 net/core/net-sysfs.c                 |  4 ++--
 net/core/rtnetlink.c                 |  3 ++-
 net/dsa/slave.c                      |  6 +++---
 net/switchdev/switchdev.c            | 39 ++++++++++++++++++------------------
 7 files changed, 39 insertions(+), 37 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index ca533936658b..f0a9cb44be6b 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -4339,11 +4339,11 @@ static int rocker_port_attr_get(struct net_device *dev,
 
 	switch (attr->id) {
 	case SWITCHDEV_ATTR_PORT_PARENT_ID:
-		attr->ppid.id_len = sizeof(rocker->hw.id);
-		memcpy(&attr->ppid.id, &rocker->hw.id, attr->ppid.id_len);
+		attr->u.ppid.id_len = sizeof(rocker->hw.id);
+		memcpy(&attr->u.ppid.id, &rocker->hw.id, attr->u.ppid.id_len);
 		break;
 	case SWITCHDEV_ATTR_PORT_BRIDGE_FLAGS:
-		attr->brport_flags = rocker_port->brport_flags;
+		attr->u.brport_flags = rocker_port->brport_flags;
 		break;
 	default:
 		return -EOPNOTSUPP;
@@ -4400,11 +4400,11 @@ static int rocker_port_attr_set(struct net_device *dev,
 	switch (attr->id) {
 	case SWITCHDEV_ATTR_PORT_STP_STATE:
 		err = rocker_port_stp_update(rocker_port, attr->trans,
-					     attr->stp_state);
+					     attr->u.stp_state);
 		break;
 	case SWITCHDEV_ATTR_PORT_BRIDGE_FLAGS:
 		err = rocker_port_brport_flags_set(rocker_port, attr->trans,
-						   attr->brport_flags);
+						   attr->u.brport_flags);
 		break;
 	default:
 		err = -EOPNOTSUPP;
@@ -4466,10 +4466,10 @@ static int rocker_port_obj_add(struct net_device *dev,
 	switch (obj->id) {
 	case SWITCHDEV_OBJ_PORT_VLAN:
 		err = rocker_port_vlans_add(rocker_port, obj->trans,
-					    &obj->vlan);
+					    &obj->u.vlan);
 		break;
 	case SWITCHDEV_OBJ_IPV4_FIB:
-		fib4 = &obj->ipv4_fib;
+		fib4 = &obj->u.ipv4_fib;
 		err = rocker_port_fib_ipv4(rocker_port, obj->trans,
 					   htonl(fib4->dst), fib4->dst_len,
 					   fib4->fi, fib4->tb_id, 0);
@@ -4520,10 +4520,10 @@ static int rocker_port_obj_del(struct net_device *dev,
 
 	switch (obj->id) {
 	case SWITCHDEV_OBJ_PORT_VLAN:
-		err = rocker_port_vlans_del(rocker_port, &obj->vlan);
+		err = rocker_port_vlans_del(rocker_port, &obj->u.vlan);
 		break;
 	case SWITCHDEV_OBJ_IPV4_FIB:
-		fib4 = &obj->ipv4_fib;
+		fib4 = &obj->u.ipv4_fib;
 		err = rocker_port_fib_ipv4(rocker_port, SWITCHDEV_TRANS_NONE,
 					   htonl(fib4->dst), fib4->dst_len,
 					   fib4->fi, fib4->tb_id,
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 9f9a7cc573c3..ea5b1c230d3d 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -38,7 +38,7 @@ struct switchdev_attr {
 		struct netdev_phys_item_id ppid;	/* PORT_PARENT_ID */
 		u8 stp_state;				/* PORT_STP_STATE */
 		unsigned long brport_flags;		/* PORT_BRIDGE_FLAGS */
-	};
+	} u;
 };
 
 struct fib_info;
@@ -67,7 +67,7 @@ struct switchdev_obj {
 			u32 nlflags;
 			u32 tb_id;
 		} ipv4_fib;
-	};
+	} u;
 };
 
 /**
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index b9300da31565..45f1ff113af9 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -41,7 +41,7 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
 {
 	struct switchdev_attr attr = {
 		.id = SWITCHDEV_ATTR_PORT_STP_STATE,
-		.stp_state = state,
+		.u.stp_state = state,
 	};
 	int err;
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 5a9ce96f6d27..18b34d771ed4 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -465,8 +465,8 @@ static ssize_t phys_switch_id_show(struct device *dev,
 
 		ret = switchdev_port_attr_get(netdev, &attr);
 		if (!ret)
-			ret = sprintf(buf, "%*phN\n", attr.ppid.id_len,
-				      attr.ppid.id);
+			ret = sprintf(buf, "%*phN\n", attr.u.ppid.id_len,
+				      attr.u.ppid.id);
 	}
 	rtnl_unlock();
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index c6c6b2c34926..141ccc357e2e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1016,7 +1016,8 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
 		return err;
 	}
 
-	if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.ppid.id_len, attr.ppid.id))
+	if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.u.ppid.id_len,
+		    attr.u.ppid.id))
 		return -EMSGSIZE;
 
 	return 0;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 3fb5210e318c..04ffad311704 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -353,7 +353,7 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
 	switch (attr->id) {
 	case SWITCHDEV_ATTR_PORT_STP_STATE:
 		if (attr->trans == SWITCHDEV_TRANS_COMMIT)
-			ret = dsa_slave_stp_update(dev, attr->stp_state);
+			ret = dsa_slave_stp_update(dev, attr->u.stp_state);
 		break;
 	default:
 		ret = -EOPNOTSUPP;
@@ -408,8 +408,8 @@ static int dsa_slave_port_attr_get(struct net_device *dev,
 
 	switch (attr->id) {
 	case SWITCHDEV_ATTR_PORT_PARENT_ID:
-		attr->ppid.id_len = sizeof(ds->index);
-		memcpy(&attr->ppid.id, &ds->index, attr->ppid.id_len);
+		attr->u.ppid.id_len = sizeof(ds->index);
+		memcpy(&attr->u.ppid.id, &ds->index, attr->u.ppid.id_len);
 		break;
 	default:
 		return -EOPNOTSUPP;
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 77f1b6e3f78e..0409f9b5bdbc 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -383,7 +383,7 @@ int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 		return err;
 
 	return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode,
-				       attr.brport_flags, mask, nlflags);
+				       attr.u.brport_flags, mask, nlflags);
 }
 EXPORT_SYMBOL_GPL(switchdev_port_bridge_getlink);
 
@@ -402,9 +402,9 @@ static int switchdev_port_br_setflag(struct net_device *dev,
 		return err;
 
 	if (flag)
-		attr.brport_flags |= brport_flag;
+		attr.u.brport_flags |= brport_flag;
 	else
-		attr.brport_flags &= ~brport_flag;
+		attr.u.brport_flags &= ~brport_flag;
 
 	return switchdev_port_attr_set(dev, &attr);
 }
@@ -466,6 +466,7 @@ static int switchdev_port_br_afspec(struct net_device *dev,
 	struct switchdev_obj obj = {
 		.id = SWITCHDEV_OBJ_PORT_VLAN,
 	};
+	struct switchdev_obj_vlan *vlan = &obj.u.vlan;
 	int rem;
 	int err;
 
@@ -475,30 +476,30 @@ static int switchdev_port_br_afspec(struct net_device *dev,
 		if (nla_len(attr) != sizeof(struct bridge_vlan_info))
 			return -EINVAL;
 		vinfo = nla_data(attr);
-		obj.vlan.flags = vinfo->flags;
+		vlan->flags = vinfo->flags;
 		if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
-			if (obj.vlan.vid_start)
+			if (vlan->vid_start)
 				return -EINVAL;
-			obj.vlan.vid_start = vinfo->vid;
+			vlan->vid_start = vinfo->vid;
 		} else if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END) {
-			if (!obj.vlan.vid_start)
+			if (!vlan->vid_start)
 				return -EINVAL;
-			obj.vlan.vid_end = vinfo->vid;
-			if (obj.vlan.vid_end <= obj.vlan.vid_start)
+			vlan->vid_end = vinfo->vid;
+			if (vlan->vid_end <= vlan->vid_start)
 				return -EINVAL;
 			err = f(dev, &obj);
 			if (err)
 				return err;
-			memset(&obj.vlan, 0, sizeof(obj.vlan));
+			memset(vlan, 0, sizeof(*vlan));
 		} else {
-			if (obj.vlan.vid_start)
+			if (vlan->vid_start)
 				return -EINVAL;
-			obj.vlan.vid_start = vinfo->vid;
-			obj.vlan.vid_end = vinfo->vid;
+			vlan->vid_start = vinfo->vid;
+			vlan->vid_end = vinfo->vid;
 			err = f(dev, &obj);
 			if (err)
 				return err;
-			memset(&obj.vlan, 0, sizeof(obj.vlan));
+			memset(vlan, 0, sizeof(*vlan));
 		}
 	}
 
@@ -613,10 +614,10 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi)
 			return NULL;
 
 		if (nhsel > 0) {
-			if (prev_attr.ppid.id_len != attr.ppid.id_len)
+			if (prev_attr.u.ppid.id_len != attr.u.ppid.id_len)
 				return NULL;
-			if (memcmp(prev_attr.ppid.id, attr.ppid.id,
-				   attr.ppid.id_len))
+			if (memcmp(prev_attr.u.ppid.id, attr.u.ppid.id,
+				   attr.u.ppid.id_len))
 				return NULL;
 		}
 
@@ -644,7 +645,7 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
 {
 	struct switchdev_obj fib_obj = {
 		.id = SWITCHDEV_OBJ_IPV4_FIB,
-		.ipv4_fib = {
+		.u.ipv4_fib = {
 			.dst = dst,
 			.dst_len = dst_len,
 			.fi = fi,
@@ -698,7 +699,7 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
 {
 	struct switchdev_obj fib_obj = {
 		.id = SWITCHDEV_OBJ_IPV4_FIB,
-		.ipv4_fib = {
+		.u.ipv4_fib = {
 			.dst = dst,
 			.dst_len = dst_len,
 			.fi = fi,
-- 
cgit v1.2.3


From e578d9c02587d57bfa7b560767c698a668a468c6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 May 2015 19:50:41 +0200
Subject: net: sched: use counter to break reclassify loops

Seems all we want here is to avoid endless 'goto reclassify' loop.
tc_classify_compat even resets this counter when something other
than TC_ACT_RECLASSIFY is returned, so this skb-counter doesn't
break hypothetical loops induced by something other than perpetual
TC_ACT_RECLASSIFY return values.

skb_act_clone is now identical to skb_clone, so just use that.

Tested with following (bogus) filter:
tc filter add dev eth0 parent ffff: \
 protocol ip u32 match u32 0 0 police rate 10Kbit burst \
 64000 mtu 1500 action reclassify

Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/tc-actions-env-rules.txt |  4 ----
 include/net/sch_generic.h                         | 15 ---------------
 include/uapi/linux/pkt_cls.h                      |  2 +-
 net/sched/act_mirred.c                            |  2 +-
 net/sched/sch_api.c                               | 12 +++---------
 5 files changed, 5 insertions(+), 30 deletions(-)

(limited to 'include/net')

diff --git a/Documentation/networking/tc-actions-env-rules.txt b/Documentation/networking/tc-actions-env-rules.txt
index 95c71716b2e2..f37814693ad3 100644
--- a/Documentation/networking/tc-actions-env-rules.txt
+++ b/Documentation/networking/tc-actions-env-rules.txt
@@ -8,10 +8,6 @@ For example if your action queues a packet to be processed later,
 or intentionally branches by redirecting a packet, then you need to
 clone the packet.
 
-There are certain fields in the skb tc_verd that need to be reset so we
-avoid loops, etc.  A few are generic enough that skb_act_clone()
-resets them for you, so invoke skb_act_clone() rather than skb_clone().
-
 2) If you munge any packet thou shalt call pskb_expand_head in the case
 someone else is referencing the skb. After that you "own" the skb.
 
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 1b0a2e88ed2b..2738f6f87908 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -739,21 +739,6 @@ static inline u32 qdisc_l2t(struct qdisc_rate_table* rtab, unsigned int pktlen)
 	return rtab->data[slot];
 }
 
-#ifdef CONFIG_NET_CLS_ACT
-static inline struct sk_buff *skb_act_clone(struct sk_buff *skb, gfp_t gfp_mask,
-					    int action)
-{
-	struct sk_buff *n;
-
-	n = skb_clone(skb, gfp_mask);
-
-	if (n) {
-		n->tc_verd = SET_TC_VERD(n->tc_verd, 0);
-	}
-	return n;
-}
-#endif
-
 struct psched_ratecfg {
 	u64	rate_bytes_ps; /* bytes per second */
 	u32	mult;
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 596ffa0c7084..ffc112c8e1c2 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -44,13 +44,13 @@ bits 9,10,11: redirect counter -  redirect TTL. Loop avoidance
 #define TC_OK2MUNGE        _TC_MAKEMASK1(1)
 #define SET_TC_OK2MUNGE(v)   ( TC_OK2MUNGE | (v & ~TC_OK2MUNGE))
 #define CLR_TC_OK2MUNGE(v)   ( v & ~TC_OK2MUNGE)
-#endif
 
 #define S_TC_VERD          _TC_MAKE32(2)
 #define M_TC_VERD          _TC_MAKEMASK(4,S_TC_VERD)
 #define G_TC_VERD(x)       _TC_GETVALUE(x,S_TC_VERD,M_TC_VERD)
 #define V_TC_VERD(x)       _TC_MAKEVALUE(x,S_TC_VERD)
 #define SET_TC_VERD(v,n)   ((V_TC_VERD(n)) | (v & ~M_TC_VERD))
+#endif
 
 #define S_TC_FROM          _TC_MAKE32(6)
 #define M_TC_FROM          _TC_MAKEMASK(2,S_TC_FROM)
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 3f63ceac8e01..a42a3b257226 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -151,7 +151,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 	}
 
 	at = G_TC_AT(skb->tc_verd);
-	skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action);
+	skb2 = skb_clone(skb, GFP_ATOMIC);
 	if (skb2 == NULL)
 		goto out;
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index ad9eed70bc8f..0b74dc0ede9c 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1816,13 +1816,8 @@ int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
 			continue;
 		err = tp->classify(skb, tp, res);
 
-		if (err >= 0) {
-#ifdef CONFIG_NET_CLS_ACT
-			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
-				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
-#endif
+		if (err >= 0)
 			return err;
-		}
 	}
 	return -1;
 }
@@ -1834,23 +1829,22 @@ int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	int err = 0;
 #ifdef CONFIG_NET_CLS_ACT
 	const struct tcf_proto *otp = tp;
+	int limit = 0;
 reclassify:
 #endif
 
 	err = tc_classify_compat(skb, tp, res);
 #ifdef CONFIG_NET_CLS_ACT
 	if (err == TC_ACT_RECLASSIFY) {
-		u32 verd = G_TC_VERD(skb->tc_verd);
 		tp = otp;
 
-		if (verd++ >= MAX_REC_LOOP) {
+		if (unlikely(limit++ >= MAX_REC_LOOP)) {
 			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
 					       tp->q->ops->id,
 					       tp->prio & 0xffff,
 					       ntohs(tp->protocol));
 			return TC_ACT_SHOT;
 		}
-		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
 		goto reclassify;
 	}
 #endif
-- 
cgit v1.2.3


From 1bd758eb1cab2fa5b71a23f9e5d3c8076f4ed650 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 12 May 2015 14:56:07 +0200
Subject: net: change name of flow_dissector header to match the .c file name

add couple of empty lines on the way.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c             |  2 +-
 drivers/net/ethernet/cisco/enic/enic_clsf.c |  2 +-
 include/linux/skbuff.h                      |  2 +-
 include/net/flow_dissector.h                | 68 +++++++++++++++++++++++++++++
 include/net/flow_keys.h                     | 61 --------------------------
 include/net/ip.h                            |  2 +-
 include/net/ipv6.h                          |  2 +-
 net/core/flow_dissector.c                   |  2 +-
 net/sched/cls_flow.c                        |  2 +-
 net/sched/sch_choke.c                       |  2 +-
 10 files changed, 76 insertions(+), 69 deletions(-)
 create mode 100644 include/net/flow_dissector.h
 delete mode 100644 include/net/flow_keys.h

(limited to 'include/net')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index a2e25de98bde..fe7c38721775 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -76,7 +76,7 @@
 #include <net/netns/generic.h>
 #include <net/pkt_sched.h>
 #include <linux/rculist.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
 #include <net/switchdev.h>
 #include <net/bonding.h>
 #include <net/bond_3ad.h>
diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c
index 0be6850be8a2..380f04903a36 100644
--- a/drivers/net/ethernet/cisco/enic/enic_clsf.c
+++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c
@@ -5,7 +5,7 @@
 #include <linux/in.h>
 #include <linux/types.h>
 #include <linux/skbuff.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
 #include "enic_res.h"
 #include "enic_clsf.h"
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c0b574a414e7..e35c2b13d434 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -34,7 +34,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/netdev_features.h>
 #include <linux/sched.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
 
 /* A. Checksumming of received packets by device.
  *
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
new file mode 100644
index 000000000000..5e99a7b3963c
--- /dev/null
+++ b/include/net/flow_dissector.h
@@ -0,0 +1,68 @@
+#ifndef _NET_FLOW_DISSECTOR_H
+#define _NET_FLOW_DISSECTOR_H
+
+/* struct flow_keys:
+ *	@src: source ip address in case of IPv4
+ *	      For IPv6 it contains 32bit hash of src address
+ *	@dst: destination ip address in case of IPv4
+ *	      For IPv6 it contains 32bit hash of dst address
+ *	@ports: port numbers of Transport header
+ *		port16[0]: src port number
+ *		port16[1]: dst port number
+ *	@thoff: Transport header offset
+ *	@n_proto: Network header protocol (eg. IPv4/IPv6)
+ *	@ip_proto: Transport header protocol (eg. TCP/UDP)
+ * All the members, except thoff, are in network byte order.
+ */
+struct flow_keys {
+	/* (src,dst) must be grouped, in the same way than in IP header */
+	__be32 src;
+	__be32 dst;
+	union {
+		__be32 ports;
+		__be16 port16[2];
+	};
+	u16	thoff;
+	__be16	n_proto;
+	u8	ip_proto;
+};
+
+bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
+			void *data, __be16 proto, int nhoff, int hlen);
+
+static inline bool skb_flow_dissect(const struct sk_buff *skb,
+				    struct flow_keys *flow)
+{
+	return __skb_flow_dissect(skb, flow, NULL, 0, 0, 0);
+}
+
+__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
+			    void *data, int hlen_proto);
+
+static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
+					int thoff, u8 ip_proto)
+{
+	return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0);
+}
+
+u32 flow_hash_from_keys(struct flow_keys *keys);
+
+unsigned int flow_get_hlen(const unsigned char *data, unsigned int max_len,
+			   __be16 protocol);
+
+/* struct flow_keys_digest:
+ *
+ * This structure is used to hold a digest of the full flow keys. This is a
+ * larger "hash" of a flow to allow definitively matching specific flows where
+ * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so
+ * that it can by used in CB of skb (see sch_choke for an example).
+ */
+#define FLOW_KEYS_DIGEST_LEN	16
+struct flow_keys_digest {
+	u8	data[FLOW_KEYS_DIGEST_LEN];
+};
+
+void make_flow_keys_digest(struct flow_keys_digest *digest,
+			   const struct flow_keys *flow);
+
+#endif
diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h
deleted file mode 100644
index 6d6ef626811a..000000000000
--- a/include/net/flow_keys.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#ifndef _NET_FLOW_KEYS_H
-#define _NET_FLOW_KEYS_H
-
-/* struct flow_keys:
- *	@src: source ip address in case of IPv4
- *	      For IPv6 it contains 32bit hash of src address
- *	@dst: destination ip address in case of IPv4
- *	      For IPv6 it contains 32bit hash of dst address
- *	@ports: port numbers of Transport header
- *		port16[0]: src port number
- *		port16[1]: dst port number
- *	@thoff: Transport header offset
- *	@n_proto: Network header protocol (eg. IPv4/IPv6)
- *	@ip_proto: Transport header protocol (eg. TCP/UDP)
- * All the members, except thoff, are in network byte order.
- */
-struct flow_keys {
-	/* (src,dst) must be grouped, in the same way than in IP header */
-	__be32 src;
-	__be32 dst;
-	union {
-		__be32 ports;
-		__be16 port16[2];
-	};
-	u16	thoff;
-	__be16	n_proto;
-	u8	ip_proto;
-};
-
-bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
-			void *data, __be16 proto, int nhoff, int hlen);
-static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)
-{
-	return __skb_flow_dissect(skb, flow, NULL, 0, 0, 0);
-}
-__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
-			    void *data, int hlen_proto);
-static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto)
-{
-	return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0);
-}
-u32 flow_hash_from_keys(struct flow_keys *keys);
-unsigned int flow_get_hlen(const unsigned char *data, unsigned int max_len,
-			   __be16 protocol);
-
-/* struct flow_keys_digest:
- *
- * This structure is used to hold a digest of the full flow keys. This is a
- * larger "hash" of a flow to allow definitively matching specific flows where
- * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so
- * that it can by used in CB of skb (see sch_choke for an example).
- */
-#define FLOW_KEYS_DIGEST_LEN	16
-struct flow_keys_digest {
-	u8	data[FLOW_KEYS_DIGEST_LEN];
-};
-
-void make_flow_keys_digest(struct flow_keys_digest *digest,
-			   const struct flow_keys *flow);
-
-#endif
diff --git a/include/net/ip.h b/include/net/ip.h
index d14af7edd197..562eb653aebc 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -31,7 +31,7 @@
 #include <net/route.h>
 #include <net/snmp.h>
 #include <net/flow.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
 
 struct sock;
 
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 53d25ef1699a..9932b86394a9 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -19,7 +19,7 @@
 #include <net/if_inet6.h>
 #include <net/ndisc.h>
 #include <net/flow.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
 #include <net/snmp.h>
 
 #define SIN6_LEN_RFC2133	24
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d3acc4dff4ae..eaa86b539221 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -12,7 +12,7 @@
 #include <linux/if_tunnel.h>
 #include <linux/if_pppox.h>
 #include <linux/ppp_defs.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
 
 /* copy saddr & daddr, possibly using 64bit load/store
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index a620c4e288a5..4c5a92298906 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -26,7 +26,7 @@
 #include <net/pkt_cls.h>
 #include <net/ip.h>
 #include <net/route.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 #include <net/netfilter/nf_conntrack.h>
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index dfe3da75594c..2624e991a2d1 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -18,7 +18,7 @@
 #include <net/pkt_sched.h>
 #include <net/inet_ecn.h>
 #include <net/red.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
 
 /*
    CHOKe stateless AQM for fair bandwidth allocation
-- 
cgit v1.2.3


From b0a31431b4d81fb1ccc36ce64ce3fe6a0aca4031 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 12 May 2015 14:56:08 +0200
Subject: flow_dissector: remove unused function flow_get_hlen declaration

commit 56193d1bce ("net: Add function for parsing the header length out
of linear ethernet frames") added this function declaration but it is
defined nowhere.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 5e99a7b3963c..118ae6926a72 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -47,9 +47,6 @@ static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
 
 u32 flow_hash_from_keys(struct flow_keys *keys);
 
-unsigned int flow_get_hlen(const unsigned char *data, unsigned int max_len,
-			   __be16 protocol);
-
 /* struct flow_keys_digest:
  *
  * This structure is used to hold a digest of the full flow keys. This is a
-- 
cgit v1.2.3


From 10b89ee43e849544eddfe34e535341fc077464ec Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 12 May 2015 14:56:09 +0200
Subject: net: move *skb_get_poff declarations into correct header

Since these functions are defined in flow_dissector.c, move header
declarations from skbuff.h into flow_dissector.h

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 4 ----
 include/net/flow_dissector.h | 3 +++
 net/core/filter.c            | 1 +
 net/ethernet/eth.c           | 1 +
 4 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e35c2b13d434..17607ab9e7a2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3424,10 +3424,6 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
 				     unsigned int transport_len,
 				     __sum16(*skb_chkf)(struct sk_buff *skb));
 
-u32 skb_get_poff(const struct sk_buff *skb);
-u32 __skb_get_poff(const struct sk_buff *skb, void *data,
-		   const struct flow_keys *keys, int hlen);
-
 /**
  * skb_head_is_locked - Determine if the skb->head is locked down
  * @skb: skb to check
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 118ae6926a72..4570ccafe59d 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -46,6 +46,9 @@ static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
 }
 
 u32 flow_hash_from_keys(struct flow_keys *keys);
+u32 skb_get_poff(const struct sk_buff *skb);
+u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+		   const struct flow_keys *keys, int hlen);
 
 /* struct flow_keys_digest:
  *
diff --git a/net/core/filter.c b/net/core/filter.c
index a831f193e2c7..6805717be614 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -36,6 +36,7 @@
 #include <net/netlink.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
+#include <net/flow_dissector.h>
 #include <linux/errno.h>
 #include <linux/timer.h>
 #include <asm/uaccess.h>
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 9045e2a1108f..9332a0ab0698 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -58,6 +58,7 @@
 #include <net/ipv6.h>
 #include <net/ip.h>
 #include <net/dsa.h>
+#include <net/flow_dissector.h>
 #include <linux/uaccess.h>
 
 __setup("ether=", netdev_boot_setup);
-- 
cgit v1.2.3


From 9c684b5083bc191c4b7b189c73d75587e167a474 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 12 May 2015 14:56:11 +0200
Subject: net: move __skb_get_hash function declaration to flow_dissector.h

Since the definition of the function is in flow_dissector.c, it makes
sense to have the declaration in flow_dissector.h

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 1 -
 include/net/flow_dissector.h | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 17607ab9e7a2..ae2d1b7769d8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -918,7 +918,6 @@ skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
 	skb->hash = hash;
 }
 
-void __skb_get_hash(struct sk_buff *skb);
 static inline __u32 skb_get_hash(struct sk_buff *skb)
 {
 	if (!skb->l4_hash && !skb->sw_hash)
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 4570ccafe59d..e4ee761ca8c0 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -46,6 +46,7 @@ static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
 }
 
 u32 flow_hash_from_keys(struct flow_keys *keys);
+void __skb_get_hash(struct sk_buff *skb);
 u32 skb_get_poff(const struct sk_buff *skb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 		   const struct flow_keys *keys, int hlen);
-- 
cgit v1.2.3


From fbff949e3bc7f3f7d9e8b3ef4855ec7138276a25 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 12 May 2015 14:56:15 +0200
Subject: flow_dissector: introduce programable flow_dissector

Introduce dissector infrastructure which allows user to specify which
parts of skb he wants to dissect.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 61 ++++++++++++++++++++++++++++++++++++++++++++
 net/core/flow_dissector.c    | 48 ++++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index e4ee761ca8c0..20239e807f77 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -1,6 +1,64 @@
 #ifndef _NET_FLOW_DISSECTOR_H
 #define _NET_FLOW_DISSECTOR_H
 
+/**
+ * struct flow_dissector_key_basic:
+ * @thoff: Transport header offset
+ * @n_proto: Network header protocol (eg. IPv4/IPv6)
+ * @ip_proto: Transport header protocol (eg. TCP/UDP)
+ */
+struct flow_dissector_key_basic {
+	u16	thoff;
+	__be16	n_proto;
+	u8	ip_proto;
+};
+
+/**
+ * struct flow_dissector_key_addrs:
+ * @src: source ip address in case of IPv4
+ *	 For IPv6 it contains 32bit hash of src address
+ * @dst: destination ip address in case of IPv4
+ *	 For IPv6 it contains 32bit hash of dst address
+ */
+struct flow_dissector_key_addrs {
+	/* (src,dst) must be grouped, in the same way than in IP header */
+	__be32 src;
+	__be32 dst;
+};
+
+/**
+ * flow_dissector_key_tp_ports:
+ *	@ports: port numbers of Transport header
+ *		port16[0]: src port number
+ *		port16[1]: dst port number
+ */
+struct flow_dissector_key_ports {
+	union {
+		__be32 ports;
+		__be16 port16[2];
+	};
+};
+
+enum flow_dissector_key_id {
+	FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
+	FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_addrs */
+	FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */
+	FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */
+
+	FLOW_DISSECTOR_KEY_MAX,
+};
+
+struct flow_dissector_key {
+	enum flow_dissector_key_id key_id;
+	size_t offset; /* offset of struct flow_dissector_key_*
+			  in target the struct */
+};
+
+struct flow_dissector {
+	unsigned int used_keys; /* each bit repesents presence of one key id */
+	unsigned short int offset[FLOW_DISSECTOR_KEY_MAX];
+};
+
 /* struct flow_keys:
  *	@src: source ip address in case of IPv4
  *	      For IPv6 it contains 32bit hash of src address
@@ -27,6 +85,9 @@ struct flow_keys {
 	u8	ip_proto;
 };
 
+void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
+			     const struct flow_dissector_key *key,
+			     unsigned int key_count);
 bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
 			void *data, __be16 proto, int nhoff, int hlen);
 
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d885e282908e..6883e22b8fc4 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1,3 +1,4 @@
+#include <linux/kernel.h>
 #include <linux/skbuff.h>
 #include <linux/export.h>
 #include <linux/ip.h>
@@ -15,6 +16,53 @@
 #include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
 
+static bool skb_flow_dissector_uses_key(struct flow_dissector *flow_dissector,
+					enum flow_dissector_key_id key_id)
+{
+	return flow_dissector->used_keys & (1 << key_id);
+}
+
+static void skb_flow_dissector_set_key(struct flow_dissector *flow_dissector,
+				       enum flow_dissector_key_id key_id)
+{
+	flow_dissector->used_keys |= (1 << key_id);
+}
+
+static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector,
+				       enum flow_dissector_key_id key_id,
+				       void *target_container)
+{
+	return ((char *) target_container) + flow_dissector->offset[key_id];
+}
+
+void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
+			     const struct flow_dissector_key *key,
+			     unsigned int key_count)
+{
+	unsigned int i;
+
+	memset(flow_dissector, 0, sizeof(*flow_dissector));
+
+	for (i = 0; i < key_count; i++, key++) {
+		/* User should make sure that every key target offset is withing
+		 * boundaries of unsigned short.
+		 */
+		BUG_ON(key->offset > USHRT_MAX);
+		BUG_ON(skb_flow_dissector_uses_key(flow_dissector,
+						   key->key_id));
+
+		skb_flow_dissector_set_key(flow_dissector, key->key_id);
+		flow_dissector->offset[key->key_id] = key->offset;
+	}
+
+	/* Ensure that the dissector always includes basic key. That way
+	 * we are able to avoid handling lack of it in fast path.
+	 */
+	BUG_ON(!skb_flow_dissector_uses_key(flow_dissector,
+					    FLOW_DISSECTOR_KEY_BASIC));
+}
+EXPORT_SYMBOL(skb_flow_dissector_init);
+
 /* copy saddr & daddr, possibly using 64bit load/store
  * Equivalent to :	flow->src = iph->saddr;
  *			flow->dst = iph->daddr;
-- 
cgit v1.2.3


From 06635a35d13d42b95422bba6633f175245cc644e Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 12 May 2015 14:56:16 +0200
Subject: flow_dissect: use programable dissector in skb_flow_dissect and
 friends

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c                |  18 +--
 drivers/net/ethernet/cisco/enic/enic_clsf.c    |  27 ++--
 drivers/net/ethernet/cisco/enic/enic_ethtool.c |  10 +-
 drivers/net/hyperv/netvsc_drv.c                |   8 +-
 include/linux/skbuff.h                         |   4 +-
 include/net/flow_dissector.h                   |  63 ++++----
 include/net/ip.h                               |   8 +-
 include/net/ipv6.h                             |   8 +-
 net/core/flow_dissector.c                      | 199 +++++++++++++++++--------
 net/ethernet/eth.c                             |   6 +-
 net/sched/cls_flow.c                           |  20 +--
 net/sched/sch_choke.c                          |   4 +-
 12 files changed, 229 insertions(+), 146 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index fe7c38721775..ef26e0147050 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3051,16 +3051,16 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 	int noff, proto = -1;
 
 	if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23)
-		return skb_flow_dissect(skb, fk);
+		return skb_flow_dissect_flow_keys(skb, fk);
 
-	fk->ports = 0;
+	fk->ports.ports = 0;
 	noff = skb_network_offset(skb);
 	if (skb->protocol == htons(ETH_P_IP)) {
 		if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph))))
 			return false;
 		iph = ip_hdr(skb);
-		fk->src = iph->saddr;
-		fk->dst = iph->daddr;
+		fk->addrs.src = iph->saddr;
+		fk->addrs.dst = iph->daddr;
 		noff += iph->ihl << 2;
 		if (!ip_is_fragment(iph))
 			proto = iph->protocol;
@@ -3068,15 +3068,15 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 		if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph6))))
 			return false;
 		iph6 = ipv6_hdr(skb);
-		fk->src = (__force __be32)ipv6_addr_hash(&iph6->saddr);
-		fk->dst = (__force __be32)ipv6_addr_hash(&iph6->daddr);
+		fk->addrs.src = (__force __be32)ipv6_addr_hash(&iph6->saddr);
+		fk->addrs.dst = (__force __be32)ipv6_addr_hash(&iph6->daddr);
 		noff += sizeof(*iph6);
 		proto = iph6->nexthdr;
 	} else {
 		return false;
 	}
 	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0)
-		fk->ports = skb_flow_get_ports(skb, noff, proto);
+		fk->ports.ports = skb_flow_get_ports(skb, noff, proto);
 
 	return true;
 }
@@ -3102,8 +3102,8 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
 	    bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23)
 		hash = bond_eth_hash(skb);
 	else
-		hash = (__force u32)flow.ports;
-	hash ^= (__force u32)flow.dst ^ (__force u32)flow.src;
+		hash = (__force u32)flow.ports.ports;
+	hash ^= (__force u32)flow.addrs.dst ^ (__force u32)flow.addrs.src;
 	hash ^= (hash >> 16);
 	hash ^= (hash >> 8);
 
diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c
index 380f04903a36..d3d25c7e5b96 100644
--- a/drivers/net/ethernet/cisco/enic/enic_clsf.c
+++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c
@@ -22,7 +22,7 @@ int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq)
 	int res;
 	struct filter data;
 
-	switch (keys->ip_proto) {
+	switch (keys->basic.ip_proto) {
 	case IPPROTO_TCP:
 		data.u.ipv4.protocol = PROTO_TCP;
 		break;
@@ -33,10 +33,10 @@ int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq)
 		return -EPROTONOSUPPORT;
 	};
 	data.type = FILTER_IPV4_5TUPLE;
-	data.u.ipv4.src_addr = ntohl(keys->src);
-	data.u.ipv4.dst_addr = ntohl(keys->dst);
-	data.u.ipv4.src_port = ntohs(keys->port16[0]);
-	data.u.ipv4.dst_port = ntohs(keys->port16[1]);
+	data.u.ipv4.src_addr = ntohl(keys->addrs.src);
+	data.u.ipv4.dst_addr = ntohl(keys->addrs.dst);
+	data.u.ipv4.src_port = ntohs(keys->ports.port16[0]);
+	data.u.ipv4.dst_port = ntohs(keys->ports.port16[1]);
 	data.u.ipv4.flags = FILTER_FIELDS_IPV4_5TUPLE;
 
 	spin_lock_bh(&enic->devcmd_lock);
@@ -158,11 +158,11 @@ static struct enic_rfs_fltr_node *htbl_key_search(struct hlist_head *h,
 	struct enic_rfs_fltr_node *tpos;
 
 	hlist_for_each_entry(tpos, h, node)
-		if (tpos->keys.src == k->src &&
-		    tpos->keys.dst == k->dst &&
-		    tpos->keys.ports == k->ports &&
-		    tpos->keys.ip_proto == k->ip_proto &&
-		    tpos->keys.n_proto == k->n_proto)
+		if (tpos->keys.addrs.src == k->addrs.src &&
+		    tpos->keys.addrs.dst == k->addrs.dst &&
+		    tpos->keys.ports.ports == k->ports.ports &&
+		    tpos->keys.basic.ip_proto == k->basic.ip_proto &&
+		    tpos->keys.basic.n_proto == k->basic.n_proto)
 			return tpos;
 	return NULL;
 }
@@ -177,9 +177,10 @@ int enic_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
 	int res, i;
 
 	enic = netdev_priv(dev);
-	res = skb_flow_dissect(skb, &keys);
-	if (!res || keys.n_proto != htons(ETH_P_IP) ||
-	    (keys.ip_proto != IPPROTO_TCP && keys.ip_proto != IPPROTO_UDP))
+	res = skb_flow_dissect_flow_keys(skb, &keys);
+	if (!res || keys.basic.n_proto != htons(ETH_P_IP) ||
+	    (keys.basic.ip_proto != IPPROTO_TCP &&
+	     keys.basic.ip_proto != IPPROTO_UDP))
 		return -EPROTONOSUPPORT;
 
 	tbl_idx = skb_get_hash_raw(skb) & ENIC_RFS_FLW_MASK;
diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
index 28d9ca675a27..7588f8dcc890 100644
--- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c
+++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
@@ -334,7 +334,7 @@ static int enic_grxclsrule(struct enic *enic, struct ethtool_rxnfc *cmd)
 	n = htbl_fltr_search(enic, (u16)fsp->location);
 	if (!n)
 		return -EINVAL;
-	switch (n->keys.ip_proto) {
+	switch (n->keys.basic.ip_proto) {
 	case IPPROTO_TCP:
 		fsp->flow_type = TCP_V4_FLOW;
 		break;
@@ -346,16 +346,16 @@ static int enic_grxclsrule(struct enic *enic, struct ethtool_rxnfc *cmd)
 		break;
 	}
 
-	fsp->h_u.tcp_ip4_spec.ip4src = n->keys.src;
+	fsp->h_u.tcp_ip4_spec.ip4src = n->keys.addrs.src;
 	fsp->m_u.tcp_ip4_spec.ip4src = (__u32)~0;
 
-	fsp->h_u.tcp_ip4_spec.ip4dst = n->keys.dst;
+	fsp->h_u.tcp_ip4_spec.ip4dst = n->keys.addrs.dst;
 	fsp->m_u.tcp_ip4_spec.ip4dst = (__u32)~0;
 
-	fsp->h_u.tcp_ip4_spec.psrc = n->keys.port16[0];
+	fsp->h_u.tcp_ip4_spec.psrc = n->keys.ports.port16[0];
 	fsp->m_u.tcp_ip4_spec.psrc = (__u16)~0;
 
-	fsp->h_u.tcp_ip4_spec.pdst = n->keys.port16[1];
+	fsp->h_u.tcp_ip4_spec.pdst = n->keys.ports.port16[1];
 	fsp->m_u.tcp_ip4_spec.pdst = (__u16)~0;
 
 	fsp->ring_cookie = n->rq_id;
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 5993c7e2d723..8e5fe888a0ec 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -196,12 +196,12 @@ static bool netvsc_set_hash(u32 *hash, struct sk_buff *skb)
 	struct flow_keys flow;
 	int data_len;
 
-	if (!skb_flow_dissect(skb, &flow) ||
-	    !(flow.n_proto == htons(ETH_P_IP) ||
-	      flow.n_proto == htons(ETH_P_IPV6)))
+	if (!skb_flow_dissect_flow_keys(skb, &flow) ||
+	    !(flow.basic.n_proto == htons(ETH_P_IP) ||
+	      flow.basic.n_proto == htons(ETH_P_IPV6)))
 		return false;
 
-	if (flow.ip_proto == IPPROTO_TCP)
+	if (flow.basic.ip_proto == IPPROTO_TCP)
 		data_len = 12;
 	else
 		data_len = 8;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b01c7fba7c17..f83aa6568cbf 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1935,8 +1935,8 @@ static inline void skb_probe_transport_header(struct sk_buff *skb,
 
 	if (skb_transport_header_was_set(skb))
 		return;
-	else if (skb_flow_dissect(skb, &keys))
-		skb_set_transport_header(skb, keys.thoff);
+	else if (skb_flow_dissect_flow_keys(skb, &keys))
+		skb_set_transport_header(skb, keys.basic.thoff);
 	else
 		skb_set_transport_header(skb, offset_hint);
 }
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 20239e807f77..0c8d406fb730 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -59,42 +59,47 @@ struct flow_dissector {
 	unsigned short int offset[FLOW_DISSECTOR_KEY_MAX];
 };
 
-/* struct flow_keys:
- *	@src: source ip address in case of IPv4
- *	      For IPv6 it contains 32bit hash of src address
- *	@dst: destination ip address in case of IPv4
- *	      For IPv6 it contains 32bit hash of dst address
- *	@ports: port numbers of Transport header
- *		port16[0]: src port number
- *		port16[1]: dst port number
- *	@thoff: Transport header offset
- *	@n_proto: Network header protocol (eg. IPv4/IPv6)
- *	@ip_proto: Transport header protocol (eg. TCP/UDP)
- * All the members, except thoff, are in network byte order.
- */
-struct flow_keys {
-	/* (src,dst) must be grouped, in the same way than in IP header */
-	__be32 src;
-	__be32 dst;
-	union {
-		__be32 ports;
-		__be16 port16[2];
-	};
-	u16	thoff;
-	__be16	n_proto;
-	u8	ip_proto;
-};
-
 void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 			     const struct flow_dissector_key *key,
 			     unsigned int key_count);
-bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
+
+bool __skb_flow_dissect(const struct sk_buff *skb,
+			struct flow_dissector *flow_dissector,
+			void *target_container,
 			void *data, __be16 proto, int nhoff, int hlen);
 
 static inline bool skb_flow_dissect(const struct sk_buff *skb,
-				    struct flow_keys *flow)
+				    struct flow_dissector *flow_dissector,
+				    void *target_container)
+{
+	return __skb_flow_dissect(skb, flow_dissector, target_container,
+				  NULL, 0, 0, 0);
+}
+
+struct flow_keys {
+	struct flow_dissector_key_addrs addrs;
+	struct flow_dissector_key_ports ports;
+	struct flow_dissector_key_basic basic;
+};
+
+extern struct flow_dissector flow_keys_dissector;
+extern struct flow_dissector flow_keys_buf_dissector;
+
+static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
+					      struct flow_keys *flow)
+{
+	memset(flow, 0, sizeof(*flow));
+	return __skb_flow_dissect(skb, &flow_keys_dissector, flow,
+				  NULL, 0, 0, 0);
+}
+
+static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow,
+						  void *data, __be16 proto,
+						  int nhoff, int hlen)
 {
-	return __skb_flow_dissect(skb, flow, NULL, 0, 0, 0);
+	memset(flow, 0, sizeof(*flow));
+	return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow,
+				  data, proto, nhoff, hlen);
 }
 
 __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
diff --git a/include/net/ip.h b/include/net/ip.h
index 562eb653aebc..b0443d4fe13f 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -360,10 +360,10 @@ static inline void inet_set_txhash(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 	struct flow_keys keys;
 
-	keys.src = inet->inet_saddr;
-	keys.dst = inet->inet_daddr;
-	keys.port16[0] = inet->inet_sport;
-	keys.port16[1] = inet->inet_dport;
+	keys.addrs.src = inet->inet_saddr;
+	keys.addrs.dst = inet->inet_daddr;
+	keys.ports.port16[0] = inet->inet_sport;
+	keys.ports.port16[1] = inet->inet_dport;
 
 	sk->sk_txhash = flow_hash_from_keys(&keys);
 }
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 9932b86394a9..9eed9761dfce 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -698,10 +698,10 @@ static inline void ip6_set_txhash(struct sock *sk)
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct flow_keys keys;
 
-	keys.src = (__force __be32)ipv6_addr_hash(&np->saddr);
-	keys.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr);
-	keys.port16[0] = inet->inet_sport;
-	keys.port16[1] = inet->inet_dport;
+	keys.addrs.src = (__force __be32)ipv6_addr_hash(&np->saddr);
+	keys.addrs.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr);
+	keys.ports.port16[0] = inet->inet_sport;
+	keys.ports.port16[1] = inet->inet_dport;
 
 	sk->sk_txhash = flow_hash_from_keys(&keys);
 }
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 6883e22b8fc4..6a49acaa6651 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -13,6 +13,7 @@
 #include <linux/if_tunnel.h>
 #include <linux/if_pppox.h>
 #include <linux/ppp_defs.h>
+#include <linux/stddef.h>
 #include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
 
@@ -63,17 +64,6 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 }
 EXPORT_SYMBOL(skb_flow_dissector_init);
 
-/* copy saddr & daddr, possibly using 64bit load/store
- * Equivalent to :	flow->src = iph->saddr;
- *			flow->dst = iph->daddr;
- */
-static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph)
-{
-	BUILD_BUG_ON(offsetof(typeof(*flow), dst) !=
-		     offsetof(typeof(*flow), src) + sizeof(flow->src));
-	memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));
-}
-
 /**
  * __skb_flow_get_ports - extract the upper layer ports and return them
  * @skb: sk_buff to extract the ports from
@@ -111,17 +101,27 @@ EXPORT_SYMBOL(__skb_flow_get_ports);
 /**
  * __skb_flow_dissect - extract the flow_keys struct and return it
  * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
+ * @flow_dissector: list of keys to dissect
+ * @target_container: target structure to put dissected values into
  * @data: raw buffer pointer to the packet, if NULL use skb->data
  * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
  * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
  * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
  *
- * The function will try to retrieve the struct flow_keys from either the skbuff
- * or a raw buffer specified by the rest parameters
+ * The function will try to retrieve individual keys into target specified
+ * by flow_dissector from either the skbuff or a raw buffer specified by the
+ * rest parameters.
+ *
+ * Caller must take care of zeroing target container memory.
  */
-bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
+bool __skb_flow_dissect(const struct sk_buff *skb,
+			struct flow_dissector *flow_dissector,
+			void *target_container,
 			void *data, __be16 proto, int nhoff, int hlen)
 {
+	struct flow_dissector_key_basic *key_basic;
+	struct flow_dissector_key_addrs *key_addrs;
+	struct flow_dissector_key_ports *key_ports;
 	u8 ip_proto;
 
 	if (!data) {
@@ -131,7 +131,12 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
 		hlen = skb_headlen(skb);
 	}
 
-	memset(flow, 0, sizeof(*flow));
+	/* It is ensured by skb_flow_dissector_init() that basic key will
+	 * be always present.
+	 */
+	key_basic = skb_flow_dissector_target(flow_dissector,
+					      FLOW_DISSECTOR_KEY_BASIC,
+					      target_container);
 
 again:
 	switch (proto) {
@@ -148,14 +153,13 @@ ip:
 		if (ip_is_fragment(iph))
 			ip_proto = 0;
 
-		/* skip the address processing if skb is NULL.  The assumption
-		 * here is that if there is no skb we are not looking for flow
-		 * info but lengths and protocols.
-		 */
-		if (!skb)
+		if (!skb_flow_dissector_uses_key(flow_dissector,
+						 FLOW_DISSECTOR_KEY_IPV4_ADDRS))
 			break;
-
-		iph_to_flow_copy_addrs(flow, iph);
+		key_addrs = skb_flow_dissector_target(flow_dissector,
+						      FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+						      target_container);
+		memcpy(key_addrs, &iph->saddr, sizeof(*key_addrs));
 		break;
 	}
 	case htons(ETH_P_IPV6): {
@@ -171,12 +175,15 @@ ipv6:
 		ip_proto = iph->nexthdr;
 		nhoff += sizeof(struct ipv6hdr);
 
-		/* see comment above in IPv4 section */
-		if (!skb)
+		if (!skb_flow_dissector_uses_key(flow_dissector,
+						 FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS))
 			break;
+		key_addrs = skb_flow_dissector_target(flow_dissector,
+						      FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS,
+						      target_container);
 
-		flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
-		flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
+		key_addrs->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
+		key_addrs->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
 
 		flow_label = ip6_flowlabel(iph);
 		if (flow_label) {
@@ -184,10 +191,18 @@ ipv6:
 			 * use that to represent the ports without any
 			 * further dissection.
 			 */
-			flow->n_proto = proto;
-			flow->ip_proto = ip_proto;
-			flow->ports = flow_label;
-			flow->thoff = (u16)nhoff;
+
+			key_basic->n_proto = proto;
+			key_basic->ip_proto = ip_proto;
+			key_basic->thoff = (u16)nhoff;
+
+			if (!skb_flow_dissector_uses_key(flow_dissector,
+							 FLOW_DISSECTOR_KEY_PORTS))
+				break;
+			key_ports = skb_flow_dissector_target(flow_dissector,
+							      FLOW_DISSECTOR_KEY_PORTS,
+							      target_container);
+			key_ports->ports = flow_label;
 
 			return true;
 		}
@@ -234,14 +249,22 @@ ipv6:
 		hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
 		if (!hdr)
 			return false;
-		flow->src = hdr->srcnode;
-		flow->dst = 0;
-		flow->n_proto = proto;
-		flow->thoff = (u16)nhoff;
+		key_basic->n_proto = proto;
+		key_basic->thoff = (u16)nhoff;
+
+		if (skb_flow_dissector_uses_key(flow_dissector,
+						FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) {
+			return true;
+			key_addrs = skb_flow_dissector_target(flow_dissector,
+							      FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS,
+							      target_container);
+			key_addrs->src = hdr->srcnode;
+			key_addrs->dst = 0;
+		}
 		return true;
 	}
 	case htons(ETH_P_FCOE):
-		flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
+		key_basic->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
 		/* fall through */
 	default:
 		return false;
@@ -296,14 +319,24 @@ ipv6:
 		break;
 	}
 
-	flow->n_proto = proto;
-	flow->ip_proto = ip_proto;
-	flow->thoff = (u16) nhoff;
-
-	/* unless skb is set we don't need to record port info */
-	if (skb)
-		flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
-						   data, hlen);
+	/* It is ensured by skb_flow_dissector_init() that basic key will
+	 * be always present.
+	 */
+	key_basic = skb_flow_dissector_target(flow_dissector,
+					      FLOW_DISSECTOR_KEY_BASIC,
+					      target_container);
+	key_basic->n_proto = proto;
+	key_basic->ip_proto = ip_proto;
+	key_basic->thoff = (u16) nhoff;
+
+	if (skb_flow_dissector_uses_key(flow_dissector,
+					FLOW_DISSECTOR_KEY_PORTS)) {
+		key_ports = skb_flow_dissector_target(flow_dissector,
+						      FLOW_DISSECTOR_KEY_PORTS,
+						      target_container);
+		key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
+							data, hlen);
+	}
 
 	return true;
 }
@@ -325,16 +358,16 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
 	u32 hash;
 
 	/* get a consistent hash (same value on both flow directions) */
-	if (((__force u32)keys->dst < (__force u32)keys->src) ||
-	    (((__force u32)keys->dst == (__force u32)keys->src) &&
-	     ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) {
-		swap(keys->dst, keys->src);
-		swap(keys->port16[0], keys->port16[1]);
+	if (((__force u32)keys->addrs.dst < (__force u32)keys->addrs.src) ||
+	    (((__force u32)keys->addrs.dst == (__force u32)keys->addrs.src) &&
+	     ((__force u16)keys->ports.port16[1] < (__force u16)keys->ports.port16[0]))) {
+		swap(keys->addrs.dst, keys->addrs.src);
+		swap(keys->ports.port16[0], keys->ports.port16[1]);
 	}
 
-	hash = __flow_hash_3words((__force u32)keys->dst,
-				  (__force u32)keys->src,
-				  (__force u32)keys->ports,
+	hash = __flow_hash_3words((__force u32)keys->addrs.dst,
+				  (__force u32)keys->addrs.src,
+				  (__force u32)keys->ports.ports,
 				  keyval);
 	if (!hash)
 		hash = 1;
@@ -352,7 +385,7 @@ EXPORT_SYMBOL(flow_hash_from_keys);
 static inline u32 ___skb_get_hash(const struct sk_buff *skb,
 				  struct flow_keys *keys, u32 keyval)
 {
-	if (!skb_flow_dissect(skb, keys))
+	if (!skb_flow_dissect_flow_keys(skb, keys))
 		return 0;
 
 	return __flow_hash_from_keys(keys, keyval);
@@ -377,11 +410,11 @@ void make_flow_keys_digest(struct flow_keys_digest *digest,
 
 	memset(digest, 0, sizeof(*digest));
 
-	data->n_proto = flow->n_proto;
-	data->ip_proto = flow->ip_proto;
-	data->ports = flow->ports;
-	data->src = flow->src;
-	data->dst = flow->dst;
+	data->n_proto = flow->basic.n_proto;
+	data->ip_proto = flow->basic.ip_proto;
+	data->ports = flow->ports.ports;
+	data->src = flow->addrs.src;
+	data->dst = flow->addrs.dst;
 }
 EXPORT_SYMBOL(make_flow_keys_digest);
 
@@ -404,7 +437,7 @@ void __skb_get_hash(struct sk_buff *skb)
 	hash = ___skb_get_hash(skb, &keys, hashrnd);
 	if (!hash)
 		return;
-	if (keys.ports)
+	if (keys.ports.ports)
 		skb->l4_hash = 1;
 	skb->sw_hash = 1;
 	skb->hash = hash;
@@ -422,9 +455,9 @@ EXPORT_SYMBOL(skb_get_hash_perturb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 		   const struct flow_keys *keys, int hlen)
 {
-	u32 poff = keys->thoff;
+	u32 poff = keys->basic.thoff;
 
-	switch (keys->ip_proto) {
+	switch (keys->basic.ip_proto) {
 	case IPPROTO_TCP: {
 		/* access doff as u8 to avoid unaligned access */
 		const u8 *doff;
@@ -478,8 +511,52 @@ u32 skb_get_poff(const struct sk_buff *skb)
 {
 	struct flow_keys keys;
 
-	if (!skb_flow_dissect(skb, &keys))
+	if (!skb_flow_dissect_flow_keys(skb, &keys))
 		return 0;
 
 	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
 }
+
+static const struct flow_dissector_key flow_keys_dissector_keys[] = {
+	{
+		.key_id = FLOW_DISSECTOR_KEY_BASIC,
+		.offset = offsetof(struct flow_keys, basic),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+		.offset = offsetof(struct flow_keys, addrs),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS,
+		.offset = offsetof(struct flow_keys, addrs),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_PORTS,
+		.offset = offsetof(struct flow_keys, ports),
+	},
+};
+
+static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
+	{
+		.key_id = FLOW_DISSECTOR_KEY_BASIC,
+		.offset = offsetof(struct flow_keys, basic),
+	},
+};
+
+struct flow_dissector flow_keys_dissector __read_mostly;
+EXPORT_SYMBOL(flow_keys_dissector);
+
+struct flow_dissector flow_keys_buf_dissector __read_mostly;
+
+static int __init init_default_flow_dissectors(void)
+{
+	skb_flow_dissector_init(&flow_keys_dissector,
+				flow_keys_dissector_keys,
+				ARRAY_SIZE(flow_keys_dissector_keys));
+	skb_flow_dissector_init(&flow_keys_buf_dissector,
+				flow_keys_buf_dissector_keys,
+				ARRAY_SIZE(flow_keys_buf_dissector_keys));
+	return 0;
+}
+
+late_initcall_sync(init_default_flow_dissectors);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 9332a0ab0698..c3325bd2f3fb 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -131,9 +131,9 @@ u32 eth_get_headlen(void *data, unsigned int len)
 		return len;
 
 	/* parse any remaining L2/L3 headers, check for L4 */
-	if (!__skb_flow_dissect(NULL, &keys, data,
-				eth->h_proto, sizeof(*eth), len))
-		return max_t(u32, keys.thoff, sizeof(*eth));
+	if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto,
+					    sizeof(*eth), len))
+		return max_t(u32, keys.basic.thoff, sizeof(*eth));
 
 	/* parse for any L4 headers */
 	return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len);
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 4c5a92298906..4b3e3e30bf4d 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -68,35 +68,35 @@ static inline u32 addr_fold(void *addr)
 
 static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow)
 {
-	if (flow->src)
-		return ntohl(flow->src);
+	if (flow->addrs.src)
+		return ntohl(flow->addrs.src);
 	return addr_fold(skb->sk);
 }
 
 static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
 {
-	if (flow->dst)
-		return ntohl(flow->dst);
+	if (flow->addrs.dst)
+		return ntohl(flow->addrs.dst);
 	return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
 }
 
 static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow)
 {
-	return flow->ip_proto;
+	return flow->basic.ip_proto;
 }
 
 static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
 {
-	if (flow->ports)
-		return ntohs(flow->port16[0]);
+	if (flow->ports.ports)
+		return ntohs(flow->ports.port16[0]);
 
 	return addr_fold(skb->sk);
 }
 
 static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
 {
-	if (flow->ports)
-		return ntohs(flow->port16[1]);
+	if (flow->ports.ports)
+		return ntohs(flow->ports.port16[1]);
 
 	return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
 }
@@ -295,7 +295,7 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 
 		keymask = f->keymask;
 		if (keymask & FLOW_KEYS_NEEDED)
-			skb_flow_dissect(skb, &flow_keys);
+			skb_flow_dissect_flow_keys(skb, &flow_keys);
 
 		for (n = 0; n < f->nkeys; n++) {
 			key = ffs(keymask) - 1;
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 2624e991a2d1..93d5742dc7e0 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -170,13 +170,13 @@ static bool choke_match_flow(struct sk_buff *skb1,
 
 	if (!choke_skb_cb(skb1)->keys_valid) {
 		choke_skb_cb(skb1)->keys_valid = 1;
-		skb_flow_dissect(skb1, &temp);
+		skb_flow_dissect_flow_keys(skb1, &temp);
 		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
 	}
 
 	if (!choke_skb_cb(skb2)->keys_valid) {
 		choke_skb_cb(skb2)->keys_valid = 1;
-		skb_flow_dissect(skb2, &temp);
+		skb_flow_dissect_flow_keys(skb2, &temp);
 		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
 	}
 
-- 
cgit v1.2.3


From c3f8eaeb6ea501bd0e2e424f5dfecf952b12a06f Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 12 May 2015 14:56:17 +0200
Subject: flow_dissector: add missing header includes

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 0c8d406fb730..1add8918a155 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -1,6 +1,9 @@
 #ifndef _NET_FLOW_DISSECTOR_H
 #define _NET_FLOW_DISSECTOR_H
 
+#include <linux/types.h>
+#include <linux/skbuff.h>
+
 /**
  * struct flow_dissector_key_basic:
  * @thoff: Transport header offset
-- 
cgit v1.2.3


From b924933cbbfbdcaa2831a39780c116ec6e48c397 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 12 May 2015 14:56:18 +0200
Subject: flow_dissector: introduce support for ipv6 addressses

So far, only hashes made out of ipv6 addresses could be dissected. This
patch introduces support for dissection of full ipv6 addresses.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 13 +++++++++++++
 net/core/flow_dissector.c    | 29 +++++++++++++++++++++--------
 2 files changed, 34 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 1add8918a155..586b12306a52 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -3,6 +3,7 @@
 
 #include <linux/types.h>
 #include <linux/skbuff.h>
+#include <linux/in6.h>
 
 /**
  * struct flow_dissector_key_basic:
@@ -42,11 +43,23 @@ struct flow_dissector_key_ports {
 	};
 };
 
+/**
+ * struct flow_dissector_key_ipv6_addrs:
+ * @src: source ip address
+ * @dst: destination ip address
+ */
+struct flow_dissector_key_ipv6_addrs {
+	/* (src,dst) must be grouped, in the same way than in IP header */
+	struct in6_addr src;
+	struct in6_addr dst;
+};
+
 enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
 	FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_addrs */
 	FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */
 	FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */
+	FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 6a49acaa6651..1b95d5ccc9d6 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -175,16 +175,29 @@ ipv6:
 		ip_proto = iph->nexthdr;
 		nhoff += sizeof(struct ipv6hdr);
 
-		if (!skb_flow_dissector_uses_key(flow_dissector,
-						 FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS))
-			break;
-		key_addrs = skb_flow_dissector_target(flow_dissector,
-						      FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS,
-						      target_container);
+		if (skb_flow_dissector_uses_key(flow_dissector,
+						FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) {
+			key_addrs = skb_flow_dissector_target(flow_dissector,
+							      FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS,
+							      target_container);
 
-		key_addrs->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
-		key_addrs->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
+			key_addrs->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
+			key_addrs->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
+			goto flow_label;
+		}
+		if (skb_flow_dissector_uses_key(flow_dissector,
+						FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
+			struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs;
 
+			key_ipv6_addrs = skb_flow_dissector_target(flow_dissector,
+								   FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+								   target_container);
+
+			memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs));
+			goto flow_label;
+		}
+		break;
+flow_label:
 		flow_label = ip6_flowlabel(iph);
 		if (flow_label) {
 			/* Awesome, IPv6 packet has a flow label so we can
-- 
cgit v1.2.3


From 67a900cc0436d74e7ff89042371760def087680d Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 12 May 2015 14:56:19 +0200
Subject: flow_dissector: introduce support for Ethernet addresses

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 13 +++++++++++++
 net/core/flow_dissector.c    | 12 ++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 586b12306a52..5eac9870689c 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -4,6 +4,7 @@
 #include <linux/types.h>
 #include <linux/skbuff.h>
 #include <linux/in6.h>
+#include <uapi/linux/if_ether.h>
 
 /**
  * struct flow_dissector_key_basic:
@@ -54,12 +55,24 @@ struct flow_dissector_key_ipv6_addrs {
 	struct in6_addr dst;
 };
 
+/**
+ * struct flow_dissector_key_eth_addrs:
+ * @src: source Ethernet address
+ * @dst: destination Ethernet address
+ */
+struct flow_dissector_key_eth_addrs {
+	/* (dst,src) must be grouped, in the same way than in ETH header */
+	unsigned char dst[ETH_ALEN];
+	unsigned char src[ETH_ALEN];
+};
+
 enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
 	FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_addrs */
 	FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */
 	FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */
 	FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
+	FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 1b95d5ccc9d6..7a0b391114a5 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -14,6 +14,7 @@
 #include <linux/if_pppox.h>
 #include <linux/ppp_defs.h>
 #include <linux/stddef.h>
+#include <linux/if_ether.h>
 #include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
 
@@ -138,6 +139,17 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 					      FLOW_DISSECTOR_KEY_BASIC,
 					      target_container);
 
+	if (skb_flow_dissector_uses_key(flow_dissector,
+					FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
+		struct ethhdr *eth = eth_hdr(skb);
+		struct flow_dissector_key_eth_addrs *key_eth_addrs;
+
+		key_eth_addrs = skb_flow_dissector_target(flow_dissector,
+							  FLOW_DISSECTOR_KEY_ETH_ADDRS,
+							  target_container);
+		memcpy(key_eth_addrs, &eth->h_dest, sizeof(*key_eth_addrs));
+	}
+
 again:
 	switch (proto) {
 	case htons(ETH_P_IP): {
-- 
cgit v1.2.3


From 59346afe7a5548ab3e9730aeff33993faa76abbe Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 12 May 2015 14:56:20 +0200
Subject: flow_dissector: change port array into src, dst tuple

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cisco/enic/enic_clsf.c    | 4 ++--
 drivers/net/ethernet/cisco/enic/enic_ethtool.c | 4 ++--
 include/net/flow_dissector.h                   | 9 ++++++---
 include/net/ip.h                               | 4 ++--
 include/net/ipv6.h                             | 4 ++--
 net/core/flow_dissector.c                      | 4 ++--
 net/sched/cls_flow.c                           | 4 ++--
 7 files changed, 18 insertions(+), 15 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c
index d3d25c7e5b96..6739ebc08c47 100644
--- a/drivers/net/ethernet/cisco/enic/enic_clsf.c
+++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c
@@ -35,8 +35,8 @@ int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq)
 	data.type = FILTER_IPV4_5TUPLE;
 	data.u.ipv4.src_addr = ntohl(keys->addrs.src);
 	data.u.ipv4.dst_addr = ntohl(keys->addrs.dst);
-	data.u.ipv4.src_port = ntohs(keys->ports.port16[0]);
-	data.u.ipv4.dst_port = ntohs(keys->ports.port16[1]);
+	data.u.ipv4.src_port = ntohs(keys->ports.src);
+	data.u.ipv4.dst_port = ntohs(keys->ports.dst);
 	data.u.ipv4.flags = FILTER_FIELDS_IPV4_5TUPLE;
 
 	spin_lock_bh(&enic->devcmd_lock);
diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
index 7588f8dcc890..117c0968dd0b 100644
--- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c
+++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
@@ -352,10 +352,10 @@ static int enic_grxclsrule(struct enic *enic, struct ethtool_rxnfc *cmd)
 	fsp->h_u.tcp_ip4_spec.ip4dst = n->keys.addrs.dst;
 	fsp->m_u.tcp_ip4_spec.ip4dst = (__u32)~0;
 
-	fsp->h_u.tcp_ip4_spec.psrc = n->keys.ports.port16[0];
+	fsp->h_u.tcp_ip4_spec.psrc = n->keys.ports.src;
 	fsp->m_u.tcp_ip4_spec.psrc = (__u16)~0;
 
-	fsp->h_u.tcp_ip4_spec.pdst = n->keys.ports.port16[1];
+	fsp->h_u.tcp_ip4_spec.pdst = n->keys.ports.dst;
 	fsp->m_u.tcp_ip4_spec.pdst = (__u16)~0;
 
 	fsp->ring_cookie = n->rq_id;
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 5eac9870689c..bac9c1421f58 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -34,13 +34,16 @@ struct flow_dissector_key_addrs {
 /**
  * flow_dissector_key_tp_ports:
  *	@ports: port numbers of Transport header
- *		port16[0]: src port number
- *		port16[1]: dst port number
+ *		src: source port number
+ *		dst: destination port number
  */
 struct flow_dissector_key_ports {
 	union {
 		__be32 ports;
-		__be16 port16[2];
+		struct {
+			__be16 src;
+			__be16 dst;
+		};
 	};
 };
 
diff --git a/include/net/ip.h b/include/net/ip.h
index b0443d4fe13f..0ed6d768e606 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -362,8 +362,8 @@ static inline void inet_set_txhash(struct sock *sk)
 
 	keys.addrs.src = inet->inet_saddr;
 	keys.addrs.dst = inet->inet_daddr;
-	keys.ports.port16[0] = inet->inet_sport;
-	keys.ports.port16[1] = inet->inet_dport;
+	keys.ports.src = inet->inet_sport;
+	keys.ports.dst = inet->inet_dport;
 
 	sk->sk_txhash = flow_hash_from_keys(&keys);
 }
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 9eed9761dfce..aab8190d16e8 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -700,8 +700,8 @@ static inline void ip6_set_txhash(struct sock *sk)
 
 	keys.addrs.src = (__force __be32)ipv6_addr_hash(&np->saddr);
 	keys.addrs.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr);
-	keys.ports.port16[0] = inet->inet_sport;
-	keys.ports.port16[1] = inet->inet_dport;
+	keys.ports.src = inet->inet_sport;
+	keys.ports.dst = inet->inet_dport;
 
 	sk->sk_txhash = flow_hash_from_keys(&keys);
 }
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 7a0b391114a5..204d09c42510 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -385,9 +385,9 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
 	/* get a consistent hash (same value on both flow directions) */
 	if (((__force u32)keys->addrs.dst < (__force u32)keys->addrs.src) ||
 	    (((__force u32)keys->addrs.dst == (__force u32)keys->addrs.src) &&
-	     ((__force u16)keys->ports.port16[1] < (__force u16)keys->ports.port16[0]))) {
+	     ((__force u16)keys->ports.dst < (__force u16)keys->ports.src))) {
 		swap(keys->addrs.dst, keys->addrs.src);
-		swap(keys->ports.port16[0], keys->ports.port16[1]);
+		swap(keys->ports.src, keys->ports.dst);
 	}
 
 	hash = __flow_hash_3words((__force u32)keys->addrs.dst,
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 4b3e3e30bf4d..b4359924846c 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -88,7 +88,7 @@ static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flo
 static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
 {
 	if (flow->ports.ports)
-		return ntohs(flow->ports.port16[0]);
+		return ntohs(flow->ports.src);
 
 	return addr_fold(skb->sk);
 }
@@ -96,7 +96,7 @@ static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys
 static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
 {
 	if (flow->ports.ports)
-		return ntohs(flow->ports.port16[1]);
+		return ntohs(flow->ports.dst);
 
 	return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
 }
-- 
cgit v1.2.3


From 35d32e8fe4ab44180e46a0dd54abea6985398d00 Mon Sep 17 00:00:00 2001
From: "John W. Linville" <linville@tuxdriver.com>
Date: Wed, 13 May 2015 12:57:27 -0400
Subject: geneve: move definition of geneve_hdr() to geneve.h

This is a static inline with identical definitions in multiple places...

Signed-off-by: John W. Linville <linville@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/geneve.h           | 5 +++++
 net/ipv4/geneve.c              | 5 -----
 net/openvswitch/vport-geneve.c | 5 -----
 3 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/geneve.h b/include/net/geneve.h
index 14fb8d3390b4..2a0543a1899d 100644
--- a/include/net/geneve.h
+++ b/include/net/geneve.h
@@ -62,6 +62,11 @@ struct genevehdr {
 	struct geneve_opt options[];
 };
 
+static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
+{
+	return (struct genevehdr *)(udp_hdr(skb) + 1);
+}
+
 #ifdef CONFIG_INET
 struct geneve_sock;
 
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index 8e6a7fe27a4c..001843d41135 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -60,11 +60,6 @@ struct geneve_net {
 
 static int geneve_net_id;
 
-static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
-{
-	return (struct genevehdr *)(udp_hdr(skb) + 1);
-}
-
 static struct geneve_sock *geneve_find_sock(struct net *net,
 					    sa_family_t family, __be16 port)
 {
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index bf02fd5808c9..208c576bd1b6 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -46,11 +46,6 @@ static inline struct geneve_port *geneve_vport(const struct vport *vport)
 	return vport_priv(vport);
 }
 
-static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
-{
-	return (struct genevehdr *)(udp_hdr(skb) + 1);
-}
-
 /* Convert 64 bit tunnel ID to 24 bit VNI. */
 static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
 {
-- 
cgit v1.2.3


From 264ea103a7473f51aced838e68ed384ea2c759f5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 14 May 2015 14:26:56 -0700
Subject: tcp: syncookies: extend validity range

Now we allow storing more request socks per listener, we might
hit syncookie mode less often and hit following bug in our stack :

When we send a burst of syncookies, then exit this mode,
tcp_synq_no_recent_overflow() can return false if the ACK packets coming
from clients are coming three seconds after the end of syncookie
episode.

This is a way too strong requirement and conflicts with rest of
syncookie code which allows ACK to be aged up to 2 minutes.

Perfectly valid ACK packets are dropped just because clients might be
in a crowded wifi environment or on another planet.

So let's fix this, and also change tcp_synq_overflow() to not
dirty a cache line for every syncookie we send, as we are under attack.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Florian Westphal <fw@strlen.de>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 38 ++++++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 14 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b8ea12880fd9..7ace6acbf5fd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -326,18 +326,6 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
 
 bool tcp_check_oom(struct sock *sk, int shift);
 
-/* syncookies: remember time of last synqueue overflow */
-static inline void tcp_synq_overflow(struct sock *sk)
-{
-	tcp_sk(sk)->rx_opt.ts_recent_stamp = jiffies;
-}
-
-/* syncookies: no recent synqueue overflow on this listening socket? */
-static inline bool tcp_synq_no_recent_overflow(const struct sock *sk)
-{
-	unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
-	return time_after(jiffies, last_overflow + TCP_TIMEOUT_FALLBACK);
-}
 
 extern struct proto tcp_prot;
 
@@ -483,13 +471,35 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
  * i.e. a sent cookie is valid only at most for 2*60 seconds (or less if
  * the counter advances immediately after a cookie is generated).
  */
-#define MAX_SYNCOOKIE_AGE 2
+#define MAX_SYNCOOKIE_AGE	2
+#define TCP_SYNCOOKIE_PERIOD	(60 * HZ)
+#define TCP_SYNCOOKIE_VALID	(MAX_SYNCOOKIE_AGE * TCP_SYNCOOKIE_PERIOD)
+
+/* syncookies: remember time of last synqueue overflow
+ * But do not dirty this field too often (once per second is enough)
+ */
+static inline void tcp_synq_overflow(struct sock *sk)
+{
+	unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
+	unsigned long now = jiffies;
+
+	if (time_after(now, last_overflow + HZ))
+		tcp_sk(sk)->rx_opt.ts_recent_stamp = now;
+}
+
+/* syncookies: no recent synqueue overflow on this listening socket? */
+static inline bool tcp_synq_no_recent_overflow(const struct sock *sk)
+{
+	unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
+
+	return time_after(jiffies, last_overflow + TCP_SYNCOOKIE_VALID);
+}
 
 static inline u32 tcp_cookie_time(void)
 {
 	u64 val = get_jiffies_64();
 
-	do_div(val, 60 * HZ);
+	do_div(val, TCP_SYNCOOKIE_PERIOD);
 	return val;
 }
 
-- 
cgit v1.2.3


From d53a2aa3a116609c7db8799da31541c4ba5999eb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 15 May 2015 08:52:19 -0700
Subject: net: fix sparse error in csum_replace4()

make C=2 CF=-D__CHECK_ENDIAN__ net/ipv4/netfilter/nf_nat_l3proto_ipv4.o
  CHECK   net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
include/net/checksum.h:125:64: warning: incorrect type in argument 2 (different base types)
include/net/checksum.h:125:64:    expected restricted __wsum [usertype] addend
include/net/checksum.h:125:64:    got restricted __be32 [usertype] from
include/net/checksum.h:125:71: warning: incorrect type in argument 2 (different base types)
include/net/checksum.h:125:71:    expected restricted __wsum [usertype] addend
include/net/checksum.h:125:71:    got restricted __be32 [usertype] to
include/net/checksum.h:125:64: warning: incorrect type in argument 2 (different base types)
include/net/checksum.h:125:64:    expected restricted __wsum [usertype] addend
include/net/checksum.h:125:64:    got restricted __be32 [usertype] from
include/net/checksum.h:125:71: warning: incorrect type in argument 2 (different base types)
include/net/checksum.h:125:71:    expected restricted __wsum [usertype] addend
include/net/checksum.h:125:71:    got restricted __be32 [usertype] to

Fixes: 4565af0d406b ("net: optimise csum_replace4()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/checksum.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/checksum.h b/include/net/checksum.h
index 0a55ac715077..2d1d73cb773e 100644
--- a/include/net/checksum.h
+++ b/include/net/checksum.h
@@ -122,7 +122,9 @@ static inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum)
 
 static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to)
 {
-	*sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from), to));
+	__wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from);
+
+	*sum = csum_fold(csum_add(tmp, (__force __wsum)to));
 }
 
 /* Implements RFC 1624 (Incremental Internet Checksum)
-- 
cgit v1.2.3


From 1a24e04e4b50939daa3041682b38b82c896ca438 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 15 May 2015 12:39:25 -0700
Subject: net: fix sk_mem_reclaim_partial()

sk_mem_reclaim_partial() goal is to ensure each socket has
one SK_MEM_QUANTUM forward allocation. This is needed both for
performance and better handling of memory pressure situations in
follow up patches.

SK_MEM_QUANTUM is currently a page, but might be reduced to 4096 bytes
as some arches have 64KB pages.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 6 +++---
 net/core/sock.c    | 9 +++++----
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index d882f4c8e438..4581a60636f8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1368,7 +1368,7 @@ static inline struct inode *SOCK_INODE(struct socket *socket)
  * Functions for memory accounting
  */
 int __sk_mem_schedule(struct sock *sk, int size, int kind);
-void __sk_mem_reclaim(struct sock *sk);
+void __sk_mem_reclaim(struct sock *sk, int amount);
 
 #define SK_MEM_QUANTUM ((int)PAGE_SIZE)
 #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
@@ -1409,7 +1409,7 @@ static inline void sk_mem_reclaim(struct sock *sk)
 	if (!sk_has_account(sk))
 		return;
 	if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
-		__sk_mem_reclaim(sk);
+		__sk_mem_reclaim(sk, sk->sk_forward_alloc);
 }
 
 static inline void sk_mem_reclaim_partial(struct sock *sk)
@@ -1417,7 +1417,7 @@ static inline void sk_mem_reclaim_partial(struct sock *sk)
 	if (!sk_has_account(sk))
 		return;
 	if (sk->sk_forward_alloc > SK_MEM_QUANTUM)
-		__sk_mem_reclaim(sk);
+		__sk_mem_reclaim(sk, sk->sk_forward_alloc - 1);
 }
 
 static inline void sk_mem_charge(struct sock *sk, int size)
diff --git a/net/core/sock.c b/net/core/sock.c
index c18738a795b0..29124fcdc42a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2069,12 +2069,13 @@ EXPORT_SYMBOL(__sk_mem_schedule);
 /**
  *	__sk_reclaim - reclaim memory_allocated
  *	@sk: socket
+ *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
  */
-void __sk_mem_reclaim(struct sock *sk)
+void __sk_mem_reclaim(struct sock *sk, int amount)
 {
-	sk_memory_allocated_sub(sk,
-				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
-	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
+	amount >>= SK_MEM_QUANTUM_SHIFT;
+	sk_memory_allocated_sub(sk, amount);
+	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
 
 	if (sk_under_memory_pressure(sk) &&
 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
-- 
cgit v1.2.3


From a6c5ea4ccf0033591e6e476d7a273c0074c07aa7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 15 May 2015 12:39:26 -0700
Subject: tcp: rename sk_forced_wmem_schedule() to sk_forced_mem_schedule()

We plan to use sk_forced_wmem_schedule() in input path as well,
so make it non static and rename it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 2 ++
 net/ipv4/tcp_output.c | 6 ++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7ace6acbf5fd..841691a296dc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -311,6 +311,8 @@ static inline bool tcp_out_of_memory(struct sock *sk)
 	return false;
 }
 
+void sk_forced_mem_schedule(struct sock *sk, int size);
+
 static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
 {
 	struct percpu_counter *ocp = sk->sk_prot->orphan_count;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7386d32cd670..bac1a950d087 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2816,8 +2816,10 @@ begin_fwd:
  * connection tear down and (memory) recovery.
  * Otherwise tcp_send_fin() could be tempted to either delay FIN
  * or even be forced to close flow without any FIN.
+ * In general, we want to allow one skb per socket to avoid hangs
+ * with edge trigger epoll()
  */
-static void sk_forced_wmem_schedule(struct sock *sk, int size)
+void sk_forced_mem_schedule(struct sock *sk, int size)
 {
 	int amt, status;
 
@@ -2864,7 +2866,7 @@ coalesce:
 			return;
 		}
 		skb_reserve(skb, MAX_TCP_HEADER);
-		sk_forced_wmem_schedule(sk, skb->truesize);
+		sk_forced_mem_schedule(sk, skb->truesize);
 		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
 		tcp_init_nondata_skb(skb, tp->write_seq,
 				     TCPHDR_ACK | TCPHDR_FIN);
-- 
cgit v1.2.3


From b8da51ebb1aa93908350f95efae73aecbc2e266c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 15 May 2015 12:39:27 -0700
Subject: tcp: introduce tcp_under_memory_pressure()

Introduce an optimized version of sk_under_memory_pressure()
for TCP. Our intent is to use it in fast paths.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 8 ++++++++
 net/ipv4/tcp_input.c  | 8 ++++----
 net/ipv4/tcp_output.c | 4 ++--
 net/ipv4/tcp_timer.c  | 2 +-
 4 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 841691a296dc..0d85223efa4c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -286,6 +286,14 @@ extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
 extern int tcp_memory_pressure;
 
+/* optimized version of sk_under_memory_pressure() for TCP sockets */
+static inline bool tcp_under_memory_pressure(const struct sock *sk)
+{
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		return !!sk->sk_cgrp->memory_pressure;
+
+	return tcp_memory_pressure;
+}
 /*
  * The next routines deal with comparing 32 bit unsigned ints
  * and worry about wraparound (automatic with unsigned arithmetic).
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cf8b20ff6658..093779f7e893 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -359,7 +359,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
 	/* Check #1 */
 	if (tp->rcv_ssthresh < tp->window_clamp &&
 	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
-	    !sk_under_memory_pressure(sk)) {
+	    !tcp_under_memory_pressure(sk)) {
 		int incr;
 
 		/* Check #2. Increase window, if skb with such overhead
@@ -446,7 +446,7 @@ static void tcp_clamp_window(struct sock *sk)
 
 	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
-	    !sk_under_memory_pressure(sk) &&
+	    !tcp_under_memory_pressure(sk) &&
 	    sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
 		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
 				    sysctl_tcp_rmem[2]);
@@ -4781,7 +4781,7 @@ static int tcp_prune_queue(struct sock *sk)
 
 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
 		tcp_clamp_window(sk);
-	else if (sk_under_memory_pressure(sk))
+	else if (tcp_under_memory_pressure(sk))
 		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
 
 	tcp_collapse_ofo_queue(sk);
@@ -4825,7 +4825,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk)
 		return false;
 
 	/* If we are under global TCP memory pressure, do not expand.  */
-	if (sk_under_memory_pressure(sk))
+	if (tcp_under_memory_pressure(sk))
 		return false;
 
 	/* If we are under soft global TCP memory pressure, do not expand.  */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index bac1a950d087..08c2cc40b26d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2392,7 +2392,7 @@ u32 __tcp_select_window(struct sock *sk)
 	if (free_space < (full_space >> 1)) {
 		icsk->icsk_ack.quick = 0;
 
-		if (sk_under_memory_pressure(sk))
+		if (tcp_under_memory_pressure(sk))
 			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
 					       4U * tp->advmss);
 
@@ -2843,7 +2843,7 @@ void tcp_send_fin(struct sock *sk)
 	 * Note: in the latter case, FIN packet will be sent after a timeout,
 	 * as TCP stack thinks it has already been transmitted.
 	 */
-	if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
+	if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
 coalesce:
 		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
 		TCP_SKB_CB(tskb)->end_seq++;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 65bf670e8714..5b752f58a900 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -247,7 +247,7 @@ void tcp_delack_timer_handler(struct sock *sk)
 	}
 
 out:
-	if (sk_under_memory_pressure(sk))
+	if (tcp_under_memory_pressure(sk))
 		sk_mem_reclaim(sk);
 }
 
-- 
cgit v1.2.3


From 45d4122ca7cdb3a4b91f392605cd22cfa75f1d99 Mon Sep 17 00:00:00 2001
From: "Samudrala, Sridhar" <sridhar.samudrala@intel.com>
Date: Wed, 13 May 2015 21:55:43 -0700
Subject: switchdev: add support for fdb add/del/dump via switchdev_port_obj
 ops.

- introduce port fdb obj and generic switchdev_port_fdb_add/del/dump()
- use switchdev_port_fdb_add/del/dump in rocker/team/bonding ndo ops.
- add support for fdb obj in switchdev_port_obj_add/del/dump()
- switch rocker to implement fdb ops via switchdev_ops

v3: updated to sync with named union changes.

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c      |   3 +
 drivers/net/ethernet/rocker/rocker.c | 188 +++++++++++++++--------------------
 drivers/net/team/team.c              |   3 +
 include/net/switchdev.h              |  50 ++++++++++
 net/switchdev/switchdev.c            | 175 ++++++++++++++++++++++++++++++++
 5 files changed, 312 insertions(+), 107 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index ef26e0147050..2268438f3f63 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4039,6 +4039,9 @@ static const struct net_device_ops bond_netdev_ops = {
 	.ndo_bridge_setlink	= switchdev_port_bridge_setlink,
 	.ndo_bridge_getlink	= switchdev_port_bridge_getlink,
 	.ndo_bridge_dellink	= switchdev_port_bridge_dellink,
+	.ndo_fdb_add		= switchdev_port_fdb_add,
+	.ndo_fdb_del		= switchdev_port_fdb_del,
+	.ndo_fdb_dump		= switchdev_port_fdb_dump,
 	.ndo_features_check	= passthru_features_check,
 };
 
diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 3e13dcccd73d..0f5e962d691c 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -4193,110 +4193,6 @@ static int rocker_port_vlan_rx_kill_vid(struct net_device *dev,
 				ROCKER_OP_FLAG_REMOVE, vid);
 }
 
-static int rocker_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
-			       struct net_device *dev,
-			       const unsigned char *addr, u16 vid,
-			       u16 nlm_flags)
-{
-	struct rocker_port *rocker_port = netdev_priv(dev);
-	__be16 vlan_id = rocker_port_vid_to_vlan(rocker_port, vid, NULL);
-	int flags = 0;
-
-	if (!rocker_port_is_bridged(rocker_port))
-		return -EINVAL;
-
-	return rocker_port_fdb(rocker_port, SWITCHDEV_TRANS_NONE,
-			       addr, vlan_id, flags);
-}
-
-static int rocker_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
-			       struct net_device *dev,
-			       const unsigned char *addr, u16 vid)
-{
-	struct rocker_port *rocker_port = netdev_priv(dev);
-	__be16 vlan_id = rocker_port_vid_to_vlan(rocker_port, vid, NULL);
-	int flags = ROCKER_OP_FLAG_REMOVE;
-
-	if (!rocker_port_is_bridged(rocker_port))
-		return -EINVAL;
-
-	return rocker_port_fdb(rocker_port, SWITCHDEV_TRANS_NONE,
-			       addr, vlan_id, flags);
-}
-
-static int rocker_fdb_fill_info(struct sk_buff *skb,
-				struct rocker_port *rocker_port,
-				const unsigned char *addr, u16 vid,
-				u32 portid, u32 seq, int type,
-				unsigned int flags)
-{
-	struct nlmsghdr *nlh;
-	struct ndmsg *ndm;
-
-	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
-	if (!nlh)
-		return -EMSGSIZE;
-
-	ndm = nlmsg_data(nlh);
-	ndm->ndm_family	 = AF_BRIDGE;
-	ndm->ndm_pad1    = 0;
-	ndm->ndm_pad2    = 0;
-	ndm->ndm_flags	 = NTF_SELF;
-	ndm->ndm_type	 = 0;
-	ndm->ndm_ifindex = rocker_port->dev->ifindex;
-	ndm->ndm_state   = NUD_REACHABLE;
-
-	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr))
-		goto nla_put_failure;
-
-	if (vid && nla_put_u16(skb, NDA_VLAN, vid))
-		goto nla_put_failure;
-
-	nlmsg_end(skb, nlh);
-	return 0;
-
-nla_put_failure:
-	nlmsg_cancel(skb, nlh);
-	return -EMSGSIZE;
-}
-
-static int rocker_port_fdb_dump(struct sk_buff *skb,
-				struct netlink_callback *cb,
-				struct net_device *dev,
-				struct net_device *filter_dev,
-				int idx)
-{
-	struct rocker_port *rocker_port = netdev_priv(dev);
-	struct rocker *rocker = rocker_port->rocker;
-	struct rocker_fdb_tbl_entry *found;
-	struct hlist_node *tmp;
-	int bkt;
-	unsigned long lock_flags;
-	const unsigned char *addr;
-	u16 vid;
-	int err;
-
-	spin_lock_irqsave(&rocker->fdb_tbl_lock, lock_flags);
-	hash_for_each_safe(rocker->fdb_tbl, bkt, tmp, found, entry) {
-		if (found->key.pport != rocker_port->pport)
-			continue;
-		if (idx < cb->args[0])
-			goto skip;
-		addr = found->key.addr;
-		vid = rocker_port_vlan_to_vid(rocker_port, found->key.vlan_id);
-		err = rocker_fdb_fill_info(skb, rocker_port, addr, vid,
-					   NETLINK_CB(cb->skb).portid,
-					   cb->nlh->nlmsg_seq,
-					   RTM_NEWNEIGH, NLM_F_MULTI);
-		if (err < 0)
-			break;
-skip:
-		++idx;
-	}
-	spin_unlock_irqrestore(&rocker->fdb_tbl_lock, lock_flags);
-	return idx;
-}
-
 static int rocker_port_get_phys_port_name(struct net_device *dev,
 					  char *buf, size_t len)
 {
@@ -4320,12 +4216,12 @@ static const struct net_device_ops rocker_port_netdev_ops = {
 	.ndo_set_mac_address		= rocker_port_set_mac_address,
 	.ndo_vlan_rx_add_vid		= rocker_port_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid		= rocker_port_vlan_rx_kill_vid,
-	.ndo_fdb_add			= rocker_port_fdb_add,
-	.ndo_fdb_del			= rocker_port_fdb_del,
-	.ndo_fdb_dump			= rocker_port_fdb_dump,
 	.ndo_bridge_getlink		= switchdev_port_bridge_getlink,
 	.ndo_bridge_setlink		= switchdev_port_bridge_setlink,
 	.ndo_bridge_dellink		= switchdev_port_bridge_dellink,
+	.ndo_fdb_add			= switchdev_port_fdb_add,
+	.ndo_fdb_del			= switchdev_port_fdb_del,
+	.ndo_fdb_dump			= switchdev_port_fdb_dump,
 	.ndo_get_phys_port_name		= rocker_port_get_phys_port_name,
 };
 
@@ -4447,6 +4343,19 @@ static int rocker_port_vlans_add(struct rocker_port *rocker_port,
 	return 0;
 }
 
+static int rocker_port_fdb_add(struct rocker_port *rocker_port,
+			       enum switchdev_trans trans,
+			       struct switchdev_obj_fdb *fdb)
+{
+	__be16 vlan_id = rocker_port_vid_to_vlan(rocker_port, fdb->vid, NULL);
+	int flags = 0;
+
+	if (!rocker_port_is_bridged(rocker_port))
+		return -EINVAL;
+
+	return rocker_port_fdb(rocker_port, trans, fdb->addr, vlan_id, flags);
+}
+
 static int rocker_port_obj_add(struct net_device *dev,
 			       struct switchdev_obj *obj)
 {
@@ -4476,6 +4385,9 @@ static int rocker_port_obj_add(struct net_device *dev,
 					   htonl(fib4->dst), fib4->dst_len,
 					   fib4->fi, fib4->tb_id, 0);
 		break;
+	case SWITCHDEV_OBJ_PORT_FDB:
+		err = rocker_port_fdb_add(rocker_port, obj->trans, &obj->u.fdb);
+		break;
 	default:
 		err = -EOPNOTSUPP;
 		break;
@@ -4513,6 +4425,19 @@ static int rocker_port_vlans_del(struct rocker_port *rocker_port,
 	return 0;
 }
 
+static int rocker_port_fdb_del(struct rocker_port *rocker_port,
+			       enum switchdev_trans trans,
+			       struct switchdev_obj_fdb *fdb)
+{
+	__be16 vlan_id = rocker_port_vid_to_vlan(rocker_port, fdb->vid, NULL);
+	int flags = ROCKER_OP_FLAG_REMOVE;
+
+	if (!rocker_port_is_bridged(rocker_port))
+		return -EINVAL;
+
+	return rocker_port_fdb(rocker_port, trans, fdb->addr, vlan_id, flags);
+}
+
 static int rocker_port_obj_del(struct net_device *dev,
 			       struct switchdev_obj *obj)
 {
@@ -4531,6 +4456,54 @@ static int rocker_port_obj_del(struct net_device *dev,
 					   fib4->fi, fib4->tb_id,
 					   ROCKER_OP_FLAG_REMOVE);
 		break;
+	case SWITCHDEV_OBJ_PORT_FDB:
+		err = rocker_port_fdb_del(rocker_port, obj->trans, &obj->u.fdb);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+static int rocker_port_fdb_dump(struct rocker_port *rocker_port,
+				struct switchdev_obj *obj)
+{
+	struct rocker *rocker = rocker_port->rocker;
+	struct switchdev_obj_fdb *fdb = &obj->u.fdb;
+	struct rocker_fdb_tbl_entry *found;
+	struct hlist_node *tmp;
+	unsigned long lock_flags;
+	int bkt;
+	int err = 0;
+
+	spin_lock_irqsave(&rocker->fdb_tbl_lock, lock_flags);
+	hash_for_each_safe(rocker->fdb_tbl, bkt, tmp, found, entry) {
+		if (found->key.pport != rocker_port->pport)
+			continue;
+		fdb->addr = found->key.addr;
+		fdb->vid = rocker_port_vlan_to_vid(rocker_port,
+						   found->key.vlan_id);
+		err = obj->cb(rocker_port->dev, obj);
+		if (err)
+			break;
+	}
+	spin_unlock_irqrestore(&rocker->fdb_tbl_lock, lock_flags);
+
+	return err;
+}
+
+static int rocker_port_obj_dump(struct net_device *dev,
+				struct switchdev_obj *obj)
+{
+	struct rocker_port *rocker_port = netdev_priv(dev);
+	int err = 0;
+
+	switch (obj->id) {
+	case SWITCHDEV_OBJ_PORT_FDB:
+		err = rocker_port_fdb_dump(rocker_port, obj);
+		break;
 	default:
 		err = -EOPNOTSUPP;
 		break;
@@ -4544,6 +4517,7 @@ static const struct switchdev_ops rocker_port_switchdev_ops = {
 	.switchdev_port_attr_set	= rocker_port_attr_set,
 	.switchdev_port_obj_add		= rocker_port_obj_add,
 	.switchdev_port_obj_del		= rocker_port_obj_del,
+	.switchdev_port_obj_dump	= rocker_port_obj_dump,
 };
 
 /********************
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 1ec035a53c3d..daa054b3ff03 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1980,6 +1980,9 @@ static const struct net_device_ops team_netdev_ops = {
 	.ndo_bridge_setlink	= switchdev_port_bridge_setlink,
 	.ndo_bridge_getlink	= switchdev_port_bridge_getlink,
 	.ndo_bridge_dellink	= switchdev_port_bridge_dellink,
+	.ndo_fdb_add		= switchdev_port_fdb_add,
+	.ndo_fdb_del		= switchdev_port_fdb_del,
+	.ndo_fdb_dump		= switchdev_port_fdb_dump,
 	.ndo_features_check	= passthru_features_check,
 };
 
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index ea5b1c230d3d..437f8fe75705 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -47,11 +47,13 @@ enum switchdev_obj_id {
 	SWITCHDEV_OBJ_UNDEFINED,
 	SWITCHDEV_OBJ_PORT_VLAN,
 	SWITCHDEV_OBJ_IPV4_FIB,
+	SWITCHDEV_OBJ_PORT_FDB,
 };
 
 struct switchdev_obj {
 	enum switchdev_obj_id id;
 	enum switchdev_trans trans;
+	int (*cb)(struct net_device *dev, struct switchdev_obj *obj);
 	union {
 		struct switchdev_obj_vlan {		/* PORT_VLAN */
 			u16 flags;
@@ -67,6 +69,10 @@ struct switchdev_obj {
 			u32 nlflags;
 			u32 tb_id;
 		} ipv4_fib;
+		struct switchdev_obj_fdb {		/* PORT_FDB */
+			const unsigned char *addr;
+			u16 vid;
+		} fdb;
 	} u;
 };
 
@@ -80,6 +86,8 @@ struct switchdev_obj {
  * @switchdev_port_obj_add: Add an object to port (see switchdev_obj).
  *
  * @switchdev_port_obj_del: Delete an object from port (see switchdev_obj).
+ *
+ * @switchdev_port_obj_dump: Dump port objects (see switchdev_obj).
  */
 struct switchdev_ops {
 	int	(*switchdev_port_attr_get)(struct net_device *dev,
@@ -90,6 +98,8 @@ struct switchdev_ops {
 					  struct switchdev_obj *obj);
 	int	(*switchdev_port_obj_del)(struct net_device *dev,
 					  struct switchdev_obj *obj);
+	int	(*switchdev_port_obj_dump)(struct net_device *dev,
+					  struct switchdev_obj *obj);
 };
 
 enum switchdev_notifier_type {
@@ -121,6 +131,7 @@ int switchdev_port_attr_set(struct net_device *dev,
 			    struct switchdev_attr *attr);
 int switchdev_port_obj_add(struct net_device *dev, struct switchdev_obj *obj);
 int switchdev_port_obj_del(struct net_device *dev, struct switchdev_obj *obj);
+int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj);
 int register_switchdev_notifier(struct notifier_block *nb);
 int unregister_switchdev_notifier(struct notifier_block *nb);
 int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
@@ -137,6 +148,15 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
 int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
 			   u8 tos, u8 type, u32 tb_id);
 void switchdev_fib_ipv4_abort(struct fib_info *fi);
+int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+			   struct net_device *dev, const unsigned char *addr,
+			   u16 vid, u16 nlm_flags);
+int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
+			   struct net_device *dev, const unsigned char *addr,
+			   u16 vid);
+int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			    struct net_device *dev,
+			    struct net_device *filter_dev, int idx);
 
 #else
 
@@ -164,6 +184,12 @@ static inline int switchdev_port_obj_del(struct net_device *dev,
 	return -EOPNOTSUPP;
 }
 
+static inline int switchdev_port_obj_dump(struct net_device *dev,
+					  struct switchdev_obj *obj)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int register_switchdev_notifier(struct notifier_block *nb)
 {
 	return 0;
@@ -221,6 +247,30 @@ static inline void switchdev_fib_ipv4_abort(struct fib_info *fi)
 {
 }
 
+static inline int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+					 struct net_device *dev,
+					 const unsigned char *addr,
+					 u16 vid, u16 nlm_flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
+					 struct net_device *dev,
+					 const unsigned char *addr, u16 vid)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int switchdev_port_fdb_dump(struct sk_buff *skb,
+					  struct netlink_callback *cb,
+					  struct net_device *dev,
+					  struct net_device *filter_dev,
+					  int idx)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* _LINUX_SWITCHDEV_H_ */
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 0409f9b5bdbc..d4c8cf828240 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -296,6 +296,36 @@ int switchdev_port_obj_del(struct net_device *dev, struct switchdev_obj *obj)
 }
 EXPORT_SYMBOL_GPL(switchdev_port_obj_del);
 
+/**
+ *	switchdev_port_obj_dump - Dump port objects
+ *
+ *	@dev: port device
+ *	@obj: object to dump
+ */
+int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj)
+{
+	const struct switchdev_ops *ops = dev->switchdev_ops;
+	struct net_device *lower_dev;
+	struct list_head *iter;
+	int err = -EOPNOTSUPP;
+
+	if (ops && ops->switchdev_port_obj_dump)
+		return ops->switchdev_port_obj_dump(dev, obj);
+
+	/* Switch device port(s) may be stacked under
+	 * bond/team/vlan dev, so recurse down to dump objects on
+	 * first port at bottom of stack.
+	 */
+
+	netdev_for_each_lower_dev(dev, lower_dev, iter) {
+		err = switchdev_port_obj_dump(lower_dev, obj);
+		break;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(switchdev_port_obj_dump);
+
 static DEFINE_MUTEX(switchdev_mutex);
 static RAW_NOTIFIER_HEAD(switchdev_notif_chain);
 
@@ -566,6 +596,151 @@ int switchdev_port_bridge_dellink(struct net_device *dev,
 }
 EXPORT_SYMBOL_GPL(switchdev_port_bridge_dellink);
 
+/**
+ *	switchdev_port_fdb_add - Add FDB (MAC/VLAN) entry to port
+ *
+ *	@ndmsg: netlink hdr
+ *	@nlattr: netlink attributes
+ *	@dev: port device
+ *	@addr: MAC address to add
+ *	@vid: VLAN to add
+ *
+ *	Add FDB entry to switch device.
+ */
+int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+			   struct net_device *dev, const unsigned char *addr,
+			   u16 vid, u16 nlm_flags)
+{
+	struct switchdev_obj obj = {
+		.id = SWITCHDEV_OBJ_PORT_FDB,
+		.u.fdb = {
+			.addr = addr,
+			.vid = vid,
+		},
+	};
+
+	return switchdev_port_obj_add(dev, &obj);
+}
+EXPORT_SYMBOL_GPL(switchdev_port_fdb_add);
+
+/**
+ *	switchdev_port_fdb_del - Delete FDB (MAC/VLAN) entry from port
+ *
+ *	@ndmsg: netlink hdr
+ *	@nlattr: netlink attributes
+ *	@dev: port device
+ *	@addr: MAC address to delete
+ *	@vid: VLAN to delete
+ *
+ *	Delete FDB entry from switch device.
+ */
+int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
+			   struct net_device *dev, const unsigned char *addr,
+			   u16 vid)
+{
+	struct switchdev_obj obj = {
+		.id = SWITCHDEV_OBJ_PORT_FDB,
+		.u.fdb = {
+			.addr = addr,
+			.vid = vid,
+		},
+	};
+
+	return switchdev_port_obj_del(dev, &obj);
+}
+EXPORT_SYMBOL_GPL(switchdev_port_fdb_del);
+
+struct switchdev_fdb_dump {
+	struct switchdev_obj obj;
+	struct sk_buff *skb;
+	struct netlink_callback *cb;
+	struct net_device *filter_dev;
+	int idx;
+};
+
+static int switchdev_port_fdb_dump_cb(struct net_device *dev,
+				      struct switchdev_obj *obj)
+{
+	struct switchdev_fdb_dump *dump =
+		container_of(obj, struct switchdev_fdb_dump, obj);
+	u32 portid = NETLINK_CB(dump->cb->skb).portid;
+	u32 seq = dump->cb->nlh->nlmsg_seq;
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+	struct net_device *master = netdev_master_upper_dev_get(dev);
+
+	if (dump->idx < dump->cb->args[0])
+		goto skip;
+
+	if (master && dump->filter_dev != master)
+		goto skip;
+
+	nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH,
+			sizeof(*ndm), NLM_F_MULTI);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	ndm = nlmsg_data(nlh);
+	ndm->ndm_family  = AF_BRIDGE;
+	ndm->ndm_pad1    = 0;
+	ndm->ndm_pad2    = 0;
+	ndm->ndm_flags   = NTF_SELF;
+	ndm->ndm_type    = 0;
+	ndm->ndm_ifindex = dev->ifindex;
+	ndm->ndm_state   = NUD_REACHABLE;
+
+	if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, obj->u.fdb.addr))
+		goto nla_put_failure;
+
+	if (obj->u.fdb.vid && nla_put_u16(dump->skb, NDA_VLAN, obj->u.fdb.vid))
+		goto nla_put_failure;
+
+	nlmsg_end(dump->skb, nlh);
+
+skip:
+	dump->idx++;
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(dump->skb, nlh);
+	return -EMSGSIZE;
+}
+
+/**
+ *	switchdev_port_fdb_dump - Dump port FDB (MAC/VLAN) entries
+ *
+ *	@skb: netlink skb
+ *	@cb: netlink callback
+ *	@dev: port device
+ *	@filter_dev: filter device
+ *	@idx:
+ *
+ *	Delete FDB entry from switch device.
+ */
+int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			    struct net_device *dev,
+			    struct net_device *filter_dev, int idx)
+{
+	struct switchdev_fdb_dump dump = {
+		.obj = {
+			.id = SWITCHDEV_OBJ_PORT_FDB,
+			.cb = switchdev_port_fdb_dump_cb,
+		},
+		.skb = skb,
+		.cb = cb,
+		.filter_dev = filter_dev,
+		.idx = idx,
+	};
+	int err;
+
+	err = switchdev_port_obj_dump(dev, &dump.obj);
+	if (err)
+		return err;
+
+	return dump.idx;
+}
+EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump);
+
 static struct net_device *switchdev_get_lowest_dev(struct net_device *dev)
 {
 	const struct switchdev_ops *ops = dev->switchdev_ops;
-- 
cgit v1.2.3


From de133464c9e70808d3e5a861294bc55940988178 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Fri, 15 May 2015 14:47:32 -0700
Subject: netns: make nsid_lock per net

The spinlock is used to protect netns_ids which is per net,
so there is no need to use a global spinlock.

Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/net_namespace.h |  1 +
 net/core/net_namespace.c    | 32 ++++++++++++++++----------------
 2 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'include/net')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 3f850acc844e..72eb23723294 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -58,6 +58,7 @@ struct net {
 	struct list_head	exit_list;	/* Use only net_mutex */
 
 	struct user_namespace   *user_ns;	/* Owning user namespace */
+	spinlock_t		nsid_lock;
 	struct idr		netns_ids;
 
 	struct ns_common	ns;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index d2f42da9479b..2c2eb1b629b1 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -28,7 +28,6 @@
 static LIST_HEAD(pernet_list);
 static struct list_head *first_device = &pernet_list;
 DEFINE_MUTEX(net_mutex);
-static DEFINE_SPINLOCK(nsid_lock);
 
 LIST_HEAD(net_namespace_list);
 EXPORT_SYMBOL_GPL(net_namespace_list);
@@ -218,10 +217,10 @@ int peernet2id_alloc(struct net *net, struct net *peer)
 	bool alloc;
 	int id;
 
-	spin_lock_irqsave(&nsid_lock, flags);
+	spin_lock_irqsave(&net->nsid_lock, flags);
 	alloc = atomic_read(&peer->count) == 0 ? false : true;
 	id = __peernet2id_alloc(net, peer, &alloc);
-	spin_unlock_irqrestore(&nsid_lock, flags);
+	spin_unlock_irqrestore(&net->nsid_lock, flags);
 	if (alloc && id >= 0)
 		rtnl_net_notifyid(net, RTM_NEWNSID, id);
 	return id;
@@ -234,9 +233,9 @@ int peernet2id(struct net *net, struct net *peer)
 	unsigned long flags;
 	int id;
 
-	spin_lock_irqsave(&nsid_lock, flags);
+	spin_lock_irqsave(&net->nsid_lock, flags);
 	id = __peernet2id(net, peer);
-	spin_unlock_irqrestore(&nsid_lock, flags);
+	spin_unlock_irqrestore(&net->nsid_lock, flags);
 	return id;
 }
 
@@ -257,11 +256,11 @@ struct net *get_net_ns_by_id(struct net *net, int id)
 		return NULL;
 
 	rcu_read_lock();
-	spin_lock_irqsave(&nsid_lock, flags);
+	spin_lock_irqsave(&net->nsid_lock, flags);
 	peer = idr_find(&net->netns_ids, id);
 	if (peer)
 		get_net(peer);
-	spin_unlock_irqrestore(&nsid_lock, flags);
+	spin_unlock_irqrestore(&net->nsid_lock, flags);
 	rcu_read_unlock();
 
 	return peer;
@@ -282,6 +281,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 	net->dev_base_seq = 1;
 	net->user_ns = user_ns;
 	idr_init(&net->netns_ids);
+	spin_lock_init(&net->nsid_lock);
 
 	list_for_each_entry(ops, &pernet_list, list) {
 		error = ops_init(ops, net);
@@ -404,17 +404,17 @@ static void cleanup_net(struct work_struct *work)
 		for_each_net(tmp) {
 			int id;
 
-			spin_lock_irq(&nsid_lock);
+			spin_lock_irq(&tmp->nsid_lock);
 			id = __peernet2id(tmp, net);
 			if (id >= 0)
 				idr_remove(&tmp->netns_ids, id);
-			spin_unlock_irq(&nsid_lock);
+			spin_unlock_irq(&tmp->nsid_lock);
 			if (id >= 0)
 				rtnl_net_notifyid(tmp, RTM_DELNSID, id);
 		}
-		spin_lock_irq(&nsid_lock);
+		spin_lock_irq(&net->nsid_lock);
 		idr_destroy(&net->netns_ids);
-		spin_unlock_irq(&nsid_lock);
+		spin_unlock_irq(&net->nsid_lock);
 
 	}
 	rtnl_unlock();
@@ -563,15 +563,15 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
 	if (IS_ERR(peer))
 		return PTR_ERR(peer);
 
-	spin_lock_irqsave(&nsid_lock, flags);
+	spin_lock_irqsave(&net->nsid_lock, flags);
 	if (__peernet2id(net, peer) >= 0) {
-		spin_unlock_irqrestore(&nsid_lock, flags);
+		spin_unlock_irqrestore(&net->nsid_lock, flags);
 		err = -EEXIST;
 		goto out;
 	}
 
 	err = alloc_netid(net, peer, nsid);
-	spin_unlock_irqrestore(&nsid_lock, flags);
+	spin_unlock_irqrestore(&net->nsid_lock, flags);
 	if (err >= 0) {
 		rtnl_net_notifyid(net, RTM_NEWNSID, err);
 		err = 0;
@@ -695,9 +695,9 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
 	};
 	unsigned long flags;
 
-	spin_lock_irqsave(&nsid_lock, flags);
+	spin_lock_irqsave(&net->nsid_lock, flags);
 	idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
-	spin_unlock_irqrestore(&nsid_lock, flags);
+	spin_unlock_irqrestore(&net->nsid_lock, flags);
 
 	cb->args[0] = net_cb.idx;
 	return skb->len;
-- 
cgit v1.2.3


From 5cf422808244ca8f1177c72fe6e1ce8322794b57 Mon Sep 17 00:00:00 2001
From: Andy Zhou <azhou@nicira.com>
Date: Fri, 15 May 2015 14:15:35 -0700
Subject: ipv4: introduce frag_expire_skip_icmp()

Improve readability of skip ICMP for de-fragmentation expiration logic.
This change will also make the logic easier to maintain when the
following patches in this series are applied.

Signed-off-by: Andy Zhou <azhou@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h       | 10 ++++++++++
 net/ipv4/ip_fragment.c | 13 +++++++++----
 2 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip.h b/include/net/ip.h
index 0ed6d768e606..43f6f39df9fc 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -478,6 +478,16 @@ enum ip_defrag_users {
 	IP_DEFRAG_MACVLAN,
 };
 
+/* Return true if the value of 'user' is between 'lower_bond'
+ * and 'upper_bond' inclusively.
+ */
+static inline bool ip_defrag_user_in_between(u32 user,
+					     enum ip_defrag_users lower_bond,
+					     enum ip_defrag_users upper_bond)
+{
+	return user >= lower_bond && user <= upper_bond;
+}
+
 int ip_defrag(struct sk_buff *skb, u32 user);
 #ifdef CONFIG_INET
 struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index cc1da6d9cb35..83424f1742b8 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -173,6 +173,13 @@ static void ipq_kill(struct ipq *ipq)
 	inet_frag_kill(&ipq->q, &ip4_frags);
 }
 
+static bool frag_expire_skip_icmp(u32 user)
+{
+	return user == IP_DEFRAG_AF_PACKET ||
+	       ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN,
+					 __IP_DEFRAG_CONNTRACK_IN_END);
+}
+
 /*
  * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
  */
@@ -217,10 +224,8 @@ static void ip_expire(unsigned long arg)
 		/* Only an end host needs to send an ICMP
 		 * "Fragment Reassembly Timeout" message, per RFC792.
 		 */
-		if (qp->user == IP_DEFRAG_AF_PACKET ||
-		    ((qp->user >= IP_DEFRAG_CONNTRACK_IN) &&
-		     (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) &&
-		     (skb_rtable(head)->rt_type != RTN_LOCAL)))
+		if (frag_expire_skip_icmp(qp->user) &&
+		    (skb_rtable(head)->rt_type != RTN_LOCAL))
 			goto out_rcu_unlock;
 
 		/* Send an ICMP "Fragment Reassembly Timeout" message. */
-- 
cgit v1.2.3


From 49d16b23cd1e61c028ee088c5a64e9ac6a9c6147 Mon Sep 17 00:00:00 2001
From: Andy Zhou <azhou@nicira.com>
Date: Fri, 15 May 2015 14:15:37 -0700
Subject: bridge_netfilter: No ICMP packet on IPv4 fragmentation error

When bridge netfilter re-fragments an IP packet for output, all
packets that can not be re-fragmented to their original input size
should be silently discarded.

However, current bridge netfilter output path generates an ICMP packet
with 'size exceeded MTU' message for such packets, this is a bug.

This patch refactors the ip_fragment() API to allow two separate
use cases. The bridge netfilter user case will not
send ICMP, the routing output will, as before.

Signed-off-by: Andy Zhou <azhou@nicira.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h          |  4 ++--
 net/bridge/br_netfilter.c | 21 ++++++++++++++++++++-
 net/ipv4/ip_output.c      | 40 ++++++++++++++++++++++++++++------------
 3 files changed, 50 insertions(+), 15 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip.h b/include/net/ip.h
index 43f6f39df9fc..cd7a6a458bb6 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -108,8 +108,8 @@ int ip_local_deliver(struct sk_buff *skb);
 int ip_mr_input(struct sk_buff *skb);
 int ip_output(struct sock *sk, struct sk_buff *skb);
 int ip_mc_output(struct sock *sk, struct sk_buff *skb);
-int ip_fragment(struct sock *sk, struct sk_buff *skb,
-		int (*output)(struct sock *, struct sk_buff *));
+int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
+		   int (*output)(struct sock *, struct sk_buff *));
 int ip_do_nat(struct sk_buff *skb);
 void ip_send_check(struct iphdr *ip);
 int __ip_local_out(struct sk_buff *skb);
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 2b0e8bb49944..1d2eb32d8270 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -853,6 +853,25 @@ static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb)
 	return br_dev_queue_push_xmit(sk, skb);
 }
 
+static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb,
+			     int (*output)(struct sock *, struct sk_buff *))
+{
+	unsigned int mtu = ip_skb_dst_mtu(skb);
+	struct iphdr *iph = ip_hdr(skb);
+	struct rtable *rt = skb_rtable(skb);
+	struct net_device *dev = rt->dst.dev;
+
+	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
+		     (IPCB(skb)->frag_max_size &&
+		      IPCB(skb)->frag_max_size > mtu))) {
+		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
+	return ip_do_fragment(sk, skb, output);
+}
+
 static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
 {
 	int ret;
@@ -886,7 +905,7 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
 		skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
 						 data->size);
 
-		ret = ip_fragment(sk, skb, br_nf_push_frag_xmit);
+		ret = br_nf_ip_fragment(sk, skb, br_nf_push_frag_xmit);
 	} else {
 		nf_bridge_info_free(skb);
 		ret = br_dev_queue_push_xmit(sk, skb);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 2acc5dc32807..8d91b922fcfe 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -83,6 +83,9 @@
 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
 EXPORT_SYMBOL(sysctl_ip_default_ttl);
 
+static int ip_fragment(struct sock *sk, struct sk_buff *skb,
+		       int (*output)(struct sock *, struct sk_buff *));
+
 /* Generate a checksum for an outgoing IP datagram. */
 void ip_send_check(struct iphdr *iph)
 {
@@ -478,6 +481,28 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	skb_copy_secmark(to, from);
 }
 
+static int ip_fragment(struct sock *sk, struct sk_buff *skb,
+		       int (*output)(struct sock *, struct sk_buff *))
+{
+	struct iphdr *iph = ip_hdr(skb);
+	unsigned int mtu = ip_skb_dst_mtu(skb);
+
+	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
+		     (IPCB(skb)->frag_max_size &&
+		      IPCB(skb)->frag_max_size > mtu))) {
+		struct rtable *rt = skb_rtable(skb);
+		struct net_device *dev = rt->dst.dev;
+
+		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+			  htonl(mtu));
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
+	return ip_do_fragment(sk, skb, output);
+}
+
 /*
  *	This IP datagram is too large to be sent in one piece.  Break it up into
  *	smaller pieces (each of size equal to IP header plus
@@ -485,8 +510,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
  *	single device frame, and queue such a frame for sending.
  */
 
-int ip_fragment(struct sock *sk, struct sk_buff *skb,
-		int (*output)(struct sock *, struct sk_buff *))
+int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
+		   int (*output)(struct sock *, struct sk_buff *))
 {
 	struct iphdr *iph;
 	int ptr;
@@ -507,15 +532,6 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
 	iph = ip_hdr(skb);
 
 	mtu = ip_skb_dst_mtu(skb);
-	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
-		     (IPCB(skb)->frag_max_size &&
-		      IPCB(skb)->frag_max_size > mtu))) {
-		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
-		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
-			  htonl(mtu));
-		kfree_skb(skb);
-		return -EMSGSIZE;
-	}
 
 	/*
 	 *	Setup starting values.
@@ -751,7 +767,7 @@ fail:
 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 	return err;
 }
-EXPORT_SYMBOL(ip_fragment);
+EXPORT_SYMBOL(ip_do_fragment);
 
 int
 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
-- 
cgit v1.2.3


From 1a19cb680be0d4b06ce9a9d6516b8f45f544d3e8 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Sun, 17 May 2015 21:44:39 +0200
Subject: ieee802154: change transmit power to s32

This patch change the transmit power from s8 to s32. This prepares to store a
mbm value instead dbm inside the transmit power variable. The old
interface keep the a s8 dbm value, which should be backward compatibility
when assign s8 to s32.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/net/ieee802154/at86rf230.c | 2 +-
 include/net/cfg802154.h            | 2 +-
 include/net/mac802154.h            | 2 +-
 net/ieee802154/nl802154.c          | 6 +++---
 net/mac802154/driver-ops.h         | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c
index 67d00fbc2e0e..02b6bb72304d 100644
--- a/drivers/net/ieee802154/at86rf230.c
+++ b/drivers/net/ieee802154/at86rf230.c
@@ -1194,7 +1194,7 @@ at86rf230_set_hw_addr_filt(struct ieee802154_hw *hw,
 }
 
 static int
-at86rf230_set_txpower(struct ieee802154_hw *hw, s8 db)
+at86rf230_set_txpower(struct ieee802154_hw *hw, s32 db)
 {
 	struct at86rf230_local *lp = hw->priv;
 
diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 6ea16c84293b..47804cddb46f 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -85,7 +85,7 @@ struct wpan_phy {
 	u8 current_channel;
 	u8 current_page;
 	u32 channels_supported[IEEE802154_MAX_PAGE + 1];
-	s8 transmit_power;
+	s32 transmit_power;
 	struct wpan_phy_cca cca;
 
 	__le64 perm_extended_addr;
diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index 7df28a4c23f9..400e4e85c53f 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -213,7 +213,7 @@ struct ieee802154_ops {
 	int		(*set_hw_addr_filt)(struct ieee802154_hw *hw,
 					    struct ieee802154_hw_addr_filt *filt,
 					    unsigned long changed);
-	int		(*set_txpower)(struct ieee802154_hw *hw, s8 dbm);
+	int		(*set_txpower)(struct ieee802154_hw *hw, s32 dbm);
 	int		(*set_lbt)(struct ieee802154_hw *hw, bool on);
 	int		(*set_cca_mode)(struct ieee802154_hw *hw,
 					const struct wpan_phy_cca *cca);
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index d750d9778ba7..f3185c71af10 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -207,7 +207,7 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = {
 	[NL802154_ATTR_PAGE] = { .type = NLA_U8, },
 	[NL802154_ATTR_CHANNEL] = { .type = NLA_U8, },
 
-	[NL802154_ATTR_TX_POWER] = { .type = NLA_S8, },
+	[NL802154_ATTR_TX_POWER] = { .type = NLA_S32, },
 
 	[NL802154_ATTR_CCA_MODE] = { .type = NLA_U32, },
 	[NL802154_ATTR_CCA_OPT] = { .type = NLA_U32, },
@@ -301,8 +301,8 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev,
 			goto nla_put_failure;
 	}
 
-	if (nla_put_s8(msg, NL802154_ATTR_TX_POWER,
-		       rdev->wpan_phy.transmit_power))
+	if (nla_put_s32(msg, NL802154_ATTR_TX_POWER,
+			rdev->wpan_phy.transmit_power))
 		goto nla_put_failure;
 
 finish:
diff --git a/net/mac802154/driver-ops.h b/net/mac802154/driver-ops.h
index a0533357b9ea..57c1bdbfaa91 100644
--- a/net/mac802154/driver-ops.h
+++ b/net/mac802154/driver-ops.h
@@ -58,7 +58,7 @@ drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel)
 	return local->ops->set_channel(&local->hw, page, channel);
 }
 
-static inline int drv_set_tx_power(struct ieee802154_local *local, s8 dbm)
+static inline int drv_set_tx_power(struct ieee802154_local *local, s32 dbm)
 {
 	might_sleep();
 
-- 
cgit v1.2.3


From e2eb173aaacd1a1bcd255d3e74ffb719e47eeadb Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Sun, 17 May 2015 21:44:40 +0200
Subject: ieee802154: change transmit power to mbm

This patch change the handling of transmit power level from dbm to mbm.
This prepares to handle floating point transmit power levels values. The
old netlink 802.15.4 will convert the dbm value to mbm for handling
backward compatibility.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/net/ieee802154/at86rf230.c | 3 ++-
 include/net/cfg802154.h            | 1 +
 include/net/mac802154.h            | 4 ++--
 net/ieee802154/nl-mac.c            | 4 ++--
 net/mac802154/driver-ops.h         | 4 ++--
 5 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c
index 02b6bb72304d..3a303e4a3c07 100644
--- a/drivers/net/ieee802154/at86rf230.c
+++ b/drivers/net/ieee802154/at86rf230.c
@@ -1194,9 +1194,10 @@ at86rf230_set_hw_addr_filt(struct ieee802154_hw *hw,
 }
 
 static int
-at86rf230_set_txpower(struct ieee802154_hw *hw, s32 db)
+at86rf230_set_txpower(struct ieee802154_hw *hw, s32 mbm)
 {
 	struct at86rf230_local *lp = hw->priv;
+	s8 db = mbm / 100;
 
 	/* typical maximum output is 5dBm with RG_PHY_TX_PWR 0x60, lower five
 	 * bits decrease power in 1dB steps. 0x60 represents extra PA gain of
diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 47804cddb46f..b5b3f9f43084 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -85,6 +85,7 @@ struct wpan_phy {
 	u8 current_channel;
 	u8 current_page;
 	u32 channels_supported[IEEE802154_MAX_PAGE + 1];
+	/* current transmit_power in mBm */
 	s32 transmit_power;
 	struct wpan_phy_cca cca;
 
diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index 400e4e85c53f..e863a8557c0a 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -171,7 +171,7 @@ struct ieee802154_hw {
  *	  Returns either zero, or negative errno.
  *
  * set_txpower:
- *	  Set radio transmit power in dB. Called with pib_lock held.
+ *	  Set radio transmit power in mBm. Called with pib_lock held.
  *	  Returns either zero, or negative errno.
  *
  * set_lbt
@@ -213,7 +213,7 @@ struct ieee802154_ops {
 	int		(*set_hw_addr_filt)(struct ieee802154_hw *hw,
 					    struct ieee802154_hw_addr_filt *filt,
 					    unsigned long changed);
-	int		(*set_txpower)(struct ieee802154_hw *hw, s32 dbm);
+	int		(*set_txpower)(struct ieee802154_hw *hw, s32 mbm);
 	int		(*set_lbt)(struct ieee802154_hw *hw, bool on);
 	int		(*set_cca_mode)(struct ieee802154_hw *hw,
 					const struct wpan_phy_cca *cca);
diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c
index 2b4955d7aae5..4ba2e13f7a07 100644
--- a/net/ieee802154/nl-mac.c
+++ b/net/ieee802154/nl-mac.c
@@ -117,7 +117,7 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid,
 		rtnl_unlock();
 
 		if (nla_put_s8(msg, IEEE802154_ATTR_TXPOWER,
-			       params.transmit_power) ||
+			       params.transmit_power / 100) ||
 		    nla_put_u8(msg, IEEE802154_ATTR_LBT_ENABLED, params.lbt) ||
 		    nla_put_u8(msg, IEEE802154_ATTR_CCA_MODE,
 			       params.cca.mode) ||
@@ -510,7 +510,7 @@ int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info)
 	ops->get_mac_params(dev, &params);
 
 	if (info->attrs[IEEE802154_ATTR_TXPOWER])
-		params.transmit_power = nla_get_s8(info->attrs[IEEE802154_ATTR_TXPOWER]);
+		params.transmit_power = nla_get_s8(info->attrs[IEEE802154_ATTR_TXPOWER]) * 100;
 
 	if (info->attrs[IEEE802154_ATTR_LBT_ENABLED])
 		params.lbt = nla_get_u8(info->attrs[IEEE802154_ATTR_LBT_ENABLED]);
diff --git a/net/mac802154/driver-ops.h b/net/mac802154/driver-ops.h
index 57c1bdbfaa91..d289ae3f1e93 100644
--- a/net/mac802154/driver-ops.h
+++ b/net/mac802154/driver-ops.h
@@ -58,7 +58,7 @@ drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel)
 	return local->ops->set_channel(&local->hw, page, channel);
 }
 
-static inline int drv_set_tx_power(struct ieee802154_local *local, s32 dbm)
+static inline int drv_set_tx_power(struct ieee802154_local *local, s32 mbm)
 {
 	might_sleep();
 
@@ -67,7 +67,7 @@ static inline int drv_set_tx_power(struct ieee802154_local *local, s32 dbm)
 		return -EOPNOTSUPP;
 	}
 
-	return local->ops->set_txpower(&local->hw, dbm);
+	return local->ops->set_txpower(&local->hw, mbm);
 }
 
 static inline int drv_set_cca_mode(struct ieee802154_local *local,
-- 
cgit v1.2.3


From 32b23550ad64d9676f2218b3d5de46bacf98ef1d Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Sun, 17 May 2015 21:44:41 +0200
Subject: ieee802154: change cca ed level to mbm

This patch change the handling of cca energy detection level from dbm to
mbm. This prepares to handle floating point cca energy detection levels
values. The old netlink 802.15.4 will convert the dbm value to mbm for
handling backward compatibility.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/net/ieee802154/at86rf230.c | 3 ++-
 include/net/cfg802154.h            | 1 +
 include/net/mac802154.h            | 5 ++---
 net/ieee802154/nl-mac.c            | 4 ++--
 net/mac802154/driver-ops.h         | 4 ++--
 5 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c
index 3a303e4a3c07..e68d45ed622e 100644
--- a/drivers/net/ieee802154/at86rf230.c
+++ b/drivers/net/ieee802154/at86rf230.c
@@ -1268,9 +1268,10 @@ at86rf23x_get_desens_steps(struct at86rf230_local *lp, s32 level)
 }
 
 static int
-at86rf230_set_cca_ed_level(struct ieee802154_hw *hw, s32 level)
+at86rf230_set_cca_ed_level(struct ieee802154_hw *hw, s32 mbm)
 {
 	struct at86rf230_local *lp = hw->priv;
+	s32 level = mbm / 100;
 
 	if (level < lp->data->rssi_base_val || level > 30)
 		return -EINVAL;
diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index b5b3f9f43084..9ced2c9fdbfc 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -91,6 +91,7 @@ struct wpan_phy {
 
 	__le64 perm_extended_addr;
 
+	/* current cca ed threshold in mBm */
 	s32 cca_ed_level;
 
 	/* PHY depended MAC PIB values */
diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index e863a8557c0a..71e245605ef8 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -184,7 +184,7 @@ struct ieee802154_hw {
  *	  Returns either zero, or negative errno.
  *
  * set_cca_ed_level
- *	  Sets the CCA energy detection threshold in dBm. Called with pib_lock
+ *	  Sets the CCA energy detection threshold in mBm. Called with pib_lock
  *	  held.
  *	  Returns either zero, or negative errno.
  *
@@ -217,8 +217,7 @@ struct ieee802154_ops {
 	int		(*set_lbt)(struct ieee802154_hw *hw, bool on);
 	int		(*set_cca_mode)(struct ieee802154_hw *hw,
 					const struct wpan_phy_cca *cca);
-	int		(*set_cca_ed_level)(struct ieee802154_hw *hw,
-					    s32 level);
+	int		(*set_cca_ed_level)(struct ieee802154_hw *hw, s32 mbm);
 	int		(*set_csma_params)(struct ieee802154_hw *hw,
 					   u8 min_be, u8 max_be, u8 retries);
 	int		(*set_frame_retries)(struct ieee802154_hw *hw,
diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c
index 4ba2e13f7a07..cdc1cc3543f6 100644
--- a/net/ieee802154/nl-mac.c
+++ b/net/ieee802154/nl-mac.c
@@ -122,7 +122,7 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid,
 		    nla_put_u8(msg, IEEE802154_ATTR_CCA_MODE,
 			       params.cca.mode) ||
 		    nla_put_s32(msg, IEEE802154_ATTR_CCA_ED_LEVEL,
-				params.cca_ed_level) ||
+				params.cca_ed_level / 100) ||
 		    nla_put_u8(msg, IEEE802154_ATTR_CSMA_RETRIES,
 			       params.csma_retries) ||
 		    nla_put_u8(msg, IEEE802154_ATTR_CSMA_MIN_BE,
@@ -519,7 +519,7 @@ int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info)
 		params.cca.mode = nla_get_u8(info->attrs[IEEE802154_ATTR_CCA_MODE]);
 
 	if (info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL])
-		params.cca_ed_level = nla_get_s32(info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]);
+		params.cca_ed_level = nla_get_s32(info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]) * 100;
 
 	if (info->attrs[IEEE802154_ATTR_CSMA_RETRIES])
 		params.csma_retries = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_RETRIES]);
diff --git a/net/mac802154/driver-ops.h b/net/mac802154/driver-ops.h
index d289ae3f1e93..caecd5f43aa7 100644
--- a/net/mac802154/driver-ops.h
+++ b/net/mac802154/driver-ops.h
@@ -96,7 +96,7 @@ static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode)
 }
 
 static inline int
-drv_set_cca_ed_level(struct ieee802154_local *local, s32 ed_level)
+drv_set_cca_ed_level(struct ieee802154_local *local, s32 mbm)
 {
 	might_sleep();
 
@@ -105,7 +105,7 @@ drv_set_cca_ed_level(struct ieee802154_local *local, s32 ed_level)
 		return -EOPNOTSUPP;
 	}
 
-	return local->ops->set_cca_ed_level(&local->hw, ed_level);
+	return local->ops->set_cca_ed_level(&local->hw, mbm);
 }
 
 static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id)
-- 
cgit v1.2.3


From 72f655e44db9c7e835ceba96dc03cbe979d3f80d Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Sun, 17 May 2015 21:44:42 +0200
Subject: ieee802154: introduce wpan_phy_supported

This patch introduce the wpan_phy_supported struct for wpan_phy. There
is currently no way to check if a transceiver can handle IEEE 802.15.4
complaint values. With this struct we can check before if the
transceiver supports these values before sending to driver layer.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Suggested-by: Phoebe Buckheister <phoebe.buckheister@itwm.fraunhofer.de>
Acked-by: Varka Bhadram <varkabhadram@gmail.com>
Cc: Alan Ott <alan@signal11.us>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/net/ieee802154/at86rf230.c |  8 ++++----
 drivers/net/ieee802154/cc2520.c    |  2 +-
 drivers/net/ieee802154/fakelb.c    | 30 +++++++++++++++---------------
 drivers/net/ieee802154/mrf24j40.c  |  2 +-
 include/net/cfg802154.h            |  6 +++++-
 net/ieee802154/nl-phy.c            |  4 ++--
 net/ieee802154/nl802154.c          |  4 ++--
 7 files changed, 30 insertions(+), 26 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c
index e68d45ed622e..151d57f4320d 100644
--- a/drivers/net/ieee802154/at86rf230.c
+++ b/drivers/net/ieee802154/at86rf230.c
@@ -1579,7 +1579,7 @@ at86rf230_detect_device(struct at86rf230_local *lp)
 	case 3:
 		chip = "at86rf231";
 		lp->data = &at86rf231_data;
-		lp->hw->phy->channels_supported[0] = 0x7FFF800;
+		lp->hw->phy->supported.channels[0] = 0x7FFF800;
 		lp->hw->phy->current_channel = 11;
 		lp->hw->phy->symbol_duration = 16;
 		break;
@@ -1587,15 +1587,15 @@ at86rf230_detect_device(struct at86rf230_local *lp)
 		chip = "at86rf212";
 		lp->data = &at86rf212_data;
 		lp->hw->flags |= IEEE802154_HW_LBT;
-		lp->hw->phy->channels_supported[0] = 0x00007FF;
-		lp->hw->phy->channels_supported[2] = 0x00007FF;
+		lp->hw->phy->supported.channels[0] = 0x00007FF;
+		lp->hw->phy->supported.channels[2] = 0x00007FF;
 		lp->hw->phy->current_channel = 5;
 		lp->hw->phy->symbol_duration = 25;
 		break;
 	case 11:
 		chip = "at86rf233";
 		lp->data = &at86rf233_data;
-		lp->hw->phy->channels_supported[0] = 0x7FFF800;
+		lp->hw->phy->supported.channels[0] = 0x7FFF800;
 		lp->hw->phy->current_channel = 13;
 		lp->hw->phy->symbol_duration = 16;
 		break;
diff --git a/drivers/net/ieee802154/cc2520.c b/drivers/net/ieee802154/cc2520.c
index f833b8bb6663..84b28a05c5a1 100644
--- a/drivers/net/ieee802154/cc2520.c
+++ b/drivers/net/ieee802154/cc2520.c
@@ -653,7 +653,7 @@ static int cc2520_register(struct cc2520_private *priv)
 	ieee802154_random_extended_addr(&priv->hw->phy->perm_extended_addr);
 
 	/* We do support only 2.4 Ghz */
-	priv->hw->phy->channels_supported[0] = 0x7FFF800;
+	priv->hw->phy->supported.channels[0] = 0x7FFF800;
 	priv->hw->flags = IEEE802154_HW_OMIT_CKSUM | IEEE802154_HW_AACK |
 			  IEEE802154_HW_AFILT;
 
diff --git a/drivers/net/ieee802154/fakelb.c b/drivers/net/ieee802154/fakelb.c
index dc2bfb600b4b..91bbf030a443 100644
--- a/drivers/net/ieee802154/fakelb.c
+++ b/drivers/net/ieee802154/fakelb.c
@@ -149,35 +149,35 @@ static int fakelb_add_one(struct device *dev, struct fakelb_priv *fake)
 	priv->hw = hw;
 
 	/* 868 MHz BPSK	802.15.4-2003 */
-	hw->phy->channels_supported[0] |= 1;
+	hw->phy->supported.channels[0] |= 1;
 	/* 915 MHz BPSK	802.15.4-2003 */
-	hw->phy->channels_supported[0] |= 0x7fe;
+	hw->phy->supported.channels[0] |= 0x7fe;
 	/* 2.4 GHz O-QPSK 802.15.4-2003 */
-	hw->phy->channels_supported[0] |= 0x7FFF800;
+	hw->phy->supported.channels[0] |= 0x7FFF800;
 	/* 868 MHz ASK 802.15.4-2006 */
-	hw->phy->channels_supported[1] |= 1;
+	hw->phy->supported.channels[1] |= 1;
 	/* 915 MHz ASK 802.15.4-2006 */
-	hw->phy->channels_supported[1] |= 0x7fe;
+	hw->phy->supported.channels[1] |= 0x7fe;
 	/* 868 MHz O-QPSK 802.15.4-2006 */
-	hw->phy->channels_supported[2] |= 1;
+	hw->phy->supported.channels[2] |= 1;
 	/* 915 MHz O-QPSK 802.15.4-2006 */
-	hw->phy->channels_supported[2] |= 0x7fe;
+	hw->phy->supported.channels[2] |= 0x7fe;
 	/* 2.4 GHz CSS 802.15.4a-2007 */
-	hw->phy->channels_supported[3] |= 0x3fff;
+	hw->phy->supported.channels[3] |= 0x3fff;
 	/* UWB Sub-gigahertz 802.15.4a-2007 */
-	hw->phy->channels_supported[4] |= 1;
+	hw->phy->supported.channels[4] |= 1;
 	/* UWB Low band 802.15.4a-2007 */
-	hw->phy->channels_supported[4] |= 0x1e;
+	hw->phy->supported.channels[4] |= 0x1e;
 	/* UWB High band 802.15.4a-2007 */
-	hw->phy->channels_supported[4] |= 0xffe0;
+	hw->phy->supported.channels[4] |= 0xffe0;
 	/* 750 MHz O-QPSK 802.15.4c-2009 */
-	hw->phy->channels_supported[5] |= 0xf;
+	hw->phy->supported.channels[5] |= 0xf;
 	/* 750 MHz MPSK 802.15.4c-2009 */
-	hw->phy->channels_supported[5] |= 0xf0;
+	hw->phy->supported.channels[5] |= 0xf0;
 	/* 950 MHz BPSK 802.15.4d-2009 */
-	hw->phy->channels_supported[6] |= 0x3ff;
+	hw->phy->supported.channels[6] |= 0x3ff;
 	/* 950 MHz GFSK 802.15.4d-2009 */
-	hw->phy->channels_supported[6] |= 0x3ffc00;
+	hw->phy->supported.channels[6] |= 0x3ffc00;
 
 	INIT_LIST_HEAD(&priv->list);
 	priv->fake = fake;
diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c
index fba2dfd910f7..f2a1bd122a74 100644
--- a/drivers/net/ieee802154/mrf24j40.c
+++ b/drivers/net/ieee802154/mrf24j40.c
@@ -750,7 +750,7 @@ static int mrf24j40_probe(struct spi_device *spi)
 
 	devrec->hw->priv = devrec;
 	devrec->hw->parent = &devrec->spi->dev;
-	devrec->hw->phy->channels_supported[0] = CHANNEL_MASK;
+	devrec->hw->phy->supported.channels[0] = CHANNEL_MASK;
 	devrec->hw->flags = IEEE802154_HW_OMIT_CKSUM | IEEE802154_HW_AACK |
 			    IEEE802154_HW_AFILT;
 
diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 9ced2c9fdbfc..1941d7a79219 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -61,6 +61,10 @@ struct cfg802154_ops {
 				struct wpan_dev *wpan_dev, bool mode);
 };
 
+struct wpan_phy_supported {
+	u32 channels[IEEE802154_MAX_PAGE + 1];
+};
+
 struct wpan_phy_cca {
 	enum nl802154_cca_modes mode;
 	enum nl802154_cca_opts opt;
@@ -84,7 +88,7 @@ struct wpan_phy {
 	 */
 	u8 current_channel;
 	u8 current_page;
-	u32 channels_supported[IEEE802154_MAX_PAGE + 1];
+	struct wpan_phy_supported supported;
 	/* current transmit_power in mBm */
 	s32 transmit_power;
 	struct wpan_phy_cca cca;
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index 346c6665d25e..cbc0d09351e0 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -56,8 +56,8 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid,
 	    nla_put_u8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel))
 		goto nla_put_failure;
 	for (i = 0; i < 32; i++) {
-		if (phy->channels_supported[i])
-			buf[pages++] = phy->channels_supported[i] | (i << 27);
+		if (phy->supported.channels[i])
+			buf[pages++] = phy->supported.channels[i] | (i << 27);
 	}
 	if (pages &&
 	    nla_put(msg, IEEE802154_ATTR_CHANNEL_PAGE_LIST,
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index f3185c71af10..d271879a2174 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -248,7 +248,7 @@ nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev,
 
 	for (page = 0; page <= IEEE802154_MAX_PAGE; page++) {
 		if (nla_put_u32(msg, NL802154_ATTR_SUPPORTED_CHANNEL,
-				rdev->wpan_phy.channels_supported[page]))
+				rdev->wpan_phy.supported.channels[page]))
 			return -ENOBUFS;
 	}
 	nla_nest_end(msg, nl_page);
@@ -626,7 +626,7 @@ static int nl802154_set_channel(struct sk_buff *skb, struct genl_info *info)
 
 	/* check 802.15.4 constraints */
 	if (page > IEEE802154_MAX_PAGE || channel > IEEE802154_MAX_CHANNEL ||
-	    !(rdev->wpan_phy.channels_supported[page] & BIT(channel)))
+	    !(rdev->wpan_phy.supported.channels[page] & BIT(channel)))
 		return -EINVAL;
 
 	return rdev_set_channel(rdev, page, channel);
-- 
cgit v1.2.3


From fea3318d20776a94afeea0460c6ee9904e60569e Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Sun, 17 May 2015 21:44:43 +0200
Subject: ieee802154: add several phy supported handling

This patch adds support for phy supported handling for all other already
existing handling 802.15.4 functionality. We assume now a fully 802.15.4
complaint transceiver at phy allocation. If a transceiver can support
802.15.4 default values only, then the values should be overwirtten by
values the transceiver supports. If the transceiver doesn't set the
according hardware flags, we assume the 802.15.4 defaults now which
cannot be changed.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Suggested-by: Phoebe Buckheister <phoebe.buckheister@itwm.fraunhofer.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/cfg802154.h   | 26 +++++++++++++++++++++++++-
 include/net/nl802154.h    | 22 ++++++++++++++++++++++
 net/ieee802154/nl802154.c | 22 +++++++++++++++++-----
 net/mac802154/main.c      | 26 ++++++++++++++++++++++++++
 4 files changed, 90 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 1941d7a79219..23abd08a310a 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -61,8 +61,32 @@ struct cfg802154_ops {
 				struct wpan_dev *wpan_dev, bool mode);
 };
 
+static inline bool
+wpan_phy_supported_bool(bool b, enum nl802154_supported_bool_states st)
+{
+	switch (st) {
+	case NL802154_SUPPORTED_BOOL_TRUE:
+		return b;
+	case NL802154_SUPPORTED_BOOL_FALSE:
+		return !b;
+	case NL802154_SUPPORTED_BOOL_BOTH:
+		return true;
+	default:
+		WARN_ON(1);
+	}
+
+	return false;
+}
+
 struct wpan_phy_supported {
-	u32 channels[IEEE802154_MAX_PAGE + 1];
+	u32 channels[IEEE802154_MAX_PAGE + 1],
+	    cca_modes, cca_opts;
+	enum nl802154_supported_bool_states lbt;
+	u8 min_minbe, max_minbe, min_maxbe, max_maxbe,
+	   min_csma_backoffs, max_csma_backoffs;
+	s8 min_frame_retries, max_frame_retries;
+	size_t tx_powers_size, cca_ed_levels_size;
+	const s32 *tx_powers, *cca_ed_levels;
 };
 
 struct wpan_phy_cca {
diff --git a/include/net/nl802154.h b/include/net/nl802154.h
index f8b5bc997959..055277156eef 100644
--- a/include/net/nl802154.h
+++ b/include/net/nl802154.h
@@ -162,4 +162,26 @@ enum nl802154_cca_opts {
 	NL802154_CCA_OPT_ATTR_MAX = __NL802154_CCA_OPT_ATTR_AFTER_LAST - 1
 };
 
+/**
+ * enum nl802154_supported_bool_states - bool states for bool capability entry
+ *
+ * @NL802154_SUPPORTED_BOOL_FALSE: indicates to set false
+ * @NL802154_SUPPORTED_BOOL_TRUE: indicates to set true
+ * @__NL802154_SUPPORTED_BOOL_INVALD: reserved
+ * @NL802154_SUPPORTED_BOOL_BOTH: indicates to set true and false
+ * @__NL802154_SUPPORTED_BOOL_AFTER_LAST: Internal
+ * @NL802154_SUPPORTED_BOOL_MAX: highest value for bool states
+ */
+enum nl802154_supported_bool_states {
+	NL802154_SUPPORTED_BOOL_FALSE,
+	NL802154_SUPPORTED_BOOL_TRUE,
+	/* to handle them in a mask */
+	__NL802154_SUPPORTED_BOOL_INVALD,
+	NL802154_SUPPORTED_BOOL_BOTH,
+
+	/* keep last */
+	__NL802154_SUPPORTED_BOOL_AFTER_LAST,
+	NL802154_SUPPORTED_BOOL_MAX = __NL802154_SUPPORTED_BOOL_AFTER_LAST - 1
+};
+
 #endif /* __NL802154_H */
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index d271879a2174..fa0c4048b0e8 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -642,7 +642,9 @@ static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info)
 
 	cca.mode = nla_get_u32(info->attrs[NL802154_ATTR_CCA_MODE]);
 	/* checking 802.15.4 constraints */
-	if (cca.mode < NL802154_CCA_ENERGY || cca.mode > NL802154_CCA_ATTR_MAX)
+	if (cca.mode < NL802154_CCA_ENERGY ||
+	    cca.mode > NL802154_CCA_ATTR_MAX ||
+	    !(rdev->wpan_phy.supported.cca_modes & BIT(cca.mode)))
 		return -EINVAL;
 
 	if (cca.mode == NL802154_CCA_ENERGY_CARRIER) {
@@ -650,7 +652,8 @@ static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info)
 			return -EINVAL;
 
 		cca.opt = nla_get_u32(info->attrs[NL802154_ATTR_CCA_OPT]);
-		if (cca.opt > NL802154_CCA_OPT_ATTR_MAX)
+		if (cca.opt > NL802154_CCA_OPT_ATTR_MAX ||
+		    !(rdev->wpan_phy.supported.cca_opts & BIT(cca.opt)))
 			return -EINVAL;
 	}
 
@@ -744,7 +747,11 @@ nl802154_set_backoff_exponent(struct sk_buff *skb, struct genl_info *info)
 	max_be = nla_get_u8(info->attrs[NL802154_ATTR_MAX_BE]);
 
 	/* check 802.15.4 constraints */
-	if (max_be < 3 || max_be > 8 || min_be > max_be)
+	if (min_be < rdev->wpan_phy.supported.min_minbe ||
+	    min_be > rdev->wpan_phy.supported.max_minbe ||
+	    max_be < rdev->wpan_phy.supported.min_maxbe ||
+	    max_be > rdev->wpan_phy.supported.max_maxbe ||
+	    min_be > max_be)
 		return -EINVAL;
 
 	return rdev_set_backoff_exponent(rdev, wpan_dev, min_be, max_be);
@@ -769,7 +776,8 @@ nl802154_set_max_csma_backoffs(struct sk_buff *skb, struct genl_info *info)
 			info->attrs[NL802154_ATTR_MAX_CSMA_BACKOFFS]);
 
 	/* check 802.15.4 constraints */
-	if (max_csma_backoffs > 5)
+	if (max_csma_backoffs < rdev->wpan_phy.supported.min_csma_backoffs ||
+	    max_csma_backoffs > rdev->wpan_phy.supported.max_csma_backoffs)
 		return -EINVAL;
 
 	return rdev_set_max_csma_backoffs(rdev, wpan_dev, max_csma_backoffs);
@@ -793,7 +801,8 @@ nl802154_set_max_frame_retries(struct sk_buff *skb, struct genl_info *info)
 			info->attrs[NL802154_ATTR_MAX_FRAME_RETRIES]);
 
 	/* check 802.15.4 constraints */
-	if (max_frame_retries < -1 || max_frame_retries > 7)
+	if (max_frame_retries < rdev->wpan_phy.supported.min_frame_retries ||
+	    max_frame_retries > rdev->wpan_phy.supported.max_frame_retries)
 		return -EINVAL;
 
 	return rdev_set_max_frame_retries(rdev, wpan_dev, max_frame_retries);
@@ -813,6 +822,9 @@ static int nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info)
 		return -EINVAL;
 
 	mode = !!nla_get_u8(info->attrs[NL802154_ATTR_LBT_MODE]);
+	if (!wpan_phy_supported_bool(mode, rdev->wpan_phy.supported.lbt))
+		return -EINVAL;
+
 	return rdev_set_lbt_mode(rdev, wpan_dev, mode);
 }
 
diff --git a/net/mac802154/main.c b/net/mac802154/main.c
index 08cb32dc8fd3..ddcd6ff8d39c 100644
--- a/net/mac802154/main.c
+++ b/net/mac802154/main.c
@@ -107,6 +107,15 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops)
 
 	skb_queue_head_init(&local->skb_queue);
 
+	/* init supported flags with 802.15.4 default ranges */
+	phy->supported.max_minbe = 8;
+	phy->supported.min_maxbe = 3;
+	phy->supported.max_maxbe = 8;
+	phy->supported.min_frame_retries = -1;
+	phy->supported.max_frame_retries = 7;
+	phy->supported.max_csma_backoffs = 5;
+	phy->supported.lbt = NL802154_SUPPORTED_BOOL_FALSE;
+
 	return &local->hw;
 }
 EXPORT_SYMBOL(ieee802154_alloc_hw);
@@ -155,6 +164,23 @@ int ieee802154_register_hw(struct ieee802154_hw *hw)
 
 	ieee802154_setup_wpan_phy_pib(local->phy);
 
+	if (!(hw->flags & IEEE802154_HW_CSMA_PARAMS)) {
+		local->phy->supported.min_csma_backoffs = 4;
+		local->phy->supported.max_csma_backoffs = 4;
+		local->phy->supported.min_maxbe = 5;
+		local->phy->supported.max_maxbe = 5;
+		local->phy->supported.min_minbe = 3;
+		local->phy->supported.max_minbe = 3;
+	}
+
+	if (!(hw->flags & IEEE802154_HW_FRAME_RETRIES)) {
+		/* TODO should be 3, but our default value is -1 which means
+		 * no ARET handling.
+		 */
+		local->phy->supported.min_frame_retries = -1;
+		local->phy->supported.max_frame_retries = -1;
+	}
+
 	rc = wpan_phy_register(local->phy);
 	if (rc < 0)
 		goto out_wq;
-- 
cgit v1.2.3


From 791021bf13ec9d0fc14bfd8c9c4b368ace568239 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Sun, 17 May 2015 21:44:44 +0200
Subject: mac802154: check for really changes

This patch adds check if the value is really changed inside pib/mib.
If a transceiver do support only one value for e.g. max_be then this
will also handle that the driver layer doesn't need to care about
handling to set one value only.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/cfg802154.h | 12 ++++++++++++
 net/mac802154/cfg.c     | 26 ++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 23abd08a310a..37abc1603285 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -94,6 +94,18 @@ struct wpan_phy_cca {
 	enum nl802154_cca_opts opt;
 };
 
+static inline bool
+wpan_phy_cca_cmp(const struct wpan_phy_cca *a, const struct wpan_phy_cca *b)
+{
+	if (a->mode != b->mode)
+		return false;
+
+	if (a->mode == NL802154_CCA_ENERGY_CARRIER)
+		return a->opt == b->opt;
+
+	return true;
+}
+
 struct wpan_phy {
 	struct mutex pib_lock;
 
diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c
index c63601582c71..45c4dc39766e 100644
--- a/net/mac802154/cfg.c
+++ b/net/mac802154/cfg.c
@@ -73,6 +73,10 @@ ieee802154_set_channel(struct wpan_phy *wpan_phy, u8 page, u8 channel)
 
 	ASSERT_RTNL();
 
+	if (wpan_phy->current_page == page &&
+	    wpan_phy->current_channel == channel)
+		return 0;
+
 	ret = drv_set_channel(local, page, channel);
 	if (!ret) {
 		wpan_phy->current_page = page;
@@ -91,6 +95,9 @@ ieee802154_set_cca_mode(struct wpan_phy *wpan_phy,
 
 	ASSERT_RTNL();
 
+	if (wpan_phy_cca_cmp(&wpan_phy->cca, cca))
+		return 0;
+
 	/* check if phy support this setting */
 	if (!(local->hw.flags & IEEE802154_HW_CCA_MODE))
 		return -EOPNOTSUPP;
@@ -108,6 +115,9 @@ ieee802154_set_pan_id(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
 {
 	ASSERT_RTNL();
 
+	if (wpan_dev->pan_id == pan_id)
+		return 0;
+
 	wpan_dev->pan_id = pan_id;
 	return 0;
 }
@@ -121,6 +131,10 @@ ieee802154_set_backoff_exponent(struct wpan_phy *wpan_phy,
 
 	ASSERT_RTNL();
 
+	if (wpan_dev->min_be == min_be &&
+	    wpan_dev->max_be == max_be)
+		return 0;
+
 	if (!(local->hw.flags & IEEE802154_HW_CSMA_PARAMS))
 		return -EOPNOTSUPP;
 
@@ -135,6 +149,9 @@ ieee802154_set_short_addr(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
 {
 	ASSERT_RTNL();
 
+	if (wpan_dev->short_addr == short_addr)
+		return 0;
+
 	wpan_dev->short_addr = short_addr;
 	return 0;
 }
@@ -148,6 +165,9 @@ ieee802154_set_max_csma_backoffs(struct wpan_phy *wpan_phy,
 
 	ASSERT_RTNL();
 
+	if (wpan_dev->csma_retries == max_csma_backoffs)
+		return 0;
+
 	if (!(local->hw.flags & IEEE802154_HW_CSMA_PARAMS))
 		return -EOPNOTSUPP;
 
@@ -164,6 +184,9 @@ ieee802154_set_max_frame_retries(struct wpan_phy *wpan_phy,
 
 	ASSERT_RTNL();
 
+	if (wpan_dev->frame_retries == max_frame_retries)
+		return 0;
+
 	if (!(local->hw.flags & IEEE802154_HW_FRAME_RETRIES))
 		return -EOPNOTSUPP;
 
@@ -179,6 +202,9 @@ ieee802154_set_lbt_mode(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
 
 	ASSERT_RTNL();
 
+	if (wpan_dev->lbt == mode)
+		return 0;
+
 	if (!(local->hw.flags & IEEE802154_HW_LBT))
 		return -EOPNOTSUPP;
 
-- 
cgit v1.2.3


From edea8f7c75ec6c238130bd7e74d9f6f4c26e97b0 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Sun, 17 May 2015 21:44:46 +0200
Subject: cfg802154: introduce wpan phy flags

This patch introduce a flag property for the wpan phy structure.
The current flag settings in ieee802154_hw are accessable in mac802154
layer only which is okay for flags which indicates MAC handling which
are done by phy. For real PHY layer settings like cca mode, transmit
power, cca energy detection level.

The difference between these flags are that the MAC handling flags are
only handled in mac802154/HardMac layer e.g. on an interface up. The phy
settings are direct netlink calls from nl802154 into the driver layer
and the nl802154 need to have a chance to check if the driver supports
this handling before sending to the next layer.

We also check now on PHY flags while dumping and setting pib attributes.
In comparing with MIB attributes the 802.15.4 gives us an default value
which we assume when a transceiver implement less functionality. In case
of MIB settings the nl802154 layer doesn't need to check on the
ieee802154_hw flags then.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/net/ieee802154/at86rf230.c |  9 +++++++--
 include/net/cfg802154.h            | 16 ++++++++++++++++
 include/net/mac802154.h            | 29 +++++++----------------------
 net/ieee802154/nl802154.c          | 27 +++++++++++++++++----------
 net/mac802154/mac_cmd.c            |  6 +++---
 5 files changed, 50 insertions(+), 37 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c
index 151d57f4320d..b0ffd1cd0010 100644
--- a/drivers/net/ieee802154/at86rf230.c
+++ b/drivers/net/ieee802154/at86rf230.c
@@ -1566,8 +1566,13 @@ at86rf230_detect_device(struct at86rf230_local *lp)
 	}
 
 	lp->hw->flags = IEEE802154_HW_TX_OMIT_CKSUM | IEEE802154_HW_AACK |
-			IEEE802154_HW_TXPOWER | IEEE802154_HW_ARET |
-			IEEE802154_HW_AFILT | IEEE802154_HW_PROMISCUOUS;
+			IEEE802154_HW_CSMA_PARAMS |
+			IEEE802154_HW_FRAME_RETRIES | IEEE802154_HW_AFILT |
+			IEEE802154_HW_PROMISCUOUS;
+
+	lp->hw->phy->flags = WPAN_PHY_FLAG_TXPOWER |
+			     WPAN_PHY_FLAG_CCA_ED_LEVEL |
+			     WPAN_PHY_FLAG_CCA_MODE;
 
 	lp->hw->phy->cca.mode = NL802154_CCA_ENERGY;
 
diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 37abc1603285..a12c6c5247e9 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -106,6 +106,20 @@ wpan_phy_cca_cmp(const struct wpan_phy_cca *a, const struct wpan_phy_cca *b)
 	return true;
 }
 
+/**
+ * @WPAN_PHY_FLAG_TRANSMIT_POWER: Indicates that transceiver will support
+ *	transmit power setting.
+ * @WPAN_PHY_FLAG_CCA_ED_LEVEL: Indicates that transceiver will support cca ed
+ *	level setting.
+ * @WPAN_PHY_FLAG_CCA_MODE: Indicates that transceiver will support cca mode
+ *	setting.
+ */
+enum wpan_phy_flags {
+	WPAN_PHY_FLAG_TXPOWER		= BIT(1),
+	WPAN_PHY_FLAG_CCA_ED_LEVEL	= BIT(2),
+	WPAN_PHY_FLAG_CCA_MODE		= BIT(3),
+};
+
 struct wpan_phy {
 	struct mutex pib_lock;
 
@@ -117,6 +131,8 @@ struct wpan_phy {
 	 */
 	const void *privid;
 
+	u32 flags;
+
 	/*
 	 * This is a PIB according to 802.15.4-2011.
 	 * We do not provide timing-related variables, as they
diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index 71e245605ef8..9605c7f7453f 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -89,41 +89,26 @@ struct ieee802154_hw {
 #define IEEE802154_HW_TX_OMIT_CKSUM	0x00000001
 /* Indicates that receiver will autorespond with ACK frames. */
 #define IEEE802154_HW_AACK		0x00000002
-/* Indicates that transceiver will support transmit power setting. */
-#define IEEE802154_HW_TXPOWER		0x00000004
 /* Indicates that transceiver will support listen before transmit. */
-#define IEEE802154_HW_LBT		0x00000008
-/* Indicates that transceiver will support cca mode setting. */
-#define IEEE802154_HW_CCA_MODE		0x00000010
-/* Indicates that transceiver will support cca ed level setting. */
-#define IEEE802154_HW_CCA_ED_LEVEL	0x00000020
+#define IEEE802154_HW_LBT		0x00000004
 /* Indicates that transceiver will support csma (max_be, min_be, csma retries)
  * settings. */
-#define IEEE802154_HW_CSMA_PARAMS	0x00000040
+#define IEEE802154_HW_CSMA_PARAMS	0x00000008
 /* Indicates that transceiver will support ARET frame retries setting. */
-#define IEEE802154_HW_FRAME_RETRIES	0x00000080
+#define IEEE802154_HW_FRAME_RETRIES	0x00000010
 /* Indicates that transceiver will support hardware address filter setting. */
-#define IEEE802154_HW_AFILT		0x00000100
+#define IEEE802154_HW_AFILT		0x00000020
 /* Indicates that transceiver will support promiscuous mode setting. */
-#define IEEE802154_HW_PROMISCUOUS	0x00000200
+#define IEEE802154_HW_PROMISCUOUS	0x00000040
 /* Indicates that receiver omits FCS. */
-#define IEEE802154_HW_RX_OMIT_CKSUM	0x00000400
+#define IEEE802154_HW_RX_OMIT_CKSUM	0x00000080
 /* Indicates that receiver will not filter frames with bad checksum. */
-#define IEEE802154_HW_RX_DROP_BAD_CKSUM	0x00000800
+#define IEEE802154_HW_RX_DROP_BAD_CKSUM	0x00000100
 
 /* Indicates that receiver omits FCS and xmitter will add FCS on it's own. */
 #define IEEE802154_HW_OMIT_CKSUM	(IEEE802154_HW_TX_OMIT_CKSUM | \
 					 IEEE802154_HW_RX_OMIT_CKSUM)
 
-/* This groups the most common CSMA support fields into one. */
-#define IEEE802154_HW_CSMA		(IEEE802154_HW_CCA_MODE | \
-					 IEEE802154_HW_CCA_ED_LEVEL | \
-					 IEEE802154_HW_CSMA_PARAMS)
-
-/* This groups the most common ARET support fields into one. */
-#define IEEE802154_HW_ARET		(IEEE802154_HW_CSMA | \
-					 IEEE802154_HW_FRAME_RETRIES)
-
 /* struct ieee802154_ops - callbacks from mac802154 to the driver
  *
  * This structure contains various callbacks that the driver may
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index fa0c4048b0e8..40fb0be009b2 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -291,19 +291,23 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev,
 		goto nla_put_failure;
 
 	/* cca mode */
-	if (nla_put_u32(msg, NL802154_ATTR_CCA_MODE,
-			rdev->wpan_phy.cca.mode))
-		goto nla_put_failure;
-
-	if (rdev->wpan_phy.cca.mode == NL802154_CCA_ENERGY_CARRIER) {
-		if (nla_put_u32(msg, NL802154_ATTR_CCA_OPT,
-				rdev->wpan_phy.cca.opt))
+	if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) {
+		if (nla_put_u32(msg, NL802154_ATTR_CCA_MODE,
+				rdev->wpan_phy.cca.mode))
 			goto nla_put_failure;
+
+		if (rdev->wpan_phy.cca.mode == NL802154_CCA_ENERGY_CARRIER) {
+			if (nla_put_u32(msg, NL802154_ATTR_CCA_OPT,
+					rdev->wpan_phy.cca.opt))
+				goto nla_put_failure;
+		}
 	}
 
-	if (nla_put_s32(msg, NL802154_ATTR_TX_POWER,
-			rdev->wpan_phy.transmit_power))
-		goto nla_put_failure;
+	if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) {
+		if (nla_put_s32(msg, NL802154_ATTR_TX_POWER,
+				rdev->wpan_phy.transmit_power))
+			goto nla_put_failure;
+	}
 
 finish:
 	genlmsg_end(msg, hdr);
@@ -637,6 +641,9 @@ static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info)
 	struct cfg802154_registered_device *rdev = info->user_ptr[0];
 	struct wpan_phy_cca cca;
 
+	if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE)
+		return -EOPNOTSUPP;
+
 	if (!info->attrs[NL802154_ATTR_CCA_MODE])
 		return -EINVAL;
 
diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c
index bdccb4ecd30f..6dcbb3b5994c 100644
--- a/net/mac802154/mac_cmd.c
+++ b/net/mac802154/mac_cmd.c
@@ -91,19 +91,19 @@ static int mac802154_set_mac_params(struct net_device *dev,
 	wpan_dev->frame_retries = params->frame_retries;
 	wpan_dev->lbt = params->lbt;
 
-	if (local->hw.flags & IEEE802154_HW_TXPOWER) {
+	if (local->hw.phy->flags & WPAN_PHY_FLAG_TXPOWER) {
 		ret = drv_set_tx_power(local, params->transmit_power);
 		if (ret < 0)
 			return ret;
 	}
 
-	if (local->hw.flags & IEEE802154_HW_CCA_MODE) {
+	if (local->hw.phy->flags & WPAN_PHY_FLAG_CCA_MODE) {
 		ret = drv_set_cca_mode(local, &params->cca);
 		if (ret < 0)
 			return ret;
 	}
 
-	if (local->hw.flags & IEEE802154_HW_CCA_ED_LEVEL) {
+	if (local->hw.phy->flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) {
 		ret = drv_set_cca_ed_level(local, params->cca_ed_level);
 		if (ret < 0)
 			return ret;
-- 
cgit v1.2.3


From 65318680c97cca15e3678148b3a5acaa33e991ec Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Sun, 17 May 2015 21:44:47 +0200
Subject: ieee802154: add iftypes capability

This patch adds capability flags for supported interface types.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/cfg802154.h   | 2 +-
 net/ieee802154/nl802154.c | 3 ++-
 net/mac802154/main.c      | 6 ++++++
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index a12c6c5247e9..11bbf17ad865 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -80,7 +80,7 @@ wpan_phy_supported_bool(bool b, enum nl802154_supported_bool_states st)
 
 struct wpan_phy_supported {
 	u32 channels[IEEE802154_MAX_PAGE + 1],
-	    cca_modes, cca_opts;
+	    cca_modes, cca_opts, iftypes;
 	enum nl802154_supported_bool_states lbt;
 	u8 min_minbe, max_minbe, min_maxbe, max_maxbe,
 	   min_csma_backoffs, max_csma_backoffs;
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index 40fb0be009b2..3afb20e43ff8 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -579,7 +579,8 @@ static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info)
 
 	if (info->attrs[NL802154_ATTR_IFTYPE]) {
 		type = nla_get_u32(info->attrs[NL802154_ATTR_IFTYPE]);
-		if (type > NL802154_IFTYPE_MAX)
+		if (type > NL802154_IFTYPE_MAX ||
+		    !(rdev->wpan_phy.supported.iftypes & BIT(type)))
 			return -EINVAL;
 	}
 
diff --git a/net/mac802154/main.c b/net/mac802154/main.c
index ddcd6ff8d39c..356b346e1ee8 100644
--- a/net/mac802154/main.c
+++ b/net/mac802154/main.c
@@ -116,6 +116,9 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops)
 	phy->supported.max_csma_backoffs = 5;
 	phy->supported.lbt = NL802154_SUPPORTED_BOOL_FALSE;
 
+	/* always supported */
+	phy->supported.iftypes = BIT(NL802154_IFTYPE_NODE);
+
 	return &local->hw;
 }
 EXPORT_SYMBOL(ieee802154_alloc_hw);
@@ -181,6 +184,9 @@ int ieee802154_register_hw(struct ieee802154_hw *hw)
 		local->phy->supported.max_frame_retries = -1;
 	}
 
+	if (hw->flags & IEEE802154_HW_PROMISCUOUS)
+		local->phy->supported.iftypes |= BIT(NL802154_IFTYPE_MONITOR);
+
 	rc = wpan_phy_register(local->phy);
 	if (rc < 0)
 		goto out_wq;
-- 
cgit v1.2.3


From 0e66545701014814360b08a7a43ca652f82b6e5a Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Sun, 17 May 2015 21:44:53 +0200
Subject: nl802154: add support for dump phy capabilities

This patch add support to nl802154 to dump all phy capabilities which is
inside the wpan_phy_supported struct. Also we introduce a new method to
dumping supported channels. The new method will offer a easier interface
and has lesser netlink traffic.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/nl802154.h    |  57 ++++++++++++++++++++++
 net/ieee802154/nl802154.c | 117 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 173 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/nl802154.h b/include/net/nl802154.h
index 055277156eef..0badebd1de7f 100644
--- a/include/net/nl802154.h
+++ b/include/net/nl802154.h
@@ -100,6 +100,8 @@ enum nl802154_attrs {
 
 	NL802154_ATTR_EXTENDED_ADDR,
 
+	NL802154_ATTR_WPAN_PHY_CAPS,
+
 	/* add attributes here, update the policy in nl802154.c */
 
 	__NL802154_ATTR_AFTER_LAST,
@@ -119,6 +121,61 @@ enum nl802154_iftype {
 	NL802154_IFTYPE_MAX = NUM_NL802154_IFTYPES - 1
 };
 
+/**
+ * enum nl802154_wpan_phy_capability_attr - wpan phy capability attributes
+ *
+ * @__NL802154_CAP_ATTR_INVALID: attribute number 0 is reserved
+ * @NL802154_CAP_ATTR_CHANNELS: a nested attribute for nl802154_channel_attr
+ * @NL802154_CAP_ATTR_TX_POWERS: a nested attribute for
+ *	nl802154_wpan_phy_tx_power
+ * @NL802154_CAP_ATTR_MIN_CCA_ED_LEVEL: minimum value for cca_ed_level
+ * @NL802154_CAP_ATTR_MAX_CCA_ED_LEVEL: maxmimum value for cca_ed_level
+ * @NL802154_CAP_ATTR_CCA_MODES: nl802154_cca_modes flags
+ * @NL802154_CAP_ATTR_CCA_OPTS: nl802154_cca_opts flags
+ * @NL802154_CAP_ATTR_MIN_MINBE: minimum of minbe value
+ * @NL802154_CAP_ATTR_MAX_MINBE: maximum of minbe value
+ * @NL802154_CAP_ATTR_MIN_MAXBE: minimum of maxbe value
+ * @NL802154_CAP_ATTR_MAX_MINBE: maximum of maxbe value
+ * @NL802154_CAP_ATTR_MIN_CSMA_BACKOFFS: minimum of csma backoff value
+ * @NL802154_CAP_ATTR_MAX_CSMA_BACKOFFS: maximum of csma backoffs value
+ * @NL802154_CAP_ATTR_MIN_FRAME_RETRIES: minimum of frame retries value
+ * @NL802154_CAP_ATTR_MAX_FRAME_RETRIES: maximum of frame retries value
+ * @NL802154_CAP_ATTR_IFTYPES: nl802154_iftype flags
+ * @NL802154_CAP_ATTR_LBT: nl802154_supported_bool_states flags
+ * @NL802154_CAP_ATTR_MAX: highest cap attribute currently defined
+ * @__NL802154_CAP_ATTR_AFTER_LAST: internal use
+ */
+enum nl802154_wpan_phy_capability_attr {
+	__NL802154_CAP_ATTR_INVALID,
+
+	NL802154_CAP_ATTR_IFTYPES,
+
+	NL802154_CAP_ATTR_CHANNELS,
+	NL802154_CAP_ATTR_TX_POWERS,
+
+	NL802154_CAP_ATTR_CCA_ED_LEVELS,
+	NL802154_CAP_ATTR_CCA_MODES,
+	NL802154_CAP_ATTR_CCA_OPTS,
+
+	NL802154_CAP_ATTR_MIN_MINBE,
+	NL802154_CAP_ATTR_MAX_MINBE,
+
+	NL802154_CAP_ATTR_MIN_MAXBE,
+	NL802154_CAP_ATTR_MAX_MAXBE,
+
+	NL802154_CAP_ATTR_MIN_CSMA_BACKOFFS,
+	NL802154_CAP_ATTR_MAX_CSMA_BACKOFFS,
+
+	NL802154_CAP_ATTR_MIN_FRAME_RETRIES,
+	NL802154_CAP_ATTR_MAX_FRAME_RETRIES,
+
+	NL802154_CAP_ATTR_LBT,
+
+	/* keep last */
+	__NL802154_CAP_ATTR_AFTER_LAST,
+	NL802154_CAP_ATTR_MAX = __NL802154_CAP_ATTR_AFTER_LAST - 1
+};
+
 /**
  * enum nl802154_cca_modes - cca modes
  *
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index 3afb20e43ff8..54f4959fc9bb 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -225,6 +225,8 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = {
 	[NL802154_ATTR_MAX_FRAME_RETRIES] = { .type = NLA_S8, },
 
 	[NL802154_ATTR_LBT_MODE] = { .type = NLA_U8, },
+
+	[NL802154_ATTR_WPAN_PHY_CAPS] = { .type = NLA_NESTED },
 };
 
 /* message building helper */
@@ -235,6 +237,28 @@ static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
 	return genlmsg_put(skb, portid, seq, &nl802154_fam, flags, cmd);
 }
 
+static int
+nl802154_put_flags(struct sk_buff *msg, int attr, u32 mask)
+{
+	struct nlattr *nl_flags = nla_nest_start(msg, attr);
+	int i;
+
+	if (!nl_flags)
+		return -ENOBUFS;
+
+	i = 0;
+	while (mask) {
+		if ((mask & 1) && nla_put_flag(msg, i))
+			return -ENOBUFS;
+
+		mask >>= 1;
+		i++;
+	}
+
+	nla_nest_end(msg, nl_flags);
+	return 0;
+}
+
 static int
 nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev,
 				struct sk_buff *msg)
@@ -256,6 +280,92 @@ nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev,
 	return 0;
 }
 
+static int
+nl802154_put_capabilities(struct sk_buff *msg,
+			  struct cfg802154_registered_device *rdev)
+{
+	const struct wpan_phy_supported *caps = &rdev->wpan_phy.supported;
+	struct nlattr *nl_caps, *nl_channels;
+	int i;
+
+	nl_caps = nla_nest_start(msg, NL802154_ATTR_WPAN_PHY_CAPS);
+	if (!nl_caps)
+		return -ENOBUFS;
+
+	nl_channels = nla_nest_start(msg, NL802154_CAP_ATTR_CHANNELS);
+	if (!nl_channels)
+		return -ENOBUFS;
+
+	for (i = 0; i <= IEEE802154_MAX_PAGE; i++) {
+		if (caps->channels[i]) {
+			if (nl802154_put_flags(msg, i, caps->channels[i]))
+				return -ENOBUFS;
+		}
+	}
+
+	nla_nest_end(msg, nl_channels);
+
+	if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) {
+		struct nlattr *nl_ed_lvls;
+
+		nl_ed_lvls = nla_nest_start(msg,
+					    NL802154_CAP_ATTR_CCA_ED_LEVELS);
+		if (!nl_ed_lvls)
+			return -ENOBUFS;
+
+		for (i = 0; i < caps->cca_ed_levels_size; i++) {
+			if (nla_put_s32(msg, i, caps->cca_ed_levels[i]))
+				return -ENOBUFS;
+		}
+
+		nla_nest_end(msg, nl_ed_lvls);
+	}
+
+	if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) {
+		struct nlattr *nl_tx_pwrs;
+
+		nl_tx_pwrs = nla_nest_start(msg, NL802154_CAP_ATTR_TX_POWERS);
+		if (!nl_tx_pwrs)
+			return -ENOBUFS;
+
+		for (i = 0; i < caps->tx_powers_size; i++) {
+			if (nla_put_s32(msg, i, caps->tx_powers[i]))
+				return -ENOBUFS;
+		}
+
+		nla_nest_end(msg, nl_tx_pwrs);
+	}
+
+	if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) {
+		if (nl802154_put_flags(msg, NL802154_CAP_ATTR_CCA_MODES,
+				       caps->cca_modes) ||
+		    nl802154_put_flags(msg, NL802154_CAP_ATTR_CCA_OPTS,
+				       caps->cca_opts))
+			return -ENOBUFS;
+	}
+
+	if (nla_put_u8(msg, NL802154_CAP_ATTR_MIN_MINBE, caps->min_minbe) ||
+	    nla_put_u8(msg, NL802154_CAP_ATTR_MAX_MINBE, caps->max_minbe) ||
+	    nla_put_u8(msg, NL802154_CAP_ATTR_MIN_MAXBE, caps->min_maxbe) ||
+	    nla_put_u8(msg, NL802154_CAP_ATTR_MAX_MAXBE, caps->max_maxbe) ||
+	    nla_put_u8(msg, NL802154_CAP_ATTR_MIN_CSMA_BACKOFFS,
+		       caps->min_csma_backoffs) ||
+	    nla_put_u8(msg, NL802154_CAP_ATTR_MAX_CSMA_BACKOFFS,
+		       caps->max_csma_backoffs) ||
+	    nla_put_s8(msg, NL802154_CAP_ATTR_MIN_FRAME_RETRIES,
+		       caps->min_frame_retries) ||
+	    nla_put_s8(msg, NL802154_CAP_ATTR_MAX_FRAME_RETRIES,
+		       caps->max_frame_retries) ||
+	    nl802154_put_flags(msg, NL802154_CAP_ATTR_IFTYPES,
+			       caps->iftypes) ||
+	    nla_put_u32(msg, NL802154_CAP_ATTR_LBT, caps->lbt))
+		return -ENOBUFS;
+
+	nla_nest_end(msg, nl_caps);
+
+	return 0;
+}
+
 static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev,
 				  enum nl802154_commands cmd,
 				  struct sk_buff *msg, u32 portid, u32 seq,
@@ -286,7 +396,9 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev,
 		       rdev->wpan_phy.current_channel))
 		goto nla_put_failure;
 
-	/* supported channels array */
+	/* TODO remove this behaviour, we still keep support it for a while
+	 * so users can change the behaviour to the new one.
+	 */
 	if (nl802154_send_wpan_phy_channels(rdev, msg))
 		goto nla_put_failure;
 
@@ -309,6 +421,9 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev,
 			goto nla_put_failure;
 	}
 
+	if (nl802154_put_capabilities(msg, rdev))
+		goto nla_put_failure;
+
 finish:
 	genlmsg_end(msg, hdr);
 	return 0;
-- 
cgit v1.2.3


From b5d721d761fccf491f92eefc611e318561683d0a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 18 May 2015 17:06:14 -0700
Subject: inet: properly align icsk_ca_priv

tcp_illinois and upcoming tcp_cdg require 64bit alignment of
icsk_ca_priv

x86 does not care, but other architectures might.

Fixes: 05cbc0db03e82 ("ipv4: Create probe timer for tcp PMTU as per RFC4821")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Fan Du <fan.du@intel.com>
Acked-by: Fan Du <fan.du@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 48a815823587..497bc14cdb85 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -129,9 +129,10 @@ struct inet_connection_sock {
 
 		u32		  probe_timestamp;
 	} icsk_mtup;
-	u32			  icsk_ca_priv[16];
 	u32			  icsk_user_timeout;
-#define ICSK_CA_PRIV_SIZE	(16 * sizeof(u32))
+
+	u64			  icsk_ca_priv[64 / sizeof(u64)];
+#define ICSK_CA_PRIV_SIZE      (8 * sizeof(u64))
 };
 
 #define ICSK_TIME_RETRANS	1	/* Retransmit timer */
-- 
cgit v1.2.3


From 492135557dc090a1abb2cfbe1a412757e3ed68ab Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 19 May 2015 21:04:22 +0200
Subject: tcp: add rfc3168, section 6.1.1.1. fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This work as a follow-up of commit f7b3bec6f516 ("net: allow setting ecn
via routing table") and adds RFC3168 section 6.1.1.1. fallback for outgoing
ECN connections. In other words, this work adds a retry with a non-ECN
setup SYN packet, as suggested from the RFC on the first timeout:

  [...] A host that receives no reply to an ECN-setup SYN within the
  normal SYN retransmission timeout interval MAY resend the SYN and
  any subsequent SYN retransmissions with CWR and ECE cleared. [...]

Schematic client-side view when assuming the server is in tcp_ecn=2 mode,
that is, Linux default since 2009 via commit 255cac91c3c9 ("tcp: extend
ECN sysctl to allow server-side only ECN"):

 1) Normal ECN-capable path:

    SYN ECE CWR ----->
                <----- SYN ACK ECE
            ACK ----->

 2) Path with broken middlebox, when client has fallback:

    SYN ECE CWR ----X crappy middlebox drops packet
                      (timeout, rtx)
            SYN ----->
                <----- SYN ACK
            ACK ----->

In case we would not have the fallback implemented, the middlebox drop
point would basically end up as:

    SYN ECE CWR ----X crappy middlebox drops packet
                      (timeout, rtx)
    SYN ECE CWR ----X crappy middlebox drops packet
                      (timeout, rtx)
    SYN ECE CWR ----X crappy middlebox drops packet
                      (timeout, rtx)

In any case, it's rather a smaller percentage of sites where there would
occur such additional setup latency: it was found in end of 2014 that ~56%
of IPv4 and 65% of IPv6 servers of Alexa 1 million list would negotiate
ECN (aka tcp_ecn=2 default), 0.42% of these webservers will fail to connect
when trying to negotiate with ECN (tcp_ecn=1) due to timeouts, which the
fallback would mitigate with a slight latency trade-off. Recent related
paper on this topic:

  Brian Trammell, Mirja Kühlewind, Damiano Boppart, Iain Learmonth,
  Gorry Fairhurst, and Richard Scheffenegger:
    "Enabling Internet-Wide Deployment of Explicit Congestion Notification."
    Proc. PAM 2015, New York.
  http://ecn.ethz.ch/ecn-pam15.pdf

Thus, when net.ipv4.tcp_ecn=1 is being set, the patch will perform RFC3168,
section 6.1.1.1. fallback on timeout. For users explicitly not wanting this
which can be in DC use case, we add a net.ipv4.tcp_ecn_fallback knob that
allows for disabling the fallback.

tp->ecn_flags are not being cleared in tcp_ecn_clear_syn() on output, but
rather we let tcp_ecn_rcv_synack() take that over on input path in case a
SYN ACK ECE was delayed. Thus a spurious SYN retransmission will not prevent
ECN being negotiated eventually in that case.

Reference: https://www.ietf.org/proceedings/92/slides/slides-92-iccrg-1.pdf
Reference: https://www.ietf.org/proceedings/89/slides/slides-89-tsvarea-1.pdf
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Mirja Kühlewind <mirja.kuehlewind@tik.ee.ethz.ch>
Signed-off-by: Brian Trammell <trammell@tik.ee.ethz.ch>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Dave That <dave.taht@gmail.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dctcp.txt     |  1 +
 Documentation/networking/ip-sysctl.txt |  9 +++++++++
 include/net/netns/ipv4.h               |  2 ++
 include/net/tcp.h                      |  2 ++
 net/ipv4/sysctl_net_ipv4.c             |  7 +++++++
 net/ipv4/tcp_ipv4.c                    |  5 ++++-
 net/ipv4/tcp_output.c                  | 13 +++++++++++++
 7 files changed, 38 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/Documentation/networking/dctcp.txt b/Documentation/networking/dctcp.txt
index 0d5dfbc89ec9..13a857753208 100644
--- a/Documentation/networking/dctcp.txt
+++ b/Documentation/networking/dctcp.txt
@@ -8,6 +8,7 @@ the data center network to provide multi-bit feedback to the end hosts.
 To enable it on end hosts:
 
   sysctl -w net.ipv4.tcp_congestion_control=dctcp
+  sysctl -w net.ipv4.tcp_ecn_fallback=0 (optional)
 
 All switches in the data center network running DCTCP must support ECN
 marking and be configured for marking when reaching defined switch buffer
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 5095c63e50ed..cb083e0d682c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -267,6 +267,15 @@ tcp_ecn - INTEGER
 		  but do not request ECN on outgoing connections.
 	Default: 2
 
+tcp_ecn_fallback - BOOLEAN
+	If the kernel detects that ECN connection misbehaves, enable fall
+	back to non-ECN. Currently, this knob implements the fallback
+	from RFC3168, section 6.1.1.1., but we reserve that in future,
+	additional detection mechanisms could be implemented under this
+	knob. The value	is not used, if tcp_ecn or per route (or congestion
+	control) ECN settings are disabled.
+	Default: 1 (fallback enabled)
+
 tcp_fack - BOOLEAN
 	Enable FACK congestion avoidance and fast retransmission.
 	The value is not used, if tcp_sack is not enabled.
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 614a49be68a9..6848b8bb2e63 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -77,6 +77,8 @@ struct netns_ipv4 {
 	struct local_ports ip_local_ports;
 
 	int sysctl_tcp_ecn;
+	int sysctl_tcp_ecn_fallback;
+
 	int sysctl_ip_no_pmtu_disc;
 	int sysctl_ip_fwd_use_pmtu;
 	int sysctl_ip_nonlocal_bind;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0d85223efa4c..2bb2bad21d5c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -712,6 +712,8 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
 #define TCPHDR_ECE 0x40
 #define TCPHDR_CWR 0x80
 
+#define TCPHDR_SYN_ECN	(TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)
+
 /* This is what the send packet queuing engine uses to pass
  * TCP per-packet control information to the transmission code.
  * We also store the host-order sequence numbers in here too.
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index c3852a7ff3c7..841de32f1fee 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -820,6 +820,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_ecn_fallback",
+		.data		= &init_net.ipv4.sysctl_tcp_ecn_fallback,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "ip_local_port_range",
 		.maxlen		= sizeof(init_net.ipv4.ip_local_ports.range),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 91cb4768a860..0cc4b5a630cd 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2411,12 +2411,15 @@ static int __net_init tcp_sk_init(struct net *net)
 			goto fail;
 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
 	}
+
 	net->ipv4.sysctl_tcp_ecn = 2;
+	net->ipv4.sysctl_tcp_ecn_fallback = 1;
+
 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
-	return 0;
 
+	return 0;
 fail:
 	tcp_sk_exit(net);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 08c2cc40b26d..e29d43b5a0bb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -350,6 +350,15 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
 	}
 }
 
+static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
+{
+	if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
+		/* tp->ecn_flags are cleared at a later point in time when
+		 * SYN ACK is ultimatively being received.
+		 */
+		TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
+}
+
 static void
 tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th,
 		    struct sock *sk)
@@ -2615,6 +2624,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		}
 	}
 
+	/* RFC3168, section 6.1.1.1. ECN fallback */
+	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
+		tcp_ecn_clear_syn(sk, skb);
+
 	tcp_retrans_try_collapse(sk, skb, cur_mss);
 
 	/* Make a copy, if the first transmission SKB clone we made
-- 
cgit v1.2.3


From 06b2c61c92a9942769ee8da22d3ce8b8b935c038 Mon Sep 17 00:00:00 2001
From: Andy Zhou <azhou@nicira.com>
Date: Tue, 19 May 2015 12:41:47 -0700
Subject: ip: remove unused function prototype

ip_do_nat() function was removed prior to kernel 3.4. Remove the
unnecessary function prototype as well.

Reported-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Andy Zhou <azhou@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ip.h b/include/net/ip.h
index cd7a6a458bb6..7921a36b805c 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -110,7 +110,6 @@ int ip_output(struct sock *sk, struct sk_buff *skb);
 int ip_mc_output(struct sock *sk, struct sk_buff *skb);
 int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
 		   int (*output)(struct sock *, struct sk_buff *));
-int ip_do_nat(struct sk_buff *skb);
 void ip_send_check(struct iphdr *ip);
 int __ip_local_out(struct sk_buff *skb);
 int ip_local_out_sk(struct sock *sk, struct sk_buff *skb);
-- 
cgit v1.2.3


From eb9344781a2f8381ed60cd9e662d9ced2d168ecb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 19 May 2015 13:26:55 -0700
Subject: tcp: add a force_schedule argument to sk_stream_alloc_skb()

In commit 8e4d980ac215 ("tcp: fix behavior for epoll edge trigger")
we fixed a possible hang of TCP sockets under memory pressure,
by allowing sk_stream_alloc_skb() to use sk_forced_mem_schedule()
if no packet is in socket write queue.

It turns out there are other cases where we want to force memory
schedule :

tcp_fragment() & tso_fragment() need to split a big TSO packet into
two smaller ones. If we block here because of TCP memory pressure,
we can effectively block TCP socket from sending new data.
If no further ACK is coming, this hang would be definitive, and socket
has no chance to effectively reduce its memory usage.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h    |  3 ++-
 net/ipv4/tcp.c        | 19 +++++++++++--------
 net/ipv4/tcp_output.c | 10 +++++-----
 3 files changed, 18 insertions(+), 14 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 4581a60636f8..26c1c3171e00 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2025,7 +2025,8 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
 	}
 }
 
-struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp);
+struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
+				    bool force_schedule);
 
 /**
  * sk_page_frag - return an appropriate page_frag
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bb9bb844204f..ca1d476c80ef 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -808,7 +808,8 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 }
 EXPORT_SYMBOL(tcp_splice_read);
 
-struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
+struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
+				    bool force_schedule)
 {
 	struct sk_buff *skb;
 
@@ -820,15 +821,15 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
 
 	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
 	if (likely(skb)) {
-		bool mem_schedule;
+		bool mem_scheduled;
 
-		if (skb_queue_len(&sk->sk_write_queue) == 0) {
-			mem_schedule = true;
+		if (force_schedule) {
+			mem_scheduled = true;
 			sk_forced_mem_schedule(sk, skb->truesize);
 		} else {
-			mem_schedule = sk_wmem_schedule(sk, skb->truesize);
+			mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
 		}
-		if (likely(mem_schedule)) {
+		if (likely(mem_scheduled)) {
 			skb_reserve(skb, sk->sk_prot->max_header);
 			/*
 			 * Make sure that we have exactly size bytes
@@ -918,7 +919,8 @@ new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
 
-			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
+						  skb_queue_empty(&sk->sk_write_queue));
 			if (!skb)
 				goto wait_for_memory;
 
@@ -1154,7 +1156,8 @@ new_segment:
 
 			skb = sk_stream_alloc_skb(sk,
 						  select_size(sk, sg),
-						  sk->sk_allocation);
+						  sk->sk_allocation,
+						  skb_queue_empty(&sk->sk_write_queue));
 			if (!skb)
 				goto wait_for_memory;
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e29d43b5a0bb..3aebe0157dfa 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1172,7 +1172,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 		return -ENOMEM;
 
 	/* Get a new skb... force flag on. */
-	buff = sk_stream_alloc_skb(sk, nsize, gfp);
+	buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
 	if (!buff)
 		return -ENOMEM; /* We'll just try again later. */
 
@@ -1731,7 +1731,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 	if (skb->len != skb->data_len)
 		return tcp_fragment(sk, skb, len, mss_now, gfp);
 
-	buff = sk_stream_alloc_skb(sk, 0, gfp);
+	buff = sk_stream_alloc_skb(sk, 0, gfp, true);
 	if (unlikely(!buff))
 		return -ENOMEM;
 
@@ -1950,7 +1950,7 @@ static int tcp_mtu_probe(struct sock *sk)
 	}
 
 	/* We're allowed to probe.  Build it now. */
-	nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC);
+	nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
 	if (!nskb)
 		return -1;
 	sk->sk_wmem_queued += nskb->truesize;
@@ -3190,7 +3190,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	/* limit to order-0 allocations */
 	space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
 
-	syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation);
+	syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
 	if (!syn_data)
 		goto fallback;
 	syn_data->ip_summed = CHECKSUM_PARTIAL;
@@ -3256,7 +3256,7 @@ int tcp_connect(struct sock *sk)
 		return 0;
 	}
 
-	buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+	buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
 	if (unlikely(!buff))
 		return -ENOBUFS;
 
-- 
cgit v1.2.3


From f5af1f57a2914e290de40e2c93716da8885c4965 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 20 May 2015 10:59:01 -0700
Subject: inet_hashinfo: remove bsocket counter

We no longer need bsocket atomic counter, as inet_csk_get_port()
calls bind_conflict() regardless of its value, after commit
2b05ad33e1e624e ("tcp: bind() fix autoselection to share ports")

This patch removes overhead of maintaining this counter and
double inet_csk_get_port() calls under pressure.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Marcelo Ricardo Leitner <mleitner@redhat.com>
Cc: Flavio Leitner <fbl@redhat.com>
Acked-by: Flavio Leitner <fbl@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h   | 2 --
 net/ipv4/inet_connection_sock.c | 5 -----
 net/ipv4/inet_hashtables.c      | 7 -------
 3 files changed, 14 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 73fe0f9525d9..774d24151d4a 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -148,8 +148,6 @@ struct inet_hashinfo {
 	 */
 	struct inet_listen_hashbucket	listening_hash[INET_LHTABLE_SIZE]
 					____cacheline_aligned_in_smp;
-
-	atomic_t			bsockets;
 };
 
 static inline struct inet_ehash_bucket *inet_ehash_bucket(
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 8976ca423a07..b95fb263a13f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -127,11 +127,6 @@ again:
 					    (tb->num_owners < smallest_size || smallest_size == -1)) {
 						smallest_size = tb->num_owners;
 						smallest_rover = rover;
-						if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
-						    !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
-							snum = smallest_rover;
-							goto tb_found;
-						}
 					}
 					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
 						snum = rover;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index c6fb80bd5826..3766bddb3e8a 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -90,10 +90,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 		    const unsigned short snum)
 {
-	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-
-	atomic_inc(&hashinfo->bsockets);
-
 	inet_sk(sk)->inet_num = snum;
 	sk_add_bind_node(sk, &tb->owners);
 	tb->num_owners++;
@@ -111,8 +107,6 @@ static void __inet_put_port(struct sock *sk)
 	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
 	struct inet_bind_bucket *tb;
 
-	atomic_dec(&hashinfo->bsockets);
-
 	spin_lock(&head->lock);
 	tb = inet_csk(sk)->icsk_bind_hash;
 	__sk_del_bind_node(sk);
@@ -608,7 +602,6 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
 {
 	int i;
 
-	atomic_set(&h->bsockets, 0);
 	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
 		spin_lock_init(&h->listening_hash[i].lock);
 		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
-- 
cgit v1.2.3


From 4a3a8c0c3a613e481bea931f0d65dc4a7efaa9b9 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Fri, 22 May 2015 17:43:52 +0200
Subject: mac802154: remove pib lock

This patch removes the pib lock which is now replaced by rtnl lock. The
new interface already use the rtnl lock only. Nevertheless this patch
will fix issues while using new and old interface at the same time.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/cfg802154.h | 2 --
 net/ieee802154/core.c   | 2 --
 net/ieee802154/nl-phy.c | 6 +++---
 net/mac802154/iface.c   | 7 -------
 net/mac802154/mib.c     | 4 ++--
 5 files changed, 5 insertions(+), 16 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 11bbf17ad865..c6aa1d210182 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -121,8 +121,6 @@ enum wpan_phy_flags {
 };
 
 struct wpan_phy {
-	struct mutex pib_lock;
-
 	/* If multiple wpan_phys are registered and you're handed e.g.
 	 * a regular netdev with assigned ieee802154_ptr, you won't
 	 * know whether it points to a wpan_phy your driver has registered
diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
index 2ee00e8a0308..b0248e934230 100644
--- a/net/ieee802154/core.c
+++ b/net/ieee802154/core.c
@@ -121,8 +121,6 @@ wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size)
 	/* atomic_inc_return makes it start at 1, make it start at 0 */
 	rdev->wpan_phy_idx--;
 
-	mutex_init(&rdev->wpan_phy.pib_lock);
-
 	INIT_LIST_HEAD(&rdev->wpan_dev_list);
 	device_initialize(&rdev->wpan_phy.dev);
 	dev_set_name(&rdev->wpan_phy.dev, PHY_NAME "%d", rdev->wpan_phy_idx);
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index cbc0d09351e0..77d73014bde3 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -50,7 +50,7 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid,
 	if (!hdr)
 		goto out;
 
-	mutex_lock(&phy->pib_lock);
+	rtnl_lock();
 	if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) ||
 	    nla_put_u8(msg, IEEE802154_ATTR_PAGE, phy->current_page) ||
 	    nla_put_u8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel))
@@ -63,13 +63,13 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid,
 	    nla_put(msg, IEEE802154_ATTR_CHANNEL_PAGE_LIST,
 		    pages * sizeof(uint32_t), buf))
 		goto nla_put_failure;
-	mutex_unlock(&phy->pib_lock);
+	rtnl_unlock();
 	kfree(buf);
 	genlmsg_end(msg, hdr);
 	return 0;
 
 nla_put_failure:
-	mutex_unlock(&phy->pib_lock);
+	rtnl_unlock();
 	genlmsg_cancel(msg, hdr);
 out:
 	kfree(buf);
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index 2a5878889289..22f478be7489 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -242,7 +242,6 @@ static int mac802154_wpan_open(struct net_device *dev)
 	struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
 	struct ieee802154_local *local = sdata->local;
 	struct wpan_dev *wpan_dev = &sdata->wpan_dev;
-	struct wpan_phy *phy = sdata->local->phy;
 
 	rc = ieee802154_check_concurrent_iface(sdata, sdata->vif.type);
 	if (rc < 0)
@@ -252,8 +251,6 @@ static int mac802154_wpan_open(struct net_device *dev)
 	if (rc < 0)
 		return rc;
 
-	mutex_lock(&phy->pib_lock);
-
 	if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) {
 		rc = drv_set_promiscuous_mode(local,
 					      wpan_dev->promiscuous_mode);
@@ -295,11 +292,7 @@ static int mac802154_wpan_open(struct net_device *dev)
 			goto out;
 	}
 
-	mutex_unlock(&phy->pib_lock);
-	return 0;
-
 out:
-	mutex_unlock(&phy->pib_lock);
 	return rc;
 }
 
diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c
index 5cf019a57fd7..033dfc7755c6 100644
--- a/net/mac802154/mib.c
+++ b/net/mac802154/mib.c
@@ -91,16 +91,16 @@ void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan)
 	struct ieee802154_local *local = sdata->local;
 	int res;
 
+	ASSERT_RTNL();
+
 	BUG_ON(dev->type != ARPHRD_IEEE802154);
 
 	res = drv_set_channel(local, page, chan);
 	if (res) {
 		pr_debug("set_channel failed\n");
 	} else {
-		mutex_lock(&local->phy->pib_lock);
 		local->phy->current_channel = chan;
 		local->phy->current_page = page;
-		mutex_unlock(&local->phy->pib_lock);
 	}
 }
 
-- 
cgit v1.2.3


From 344f8c119df742f2bf7098cf8fc326351f583249 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Fri, 22 May 2015 17:43:53 +0200
Subject: mac802154: use atomic ops for sequence incrementation

This patch will use atomic operations for sequence number incrementation
while MAC header generation. Upper layers like af_802154 or 6LoWPAN
could call this function in a parallel context while generating 802.15.4
MAC header before queuing into wpan interfaces transmit queue.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/cfg802154.h         | 4 ++--
 include/net/ieee802154_netdev.h | 1 -
 net/ieee802154/6lowpan/core.c   | 8 --------
 net/mac802154/ieee802154_i.h    | 1 -
 net/mac802154/iface.c           | 9 ++++++---
 net/mac802154/mac_cmd.c         | 1 -
 net/mac802154/mib.c             | 9 ---------
 7 files changed, 8 insertions(+), 25 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index c6aa1d210182..4de59aa96173 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -177,9 +177,9 @@ struct wpan_dev {
 	__le64 extended_addr;
 
 	/* MAC BSN field */
-	u8 bsn;
+	atomic_t bsn;
 	/* MAC DSN field */
-	u8 dsn;
+	atomic_t dsn;
 
 	u8 min_be;
 	u8 max_be;
diff --git a/include/net/ieee802154_netdev.h b/include/net/ieee802154_netdev.h
index 94a297052442..144fefb4d619 100644
--- a/include/net/ieee802154_netdev.h
+++ b/include/net/ieee802154_netdev.h
@@ -431,7 +431,6 @@ struct ieee802154_mlme_ops {
 	 */
 	__le16 (*get_pan_id)(const struct net_device *dev);
 	__le16 (*get_short_addr)(const struct net_device *dev);
-	u8 (*get_dsn)(const struct net_device *dev);
 };
 
 static inline struct ieee802154_mlme_ops *
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index 0ae5822ef944..2e77fada7e54 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -69,13 +69,6 @@ static __le16 lowpan_get_short_addr(const struct net_device *dev)
 	return ieee802154_mlme_ops(real_dev)->get_short_addr(real_dev);
 }
 
-static u8 lowpan_get_dsn(const struct net_device *dev)
-{
-	struct net_device *real_dev = lowpan_dev_info(dev)->real_dev;
-
-	return ieee802154_mlme_ops(real_dev)->get_dsn(real_dev);
-}
-
 static struct header_ops lowpan_header_ops = {
 	.create	= lowpan_header_create,
 };
@@ -106,7 +99,6 @@ static const struct net_device_ops lowpan_netdev_ops = {
 static struct ieee802154_mlme_ops lowpan_mlme = {
 	.get_pan_id = lowpan_get_pan_id,
 	.get_short_addr = lowpan_get_short_addr,
-	.get_dsn = lowpan_get_dsn,
 };
 
 static void lowpan_setup(struct net_device *dev)
diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h
index 127ba18386fc..56b3847466e9 100644
--- a/net/mac802154/ieee802154_i.h
+++ b/net/mac802154/ieee802154_i.h
@@ -141,7 +141,6 @@ __le16 mac802154_dev_get_short_addr(const struct net_device *dev);
 __le16 mac802154_dev_get_pan_id(const struct net_device *dev);
 void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val);
 void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan);
-u8 mac802154_dev_get_dsn(const struct net_device *dev);
 
 int mac802154_get_params(struct net_device *dev,
 			 struct ieee802154_llsec_params *params);
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index 22f478be7489..0ec7bc402e29 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -368,7 +368,7 @@ static int mac802154_header_create(struct sk_buff *skb,
 	hdr.fc.type = cb->type;
 	hdr.fc.security_enabled = cb->secen;
 	hdr.fc.ack_request = cb->ackreq;
-	hdr.seq = ieee802154_mlme_ops(dev)->get_dsn(dev);
+	hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF;
 
 	if (mac802154_set_header_security(sdata, &hdr, cb) < 0)
 		return -EINVAL;
@@ -468,13 +468,16 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
 		       enum nl802154_iftype type)
 {
 	struct wpan_dev *wpan_dev = &sdata->wpan_dev;
+	u8 tmp;
 
 	/* set some type-dependent values */
 	sdata->vif.type = type;
 	sdata->wpan_dev.iftype = type;
 
-	get_random_bytes(&wpan_dev->bsn, 1);
-	get_random_bytes(&wpan_dev->dsn, 1);
+	get_random_bytes(&tmp, sizeof(tmp));
+	atomic_set(&wpan_dev->bsn, tmp);
+	get_random_bytes(&tmp, sizeof(tmp));
+	atomic_set(&wpan_dev->dsn, tmp);
 
 	/* defaults per 802.15.4-2011 */
 	wpan_dev->min_be = 3;
diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c
index 6dcbb3b5994c..3b347d4d3d4c 100644
--- a/net/mac802154/mac_cmd.c
+++ b/net/mac802154/mac_cmd.c
@@ -153,7 +153,6 @@ struct ieee802154_mlme_ops mac802154_mlme_wpan = {
 	.start_req = mac802154_mlme_start_req,
 	.get_pan_id = mac802154_dev_get_pan_id,
 	.get_short_addr = mac802154_dev_get_short_addr,
-	.get_dsn = mac802154_dev_get_dsn,
 
 	.llsec = &mac802154_llsec_ops,
 
diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c
index 033dfc7755c6..00a62de9cc23 100644
--- a/net/mac802154/mib.c
+++ b/net/mac802154/mib.c
@@ -76,15 +76,6 @@ void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val)
 	spin_unlock_bh(&sdata->mib_lock);
 }
 
-u8 mac802154_dev_get_dsn(const struct net_device *dev)
-{
-	struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
-
-	BUG_ON(dev->type != ARPHRD_IEEE802154);
-
-	return sdata->wpan_dev.dsn++;
-}
-
 void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan)
 {
 	struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
-- 
cgit v1.2.3


From c947f7e1e31a708f5a4ea8c1a627bec578cd9223 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Fri, 22 May 2015 17:43:54 +0200
Subject: mac802154: remove mib lock

This patch removes the mib lock. The new locking mechanism is to protect
the mib values with the rtnl lock. Note that this isn't always necessary
if we have an interface up the most mib values are readonly (e.g.
address settings). With this behaviour we can remove locking in
hotpath like frame parsing completely. It depends on context if we need
to hold the rtnl lock or not, this makes the callbacks of
ieee802154_mlme_ops unnecessary because these callbacks hols always the
locks.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/ieee802154_netdev.h |  9 --------
 net/ieee802154/6lowpan/core.c   | 20 -----------------
 net/ieee802154/6lowpan/tx.c     |  2 +-
 net/ieee802154/nl-mac.c         | 14 ++++++++----
 net/ieee802154/socket.c         | 12 +++++-----
 net/mac802154/ieee802154_i.h    |  6 -----
 net/mac802154/iface.c           |  8 -------
 net/mac802154/mac_cmd.c         |  6 ++---
 net/mac802154/mib.c             | 50 -----------------------------------------
 net/mac802154/rx.c              |  5 -----
 10 files changed, 18 insertions(+), 114 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ieee802154_netdev.h b/include/net/ieee802154_netdev.h
index 144fefb4d619..84a72a1382a8 100644
--- a/include/net/ieee802154_netdev.h
+++ b/include/net/ieee802154_netdev.h
@@ -422,15 +422,6 @@ struct ieee802154_mlme_ops {
 			       struct ieee802154_mac_params *params);
 
 	struct ieee802154_llsec_ops *llsec;
-
-	/* The fields below are required. */
-
-	/*
-	 * FIXME: these should become the part of PIB/MIB interface.
-	 * However we still don't have IB interface of any kind
-	 */
-	__le16 (*get_pan_id)(const struct net_device *dev);
-	__le16 (*get_short_addr)(const struct net_device *dev);
 };
 
 static inline struct ieee802154_mlme_ops *
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index 2e77fada7e54..f20a387a1011 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -55,20 +55,6 @@
 LIST_HEAD(lowpan_devices);
 static int lowpan_open_count;
 
-static __le16 lowpan_get_pan_id(const struct net_device *dev)
-{
-	struct net_device *real_dev = lowpan_dev_info(dev)->real_dev;
-
-	return ieee802154_mlme_ops(real_dev)->get_pan_id(real_dev);
-}
-
-static __le16 lowpan_get_short_addr(const struct net_device *dev)
-{
-	struct net_device *real_dev = lowpan_dev_info(dev)->real_dev;
-
-	return ieee802154_mlme_ops(real_dev)->get_short_addr(real_dev);
-}
-
 static struct header_ops lowpan_header_ops = {
 	.create	= lowpan_header_create,
 };
@@ -96,11 +82,6 @@ static const struct net_device_ops lowpan_netdev_ops = {
 	.ndo_start_xmit		= lowpan_xmit,
 };
 
-static struct ieee802154_mlme_ops lowpan_mlme = {
-	.get_pan_id = lowpan_get_pan_id,
-	.get_short_addr = lowpan_get_short_addr,
-};
-
 static void lowpan_setup(struct net_device *dev)
 {
 	dev->addr_len		= IEEE802154_ADDR_LEN;
@@ -116,7 +97,6 @@ static void lowpan_setup(struct net_device *dev)
 
 	dev->netdev_ops		= &lowpan_netdev_ops;
 	dev->header_ops		= &lowpan_header_ops;
-	dev->ml_priv		= &lowpan_mlme;
 	dev->destructor		= free_netdev;
 	dev->features		|= NETIF_F_NETNS_LOCAL;
 }
diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c
index 2349070bd534..98acf7319754 100644
--- a/net/ieee802154/6lowpan/tx.c
+++ b/net/ieee802154/6lowpan/tx.c
@@ -207,7 +207,7 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *dev)
 
 	/* prepare wpan address data */
 	sa.mode = IEEE802154_ADDR_LONG;
-	sa.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);
+	sa.pan_id = lowpan_dev_info(dev)->real_dev->ieee802154_ptr->pan_id;
 	sa.extended_addr = ieee802154_devaddr_from_raw(saddr);
 
 	/* intra-PAN communications */
diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c
index cdc1cc3543f6..ada58a81b753 100644
--- a/net/ieee802154/nl-mac.c
+++ b/net/ieee802154/nl-mac.c
@@ -97,8 +97,10 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid,
 	BUG_ON(!phy);
 	get_device(&phy->dev);
 
-	short_addr = ops->get_short_addr(dev);
-	pan_id = ops->get_pan_id(dev);
+	rtnl_lock();
+	short_addr = dev->ieee802154_ptr->short_addr;
+	pan_id = dev->ieee802154_ptr->pan_id;
+	rtnl_unlock();
 
 	if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
 	    nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) ||
@@ -244,7 +246,9 @@ int ieee802154_associate_resp(struct sk_buff *skb, struct genl_info *info)
 	addr.mode = IEEE802154_ADDR_LONG;
 	addr.extended_addr = nla_get_hwaddr(
 			info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]);
-	addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);
+	rtnl_lock();
+	addr.pan_id = dev->ieee802154_ptr->pan_id;
+	rtnl_unlock();
 
 	ret = ieee802154_mlme_ops(dev)->assoc_resp(dev, &addr,
 		nla_get_shortaddr(info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]),
@@ -281,7 +285,9 @@ int ieee802154_disassociate_req(struct sk_buff *skb, struct genl_info *info)
 		addr.short_addr = nla_get_shortaddr(
 				info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]);
 	}
-	addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);
+	rtnl_lock();
+	addr.pan_id = dev->ieee802154_ptr->pan_id;
+	rtnl_unlock();
 
 	ret = ieee802154_mlme_ops(dev)->disassoc_req(dev, &addr,
 			nla_get_u8(info->attrs[IEEE802154_ATTR_REASON]));
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index 7aaaf967df58..e5cc2537c2a3 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -64,10 +64,8 @@ ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr)
 			if (tmp->type != ARPHRD_IEEE802154)
 				continue;
 
-			pan_id = ieee802154_mlme_ops(tmp)->get_pan_id(tmp);
-			short_addr =
-				ieee802154_mlme_ops(tmp)->get_short_addr(tmp);
-
+			pan_id = tmp->ieee802154_ptr->pan_id;
+			short_addr = tmp->ieee802154_ptr->short_addr;
 			if (pan_id == addr->pan_id &&
 			    short_addr == addr->short_addr) {
 				dev = tmp;
@@ -797,9 +795,9 @@ static int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb)
 	/* Data frame processing */
 	BUG_ON(dev->type != ARPHRD_IEEE802154);
 
-	pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);
-	short_addr = ieee802154_mlme_ops(dev)->get_short_addr(dev);
-	hw_addr = ieee802154_devaddr_from_raw(dev->dev_addr);
+	pan_id = dev->ieee802154_ptr->pan_id;
+	short_addr = dev->ieee802154_ptr->short_addr;
+	hw_addr = dev->ieee802154_ptr->extended_addr;
 
 	read_lock(&dgram_lock);
 	sk_for_each(sk, &dgram_head) {
diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h
index 56b3847466e9..eec668f3637f 100644
--- a/net/mac802154/ieee802154_i.h
+++ b/net/mac802154/ieee802154_i.h
@@ -86,8 +86,6 @@ struct ieee802154_sub_if_data {
 	unsigned long state;
 	char name[IFNAMSIZ];
 
-	spinlock_t mib_lock;
-
 	/* protects sec from concurrent access by netlink. access by
 	 * encrypt/decrypt/header_create safe without additional protection.
 	 */
@@ -136,10 +134,6 @@ ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev);
 enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer);
 
 /* MIB callbacks */
-void mac802154_dev_set_short_addr(struct net_device *dev, __le16 val);
-__le16 mac802154_dev_get_short_addr(const struct net_device *dev);
-__le16 mac802154_dev_get_pan_id(const struct net_device *dev);
-void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val);
 void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan);
 
 int mac802154_get_params(struct net_device *dev,
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index 0ec7bc402e29..f30788d2702f 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -63,7 +63,6 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	int err = -ENOIOCTLCMD;
 
 	rtnl_lock();
-	spin_lock_bh(&sdata->mib_lock);
 
 	switch (cmd) {
 	case SIOCGIFADDR:
@@ -88,7 +87,6 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	}
 	case SIOCSIFADDR:
 		if (netif_running(dev)) {
-			spin_unlock_bh(&sdata->mib_lock);
 			rtnl_unlock();
 			return -EBUSY;
 		}
@@ -111,7 +109,6 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		break;
 	}
 
-	spin_unlock_bh(&sdata->mib_lock);
 	rtnl_unlock();
 	return err;
 }
@@ -374,8 +371,6 @@ static int mac802154_header_create(struct sk_buff *skb,
 		return -EINVAL;
 
 	if (!saddr) {
-		spin_lock_bh(&sdata->mib_lock);
-
 		if (wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST) ||
 		    wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) ||
 		    wpan_dev->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST)) {
@@ -387,8 +382,6 @@ static int mac802154_header_create(struct sk_buff *skb,
 		}
 
 		hdr.source.pan_id = wpan_dev->pan_id;
-
-		spin_unlock_bh(&sdata->mib_lock);
 	} else {
 		hdr.source = *(const struct ieee802154_addr *)saddr;
 	}
@@ -500,7 +493,6 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
 		sdata->dev->ml_priv = &mac802154_mlme_wpan;
 		wpan_dev->promiscuous_mode = false;
 
-		spin_lock_init(&sdata->mib_lock);
 		mutex_init(&sdata->sec_mtx);
 
 		mac802154_llsec_init(&sdata->sec);
diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c
index 3b347d4d3d4c..5220c2b2735b 100644
--- a/net/mac802154/mac_cmd.c
+++ b/net/mac802154/mac_cmd.c
@@ -43,8 +43,8 @@ static int mac802154_mlme_start_req(struct net_device *dev,
 
 	BUG_ON(addr->mode != IEEE802154_ADDR_SHORT);
 
-	mac802154_dev_set_pan_id(dev, addr->pan_id);
-	mac802154_dev_set_short_addr(dev, addr->short_addr);
+	dev->ieee802154_ptr->pan_id = addr->pan_id;
+	dev->ieee802154_ptr->short_addr = addr->short_addr;
 	mac802154_dev_set_page_channel(dev, page, channel);
 
 	if (ops->llsec) {
@@ -151,8 +151,6 @@ static struct ieee802154_llsec_ops mac802154_llsec_ops = {
 
 struct ieee802154_mlme_ops mac802154_mlme_wpan = {
 	.start_req = mac802154_mlme_start_req,
-	.get_pan_id = mac802154_dev_get_pan_id,
-	.get_short_addr = mac802154_dev_get_short_addr,
 
 	.llsec = &mac802154_llsec_ops,
 
diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c
index 00a62de9cc23..73f94fbf8785 100644
--- a/net/mac802154/mib.c
+++ b/net/mac802154/mib.c
@@ -26,56 +26,6 @@
 #include "ieee802154_i.h"
 #include "driver-ops.h"
 
-void mac802154_dev_set_short_addr(struct net_device *dev, __le16 val)
-{
-	struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
-
-	BUG_ON(dev->type != ARPHRD_IEEE802154);
-
-	spin_lock_bh(&sdata->mib_lock);
-	sdata->wpan_dev.short_addr = val;
-	spin_unlock_bh(&sdata->mib_lock);
-}
-
-__le16 mac802154_dev_get_short_addr(const struct net_device *dev)
-{
-	struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
-	__le16 ret;
-
-	BUG_ON(dev->type != ARPHRD_IEEE802154);
-
-	spin_lock_bh(&sdata->mib_lock);
-	ret = sdata->wpan_dev.short_addr;
-	spin_unlock_bh(&sdata->mib_lock);
-
-	return ret;
-}
-
-__le16 mac802154_dev_get_pan_id(const struct net_device *dev)
-{
-	struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
-	__le16 ret;
-
-	BUG_ON(dev->type != ARPHRD_IEEE802154);
-
-	spin_lock_bh(&sdata->mib_lock);
-	ret = sdata->wpan_dev.pan_id;
-	spin_unlock_bh(&sdata->mib_lock);
-
-	return ret;
-}
-
-void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val)
-{
-	struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
-
-	BUG_ON(dev->type != ARPHRD_IEEE802154);
-
-	spin_lock_bh(&sdata->mib_lock);
-	sdata->wpan_dev.pan_id = val;
-	spin_unlock_bh(&sdata->mib_lock);
-}
-
 void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan)
 {
 	struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c
index c0d67b2b4132..e0f10063cac3 100644
--- a/net/mac802154/rx.c
+++ b/net/mac802154/rx.c
@@ -47,8 +47,6 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata,
 
 	pr_debug("getting packet via slave interface %s\n", sdata->dev->name);
 
-	spin_lock_bh(&sdata->mib_lock);
-
 	span = wpan_dev->pan_id;
 	sshort = wpan_dev->short_addr;
 
@@ -83,13 +81,10 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata,
 			skb->pkt_type = PACKET_OTHERHOST;
 		break;
 	default:
-		spin_unlock_bh(&sdata->mib_lock);
 		pr_debug("invalid dest mode\n");
 		goto fail;
 	}
 
-	spin_unlock_bh(&sdata->mib_lock);
-
 	skb->dev = sdata->dev;
 
 	rc = mac802154_llsec_decrypt(&sdata->sec, skb);
-- 
cgit v1.2.3


From 286c2349f6665c3e67f464a5faa14a0e28be4842 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 22 May 2015 20:55:56 -0700
Subject: ipv6: Clean up ipv6_select_ident() and ip6_fragment()

This patch changes the ipv6_select_ident() signature to return a
fragment id instead of taking a whole frag_hdr as a param to
only set the frag_hdr->identification.

It also cleans up ip6_fragment() to obtain the fragment id at the
beginning instead of using multiple "if" later to check fragment id
has been generated or not.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h     |  3 +--
 net/ipv6/ip6_output.c  | 17 ++++++-----------
 net/ipv6/output_core.c |  5 ++---
 3 files changed, 9 insertions(+), 16 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index aab8190d16e8..8c4f881edbd2 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -671,8 +671,7 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add
 	return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr));
 }
 
-void ipv6_select_ident(struct net *net, struct frag_hdr *fhdr,
-		       struct rt6_info *rt);
+u32 ipv6_select_ident(struct net *net, struct rt6_info *rt);
 void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb);
 
 int ip6_dst_hoplimit(struct dst_entry *dst);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index bc09cb97b840..05e2cdf938bd 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -551,7 +551,7 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
 	struct frag_hdr *fh;
 	unsigned int mtu, hlen, left, len;
 	int hroom, troom;
-	__be32 frag_id = 0;
+	__be32 frag_id;
 	int ptr, offset = 0, err = 0;
 	u8 *prevhdr, nexthdr = 0;
 	struct net *net = dev_net(skb_dst(skb)->dev);
@@ -584,6 +584,8 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
 	}
 	mtu -= hlen + sizeof(struct frag_hdr);
 
+	frag_id = ipv6_select_ident(net, rt);
+
 	if (skb_has_frag_list(skb)) {
 		int first_len = skb_pagelen(skb);
 		struct sk_buff *frag2;
@@ -632,11 +634,10 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
 		skb_reset_network_header(skb);
 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
 
-		ipv6_select_ident(net, fh, rt);
 		fh->nexthdr = nexthdr;
 		fh->reserved = 0;
 		fh->frag_off = htons(IP6_MF);
-		frag_id = fh->identification;
+		fh->identification = frag_id;
 
 		first_len = skb_pagelen(skb);
 		skb->data_len = first_len - skb_headlen(skb);
@@ -778,11 +779,7 @@ slow_path:
 		 */
 		fh->nexthdr = nexthdr;
 		fh->reserved = 0;
-		if (!frag_id) {
-			ipv6_select_ident(net, fh, rt);
-			frag_id = fh->identification;
-		} else
-			fh->identification = frag_id;
+		fh->identification = frag_id;
 
 		/*
 		 *	Copy a block of the IP datagram.
@@ -1064,7 +1061,6 @@ static inline int ip6_ufo_append_data(struct sock *sk,
 
 {
 	struct sk_buff *skb;
-	struct frag_hdr fhdr;
 	int err;
 
 	/* There is support for UDP large send offload by network
@@ -1106,8 +1102,7 @@ static inline int ip6_ufo_append_data(struct sock *sk,
 	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
 				     sizeof(struct frag_hdr)) & ~7;
 	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
-	ipv6_select_ident(sock_net(sk), &fhdr, rt);
-	skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
+	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk), rt);
 
 append:
 	return skb_append_datato_frags(sk, skb, getfrag, from,
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 85892af57364..ef0e2326496b 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -60,8 +60,7 @@ void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);
 
-void ipv6_select_ident(struct net *net, struct frag_hdr *fhdr,
-		       struct rt6_info *rt)
+u32 ipv6_select_ident(struct net *net, struct rt6_info *rt)
 {
 	static u32 ip6_idents_hashrnd __read_mostly;
 	u32 id;
@@ -70,7 +69,7 @@ void ipv6_select_ident(struct net *net, struct frag_hdr *fhdr,
 
 	id = __ipv6_select_ident(net, ip6_idents_hashrnd, &rt->rt6i_dst.addr,
 				 &rt->rt6i_src.addr);
-	fhdr->identification = htonl(id);
+	return htonl(id);
 }
 EXPORT_SYMBOL(ipv6_select_ident);
 
-- 
cgit v1.2.3


From fd0273d7939f2ce3247f6aac5f6b9a0135d4cd39 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 22 May 2015 20:55:57 -0700
Subject: ipv6: Remove external dependency on rt6i_dst and rt6i_src

This patch removes the assumptions that the returned rt is always
a RTF_CACHE entry with the rt6i_dst and rt6i_src containing the
destination and source address.  The dst and src can be recovered from
the calling site.

We may consider to rename (rt6i_dst, rt6i_src) to
(rt6i_key_dst, rt6i_key_src) later.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/scsi/cxgbi/libcxgbi.c   |  2 +-
 include/net/ipv6.h              |  4 +++-
 net/ipv6/icmp.c                 |  2 +-
 net/ipv6/ip6_output.c           | 13 ++++++++-----
 net/ipv6/ndisc.c                |  2 +-
 net/ipv6/output_core.c          | 10 ++++++----
 net/ipv6/tcp_ipv6.c             |  2 +-
 net/netfilter/ipvs/ip_vs_xmit.c |  4 ++--
 net/sctp/ipv6.c                 |  3 ++-
 9 files changed, 25 insertions(+), 17 deletions(-)

(limited to 'include/net')

diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index eb58afcfb73b..45d30398d7c3 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -728,7 +728,7 @@ static struct cxgbi_sock *cxgbi_check_route6(struct sockaddr *dst_addr)
 	}
 	ndev = n->dev;
 
-	if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
+	if (ipv6_addr_is_multicast(&daddr6->sin6_addr)) {
 		pr_info("multi-cast route %pI6 port %u, dev %s.\n",
 			daddr6->sin6_addr.s6_addr,
 			ntohs(daddr6->sin6_port), ndev->name);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 8c4f881edbd2..b950a2000b7f 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -671,7 +671,9 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add
 	return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr));
 }
 
-u32 ipv6_select_ident(struct net *net, struct rt6_info *rt);
+u32 ipv6_select_ident(struct net *net,
+		      const struct in6_addr *daddr,
+		      const struct in6_addr *saddr);
 void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb);
 
 int ip6_dst_hoplimit(struct dst_entry *dst);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 2c2b5d51f15c..24b359d1425b 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -207,7 +207,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 			struct inet_peer *peer;
 
 			peer = inet_getpeer_v6(net->ipv6.peers,
-					       &rt->rt6i_dst.addr, 1);
+					       &fl6->daddr, 1);
 			res = inet_peer_xrlim_allow(peer, tmo);
 			if (peer)
 				inet_putpeer(peer);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 05e2cdf938bd..3097f2c5e890 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -459,7 +459,7 @@ int ip6_forward(struct sk_buff *skb)
 		else
 			target = &hdr->daddr;
 
-		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
+		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 
 		/* Limit redirects both by destination (here)
 		   and by source (inside ndisc_send_redirect)
@@ -584,7 +584,8 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
 	}
 	mtu -= hlen + sizeof(struct frag_hdr);
 
-	frag_id = ipv6_select_ident(net, rt);
+	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
+				    &ipv6_hdr(skb)->saddr);
 
 	if (skb_has_frag_list(skb)) {
 		int first_len = skb_pagelen(skb);
@@ -1057,7 +1058,7 @@ static inline int ip6_ufo_append_data(struct sock *sk,
 			int odd, struct sk_buff *skb),
 			void *from, int length, int hh_len, int fragheaderlen,
 			int transhdrlen, int mtu, unsigned int flags,
-			struct rt6_info *rt)
+			const struct flowi6 *fl6)
 
 {
 	struct sk_buff *skb;
@@ -1102,7 +1103,9 @@ static inline int ip6_ufo_append_data(struct sock *sk,
 	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
 				     sizeof(struct frag_hdr)) & ~7;
 	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
-	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk), rt);
+	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
+							 &fl6->daddr,
+							 &fl6->saddr);
 
 append:
 	return skb_append_datato_frags(sk, skb, getfrag, from,
@@ -1327,7 +1330,7 @@ emsgsize:
 	    (sk->sk_type == SOCK_DGRAM)) {
 		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
 					  hh_len, fragheaderlen,
-					  transhdrlen, mtu, flags, rt);
+					  transhdrlen, mtu, flags, fl6);
 		if (err)
 			goto error;
 		return 0;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 96f153c0846b..0a05b35a90fc 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1506,7 +1506,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 			  "Redirect: destination is not a neighbour\n");
 		goto release;
 	}
-	peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
+	peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1);
 	ret = inet_peer_xrlim_allow(peer, 1*HZ);
 	if (peer)
 		inet_putpeer(peer);
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index ef0e2326496b..055e85cb7b65 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -10,7 +10,8 @@
 #include <net/secure_seq.h>
 
 static u32 __ipv6_select_ident(struct net *net, u32 hashrnd,
-			       struct in6_addr *dst, struct in6_addr *src)
+			       const struct in6_addr *dst,
+			       const struct in6_addr *src)
 {
 	u32 hash, id;
 
@@ -60,15 +61,16 @@ void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);
 
-u32 ipv6_select_ident(struct net *net, struct rt6_info *rt)
+u32 ipv6_select_ident(struct net *net,
+		      const struct in6_addr *daddr,
+		      const struct in6_addr *saddr)
 {
 	static u32 ip6_idents_hashrnd __read_mostly;
 	u32 id;
 
 	net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
 
-	id = __ipv6_select_ident(net, ip6_idents_hashrnd, &rt->rt6i_dst.addr,
-				 &rt->rt6i_src.addr);
+	id = __ipv6_select_ident(net, ip6_idents_hashrnd, daddr, saddr);
 	return htonl(id);
 }
 EXPORT_SYMBOL(ipv6_select_ident);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index fefe4455e1f3..1a9390a370c0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -262,7 +262,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	rt = (struct rt6_info *) dst;
 	if (tcp_death_row.sysctl_tw_recycle &&
 	    !tp->rx_opt.ts_recent_stamp &&
-	    ipv6_addr_equal(&rt->rt6i_dst.addr, &sk->sk_v6_daddr))
+	    ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr))
 		tcp_fetch_timewait_stamp(sk, dst);
 
 	icsk->icsk_ext_hdr_len = 0;
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 19986ec5f21a..38f862728f75 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -781,7 +781,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	/* From world but DNAT to loopback address? */
 	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
-	    ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
 		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
 				 "ip_vs_nat_xmit_v6(): "
 				 "stopping DNAT to loopback address");
@@ -1346,7 +1346,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	/* From world but DNAT to loopback address? */
 	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
-	    ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
 		IP_VS_DBG(1, "%s(): "
 			  "stopping DNAT to loopback %pI6\n",
 			  __func__, &cp->daddr.in6);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index e703ff7fed40..17a0120ae5a5 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -332,7 +332,8 @@ out:
 		rt = (struct rt6_info *)dst;
 		t->dst = dst;
 		t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
-		pr_debug("rt6_dst:%pI6 rt6_src:%pI6\n", &rt->rt6i_dst.addr,
+		pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n",
+			 &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
 			 &fl6->saddr);
 	} else {
 		t->dst = NULL;
-- 
cgit v1.2.3


From 2647a9b07032c5a95ddee1fcb65d95bddbc6b7f9 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 22 May 2015 20:55:58 -0700
Subject: ipv6: Remove external dependency on rt6i_gateway and RTF_ANYCAST

When creating a RTF_CACHE route, RTF_ANYCAST is set based on rt6i_dst.
Also, rt6i_gateway is always set to the nexthop while the nexthop
could be a gateway or the rt6i_dst.addr.

After removing the rt6i_dst and rt6i_src dependency in the last patch,
we also need to stop the caller from depending on rt6i_gateway and
RTF_ANYCAST.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h                | 19 ++++++++++++++-----
 net/bluetooth/6lowpan.c                |  2 +-
 net/ipv6/icmp.c                        |  4 ++--
 net/ipv6/ip6_output.c                  |  5 +++--
 net/ipv6/route.c                       |  6 +-----
 net/netfilter/nf_conntrack_h323_main.c |  4 ++--
 net/netfilter/xt_addrtype.c            |  2 +-
 7 files changed, 24 insertions(+), 18 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 5e192068e6cb..4caf7d697dee 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -163,11 +163,14 @@ static inline bool ipv6_unicast_destination(const struct sk_buff *skb)
 	return rt->rt6i_flags & RTF_LOCAL;
 }
 
-static inline bool ipv6_anycast_destination(const struct sk_buff *skb)
+static inline bool ipv6_anycast_destination(const struct dst_entry *dst,
+					    const struct in6_addr *daddr)
 {
-	struct rt6_info *rt = (struct rt6_info *) skb_dst(skb);
+	struct rt6_info *rt = (struct rt6_info *)dst;
 
-	return rt->rt6i_flags & RTF_ANYCAST;
+	return rt->rt6i_flags & RTF_ANYCAST ||
+		(rt->rt6i_dst.plen != 128 &&
+		 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr));
 }
 
 int ip6_fragment(struct sock *sk, struct sk_buff *skb,
@@ -194,9 +197,15 @@ static inline bool ip6_sk_ignore_df(const struct sock *sk)
 	       inet6_sk(sk)->pmtudisc == IPV6_PMTUDISC_OMIT;
 }
 
-static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt)
+static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
+					   struct in6_addr *daddr)
 {
-	return &rt->rt6i_gateway;
+	if (rt->rt6i_flags & RTF_GATEWAY)
+		return &rt->rt6i_gateway;
+	else if (rt->rt6i_flags & RTF_CACHE)
+		return &rt->rt6i_dst.addr;
+	else
+		return daddr;
 }
 
 #endif
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 1742b849fcff..f3d6046c8ee7 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -192,7 +192,7 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_dev *dev,
 		if (ipv6_addr_any(nexthop))
 			return NULL;
 	} else {
-		nexthop = rt6_nexthop(rt);
+		nexthop = rt6_nexthop(rt, daddr);
 
 		/* We need to remember the address because it is needed
 		 * by bt_xmit() when sending the packet. In bt_xmit(), the
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 24b359d1425b..713d7434c911 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -337,7 +337,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net,
 	 * We won't send icmp if the destination is known
 	 * anycast.
 	 */
-	if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) {
+	if (ipv6_anycast_destination(dst, &fl6->daddr)) {
 		net_dbg_ratelimited("icmp6_send: acast source\n");
 		dst_release(dst);
 		return ERR_PTR(-EINVAL);
@@ -564,7 +564,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 
 	if (!ipv6_unicast_destination(skb) &&
 	    !(net->ipv6.sysctl.anycast_src_echo_reply &&
-	      ipv6_anycast_destination(skb)))
+	      ipv6_anycast_destination(skb_dst(skb), saddr)))
 		saddr = NULL;
 
 	memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr));
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 3097f2c5e890..61741676b987 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -105,7 +105,7 @@ static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
 	}
 
 	rcu_read_lock_bh();
-	nexthop = rt6_nexthop((struct rt6_info *)dst);
+	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
@@ -934,7 +934,8 @@ static int ip6_dst_lookup_tail(struct sock *sk,
 	 */
 	rt = (struct rt6_info *) *dst;
 	rcu_read_lock_bh();
-	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
+	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
+				      rt6_nexthop(rt, &fl6->daddr));
 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
 	rcu_read_unlock_bh();
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0c889cb89cc3..ce3b1754b4f5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1945,11 +1945,7 @@ static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
 		if (rt->rt6i_idev)
 			in6_dev_hold(rt->rt6i_idev);
 		rt->dst.lastuse = jiffies;
-
-		if (ort->rt6i_flags & RTF_GATEWAY)
-			rt->rt6i_gateway = ort->rt6i_gateway;
-		else
-			rt->rt6i_gateway = *dest;
+		rt->rt6i_gateway = ort->rt6i_gateway;
 		rt->rt6i_flags = ort->rt6i_flags;
 		rt6_set_from(rt, ort);
 		rt->rt6i_metric = 0;
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 1d69f5b9748f..9511af04dc81 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -779,8 +779,8 @@ static int callforward_do_filter(struct net *net,
 				   flowi6_to_flowi(&fl1), false)) {
 			if (!afinfo->route(net, (struct dst_entry **)&rt2,
 					   flowi6_to_flowi(&fl2), false)) {
-				if (ipv6_addr_equal(rt6_nexthop(rt1),
-						    rt6_nexthop(rt2)) &&
+				if (ipv6_addr_equal(rt6_nexthop(rt1, &fl1.daddr),
+						    rt6_nexthop(rt2, &fl2.daddr)) &&
 				    rt1->dst.dev == rt2->dst.dev)
 					ret = 1;
 				dst_release(&rt2->dst);
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
index fab6eea1bf38..5b4743cc0436 100644
--- a/net/netfilter/xt_addrtype.c
+++ b/net/netfilter/xt_addrtype.c
@@ -73,7 +73,7 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
 
 	if (dev == NULL && rt->rt6i_flags & RTF_LOCAL)
 		ret |= XT_ADDRTYPE_LOCAL;
-	if (rt->rt6i_flags & RTF_ANYCAST)
+	if (ipv6_anycast_destination((struct dst_entry *)rt, addr))
 		ret |= XT_ADDRTYPE_ANYCAST;
 
 	dst_release(&rt->dst);
-- 
cgit v1.2.3


From 45e4fd26683c9a5f88600d91b08a484f7f09226a Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 22 May 2015 20:56:00 -0700
Subject: ipv6: Only create RTF_CACHE routes after encountering pmtu exception

This patch creates a RTF_CACHE routes only after encountering a pmtu
exception.

After ip6_rt_update_pmtu() has inserted the RTF_CACHE route to the fib6
tree, the rt->rt6i_node->fn_sernum is bumped which will fail the
ip6_dst_check() and trigger a relookup.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h |   2 +-
 net/ipv6/ip6_fib.c      |   1 +
 net/ipv6/route.c        | 100 ++++++++++++++++++++++++------------------------
 3 files changed, 53 insertions(+), 50 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 4caf7d697dee..784ee3d01dbf 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -202,7 +202,7 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
 {
 	if (rt->rt6i_flags & RTF_GATEWAY)
 		return &rt->rt6i_gateway;
-	else if (rt->rt6i_flags & RTF_CACHE)
+	else if (unlikely(rt->rt6i_flags & RTF_CACHE))
 		return &rt->rt6i_dst.addr;
 	else
 		return daddr;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index bde57b113009..83341b3a248d 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -738,6 +738,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 					rt6_clean_expires(iter);
 				else
 					rt6_set_expires(iter, rt->dst.expires);
+				iter->rt6i_pmtu = rt->rt6i_pmtu;
 				return -EEXIST;
 			}
 			/* If we have the same destination and the same metric,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f199d6357b31..e7ae2430dfed 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -873,16 +873,13 @@ static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 				      struct flowi6 *fl6, int flags)
 {
 	struct fib6_node *fn, *saved_fn;
-	struct rt6_info *rt, *nrt;
+	struct rt6_info *rt;
 	int strict = 0;
-	int attempts = 3;
-	int err;
 
 	strict |= flags & RT6_LOOKUP_F_IFACE;
 	if (net->ipv6.devconf_all->forwarding == 0)
 		strict |= RT6_LOOKUP_F_REACHABLE;
 
-redo_fib6_lookup_lock:
 	read_lock_bh(&table->tb6_lock);
 
 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
@@ -901,46 +898,12 @@ redo_rt6_select:
 			strict &= ~RT6_LOOKUP_F_REACHABLE;
 			fn = saved_fn;
 			goto redo_rt6_select;
-		} else {
-			dst_hold(&rt->dst);
-			read_unlock_bh(&table->tb6_lock);
-			goto out2;
 		}
 	}
 
 	dst_hold(&rt->dst);
 	read_unlock_bh(&table->tb6_lock);
 
-	if (rt->rt6i_flags & RTF_CACHE)
-		goto out2;
-
-	if (!rt6_is_gw_or_nonexthop(rt) ||
-	    !(rt->dst.flags & DST_HOST) || !(rt->rt6i_flags & RTF_LOCAL))
-		nrt = ip6_rt_cache_alloc(rt, &fl6->daddr, &fl6->saddr);
-	else
-		goto out2;
-
-	ip6_rt_put(rt);
-	rt = nrt ? : net->ipv6.ip6_null_entry;
-
-	dst_hold(&rt->dst);
-	if (nrt) {
-		err = ip6_ins_rt(nrt);
-		if (!err)
-			goto out2;
-	}
-
-	if (--attempts <= 0)
-		goto out2;
-
-	/*
-	 * Race condition! In the gap, when table->tb6_lock was
-	 * released someone could insert this route.  Relookup.
-	 */
-	ip6_rt_put(rt);
-	goto redo_fib6_lookup_lock;
-
-out2:
 	rt6_dst_from_metrics_check(rt);
 	rt->dst.lastuse = jiffies;
 	rt->dst.__use++;
@@ -1113,24 +1076,63 @@ static void ip6_link_failure(struct sk_buff *skb)
 	}
 }
 
-static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
-			       struct sk_buff *skb, u32 mtu)
+static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
+{
+	struct net *net = dev_net(rt->dst.dev);
+
+	rt->rt6i_flags |= RTF_MODIFIED;
+	rt->rt6i_pmtu = mtu;
+	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
+}
+
+static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
+				 const struct ipv6hdr *iph, u32 mtu)
 {
 	struct rt6_info *rt6 = (struct rt6_info *)dst;
 
-	dst_confirm(dst);
-	if (mtu < dst_mtu(dst) && (rt6->rt6i_flags & RTF_CACHE)) {
-		struct net *net = dev_net(dst->dev);
+	if (rt6->rt6i_flags & RTF_LOCAL)
+		return;
 
-		rt6->rt6i_flags |= RTF_MODIFIED;
-		if (mtu < IPV6_MIN_MTU)
-			mtu = IPV6_MIN_MTU;
+	dst_confirm(dst);
+	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
+	if (mtu >= dst_mtu(dst))
+		return;
 
-		rt6->rt6i_pmtu = mtu;
-		rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
+	if (rt6->rt6i_flags & RTF_CACHE) {
+		rt6_do_update_pmtu(rt6, mtu);
+	} else {
+		const struct in6_addr *daddr, *saddr;
+		struct rt6_info *nrt6;
+
+		if (iph) {
+			daddr = &iph->daddr;
+			saddr = &iph->saddr;
+		} else if (sk) {
+			daddr = &sk->sk_v6_daddr;
+			saddr = &inet6_sk(sk)->saddr;
+		} else {
+			return;
+		}
+		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
+		if (nrt6) {
+			rt6_do_update_pmtu(nrt6, mtu);
+
+			/* ip6_ins_rt(nrt6) will bump the
+			 * rt6->rt6i_node->fn_sernum
+			 * which will fail the next rt6_check() and
+			 * invalidate the sk->sk_dst_cache.
+			 */
+			ip6_ins_rt(nrt6);
+		}
 	}
 }
 
+static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			       struct sk_buff *skb, u32 mtu)
+{
+	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
+}
+
 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
 		     int oif, u32 mark)
 {
@@ -1147,7 +1149,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
 
 	dst = ip6_route_output(net, NULL, &fl6);
 	if (!dst->error)
-		ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu));
+		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
 	dst_release(dst);
 }
 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
-- 
cgit v1.2.3


From b197df4f0f3782782e9ea8996e91b65ae33e8dd9 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 22 May 2015 20:56:01 -0700
Subject: ipv6: Add rt6_get_cookie() function

Instead of doing the rt6->rt6i_node check whenever we need
to get the route's cookie.  Refactor it into rt6_get_cookie().
It is a prep work to handle FLOWI_FLAG_KNOWN_NH and also
percpu rt6_info later.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h           | 5 +++++
 include/net/ip6_route.h         | 2 +-
 net/ipv6/ip6_tunnel.c           | 2 +-
 net/ipv6/tcp_ipv6.c             | 3 +--
 net/ipv6/xfrm6_policy.c         | 6 ++----
 net/netfilter/ipvs/ip_vs_xmit.c | 2 +-
 net/sctp/ipv6.c                 | 2 +-
 7 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index e00018011028..a4bece6797da 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -159,6 +159,11 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout)
 	rt0->rt6i_flags |= RTF_EXPIRES;
 }
 
+static inline u32 rt6_get_cookie(const struct rt6_info *rt)
+{
+	return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+}
+
 static inline void ip6_rt_put(struct rt6_info *rt)
 {
 	/* dst_release() accepts a NULL parameter.
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 784ee3d01dbf..297629aadb19 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -145,7 +145,7 @@ static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst,
 #ifdef CONFIG_IPV6_SUBTREES
 	np->saddr_cache = saddr;
 #endif
-	np->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+	np->dst_cookie = rt6_get_cookie(rt);
 }
 
 static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 5cafd92c2312..2e67b660118b 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -151,7 +151,7 @@ EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset);
 void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst)
 {
 	struct rt6_info *rt = (struct rt6_info *) dst;
-	t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+	t->dst_cookie = rt6_get_cookie(rt);
 	dst_release(t->dst_cache);
 	t->dst_cache = dst;
 }
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 1a9390a370c0..7be3d858cbf0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -99,8 +99,7 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 		dst_hold(dst);
 		sk->sk_rx_dst = dst;
 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
-		if (rt->rt6i_node)
-			inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
+		inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
 	}
 }
 
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 6ae256b4c55a..ed0583c1b9fc 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -76,8 +76,7 @@ static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst,
 {
 	if (dst->ops->family == AF_INET6) {
 		struct rt6_info *rt = (struct rt6_info *)dst;
-		if (rt->rt6i_node)
-			path->path_cookie = rt->rt6i_node->fn_sernum;
+		path->path_cookie = rt6_get_cookie(rt);
 	}
 
 	path->u.rt6.rt6i_nfheader_len = nfheader_len;
@@ -105,8 +104,7 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 						   RTF_LOCAL);
 	xdst->u.rt6.rt6i_metric = rt->rt6i_metric;
 	xdst->u.rt6.rt6i_node = rt->rt6i_node;
-	if (rt->rt6i_node)
-		xdst->route_cookie = rt->rt6i_node->fn_sernum;
+	xdst->route_cookie = rt6_get_cookie(rt);
 	xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
 	xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
 	xdst->u.rt6.rt6i_src = rt->rt6i_src;
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 38f862728f75..5eff9f6dcfb8 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -435,7 +435,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
 				goto err_unreach;
 			}
 			rt = (struct rt6_info *) dst;
-			cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+			cookie = rt6_get_cookie(rt);
 			__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
 			spin_unlock_bh(&dest->dst_lock);
 			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 17a0120ae5a5..e917d27328ea 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -331,7 +331,7 @@ out:
 
 		rt = (struct rt6_info *)dst;
 		t->dst = dst;
-		t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+		t->dst_cookie = rt6_get_cookie(rt);
 		pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n",
 			 &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
 			 &fl6->saddr);
-- 
cgit v1.2.3


From 3da59bd94583d1239e4fbdee452265a160b9cd71 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 22 May 2015 20:56:03 -0700
Subject: ipv6: Create RTF_CACHE clone when FLOWI_FLAG_KNOWN_NH is set

This patch always creates RTF_CACHE clone with DST_NOCACHE
when FLOWI_FLAG_KNOWN_NH is set so that the rt6i_dst is set to
the fl6->daddr.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Tested-by: Julian Anastasov <ja@ssi.bg>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  3 +++
 net/ipv6/route.c      | 59 ++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 52 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index a4bece6797da..5556111022eb 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -161,6 +161,9 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout)
 
 static inline u32 rt6_get_cookie(const struct rt6_info *rt)
 {
+	if (unlikely(rt->dst.flags & DST_NOCACHE))
+		rt = (struct rt6_info *)(rt->dst.from);
+
 	return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
 }
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e7ae2430dfed..9e4c3c5d1591 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -901,13 +901,34 @@ redo_rt6_select:
 		}
 	}
 
-	dst_hold(&rt->dst);
+	dst_use(&rt->dst, jiffies);
 	read_unlock_bh(&table->tb6_lock);
 
-	rt6_dst_from_metrics_check(rt);
-	rt->dst.lastuse = jiffies;
-	rt->dst.__use++;
+	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
+		goto done;
+	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
+			    !(rt->rt6i_flags & RTF_GATEWAY))) {
+		/* Create a RTF_CACHE clone which will not be
+		 * owned by the fib6 tree.  It is for the special case where
+		 * the daddr in the skb during the neighbor look-up is different
+		 * from the fl6->daddr used to look-up route here.
+		 */
+
+		struct rt6_info *uncached_rt;
+
+		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
+		dst_release(&rt->dst);
+
+		if (uncached_rt)
+			uncached_rt->dst.flags |= DST_NOCACHE;
+		else
+			uncached_rt = net->ipv6.ip6_null_entry;
+		dst_hold(&uncached_rt->dst);
+		return uncached_rt;
+	}
 
+done:
+	rt6_dst_from_metrics_check(rt);
 	return rt;
 }
 
@@ -1019,6 +1040,26 @@ static void rt6_dst_from_metrics_check(struct rt6_info *rt)
 		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
 }
 
+static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
+{
+	if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
+		return NULL;
+
+	if (rt6_check_expired(rt))
+		return NULL;
+
+	return &rt->dst;
+}
+
+static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
+{
+	if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
+	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
+		return &rt->dst;
+	else
+		return NULL;
+}
+
 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 {
 	struct rt6_info *rt;
@@ -1029,15 +1070,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
 	 * into this function always.
 	 */
-	if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
-		return NULL;
-
-	if (rt6_check_expired(rt))
-		return NULL;
 
 	rt6_dst_from_metrics_check(rt);
 
-	return dst;
+	if (unlikely(dst->flags & DST_NOCACHE))
+		return rt6_dst_from_check(rt, cookie);
+	else
+		return rt6_check(rt, cookie);
 }
 
 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
-- 
cgit v1.2.3


From 8d0b94afdca84598912347e61defa846a0988d04 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 22 May 2015 20:56:04 -0700
Subject: ipv6: Keep track of DST_NOCACHE routes in case of iface
 down/unregister

This patch keeps track of the DST_NOCACHE routes in a list and replaces its
dev with loopback during the iface down/unregister event.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  3 ++
 net/ipv6/route.c      | 78 +++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 79 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 5556111022eb..cc8f03c10c43 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -120,6 +120,9 @@ struct rt6_info {
 	struct rt6key			rt6i_src;
 	struct rt6key			rt6i_prefsrc;
 
+	struct list_head		rt6i_uncached;
+	struct uncached_list		*rt6i_uncached_list;
+
 	struct inet6_dev		*rt6i_idev;
 
 	u32				rt6i_metric;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9e4c3c5d1591..94ce1e0b5a33 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -105,6 +105,67 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
 					   const struct in6_addr *gwaddr, int ifindex);
 #endif
 
+struct uncached_list {
+	spinlock_t		lock;
+	struct list_head	head;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
+
+static void rt6_uncached_list_add(struct rt6_info *rt)
+{
+	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
+
+	rt->dst.flags |= DST_NOCACHE;
+	rt->rt6i_uncached_list = ul;
+
+	spin_lock_bh(&ul->lock);
+	list_add_tail(&rt->rt6i_uncached, &ul->head);
+	spin_unlock_bh(&ul->lock);
+}
+
+static void rt6_uncached_list_del(struct rt6_info *rt)
+{
+	if (!list_empty(&rt->rt6i_uncached)) {
+		struct uncached_list *ul = rt->rt6i_uncached_list;
+
+		spin_lock_bh(&ul->lock);
+		list_del(&rt->rt6i_uncached);
+		spin_unlock_bh(&ul->lock);
+	}
+}
+
+static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
+{
+	struct net_device *loopback_dev = net->loopback_dev;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
+		struct rt6_info *rt;
+
+		spin_lock_bh(&ul->lock);
+		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
+			struct inet6_dev *rt_idev = rt->rt6i_idev;
+			struct net_device *rt_dev = rt->dst.dev;
+
+			if (rt_idev && (rt_idev->dev == dev || !dev) &&
+			    rt_idev->dev != loopback_dev) {
+				rt->rt6i_idev = in6_dev_get(loopback_dev);
+				in6_dev_put(rt_idev);
+			}
+
+			if (rt_dev && (rt_dev == dev || !dev) &&
+			    rt_dev != loopback_dev) {
+				rt->dst.dev = loopback_dev;
+				dev_hold(rt->dst.dev);
+				dev_put(rt_dev);
+			}
+		}
+		spin_unlock_bh(&ul->lock);
+	}
+}
+
 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
 {
 	struct rt6_info *rt = (struct rt6_info *)dst;
@@ -262,6 +323,7 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
 
 		memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
 		INIT_LIST_HEAD(&rt->rt6i_siblings);
+		INIT_LIST_HEAD(&rt->rt6i_uncached);
 	}
 	return rt;
 }
@@ -269,11 +331,14 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
 static void ip6_dst_destroy(struct dst_entry *dst)
 {
 	struct rt6_info *rt = (struct rt6_info *)dst;
-	struct inet6_dev *idev = rt->rt6i_idev;
 	struct dst_entry *from = dst->from;
+	struct inet6_dev *idev;
 
 	dst_destroy_metrics_generic(dst);
 
+	rt6_uncached_list_del(rt);
+
+	idev = rt->rt6i_idev;
 	if (idev) {
 		rt->rt6i_idev = NULL;
 		in6_dev_put(idev);
@@ -920,7 +985,7 @@ redo_rt6_select:
 		dst_release(&rt->dst);
 
 		if (uncached_rt)
-			uncached_rt->dst.flags |= DST_NOCACHE;
+			rt6_uncached_list_add(uncached_rt);
 		else
 			uncached_rt = net->ipv6.ip6_null_entry;
 		dst_hold(&uncached_rt->dst);
@@ -2367,6 +2432,7 @@ void rt6_ifdown(struct net *net, struct net_device *dev)
 
 	fib6_clean_all(net, fib6_ifdown, &adn);
 	icmp6_clean_all(fib6_ifdown, &adn);
+	rt6_uncached_list_flush_dev(net, dev);
 }
 
 struct rt6_mtu_change_arg {
@@ -3263,6 +3329,7 @@ static struct notifier_block ip6_route_dev_notifier = {
 int __init ip6_route_init(void)
 {
 	int ret;
+	int cpu;
 
 	ret = -ENOMEM;
 	ip6_dst_ops_template.kmem_cachep =
@@ -3322,6 +3389,13 @@ int __init ip6_route_init(void)
 	if (ret)
 		goto out_register_late_subsys;
 
+	for_each_possible_cpu(cpu) {
+		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
+
+		INIT_LIST_HEAD(&ul->head);
+		spin_lock_init(&ul->lock);
+	}
+
 out:
 	return ret;
 
-- 
cgit v1.2.3


From d52d3997f843ffefaa8d8462790ffcaca6c74192 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 22 May 2015 20:56:06 -0700
Subject: ipv6: Create percpu rt6_info

After the patch
'ipv6: Only create RTF_CACHE routes after encountering pmtu exception',
we need to compensate the performance hit (bouncing dst->__refcnt).

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h           |   3 +-
 include/uapi/linux/ipv6_route.h |   1 +
 net/ipv6/ip6_fib.c              |  24 +++++++-
 net/ipv6/route.c                | 132 +++++++++++++++++++++++++++++++++++-----
 4 files changed, 142 insertions(+), 18 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index cc8f03c10c43..3b76849c190f 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -124,6 +124,7 @@ struct rt6_info {
 	struct uncached_list		*rt6i_uncached_list;
 
 	struct inet6_dev		*rt6i_idev;
+	struct rt6_info * __percpu	*rt6i_pcpu;
 
 	u32				rt6i_metric;
 	u32				rt6i_pmtu;
@@ -164,7 +165,7 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout)
 
 static inline u32 rt6_get_cookie(const struct rt6_info *rt)
 {
-	if (unlikely(rt->dst.flags & DST_NOCACHE))
+	if (rt->rt6i_flags & RTF_PCPU || unlikely(rt->dst.flags & DST_NOCACHE))
 		rt = (struct rt6_info *)(rt->dst.from);
 
 	return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h
index 2be7bd174751..f6598d1c886e 100644
--- a/include/uapi/linux/ipv6_route.h
+++ b/include/uapi/linux/ipv6_route.h
@@ -34,6 +34,7 @@
 #define RTF_PREF(pref)	((pref) << 27)
 #define RTF_PREF_MASK	0x18000000
 
+#define RTF_PCPU	0x40000000
 #define RTF_LOCAL	0x80000000
 
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 83341b3a248d..55d19861ab20 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -154,10 +154,32 @@ static void node_free(struct fib6_node *fn)
 	kmem_cache_free(fib6_node_kmem, fn);
 }
 
+static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
+{
+	int cpu;
+
+	if (!non_pcpu_rt->rt6i_pcpu)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct rt6_info **ppcpu_rt;
+		struct rt6_info *pcpu_rt;
+
+		ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu);
+		pcpu_rt = *ppcpu_rt;
+		if (pcpu_rt) {
+			dst_free(&pcpu_rt->dst);
+			*ppcpu_rt = NULL;
+		}
+	}
+}
+
 static void rt6_release(struct rt6_info *rt)
 {
-	if (atomic_dec_and_test(&rt->rt6i_ref))
+	if (atomic_dec_and_test(&rt->rt6i_ref)) {
+		rt6_free_pcpu(rt);
 		dst_free(&rt->dst);
+	}
 }
 
 static void fib6_link_table(struct net *net, struct fib6_table *tb)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 90c8eaa24565..1a1122a6bbf5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -165,11 +165,18 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
 	}
 }
 
+static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
+{
+	return dst_metrics_write_ptr(rt->dst.from);
+}
+
 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
 {
 	struct rt6_info *rt = (struct rt6_info *)dst;
 
-	if (rt->rt6i_flags & RTF_CACHE)
+	if (rt->rt6i_flags & RTF_PCPU)
+		return rt6_pcpu_cow_metrics(rt);
+	else if (rt->rt6i_flags & RTF_CACHE)
 		return NULL;
 	else
 		return dst_cow_metrics_generic(dst, old);
@@ -309,10 +316,10 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
 #endif
 
 /* allocate dst with ip6_dst_ops */
-static inline struct rt6_info *ip6_dst_alloc(struct net *net,
-					     struct net_device *dev,
-					     int flags,
-					     struct fib6_table *table)
+static struct rt6_info *__ip6_dst_alloc(struct net *net,
+					struct net_device *dev,
+					int flags,
+					struct fib6_table *table)
 {
 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
 					0, DST_OBSOLETE_FORCE_CHK, flags);
@@ -327,6 +334,34 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
 	return rt;
 }
 
+static struct rt6_info *ip6_dst_alloc(struct net *net,
+				      struct net_device *dev,
+				      int flags,
+				      struct fib6_table *table)
+{
+	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);
+
+	if (rt) {
+		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
+		if (rt->rt6i_pcpu) {
+			int cpu;
+
+			for_each_possible_cpu(cpu) {
+				struct rt6_info **p;
+
+				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
+				/* no one shares rt */
+				*p =  NULL;
+			}
+		} else {
+			dst_destroy((struct dst_entry *)rt);
+			return NULL;
+		}
+	}
+
+	return rt;
+}
+
 static void ip6_dst_destroy(struct dst_entry *dst)
 {
 	struct rt6_info *rt = (struct rt6_info *)dst;
@@ -335,6 +370,9 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 
 	dst_destroy_metrics_generic(dst);
 
+	if (rt->rt6i_pcpu)
+		free_percpu(rt->rt6i_pcpu);
+
 	rt6_uncached_list_del(rt);
 
 	idev = rt->rt6i_idev;
@@ -912,11 +950,11 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
 	 *	Clone the route.
 	 */
 
-	if (ort->rt6i_flags & RTF_CACHE)
+	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
 		ort = (struct rt6_info *)ort->dst.from;
 
-	rt = ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
-			   0, ort->rt6i_table);
+	rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
+			     0, ort->rt6i_table);
 
 	if (!rt)
 		return NULL;
@@ -943,6 +981,54 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
 	return rt;
 }
 
+static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
+{
+	struct rt6_info *pcpu_rt;
+
+	pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
+				  rt->dst.dev, rt->dst.flags,
+				  rt->rt6i_table);
+
+	if (!pcpu_rt)
+		return NULL;
+	ip6_rt_copy_init(pcpu_rt, rt);
+	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
+	pcpu_rt->rt6i_flags |= RTF_PCPU;
+	return pcpu_rt;
+}
+
+/* It should be called with read_lock_bh(&tb6_lock) acquired */
+static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
+{
+	struct rt6_info *pcpu_rt, *prev, **p;
+
+	p = this_cpu_ptr(rt->rt6i_pcpu);
+	pcpu_rt = *p;
+
+	if (pcpu_rt)
+		goto done;
+
+	pcpu_rt = ip6_rt_pcpu_alloc(rt);
+	if (!pcpu_rt) {
+		struct net *net = dev_net(rt->dst.dev);
+
+		pcpu_rt = net->ipv6.ip6_null_entry;
+		goto done;
+	}
+
+	prev = cmpxchg(p, NULL, pcpu_rt);
+	if (prev) {
+		/* If someone did it before us, return prev instead */
+		dst_destroy(&pcpu_rt->dst);
+		pcpu_rt = prev;
+	}
+
+done:
+	dst_hold(&pcpu_rt->dst);
+	rt6_dst_from_metrics_check(pcpu_rt);
+	return pcpu_rt;
+}
+
 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
 				      struct flowi6 *fl6, int flags)
 {
@@ -975,11 +1061,13 @@ redo_rt6_select:
 		}
 	}
 
-	dst_use(&rt->dst, jiffies);
-	read_unlock_bh(&table->tb6_lock);
 
 	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
-		goto done;
+		dst_use(&rt->dst, jiffies);
+		read_unlock_bh(&table->tb6_lock);
+
+		rt6_dst_from_metrics_check(rt);
+		return rt;
 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
 			    !(rt->rt6i_flags & RTF_GATEWAY))) {
 		/* Create a RTF_CACHE clone which will not be
@@ -990,6 +1078,9 @@ redo_rt6_select:
 
 		struct rt6_info *uncached_rt;
 
+		dst_use(&rt->dst, jiffies);
+		read_unlock_bh(&table->tb6_lock);
+
 		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
 		dst_release(&rt->dst);
 
@@ -997,13 +1088,22 @@ redo_rt6_select:
 			rt6_uncached_list_add(uncached_rt);
 		else
 			uncached_rt = net->ipv6.ip6_null_entry;
+
 		dst_hold(&uncached_rt->dst);
 		return uncached_rt;
-	}
 
-done:
-	rt6_dst_from_metrics_check(rt);
-	return rt;
+	} else {
+		/* Get a percpu copy */
+
+		struct rt6_info *pcpu_rt;
+
+		rt->dst.lastuse = jiffies;
+		rt->dst.__use++;
+		pcpu_rt = rt6_get_pcpu_route(rt);
+		read_unlock_bh(&table->tb6_lock);
+
+		return pcpu_rt;
+	}
 }
 
 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
@@ -1147,7 +1247,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 
 	rt6_dst_from_metrics_check(rt);
 
-	if (unlikely(dst->flags & DST_NOCACHE))
+	if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
 		return rt6_dst_from_check(rt, cookie);
 	else
 		return rt6_check(rt, cookie);
-- 
cgit v1.2.3


From 7f1598678d4c05e3e085bf780a5ab3119637ac3c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 25 May 2015 16:02:21 -0700
Subject: ipv6: ipv6_select_ident() returns a __be32

ipv6_select_ident() returns a 32bit value in network order.

Fixes: 286c2349f666 ("ipv6: Clean up ipv6_select_ident() and ip6_fragment()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h     | 6 +++---
 net/ipv6/output_core.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index b950a2000b7f..35d485c78080 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -671,9 +671,9 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add
 	return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr));
 }
 
-u32 ipv6_select_ident(struct net *net,
-		      const struct in6_addr *daddr,
-		      const struct in6_addr *saddr);
+__be32 ipv6_select_ident(struct net *net,
+			 const struct in6_addr *daddr,
+			 const struct in6_addr *saddr);
 void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb);
 
 int ip6_dst_hoplimit(struct dst_entry *dst);
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 055e85cb7b65..21678acd4521 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -61,9 +61,9 @@ void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);
 
-u32 ipv6_select_ident(struct net *net,
-		      const struct in6_addr *daddr,
-		      const struct in6_addr *saddr)
+__be32 ipv6_select_ident(struct net *net,
+			 const struct in6_addr *daddr,
+			 const struct in6_addr *saddr)
 {
 	static u32 ip6_idents_hashrnd __read_mostly;
 	u32 id;
-- 
cgit v1.2.3


From ebddf1a8d78aa3436353fae75c4396e50cb2d6cf Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 26 May 2015 18:41:20 +0200
Subject: netfilter: nf_tables: allow to bind table to net_device

This patch adds the internal NFT_AF_NEEDS_DEV flag to indicate that you must
attach this table to a net_device.

This change is required by the follow up patch that introduces the new netdev
table.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        |  8 ++++++
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 net/netfilter/nf_tables_api.c            | 46 ++++++++++++++++++++++++++++----
 3 files changed, 51 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index e6bcf55dcf20..3d6f48ca40a7 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -819,6 +819,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt,
  *	@use: number of chain references to this table
  *	@flags: table flag (see enum nft_table_flags)
  *	@name: name of the table
+ *	@dev: this table is bound to this device (if any)
  */
 struct nft_table {
 	struct list_head		list;
@@ -828,6 +829,11 @@ struct nft_table {
 	u32				use;
 	u16				flags;
 	char				name[NFT_TABLE_MAXNAMELEN];
+	struct net_device		*dev;
+};
+
+enum nft_af_flags {
+	NFT_AF_NEEDS_DEV	= (1 << 0),
 };
 
 /**
@@ -838,6 +844,7 @@ struct nft_table {
  *	@nhooks: number of hooks in this family
  *	@owner: module owner
  *	@tables: used internally
+ *	@flags: family flags
  *	@nops: number of hook ops in this family
  *	@hook_ops_init: initialization function for chain hook ops
  *	@hooks: hookfn overrides for packet validation
@@ -848,6 +855,7 @@ struct nft_af_info {
 	unsigned int			nhooks;
 	struct module			*owner;
 	struct list_head		tables;
+	u32				flags;
 	unsigned int			nops;
 	void				(*hook_ops_init)(struct nf_hook_ops *,
 							 unsigned int);
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 5fa1cd04762e..89a671e0f5e7 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -146,12 +146,14 @@ enum nft_table_flags {
  * @NFTA_TABLE_NAME: name of the table (NLA_STRING)
  * @NFTA_TABLE_FLAGS: bitmask of enum nft_table_flags (NLA_U32)
  * @NFTA_TABLE_USE: number of chains in this table (NLA_U32)
+ * @NFTA_TABLE_DEV: net device name (NLA_STRING)
  */
 enum nft_table_attributes {
 	NFTA_TABLE_UNSPEC,
 	NFTA_TABLE_NAME,
 	NFTA_TABLE_FLAGS,
 	NFTA_TABLE_USE,
+	NFTA_TABLE_DEV,
 	__NFTA_TABLE_MAX
 };
 #define NFTA_TABLE_MAX		(__NFTA_TABLE_MAX - 1)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index ad9d11fb29fd..2fd4e99dd074 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -399,6 +399,8 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
 	[NFTA_TABLE_NAME]	= { .type = NLA_STRING,
 				    .len = NFT_TABLE_MAXNAMELEN - 1 },
 	[NFTA_TABLE_FLAGS]	= { .type = NLA_U32 },
+	[NFTA_TABLE_DEV]	= { .type = NLA_STRING,
+				    .len = IFNAMSIZ - 1 },
 };
 
 static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
@@ -423,6 +425,10 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
 	    nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)))
 		goto nla_put_failure;
 
+	if (table->dev &&
+	    nla_put_string(skb, NFTA_TABLE_DEV, table->dev->name))
+		goto nla_put_failure;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 
@@ -608,6 +614,11 @@ static int nf_tables_updtable(struct nft_ctx *ctx)
 	if (flags == ctx->table->flags)
 		return 0;
 
+	if ((ctx->afi->flags & NFT_AF_NEEDS_DEV) &&
+	    ctx->nla[NFTA_TABLE_DEV] &&
+	    nla_strcmp(ctx->nla[NFTA_TABLE_DEV], ctx->table->dev->name))
+		return -EOPNOTSUPP;
+
 	trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE,
 				sizeof(struct nft_trans_table));
 	if (trans == NULL)
@@ -645,6 +656,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
 	struct nft_table *table;
 	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
+	struct net_device *dev = NULL;
 	u32 flags = 0;
 	struct nft_ctx ctx;
 	int err;
@@ -679,30 +691,50 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
 			return -EINVAL;
 	}
 
+	if (afi->flags & NFT_AF_NEEDS_DEV) {
+		char ifname[IFNAMSIZ];
+
+		if (!nla[NFTA_TABLE_DEV])
+			return -EOPNOTSUPP;
+
+		nla_strlcpy(ifname, nla[NFTA_TABLE_DEV], IFNAMSIZ);
+		dev = dev_get_by_name(net, ifname);
+		if (!dev)
+			return -ENOENT;
+	} else if (nla[NFTA_TABLE_DEV]) {
+		return -EOPNOTSUPP;
+	}
+
+	err = -EAFNOSUPPORT;
 	if (!try_module_get(afi->owner))
-		return -EAFNOSUPPORT;
+		goto err1;
 
 	err = -ENOMEM;
 	table = kzalloc(sizeof(*table), GFP_KERNEL);
 	if (table == NULL)
-		goto err1;
+		goto err2;
 
 	nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN);
 	INIT_LIST_HEAD(&table->chains);
 	INIT_LIST_HEAD(&table->sets);
 	table->flags = flags;
+	table->dev   = dev;
 
 	nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
 	err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
 	if (err < 0)
-		goto err2;
+		goto err3;
 
 	list_add_tail_rcu(&table->list, &afi->tables);
 	return 0;
-err2:
+err3:
 	kfree(table);
-err1:
+err2:
 	module_put(afi->owner);
+err1:
+	if (dev != NULL)
+		dev_put(dev);
+
 	return err;
 }
 
@@ -806,6 +838,9 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx)
 {
 	BUG_ON(ctx->table->use > 0);
 
+	if (ctx->table->dev)
+		dev_put(ctx->table->dev);
+
 	kfree(ctx->table);
 	module_put(ctx->afi->owner);
 }
@@ -1361,6 +1396,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 			ops->priority	= priority;
 			ops->priv	= chain;
 			ops->hook	= afi->hooks[ops->hooknum];
+			ops->dev	= table->dev;
 			if (hookfn)
 				ops->hook = hookfn;
 			if (afi->hook_ops_init)
-- 
cgit v1.2.3


From ed6c4136f1571bd6ab362afc3410905a8a69ca42 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 26 May 2015 18:41:40 +0200
Subject: netfilter: nf_tables: add netdev table to filter from ingress

This allows us to create netdev tables that contain ingress chains. Use
skb_header_pointer() as we may see shared sk_buffs at this stage.

This change provides access to the existing nf_tables features from the ingress
hook.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netns/nftables.h     |   1 +
 net/netfilter/Kconfig            |   5 ++
 net/netfilter/Makefile           |   1 +
 net/netfilter/nf_tables_netdev.c | 183 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 190 insertions(+)
 create mode 100644 net/netfilter/nf_tables_netdev.c

(limited to 'include/net')

diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h
index eee608b12cc9..c80781146019 100644
--- a/include/net/netns/nftables.h
+++ b/include/net/netns/nftables.h
@@ -13,6 +13,7 @@ struct netns_nftables {
 	struct nft_af_info	*inet;
 	struct nft_af_info	*arp;
 	struct nft_af_info	*bridge;
+	struct nft_af_info	*netdev;
 	unsigned int		base_seq;
 	u8			gencursor;
 };
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 9a89e7c67d78..bd5aaebaa9a2 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -456,6 +456,11 @@ config NF_TABLES_INET
 	help
 	  This option enables support for a mixed IPv4/IPv6 "inet" table.
 
+config NF_TABLES_NETDEV
+	tristate "Netfilter nf_tables netdev tables support"
+	help
+	  This option enables support for the "netdev" table.
+
 config NFT_EXTHDR
 	tristate "Netfilter nf_tables IPv6 exthdr module"
 	help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index a87d8b8ec730..70d026d46fe7 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -75,6 +75,7 @@ nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
 
 obj-$(CONFIG_NF_TABLES)		+= nf_tables.o
 obj-$(CONFIG_NF_TABLES_INET)	+= nf_tables_inet.o
+obj-$(CONFIG_NF_TABLES_NETDEV)	+= nf_tables_netdev.o
 obj-$(CONFIG_NFT_COMPAT)	+= nft_compat.o
 obj-$(CONFIG_NFT_EXTHDR)	+= nft_exthdr.o
 obj-$(CONFIG_NFT_META)		+= nft_meta.o
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
new file mode 100644
index 000000000000..04cb17057f46
--- /dev/null
+++ b/net/netfilter/nf_tables_netdev.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <net/netfilter/nf_tables.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+
+static inline void
+nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
+			    const struct nf_hook_ops *ops, struct sk_buff *skb,
+			    const struct nf_hook_state *state)
+{
+	struct iphdr *iph, _iph;
+	u32 len, thoff;
+
+	nft_set_pktinfo(pkt, ops, skb, state);
+
+	iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph),
+				 &_iph);
+	if (!iph)
+		return;
+
+	iph = ip_hdr(skb);
+	if (iph->ihl < 5 || iph->version != 4)
+		return;
+
+	len = ntohs(iph->tot_len);
+	thoff = iph->ihl * 4;
+	if (skb->len < len)
+		return;
+	else if (len < thoff)
+		return;
+
+	pkt->tprot = iph->protocol;
+	pkt->xt.thoff = thoff;
+	pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET;
+}
+
+static inline void
+__nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
+			      const struct nf_hook_ops *ops,
+			      struct sk_buff *skb,
+			      const struct nf_hook_state *state)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	struct ipv6hdr *ip6h, _ip6h;
+	unsigned int thoff = 0;
+	unsigned short frag_off;
+	int protohdr;
+	u32 pkt_len;
+
+	ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h),
+				  &_ip6h);
+	if (!ip6h)
+		return;
+
+	if (ip6h->version != 6)
+		return;
+
+	pkt_len = ntohs(ip6h->payload_len);
+	if (pkt_len + sizeof(*ip6h) > skb->len)
+		return;
+
+	protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL);
+	if (protohdr < 0)
+                return;
+
+	pkt->tprot = protohdr;
+	pkt->xt.thoff = thoff;
+	pkt->xt.fragoff = frag_off;
+#endif
+}
+
+static inline void nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
+					       const struct nf_hook_ops *ops,
+					       struct sk_buff *skb,
+					       const struct nf_hook_state *state)
+{
+	nft_set_pktinfo(pkt, ops, skb, state);
+	__nft_netdev_set_pktinfo_ipv6(pkt, ops, skb, state);
+}
+
+static unsigned int
+nft_do_chain_netdev(const struct nf_hook_ops *ops, struct sk_buff *skb,
+		    const struct nf_hook_state *state)
+{
+	struct nft_pktinfo pkt;
+
+	switch (eth_hdr(skb)->h_proto) {
+	case htons(ETH_P_IP):
+		nft_netdev_set_pktinfo_ipv4(&pkt, ops, skb, state);
+		break;
+	case htons(ETH_P_IPV6):
+		nft_netdev_set_pktinfo_ipv6(&pkt, ops, skb, state);
+		break;
+	default:
+		nft_set_pktinfo(&pkt, ops, skb, state);
+		break;
+	}
+
+	return nft_do_chain(&pkt, ops);
+}
+
+static struct nft_af_info nft_af_netdev __read_mostly = {
+	.family		= NFPROTO_NETDEV,
+	.nhooks		= NF_NETDEV_NUMHOOKS,
+	.owner		= THIS_MODULE,
+	.flags		= NFT_AF_NEEDS_DEV,
+	.nops		= 1,
+	.hooks		= {
+		[NF_NETDEV_INGRESS]	= nft_do_chain_netdev,
+	},
+};
+
+static int nf_tables_netdev_init_net(struct net *net)
+{
+	net->nft.netdev = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+	if (net->nft.netdev == NULL)
+		return -ENOMEM;
+
+	memcpy(net->nft.netdev, &nft_af_netdev, sizeof(nft_af_netdev));
+
+	if (nft_register_afinfo(net, net->nft.netdev) < 0)
+		goto err;
+
+	return 0;
+err:
+	kfree(net->nft.netdev);
+	return -ENOMEM;
+}
+
+static void nf_tables_netdev_exit_net(struct net *net)
+{
+	nft_unregister_afinfo(net->nft.netdev);
+	kfree(net->nft.netdev);
+}
+
+static struct pernet_operations nf_tables_netdev_net_ops = {
+	.init	= nf_tables_netdev_init_net,
+	.exit	= nf_tables_netdev_exit_net,
+};
+
+static const struct nf_chain_type nft_filter_chain_netdev = {
+	.name		= "filter",
+	.type		= NFT_CHAIN_T_DEFAULT,
+	.family		= NFPROTO_NETDEV,
+	.owner		= THIS_MODULE,
+	.hook_mask	= (1 << NF_NETDEV_INGRESS),
+};
+
+static int __init nf_tables_netdev_init(void)
+{
+	int ret;
+
+	nft_register_chain_type(&nft_filter_chain_netdev);
+	ret = register_pernet_subsys(&nf_tables_netdev_net_ops);
+	if (ret < 0)
+		nft_unregister_chain_type(&nft_filter_chain_netdev);
+
+	return ret;
+}
+
+static void __exit nf_tables_netdev_exit(void)
+{
+	unregister_pernet_subsys(&nf_tables_netdev_net_ops);
+	nft_unregister_chain_type(&nft_filter_chain_netdev);
+}
+
+module_init(nf_tables_netdev_init);
+module_exit(nf_tables_netdev_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_FAMILY(5); /* NFPROTO_NETDEV */
-- 
cgit v1.2.3


From d0997b44c9081071e39417b188f7db6d1ea37341 Mon Sep 17 00:00:00 2001
From: Lennert Buytenhek <buytenh@wantstofly.org>
Date: Mon, 25 May 2015 15:38:33 +0300
Subject: ieee802154: Remove ieee802154_reduced_mlme_ops references.

As there doesn't seem to be a definition of it or any users of it.

Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org>
Acked-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/ieee802154_netdev.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ieee802154_netdev.h b/include/net/ieee802154_netdev.h
index 84a72a1382a8..0a87975128ec 100644
--- a/include/net/ieee802154_netdev.h
+++ b/include/net/ieee802154_netdev.h
@@ -430,10 +430,4 @@ ieee802154_mlme_ops(const struct net_device *dev)
 	return dev->ml_priv;
 }
 
-static inline struct ieee802154_reduced_mlme_ops *
-ieee802154_reduced_mlme_ops(const struct net_device *dev)
-{
-	return dev->ml_priv;
-}
-
 #endif
-- 
cgit v1.2.3


From 095dc8e0c3686d586a01a50abc3e1bb9ac633054 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 26 May 2015 07:55:34 -0700
Subject: tcp: fix/cleanup inet_ehash_locks_alloc()

If tcp ehash table is constrained to a very small number of buckets
(eg boot parameter thash_entries=128), then we can crash if spinlock
array has more entries.

While we are at it, un-inline inet_ehash_locks_alloc() and make
following changes :

- Budget 2 cache lines per cpu worth of 'spinlocks'
- Try to kmalloc() the array to avoid extra TLB pressure.
  (Most servers at Google allocate 8192 bytes for this hash table)
- Get rid of various #ifdef

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h | 47 +++----------------------------------------
 net/ipv4/inet_hashtables.c    | 31 ++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 44 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 774d24151d4a..b73c88a19dd4 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -24,7 +24,6 @@
 #include <linux/spinlock.h>
 #include <linux/types.h>
 #include <linux/wait.h>
-#include <linux/vmalloc.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_sock.h>
@@ -164,52 +163,12 @@ static inline spinlock_t *inet_ehash_lockp(
 	return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask];
 }
 
-static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
-{
-	unsigned int i, size = 256;
-#if defined(CONFIG_PROVE_LOCKING)
-	unsigned int nr_pcpus = 2;
-#else
-	unsigned int nr_pcpus = num_possible_cpus();
-#endif
-	if (nr_pcpus >= 4)
-		size = 512;
-	if (nr_pcpus >= 8)
-		size = 1024;
-	if (nr_pcpus >= 16)
-		size = 2048;
-	if (nr_pcpus >= 32)
-		size = 4096;
-	if (sizeof(spinlock_t) != 0) {
-#ifdef CONFIG_NUMA
-		if (size * sizeof(spinlock_t) > PAGE_SIZE)
-			hashinfo->ehash_locks = vmalloc(size * sizeof(spinlock_t));
-		else
-#endif
-		hashinfo->ehash_locks =	kmalloc(size * sizeof(spinlock_t),
-						GFP_KERNEL);
-		if (!hashinfo->ehash_locks)
-			return ENOMEM;
-		for (i = 0; i < size; i++)
-			spin_lock_init(&hashinfo->ehash_locks[i]);
-	}
-	hashinfo->ehash_locks_mask = size - 1;
-	return 0;
-}
+int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo);
 
 static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
 {
-	if (hashinfo->ehash_locks) {
-#ifdef CONFIG_NUMA
-		unsigned int size = (hashinfo->ehash_locks_mask + 1) *
-							sizeof(spinlock_t);
-		if (size > PAGE_SIZE)
-			vfree(hashinfo->ehash_locks);
-		else
-#endif
-		kfree(hashinfo->ehash_locks);
-		hashinfo->ehash_locks = NULL;
-	}
+	kvfree(hashinfo->ehash_locks);
+	hashinfo->ehash_locks = NULL;
 }
 
 struct inet_bind_bucket *
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 3766bddb3e8a..185efef0f125 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/wait.h>
+#include <linux/vmalloc.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_hashtables.h>
@@ -609,3 +610,33 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
 		}
 }
 EXPORT_SYMBOL_GPL(inet_hashinfo_init);
+
+int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
+{
+	unsigned int i, nblocks = 1;
+
+	if (sizeof(spinlock_t) != 0) {
+		/* allocate 2 cache lines or at least one spinlock per cpu */
+		nblocks = max_t(unsigned int,
+				2 * L1_CACHE_BYTES / sizeof(spinlock_t),
+				1);
+		nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());
+
+		/* no more locks than number of hash buckets */
+		nblocks = min(nblocks, hashinfo->ehash_mask + 1);
+
+		hashinfo->ehash_locks =	kmalloc_array(nblocks, sizeof(spinlock_t),
+						      GFP_KERNEL | __GFP_NOWARN);
+		if (!hashinfo->ehash_locks)
+			hashinfo->ehash_locks = vmalloc(nblocks * sizeof(spinlock_t));
+
+		if (!hashinfo->ehash_locks)
+			return -ENOMEM;
+
+		for (i = 0; i < nblocks; i++)
+			spin_lock_init(&hashinfo->ehash_locks[i]);
+	}
+	hashinfo->ehash_locks_mask = nblocks - 1;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);
-- 
cgit v1.2.3


From 0f999b09f5c1b135e840501840dbcd01fad66f79 Mon Sep 17 00:00:00 2001
From: Varka Bhadram <varkabhadram@gmail.com>
Date: Wed, 27 May 2015 09:10:54 +0530
Subject: ieee802154: add set transmit power support

This patch adds transmission power setting support for IEEE-802.15.4
devices via nl802154.

Signed-off-by: Varka Bhadram <varkab@cdac.in>
Acked-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/cfg802154.h   |  1 +
 net/ieee802154/nl802154.c | 30 ++++++++++++++++++++++++++++++
 net/ieee802154/rdev-ops.h | 12 ++++++++++++
 net/ieee802154/trace.h    | 15 +++++++++++++++
 net/mac802154/cfg.c       | 19 +++++++++++++++++++
 5 files changed, 77 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 4de59aa96173..2e3bb012c2ff 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -44,6 +44,7 @@ struct cfg802154_ops {
 	int	(*set_channel)(struct wpan_phy *wpan_phy, u8 page, u8 channel);
 	int	(*set_cca_mode)(struct wpan_phy *wpan_phy,
 				const struct wpan_phy_cca *cca);
+	int     (*set_tx_power)(struct wpan_phy *wpan_phy, s32 power);
 	int	(*set_pan_id)(struct wpan_phy *wpan_phy,
 			      struct wpan_dev *wpan_dev, __le16 pan_id);
 	int	(*set_short_addr)(struct wpan_phy *wpan_phy,
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index 212a795b33c9..5c9d524ba02c 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -783,6 +783,28 @@ static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info)
 	return rdev_set_cca_mode(rdev, &cca);
 }
 
+static int nl802154_set_tx_power(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg802154_registered_device *rdev = info->user_ptr[0];
+	s32 power;
+	int i;
+
+	if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER))
+		return -EOPNOTSUPP;
+
+	if (!info->attrs[NL802154_ATTR_TX_POWER])
+		return -EINVAL;
+
+	power = nla_get_s32(info->attrs[NL802154_ATTR_TX_POWER]);
+
+	for (i = 0; i < rdev->wpan_phy.supported.tx_powers_size; i++) {
+		if (power == rdev->wpan_phy.supported.tx_powers[i])
+			return rdev_set_tx_power(rdev, power);
+	}
+
+	return -EINVAL;
+}
+
 static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg802154_registered_device *rdev = info->user_ptr[0];
@@ -1093,6 +1115,14 @@ static const struct genl_ops nl802154_ops[] = {
 		.internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
 				  NL802154_FLAG_NEED_RTNL,
 	},
+	{
+		.cmd = NL802154_CMD_SET_TX_POWER,
+		.doit = nl802154_set_tx_power,
+		.policy = nl802154_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+				  NL802154_FLAG_NEED_RTNL,
+	},
 	{
 		.cmd = NL802154_CMD_SET_PAN_ID,
 		.doit = nl802154_set_pan_id,
diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h
index 7b5a9dd94fe5..36456118a6ec 100644
--- a/net/ieee802154/rdev-ops.h
+++ b/net/ieee802154/rdev-ops.h
@@ -74,6 +74,18 @@ rdev_set_cca_mode(struct cfg802154_registered_device *rdev,
 	return ret;
 }
 
+static inline int
+rdev_set_tx_power(struct cfg802154_registered_device *rdev,
+		  s32 power)
+{
+	int ret;
+
+	trace_802154_rdev_set_tx_power(&rdev->wpan_phy, power);
+	ret = rdev->ops->set_tx_power(&rdev->wpan_phy, power);
+	trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+	return ret;
+}
+
 static inline int
 rdev_set_pan_id(struct cfg802154_registered_device *rdev,
 		struct wpan_dev *wpan_dev, __le16 pan_id)
diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h
index 5ac25eb6ed17..1f1cecc420d7 100644
--- a/net/ieee802154/trace.h
+++ b/net/ieee802154/trace.h
@@ -93,6 +93,21 @@ TRACE_EVENT(802154_rdev_set_channel,
 		  __entry->page, __entry->channel)
 );
 
+TRACE_EVENT(802154_rdev_set_tx_power,
+	TP_PROTO(struct wpan_phy *wpan_phy, s32 power),
+	TP_ARGS(wpan_phy, power),
+	TP_STRUCT__entry(
+		WPAN_PHY_ENTRY
+		__field(s32, power)
+	),
+	TP_fast_assign(
+		WPAN_PHY_ASSIGN;
+		__entry->power = power;
+	),
+	TP_printk(WPAN_PHY_PR_FMT ", power: %d", WPAN_PHY_PR_ARG,
+		  __entry->power)
+);
+
 TRACE_EVENT(802154_rdev_set_cca_mode,
 	TP_PROTO(struct wpan_phy *wpan_phy, const struct wpan_phy_cca *cca),
 	TP_ARGS(wpan_phy, cca),
diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c
index d5290ea49dca..ab12fa296eb3 100644
--- a/net/mac802154/cfg.c
+++ b/net/mac802154/cfg.c
@@ -105,6 +105,24 @@ ieee802154_set_cca_mode(struct wpan_phy *wpan_phy,
 	return ret;
 }
 
+static int
+ieee802154_set_tx_power(struct wpan_phy *wpan_phy, s32 power)
+{
+	struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
+	int ret;
+
+	ASSERT_RTNL();
+
+	if (wpan_phy->transmit_power == power)
+		return 0;
+
+	ret = drv_set_tx_power(local, power);
+	if (!ret)
+		wpan_phy->transmit_power = power;
+
+	return ret;
+}
+
 static int
 ieee802154_set_pan_id(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
 		      __le16 pan_id)
@@ -195,6 +213,7 @@ const struct cfg802154_ops mac802154_config_ops = {
 	.del_virtual_intf = ieee802154_del_iface,
 	.set_channel = ieee802154_set_channel,
 	.set_cca_mode = ieee802154_set_cca_mode,
+	.set_tx_power = ieee802154_set_tx_power,
 	.set_pan_id = ieee802154_set_pan_id,
 	.set_short_addr = ieee802154_set_short_addr,
 	.set_backoff_exponent = ieee802154_set_backoff_exponent,
-- 
cgit v1.2.3


From d6b915e29f4adea94bc02ba7675bb4f84e6a1abd Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 22 May 2015 16:32:51 +0200
Subject: ip_fragment: don't forward defragmented DF packet

We currently always send fragments without DF bit set.

Thus, given following setup:

mtu1500 - mtu1500:1400 - mtu1400:1280 - mtu1280
   A           R1              R2         B

Where R1 and R2 run linux with netfilter defragmentation/conntrack
enabled, then if Host A sent a fragmented packet _with_ DF set to B, R1
will respond with icmp too big error if one of these fragments exceeded
1400 bytes.

However, if R1 receives fragment sizes 1200 and 100, it would
forward the reassembled packet without refragmenting, i.e.
R2 will send an icmp error in response to a packet that was never sent,
citing mtu that the original sender never exceeded.

The other minor issue is that a refragmentation on R1 will conceal the
MTU of R2-B since refragmentation does not set DF bit on the fragments.

This modifies ip_fragment so that we track largest fragment size seen
both for DF and non-DF packets, and set frag_max_size to the largest
value.

If the DF fragment size is larger or equal to the non-df one, we will
consider the packet a path mtu probe:
We set DF bit on the reassembled skb and also tag it with a new IPCB flag
to force refragmentation even if skb fits outdev mtu.

We will also set DF bit on each fragment in this case.

Joint work with Hannes Frederic Sowa.

Reported-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h |  2 +-
 include/net/ip.h        |  1 +
 net/ipv4/ip_fragment.c  | 31 ++++++++++++++++++++++++++-----
 net/ipv4/ip_output.c    | 12 ++++++++++--
 4 files changed, 38 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 8d1765577acc..e1300b3dd597 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -43,7 +43,7 @@ enum {
  * @len: total length of the original datagram
  * @meat: length of received fragments so far
  * @flags: fragment queue flags
- * @max_size: (ipv4 only) maximum received fragment size with IP_DF set
+ * @max_size: maximum received fragment size
  * @net: namespace that this frag belongs to
  */
 struct inet_frag_queue {
diff --git a/include/net/ip.h b/include/net/ip.h
index 7921a36b805c..9b976cf99122 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -45,6 +45,7 @@ struct inet_skb_parm {
 #define IPSKB_FRAG_COMPLETE	BIT(3)
 #define IPSKB_REROUTED		BIT(4)
 #define IPSKB_DOREDIRECT	BIT(5)
+#define IPSKB_FRAG_PMTU		BIT(6)
 
 	u16			frag_max_size;
 };
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 47fa64ee82b1..a50dc6d408d1 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -75,6 +75,7 @@ struct ipq {
 	__be16		id;
 	u8		protocol;
 	u8		ecn; /* RFC3168 support */
+	u16		max_df_size; /* largest frag with DF set seen */
 	int             iif;
 	unsigned int    rid;
 	struct inet_peer *peer;
@@ -326,6 +327,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
 	struct sk_buff *prev, *next;
 	struct net_device *dev;
+	unsigned int fragsize;
 	int flags, offset;
 	int ihl, end;
 	int err = -ENOENT;
@@ -481,9 +483,14 @@ found:
 	if (offset == 0)
 		qp->q.flags |= INET_FRAG_FIRST_IN;
 
+	fragsize = skb->len + ihl;
+
+	if (fragsize > qp->q.max_size)
+		qp->q.max_size = fragsize;
+
 	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
-	    skb->len + ihl > qp->q.max_size)
-		qp->q.max_size = skb->len + ihl;
+	    fragsize > qp->max_df_size)
+		qp->max_df_size = fragsize;
 
 	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
 	    qp->q.meat == qp->q.len) {
@@ -613,13 +620,27 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	head->next = NULL;
 	head->dev = dev;
 	head->tstamp = qp->q.stamp;
-	IPCB(head)->frag_max_size = qp->q.max_size;
+	IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
 
 	iph = ip_hdr(head);
-	/* max_size != 0 implies at least one fragment had IP_DF set */
-	iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
 	iph->tot_len = htons(len);
 	iph->tos |= ecn;
+
+	/* When we set IP_DF on a refragmented skb we must also force a
+	 * call to ip_fragment to avoid forwarding a DF-skb of size s while
+	 * original sender only sent fragments of size f (where f < s).
+	 *
+	 * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
+	 * frag seen to avoid sending tiny DF-fragments in case skb was built
+	 * from one very small df-fragment and one large non-df frag.
+	 */
+	if (qp->max_df_size == qp->q.max_size) {
+		IPCB(head)->flags |= IPSKB_FRAG_PMTU;
+		iph->frag_off = htons(IP_DF);
+	} else {
+		iph->frag_off = 0;
+	}
+
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
 	qp->q.fragments = NULL;
 	qp->q.fragments_tail = NULL;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d6dd8ba04441..f5f5ef1cebd5 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -278,7 +278,7 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
 	if (skb_is_gso(skb))
 		return ip_finish_output_gso(sk, skb, mtu);
 
-	if (skb->len > mtu)
+	if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
 		return ip_fragment(sk, skb, mtu, ip_finish_output2);
 
 	return ip_finish_output2(sk, skb);
@@ -492,7 +492,10 @@ static int ip_fragment(struct sock *sk, struct sk_buff *skb,
 {
 	struct iphdr *iph = ip_hdr(skb);
 
-	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
+	if ((iph->frag_off & htons(IP_DF)) == 0)
+		return ip_do_fragment(sk, skb, output);
+
+	if (unlikely(!skb->ignore_df ||
 		     (IPCB(skb)->frag_max_size &&
 		      IPCB(skb)->frag_max_size > mtu))) {
 		struct rtable *rt = skb_rtable(skb);
@@ -537,6 +540,8 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
 	iph = ip_hdr(skb);
 
 	mtu = ip_skb_dst_mtu(skb);
+	if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
+		mtu = IPCB(skb)->frag_max_size;
 
 	/*
 	 *	Setup starting values.
@@ -732,6 +737,9 @@ slow_path:
 		iph = ip_hdr(skb2);
 		iph->frag_off = htons((offset >> 3));
 
+		if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
+			iph->frag_off |= htons(IP_DF);
+
 		/* ANK: dirty, but effective trick. Upgrade options only if
 		 * the segment to be fragmented was THE FIRST (otherwise,
 		 * options are already fixed) and make it ONCE
-- 
cgit v1.2.3


From b69644c1c72e179738dd5c7e52e99d8550189472 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Wed, 27 May 2015 13:42:10 +0200
Subject: nl802154: add support to set cca ed level

This patch adds support for setting the current cca ed level value over
nl802154.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Reviewed-by: Varka Bhadram <varkabhadram@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/cfg802154.h   |  1 +
 net/ieee802154/nl802154.c | 30 ++++++++++++++++++++++++++++++
 net/ieee802154/rdev-ops.h | 11 +++++++++++
 net/ieee802154/trace.h    | 15 +++++++++++++++
 net/mac802154/cfg.c       | 19 +++++++++++++++++++
 5 files changed, 76 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 2e3bb012c2ff..290a9a69af07 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -44,6 +44,7 @@ struct cfg802154_ops {
 	int	(*set_channel)(struct wpan_phy *wpan_phy, u8 page, u8 channel);
 	int	(*set_cca_mode)(struct wpan_phy *wpan_phy,
 				const struct wpan_phy_cca *cca);
+	int     (*set_cca_ed_level)(struct wpan_phy *wpan_phy, s32 ed_level);
 	int     (*set_tx_power)(struct wpan_phy *wpan_phy, s32 power);
 	int	(*set_pan_id)(struct wpan_phy *wpan_phy,
 			      struct wpan_dev *wpan_dev, __le16 pan_id);
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index a89d9d024126..7dbb1f4ce7df 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -790,6 +790,28 @@ static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info)
 	return rdev_set_cca_mode(rdev, &cca);
 }
 
+static int nl802154_set_cca_ed_level(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg802154_registered_device *rdev = info->user_ptr[0];
+	s32 ed_level;
+	int i;
+
+	if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL))
+		return -EOPNOTSUPP;
+
+	if (!info->attrs[NL802154_ATTR_CCA_ED_LEVEL])
+		return -EINVAL;
+
+	ed_level = nla_get_s32(info->attrs[NL802154_ATTR_CCA_ED_LEVEL]);
+
+	for (i = 0; i < rdev->wpan_phy.supported.cca_ed_levels_size; i++) {
+		if (ed_level == rdev->wpan_phy.supported.cca_ed_levels[i])
+			return rdev_set_cca_ed_level(rdev, ed_level);
+	}
+
+	return -EINVAL;
+}
+
 static int nl802154_set_tx_power(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg802154_registered_device *rdev = info->user_ptr[0];
@@ -1122,6 +1144,14 @@ static const struct genl_ops nl802154_ops[] = {
 		.internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
 				  NL802154_FLAG_NEED_RTNL,
 	},
+	{
+		.cmd = NL802154_CMD_SET_CCA_ED_LEVEL,
+		.doit = nl802154_set_cca_ed_level,
+		.policy = nl802154_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+				  NL802154_FLAG_NEED_RTNL,
+	},
 	{
 		.cmd = NL802154_CMD_SET_TX_POWER,
 		.doit = nl802154_set_tx_power,
diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h
index 36456118a6ec..b2155a123f6c 100644
--- a/net/ieee802154/rdev-ops.h
+++ b/net/ieee802154/rdev-ops.h
@@ -74,6 +74,17 @@ rdev_set_cca_mode(struct cfg802154_registered_device *rdev,
 	return ret;
 }
 
+static inline int
+rdev_set_cca_ed_level(struct cfg802154_registered_device *rdev, s32 ed_level)
+{
+	int ret;
+
+	trace_802154_rdev_set_cca_ed_level(&rdev->wpan_phy, ed_level);
+	ret = rdev->ops->set_cca_ed_level(&rdev->wpan_phy, ed_level);
+	trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+	return ret;
+}
+
 static inline int
 rdev_set_tx_power(struct cfg802154_registered_device *rdev,
 		  s32 power)
diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h
index ac790bb1b4fe..73eb7605c1eb 100644
--- a/net/ieee802154/trace.h
+++ b/net/ieee802154/trace.h
@@ -123,6 +123,21 @@ TRACE_EVENT(802154_rdev_set_cca_mode,
 		  WPAN_CCA_PR_ARG)
 );
 
+TRACE_EVENT(802154_rdev_set_cca_ed_level,
+	TP_PROTO(struct wpan_phy *wpan_phy, s32 ed_level),
+	TP_ARGS(wpan_phy, ed_level),
+	TP_STRUCT__entry(
+		WPAN_PHY_ENTRY
+		__field(s32, ed_level)
+	),
+	TP_fast_assign(
+		WPAN_PHY_ASSIGN;
+		__entry->ed_level = ed_level;
+	),
+	TP_printk(WPAN_PHY_PR_FMT ", ed_level: %d", WPAN_PHY_PR_ARG,
+		  __entry->ed_level)
+);
+
 DECLARE_EVENT_CLASS(802154_le16_template,
 	TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
 		 __le16 le16arg),
diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c
index ab12fa296eb3..317c4662e544 100644
--- a/net/mac802154/cfg.c
+++ b/net/mac802154/cfg.c
@@ -105,6 +105,24 @@ ieee802154_set_cca_mode(struct wpan_phy *wpan_phy,
 	return ret;
 }
 
+static int
+ieee802154_set_cca_ed_level(struct wpan_phy *wpan_phy, s32 ed_level)
+{
+	struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
+	int ret;
+
+	ASSERT_RTNL();
+
+	if (wpan_phy->cca_ed_level == ed_level)
+		return 0;
+
+	ret = drv_set_cca_ed_level(local, ed_level);
+	if (!ret)
+		wpan_phy->cca_ed_level = ed_level;
+
+	return ret;
+}
+
 static int
 ieee802154_set_tx_power(struct wpan_phy *wpan_phy, s32 power)
 {
@@ -213,6 +231,7 @@ const struct cfg802154_ops mac802154_config_ops = {
 	.del_virtual_intf = ieee802154_del_iface,
 	.set_channel = ieee802154_set_channel,
 	.set_cca_mode = ieee802154_set_cca_mode,
+	.set_cca_ed_level = ieee802154_set_cca_ed_level,
 	.set_tx_power = ieee802154_set_tx_power,
 	.set_pan_id = ieee802154_set_pan_id,
 	.set_short_addr = ieee802154_set_short_addr,
-- 
cgit v1.2.3


From 9302d7bb0c5cd46be5706859301f18c137b2439f Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Tue, 26 May 2015 17:30:17 -0600
Subject: sctp: Fix mangled IPv4 addresses on a IPv6 listening socket

sctp_v4_map_v6 was subtly writing and reading from members
of a union in a way the clobbered data it needed to read before
it read it.

Zeroing the v6 flowinfo overwrites the v4 sin_addr with 0, meaning
that every place that calls sctp_v4_map_v6 gets ::ffff:0.0.0.0 as the
result.

Reorder things to guarantee correct behaviour no matter what the
union layout is.

This impacts user space clients that open an IPv6 SCTP socket and
receive IPv4 connections. Prior to 299ee user space would see a
sockaddr with AF_INET and a correct address, after 299ee the sockaddr
is AF_INET6, but the address is wrong.

Fixes: 299ee123e198 (sctp: Fixup v4mapped behaviour to comply with Sock API)
Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index c56a438c3a1e..ce13cf20f625 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -574,11 +574,14 @@ static inline void sctp_v6_map_v4(union sctp_addr *addr)
 /* Map v4 address to v4-mapped v6 address */
 static inline void sctp_v4_map_v6(union sctp_addr *addr)
 {
+	__be16 port;
+
+	port = addr->v4.sin_port;
+	addr->v6.sin6_addr.s6_addr32[3] = addr->v4.sin_addr.s_addr;
+	addr->v6.sin6_port = port;
 	addr->v6.sin6_family = AF_INET6;
 	addr->v6.sin6_flowinfo = 0;
 	addr->v6.sin6_scope_id = 0;
-	addr->v6.sin6_port = addr->v4.sin_port;
-	addr->v6.sin6_addr.s6_addr32[3] = addr->v4.sin_addr.s_addr;
 	addr->v6.sin6_addr.s6_addr32[0] = 0;
 	addr->v6.sin6_addr.s6_addr32[1] = 0;
 	addr->v6.sin6_addr.s6_addr32[2] = htonl(0x0000ffff);
-- 
cgit v1.2.3


From ed2dfd900992aa7b6b3d0abd8ec9a7e9d2c7f827 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 27 May 2015 11:34:37 -0700
Subject: tcp/dccp: warn user for preferred ip_local_port_range

After commit 07f4c90062f8f ("tcp/dccp: try to not exhaust
ip_local_port_range in connect()") it is advised to have an even number
of ports described in /proc/sys/net/ipv4/ip_local_port_range

This means start/end values should have a different parity.

Let's warn sysadmins of this, so that they can update their settings
if they want to.

Suggested-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   | 1 +
 net/ipv4/sysctl_net_ipv4.c | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 6848b8bb2e63..c68926b4899c 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -19,6 +19,7 @@ struct sock;
 struct local_ports {
 	seqlock_t	lock;
 	int		range[2];
+	bool		warned;
 };
 
 struct ping_group_range {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e64892769607..0330ab2e2b63 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -45,7 +45,13 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
 /* Update system visible IP port range */
 static void set_local_port_range(struct net *net, int range[2])
 {
+	bool same_parity = !((range[0] ^ range[1]) & 1);
+
 	write_seqlock(&net->ipv4.ip_local_ports.lock);
+	if (same_parity && !net->ipv4.ip_local_ports.warned) {
+		net->ipv4.ip_local_ports.warned = true;
+		pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n");
+	}
 	net->ipv4.ip_local_ports.range[0] = range[0];
 	net->ipv4.ip_local_ports.range[1] = range[1];
 	write_sequnlock(&net->ipv4.ip_local_ports.lock);
-- 
cgit v1.2.3


From 3a7af58faa7829faa26026c245d2a8a44e9605c5 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Mon, 13 Apr 2015 18:27:35 +0200
Subject: mac80211: Fix mac80211.h docbook comments

A couple of enums in mac80211.h became structures recently, but the
comments didn't follow suit, leading to errors like:

  Error(.//include/net/mac80211.h:367): Cannot parse enum!
  Documentation/DocBook/Makefile:93: recipe for target 'Documentation/DocBook/80211.xml' failed
  make[1]: *** [Documentation/DocBook/80211.xml] Error 1
  Makefile:1361: recipe for target 'mandocs' failed
  make: *** [mandocs] Error 2

Fix the comments comments accordingly.  Added a couple of other small
comment fixes while I was there to silence other recently-added docbook
warnings.

Reported-by: Jim Davis <jim.epost@gmail.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 8e3668b44c29..fc57f6b82fc5 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -354,7 +354,7 @@ enum ieee80211_rssi_event_data {
 };
 
 /**
- * enum ieee80211_rssi_event - data attached to an %RSSI_EVENT
+ * struct ieee80211_rssi_event - data attached to an %RSSI_EVENT
  * @data: See &enum ieee80211_rssi_event_data
  */
 struct ieee80211_rssi_event {
@@ -388,7 +388,7 @@ enum ieee80211_mlme_event_status {
 };
 
 /**
- * enum ieee80211_mlme_event - data attached to an %MLME_EVENT
+ * struct ieee80211_mlme_event - data attached to an %MLME_EVENT
  * @data: See &enum ieee80211_mlme_event_data
  * @status: See &enum ieee80211_mlme_event_status
  * @reason: the reason code if applicable
@@ -401,9 +401,10 @@ struct ieee80211_mlme_event {
 
 /**
  * struct ieee80211_event - event to be sent to the driver
- * @type The event itself. See &enum ieee80211_event_type.
+ * @type: The event itself. See &enum ieee80211_event_type.
  * @rssi: relevant if &type is %RSSI_EVENT
  * @mlme: relevant if &type is %AUTH_EVENT
+ * @u:    union holding the above two fields
  */
 struct ieee80211_event {
 	enum ieee80211_event_type type;
-- 
cgit v1.2.3


From 9f950415e4e28e7cfae2e416b43e862e8101d996 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Fri, 29 May 2015 13:47:07 -0400
Subject: tcp: fix child sockets to use system default congestion control if
 not set

Linux 3.17 and earlier are explicitly engineered so that if the app
doesn't specifically request a CC module on a listener before the SYN
arrives, then the child gets the system default CC when the connection
is established. See tcp_init_congestion_control() in 3.17 or earlier,
which says "if no choice made yet assign the current value set as
default". The change ("net: tcp: assign tcp cong_ops when tcp sk is
created") altered these semantics, so that children got their parent
listener's congestion control even if the system default had changed
after the listener was created.

This commit returns to those original semantics from 3.17 and earlier,
since they are the original semantics from 2007 in 4d4d3d1e8 ("[TCP]:
Congestion control initialization."), and some Linux congestion
control workflows depend on that.

In summary, if a listener socket specifically sets TCP_CONGESTION to
"x", or the route locks the CC module to "x", then the child gets
"x". Otherwise the child gets current system default from
net.ipv4.tcp_congestion_control. That's the behavior in 3.17 and
earlier, and this commit restores that.

Fixes: 55d8694fa82c ("net: tcp: assign tcp cong_ops when tcp sk is created")
Cc: Florian Westphal <fw@strlen.de>
Cc: Daniel Borkmann <dborkman@redhat.com>
Cc: Glenn Judd <glenn.judd@morganstanley.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h | 3 ++-
 net/ipv4/tcp_cong.c                | 5 ++++-
 net/ipv4/tcp_minisocks.c           | 5 ++++-
 3 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 497bc14cdb85..0320bbb7d7b5 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -98,7 +98,8 @@ struct inet_connection_sock {
 	const struct tcp_congestion_ops *icsk_ca_ops;
 	const struct inet_connection_sock_af_ops *icsk_af_ops;
 	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
-	__u8			  icsk_ca_state:7,
+	__u8			  icsk_ca_state:6,
+				  icsk_ca_setsockopt:1,
 				  icsk_ca_dst_locked:1;
 	__u8			  icsk_retransmits;
 	__u8			  icsk_pending;
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 7a5ae50c80c8..84be008c945c 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -187,6 +187,7 @@ static void tcp_reinit_congestion_control(struct sock *sk,
 
 	tcp_cleanup_congestion_control(sk);
 	icsk->icsk_ca_ops = ca;
+	icsk->icsk_ca_setsockopt = 1;
 
 	if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
 		icsk->icsk_ca_ops->init(sk);
@@ -335,8 +336,10 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
 	rcu_read_lock();
 	ca = __tcp_ca_find_autoload(name);
 	/* No change asking for existing value */
-	if (ca == icsk->icsk_ca_ops)
+	if (ca == icsk->icsk_ca_ops) {
+		icsk->icsk_ca_setsockopt = 1;
 		goto out;
+	}
 	if (!ca)
 		err = -ENOENT;
 	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b5732a54f2ad..17e7339ee5ca 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -420,7 +420,10 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
 		rcu_read_unlock();
 	}
 
-	if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner))
+	/* If no valid choice made yet, assign current system default ca. */
+	if (!ca_got_dst &&
+	    (!icsk->icsk_ca_setsockopt ||
+	     !try_module_get(icsk->icsk_ca_ops->owner)))
 		tcp_assign_congestion_control(sk);
 
 	tcp_set_ca_state(sk, TCP_CA_Open);
-- 
cgit v1.2.3


From 42aecaa9bb2bd57eb8d61b4565cee5d3640863fb Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Thu, 4 Jun 2015 09:16:39 -0700
Subject: net: Get skb hash over flow_keys structure

This patch changes flow hashing to use jhash2 over the flow_keys
structure instead just doing jhash_3words over src, dst, and ports.
This method will allow us take more input into the hashing function
so that we can include full IPv6 addresses, VLAN, flow labels etc.
without needing to resort to xor'ing which makes for a poor hash.

Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       |  2 +-
 include/net/flow_dissector.h | 21 ++++++++++++++---
 include/net/ip.h             |  2 ++
 include/net/ipv6.h           |  2 ++
 net/core/flow_dissector.c    | 54 +++++++++++++++++++++++++++++++++-----------
 net/sched/cls_flower.c       |  2 ++
 6 files changed, 66 insertions(+), 17 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6b41c15efa27..cc612fc0a894 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1943,7 +1943,7 @@ static inline void skb_probe_transport_header(struct sk_buff *skb,
 	if (skb_transport_header_was_set(skb))
 		return;
 	else if (skb_flow_dissect_flow_keys(skb, &keys))
-		skb_set_transport_header(skb, keys.basic.thoff);
+		skb_set_transport_header(skb, keys.control.thoff);
 	else
 		skb_set_transport_header(skb, offset_hint);
 }
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index bac9c1421f58..cba6a10b214a 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -6,6 +6,15 @@
 #include <linux/in6.h>
 #include <uapi/linux/if_ether.h>
 
+/**
+ * struct flow_dissector_key_control:
+ * @thoff: Transport header offset
+ */
+struct flow_dissector_key_control {
+	u16	thoff;
+	u16	padding;
+};
+
 /**
  * struct flow_dissector_key_basic:
  * @thoff: Transport header offset
@@ -13,9 +22,9 @@
  * @ip_proto: Transport header protocol (eg. TCP/UDP)
  */
 struct flow_dissector_key_basic {
-	u16	thoff;
 	__be16	n_proto;
 	u8	ip_proto;
+	u8	padding;
 };
 
 /**
@@ -70,6 +79,7 @@ struct flow_dissector_key_eth_addrs {
 };
 
 enum flow_dissector_key_id {
+	FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */
 	FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
 	FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_addrs */
 	FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */
@@ -109,11 +119,16 @@ static inline bool skb_flow_dissect(const struct sk_buff *skb,
 }
 
 struct flow_keys {
-	struct flow_dissector_key_addrs addrs;
-	struct flow_dissector_key_ports ports;
+	struct flow_dissector_key_control control;
+#define FLOW_KEYS_HASH_START_FIELD basic
 	struct flow_dissector_key_basic basic;
+	struct flow_dissector_key_ports ports;
+	struct flow_dissector_key_addrs addrs;
 };
 
+#define FLOW_KEYS_HASH_OFFSET		\
+	offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD)
+
 extern struct flow_dissector flow_keys_dissector;
 extern struct flow_dissector flow_keys_buf_dissector;
 
diff --git a/include/net/ip.h b/include/net/ip.h
index 9b976cf99122..16cfc87fed6c 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -360,6 +360,8 @@ static inline void inet_set_txhash(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 	struct flow_keys keys;
 
+	memset(&keys, 0, sizeof(keys));
+
 	keys.addrs.src = inet->inet_saddr;
 	keys.addrs.dst = inet->inet_daddr;
 	keys.ports.src = inet->inet_sport;
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 35d485c78080..474ca466a091 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -699,6 +699,8 @@ static inline void ip6_set_txhash(struct sock *sk)
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct flow_keys keys;
 
+	memset(&keys, 0, sizeof(keys));
+
 	keys.addrs.src = (__force __be32)ipv6_addr_hash(&np->saddr);
 	keys.addrs.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr);
 	keys.ports.src = inet->inet_sport;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 0763795bea8f..55b5f2962afa 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -57,9 +57,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 		flow_dissector->offset[key->key_id] = key->offset;
 	}
 
-	/* Ensure that the dissector always includes basic key. That way
-	 * we are able to avoid handling lack of it in fast path.
+	/* Ensure that the dissector always includes control and basic key.
+	 * That way we are able to avoid handling lack of these in fast path.
 	 */
+	BUG_ON(!skb_flow_dissector_uses_key(flow_dissector,
+					    FLOW_DISSECTOR_KEY_CONTROL));
 	BUG_ON(!skb_flow_dissector_uses_key(flow_dissector,
 					    FLOW_DISSECTOR_KEY_BASIC));
 }
@@ -120,6 +122,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 			void *target_container,
 			void *data, __be16 proto, int nhoff, int hlen)
 {
+	struct flow_dissector_key_control *key_control;
 	struct flow_dissector_key_basic *key_basic;
 	struct flow_dissector_key_addrs *key_addrs;
 	struct flow_dissector_key_ports *key_ports;
@@ -132,6 +135,13 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 		hlen = skb_headlen(skb);
 	}
 
+	/* It is ensured by skb_flow_dissector_init() that control key will
+	 * be always present.
+	 */
+	key_control = skb_flow_dissector_target(flow_dissector,
+						FLOW_DISSECTOR_KEY_CONTROL,
+						target_container);
+
 	/* It is ensured by skb_flow_dissector_init() that basic key will
 	 * be always present.
 	 */
@@ -219,7 +229,7 @@ flow_label:
 
 			key_basic->n_proto = proto;
 			key_basic->ip_proto = ip_proto;
-			key_basic->thoff = (u16)nhoff;
+			key_control->thoff = (u16)nhoff;
 
 			if (skb_flow_dissector_uses_key(flow_dissector,
 							FLOW_DISSECTOR_KEY_PORTS)) {
@@ -275,7 +285,7 @@ flow_label:
 		if (!hdr)
 			return false;
 		key_basic->n_proto = proto;
-		key_basic->thoff = (u16)nhoff;
+		key_control->thoff = (u16)nhoff;
 
 		if (skb_flow_dissector_uses_key(flow_dissector,
 						FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) {
@@ -288,7 +298,7 @@ flow_label:
 		return true;
 	}
 	case htons(ETH_P_FCOE):
-		key_basic->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
+		key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
 		/* fall through */
 	default:
 		return false;
@@ -345,7 +355,7 @@ flow_label:
 
 	key_basic->n_proto = proto;
 	key_basic->ip_proto = ip_proto;
-	key_basic->thoff = (u16) nhoff;
+	key_control->thoff = (u16)nhoff;
 
 	if (skb_flow_dissector_uses_key(flow_dissector,
 					FLOW_DISSECTOR_KEY_PORTS)) {
@@ -366,9 +376,21 @@ static __always_inline void __flow_hash_secret_init(void)
 	net_get_random_once(&hashrnd, sizeof(hashrnd));
 }
 
-static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c, u32 keyval)
+static __always_inline u32 __flow_hash_words(u32 *words, u32 length, u32 keyval)
+{
+	return jhash2(words, length, keyval);
+}
+
+static inline void *flow_keys_hash_start(struct flow_keys *flow)
 {
-	return jhash_3words(a, b, c, keyval);
+	BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32));
+	return (void *)flow + FLOW_KEYS_HASH_OFFSET;
+}
+
+static inline size_t flow_keys_hash_length(struct flow_keys *flow)
+{
+	BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
+	return (sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) / sizeof(u32);
 }
 
 static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
@@ -383,10 +405,8 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
 		swap(keys->ports.src, keys->ports.dst);
 	}
 
-	hash = __flow_hash_3words((__force u32)keys->addrs.dst,
-				  (__force u32)keys->addrs.src,
-				  (__force u32)keys->ports.ports,
-				  keyval);
+	hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys),
+				 flow_keys_hash_length(keys), keyval);
 	if (!hash)
 		hash = 1;
 
@@ -473,7 +493,7 @@ EXPORT_SYMBOL(skb_get_hash_perturb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 		   const struct flow_keys *keys, int hlen)
 {
-	u32 poff = keys->basic.thoff;
+	u32 poff = keys->control.thoff;
 
 	switch (keys->basic.ip_proto) {
 	case IPPROTO_TCP: {
@@ -536,6 +556,10 @@ u32 skb_get_poff(const struct sk_buff *skb)
 }
 
 static const struct flow_dissector_key flow_keys_dissector_keys[] = {
+	{
+		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
+		.offset = offsetof(struct flow_keys, control),
+	},
 	{
 		.key_id = FLOW_DISSECTOR_KEY_BASIC,
 		.offset = offsetof(struct flow_keys, basic),
@@ -555,6 +579,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = {
 };
 
 static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
+	{
+		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
+		.offset = offsetof(struct flow_keys, control),
+	},
 	{
 		.key_id = FLOW_DISSECTOR_KEY_BASIC,
 		.offset = offsetof(struct flow_keys, basic),
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 8c8f34ef6980..5a7d66c59684 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -25,6 +25,7 @@
 
 struct fl_flow_key {
 	int	indev_ifindex;
+	struct flow_dissector_key_control control;
 	struct flow_dissector_key_basic basic;
 	struct flow_dissector_key_eth_addrs eth;
 	union {
@@ -347,6 +348,7 @@ static void fl_init_dissector(struct cls_fl_head *head,
 	struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX];
 	size_t cnt = 0;
 
+	FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control);
 	FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic);
 	FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt,
 			       FLOW_DISSECTOR_KEY_ETH_ADDRS, eth);
-- 
cgit v1.2.3


From c3f8324188fa80178f20c8209b492ca6191177e8 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Thu, 4 Jun 2015 09:16:40 -0700
Subject: net: Add full IPv6 addresses to flow_keys

This patch adds full IPv6 addresses into flow_keys and uses them as
input to the flow hash function. The implementation supports either
IPv4 or IPv6 addresses in a union, and selector is used to determine
how may words to input to jhash2.

We also add flow_get_u32_dst and flow_get_u32_src functions which are
used to get a u32 representation of the source and destination
addresses. For IPv6, ipv6_addr_hash is called. These functions retain
getting the legacy values of src and dst in flow_keys.

With this patch, Ethertype and IP protocol are now included in the
flow hash input.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c                |   9 +-
 drivers/net/ethernet/cisco/enic/enic_clsf.c    |   8 +-
 drivers/net/ethernet/cisco/enic/enic_ethtool.c |   4 +-
 include/net/flow_dissector.h                   |  52 +++++++----
 include/net/ip.h                               |  19 +++-
 include/net/ipv6.h                             |  21 ++++-
 net/core/flow_dissector.c                      | 116 +++++++++++++++++++++----
 net/ethernet/eth.c                             |   2 +-
 net/sched/cls_flow.c                           |  14 ++-
 net/sched/cls_flower.c                         |  11 +--
 10 files changed, 193 insertions(+), 63 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 2268438f3f63..19eb990d398c 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3059,8 +3059,7 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 		if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph))))
 			return false;
 		iph = ip_hdr(skb);
-		fk->addrs.src = iph->saddr;
-		fk->addrs.dst = iph->daddr;
+		iph_to_flow_copy_v4addrs(fk, iph);
 		noff += iph->ihl << 2;
 		if (!ip_is_fragment(iph))
 			proto = iph->protocol;
@@ -3068,8 +3067,7 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 		if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph6))))
 			return false;
 		iph6 = ipv6_hdr(skb);
-		fk->addrs.src = (__force __be32)ipv6_addr_hash(&iph6->saddr);
-		fk->addrs.dst = (__force __be32)ipv6_addr_hash(&iph6->daddr);
+		iph_to_flow_copy_v6addrs(fk, iph6);
 		noff += sizeof(*iph6);
 		proto = iph6->nexthdr;
 	} else {
@@ -3103,7 +3101,8 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
 		hash = bond_eth_hash(skb);
 	else
 		hash = (__force u32)flow.ports.ports;
-	hash ^= (__force u32)flow.addrs.dst ^ (__force u32)flow.addrs.src;
+	hash ^= (__force u32)flow_get_u32_dst(&flow) ^
+		(__force u32)flow_get_u32_src(&flow);
 	hash ^= (hash >> 16);
 	hash ^= (hash >> 8);
 
diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c
index a31b57adcb37..d106186f4f4a 100644
--- a/drivers/net/ethernet/cisco/enic/enic_clsf.c
+++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c
@@ -33,8 +33,8 @@ int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq)
 		return -EPROTONOSUPPORT;
 	};
 	data.type = FILTER_IPV4_5TUPLE;
-	data.u.ipv4.src_addr = ntohl(keys->addrs.src);
-	data.u.ipv4.dst_addr = ntohl(keys->addrs.dst);
+	data.u.ipv4.src_addr = ntohl(keys->addrs.v4addrs.src);
+	data.u.ipv4.dst_addr = ntohl(keys->addrs.v4addrs.dst);
 	data.u.ipv4.src_port = ntohs(keys->ports.src);
 	data.u.ipv4.dst_port = ntohs(keys->ports.dst);
 	data.u.ipv4.flags = FILTER_FIELDS_IPV4_5TUPLE;
@@ -158,8 +158,8 @@ static struct enic_rfs_fltr_node *htbl_key_search(struct hlist_head *h,
 	struct enic_rfs_fltr_node *tpos;
 
 	hlist_for_each_entry(tpos, h, node)
-		if (tpos->keys.addrs.src == k->addrs.src &&
-		    tpos->keys.addrs.dst == k->addrs.dst &&
+		if (tpos->keys.addrs.v4addrs.src == k->addrs.v4addrs.src &&
+		    tpos->keys.addrs.v4addrs.dst == k->addrs.v4addrs.dst &&
 		    tpos->keys.ports.ports == k->ports.ports &&
 		    tpos->keys.basic.ip_proto == k->basic.ip_proto &&
 		    tpos->keys.basic.n_proto == k->basic.n_proto)
diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
index 117c0968dd0b..73874b2575bf 100644
--- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c
+++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
@@ -346,10 +346,10 @@ static int enic_grxclsrule(struct enic *enic, struct ethtool_rxnfc *cmd)
 		break;
 	}
 
-	fsp->h_u.tcp_ip4_spec.ip4src = n->keys.addrs.src;
+	fsp->h_u.tcp_ip4_spec.ip4src = flow_get_u32_src(&n->keys);
 	fsp->m_u.tcp_ip4_spec.ip4src = (__u32)~0;
 
-	fsp->h_u.tcp_ip4_spec.ip4dst = n->keys.addrs.dst;
+	fsp->h_u.tcp_ip4_spec.ip4dst = flow_get_u32_dst(&n->keys);
 	fsp->m_u.tcp_ip4_spec.ip4dst = (__u32)~0;
 
 	fsp->h_u.tcp_ip4_spec.psrc = n->keys.ports.src;
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index cba6a10b214a..306d4613abbb 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -12,7 +12,7 @@
  */
 struct flow_dissector_key_control {
 	u16	thoff;
-	u16	padding;
+	u16	addr_type;
 };
 
 /**
@@ -28,18 +28,39 @@ struct flow_dissector_key_basic {
 };
 
 /**
- * struct flow_dissector_key_addrs:
- * @src: source ip address in case of IPv4
- *	 For IPv6 it contains 32bit hash of src address
- * @dst: destination ip address in case of IPv4
- *	 For IPv6 it contains 32bit hash of dst address
+ * struct flow_dissector_key_ipv4_addrs:
+ * @src: source ip address
+ * @dst: destination ip address
  */
-struct flow_dissector_key_addrs {
+struct flow_dissector_key_ipv4_addrs {
 	/* (src,dst) must be grouped, in the same way than in IP header */
 	__be32 src;
 	__be32 dst;
 };
 
+/**
+ * struct flow_dissector_key_ipv6_addrs:
+ * @src: source ip address
+ * @dst: destination ip address
+ */
+struct flow_dissector_key_ipv6_addrs {
+	/* (src,dst) must be grouped, in the same way than in IP header */
+	struct in6_addr src;
+	struct in6_addr dst;
+};
+
+/**
+ * struct flow_dissector_key_addrs:
+ * @v4addrs: IPv4 addresses
+ * @v6addrs: IPv6 addresses
+ */
+struct flow_dissector_key_addrs {
+	union {
+		struct flow_dissector_key_ipv4_addrs v4addrs;
+		struct flow_dissector_key_ipv6_addrs v6addrs;
+	};
+};
+
 /**
  * flow_dissector_key_tp_ports:
  *	@ports: port numbers of Transport header
@@ -56,16 +77,6 @@ struct flow_dissector_key_ports {
 	};
 };
 
-/**
- * struct flow_dissector_key_ipv6_addrs:
- * @src: source ip address
- * @dst: destination ip address
- */
-struct flow_dissector_key_ipv6_addrs {
-	/* (src,dst) must be grouped, in the same way than in IP header */
-	struct in6_addr src;
-	struct in6_addr dst;
-};
 
 /**
  * struct flow_dissector_key_eth_addrs:
@@ -81,10 +92,10 @@ struct flow_dissector_key_eth_addrs {
 enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */
 	FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
-	FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_addrs */
+	FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
+	FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
 	FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */
 	FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */
-	FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
 	FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */
 
 	FLOW_DISSECTOR_KEY_MAX,
@@ -129,6 +140,9 @@ struct flow_keys {
 #define FLOW_KEYS_HASH_OFFSET		\
 	offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD)
 
+__be32 flow_get_u32_src(const struct flow_keys *flow);
+__be32 flow_get_u32_dst(const struct flow_keys *flow);
+
 extern struct flow_dissector flow_keys_dissector;
 extern struct flow_dissector flow_keys_buf_dissector;
 
diff --git a/include/net/ip.h b/include/net/ip.h
index 16cfc87fed6c..0750a186ea63 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -355,6 +355,20 @@ static inline __wsum inet_compute_pseudo(struct sk_buff *skb, int proto)
 				  skb->len, proto, 0);
 }
 
+/* copy IPv4 saddr & daddr to flow_keys, possibly using 64bit load/store
+ * Equivalent to :	flow->v4addrs.src = iph->saddr;
+ *			flow->v4addrs.dst = iph->daddr;
+ */
+static inline void iph_to_flow_copy_v4addrs(struct flow_keys *flow,
+					    const struct iphdr *iph)
+{
+	BUILD_BUG_ON(offsetof(typeof(flow->addrs), v4addrs.dst) !=
+		     offsetof(typeof(flow->addrs), v4addrs.src) +
+			      sizeof(flow->addrs.v4addrs.src));
+	memcpy(&flow->addrs.v4addrs, &iph->saddr, sizeof(flow->addrs.v4addrs));
+	flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+}
+
 static inline void inet_set_txhash(struct sock *sk)
 {
 	struct inet_sock *inet = inet_sk(sk);
@@ -362,8 +376,9 @@ static inline void inet_set_txhash(struct sock *sk)
 
 	memset(&keys, 0, sizeof(keys));
 
-	keys.addrs.src = inet->inet_saddr;
-	keys.addrs.dst = inet->inet_daddr;
+	keys.addrs.v4addrs.src = inet->inet_saddr;
+	keys.addrs.v4addrs.dst = inet->inet_daddr;
+	keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
 	keys.ports.src = inet->inet_sport;
 	keys.ports.dst = inet->inet_dport;
 
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 474ca466a091..82dbdb092a5d 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -692,6 +692,20 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6,
 	return hlimit;
 }
 
+/* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store
+ * Equivalent to :	flow->v6addrs.src = iph->saddr;
+ *			flow->v6addrs.dst = iph->daddr;
+ */
+static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow,
+					    const struct ipv6hdr *iph)
+{
+	BUILD_BUG_ON(offsetof(typeof(flow->addrs), v6addrs.dst) !=
+		     offsetof(typeof(flow->addrs), v6addrs.src) +
+		     sizeof(flow->addrs.v6addrs.src));
+	memcpy(&flow->addrs.v6addrs, &iph->saddr, sizeof(flow->addrs.v6addrs));
+	flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+}
+
 #if IS_ENABLED(CONFIG_IPV6)
 static inline void ip6_set_txhash(struct sock *sk)
 {
@@ -701,8 +715,11 @@ static inline void ip6_set_txhash(struct sock *sk)
 
 	memset(&keys, 0, sizeof(keys));
 
-	keys.addrs.src = (__force __be32)ipv6_addr_hash(&np->saddr);
-	keys.addrs.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr);
+	memcpy(&keys.addrs.v6addrs.src, &np->saddr,
+	       sizeof(keys.addrs.v6addrs.src));
+	memcpy(&keys.addrs.v6addrs.dst, &sk->sk_v6_daddr,
+	       sizeof(keys.addrs.v6addrs.dst));
+	keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
 	keys.ports.src = inet->inet_sport;
 	keys.ports.dst = inet->inet_dport;
 
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 55b5f2962afa..ca9d22488cfb 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -178,10 +178,12 @@ ip:
 		if (!skb_flow_dissector_uses_key(flow_dissector,
 						 FLOW_DISSECTOR_KEY_IPV4_ADDRS))
 			break;
+
 		key_addrs = skb_flow_dissector_target(flow_dissector,
-						      FLOW_DISSECTOR_KEY_IPV4_ADDRS,
-						      target_container);
-		memcpy(key_addrs, &iph->saddr, sizeof(*key_addrs));
+			      FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container);
+		memcpy(&key_addrs->v4addrs, &iph->saddr,
+		       sizeof(key_addrs->v4addrs));
+		key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
 		break;
 	}
 	case htons(ETH_P_IPV6): {
@@ -203,8 +205,11 @@ ipv6:
 							      FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS,
 							      target_container);
 
-			key_addrs->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
-			key_addrs->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
+			key_addrs->v4addrs.src =
+				(__force __be32)ipv6_addr_hash(&iph->saddr);
+			key_addrs->v4addrs.dst =
+				(__force __be32)ipv6_addr_hash(&iph->daddr);
+			key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
 			goto flow_label;
 		}
 		if (skb_flow_dissector_uses_key(flow_dissector,
@@ -216,6 +221,7 @@ ipv6:
 								   target_container);
 
 			memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs));
+			key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
 			goto flow_label;
 		}
 		break;
@@ -292,8 +298,9 @@ flow_label:
 			key_addrs = skb_flow_dissector_target(flow_dissector,
 							      FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS,
 							      target_container);
-			key_addrs->src = hdr->srcnode;
-			key_addrs->dst = 0;
+			key_addrs->v4addrs.src = hdr->srcnode;
+			key_addrs->v4addrs.dst = 0;
+			key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
 		}
 		return true;
 	}
@@ -389,21 +396,88 @@ static inline void *flow_keys_hash_start(struct flow_keys *flow)
 
 static inline size_t flow_keys_hash_length(struct flow_keys *flow)
 {
+	size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs);
 	BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
-	return (sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) / sizeof(u32);
+	BUILD_BUG_ON(offsetof(typeof(*flow), addrs) !=
+		     sizeof(*flow) - sizeof(flow->addrs));
+
+	switch (flow->control.addr_type) {
+	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+		diff -= sizeof(flow->addrs.v4addrs);
+		break;
+	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+		diff -= sizeof(flow->addrs.v6addrs);
+		break;
+	}
+	return (sizeof(*flow) - diff) / sizeof(u32);
+}
+
+__be32 flow_get_u32_src(const struct flow_keys *flow)
+{
+	switch (flow->control.addr_type) {
+	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+		return flow->addrs.v4addrs.src;
+	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+		return (__force __be32)ipv6_addr_hash(
+			&flow->addrs.v6addrs.src);
+	default:
+		return 0;
+	}
+}
+EXPORT_SYMBOL(flow_get_u32_src);
+
+__be32 flow_get_u32_dst(const struct flow_keys *flow)
+{
+	switch (flow->control.addr_type) {
+	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+		return flow->addrs.v4addrs.dst;
+	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+		return (__force __be32)ipv6_addr_hash(
+			&flow->addrs.v6addrs.dst);
+	default:
+		return 0;
+	}
+}
+EXPORT_SYMBOL(flow_get_u32_dst);
+
+static inline void __flow_hash_consistentify(struct flow_keys *keys)
+{
+	int addr_diff, i;
+
+	switch (keys->control.addr_type) {
+	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+		addr_diff = (__force u32)keys->addrs.v4addrs.dst -
+			    (__force u32)keys->addrs.v4addrs.src;
+		if ((addr_diff < 0) ||
+		    (addr_diff == 0 &&
+		     ((__force u16)keys->ports.dst <
+		      (__force u16)keys->ports.src))) {
+			swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst);
+			swap(keys->ports.src, keys->ports.dst);
+		}
+		break;
+	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+		addr_diff = memcmp(&keys->addrs.v6addrs.dst,
+				   &keys->addrs.v6addrs.src,
+				   sizeof(keys->addrs.v6addrs.dst));
+		if ((addr_diff < 0) ||
+		    (addr_diff == 0 &&
+		     ((__force u16)keys->ports.dst <
+		      (__force u16)keys->ports.src))) {
+			for (i = 0; i < 4; i++)
+				swap(keys->addrs.v6addrs.src.s6_addr32[i],
+				     keys->addrs.v6addrs.dst.s6_addr32[i]);
+			swap(keys->ports.src, keys->ports.dst);
+		}
+		break;
+	}
 }
 
 static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
 {
 	u32 hash;
 
-	/* get a consistent hash (same value on both flow directions) */
-	if (((__force u32)keys->addrs.dst < (__force u32)keys->addrs.src) ||
-	    (((__force u32)keys->addrs.dst == (__force u32)keys->addrs.src) &&
-	     ((__force u16)keys->ports.dst < (__force u16)keys->ports.src))) {
-		swap(keys->addrs.dst, keys->addrs.src);
-		swap(keys->ports.src, keys->ports.dst);
-	}
+	__flow_hash_consistentify(keys);
 
 	hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys),
 				 flow_keys_hash_length(keys), keyval);
@@ -451,8 +525,8 @@ void make_flow_keys_digest(struct flow_keys_digest *digest,
 	data->n_proto = flow->basic.n_proto;
 	data->ip_proto = flow->basic.ip_proto;
 	data->ports = flow->ports.ports;
-	data->src = flow->addrs.src;
-	data->dst = flow->addrs.dst;
+	data->src = flow->addrs.v4addrs.src;
+	data->dst = flow->addrs.v4addrs.dst;
 }
 EXPORT_SYMBOL(make_flow_keys_digest);
 
@@ -566,11 +640,15 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = {
 	},
 	{
 		.key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
-		.offset = offsetof(struct flow_keys, addrs),
+		.offset = offsetof(struct flow_keys, addrs.v4addrs),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+		.offset = offsetof(struct flow_keys, addrs.v6addrs),
 	},
 	{
 		.key_id = FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS,
-		.offset = offsetof(struct flow_keys, addrs),
+		.offset = offsetof(struct flow_keys, addrs.v4addrs),
 	},
 	{
 		.key_id = FLOW_DISSECTOR_KEY_PORTS,
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 7d0e239a6755..77e0f0e7a88e 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -133,7 +133,7 @@ u32 eth_get_headlen(void *data, unsigned int len)
 	/* parse any remaining L2/L3 headers, check for L4 */
 	if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto,
 					    sizeof(*eth), len))
-		return max_t(u32, keys.basic.thoff, sizeof(*eth));
+		return max_t(u32, keys.control.thoff, sizeof(*eth));
 
 	/* parse for any L4 headers */
 	return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len);
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index b4359924846c..76bc3a20ffdb 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -68,15 +68,21 @@ static inline u32 addr_fold(void *addr)
 
 static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow)
 {
-	if (flow->addrs.src)
-		return ntohl(flow->addrs.src);
+	__be32 src = flow_get_u32_src(flow);
+
+	if (src)
+		return ntohl(src);
+
 	return addr_fold(skb->sk);
 }
 
 static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
 {
-	if (flow->addrs.dst)
-		return ntohl(flow->addrs.dst);
+	__be32 dst = flow_get_u32_dst(flow);
+
+	if (dst)
+		return ntohl(dst);
+
 	return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
 }
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 5a7d66c59684..b92d3f49c23e 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -28,8 +28,9 @@ struct fl_flow_key {
 	struct flow_dissector_key_control control;
 	struct flow_dissector_key_basic basic;
 	struct flow_dissector_key_eth_addrs eth;
+	struct flow_dissector_key_addrs ipaddrs;
 	union {
-		struct flow_dissector_key_addrs ipv4;
+		struct flow_dissector_key_ipv4_addrs ipv4;
 		struct flow_dissector_key_ipv6_addrs ipv6;
 	};
 	struct flow_dissector_key_ports tp;
@@ -260,14 +261,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 			       &mask->basic.ip_proto, TCA_FLOWER_UNSPEC,
 			       sizeof(key->basic.ip_proto));
 	}
-	if (key->basic.n_proto == htons(ETH_P_IP)) {
+	if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
 		fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC,
 			       &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK,
 			       sizeof(key->ipv4.src));
 		fl_set_key_val(tb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST,
 			       &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK,
 			       sizeof(key->ipv4.dst));
-	} else if (key->basic.n_proto == htons(ETH_P_IPV6)) {
+	} else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
 		fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC,
 			       &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK,
 			       sizeof(key->ipv6.src));
@@ -610,7 +611,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 			    sizeof(key->basic.ip_proto)))
 		goto nla_put_failure;
 
-	if (key->basic.n_proto == htons(ETH_P_IP) &&
+	if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
 	    (fl_dump_key_val(skb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC,
 			     &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK,
 			     sizeof(key->ipv4.src)) ||
@@ -618,7 +619,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 			     &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK,
 			     sizeof(key->ipv4.dst))))
 		goto nla_put_failure;
-	else if (key->basic.n_proto == htons(ETH_P_IPV6) &&
+	else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS &&
 		 (fl_dump_key_val(skb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC,
 				  &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK,
 				  sizeof(key->ipv6.src)) ||
-- 
cgit v1.2.3


From 9f24908901c5f4b1e3b07548106b1790af933476 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Thu, 4 Jun 2015 09:16:41 -0700
Subject: net: Add keys for TIPC address

Add a new flow key for TIPC addresses.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 10 ++++++++++
 net/core/flow_dissector.c    | 18 +++++++++++++-----
 2 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 306d4613abbb..3ee606a54775 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -49,6 +49,14 @@ struct flow_dissector_key_ipv6_addrs {
 	struct in6_addr dst;
 };
 
+/**
+ * struct flow_dissector_key_tipc_addrs:
+ * @srcnode: source node address
+ */
+struct flow_dissector_key_tipc_addrs {
+	__be32 srcnode;
+};
+
 /**
  * struct flow_dissector_key_addrs:
  * @v4addrs: IPv4 addresses
@@ -58,6 +66,7 @@ struct flow_dissector_key_addrs {
 	union {
 		struct flow_dissector_key_ipv4_addrs v4addrs;
 		struct flow_dissector_key_ipv6_addrs v6addrs;
+		struct flow_dissector_key_tipc_addrs tipcaddrs;
 	};
 };
 
@@ -97,6 +106,7 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */
 	FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */
 	FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */
+	FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index ca9d22488cfb..91861c3d45a7 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -294,13 +294,12 @@ flow_label:
 		key_control->thoff = (u16)nhoff;
 
 		if (skb_flow_dissector_uses_key(flow_dissector,
-						FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) {
+						FLOW_DISSECTOR_KEY_TIPC_ADDRS)) {
 			key_addrs = skb_flow_dissector_target(flow_dissector,
-							      FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS,
+							      FLOW_DISSECTOR_KEY_TIPC_ADDRS,
 							      target_container);
-			key_addrs->v4addrs.src = hdr->srcnode;
-			key_addrs->v4addrs.dst = 0;
-			key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+			key_addrs->tipcaddrs.srcnode = hdr->srcnode;
+			key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC_ADDRS;
 		}
 		return true;
 	}
@@ -408,6 +407,9 @@ static inline size_t flow_keys_hash_length(struct flow_keys *flow)
 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
 		diff -= sizeof(flow->addrs.v6addrs);
 		break;
+	case FLOW_DISSECTOR_KEY_TIPC_ADDRS:
+		diff -= sizeof(flow->addrs.tipcaddrs);
+		break;
 	}
 	return (sizeof(*flow) - diff) / sizeof(u32);
 }
@@ -420,6 +422,8 @@ __be32 flow_get_u32_src(const struct flow_keys *flow)
 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
 		return (__force __be32)ipv6_addr_hash(
 			&flow->addrs.v6addrs.src);
+	case FLOW_DISSECTOR_KEY_TIPC_ADDRS:
+		return flow->addrs.tipcaddrs.srcnode;
 	default:
 		return 0;
 	}
@@ -650,6 +654,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = {
 		.key_id = FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS,
 		.offset = offsetof(struct flow_keys, addrs.v4addrs),
 	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_TIPC_ADDRS,
+		.offset = offsetof(struct flow_keys, addrs.tipcaddrs),
+	},
 	{
 		.key_id = FLOW_DISSECTOR_KEY_PORTS,
 		.offset = offsetof(struct flow_keys, ports),
-- 
cgit v1.2.3


From 45b47fd00ca14df869979dbbe14324625ec93552 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Thu, 4 Jun 2015 09:16:42 -0700
Subject: net: Get rid of IPv6 hash addresses flow keys

We don't need to return the IPv6 address hash as part of flow keys.
In general, using the IPv6 address hash is risky in a hash value
since the underlying use of xor provides no entropy. If someone
really needs the hash value they can get it from the full IPv6
addresses in flow keys (e.g. from flow_get_u32_src).

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h |  1 -
 net/core/flow_dissector.c    | 17 -----------------
 2 files changed, 18 deletions(-)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 3ee606a54775..59f00f90fc6b 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -103,7 +103,6 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
 	FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
 	FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
-	FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */
 	FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */
 	FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */
 	FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 91861c3d45a7..5348a468de78 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -199,19 +199,6 @@ ipv6:
 		ip_proto = iph->nexthdr;
 		nhoff += sizeof(struct ipv6hdr);
 
-		if (skb_flow_dissector_uses_key(flow_dissector,
-						FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) {
-			key_addrs = skb_flow_dissector_target(flow_dissector,
-							      FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS,
-							      target_container);
-
-			key_addrs->v4addrs.src =
-				(__force __be32)ipv6_addr_hash(&iph->saddr);
-			key_addrs->v4addrs.dst =
-				(__force __be32)ipv6_addr_hash(&iph->daddr);
-			key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
-			goto flow_label;
-		}
 		if (skb_flow_dissector_uses_key(flow_dissector,
 						FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
 			struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs;
@@ -650,10 +637,6 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = {
 		.key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
 		.offset = offsetof(struct flow_keys, addrs.v6addrs),
 	},
-	{
-		.key_id = FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS,
-		.offset = offsetof(struct flow_keys, addrs.v4addrs),
-	},
 	{
 		.key_id = FLOW_DISSECTOR_KEY_TIPC_ADDRS,
 		.offset = offsetof(struct flow_keys, addrs.tipcaddrs),
-- 
cgit v1.2.3


From d34af823ff401c312541aa613c49ea4b872bde9e Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Thu, 4 Jun 2015 09:16:43 -0700
Subject: net: Add VLAN ID to flow_keys

In flow_dissector set vlan_id in flow_keys when VLAN is found.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h |  6 ++++++
 net/core/flow_dissector.c    | 14 ++++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 59f00f90fc6b..08480fbb9035 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -27,6 +27,10 @@ struct flow_dissector_key_basic {
 	u8	padding;
 };
 
+struct flow_dissector_key_tags {
+	u32	vlan_id:12;
+};
+
 /**
  * struct flow_dissector_key_ipv4_addrs:
  * @src: source ip address
@@ -106,6 +110,7 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */
 	FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */
 	FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */
+	FLOW_DISSECTOR_KEY_VLANID, /* struct flow_dissector_key_flow_tags */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
@@ -142,6 +147,7 @@ struct flow_keys {
 	struct flow_dissector_key_control control;
 #define FLOW_KEYS_HASH_START_FIELD basic
 	struct flow_dissector_key_basic basic;
+	struct flow_dissector_key_tags tags;
 	struct flow_dissector_key_ports ports;
 	struct flow_dissector_key_addrs addrs;
 };
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 5348a468de78..5c66cb2e66df 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -126,6 +126,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 	struct flow_dissector_key_basic *key_basic;
 	struct flow_dissector_key_addrs *key_addrs;
 	struct flow_dissector_key_ports *key_ports;
+	struct flow_dissector_key_tags *key_tags;
 	u8 ip_proto;
 
 	if (!data) {
@@ -246,6 +247,15 @@ flow_label:
 		if (!vlan)
 			return false;
 
+		if (skb_flow_dissector_uses_key(flow_dissector,
+						FLOW_DISSECTOR_KEY_VLANID)) {
+			key_tags = skb_flow_dissector_target(flow_dissector,
+							     FLOW_DISSECTOR_KEY_VLANID,
+							     target_container);
+
+			key_tags->vlan_id = skb_vlan_tag_get_id(skb);
+		}
+
 		proto = vlan->h_vlan_encapsulated_proto;
 		nhoff += sizeof(*vlan);
 		goto again;
@@ -645,6 +655,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = {
 		.key_id = FLOW_DISSECTOR_KEY_PORTS,
 		.offset = offsetof(struct flow_keys, ports),
 	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_VLANID,
+		.offset = offsetof(struct flow_keys, tags),
+	},
 };
 
 static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
-- 
cgit v1.2.3


From 87ee9e52ffeb168803a76cc07734425227cc2268 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Thu, 4 Jun 2015 09:16:44 -0700
Subject: net: Add IPv6 flow label to flow_keys

In flow_dissector set the flow label in flow_keys for IPv6. This also
removes the shortcircuiting of flow dissection when a non-zero label
is present, the flow label can be considered to provide additional
entropy for a hash.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h |  4 +++-
 net/core/flow_dissector.c    | 29 ++++++++++-------------------
 2 files changed, 13 insertions(+), 20 deletions(-)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 08480fbb9035..14d8483d836e 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -28,7 +28,8 @@ struct flow_dissector_key_basic {
 };
 
 struct flow_dissector_key_tags {
-	u32	vlan_id:12;
+	u32	vlan_id:12,
+		flow_label:20;
 };
 
 /**
@@ -111,6 +112,7 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */
 	FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */
 	FLOW_DISSECTOR_KEY_VLANID, /* struct flow_dissector_key_flow_tags */
+	FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 5c66cb2e66df..70f0567b6be9 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -210,30 +210,17 @@ ipv6:
 
 			memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs));
 			key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
-			goto flow_label;
 		}
-		break;
-flow_label:
+
 		flow_label = ip6_flowlabel(iph);
 		if (flow_label) {
-			/* Awesome, IPv6 packet has a flow label so we can
-			 * use that to represent the ports without any
-			 * further dissection.
-			 */
-
-			key_basic->n_proto = proto;
-			key_basic->ip_proto = ip_proto;
-			key_control->thoff = (u16)nhoff;
-
 			if (skb_flow_dissector_uses_key(flow_dissector,
-							FLOW_DISSECTOR_KEY_PORTS)) {
-				key_ports = skb_flow_dissector_target(flow_dissector,
-								      FLOW_DISSECTOR_KEY_PORTS,
-								      target_container);
-				key_ports->ports = flow_label;
+				FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
+				key_tags = skb_flow_dissector_target(flow_dissector,
+								     FLOW_DISSECTOR_KEY_FLOW_LABEL,
+								     target_container);
+				key_tags->flow_label = ntohl(flow_label);
 			}
-
-			return true;
 		}
 
 		break;
@@ -659,6 +646,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = {
 		.key_id = FLOW_DISSECTOR_KEY_VLANID,
 		.offset = offsetof(struct flow_keys, tags),
 	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
+		.offset = offsetof(struct flow_keys, tags),
+	},
 };
 
 static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
-- 
cgit v1.2.3


From 1fdd512c92003cf2d671ba22753d13302bf8cd1d Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Thu, 4 Jun 2015 09:16:45 -0700
Subject: net: Add GRE keyid in flow_keys

In flow dissector if a GRE header contains a keyid this is saved in the
new keyid field of flow_keys. The GRE keyid is then represented
in the flow hash function input.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h |  6 ++++++
 net/core/flow_dissector.c    | 24 +++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 14d8483d836e..6c5e8d262607 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -32,6 +32,10 @@ struct flow_dissector_key_tags {
 		flow_label:20;
 };
 
+struct flow_dissector_key_keyid {
+	__be32	keyid;
+};
+
 /**
  * struct flow_dissector_key_ipv4_addrs:
  * @src: source ip address
@@ -113,6 +117,7 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */
 	FLOW_DISSECTOR_KEY_VLANID, /* struct flow_dissector_key_flow_tags */
 	FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */
+	FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
@@ -150,6 +155,7 @@ struct flow_keys {
 #define FLOW_KEYS_HASH_START_FIELD basic
 	struct flow_dissector_key_basic basic;
 	struct flow_dissector_key_tags tags;
+	struct flow_dissector_key_keyid keyid;
 	struct flow_dissector_key_ports ports;
 	struct flow_dissector_key_addrs addrs;
 };
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 70f0567b6be9..7ddc0a7b3da5 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -127,6 +127,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 	struct flow_dissector_key_addrs *key_addrs;
 	struct flow_dissector_key_ports *key_ports;
 	struct flow_dissector_key_tags *key_tags;
+	struct flow_dissector_key_keyid *key_keyid;
 	u8 ip_proto;
 
 	if (!data) {
@@ -315,8 +316,25 @@ ipv6:
 		nhoff += 4;
 		if (hdr->flags & GRE_CSUM)
 			nhoff += 4;
-		if (hdr->flags & GRE_KEY)
+		if (hdr->flags & GRE_KEY) {
+			const __be32 *keyid;
+			__be32 _keyid;
+
+			keyid = __skb_header_pointer(skb, nhoff, sizeof(_keyid),
+						     data, hlen, &_keyid);
+
+			if (!keyid)
+				return false;
+
+			if (skb_flow_dissector_uses_key(flow_dissector,
+							FLOW_DISSECTOR_KEY_GRE_KEYID)) {
+				key_keyid = skb_flow_dissector_target(flow_dissector,
+								      FLOW_DISSECTOR_KEY_GRE_KEYID,
+								      target_container);
+				key_keyid->keyid = *keyid;
+			}
 			nhoff += 4;
+		}
 		if (hdr->flags & GRE_SEQ)
 			nhoff += 4;
 		if (proto == htons(ETH_P_TEB)) {
@@ -650,6 +668,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = {
 		.key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
 		.offset = offsetof(struct flow_keys, tags),
 	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
+		.offset = offsetof(struct flow_keys, keyid),
+	},
 };
 
 static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
-- 
cgit v1.2.3


From b3baa0fbd02a1a9d493d8cb92ae4a4491b9e9d13 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Thu, 4 Jun 2015 09:16:46 -0700
Subject: mpls: Add MPLS entropy label in flow_keys

In flow dissector if an MPLS header contains an entropy label this is
saved in the new keyid field of flow_keys. The entropy label is
then represented in the flow hash function input.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h |  1 +
 net/core/flow_dissector.c    | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 6c5e8d262607..1a8c22419936 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -118,6 +118,7 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_VLANID, /* struct flow_dissector_key_flow_tags */
 	FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */
 	FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */
+	FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 7ddc0a7b3da5..77e22e4fc898 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -15,6 +15,7 @@
 #include <linux/ppp_defs.h>
 #include <linux/stddef.h>
 #include <linux/if_ether.h>
+#include <linux/mpls.h>
 #include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
 
@@ -288,6 +289,37 @@ ipv6:
 		}
 		return true;
 	}
+
+	case htons(ETH_P_MPLS_UC):
+	case htons(ETH_P_MPLS_MC): {
+		struct mpls_label *hdr, _hdr[2];
+mpls:
+		hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
+					   hlen, &_hdr);
+		if (!hdr)
+			return false;
+
+		if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) ==
+		     MPLS_LABEL_ENTROPY) {
+			if (skb_flow_dissector_uses_key(flow_dissector,
+							FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) {
+				key_keyid = skb_flow_dissector_target(flow_dissector,
+								      FLOW_DISSECTOR_KEY_MPLS_ENTROPY,
+								      target_container);
+				key_keyid->keyid = hdr[1].entry &
+					htonl(MPLS_LS_LABEL_MASK);
+			}
+
+			key_basic->n_proto = proto;
+			key_basic->ip_proto = ip_proto;
+			key_control->thoff = (u16)nhoff;
+
+			return true;
+		}
+
+		return true;
+	}
+
 	case htons(ETH_P_FCOE):
 		key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
 		/* fall through */
@@ -357,6 +389,9 @@ ipv6:
 	case IPPROTO_IPV6:
 		proto = htons(ETH_P_IPV6);
 		goto ipv6;
+	case IPPROTO_MPLS:
+		proto = htons(ETH_P_MPLS_UC);
+		goto mpls;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 90c337da1524863838658078ec34241f45d8394d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 6 Jun 2015 21:17:57 -0700
Subject: inet: add IP_BIND_ADDRESS_NO_PORT to overcome bind(0) limitations

When an application needs to force a source IP on an active TCP socket
it has to use bind(IP, port=x).

As most applications do not want to deal with already used ports, x is
often set to 0, meaning the kernel is in charge to find an available
port.
But kernel does not know yet if this socket is going to be a listener or
be connected.
It has very limited choices (no full knowledge of final 4-tuple for a
connect())

With limited ephemeral port range (about 32K ports), it is very easy to
fill the space.

This patch adds a new SOL_IP socket option, asking kernel to ignore
the 0 port provided by application in bind(IP, port=0) and only
remember the given IP address.

The port will be automatically chosen at connect() time, in a way
that allows sharing a source port as long as the 4-tuples are unique.

This new feature is available for both IPv4 and IPv6 (Thanks Neal)

Tested:

Wrote a test program and checked its behavior on IPv4 and IPv6.

strace(1) shows sequences of bind(IP=127.0.0.2, port=0) followed by
connect().
Also getsockname() show that the port is still 0 right after bind()
but properly allocated after connect().

socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5
setsockopt(5, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0
bind(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, 16) = 0
getsockname(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0
connect(5, {sa_family=AF_INET, sin_port=htons(53174), sin_addr=inet_addr("127.0.0.3")}, 16) = 0
getsockname(5, {sa_family=AF_INET, sin_port=htons(38050), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0

IPv6 test :

socket(PF_INET6, SOCK_STREAM, IPPROTO_IP) = 7
setsockopt(7, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0
bind(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0
getsockname(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0
connect(7, {sa_family=AF_INET6, sin6_port=htons(57300), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0
getsockname(7, {sa_family=AF_INET6, sin6_port=htons(60964), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0

I was able to bind()/connect() a million concurrent IPv4 sockets,
instead of ~32000 before patch.

lpaa23:~# ulimit -n 1000010
lpaa23:~# ./bind --connect --num-flows=1000000 &
1000000 sockets

lpaa23:~# grep TCP /proc/net/sockstat
TCP: inuse 2000063 orphan 0 tw 47 alloc 2000157 mem 66

Check that a given source port is indeed used by many different
connections :

lpaa23:~# ss -t src :40000 | head -10
State      Recv-Q Send-Q   Local Address:Port          Peer Address:Port
ESTAB      0      0           127.0.0.2:40000         127.0.202.33:44983
ESTAB      0      0           127.0.0.2:40000         127.2.27.240:44983
ESTAB      0      0           127.0.0.2:40000           127.2.98.5:44983
ESTAB      0      0           127.0.0.2:40000        127.0.124.196:44983
ESTAB      0      0           127.0.0.2:40000         127.2.139.38:44983
ESTAB      0      0           127.0.0.2:40000          127.1.59.80:44983
ESTAB      0      0           127.0.0.2:40000          127.3.6.228:44983
ESTAB      0      0           127.0.0.2:40000          127.0.38.53:44983
ESTAB      0      0           127.0.0.2:40000         127.1.197.10:44983

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h | 1 +
 include/uapi/linux/in.h | 1 +
 net/ipv4/af_inet.c      | 3 ++-
 net/ipv4/ip_sockglue.c  | 7 +++++++
 net/ipv6/af_inet6.c     | 3 ++-
 5 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index b6c3737da4e9..47eb67b08abd 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -187,6 +187,7 @@ struct inet_sock {
 				transparent:1,
 				mc_all:1,
 				nodefrag:1;
+	__u8			bind_address_no_port:1;
 	__u8			rcv_tos;
 	__u8			convert_csum;
 	int			uc_index;
diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h
index 641338bef651..83d6236a2f08 100644
--- a/include/uapi/linux/in.h
+++ b/include/uapi/linux/in.h
@@ -112,6 +112,7 @@ struct in_addr {
 #define IP_MINTTL       21
 #define IP_NODEFRAG     22
 #define IP_CHECKSUM	23
+#define IP_BIND_ADDRESS_NO_PORT	24
 
 /* IP_MTU_DISCOVER values */
 #define IP_PMTUDISC_DONT		0	/* Never send DF frames */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6ad0f7a711c9..cc858ef44451 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -488,7 +488,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		inet->inet_saddr = 0;  /* Use device */
 
 	/* Make sure we are allowed to bind here. */
-	if (sk->sk_prot->get_port(sk, snum)) {
+	if ((snum || !inet->bind_address_no_port) &&
+	    sk->sk_prot->get_port(sk, snum)) {
 		inet->inet_saddr = inet->inet_rcv_saddr = 0;
 		err = -EADDRINUSE;
 		goto out_release_sock;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 7cfb0893f263..04ae2992a5cd 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -582,6 +582,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 	case IP_TRANSPARENT:
 	case IP_MINTTL:
 	case IP_NODEFRAG:
+	case IP_BIND_ADDRESS_NO_PORT:
 	case IP_UNICAST_IF:
 	case IP_MULTICAST_TTL:
 	case IP_MULTICAST_ALL:
@@ -732,6 +733,9 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		}
 		inet->nodefrag = val ? 1 : 0;
 		break;
+	case IP_BIND_ADDRESS_NO_PORT:
+		inet->bind_address_no_port = val ? 1 : 0;
+		break;
 	case IP_MTU_DISCOVER:
 		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
 			goto e_inval;
@@ -1324,6 +1328,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_NODEFRAG:
 		val = inet->nodefrag;
 		break;
+	case IP_BIND_ADDRESS_NO_PORT:
+		val = inet->bind_address_no_port;
+		break;
 	case IP_MTU_DISCOVER:
 		val = inet->pmtudisc;
 		break;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index f3866c0b6cfe..7de52b65173f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -362,7 +362,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		np->saddr = addr->sin6_addr;
 
 	/* Make sure we are allowed to bind here. */
-	if (sk->sk_prot->get_port(sk, snum)) {
+	if ((snum || !inet->bind_address_no_port) &&
+	    sk->sk_prot->get_port(sk, snum)) {
 		inet_reset_saddr(sk);
 		err = -EADDRINUSE;
 		goto out;
-- 
cgit v1.2.3


From b80c0e78582d4a3a004dc1ade4eb06babc6a4eea Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 4 Jun 2015 18:30:43 -0700
Subject: tcp: get_cookie_sock() consolidation

IPv4 and IPv6 share same implementation of get_cookie_sock(),
and there is no point inlining it.

We add tcp_ prefix to the common helper name and export it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  3 +++
 net/ipv4/syncookies.c | 10 +++++-----
 net/ipv6/syncookies.c | 19 +------------------
 3 files changed, 9 insertions(+), 23 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2bb2bad21d5c..978cebedd3fc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -469,6 +469,9 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size);
 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
 
 /* From syncookies.c */
+struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+				 struct request_sock *req,
+				 struct dst_entry *dst);
 int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
 		      u32 cookie);
 struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index df849e5a10f1..d70b1f603692 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -219,9 +219,9 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
 }
 EXPORT_SYMBOL_GPL(__cookie_v4_check);
 
-static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
-				    struct request_sock *req,
-				    struct dst_entry *dst)
+struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+				 struct request_sock *req,
+				 struct dst_entry *dst)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sock *child;
@@ -235,7 +235,7 @@ static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 	}
 	return child;
 }
-
+EXPORT_SYMBOL(tcp_get_cookie_sock);
 
 /*
  * when syncookies are in effect and tcp timestamps are enabled we stored
@@ -391,7 +391,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	ireq->rcv_wscale  = rcv_wscale;
 	ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
 
-	ret = get_cookie_sock(sk, skb, req, &rt->dst);
+	ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst);
 	/* ip_queue_xmit() depends on our flow being setup
 	 * Normal sockets get it right from inet_csk_route_child_sock()
 	 */
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 21bc2eb53c57..0909f4e0d53c 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -41,23 +41,6 @@ static __u16 const msstab[] = {
 	9000 - 60,
 };
 
-static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
-					   struct request_sock *req,
-					   struct dst_entry *dst)
-{
-	struct inet_connection_sock *icsk = inet_csk(sk);
-	struct sock *child;
-
-	child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
-	if (child) {
-		atomic_set(&req->rsk_refcnt, 1);
-		inet_csk_reqsk_queue_add(sk, req, child);
-	} else {
-		reqsk_free(req);
-	}
-	return child;
-}
-
 static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
 		      ipv6_cookie_scratch);
 
@@ -264,7 +247,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	ireq->rcv_wscale = rcv_wscale;
 	ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst);
 
-	ret = get_cookie_sock(sk, skb, req, dst);
+	ret = tcp_get_cookie_sock(sk, skb, req, dst);
 out:
 	return ret;
 out_free:
-- 
cgit v1.2.3