summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2015-01-05 22:55:28 -0500
committerDavid S. Miller <davem@davemloft.net>2015-01-05 22:55:28 -0500
commita918eb9fac6431901fe8ec8a2cb31dfe6fdd1af7 (patch)
tree3b4de2d4483e984a20640aa9f42681aebb6b955e
parent6cb69742daa1770fe1ce54cf45f8951376518176 (diff)
parent81164413ad096bafe8ad1068f3f095a7dd081d8b (diff)
Merge branch 'rt_cong_ctrl'
Daniel Borkmann says: ==================== net: allow setting congctl via routing table This is the second part of our work and allows for setting the congestion control algorithm via routing table. For details, please see individual patches. Since patch 1 is a bug fix, we suggest applying patch 1 to net, and then merging net into net-next, for example, and following up with the remaining feature patches wrt dependencies. Joint work with Florian Westphal, suggested by Hannes Frederic Sowa. Patch for iproute2 is available under [1], but will be reposted with along with the man-page update when this set hits net-next. [1] http://patchwork.ozlabs.org/patch/418149/ Thanks! v2 -> v3: - Added module auto-loading as suggested by David Miller, thanks! - Added patch 2 for handling possible sleeps in fib6 - While working on this, we discovered a bug, hence fix in patch 1 - Added auto-loading to patch 4 - Rebased, retested, rest the same. v1 -> v2: - Very sorry, I noticed I had decnet disabled during testing. Added missing header include in decnet, rest as is. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/inet_connection_sock.h3
-rw-r--r--include/net/ip6_fib.h10
-rw-r--r--include/net/tcp.h22
-rw-r--r--include/uapi/linux/rtnetlink.h2
-rw-r--r--net/core/rtnetlink.c15
-rw-r--r--net/decnet/dn_fib.c3
-rw-r--r--net/decnet/dn_table.c4
-rw-r--r--net/ipv4/fib_semantics.c14
-rw-r--r--net/ipv4/tcp_cong.c121
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/tcp_minisocks.c30
-rw-r--r--net/ipv4/tcp_output.c21
-rw-r--r--net/ipv6/ip6_fib.c68
-rw-r--r--net/ipv6/route.c72
-rw-r--r--net/ipv6/tcp_ipv6.c2
15 files changed, 304 insertions, 85 deletions
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 848e85cb5c61..5976bdecf58b 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -98,7 +98,8 @@ struct inet_connection_sock {
const struct tcp_congestion_ops *icsk_ca_ops;
const struct inet_connection_sock_af_ops *icsk_af_ops;
unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
- __u8 icsk_ca_state;
+ __u8 icsk_ca_state:7,
+ icsk_ca_dst_locked:1;
__u8 icsk_retransmits;
__u8 icsk_pending;
__u8 icsk_backoff;
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 8eea35d32a75..20e80fa7bbdd 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -74,6 +74,11 @@ struct fib6_node {
#define FIB6_SUBTREE(fn) ((fn)->subtree)
#endif
+struct mx6_config {
+ const u32 *mx;
+ DECLARE_BITMAP(mx_valid, RTAX_MAX);
+};
+
/*
* routing information
*
@@ -291,9 +296,8 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
void *arg);
-int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
- struct nlattr *mx, int mx_len);
-
+int fib6_add(struct fib6_node *root, struct rt6_info *rt,
+ struct nl_info *info, struct mx6_config *mxc);
int fib6_del(struct rt6_info *rt, struct nl_info *info);
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f50f29faf76f..b8fdc6bab3f3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -448,6 +448,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
struct sock *tcp_create_openreq_child(struct sock *sk,
struct request_sock *req,
struct sk_buff *skb);
+void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst);
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst);
@@ -636,6 +637,11 @@ static inline u32 tcp_rto_min_us(struct sock *sk)
return jiffies_to_usecs(tcp_rto_min(sk));
}
+static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
+{
+ return dst_metric_locked(dst, RTAX_CC_ALGO);
+}
+
/* Compute the actual receive window we are currently advertising.
* Rcv_nxt can be after the window if our peer push more data
* than the offered window.
@@ -787,6 +793,8 @@ enum tcp_ca_ack_event_flags {
#define TCP_CA_MAX 128
#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX)
+#define TCP_CA_UNSPEC 0
+
/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
#define TCP_CONG_NON_RESTRICTED 0x1
/* Requires ECN/ECT set on all packets */
@@ -794,7 +802,8 @@ enum tcp_ca_ack_event_flags {
struct tcp_congestion_ops {
struct list_head list;
- unsigned long flags;
+ u32 key;
+ u32 flags;
/* initialize private data (optional) */
void (*init)(struct sock *sk);
@@ -841,6 +850,17 @@ u32 tcp_reno_ssthresh(struct sock *sk);
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
extern struct tcp_congestion_ops tcp_reno;
+struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
+u32 tcp_ca_get_key_by_name(const char *name);
+#ifdef CONFIG_INET
+char *tcp_ca_get_name_by_key(u32 key, char *buffer);
+#else
+static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
+{
+ return NULL;
+}
+#endif
+
static inline bool tcp_ca_needs_ecn(const struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 9c9b8b4480cd..d81f22d5b390 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -389,6 +389,8 @@ enum {
#define RTAX_INITRWND RTAX_INITRWND
RTAX_QUICKACK,
#define RTAX_QUICKACK RTAX_QUICKACK
+ RTAX_CC_ALGO,
+#define RTAX_CC_ALGO RTAX_CC_ALGO
__RTAX_MAX
};
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index da983d4bac02..6a6cdade1676 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -50,6 +50,7 @@
#include <net/arp.h>
#include <net/route.h>
#include <net/udp.h>
+#include <net/tcp.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/fib_rules.h>
@@ -669,9 +670,19 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
for (i = 0; i < RTAX_MAX; i++) {
if (metrics[i]) {
+ if (i == RTAX_CC_ALGO - 1) {
+ char tmp[TCP_CA_NAME_MAX], *name;
+
+ name = tcp_ca_get_name_by_key(metrics[i], tmp);
+ if (!name)
+ continue;
+ if (nla_put_string(skb, i + 1, name))
+ goto nla_put_failure;
+ } else {
+ if (nla_put_u32(skb, i + 1, metrics[i]))
+ goto nla_put_failure;
+ }
valid++;
- if (nla_put_u32(skb, i+1, metrics[i]))
- goto nla_put_failure;
}
}
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index d332aefb0846..df4803437888 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -298,7 +298,8 @@ struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct nlattr *att
int type = nla_type(attr);
if (type) {
- if (type > RTAX_MAX || nla_len(attr) < 4)
+ if (type > RTAX_MAX || type == RTAX_CC_ALGO ||
+ nla_len(attr) < 4)
goto err_inval;
fi->fib_metrics[type-1] = nla_get_u32(attr);
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 86e3807052e9..3f19fcbf126d 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -29,6 +29,7 @@
#include <linux/route.h> /* RTF_xxx */
#include <net/neighbour.h>
#include <net/netlink.h>
+#include <net/tcp.h>
#include <net/dst.h>
#include <net/flow.h>
#include <net/fib_rules.h>
@@ -273,7 +274,8 @@ static inline size_t dn_fib_nlmsg_size(struct dn_fib_info *fi)
size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
+ nla_total_size(4) /* RTA_TABLE */
+ nla_total_size(2) /* RTA_DST */
- + nla_total_size(4); /* RTA_PRIORITY */
+ + nla_total_size(4) /* RTA_PRIORITY */
+ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
/* space for nested metrics */
payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f99f41bd15b8..d2b7b5521b1b 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -360,7 +360,8 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
+ nla_total_size(4) /* RTA_TABLE */
+ nla_total_size(4) /* RTA_DST */
+ nla_total_size(4) /* RTA_PRIORITY */
- + nla_total_size(4); /* RTA_PREFSRC */
+ + nla_total_size(4) /* RTA_PREFSRC */
+ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
/* space for nested metrics */
payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
@@ -859,7 +860,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (type > RTAX_MAX)
goto err_inval;
- val = nla_get_u32(nla);
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
+
+ nla_strlcpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp);
+ if (val == TCP_CA_UNSPEC)
+ goto err_inval;
+ } else {
+ val = nla_get_u32(nla);
+ }
if (type == RTAX_ADVMSS && val > 65535 - 40)
val = 65535 - 40;
if (type == RTAX_MTU && val > 65535 - 15)
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 27ead0dd16bc..63c29dba68a8 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -13,6 +13,7 @@
#include <linux/types.h>
#include <linux/list.h>
#include <linux/gfp.h>
+#include <linux/jhash.h>
#include <net/tcp.h>
static DEFINE_SPINLOCK(tcp_cong_list_lock);
@@ -31,6 +32,34 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
return NULL;
}
+/* Must be called with rcu lock held */
+static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name)
+{
+ const struct tcp_congestion_ops *ca = tcp_ca_find(name);
+#ifdef CONFIG_MODULES
+ if (!ca && capable(CAP_NET_ADMIN)) {
+ rcu_read_unlock();
+ request_module("tcp_%s", name);
+ rcu_read_lock();
+ ca = tcp_ca_find(name);
+ }
+#endif
+ return ca;
+}
+
+/* Simple linear search, not much in here. */
+struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
+{
+ struct tcp_congestion_ops *e;
+
+ list_for_each_entry_rcu(e, &tcp_cong_list, list) {
+ if (e->key == key)
+ return e;
+ }
+
+ return NULL;
+}
+
/*
* Attach new congestion control algorithm to the list
* of available options.
@@ -45,9 +74,12 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
return -EINVAL;
}
+ ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
spin_lock(&tcp_cong_list_lock);
- if (tcp_ca_find(ca->name)) {
- pr_notice("%s already registered\n", ca->name);
+ if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
+ pr_notice("%s already registered or non-unique key\n",
+ ca->name);
ret = -EEXIST;
} else {
list_add_tail_rcu(&ca->list, &tcp_cong_list);
@@ -70,9 +102,50 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
spin_lock(&tcp_cong_list_lock);
list_del_rcu(&ca->list);
spin_unlock(&tcp_cong_list_lock);
+
+ /* Wait for outstanding readers to complete before the
+ * module gets removed entirely.
+ *
+ * A try_module_get() should fail by now as our module is
+ * in "going" state since no refs are held anymore and
+ * module_exit() handler being called.
+ */
+ synchronize_rcu();
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
+u32 tcp_ca_get_key_by_name(const char *name)
+{
+ const struct tcp_congestion_ops *ca;
+ u32 key;
+
+ might_sleep();
+
+ rcu_read_lock();
+ ca = __tcp_ca_find_autoload(name);
+ key = ca ? ca->key : TCP_CA_UNSPEC;
+ rcu_read_unlock();
+
+ return key;
+}
+EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name);
+
+char *tcp_ca_get_name_by_key(u32 key, char *buffer)
+{
+ const struct tcp_congestion_ops *ca;
+ char *ret = NULL;
+
+ rcu_read_lock();
+ ca = tcp_ca_find_key(key);
+ if (ca)
+ ret = strncpy(buffer, ca->name,
+ TCP_CA_NAME_MAX);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
+
/* Assign choice of congestion control. */
void tcp_assign_congestion_control(struct sock *sk)
{
@@ -107,6 +180,18 @@ void tcp_init_congestion_control(struct sock *sk)
icsk->icsk_ca_ops->init(sk);
}
+static void tcp_reinit_congestion_control(struct sock *sk,
+ const struct tcp_congestion_ops *ca)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_cleanup_congestion_control(sk);
+ icsk->icsk_ca_ops = ca;
+
+ if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
+ icsk->icsk_ca_ops->init(sk);
+}
+
/* Manage refcounts on socket close. */
void tcp_cleanup_congestion_control(struct sock *sk)
{
@@ -241,42 +326,26 @@ out:
int tcp_set_congestion_control(struct sock *sk, const char *name)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_congestion_ops *ca;
+ const struct tcp_congestion_ops *ca;
int err = 0;
- rcu_read_lock();
- ca = tcp_ca_find(name);
+ if (icsk->icsk_ca_dst_locked)
+ return -EPERM;
- /* no change asking for existing value */
+ rcu_read_lock();
+ ca = __tcp_ca_find_autoload(name);
+ /* No change asking for existing value */
if (ca == icsk->icsk_ca_ops)
goto out;
-
-#ifdef CONFIG_MODULES
- /* not found attempt to autoload module */
- if (!ca && capable(CAP_NET_ADMIN)) {
- rcu_read_unlock();
- request_module("tcp_%s", name);
- rcu_read_lock();
- ca = tcp_ca_find(name);
- }
-#endif
if (!ca)
err = -ENOENT;
-
else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
err = -EPERM;
-
else if (!try_module_get(ca->owner))
err = -EBUSY;
-
- else {
- tcp_cleanup_congestion_control(sk);
- icsk->icsk_ca_ops = ca;
-
- if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
- icsk->icsk_ca_ops->init(sk);
- }
+ else
+ tcp_reinit_congestion_control(sk, ca);
out:
rcu_read_unlock();
return err;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a3f72d7fc06c..ad3e65bdd368 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1340,6 +1340,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
}
sk_setup_caps(newsk, dst);
+ tcp_ca_openreq_child(newsk, dst);
+
tcp_sync_mss(newsk, dst_mtu(dst));
newtp->advmss = dst_metric_advmss(dst);
if (tcp_sk(sk)->rx_opt.user_mss &&
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 63d2680b65db..bc9216dc9de1 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -399,6 +399,32 @@ static void tcp_ecn_openreq_child(struct tcp_sock *tp,
tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
}
+void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+ bool ca_got_dst = false;
+
+ if (ca_key != TCP_CA_UNSPEC) {
+ const struct tcp_congestion_ops *ca;
+
+ rcu_read_lock();
+ ca = tcp_ca_find_key(ca_key);
+ if (likely(ca && try_module_get(ca->owner))) {
+ icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+ icsk->icsk_ca_ops = ca;
+ ca_got_dst = true;
+ }
+ rcu_read_unlock();
+ }
+
+ if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner))
+ tcp_assign_congestion_control(sk);
+
+ tcp_set_ca_state(sk, TCP_CA_Open);
+}
+EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
+
/* This is not only more efficient than what we used to do, it eliminates
* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
*
@@ -451,10 +477,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->snd_cwnd = TCP_INIT_CWND;
newtp->snd_cwnd_cnt = 0;
- if (!try_module_get(newicsk->icsk_ca_ops->owner))
- tcp_assign_congestion_control(newsk);
-
- tcp_set_ca_state(newsk, TCP_CA_Open);
tcp_init_xmit_timers(newsk);
__skb_queue_head_init(&newtp->out_of_order_queue);
newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7f18262e2326..dc30cb563e4f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2939,6 +2939,25 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
}
EXPORT_SYMBOL(tcp_make_synack);
+static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_congestion_ops *ca;
+ u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+
+ if (ca_key == TCP_CA_UNSPEC)
+ return;
+
+ rcu_read_lock();
+ ca = tcp_ca_find_key(ca_key);
+ if (likely(ca && try_module_get(ca->owner))) {
+ module_put(icsk->icsk_ca_ops->owner);
+ icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+ icsk->icsk_ca_ops = ca;
+ }
+ rcu_read_unlock();
+}
+
/* Do all connect socket setups that can be done AF independent. */
static void tcp_connect_init(struct sock *sk)
{
@@ -2964,6 +2983,8 @@ static void tcp_connect_init(struct sock *sk)
tcp_mtup_init(sk);
tcp_sync_mss(sk, dst_mtu(dst));
+ tcp_ca_dst_init(sk, dst);
+
if (!tp->window_clamp)
tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
tp->advmss = dst_metric_advmss(dst);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index b2d1838897c9..03c520a4ebeb 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -630,32 +630,35 @@ static bool rt6_qualify_for_ecmp(struct rt6_info *rt)
RTF_GATEWAY;
}
-static int fib6_commit_metrics(struct dst_entry *dst,
- struct nlattr *mx, int mx_len)
+static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc)
{
- struct nlattr *nla;
- int remaining;
- u32 *mp;
+ int i;
- if (dst->flags & DST_HOST) {
- mp = dst_metrics_write_ptr(dst);
- } else {
- mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
- if (!mp)
- return -ENOMEM;
- dst_init_metrics(dst, mp, 0);
+ for (i = 0; i < RTAX_MAX; i++) {
+ if (test_bit(i, mxc->mx_valid))
+ mp[i] = mxc->mx[i];
}
+}
- nla_for_each_attr(nla, mx, mx_len, remaining) {
- int type = nla_type(nla);
+static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc)
+{
+ if (!mxc->mx)
+ return 0;
- if (type) {
- if (type > RTAX_MAX)
- return -EINVAL;
+ if (dst->flags & DST_HOST) {
+ u32 *mp = dst_metrics_write_ptr(dst);
- mp[type - 1] = nla_get_u32(nla);
- }
+ if (unlikely(!mp))
+ return -ENOMEM;
+
+ fib6_copy_metrics(mp, mxc);
+ } else {
+ dst_init_metrics(dst, mxc->mx, false);
+
+ /* We've stolen mx now. */
+ mxc->mx = NULL;
}
+
return 0;
}
@@ -664,7 +667,7 @@ static int fib6_commit_metrics(struct dst_entry *dst,
*/
static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
- struct nl_info *info, struct nlattr *mx, int mx_len)
+ struct nl_info *info, struct mx6_config *mxc)
{
struct rt6_info *iter = NULL;
struct rt6_info **ins;
@@ -773,11 +776,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
pr_warn("NLM_F_CREATE should be set when creating new route\n");
add:
- if (mx) {
- err = fib6_commit_metrics(&rt->dst, mx, mx_len);
- if (err)
- return err;
- }
+ err = fib6_commit_metrics(&rt->dst, mxc);
+ if (err)
+ return err;
+
rt->dst.rt6_next = iter;
*ins = rt;
rt->rt6i_node = fn;
@@ -797,11 +799,11 @@ add:
pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
return -ENOENT;
}
- if (mx) {
- err = fib6_commit_metrics(&rt->dst, mx, mx_len);
- if (err)
- return err;
- }
+
+ err = fib6_commit_metrics(&rt->dst, mxc);
+ if (err)
+ return err;
+
*ins = rt;
rt->rt6i_node = fn;
rt->dst.rt6_next = iter->dst.rt6_next;
@@ -838,8 +840,8 @@ void fib6_force_start_gc(struct net *net)
* with source addr info in sub-trees
*/
-int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
- struct nlattr *mx, int mx_len)
+int fib6_add(struct fib6_node *root, struct rt6_info *rt,
+ struct nl_info *info, struct mx6_config *mxc)
{
struct fib6_node *fn, *pn = NULL;
int err = -ENOMEM;
@@ -934,7 +936,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
}
#endif
- err = fib6_add_rt2node(fn, rt, info, mx, mx_len);
+ err = fib6_add_rt2node(fn, rt, info, mxc);
if (!err) {
fib6_start_gc(info->nl_net, rt);
if (!(rt->rt6i_flags & RTF_CACHE))
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c91083156edb..34dcbb59df75 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -853,14 +853,14 @@ EXPORT_SYMBOL(rt6_lookup);
*/
static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
- struct nlattr *mx, int mx_len)
+ struct mx6_config *mxc)
{
int err;
struct fib6_table *table;
table = rt->rt6i_table;
write_lock_bh(&table->tb6_lock);
- err = fib6_add(&table->tb6_root, rt, info, mx, mx_len);
+ err = fib6_add(&table->tb6_root, rt, info, mxc);
write_unlock_bh(&table->tb6_lock);
return err;
@@ -868,10 +868,10 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
int ip6_ins_rt(struct rt6_info *rt)
{
- struct nl_info info = {
- .nl_net = dev_net(rt->dst.dev),
- };
- return __ip6_ins_rt(rt, &info, NULL, 0);
+ struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
+ struct mx6_config mxc = { .mx = NULL, };
+
+ return __ip6_ins_rt(rt, &info, &mxc);
}
static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
@@ -1470,9 +1470,51 @@ out:
return entries > rt_max_size;
}
-/*
- *
- */
+static int ip6_convert_metrics(struct mx6_config *mxc,
+ const struct fib6_config *cfg)
+{
+ struct nlattr *nla;
+ int remaining;
+ u32 *mp;
+
+ if (cfg->fc_mx == NULL)
+ return 0;
+
+ mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
+ if (unlikely(!mp))
+ return -ENOMEM;
+
+ nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+ int type = nla_type(nla);
+
+ if (type) {
+ u32 val;
+
+ if (unlikely(type > RTAX_MAX))
+ goto err;
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
+
+ nla_strlcpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp);
+ if (val == TCP_CA_UNSPEC)
+ goto err;
+ } else {
+ val = nla_get_u32(nla);
+ }
+
+ mp[type - 1] = val;
+ __set_bit(type - 1, mxc->mx_valid);
+ }
+ }
+
+ mxc->mx = mp;
+
+ return 0;
+ err:
+ kfree(mp);
+ return -EINVAL;
+}
int ip6_route_add(struct fib6_config *cfg)
{
@@ -1482,6 +1524,7 @@ int ip6_route_add(struct fib6_config *cfg)
struct net_device *dev = NULL;
struct inet6_dev *idev = NULL;
struct fib6_table *table;
+ struct mx6_config mxc = { .mx = NULL, };
int addr_type;
if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
@@ -1677,8 +1720,14 @@ install_route:
cfg->fc_nlinfo.nl_net = dev_net(dev);
- return __ip6_ins_rt(rt, &cfg->fc_nlinfo, cfg->fc_mx, cfg->fc_mx_len);
+ err = ip6_convert_metrics(&mxc, cfg);
+ if (err)
+ goto out;
+
+ err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
+ kfree(mxc.mx);
+ return err;
out:
if (dev)
dev_put(dev);
@@ -2534,7 +2583,8 @@ static inline size_t rt6_nlmsg_size(void)
+ nla_total_size(4) /* RTA_OIF */
+ nla_total_size(4) /* RTA_PRIORITY */
+ RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
- + nla_total_size(sizeof(struct rta_cacheinfo));
+ + nla_total_size(sizeof(struct rta_cacheinfo))
+ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
}
static int rt6_fill_node(struct net *net,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 9c0b54e87b47..5d46832c6f72 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1199,6 +1199,8 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
newnp->opt->opt_flen);
+ tcp_ca_openreq_child(newsk, dst);
+
tcp_sync_mss(newsk, dst_mtu(dst));
newtp->advmss = dst_metric_advmss(dst);
if (tcp_sk(sk)->rx_opt.user_mss &&