summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2012-05-08 14:40:21 -0400
committerDavid S. Miller <davem@davemloft.net>2012-05-08 14:40:21 -0400
commit9bb862beb6e5839e92f709d33fda07678f062f20 (patch)
treea2c396712c5a2cda380034173fd07a67bfa0489f /net
parentb44907e64cc1987153f6577306108379be1523b7 (diff)
parentd16cf20e2f2f13411eece7f7fb72c17d141c4a84 (diff)
Merge branch 'master' of git://1984.lsi.us.es/net-next
Diffstat (limited to 'net')
-rw-r--r--net/bridge/br_netfilter.c26
-rw-r--r--net/core/sock.c2
-rw-r--r--net/ipv4/netfilter/Makefile3
-rw-r--r--net/ipv4/netfilter/ip_queue.c639
-rw-r--r--net/ipv6/netfilter/Kconfig22
-rw-r--r--net/ipv6/netfilter/Makefile1
-rw-r--r--net/ipv6/netfilter/ip6_queue.c641
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c70
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c30
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c70
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c6
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c662
-rw-r--r--net/netfilter/ipvs/ip_vs_wrr.c2
-rw-r--r--net/netfilter/nf_conntrack_core.c15
-rw-r--r--net/netfilter/nf_conntrack_ecache.c10
-rw-r--r--net/netfilter/nf_conntrack_helper.c122
-rw-r--r--net/netfilter/nf_conntrack_netlink.c10
22 files changed, 740 insertions, 1601 deletions
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 53f083686ae4..dce55d4ee83b 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -54,12 +54,14 @@ static int brnf_call_ip6tables __read_mostly = 1;
static int brnf_call_arptables __read_mostly = 1;
static int brnf_filter_vlan_tagged __read_mostly = 0;
static int brnf_filter_pppoe_tagged __read_mostly = 0;
+static int brnf_pass_vlan_indev __read_mostly = 0;
#else
#define brnf_call_iptables 1
#define brnf_call_ip6tables 1
#define brnf_call_arptables 1
#define brnf_filter_vlan_tagged 0
#define brnf_filter_pppoe_tagged 0
+#define brnf_pass_vlan_indev 0
#endif
#define IS_IP(skb) \
@@ -503,6 +505,19 @@ bridged_dnat:
return 0;
}
+static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct net_device *vlan, *br;
+
+ br = bridge_parent(dev);
+ if (brnf_pass_vlan_indev == 0 || !vlan_tx_tag_present(skb))
+ return br;
+
+ vlan = __vlan_find_dev_deep(br, vlan_tx_tag_get(skb) & VLAN_VID_MASK);
+
+ return vlan ? vlan : br;
+}
+
/* Some common code for IPv4/IPv6 */
static struct net_device *setup_pre_routing(struct sk_buff *skb)
{
@@ -515,7 +530,7 @@ static struct net_device *setup_pre_routing(struct sk_buff *skb)
nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING;
nf_bridge->physindev = skb->dev;
- skb->dev = bridge_parent(skb->dev);
+ skb->dev = brnf_get_logical_dev(skb, skb->dev);
if (skb->protocol == htons(ETH_P_8021Q))
nf_bridge->mask |= BRNF_8021Q;
else if (skb->protocol == htons(ETH_P_PPP_SES))
@@ -774,7 +789,7 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb,
else
skb->protocol = htons(ETH_P_IPV6);
- NF_HOOK(pf, NF_INET_FORWARD, skb, bridge_parent(in), parent,
+ NF_HOOK(pf, NF_INET_FORWARD, skb, brnf_get_logical_dev(skb, in), parent,
br_nf_forward_finish);
return NF_STOLEN;
@@ -1002,6 +1017,13 @@ static ctl_table brnf_table[] = {
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
+ {
+ .procname = "bridge-nf-pass-vlan-input-dev",
+ .data = &brnf_pass_vlan_indev,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = brnf_sysctl_call_tables,
+ },
{ }
};
#endif
diff --git a/net/core/sock.c b/net/core/sock.c
index b8c818e69c23..26ed27fb2bfb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -259,7 +259,9 @@ static struct lock_class_key af_callback_keys[AF_MAX];
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
+EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
+EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 240b68469a7a..c20674dc9452 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -66,6 +66,3 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
# just filtering instance of ARP tables for now
obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
-
-obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
-
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
deleted file mode 100644
index 09775a1e1348..000000000000
--- a/net/ipv4/netfilter/ip_queue.c
+++ /dev/null
@@ -1,639 +0,0 @@
-/*
- * This is a module which is used for queueing IPv4 packets and
- * communicating with userspace via netlink.
- *
- * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
- * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/init.h>
-#include <linux/ip.h>
-#include <linux/notifier.h>
-#include <linux/netdevice.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4/ip_queue.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netlink.h>
-#include <linux/spinlock.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/security.h>
-#include <linux/net.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-#include <net/net_namespace.h>
-#include <net/sock.h>
-#include <net/route.h>
-#include <net/netfilter/nf_queue.h>
-#include <net/ip.h>
-
-#define IPQ_QMAX_DEFAULT 1024
-#define IPQ_PROC_FS_NAME "ip_queue"
-#define NET_IPQ_QMAX 2088
-#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
-
-typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long);
-
-static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
-static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
-static DEFINE_SPINLOCK(queue_lock);
-static int peer_pid __read_mostly;
-static unsigned int copy_range __read_mostly;
-static unsigned int queue_total;
-static unsigned int queue_dropped = 0;
-static unsigned int queue_user_dropped = 0;
-static struct sock *ipqnl __read_mostly;
-static LIST_HEAD(queue_list);
-static DEFINE_MUTEX(ipqnl_mutex);
-
-static inline void
-__ipq_enqueue_entry(struct nf_queue_entry *entry)
-{
- list_add_tail(&entry->list, &queue_list);
- queue_total++;
-}
-
-static inline int
-__ipq_set_mode(unsigned char mode, unsigned int range)
-{
- int status = 0;
-
- switch(mode) {
- case IPQ_COPY_NONE:
- case IPQ_COPY_META:
- copy_mode = mode;
- copy_range = 0;
- break;
-
- case IPQ_COPY_PACKET:
- if (range > 0xFFFF)
- range = 0xFFFF;
- copy_range = range;
- copy_mode = mode;
- break;
-
- default:
- status = -EINVAL;
-
- }
- return status;
-}
-
-static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data);
-
-static inline void
-__ipq_reset(void)
-{
- peer_pid = 0;
- net_disable_timestamp();
- __ipq_set_mode(IPQ_COPY_NONE, 0);
- __ipq_flush(NULL, 0);
-}
-
-static struct nf_queue_entry *
-ipq_find_dequeue_entry(unsigned long id)
-{
- struct nf_queue_entry *entry = NULL, *i;
-
- spin_lock_bh(&queue_lock);
-
- list_for_each_entry(i, &queue_list, list) {
- if ((unsigned long)i == id) {
- entry = i;
- break;
- }
- }
-
- if (entry) {
- list_del(&entry->list);
- queue_total--;
- }
-
- spin_unlock_bh(&queue_lock);
- return entry;
-}
-
-static void
-__ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
-{
- struct nf_queue_entry *entry, *next;
-
- list_for_each_entry_safe(entry, next, &queue_list, list) {
- if (!cmpfn || cmpfn(entry, data)) {
- list_del(&entry->list);
- queue_total--;
- nf_reinject(entry, NF_DROP);
- }
- }
-}
-
-static void
-ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
-{
- spin_lock_bh(&queue_lock);
- __ipq_flush(cmpfn, data);
- spin_unlock_bh(&queue_lock);
-}
-
-static struct sk_buff *
-ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
-{
- sk_buff_data_t old_tail;
- size_t size = 0;
- size_t data_len = 0;
- struct sk_buff *skb;
- struct ipq_packet_msg *pmsg;
- struct nlmsghdr *nlh;
- struct timeval tv;
-
- switch (ACCESS_ONCE(copy_mode)) {
- case IPQ_COPY_META:
- case IPQ_COPY_NONE:
- size = NLMSG_SPACE(sizeof(*pmsg));
- break;
-
- case IPQ_COPY_PACKET:
- if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
- (*errp = skb_checksum_help(entry->skb)))
- return NULL;
-
- data_len = ACCESS_ONCE(copy_range);
- if (data_len == 0 || data_len > entry->skb->len)
- data_len = entry->skb->len;
-
- size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
- break;
-
- default:
- *errp = -EINVAL;
- return NULL;
- }
-
- skb = alloc_skb(size, GFP_ATOMIC);
- if (!skb)
- goto nlmsg_failure;
-
- old_tail = skb->tail;
- nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
- pmsg = NLMSG_DATA(nlh);
- memset(pmsg, 0, sizeof(*pmsg));
-
- pmsg->packet_id = (unsigned long )entry;
- pmsg->data_len = data_len;
- tv = ktime_to_timeval(entry->skb->tstamp);
- pmsg->timestamp_sec = tv.tv_sec;
- pmsg->timestamp_usec = tv.tv_usec;
- pmsg->mark = entry->skb->mark;
- pmsg->hook = entry->hook;
- pmsg->hw_protocol = entry->skb->protocol;
-
- if (entry->indev)
- strcpy(pmsg->indev_name, entry->indev->name);
- else
- pmsg->indev_name[0] = '\0';
-
- if (entry->outdev)
- strcpy(pmsg->outdev_name, entry->outdev->name);
- else
- pmsg->outdev_name[0] = '\0';
-
- if (entry->indev && entry->skb->dev &&
- entry->skb->mac_header != entry->skb->network_header) {
- pmsg->hw_type = entry->skb->dev->type;
- pmsg->hw_addrlen = dev_parse_header(entry->skb,
- pmsg->hw_addr);
- }
-
- if (data_len)
- if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len))
- BUG();
-
- nlh->nlmsg_len = skb->tail - old_tail;
- return skb;
-
-nlmsg_failure:
- kfree_skb(skb);
- *errp = -EINVAL;
- printk(KERN_ERR "ip_queue: error creating packet message\n");
- return NULL;
-}
-
-static int
-ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
-{
- int status = -EINVAL;
- struct sk_buff *nskb;
-
- if (copy_mode == IPQ_COPY_NONE)
- return -EAGAIN;
-
- nskb = ipq_build_packet_message(entry, &status);
- if (nskb == NULL)
- return status;
-
- spin_lock_bh(&queue_lock);
-
- if (!peer_pid)
- goto err_out_free_nskb;
-
- if (queue_total >= queue_maxlen) {
- queue_dropped++;
- status = -ENOSPC;
- if (net_ratelimit())
- printk (KERN_WARNING "ip_queue: full at %d entries, "
- "dropping packets(s). Dropped: %d\n", queue_total,
- queue_dropped);
- goto err_out_free_nskb;
- }
-
- /* netlink_unicast will either free the nskb or attach it to a socket */
- status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
- if (status < 0) {
- queue_user_dropped++;
- goto err_out_unlock;
- }
-
- __ipq_enqueue_entry(entry);
-
- spin_unlock_bh(&queue_lock);
- return status;
-
-err_out_free_nskb:
- kfree_skb(nskb);
-
-err_out_unlock:
- spin_unlock_bh(&queue_lock);
- return status;
-}
-
-static int
-ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct nf_queue_entry *e)
-{
- int diff;
- struct iphdr *user_iph = (struct iphdr *)v->payload;
- struct sk_buff *nskb;
-
- if (v->data_len < sizeof(*user_iph))
- return 0;
- diff = v->data_len - e->skb->len;
- if (diff < 0) {
- if (pskb_trim(e->skb, v->data_len))
- return -ENOMEM;
- } else if (diff > 0) {
- if (v->data_len > 0xFFFF)
- return -EINVAL;
- if (diff > skb_tailroom(e->skb)) {
- nskb = skb_copy_expand(e->skb, skb_headroom(e->skb),
- diff, GFP_ATOMIC);
- if (!nskb) {
- printk(KERN_WARNING "ip_queue: error "
- "in mangle, dropping packet\n");
- return -ENOMEM;
- }
- kfree_skb(e->skb);
- e->skb = nskb;
- }
- skb_put(e->skb, diff);
- }
- if (!skb_make_writable(e->skb, v->data_len))
- return -ENOMEM;
- skb_copy_to_linear_data(e->skb, v->payload, v->data_len);
- e->skb->ip_summed = CHECKSUM_NONE;
-
- return 0;
-}
-
-static int
-ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
-{
- struct nf_queue_entry *entry;
-
- if (vmsg->value > NF_MAX_VERDICT || vmsg->value == NF_STOLEN)
- return -EINVAL;
-
- entry = ipq_find_dequeue_entry(vmsg->id);
- if (entry == NULL)
- return -ENOENT;
- else {
- int verdict = vmsg->value;
-
- if (vmsg->data_len && vmsg->data_len == len)
- if (ipq_mangle_ipv4(vmsg, entry) < 0)
- verdict = NF_DROP;
-
- nf_reinject(entry, verdict);
- return 0;
- }
-}
-
-static int
-ipq_set_mode(unsigned char mode, unsigned int range)
-{
- int status;
-
- spin_lock_bh(&queue_lock);
- status = __ipq_set_mode(mode, range);
- spin_unlock_bh(&queue_lock);
- return status;
-}
-
-static int
-ipq_receive_peer(struct ipq_peer_msg *pmsg,
- unsigned char type, unsigned int len)
-{
- int status = 0;
-
- if (len < sizeof(*pmsg))
- return -EINVAL;
-
- switch (type) {
- case IPQM_MODE:
- status = ipq_set_mode(pmsg->msg.mode.value,
- pmsg->msg.mode.range);
- break;
-
- case IPQM_VERDICT:
- status = ipq_set_verdict(&pmsg->msg.verdict,
- len - sizeof(*pmsg));
- break;
- default:
- status = -EINVAL;
- }
- return status;
-}
-
-static int
-dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
-{
- if (entry->indev)
- if (entry->indev->ifindex == ifindex)
- return 1;
- if (entry->outdev)
- if (entry->outdev->ifindex == ifindex)
- return 1;
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (entry->skb->nf_bridge) {
- if (entry->skb->nf_bridge->physindev &&
- entry->skb->nf_bridge->physindev->ifindex == ifindex)
- return 1;
- if (entry->skb->nf_bridge->physoutdev &&
- entry->skb->nf_bridge->physoutdev->ifindex == ifindex)
- return 1;
- }
-#endif
- return 0;
-}
-
-static void
-ipq_dev_drop(int ifindex)
-{
- ipq_flush(dev_cmp, ifindex);
-}
-
-#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
-
-static inline void
-__ipq_rcv_skb(struct sk_buff *skb)
-{
- int status, type, pid, flags;
- unsigned int nlmsglen, skblen;
- struct nlmsghdr *nlh;
- bool enable_timestamp = false;
-
- skblen = skb->len;
- if (skblen < sizeof(*nlh))
- return;
-
- nlh = nlmsg_hdr(skb);
- nlmsglen = nlh->nlmsg_len;
- if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
- return;
-
- pid = nlh->nlmsg_pid;
- flags = nlh->nlmsg_flags;
-
- if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
- RCV_SKB_FAIL(-EINVAL);
-
- if (flags & MSG_TRUNC)
- RCV_SKB_FAIL(-ECOMM);
-
- type = nlh->nlmsg_type;
- if (type < NLMSG_NOOP || type >= IPQM_MAX)
- RCV_SKB_FAIL(-EINVAL);
-
- if (type <= IPQM_BASE)
- return;
-
- if (!capable(CAP_NET_ADMIN))
- RCV_SKB_FAIL(-EPERM);
-
- spin_lock_bh(&queue_lock);
-
- if (peer_pid) {
- if (peer_pid != pid) {
- spin_unlock_bh(&queue_lock);
- RCV_SKB_FAIL(-EBUSY);
- }
- } else {
- enable_timestamp = true;
- peer_pid = pid;
- }
-
- spin_unlock_bh(&queue_lock);
- if (enable_timestamp)
- net_enable_timestamp();
- status = ipq_receive_peer(NLMSG_DATA(nlh), type,
- nlmsglen - NLMSG_LENGTH(0));
- if (status < 0)
- RCV_SKB_FAIL(status);
-
- if (flags & NLM_F_ACK)
- netlink_ack(skb, nlh, 0);
-}
-
-static void
-ipq_rcv_skb(struct sk_buff *skb)
-{
- mutex_lock(&ipqnl_mutex);
- __ipq_rcv_skb(skb);
- mutex_unlock(&ipqnl_mutex);
-}
-
-static int
-ipq_rcv_dev_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct net_device *dev = ptr;
-
- if (!net_eq(dev_net(dev), &init_net))
- return NOTIFY_DONE;
-
- /* Drop any packets associated with the downed device */
- if (event == NETDEV_DOWN)
- ipq_dev_drop(dev->ifindex);
- return NOTIFY_DONE;
-}
-
-static struct notifier_block ipq_dev_notifier = {
- .notifier_call = ipq_rcv_dev_event,
-};
-
-static int
-ipq_rcv_nl_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct netlink_notify *n = ptr;
-
- if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) {
- spin_lock_bh(&queue_lock);
- if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
- __ipq_reset();
- spin_unlock_bh(&queue_lock);
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block ipq_nl_notifier = {
- .notifier_call = ipq_rcv_nl_event,
-};
-
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *ipq_sysctl_header;
-
-static ctl_table ipq_table[] = {
- {
- .procname = NET_IPQ_QMAX_NAME,
- .data = &queue_maxlen,
- .maxlen = sizeof(queue_maxlen),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- { }
-};
-#endif
-
-#ifdef CONFIG_PROC_FS
-static int ip_queue_show(struct seq_file *m, void *v)
-{
- spin_lock_bh(&queue_lock);
-
- seq_printf(m,
- "Peer PID : %d\n"
- "Copy mode : %hu\n"
- "Copy range : %u\n"
- "Queue length : %u\n"
- "Queue max. length : %u\n"
- "Queue dropped : %u\n"
- "Netlink dropped : %u\n",
- peer_pid,
- copy_mode,
- copy_range,
- queue_total,
- queue_maxlen,
- queue_dropped,
- queue_user_dropped);
-
- spin_unlock_bh(&queue_lock);
- return 0;
-}
-
-static int ip_queue_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ip_queue_show, NULL);
-}
-
-static const struct file_operations ip_queue_proc_fops = {
- .open = ip_queue_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .owner = THIS_MODULE,
-};
-#endif
-
-static const struct nf_queue_handler nfqh = {
- .name = "ip_queue",
- .outfn = &ipq_enqueue_packet,
-};
-
-static int __init ip_queue_init(void)
-{
- int status = -ENOMEM;
- struct proc_dir_entry *proc __maybe_unused;
-
- netlink_register_notifier(&ipq_nl_notifier);
- ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0,
- ipq_rcv_skb, NULL, THIS_MODULE);
- if (ipqnl == NULL) {
- printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
- goto cleanup_netlink_notifier;
- }
-
-#ifdef CONFIG_PROC_FS
- proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net,
- &ip_queue_proc_fops);
- if (!proc) {
- printk(KERN_ERR "ip_queue: failed to create proc entry\n");
- goto cleanup_ipqnl;
- }
-#endif
- register_netdevice_notifier(&ipq_dev_notifier);
-#ifdef CONFIG_SYSCTL
- ipq_sysctl_header = register_net_sysctl(&init_net, "net/ipv4", ipq_table);
-#endif
- status = nf_register_queue_handler(NFPROTO_IPV4, &nfqh);
- if (status < 0) {
- printk(KERN_ERR "ip_queue: failed to register queue handler\n");
- goto cleanup_sysctl;
- }
- return status;
-
-cleanup_sysctl:
-#ifdef CONFIG_SYSCTL
- unregister_net_sysctl_table(ipq_sysctl_header);
-#endif
- unregister_netdevice_notifier(&ipq_dev_notifier);
- proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
-cleanup_ipqnl: __maybe_unused
- netlink_kernel_release(ipqnl);
- mutex_lock(&ipqnl_mutex);
- mutex_unlock(&ipqnl_mutex);
-
-cleanup_netlink_notifier:
- netlink_unregister_notifier(&ipq_nl_notifier);
- return status;
-}
-
-static void __exit ip_queue_fini(void)
-{
- nf_unregister_queue_handlers(&nfqh);
-
- ipq_flush(NULL, 0);
-
-#ifdef CONFIG_SYSCTL
- unregister_net_sysctl_table(ipq_sysctl_header);
-#endif
- unregister_netdevice_notifier(&ipq_dev_notifier);
- proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
-
- netlink_kernel_release(ipqnl);
- mutex_lock(&ipqnl_mutex);
- mutex_unlock(&ipqnl_mutex);
-
- netlink_unregister_notifier(&ipq_nl_notifier);
-}
-
-MODULE_DESCRIPTION("IPv4 packet queue handler");
-MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_FIREWALL);
-
-module_init(ip_queue_init);
-module_exit(ip_queue_fini);
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index d33cddd16fbb..10135342799e 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -25,28 +25,6 @@ config NF_CONNTRACK_IPV6
To compile it as a module, choose M here. If unsure, say N.
-config IP6_NF_QUEUE
- tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)"
- depends on INET && IPV6 && NETFILTER
- depends on NETFILTER_ADVANCED
- ---help---
-
- This option adds a queue handler to the kernel for IPv6
- packets which enables users to receive the filtered packets
- with QUEUE target using libipq.
-
- This option enables the old IPv6-only "ip6_queue" implementation
- which has been obsoleted by the new "nfnetlink_queue" code (see
- CONFIG_NETFILTER_NETLINK_QUEUE).
-
- (C) Fernando Anton 2001
- IPv64 Project - Work based in IPv64 draft by Arturo Azcorra.
- Universidad Carlos III de Madrid
- Universidad Politecnica de Alcala de Henares
- email: <fanton@it.uc3m.es>.
-
- To compile it as a module, choose M here. If unsure, say N.
-
config IP6_NF_IPTABLES
tristate "IP6 tables support (required for filtering)"
depends on INET && IPV6
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index d4dfd0a21097..534d3f216f7b 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -6,7 +6,6 @@
obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o
obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o
obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o
-obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o
obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c
deleted file mode 100644
index 3ca9303b3a19..000000000000
--- a/net/ipv6/netfilter/ip6_queue.c
+++ /dev/null
@@ -1,641 +0,0 @@
-/*
- * This is a module which is used for queueing IPv6 packets and
- * communicating with userspace via netlink.
- *
- * (C) 2001 Fernando Anton, this code is GPL.
- * IPv64 Project - Work based in IPv64 draft by Arturo Azcorra.
- * Universidad Carlos III de Madrid - Leganes (Madrid) - Spain
- * Universidad Politecnica de Alcala de Henares - Alcala de H. (Madrid) - Spain
- * email: fanton@it.uc3m.es
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/init.h>
-#include <linux/ipv6.h>
-#include <linux/notifier.h>
-#include <linux/netdevice.h>
-#include <linux/netfilter.h>
-#include <linux/netlink.h>
-#include <linux/spinlock.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-#include <net/net_namespace.h>
-#include <net/sock.h>
-#include <net/ipv6.h>
-#include <net/ip6_route.h>
-#include <net/netfilter/nf_queue.h>
-#include <linux/netfilter_ipv4/ip_queue.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-
-#define IPQ_QMAX_DEFAULT 1024
-#define IPQ_PROC_FS_NAME "ip6_queue"
-#define NET_IPQ_QMAX_NAME "ip6_queue_maxlen"
-
-typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long);
-
-static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
-static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
-static DEFINE_SPINLOCK(queue_lock);
-static int peer_pid __read_mostly;
-static unsigned int copy_range __read_mostly;
-static unsigned int queue_total;
-static unsigned int queue_dropped = 0;
-static unsigned int queue_user_dropped = 0;
-static struct sock *ipqnl __read_mostly;
-static LIST_HEAD(queue_list);
-static DEFINE_MUTEX(ipqnl_mutex);
-
-static inline void
-__ipq_enqueue_entry(struct nf_queue_entry *entry)
-{
- list_add_tail(&entry->list, &queue_list);
- queue_total++;
-}
-
-static inline int
-__ipq_set_mode(unsigned char mode, unsigned int range)
-{
- int status = 0;
-
- switch(mode) {
- case IPQ_COPY_NONE:
- case IPQ_COPY_META:
- copy_mode = mode;
- copy_range = 0;
- break;
-
- case IPQ_COPY_PACKET:
- if (range > 0xFFFF)
- range = 0xFFFF;
- copy_range = range;
- copy_mode = mode;
- break;
-
- default:
- status = -EINVAL;
-
- }
- return status;
-}
-
-static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data);
-
-static inline void
-__ipq_reset(void)
-{
- peer_pid = 0;
- net_disable_timestamp();
- __ipq_set_mode(IPQ_COPY_NONE, 0);
- __ipq_flush(NULL, 0);
-}
-
-static struct nf_queue_entry *
-ipq_find_dequeue_entry(unsigned long id)
-{
- struct nf_queue_entry *entry = NULL, *i;
-
- spin_lock_bh(&queue_lock);
-
- list_for_each_entry(i, &queue_list, list) {
- if ((unsigned long)i == id) {
- entry = i;
- break;
- }
- }
-
- if (entry) {
- list_del(&entry->list);
- queue_total--;
- }
-
- spin_unlock_bh(&queue_lock);
- return entry;
-}
-
-static void
-__ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
-{
- struct nf_queue_entry *entry, *next;
-
- list_for_each_entry_safe(entry, next, &queue_list, list) {
- if (!cmpfn || cmpfn(entry, data)) {
- list_del(&entry->list);
- queue_total--;
- nf_reinject(entry, NF_DROP);
- }
- }
-}
-
-static void
-ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
-{
- spin_lock_bh(&queue_lock);
- __ipq_flush(cmpfn, data);
- spin_unlock_bh(&queue_lock);
-}
-
-static struct sk_buff *
-ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
-{
- sk_buff_data_t old_tail;
- size_t size = 0;
- size_t data_len = 0;
- struct sk_buff *skb;
- struct ipq_packet_msg *pmsg;
- struct nlmsghdr *nlh;
- struct timeval tv;
-
- switch (ACCESS_ONCE(copy_mode)) {
- case IPQ_COPY_META:
- case IPQ_COPY_NONE:
- size = NLMSG_SPACE(sizeof(*pmsg));
- break;
-
- case IPQ_COPY_PACKET:
- if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
- (*errp = skb_checksum_help(entry->skb)))
- return NULL;
-
- data_len = ACCESS_ONCE(copy_range);
- if (data_len == 0 || data_len > entry->skb->len)
- data_len = entry->skb->len;
-
- size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
- break;
-
- default:
- *errp = -EINVAL;
- return NULL;
- }
-
- skb = alloc_skb(size, GFP_ATOMIC);
- if (!skb)
- goto nlmsg_failure;
-
- old_tail = skb->tail;
- nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
- pmsg = NLMSG_DATA(nlh);
- memset(pmsg, 0, sizeof(*pmsg));
-
- pmsg->packet_id = (unsigned long )entry;
- pmsg->data_len = data_len;
- tv = ktime_to_timeval(entry->skb->tstamp);
- pmsg->timestamp_sec = tv.tv_sec;
- pmsg->timestamp_usec = tv.tv_usec;
- pmsg->mark = entry->skb->mark;
- pmsg->hook = entry->hook;
- pmsg->hw_protocol = entry->skb->protocol;
-
- if (entry->indev)
- strcpy(pmsg->indev_name, entry->indev->name);
- else
- pmsg->indev_name[0] = '\0';
-
- if (entry->outdev)
- strcpy(pmsg->outdev_name, entry->outdev->name);
- else
- pmsg->outdev_name[0] = '\0';
-
- if (entry->indev && entry->skb->dev &&
- entry->skb->mac_header != entry->skb->network_header) {
- pmsg->hw_type = entry->skb->dev->type;
- pmsg->hw_addrlen = dev_parse_header(entry->skb, pmsg->hw_addr);
- }
-
- if (data_len)
- if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len))
- BUG();
-
- nlh->nlmsg_len = skb->tail - old_tail;
- return skb;
-
-nlmsg_failure:
- kfree_skb(skb);
- *errp = -EINVAL;
- printk(KERN_ERR "ip6_queue: error creating packet message\n");
- return NULL;
-}
-
-static int
-ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
-{
- int status = -EINVAL;
- struct sk_buff *nskb;
-
- if (copy_mode == IPQ_COPY_NONE)
- return -EAGAIN;
-
- nskb = ipq_build_packet_message(entry, &status);
- if (nskb == NULL)
- return status;
-
- spin_lock_bh(&queue_lock);
-
- if (!peer_pid)
- goto err_out_free_nskb;
-
- if (queue_total >= queue_maxlen) {
- queue_dropped++;
- status = -ENOSPC;
- if (net_ratelimit())
- printk (KERN_WARNING "ip6_queue: fill at %d entries, "
- "dropping packet(s). Dropped: %d\n", queue_total,
- queue_dropped);
- goto err_out_free_nskb;
- }
-
- /* netlink_unicast will either free the nskb or attach it to a socket */
- status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
- if (status < 0) {
- queue_user_dropped++;
- goto err_out_unlock;
- }
-
- __ipq_enqueue_entry(entry);
-
- spin_unlock_bh(&queue_lock);
- return status;
-
-err_out_free_nskb:
- kfree_skb(nskb);
-
-err_out_unlock:
- spin_unlock_bh(&queue_lock);
- return status;
-}
-
-static int
-ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct nf_queue_entry *e)
-{
- int diff;
- struct ipv6hdr *user_iph = (struct ipv6hdr *)v->payload;
- struct sk_buff *nskb;
-
- if (v->data_len < sizeof(*user_iph))
- return 0;
- diff = v->data_len - e->skb->len;
- if (diff < 0) {
- if (pskb_trim(e->skb, v->data_len))
- return -ENOMEM;
- } else if (diff > 0) {
- if (v->data_len > 0xFFFF)
- return -EINVAL;
- if (diff > skb_tailroom(e->skb)) {
- nskb = skb_copy_expand(e->skb, skb_headroom(e->skb),
- diff, GFP_ATOMIC);
- if (!nskb) {
- printk(KERN_WARNING "ip6_queue: OOM "
- "in mangle, dropping packet\n");
- return -ENOMEM;
- }
- kfree_skb(e->skb);
- e->skb = nskb;
- }
- skb_put(e->skb, diff);
- }
- if (!skb_make_writable(e->skb, v->data_len))
- return -ENOMEM;
- skb_copy_to_linear_data(e->skb, v->payload, v->data_len);
- e->skb->ip_summed = CHECKSUM_NONE;
-
- return 0;
-}
-
-static int
-ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
-{
- struct nf_queue_entry *entry;
-
- if (vmsg->value > NF_MAX_VERDICT || vmsg->value == NF_STOLEN)
- return -EINVAL;
-
- entry = ipq_find_dequeue_entry(vmsg->id);
- if (entry == NULL)
- return -ENOENT;
- else {
- int verdict = vmsg->value;
-
- if (vmsg->data_len && vmsg->data_len == len)
- if (ipq_mangle_ipv6(vmsg, entry) < 0)
- verdict = NF_DROP;
-
- nf_reinject(entry, verdict);
- return 0;
- }
-}
-
-static int
-ipq_set_mode(unsigned char mode, unsigned int range)
-{
- int status;
-
- spin_lock_bh(&queue_lock);
- status = __ipq_set_mode(mode, range);
- spin_unlock_bh(&queue_lock);
- return status;
-}
-
-static int
-ipq_receive_peer(struct ipq_peer_msg *pmsg,
- unsigned char type, unsigned int len)
-{
- int status = 0;
-
- if (len < sizeof(*pmsg))
- return -EINVAL;
-
- switch (type) {
- case IPQM_MODE:
- status = ipq_set_mode(pmsg->msg.mode.value,
- pmsg->msg.mode.range);
- break;
-
- case IPQM_VERDICT:
- status = ipq_set_verdict(&pmsg->msg.verdict,
- len - sizeof(*pmsg));
- break;
- default:
- status = -EINVAL;
- }
- return status;
-}
-
-static int
-dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
-{
- if (entry->indev)
- if (entry->indev->ifindex == ifindex)
- return 1;
-
- if (entry->outdev)
- if (entry->outdev->ifindex == ifindex)
- return 1;
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (entry->skb->nf_bridge) {
- if (entry->skb->nf_bridge->physindev &&
- entry->skb->nf_bridge->physindev->ifindex == ifindex)
- return 1;
- if (entry->skb->nf_bridge->physoutdev &&
- entry->skb->nf_bridge->physoutdev->ifindex == ifindex)
- return 1;
- }
-#endif
- return 0;
-}
-
-static void
-ipq_dev_drop(int ifindex)
-{
- ipq_flush(dev_cmp, ifindex);
-}
-
-#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
-
-static inline void
-__ipq_rcv_skb(struct sk_buff *skb)
-{
- int status, type, pid, flags;
- unsigned int nlmsglen, skblen;
- struct nlmsghdr *nlh;
- bool enable_timestamp = false;
-
- skblen = skb->len;
- if (skblen < sizeof(*nlh))
- return;
-
- nlh = nlmsg_hdr(skb);
- nlmsglen = nlh->nlmsg_len;
- if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
- return;
-
- pid = nlh->nlmsg_pid;
- flags = nlh->nlmsg_flags;
-
- if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
- RCV_SKB_FAIL(-EINVAL);
-
- if (flags & MSG_TRUNC)
- RCV_SKB_FAIL(-ECOMM);
-
- type = nlh->nlmsg_type;
- if (type < NLMSG_NOOP || type >= IPQM_MAX)
- RCV_SKB_FAIL(-EINVAL);
-
- if (type <= IPQM_BASE)
- return;
-
- if (!capable(CAP_NET_ADMIN))
- RCV_SKB_FAIL(-EPERM);
-
- spin_lock_bh(&queue_lock);
-
- if (peer_pid) {
- if (peer_pid != pid) {
- spin_unlock_bh(&queue_lock);
- RCV_SKB_FAIL(-EBUSY);
- }
- } else {
- enable_timestamp = true;
- peer_pid = pid;
- }
-
- spin_unlock_bh(&queue_lock);
- if (enable_timestamp)
- net_enable_timestamp();
-
- status = ipq_receive_peer(NLMSG_DATA(nlh), type,
- nlmsglen - NLMSG_LENGTH(0));
- if (status < 0)
- RCV_SKB_FAIL(status);
-
- if (flags & NLM_F_ACK)
- netlink_ack(skb, nlh, 0);
-}
-
-static void
-ipq_rcv_skb(struct sk_buff *skb)
-{
- mutex_lock(&ipqnl_mutex);
- __ipq_rcv_skb(skb);
- mutex_unlock(&ipqnl_mutex);
-}
-
-static int
-ipq_rcv_dev_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct net_device *dev = ptr;
-
- if (!net_eq(dev_net(dev), &init_net))
- return NOTIFY_DONE;
-
- /* Drop any packets associated with the downed device */
- if (event == NETDEV_DOWN)
- ipq_dev_drop(dev->ifindex);
- return NOTIFY_DONE;
-}
-
-static struct notifier_block ipq_dev_notifier = {
- .notifier_call = ipq_rcv_dev_event,
-};
-
-static int
-ipq_rcv_nl_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct netlink_notify *n = ptr;
-
- if (event == NETLINK_URELEASE && n->protocol == NETLINK_IP6_FW) {
- spin_lock_bh(&queue_lock);
- if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
- __ipq_reset();
- spin_unlock_bh(&queue_lock);
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block ipq_nl_notifier = {
- .notifier_call = ipq_rcv_nl_event,
-};
-
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *ipq_sysctl_header;
-
-static ctl_table ipq_table[] = {
- {
- .procname = NET_IPQ_QMAX_NAME,
- .data = &queue_maxlen,
- .maxlen = sizeof(queue_maxlen),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- { }
-};
-#endif
-
-#ifdef CONFIG_PROC_FS
-static int ip6_queue_show(struct seq_file *m, void *v)
-{
- spin_lock_bh(&queue_lock);
-
- seq_printf(m,
- "Peer PID : %d\n"
- "Copy mode : %hu\n"
- "Copy range : %u\n"
- "Queue length : %u\n"
- "Queue max. length : %u\n"
- "Queue dropped : %u\n"
- "Netfilter dropped : %u\n",
- peer_pid,
- copy_mode,
- copy_range,
- queue_total,
- queue_maxlen,
- queue_dropped,
- queue_user_dropped);
-
- spin_unlock_bh(&queue_lock);
- return 0;
-}
-
-static int ip6_queue_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ip6_queue_show, NULL);
-}
-
-static const struct file_operations ip6_queue_proc_fops = {
- .open = ip6_queue_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .owner = THIS_MODULE,
-};
-#endif
-
-static const struct nf_queue_handler nfqh = {
- .name = "ip6_queue",
- .outfn = &ipq_enqueue_packet,
-};
-
-static int __init ip6_queue_init(void)
-{
- int status = -ENOMEM;
- struct proc_dir_entry *proc __maybe_unused;
-
- netlink_register_notifier(&ipq_nl_notifier);
- ipqnl = netlink_kernel_create(&init_net, NETLINK_IP6_FW, 0,
- ipq_rcv_skb, NULL, THIS_MODULE);
- if (ipqnl == NULL) {
- printk(KERN_ERR "ip6_queue: failed to create netlink socket\n");
- goto cleanup_netlink_notifier;
- }
-
-#ifdef CONFIG_PROC_FS
- proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net,
- &ip6_queue_proc_fops);
- if (!proc) {
- printk(KERN_ERR "ip6_queue: failed to create proc entry\n");
- goto cleanup_ipqnl;
- }
-#endif
- register_netdevice_notifier(&ipq_dev_notifier);
-#ifdef CONFIG_SYSCTL
- ipq_sysctl_header = register_net_sysctl(&init_net, "net/ipv6", ipq_table);
-#endif
- status = nf_register_queue_handler(NFPROTO_IPV6, &nfqh);
- if (status < 0) {
- printk(KERN_ERR "ip6_queue: failed to register queue handler\n");
- goto cleanup_sysctl;
- }
- return status;
-
-cleanup_sysctl:
-#ifdef CONFIG_SYSCTL
- unregister_net_sysctl_table(ipq_sysctl_header);
-#endif
- unregister_netdevice_notifier(&ipq_dev_notifier);
- proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
-
-cleanup_ipqnl: __maybe_unused
- netlink_kernel_release(ipqnl);
- mutex_lock(&ipqnl_mutex);
- mutex_unlock(&ipqnl_mutex);
-
-cleanup_netlink_notifier:
- netlink_unregister_notifier(&ipq_nl_notifier);
- return status;
-}
-
-static void __exit ip6_queue_fini(void)
-{
- nf_unregister_queue_handlers(&nfqh);
-
- ipq_flush(NULL, 0);
-
-#ifdef CONFIG_SYSCTL
- unregister_net_sysctl_table(ipq_sysctl_header);
-#endif
- unregister_netdevice_notifier(&ipq_dev_notifier);
- proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
-
- netlink_kernel_release(ipqnl);
- mutex_lock(&ipqnl_mutex);
- mutex_unlock(&ipqnl_mutex);
-
- netlink_unregister_notifier(&ipq_nl_notifier);
-}
-
-MODULE_DESCRIPTION("IPv6 packet queue handler");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_IP6_FW);
-
-module_init(ip6_queue_init);
-module_exit(ip6_queue_fini);
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 4a09b7873003..1548df9a7524 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -548,6 +548,7 @@ static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
unsigned int conn_flags;
+ __u32 flags;
/* if dest is NULL, then return directly */
if (!dest)
@@ -559,17 +560,19 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
conn_flags = atomic_read(&dest->conn_flags);
if (cp->protocol != IPPROTO_UDP)
conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
+ flags = cp->flags;
/* Bind with the destination and its corresponding transmitter */
- if (cp->flags & IP_VS_CONN_F_SYNC) {
+ if (flags & IP_VS_CONN_F_SYNC) {
/* if the connection is not template and is created
* by sync, preserve the activity flag.
*/
- if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
+ if (!(flags & IP_VS_CONN_F_TEMPLATE))
conn_flags &= ~IP_VS_CONN_F_INACTIVE;
/* connections inherit forwarding method from dest */
- cp->flags &= ~IP_VS_CONN_F_FWD_MASK;
+ flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT);
}
- cp->flags |= conn_flags;
+ flags |= conn_flags;
+ cp->flags = flags;
cp->dest = dest;
IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
@@ -584,12 +587,12 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
atomic_read(&dest->refcnt));
/* Update the connection counters */
- if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
- /* It is a normal connection, so increase the inactive
- connection counter because it is in TCP SYNRECV
- state (inactive) or other protocol inacive state */
- if ((cp->flags & IP_VS_CONN_F_SYNC) &&
- (!(cp->flags & IP_VS_CONN_F_INACTIVE)))
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ /* It is a normal connection, so modify the counters
+ * according to the flags, later the protocol can
+ * update them on state change
+ */
+ if (!(flags & IP_VS_CONN_F_INACTIVE))
atomic_inc(&dest->activeconns);
else
atomic_inc(&dest->inactconns);
@@ -613,14 +616,40 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
{
struct ip_vs_dest *dest;
- if ((cp) && (!cp->dest)) {
- dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
- cp->dport, &cp->vaddr, cp->vport,
- cp->protocol, cp->fwmark, cp->flags);
+ dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
+ cp->dport, &cp->vaddr, cp->vport,
+ cp->protocol, cp->fwmark, cp->flags);
+ if (dest) {
+ struct ip_vs_proto_data *pd;
+
+ spin_lock(&cp->lock);
+ if (cp->dest) {
+ spin_unlock(&cp->lock);
+ return dest;
+ }
+
+ /* Applications work depending on the forwarding method
+ * but better to reassign them always when binding dest */
+ if (cp->app)
+ ip_vs_unbind_app(cp);
+
ip_vs_bind_dest(cp, dest);
- return dest;
- } else
- return NULL;
+ spin_unlock(&cp->lock);
+
+ /* Update its packet transmitter */
+ cp->packet_xmit = NULL;
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ ip_vs_bind_xmit_v6(cp);
+ else
+#endif
+ ip_vs_bind_xmit(cp);
+
+ pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol);
+ if (pd && atomic_read(&pd->appcnt))
+ ip_vs_bind_app(cp, pd->pp);
+ }
+ return dest;
}
@@ -743,7 +772,8 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
static void ip_vs_conn_expire(unsigned long data)
{
struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
- struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+ struct net *net = ip_vs_conn_net(cp);
+ struct netns_ipvs *ipvs = net_ipvs(net);
cp->timeout = 60*HZ;
@@ -808,6 +838,9 @@ static void ip_vs_conn_expire(unsigned long data)
atomic_read(&cp->refcnt)-1,
atomic_read(&cp->n_control));
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs));
+
ip_vs_conn_put(cp);
}
@@ -881,6 +914,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
/* Set its state and timeout */
cp->state = 0;
cp->timeout = 3*HZ;
+ cp->sync_endtime = jiffies & ~3UL;
/* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index c8f36b96f44f..a54b018c6eea 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1613,34 +1613,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
else
pkts = atomic_add_return(1, &cp->in_pkts);
- if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
- cp->protocol == IPPROTO_SCTP) {
- if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
- (pkts % sysctl_sync_period(ipvs)
- == sysctl_sync_threshold(ipvs))) ||
- (cp->old_state != cp->state &&
- ((cp->state == IP_VS_SCTP_S_CLOSED) ||
- (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
- (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
- ip_vs_sync_conn(net, cp);
- goto out;
- }
- }
-
- /* Keep this block last: TCP and others with pp->num_states <= 1 */
- else if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
- (((cp->protocol != IPPROTO_TCP ||
- cp->state == IP_VS_TCP_S_ESTABLISHED) &&
- (pkts % sysctl_sync_period(ipvs)
- == sysctl_sync_threshold(ipvs))) ||
- ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
- ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
- (cp->state == IP_VS_TCP_S_CLOSE) ||
- (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
- (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
- ip_vs_sync_conn(net, cp);
-out:
- cp->old_state = cp->state;
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ ip_vs_sync_conn(net, cp, pkts);
ip_vs_conn_put(cp);
return ret;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 37b91996bfba..dd811b8dd97c 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1599,6 +1599,10 @@ static int ip_vs_zero_all(struct net *net)
}
#ifdef CONFIG_SYSCTL
+
+static int zero;
+static int three = 3;
+
static int
proc_do_defense_mode(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -1632,7 +1636,8 @@ proc_do_sync_threshold(ctl_table *table, int write,
memcpy(val, valp, sizeof(val));
rc = proc_dointvec(table, write, buffer, lenp, ppos);
- if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
+ if (write && (valp[0] < 0 || valp[1] < 0 ||
+ (valp[0] >= valp[1] && valp[1]))) {
/* Restore the correct value */
memcpy(valp, val, sizeof(val));
}
@@ -1652,9 +1657,24 @@ proc_do_sync_mode(ctl_table *table, int write,
if ((*valp < 0) || (*valp > 1)) {
/* Restore the correct value */
*valp = val;
- } else {
- struct net *net = current->nsproxy->net_ns;
- ip_vs_sync_switch_mode(net, val);
+ }
+ }
+ return rc;
+}
+
+static int
+proc_do_sync_ports(ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = table->data;
+ int val = *valp;
+ int rc;
+
+ rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (write && (*valp != val)) {
+ if (*valp < 1 || !is_power_of_2(*valp)) {
+ /* Restore the correct value */
+ *valp = val;
}
}
return rc;
@@ -1718,6 +1738,24 @@ static struct ctl_table vs_vars[] = {
.proc_handler = &proc_do_sync_mode,
},
{
+ .procname = "sync_ports",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_do_sync_ports,
+ },
+ {
+ .procname = "sync_qlen_max",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_sock_size",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "cache_bypass",
.maxlen = sizeof(int),
.mode = 0644,
@@ -1743,6 +1781,20 @@ static struct ctl_table vs_vars[] = {
.proc_handler = proc_do_sync_threshold,
},
{
+ .procname = "sync_refresh_period",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "sync_retries",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &three,
+ },
+ {
.procname = "nat_icmp_send",
.maxlen = sizeof(int),
.mode = 0644,
@@ -3655,6 +3707,12 @@ int __net_init ip_vs_control_net_init_sysctl(struct net *net)
tbl[idx++].data = &ipvs->sysctl_snat_reroute;
ipvs->sysctl_sync_ver = 1;
tbl[idx++].data = &ipvs->sysctl_sync_ver;
+ ipvs->sysctl_sync_ports = 1;
+ tbl[idx++].data = &ipvs->sysctl_sync_ports;
+ ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
+ tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
+ ipvs->sysctl_sync_sock_size = 0;
+ tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
tbl[idx++].data = &ipvs->sysctl_cache_bypass;
tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
@@ -3662,6 +3720,10 @@ int __net_init ip_vs_control_net_init_sysctl(struct net *net)
ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
tbl[idx].data = &ipvs->sysctl_sync_threshold;
tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
+ ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
+ tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
+ ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
+ tbl[idx++].data = &ipvs->sysctl_sync_retries;
tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 1a53a7a2fff0..8b7dca9ea422 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -149,7 +149,7 @@ static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
/* allocate the DH table for this service */
tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
- GFP_ATOMIC);
+ GFP_KERNEL);
if (tbl == NULL)
return -ENOMEM;
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 510f2b5a5855..b20b29c903ef 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -485,7 +485,7 @@ static struct pernet_operations ip_vs_ftp_ops = {
.exit = __ip_vs_ftp_exit,
};
-int __init ip_vs_ftp_init(void)
+static int __init ip_vs_ftp_init(void)
{
int rv;
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 9b0de9a0e08e..df646ccf08a7 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -342,7 +342,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
/*
* Allocate the ip_vs_lblc_table for this service
*/
- tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
+ tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
if (tbl == NULL)
return -ENOMEM;
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 9dcd39a48897..570e31ea427a 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -511,7 +511,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
/*
* Allocate the ip_vs_lblcr_table for this service
*/
- tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
+ tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
if (tbl == NULL)
return -ENOMEM;
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index fdc82ad9cc0e..50d82186da87 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -68,7 +68,7 @@ register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
struct netns_ipvs *ipvs = net_ipvs(net);
unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
struct ip_vs_proto_data *pd =
- kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC);
+ kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL);
if (!pd)
return -ENOMEM;
@@ -156,7 +156,7 @@ EXPORT_SYMBOL(ip_vs_proto_get);
/*
* get ip_vs_protocol object data by netns and proto
*/
-struct ip_vs_proto_data *
+static struct ip_vs_proto_data *
__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
{
struct ip_vs_proto_data *pd;
@@ -199,7 +199,7 @@ void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags)
int *
ip_vs_create_timeout_table(int *table, int size)
{
- return kmemdup(table, size, GFP_ATOMIC);
+ return kmemdup(table, size, GFP_KERNEL);
}
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 91e97ee049be..05126521743e 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -162,7 +162,7 @@ static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
/* allocate the SH table for this service */
tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
- GFP_ATOMIC);
+ GFP_KERNEL);
if (tbl == NULL)
return -ENOMEM;
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index bf5e538af67b..effa10c9e4e3 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -196,6 +196,7 @@ struct ip_vs_sync_thread_data {
struct net *net;
struct socket *sock;
char *buf;
+ int id;
};
/* Version 0 definition of packet sizes */
@@ -271,13 +272,6 @@ struct ip_vs_sync_buff {
unsigned char *end;
};
-/* multicast addr */
-static struct sockaddr_in mcast_addr = {
- .sin_family = AF_INET,
- .sin_port = cpu_to_be16(IP_VS_SYNC_PORT),
- .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
-};
-
/*
* Copy of struct ip_vs_seq
* From unaligned network order to aligned host order
@@ -300,18 +294,22 @@ static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
put_unaligned_be32(ho->previous_delta, &no->previous_delta);
}
-static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs)
+static inline struct ip_vs_sync_buff *
+sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
{
struct ip_vs_sync_buff *sb;
spin_lock_bh(&ipvs->sync_lock);
- if (list_empty(&ipvs->sync_queue)) {
+ if (list_empty(&ms->sync_queue)) {
sb = NULL;
+ __set_current_state(TASK_INTERRUPTIBLE);
} else {
- sb = list_entry(ipvs->sync_queue.next,
- struct ip_vs_sync_buff,
+ sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff,
list);
list_del(&sb->list);
+ ms->sync_queue_len--;
+ if (!ms->sync_queue_len)
+ ms->sync_queue_delay = 0;
}
spin_unlock_bh(&ipvs->sync_lock);
@@ -334,7 +332,7 @@ ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
kfree(sb);
return NULL;
}
- sb->mesg->reserved = 0; /* old nr_conns i.e. must be zeo now */
+ sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */
sb->mesg->version = SYNC_PROTO_VER;
sb->mesg->syncid = ipvs->master_syncid;
sb->mesg->size = sizeof(struct ip_vs_sync_mesg);
@@ -353,14 +351,22 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
kfree(sb);
}
-static inline void sb_queue_tail(struct netns_ipvs *ipvs)
+static inline void sb_queue_tail(struct netns_ipvs *ipvs,
+ struct ipvs_master_sync_state *ms)
{
- struct ip_vs_sync_buff *sb = ipvs->sync_buff;
+ struct ip_vs_sync_buff *sb = ms->sync_buff;
spin_lock(&ipvs->sync_lock);
- if (ipvs->sync_state & IP_VS_STATE_MASTER)
- list_add_tail(&sb->list, &ipvs->sync_queue);
- else
+ if (ipvs->sync_state & IP_VS_STATE_MASTER &&
+ ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) {
+ if (!ms->sync_queue_len)
+ schedule_delayed_work(&ms->master_wakeup_work,
+ max(IPVS_SYNC_SEND_DELAY, 1));
+ ms->sync_queue_len++;
+ list_add_tail(&sb->list, &ms->sync_queue);
+ if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE)
+ wake_up_process(ms->master_thread);
+ } else
ip_vs_sync_buff_release(sb);
spin_unlock(&ipvs->sync_lock);
}
@@ -370,49 +376,26 @@ static inline void sb_queue_tail(struct netns_ipvs *ipvs)
* than the specified time or the specified time is zero.
*/
static inline struct ip_vs_sync_buff *
-get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time)
+get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms,
+ unsigned long time)
{
struct ip_vs_sync_buff *sb;
spin_lock_bh(&ipvs->sync_buff_lock);
- if (ipvs->sync_buff &&
- time_after_eq(jiffies - ipvs->sync_buff->firstuse, time)) {
- sb = ipvs->sync_buff;
- ipvs->sync_buff = NULL;
+ sb = ms->sync_buff;
+ if (sb && time_after_eq(jiffies - sb->firstuse, time)) {
+ ms->sync_buff = NULL;
+ __set_current_state(TASK_RUNNING);
} else
sb = NULL;
spin_unlock_bh(&ipvs->sync_buff_lock);
return sb;
}
-/*
- * Switch mode from sending version 0 or 1
- * - must handle sync_buf
- */
-void ip_vs_sync_switch_mode(struct net *net, int mode)
+static inline int
+select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
- if (!(ipvs->sync_state & IP_VS_STATE_MASTER))
- return;
- if (mode == sysctl_sync_ver(ipvs) || !ipvs->sync_buff)
- return;
-
- spin_lock_bh(&ipvs->sync_buff_lock);
- /* Buffer empty ? then let buf_create do the job */
- if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) {
- kfree(ipvs->sync_buff);
- ipvs->sync_buff = NULL;
- } else {
- spin_lock_bh(&ipvs->sync_lock);
- if (ipvs->sync_state & IP_VS_STATE_MASTER)
- list_add_tail(&ipvs->sync_buff->list,
- &ipvs->sync_queue);
- else
- ip_vs_sync_buff_release(ipvs->sync_buff);
- spin_unlock_bh(&ipvs->sync_lock);
- }
- spin_unlock_bh(&ipvs->sync_buff_lock);
+ return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask;
}
/*
@@ -442,15 +425,101 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
return sb;
}
+/* Check if conn should be synced.
+ * pkts: conn packets, use sysctl_sync_threshold to avoid packet check
+ * - (1) sync_refresh_period: reduce sync rate. Additionally, retry
+ * sync_retries times with period of sync_refresh_period/8
+ * - (2) if both sync_refresh_period and sync_period are 0 send sync only
+ * for state changes or only once when pkts matches sync_threshold
+ * - (3) templates: rate can be reduced only with sync_refresh_period or
+ * with (2)
+ */
+static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
+ struct ip_vs_conn *cp, int pkts)
+{
+ unsigned long orig = ACCESS_ONCE(cp->sync_endtime);
+ unsigned long now = jiffies;
+ unsigned long n = (now + cp->timeout) & ~3UL;
+ unsigned int sync_refresh_period;
+ int sync_period;
+ int force;
+
+ /* Check if we sync in current state */
+ if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
+ force = 0;
+ else if (likely(cp->protocol == IPPROTO_TCP)) {
+ if (!((1 << cp->state) &
+ ((1 << IP_VS_TCP_S_ESTABLISHED) |
+ (1 << IP_VS_TCP_S_FIN_WAIT) |
+ (1 << IP_VS_TCP_S_CLOSE) |
+ (1 << IP_VS_TCP_S_CLOSE_WAIT) |
+ (1 << IP_VS_TCP_S_TIME_WAIT))))
+ return 0;
+ force = cp->state != cp->old_state;
+ if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
+ goto set;
+ } else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
+ if (!((1 << cp->state) &
+ ((1 << IP_VS_SCTP_S_ESTABLISHED) |
+ (1 << IP_VS_SCTP_S_CLOSED) |
+ (1 << IP_VS_SCTP_S_SHUT_ACK_CLI) |
+ (1 << IP_VS_SCTP_S_SHUT_ACK_SER))))
+ return 0;
+ force = cp->state != cp->old_state;
+ if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
+ goto set;
+ } else {
+ /* UDP or another protocol with single state */
+ force = 0;
+ }
+
+ sync_refresh_period = sysctl_sync_refresh_period(ipvs);
+ if (sync_refresh_period > 0) {
+ long diff = n - orig;
+ long min_diff = max(cp->timeout >> 1, 10UL * HZ);
+
+ /* Avoid sync if difference is below sync_refresh_period
+ * and below the half timeout.
+ */
+ if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
+ int retries = orig & 3;
+
+ if (retries >= sysctl_sync_retries(ipvs))
+ return 0;
+ if (time_before(now, orig - cp->timeout +
+ (sync_refresh_period >> 3)))
+ return 0;
+ n |= retries + 1;
+ }
+ }
+ sync_period = sysctl_sync_period(ipvs);
+ if (sync_period > 0) {
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
+ pkts % sync_period != sysctl_sync_threshold(ipvs))
+ return 0;
+ } else if (sync_refresh_period <= 0 &&
+ pkts != sysctl_sync_threshold(ipvs))
+ return 0;
+
+set:
+ cp->old_state = cp->state;
+ n = cmpxchg(&cp->sync_endtime, orig, n);
+ return n == orig || force;
+}
+
/*
* Version 0 , could be switched in by sys_ctl.
* Add an ip_vs_conn information into the current sync_buff.
*/
-void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
+static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
+ int pkts)
{
struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_sync_mesg_v0 *m;
struct ip_vs_sync_conn_v0 *s;
+ struct ip_vs_sync_buff *buff;
+ struct ipvs_master_sync_state *ms;
+ int id;
int len;
if (unlikely(cp->af != AF_INET))
@@ -459,21 +528,41 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
return;
+ if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+ return;
+
spin_lock(&ipvs->sync_buff_lock);
- if (!ipvs->sync_buff) {
- ipvs->sync_buff =
- ip_vs_sync_buff_create_v0(ipvs);
- if (!ipvs->sync_buff) {
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ spin_unlock(&ipvs->sync_buff_lock);
+ return;
+ }
+
+ id = select_master_thread_id(ipvs, cp);
+ ms = &ipvs->ms[id];
+ buff = ms->sync_buff;
+ if (buff) {
+ m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
+ /* Send buffer if it is for v1 */
+ if (!m->nr_conns) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
+ buff = NULL;
+ }
+ }
+ if (!buff) {
+ buff = ip_vs_sync_buff_create_v0(ipvs);
+ if (!buff) {
spin_unlock(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
return;
}
+ ms->sync_buff = buff;
}
len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
SIMPLE_CONN_SIZE;
- m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg;
- s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head;
+ m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
+ s = (struct ip_vs_sync_conn_v0 *) buff->head;
/* copy members */
s->reserved = 0;
@@ -494,18 +583,24 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
m->nr_conns++;
m->size += len;
- ipvs->sync_buff->head += len;
+ buff->head += len;
/* check if there is a space for next one */
- if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) {
- sb_queue_tail(ipvs);
- ipvs->sync_buff = NULL;
+ if (buff->head + FULL_CONN_SIZE > buff->end) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
}
spin_unlock(&ipvs->sync_buff_lock);
/* synchronize its controller if it has */
- if (cp->control)
- ip_vs_sync_conn(net, cp->control);
+ cp = cp->control;
+ if (cp) {
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ pkts = atomic_add_return(1, &cp->in_pkts);
+ else
+ pkts = sysctl_sync_threshold(ipvs);
+ ip_vs_sync_conn(net, cp->control, pkts);
+ }
}
/*
@@ -513,23 +608,29 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
* Called by ip_vs_in.
* Sending Version 1 messages
*/
-void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp)
+void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts)
{
struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_sync_mesg *m;
union ip_vs_sync_conn *s;
+ struct ip_vs_sync_buff *buff;
+ struct ipvs_master_sync_state *ms;
+ int id;
__u8 *p;
unsigned int len, pe_name_len, pad;
/* Handle old version of the protocol */
if (sysctl_sync_ver(ipvs) == 0) {
- ip_vs_sync_conn_v0(net, cp);
+ ip_vs_sync_conn_v0(net, cp, pkts);
return;
}
/* Do not sync ONE PACKET */
if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
goto control;
sloop:
+ if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+ goto control;
+
/* Sanity checks */
pe_name_len = 0;
if (cp->pe_data_len) {
@@ -541,6 +642,13 @@ sloop:
}
spin_lock(&ipvs->sync_buff_lock);
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ spin_unlock(&ipvs->sync_buff_lock);
+ return;
+ }
+
+ id = select_master_thread_id(ipvs, cp);
+ ms = &ipvs->ms[id];
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
@@ -559,27 +667,32 @@ sloop:
/* check if there is a space for this one */
pad = 0;
- if (ipvs->sync_buff) {
- pad = (4 - (size_t)ipvs->sync_buff->head) & 3;
- if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) {
- sb_queue_tail(ipvs);
- ipvs->sync_buff = NULL;
+ buff = ms->sync_buff;
+ if (buff) {
+ m = buff->mesg;
+ pad = (4 - (size_t) buff->head) & 3;
+ /* Send buffer if it is for v0 */
+ if (buff->head + len + pad > buff->end || m->reserved) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
+ buff = NULL;
pad = 0;
}
}
- if (!ipvs->sync_buff) {
- ipvs->sync_buff = ip_vs_sync_buff_create(ipvs);
- if (!ipvs->sync_buff) {
+ if (!buff) {
+ buff = ip_vs_sync_buff_create(ipvs);
+ if (!buff) {
spin_unlock(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
return;
}
+ ms->sync_buff = buff;
+ m = buff->mesg;
}
- m = ipvs->sync_buff->mesg;
- p = ipvs->sync_buff->head;
- ipvs->sync_buff->head += pad + len;
+ p = buff->head;
+ buff->head += pad + len;
m->size += pad + len;
/* Add ev. padding from prev. sync_conn */
while (pad--)
@@ -644,16 +757,10 @@ control:
cp = cp->control;
if (!cp)
return;
- /*
- * Reduce sync rate for templates
- * i.e only increment in_pkts for Templates.
- */
- if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
- int pkts = atomic_add_return(1, &cp->in_pkts);
-
- if (pkts % sysctl_sync_period(ipvs) != 1)
- return;
- }
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ pkts = atomic_add_return(1, &cp->in_pkts);
+ else
+ pkts = sysctl_sync_threshold(ipvs);
goto sloop;
}
@@ -731,9 +838,32 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
else
cp = ip_vs_ct_in_get(param);
- if (cp && param->pe_data) /* Free pe_data */
+ if (cp) {
+ /* Free pe_data */
kfree(param->pe_data);
- if (!cp) {
+
+ dest = cp->dest;
+ spin_lock(&cp->lock);
+ if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
+ !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
+ if (flags & IP_VS_CONN_F_INACTIVE) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ } else {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ }
+ }
+ flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
+ flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
+ cp->flags = flags;
+ spin_unlock(&cp->lock);
+ if (!dest) {
+ dest = ip_vs_try_bind_dest(cp);
+ if (dest)
+ atomic_dec(&dest->refcnt);
+ }
+ } else {
/*
* Find the appropriate destination for the connection.
* If it is not found the connection will remain unbound
@@ -742,18 +872,6 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
param->vport, protocol, fwmark, flags);
- /* Set the approprite ativity flag */
- if (protocol == IPPROTO_TCP) {
- if (state != IP_VS_TCP_S_ESTABLISHED)
- flags |= IP_VS_CONN_F_INACTIVE;
- else
- flags &= ~IP_VS_CONN_F_INACTIVE;
- } else if (protocol == IPPROTO_SCTP) {
- if (state != IP_VS_SCTP_S_ESTABLISHED)
- flags |= IP_VS_CONN_F_INACTIVE;
- else
- flags &= ~IP_VS_CONN_F_INACTIVE;
- }
cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
if (dest)
atomic_dec(&dest->refcnt);
@@ -763,34 +881,6 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
return;
}
- } else if (!cp->dest) {
- dest = ip_vs_try_bind_dest(cp);
- if (dest)
- atomic_dec(&dest->refcnt);
- } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
- (cp->state != state)) {
- /* update active/inactive flag for the connection */
- dest = cp->dest;
- if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state != IP_VS_TCP_S_ESTABLISHED)) {
- atomic_dec(&dest->activeconns);
- atomic_inc(&dest->inactconns);
- cp->flags |= IP_VS_CONN_F_INACTIVE;
- } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state == IP_VS_TCP_S_ESTABLISHED)) {
- atomic_inc(&dest->activeconns);
- atomic_dec(&dest->inactconns);
- cp->flags &= ~IP_VS_CONN_F_INACTIVE;
- }
- } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
- (cp->state != state)) {
- dest = cp->dest;
- if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state != IP_VS_SCTP_S_ESTABLISHED)) {
- atomic_dec(&dest->activeconns);
- atomic_inc(&dest->inactconns);
- cp->flags &= ~IP_VS_CONN_F_INACTIVE;
- }
}
if (opt)
@@ -1149,6 +1239,28 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
/*
+ * Setup sndbuf (mode=1) or rcvbuf (mode=0)
+ */
+static void set_sock_size(struct sock *sk, int mode, int val)
+{
+ /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */
+ /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */
+ lock_sock(sk);
+ if (mode) {
+ val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
+ sysctl_wmem_max);
+ sk->sk_sndbuf = val * 2;
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ } else {
+ val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
+ sysctl_rmem_max);
+ sk->sk_rcvbuf = val * 2;
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ }
+ release_sock(sk);
+}
+
+/*
* Setup loopback of outgoing multicasts on a sending socket
*/
static void set_mcast_loop(struct sock *sk, u_char loop)
@@ -1298,9 +1410,15 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
/*
* Set up sending multicast socket over UDP
*/
-static struct socket *make_send_sock(struct net *net)
+static struct socket *make_send_sock(struct net *net, int id)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ /* multicast addr */
+ struct sockaddr_in mcast_addr = {
+ .sin_family = AF_INET,
+ .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id),
+ .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
+ };
struct socket *sock;
int result;
@@ -1324,6 +1442,9 @@ static struct socket *make_send_sock(struct net *net)
set_mcast_loop(sock->sk, 0);
set_mcast_ttl(sock->sk, 1);
+ result = sysctl_sync_sock_size(ipvs);
+ if (result > 0)
+ set_sock_size(sock->sk, 1, result);
result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
if (result < 0) {
@@ -1349,9 +1470,15 @@ error:
/*
* Set up receiving multicast socket over UDP
*/
-static struct socket *make_receive_sock(struct net *net)
+static struct socket *make_receive_sock(struct net *net, int id)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ /* multicast addr */
+ struct sockaddr_in mcast_addr = {
+ .sin_family = AF_INET,
+ .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id),
+ .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
+ };
struct socket *sock;
int result;
@@ -1369,6 +1496,9 @@ static struct socket *make_receive_sock(struct net *net)
sk_change_net(sock->sk, net);
/* it is equivalent to the REUSEADDR option in user-space */
sock->sk->sk_reuse = SK_CAN_REUSE;
+ result = sysctl_sync_sock_size(ipvs);
+ if (result > 0)
+ set_sock_size(sock->sk, 0, result);
result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
sizeof(struct sockaddr));
@@ -1411,18 +1541,22 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
return len;
}
-static void
+static int
ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
{
int msize;
+ int ret;
msize = msg->size;
/* Put size in network byte order */
msg->size = htons(msg->size);
- if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
- pr_err("ip_vs_send_async error\n");
+ ret = ip_vs_send_async(sock, (char *)msg, msize);
+ if (ret >= 0 || ret == -EAGAIN)
+ return ret;
+ pr_err("ip_vs_send_async error %d\n", ret);
+ return 0;
}
static int
@@ -1438,48 +1572,90 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
iov.iov_base = buffer;
iov.iov_len = (size_t)buflen;
- len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
+ len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT);
if (len < 0)
- return -1;
+ return len;
LeaveFunction(7);
return len;
}
+/* Wakeup the master thread for sending */
+static void master_wakeup_work_handler(struct work_struct *work)
+{
+ struct ipvs_master_sync_state *ms =
+ container_of(work, struct ipvs_master_sync_state,
+ master_wakeup_work.work);
+ struct netns_ipvs *ipvs = ms->ipvs;
+
+ spin_lock_bh(&ipvs->sync_lock);
+ if (ms->sync_queue_len &&
+ ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
+ ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
+ wake_up_process(ms->master_thread);
+ }
+ spin_unlock_bh(&ipvs->sync_lock);
+}
+
+/* Get next buffer to send */
+static inline struct ip_vs_sync_buff *
+next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
+{
+ struct ip_vs_sync_buff *sb;
+
+ sb = sb_dequeue(ipvs, ms);
+ if (sb)
+ return sb;
+ /* Do not delay entries in buffer for more than 2 seconds */
+ return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME);
+}
static int sync_thread_master(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+ struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
+ struct sock *sk = tinfo->sock->sk;
struct ip_vs_sync_buff *sb;
pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
- "syncid = %d\n",
- ipvs->master_mcast_ifn, ipvs->master_syncid);
+ "syncid = %d, id = %d\n",
+ ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id);
- while (!kthread_should_stop()) {
- while ((sb = sb_dequeue(ipvs))) {
- ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
- ip_vs_sync_buff_release(sb);
+ for (;;) {
+ sb = next_sync_buff(ipvs, ms);
+ if (unlikely(kthread_should_stop()))
+ break;
+ if (!sb) {
+ schedule_timeout(IPVS_SYNC_CHECK_PERIOD);
+ continue;
}
-
- /* check if entries stay in ipvs->sync_buff for 2 seconds */
- sb = get_curr_sync_buff(ipvs, 2 * HZ);
- if (sb) {
- ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
- ip_vs_sync_buff_release(sb);
+ while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
+ int ret = 0;
+
+ __wait_event_interruptible(*sk_sleep(sk),
+ sock_writeable(sk) ||
+ kthread_should_stop(),
+ ret);
+ if (unlikely(kthread_should_stop()))
+ goto done;
}
-
- schedule_timeout_interruptible(HZ);
+ ip_vs_sync_buff_release(sb);
}
+done:
+ __set_current_state(TASK_RUNNING);
+ if (sb)
+ ip_vs_sync_buff_release(sb);
+
/* clean up the sync_buff queue */
- while ((sb = sb_dequeue(ipvs)))
+ while ((sb = sb_dequeue(ipvs, ms)))
ip_vs_sync_buff_release(sb);
+ __set_current_state(TASK_RUNNING);
/* clean up the current sync_buff */
- sb = get_curr_sync_buff(ipvs, 0);
+ sb = get_curr_sync_buff(ipvs, ms, 0);
if (sb)
ip_vs_sync_buff_release(sb);
@@ -1498,8 +1674,8 @@ static int sync_thread_backup(void *data)
int len;
pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
- "syncid = %d\n",
- ipvs->backup_mcast_ifn, ipvs->backup_syncid);
+ "syncid = %d, id = %d\n",
+ ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id);
while (!kthread_should_stop()) {
wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
@@ -1511,7 +1687,8 @@ static int sync_thread_backup(void *data)
len = ip_vs_receive(tinfo->sock, tinfo->buf,
ipvs->recv_mesg_maxlen);
if (len <= 0) {
- pr_err("receiving message error\n");
+ if (len != -EAGAIN)
+ pr_err("receiving message error\n");
break;
}
@@ -1535,86 +1712,140 @@ static int sync_thread_backup(void *data)
int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
{
struct ip_vs_sync_thread_data *tinfo;
- struct task_struct **realtask, *task;
+ struct task_struct **array = NULL, *task;
struct socket *sock;
struct netns_ipvs *ipvs = net_ipvs(net);
- char *name, *buf = NULL;
+ char *name;
int (*threadfn)(void *data);
+ int id, count;
int result = -ENOMEM;
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
sizeof(struct ip_vs_sync_conn_v0));
+ if (!ipvs->sync_state) {
+ count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
+ ipvs->threads_mask = count - 1;
+ } else
+ count = ipvs->threads_mask + 1;
if (state == IP_VS_STATE_MASTER) {
- if (ipvs->master_thread)
+ if (ipvs->ms)
return -EEXIST;
strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
sizeof(ipvs->master_mcast_ifn));
ipvs->master_syncid = syncid;
- realtask = &ipvs->master_thread;
- name = "ipvs_master:%d";
+ name = "ipvs-m:%d:%d";
threadfn = sync_thread_master;
- sock = make_send_sock(net);
} else if (state == IP_VS_STATE_BACKUP) {
- if (ipvs->backup_thread)
+ if (ipvs->backup_threads)
return -EEXIST;
strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
sizeof(ipvs->backup_mcast_ifn));
ipvs->backup_syncid = syncid;
- realtask = &ipvs->backup_thread;
- name = "ipvs_backup:%d";
+ name = "ipvs-b:%d:%d";
threadfn = sync_thread_backup;
- sock = make_receive_sock(net);
} else {
return -EINVAL;
}
- if (IS_ERR(sock)) {
- result = PTR_ERR(sock);
- goto out;
- }
+ if (state == IP_VS_STATE_MASTER) {
+ struct ipvs_master_sync_state *ms;
- set_sync_mesg_maxlen(net, state);
- if (state == IP_VS_STATE_BACKUP) {
- buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL);
- if (!buf)
- goto outsocket;
+ ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL);
+ if (!ipvs->ms)
+ goto out;
+ ms = ipvs->ms;
+ for (id = 0; id < count; id++, ms++) {
+ INIT_LIST_HEAD(&ms->sync_queue);
+ ms->sync_queue_len = 0;
+ ms->sync_queue_delay = 0;
+ INIT_DELAYED_WORK(&ms->master_wakeup_work,
+ master_wakeup_work_handler);
+ ms->ipvs = ipvs;
+ }
+ } else {
+ array = kzalloc(count * sizeof(struct task_struct *),
+ GFP_KERNEL);
+ if (!array)
+ goto out;
}
+ set_sync_mesg_maxlen(net, state);
- tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
- if (!tinfo)
- goto outbuf;
-
- tinfo->net = net;
- tinfo->sock = sock;
- tinfo->buf = buf;
+ tinfo = NULL;
+ for (id = 0; id < count; id++) {
+ if (state == IP_VS_STATE_MASTER)
+ sock = make_send_sock(net, id);
+ else
+ sock = make_receive_sock(net, id);
+ if (IS_ERR(sock)) {
+ result = PTR_ERR(sock);
+ goto outtinfo;
+ }
+ tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+ if (!tinfo)
+ goto outsocket;
+ tinfo->net = net;
+ tinfo->sock = sock;
+ if (state == IP_VS_STATE_BACKUP) {
+ tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen,
+ GFP_KERNEL);
+ if (!tinfo->buf)
+ goto outtinfo;
+ }
+ tinfo->id = id;
- task = kthread_run(threadfn, tinfo, name, ipvs->gen);
- if (IS_ERR(task)) {
- result = PTR_ERR(task);
- goto outtinfo;
+ task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
+ if (IS_ERR(task)) {
+ result = PTR_ERR(task);
+ goto outtinfo;
+ }
+ tinfo = NULL;
+ if (state == IP_VS_STATE_MASTER)
+ ipvs->ms[id].master_thread = task;
+ else
+ array[id] = task;
}
/* mark as active */
- *realtask = task;
+
+ if (state == IP_VS_STATE_BACKUP)
+ ipvs->backup_threads = array;
+ spin_lock_bh(&ipvs->sync_buff_lock);
ipvs->sync_state |= state;
+ spin_unlock_bh(&ipvs->sync_buff_lock);
/* increase the module use count */
ip_vs_use_count_inc();
return 0;
-outtinfo:
- kfree(tinfo);
-outbuf:
- kfree(buf);
outsocket:
sk_release_kernel(sock->sk);
+
+outtinfo:
+ if (tinfo) {
+ sk_release_kernel(tinfo->sock->sk);
+ kfree(tinfo->buf);
+ kfree(tinfo);
+ }
+ count = id;
+ while (count-- > 0) {
+ if (state == IP_VS_STATE_MASTER)
+ kthread_stop(ipvs->ms[count].master_thread);
+ else
+ kthread_stop(array[count]);
+ }
+ kfree(array);
+
out:
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ kfree(ipvs->ms);
+ ipvs->ms = NULL;
+ }
return result;
}
@@ -1622,38 +1853,60 @@ out:
int stop_sync_thread(struct net *net, int state)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ struct task_struct **array;
+ int id;
int retc = -EINVAL;
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
if (state == IP_VS_STATE_MASTER) {
- if (!ipvs->master_thread)
+ if (!ipvs->ms)
return -ESRCH;
- pr_info("stopping master sync thread %d ...\n",
- task_pid_nr(ipvs->master_thread));
-
/*
* The lock synchronizes with sb_queue_tail(), so that we don't
* add sync buffers to the queue, when we are already in
* progress of stopping the master sync daemon.
*/
- spin_lock_bh(&ipvs->sync_lock);
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ spin_lock(&ipvs->sync_lock);
ipvs->sync_state &= ~IP_VS_STATE_MASTER;
- spin_unlock_bh(&ipvs->sync_lock);
- retc = kthread_stop(ipvs->master_thread);
- ipvs->master_thread = NULL;
+ spin_unlock(&ipvs->sync_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+
+ retc = 0;
+ for (id = ipvs->threads_mask; id >= 0; id--) {
+ struct ipvs_master_sync_state *ms = &ipvs->ms[id];
+ int ret;
+
+ pr_info("stopping master sync thread %d ...\n",
+ task_pid_nr(ms->master_thread));
+ cancel_delayed_work_sync(&ms->master_wakeup_work);
+ ret = kthread_stop(ms->master_thread);
+ if (retc >= 0)
+ retc = ret;
+ }
+ kfree(ipvs->ms);
+ ipvs->ms = NULL;
} else if (state == IP_VS_STATE_BACKUP) {
- if (!ipvs->backup_thread)
+ if (!ipvs->backup_threads)
return -ESRCH;
- pr_info("stopping backup sync thread %d ...\n",
- task_pid_nr(ipvs->backup_thread));
-
ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
- retc = kthread_stop(ipvs->backup_thread);
- ipvs->backup_thread = NULL;
+ array = ipvs->backup_threads;
+ retc = 0;
+ for (id = ipvs->threads_mask; id >= 0; id--) {
+ int ret;
+
+ pr_info("stopping backup sync thread %d ...\n",
+ task_pid_nr(array[id]));
+ ret = kthread_stop(array[id]);
+ if (retc >= 0)
+ retc = ret;
+ }
+ kfree(array);
+ ipvs->backup_threads = NULL;
}
/* decrease the module use count */
@@ -1670,13 +1923,8 @@ int __net_init ip_vs_sync_net_init(struct net *net)
struct netns_ipvs *ipvs = net_ipvs(net);
__mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
- INIT_LIST_HEAD(&ipvs->sync_queue);
spin_lock_init(&ipvs->sync_lock);
spin_lock_init(&ipvs->sync_buff_lock);
-
- ipvs->sync_mcast_addr.sin_family = AF_INET;
- ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT);
- ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP);
return 0;
}
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index fd0d4e09876a..231be7dd547a 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -84,7 +84,7 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
/*
* Allocate the mark variable for WRR scheduling
*/
- mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
+ mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL);
if (mark == NULL)
return -ENOMEM;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index cf0747c5741f..32c59093146e 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1336,7 +1336,6 @@ static void nf_conntrack_cleanup_init_net(void)
while (untrack_refs() > 0)
schedule();
- nf_conntrack_helper_fini();
nf_conntrack_proto_fini();
#ifdef CONFIG_NF_CONNTRACK_ZONES
nf_ct_extend_unregister(&nf_ct_zone_extend);
@@ -1354,6 +1353,7 @@ static void nf_conntrack_cleanup_net(struct net *net)
}
nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
+ nf_conntrack_helper_fini(net);
nf_conntrack_timeout_fini(net);
nf_conntrack_ecache_fini(net);
nf_conntrack_tstamp_fini(net);
@@ -1504,10 +1504,6 @@ static int nf_conntrack_init_init_net(void)
if (ret < 0)
goto err_proto;
- ret = nf_conntrack_helper_init();
- if (ret < 0)
- goto err_helper;
-
#ifdef CONFIG_NF_CONNTRACK_ZONES
ret = nf_ct_extend_register(&nf_ct_zone_extend);
if (ret < 0)
@@ -1525,10 +1521,8 @@ static int nf_conntrack_init_init_net(void)
#ifdef CONFIG_NF_CONNTRACK_ZONES
err_extend:
- nf_conntrack_helper_fini();
-#endif
-err_helper:
nf_conntrack_proto_fini();
+#endif
err_proto:
return ret;
}
@@ -1589,9 +1583,14 @@ static int nf_conntrack_init_net(struct net *net)
ret = nf_conntrack_timeout_init(net);
if (ret < 0)
goto err_timeout;
+ ret = nf_conntrack_helper_init(net);
+ if (ret < 0)
+ goto err_helper;
return 0;
+err_helper:
+ nf_conntrack_timeout_fini(net);
err_timeout:
nf_conntrack_ecache_fini(net);
err_ecache:
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index b924f3a49a8e..e7be79e640de 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -84,7 +84,7 @@ EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
int nf_conntrack_register_notifier(struct net *net,
struct nf_ct_event_notifier *new)
{
- int ret = 0;
+ int ret;
struct nf_ct_event_notifier *notify;
mutex_lock(&nf_ct_ecache_mutex);
@@ -95,8 +95,7 @@ int nf_conntrack_register_notifier(struct net *net,
goto out_unlock;
}
rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new);
- mutex_unlock(&nf_ct_ecache_mutex);
- return ret;
+ ret = 0;
out_unlock:
mutex_unlock(&nf_ct_ecache_mutex);
@@ -121,7 +120,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
int nf_ct_expect_register_notifier(struct net *net,
struct nf_exp_event_notifier *new)
{
- int ret = 0;
+ int ret;
struct nf_exp_event_notifier *notify;
mutex_lock(&nf_ct_ecache_mutex);
@@ -132,8 +131,7 @@ int nf_ct_expect_register_notifier(struct net *net,
goto out_unlock;
}
rcu_assign_pointer(net->ct.nf_expect_event_cb, new);
- mutex_unlock(&nf_ct_ecache_mutex);
- return ret;
+ ret = 0;
out_unlock:
mutex_unlock(&nf_ct_ecache_mutex);
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 436b7cb79ba4..4fa2ff961f5a 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -34,6 +34,67 @@ static struct hlist_head *nf_ct_helper_hash __read_mostly;
static unsigned int nf_ct_helper_hsize __read_mostly;
static unsigned int nf_ct_helper_count __read_mostly;
+static bool nf_ct_auto_assign_helper __read_mostly = true;
+module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644);
+MODULE_PARM_DESC(nf_conntrack_helper,
+ "Enable automatic conntrack helper assignment (default 1)");
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table helper_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_helper",
+ .data = &init_net.ct.sysctl_auto_assign_helper,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+
+static int nf_conntrack_helper_init_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(helper_sysctl_table, sizeof(helper_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto out;
+
+ table[0].data = &net->ct.sysctl_auto_assign_helper;
+
+ net->ct.helper_sysctl_header =
+ register_net_sysctl(net, "net/netfilter", table);
+
+ if (!net->ct.helper_sysctl_header) {
+ pr_err("nf_conntrack_helper: can't register to sysctl.\n");
+ goto out_register;
+ }
+ return 0;
+
+out_register:
+ kfree(table);
+out:
+ return -ENOMEM;
+}
+
+static void nf_conntrack_helper_fini_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->ct.helper_sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->ct.helper_sysctl_header);
+ kfree(table);
+}
+#else
+static int nf_conntrack_helper_init_sysctl(struct net *net)
+{
+ return 0;
+}
+
+static void nf_conntrack_helper_fini_sysctl(struct net *net)
+{
+}
+#endif /* CONFIG_SYSCTL */
/* Stupid hash, but collision free for the default registrations of the
* helpers currently in the kernel. */
@@ -118,17 +179,38 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
{
struct nf_conntrack_helper *helper = NULL;
struct nf_conn_help *help;
+ struct net *net = nf_ct_net(ct);
int ret = 0;
+ /* We already got a helper explicitly attached. The function
+ * nf_conntrack_alter_reply - in case NAT is in use - asks for looking
+ * the helper up again. Since now the user is in full control of
+ * making consistent helper configurations, skip this automatic
+ * re-lookup, otherwise we'll lose the helper.
+ */
+ if (test_bit(IPS_HELPER_BIT, &ct->status))
+ return 0;
+
if (tmpl != NULL) {
help = nfct_help(tmpl);
- if (help != NULL)
+ if (help != NULL) {
helper = help->helper;
+ set_bit(IPS_HELPER_BIT, &ct->status);
+ }
}
help = nfct_help(ct);
- if (helper == NULL)
+ if (net->ct.sysctl_auto_assign_helper && helper == NULL) {
helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ if (unlikely(!net->ct.auto_assign_helper_warned && helper)) {
+ pr_info("nf_conntrack: automatic helper "
+ "assignment is deprecated and it will "
+ "be removed soon. Use the iptables CT target "
+ "to attach helpers instead.\n");
+ net->ct.auto_assign_helper_warned = true;
+ }
+ }
+
if (helper == NULL) {
if (help)
RCU_INIT_POINTER(help->helper, NULL);
@@ -315,28 +397,44 @@ static struct nf_ct_ext_type helper_extend __read_mostly = {
.id = NF_CT_EXT_HELPER,
};
-int nf_conntrack_helper_init(void)
+int nf_conntrack_helper_init(struct net *net)
{
int err;
- nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
- nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0);
- if (!nf_ct_helper_hash)
- return -ENOMEM;
+ net->ct.auto_assign_helper_warned = false;
+ net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper;
+
+ if (net_eq(net, &init_net)) {
+ nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
+ nf_ct_helper_hash =
+ nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0);
+ if (!nf_ct_helper_hash)
+ return -ENOMEM;
- err = nf_ct_extend_register(&helper_extend);
+ err = nf_ct_extend_register(&helper_extend);
+ if (err < 0)
+ goto err1;
+ }
+
+ err = nf_conntrack_helper_init_sysctl(net);
if (err < 0)
- goto err1;
+ goto out_sysctl;
return 0;
+out_sysctl:
+ if (net_eq(net, &init_net))
+ nf_ct_extend_unregister(&helper_extend);
err1:
nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
return err;
}
-void nf_conntrack_helper_fini(void)
+void nf_conntrack_helper_fini(struct net *net)
{
- nf_ct_extend_unregister(&helper_extend);
- nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
+ nf_conntrack_helper_fini_sysctl(net);
+ if (net_eq(net, &init_net)) {
+ nf_ct_extend_unregister(&helper_extend);
+ nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
+ }
}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 462ec2dbe561..6f4b00a8fc73 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2080,7 +2080,15 @@ static int
ctnetlink_change_expect(struct nf_conntrack_expect *x,
const struct nlattr * const cda[])
{
- return -EOPNOTSUPP;
+ if (cda[CTA_EXPECT_TIMEOUT]) {
+ if (!del_timer(&x->timeout))
+ return -ETIME;
+
+ x->timeout.expires = jiffies +
+ ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
+ add_timer(&x->timeout);
+ }
+ return 0;
}
static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = {