summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/bluetooth/cmtp/core.c6
-rw-r--r--net/bluetooth/hidp/core.c5
-rw-r--r--net/bluetooth/rfcomm/sock.c7
-rw-r--r--net/bluetooth/rfcomm/tty.c2
-rw-r--r--net/core/dev.c7
-rw-r--r--net/core/filter.c104
-rw-r--r--net/core/skbuff.c19
-rw-r--r--net/core/sock.c11
-rw-r--r--net/decnet/af_decnet.c10
-rw-r--r--net/decnet/dn_fib.c3
-rw-r--r--net/decnet/dn_nsp_out.c3
-rw-r--r--net/ipv4/af_inet.c11
-rw-r--r--net/ipv4/fib_trie.c202
-rw-r--r--net/ipv4/icmp.c3
-rw-r--r--net/ipv4/igmp.c96
-rw-r--r--net/ipv4/ip_output.c16
-rw-r--r--net/ipv4/ip_sockglue.c6
-rw-r--r--net/ipv4/route.c126
-rw-r--r--net/ipv4/tcp.c52
-rw-r--r--net/ipv4/tcp_input.c87
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/tcp_output.c546
-rw-r--r--net/ipv4/tcp_timer.c5
-rw-r--r--net/ipv6/af_inet6.c4
-rw-r--r--net/ipv6/ip6_output.c1
-rw-r--r--net/ipv6/mcast.c29
-rw-r--r--net/ipv6/tcp_ipv6.c2
-rw-r--r--net/irda/irlap.c3
-rw-r--r--net/irda/irlap_event.c14
-rw-r--r--net/irda/irlap_frame.c8
-rw-r--r--net/irda/irttp.c2
-rw-r--r--net/llc/llc_c_ev.c2
-rw-r--r--net/netlink/af_netlink.c2
-rw-r--r--net/sched/Makefile2
-rw-r--r--net/sched/em_meta.c6
-rw-r--r--net/sched/sch_api.c63
-rw-r--r--net/sched/sch_blackhole.c54
-rw-r--r--net/sched/sch_generic.c35
-rw-r--r--net/sched/sch_red.c2
-rw-r--r--net/sctp/associola.c2
-rw-r--r--net/sctp/input.c26
-rw-r--r--net/sctp/inqueue.c18
-rw-r--r--net/sctp/output.c22
-rw-r--r--net/sctp/outqueue.c50
-rw-r--r--net/sctp/sm_make_chunk.c12
-rw-r--r--net/sctp/socket.c2
-rw-r--r--net/sunrpc/xprt.c2
-rw-r--r--net/unix/af_unix.c4
48 files changed, 1120 insertions, 576 deletions
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 2e341de3e763..901eff7ebe74 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -213,7 +213,7 @@ static int cmtp_send_frame(struct cmtp_session *session, unsigned char *data, in
return kernel_sendmsg(sock, &msg, &iv, 1, len);
}
-static int cmtp_process_transmit(struct cmtp_session *session)
+static void cmtp_process_transmit(struct cmtp_session *session)
{
struct sk_buff *skb, *nskb;
unsigned char *hdr;
@@ -223,7 +223,7 @@ static int cmtp_process_transmit(struct cmtp_session *session)
if (!(nskb = alloc_skb(session->mtu, GFP_ATOMIC))) {
BT_ERR("Can't allocate memory for new frame");
- return -ENOMEM;
+ return;
}
while ((skb = skb_dequeue(&session->transmit))) {
@@ -275,8 +275,6 @@ static int cmtp_process_transmit(struct cmtp_session *session)
cmtp_send_frame(session, nskb->data, nskb->len);
kfree_skb(nskb);
-
- return skb_queue_len(&session->transmit);
}
static int cmtp_session(void *arg)
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index affbc55462e8..de8af5f42394 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -428,7 +428,7 @@ static int hidp_send_frame(struct socket *sock, unsigned char *data, int len)
return kernel_sendmsg(sock, &msg, &iv, 1, len);
}
-static int hidp_process_transmit(struct hidp_session *session)
+static void hidp_process_transmit(struct hidp_session *session)
{
struct sk_buff *skb;
@@ -453,9 +453,6 @@ static int hidp_process_transmit(struct hidp_session *session)
hidp_set_timer(session);
kfree_skb(skb);
}
-
- return skb_queue_len(&session->ctrl_transmit) +
- skb_queue_len(&session->intr_transmit);
}
static int hidp_session(void *arg)
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index f3f6355a2786..63a123c5c41b 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -590,8 +590,11 @@ static long rfcomm_sock_data_wait(struct sock *sk, long timeo)
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- if (skb_queue_len(&sk->sk_receive_queue) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) ||
- signal_pending(current) || !timeo)
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ sk->sk_err ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ signal_pending(current) ||
+ !timeo)
break;
set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 6d689200bcf3..6304590fd36a 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -781,7 +781,7 @@ static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty)
BT_DBG("tty %p dev %p", tty, dev);
- if (skb_queue_len(&dlc->tx_queue))
+ if (!skb_queue_empty(&dlc->tx_queue))
return dlc->mtu;
return 0;
diff --git a/net/core/dev.c b/net/core/dev.c
index 7016e0c36b3d..ff9dc029233a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1127,7 +1127,7 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
extern void skb_release_data(struct sk_buff *);
/* Keep head the same: replace data */
-int __skb_linearize(struct sk_buff *skb, int gfp_mask)
+int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask)
{
unsigned int size;
u8 *data;
@@ -2089,10 +2089,11 @@ void dev_set_promiscuity(struct net_device *dev, int inc)
{
unsigned short old_flags = dev->flags;
- dev->flags |= IFF_PROMISC;
if ((dev->promiscuity += inc) == 0)
dev->flags &= ~IFF_PROMISC;
- if (dev->flags ^ old_flags) {
+ else
+ dev->flags |= IFF_PROMISC;
+ if (dev->flags != old_flags) {
dev_mc_upload(dev);
printk(KERN_INFO "device %s %s promiscuous mode\n",
dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
diff --git a/net/core/filter.c b/net/core/filter.c
index f3b88205ace2..cd91a24f9720 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -36,7 +36,7 @@
#include <linux/filter.h>
/* No hurry in this branch */
-static u8 *load_pointer(struct sk_buff *skb, int k)
+static void *__load_pointer(struct sk_buff *skb, int k)
{
u8 *ptr = NULL;
@@ -50,6 +50,18 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
return NULL;
}
+static inline void *load_pointer(struct sk_buff *skb, int k,
+ unsigned int size, void *buffer)
+{
+ if (k >= 0)
+ return skb_header_pointer(skb, k, size, buffer);
+ else {
+ if (k >= SKF_AD_OFF)
+ return NULL;
+ return __load_pointer(skb, k);
+ }
+}
+
/**
* sk_run_filter - run a filter on a socket
* @skb: buffer to run the filter on
@@ -64,15 +76,12 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
{
- unsigned char *data = skb->data;
- /* len is UNSIGNED. Byte wide insns relies only on implicit
- type casts to prevent reading arbitrary memory locations.
- */
- unsigned int len = skb->len-skb->data_len;
struct sock_filter *fentry; /* We walk down these */
+ void *ptr;
u32 A = 0; /* Accumulator */
u32 X = 0; /* Index Register */
u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
+ u32 tmp;
int k;
int pc;
@@ -168,86 +177,35 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
case BPF_LD|BPF_W|BPF_ABS:
k = fentry->k;
load_w:
- if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) {
- A = ntohl(*(u32*)&data[k]);
+ ptr = load_pointer(skb, k, 4, &tmp);
+ if (ptr != NULL) {
+ A = ntohl(*(u32 *)ptr);
continue;
}
- if (k < 0) {
- u8 *ptr;
-
- if (k >= SKF_AD_OFF)
- break;
- ptr = load_pointer(skb, k);
- if (ptr) {
- A = ntohl(*(u32*)ptr);
- continue;
- }
- } else {
- u32 _tmp, *p;
- p = skb_header_pointer(skb, k, 4, &_tmp);
- if (p != NULL) {
- A = ntohl(*p);
- continue;
- }
- }
return 0;
case BPF_LD|BPF_H|BPF_ABS:
k = fentry->k;
load_h:
- if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) {
- A = ntohs(*(u16*)&data[k]);
+ ptr = load_pointer(skb, k, 2, &tmp);
+ if (ptr != NULL) {
+ A = ntohs(*(u16 *)ptr);
continue;
}
- if (k < 0) {
- u8 *ptr;
-
- if (k >= SKF_AD_OFF)
- break;
- ptr = load_pointer(skb, k);
- if (ptr) {
- A = ntohs(*(u16*)ptr);
- continue;
- }
- } else {
- u16 _tmp, *p;
- p = skb_header_pointer(skb, k, 2, &_tmp);
- if (p != NULL) {
- A = ntohs(*p);
- continue;
- }
- }
return 0;
case BPF_LD|BPF_B|BPF_ABS:
k = fentry->k;
load_b:
- if (k >= 0 && (unsigned int)k < len) {
- A = data[k];
+ ptr = load_pointer(skb, k, 1, &tmp);
+ if (ptr != NULL) {
+ A = *(u8 *)ptr;
continue;
}
- if (k < 0) {
- u8 *ptr;
-
- if (k >= SKF_AD_OFF)
- break;
- ptr = load_pointer(skb, k);
- if (ptr) {
- A = *ptr;
- continue;
- }
- } else {
- u8 _tmp, *p;
- p = skb_header_pointer(skb, k, 1, &_tmp);
- if (p != NULL) {
- A = *p;
- continue;
- }
- }
return 0;
case BPF_LD|BPF_W|BPF_LEN:
- A = len;
+ A = skb->len;
continue;
case BPF_LDX|BPF_W|BPF_LEN:
- X = len;
+ X = skb->len;
continue;
case BPF_LD|BPF_W|BPF_IND:
k = X + fentry->k;
@@ -259,10 +217,12 @@ load_b:
k = X + fentry->k;
goto load_b;
case BPF_LDX|BPF_B|BPF_MSH:
- if (fentry->k >= len)
- return 0;
- X = (data[fentry->k] & 0xf) << 2;
- continue;
+ ptr = load_pointer(skb, fentry->k, 1, &tmp);
+ if (ptr != NULL) {
+ X = (*(u8 *)ptr & 0xf) << 2;
+ continue;
+ }
+ return 0;
case BPF_LD|BPF_IMM:
A = fentry->k;
continue;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index bb73b2190ec7..d9f7b06fe886 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -129,7 +129,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
* Buffers may only be allocated from interrupts using a @gfp_mask of
* %GFP_ATOMIC.
*/
-struct sk_buff *alloc_skb(unsigned int size, int gfp_mask)
+struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask)
{
struct sk_buff *skb;
u8 *data;
@@ -182,7 +182,8 @@ nodata:
* %GFP_ATOMIC.
*/
struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
- unsigned int size, int gfp_mask)
+ unsigned int size,
+ unsigned int __nocast gfp_mask)
{
struct sk_buff *skb;
u8 *data;
@@ -322,7 +323,7 @@ void __kfree_skb(struct sk_buff *skb)
* %GFP_ATOMIC.
*/
-struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
+struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
{
struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
@@ -357,7 +358,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
C(ip_summed);
C(priority);
C(protocol);
- C(security);
n->destructor = NULL;
#ifdef CONFIG_NETFILTER
C(nfmark);
@@ -422,7 +422,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->pkt_type = old->pkt_type;
new->stamp = old->stamp;
new->destructor = NULL;
- new->security = old->security;
#ifdef CONFIG_NETFILTER
new->nfmark = old->nfmark;
new->nfcache = old->nfcache;
@@ -462,7 +461,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
* header is going to be modified. Use pskb_copy() instead.
*/
-struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
+struct sk_buff *skb_copy(const struct sk_buff *skb, unsigned int __nocast gfp_mask)
{
int headerlen = skb->data - skb->head;
/*
@@ -501,7 +500,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
* The returned buffer has a reference count of 1.
*/
-struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask)
+struct sk_buff *pskb_copy(struct sk_buff *skb, unsigned int __nocast gfp_mask)
{
/*
* Allocate the copy buffer
@@ -559,7 +558,8 @@ out:
* reloaded after call to this function.
*/
-int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask)
+int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
+ unsigned int __nocast gfp_mask)
{
int i;
u8 *data;
@@ -649,7 +649,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
* only by netfilter in the cases when checksum is recalculated? --ANK
*/
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
- int newheadroom, int newtailroom, int gfp_mask)
+ int newheadroom, int newtailroom,
+ unsigned int __nocast gfp_mask)
{
/*
* Allocate the copy buffer
diff --git a/net/core/sock.c b/net/core/sock.c
index a6ec3ada7f9e..8b35ccdc2b3b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -622,7 +622,8 @@ lenout:
* @prot: struct proto associated with this new sock instance
* @zero_it: if we should zero the newly allocated sock
*/
-struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it)
+struct sock *sk_alloc(int family, unsigned int __nocast priority,
+ struct proto *prot, int zero_it)
{
struct sock *sk = NULL;
kmem_cache_t *slab = prot->slab;
@@ -750,7 +751,8 @@ unsigned long sock_i_ino(struct sock *sk)
/*
* Allocate a skb from the socket's send buffer.
*/
-struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority)
+struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
+ unsigned int __nocast priority)
{
if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
struct sk_buff * skb = alloc_skb(size, priority);
@@ -765,7 +767,8 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int
/*
* Allocate a skb from the socket's receive buffer.
*/
-struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority)
+struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
+ unsigned int __nocast priority)
{
if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
struct sk_buff *skb = alloc_skb(size, priority);
@@ -780,7 +783,7 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int
/*
* Allocate a memory block from the socket's option memory buffer.
*/
-void *sock_kmalloc(struct sock *sk, int size, int priority)
+void *sock_kmalloc(struct sock *sk, int size, unsigned int __nocast priority)
{
if ((unsigned)size <= sysctl_optmem_max &&
atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 29bb3cd21965..96a02800cd28 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -536,7 +536,7 @@ static void dn_keepalive(struct sock *sk)
* we are double checking that we are not sending too
* many of these keepalive frames.
*/
- if (skb_queue_len(&scp->other_xmit_queue) == 0)
+ if (skb_queue_empty(&scp->other_xmit_queue))
dn_nsp_send_link(sk, DN_NOCHANGE, 0);
}
@@ -1191,7 +1191,7 @@ static unsigned int dn_poll(struct file *file, struct socket *sock, poll_table
struct dn_scp *scp = DN_SK(sk);
int mask = datagram_poll(file, sock, wait);
- if (skb_queue_len(&scp->other_receive_queue))
+ if (!skb_queue_empty(&scp->other_receive_queue))
mask |= POLLRDBAND;
return mask;
@@ -1214,7 +1214,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCATMARK:
lock_sock(sk);
- val = (skb_queue_len(&scp->other_receive_queue) != 0);
+ val = !skb_queue_empty(&scp->other_receive_queue);
if (scp->state != DN_RUN)
val = -ENOTCONN;
release_sock(sk);
@@ -1630,7 +1630,7 @@ static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int
int len = 0;
if (flags & MSG_OOB)
- return skb_queue_len(q) ? 1 : 0;
+ return !skb_queue_empty(q) ? 1 : 0;
while(skb != (struct sk_buff *)q) {
struct dn_skb_cb *cb = DN_SKB_CB(skb);
@@ -1707,7 +1707,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
if (sk->sk_err)
goto out;
- if (skb_queue_len(&scp->other_receive_queue)) {
+ if (!skb_queue_empty(&scp->other_receive_queue)) {
if (!(flags & MSG_OOB)) {
msg->msg_flags |= MSG_OOB;
if (!scp->other_report) {
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 9934b25720e4..99bc061759c3 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -551,7 +551,8 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (t < s_t)
continue;
if (t > s_t)
- memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int));
+ memset(&cb->args[1], 0,
+ sizeof(cb->args) - sizeof(cb->args[0]));
tb = dn_fib_get_table(t, 0);
if (tb == NULL)
continue;
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index 42abbf3f524f..8cce1fdbda90 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -342,7 +342,8 @@ int dn_nsp_xmit_timeout(struct sock *sk)
dn_nsp_output(sk);
- if (skb_queue_len(&scp->data_xmit_queue) || skb_queue_len(&scp->other_xmit_queue))
+ if (!skb_queue_empty(&scp->data_xmit_queue) ||
+ !skb_queue_empty(&scp->other_xmit_queue))
scp->persist = dn_nsp_persist(sk);
return 0;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 658e7977924d..ef7468376ae6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1009,6 +1009,15 @@ static int __init init_ipv4_mibs(void)
static int ipv4_proc_init(void);
extern void ipfrag_init(void);
+/*
+ * IP protocol layer initialiser
+ */
+
+static struct packet_type ip_packet_type = {
+ .type = __constant_htons(ETH_P_IP),
+ .func = ip_rcv,
+};
+
static int __init inet_init(void)
{
struct sk_buff *dummy_skb;
@@ -1102,6 +1111,8 @@ static int __init inet_init(void)
ipfrag_init();
+ dev_add_pack(&ip_packet_type);
+
rc = 0;
out:
return rc;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index b56e88edf1b3..4be234c7d8c3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
* 2 of the License, or (at your option) any later version.
*/
-#define VERSION "0.324"
+#define VERSION "0.325"
#include <linux/config.h>
#include <asm/uaccess.h>
@@ -136,6 +136,7 @@ struct trie_use_stats {
unsigned int semantic_match_passed;
unsigned int semantic_match_miss;
unsigned int null_node_hit;
+ unsigned int resize_node_skipped;
};
#endif
@@ -164,8 +165,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
static int tnode_child_length(struct tnode *tn);
static struct node *resize(struct trie *t, struct tnode *tn);
-static struct tnode *inflate(struct trie *t, struct tnode *tn);
-static struct tnode *halve(struct trie *t, struct tnode *tn);
+static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
+static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
static void tnode_free(struct tnode *tn);
static void trie_dump_seq(struct seq_file *seq, struct trie *t);
extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
@@ -358,11 +359,32 @@ static inline void free_leaf_info(struct leaf_info *li)
kfree(li);
}
+static struct tnode *tnode_alloc(unsigned int size)
+{
+ if (size <= PAGE_SIZE) {
+ return kmalloc(size, GFP_KERNEL);
+ } else {
+ return (struct tnode *)
+ __get_free_pages(GFP_KERNEL, get_order(size));
+ }
+}
+
+static void __tnode_free(struct tnode *tn)
+{
+ unsigned int size = sizeof(struct tnode) +
+ (1<<tn->bits) * sizeof(struct node *);
+
+ if (size <= PAGE_SIZE)
+ kfree(tn);
+ else
+ free_pages((unsigned long)tn, get_order(size));
+}
+
static struct tnode* tnode_new(t_key key, int pos, int bits)
{
int nchildren = 1<<bits;
int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
- struct tnode *tn = kmalloc(sz, GFP_KERNEL);
+ struct tnode *tn = tnode_alloc(sz);
if(tn) {
memset(tn, 0, sz);
@@ -390,7 +412,7 @@ static void tnode_free(struct tnode *tn)
printk("FL %p \n", tn);
}
else if(IS_TNODE(tn)) {
- kfree(tn);
+ __tnode_free(tn);
if(trie_debug > 0 )
printk("FT %p \n", tn);
}
@@ -460,6 +482,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
static struct node *resize(struct trie *t, struct tnode *tn)
{
int i;
+ int err = 0;
if (!tn)
return NULL;
@@ -556,12 +579,20 @@ static struct node *resize(struct trie *t, struct tnode *tn)
*/
check_tnode(tn);
-
+
+ err = 0;
while ((tn->full_children > 0 &&
50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
inflate_threshold * tnode_child_length(tn))) {
- tn = inflate(t, tn);
+ tn = inflate(t, tn, &err);
+
+ if(err) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats.resize_node_skipped++;
+#endif
+ break;
+ }
}
check_tnode(tn);
@@ -570,11 +601,22 @@ static struct node *resize(struct trie *t, struct tnode *tn)
* Halve as long as the number of empty children in this
* node is above threshold.
*/
+
+ err = 0;
while (tn->bits > 1 &&
100 * (tnode_child_length(tn) - tn->empty_children) <
- halve_threshold * tnode_child_length(tn))
+ halve_threshold * tnode_child_length(tn)) {
+
+ tn = halve(t, tn, &err);
+
+ if(err) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats.resize_node_skipped++;
+#endif
+ break;
+ }
+ }
- tn = halve(t, tn);
/* Only one child remains */
@@ -599,7 +641,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
return (struct node *) tn;
}
-static struct tnode *inflate(struct trie *t, struct tnode *tn)
+static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
{
struct tnode *inode;
struct tnode *oldtnode = tn;
@@ -611,8 +653,63 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
- if (!tn)
- trie_bug("tnode_new failed");
+ if (!tn) {
+ *err = -ENOMEM;
+ return oldtnode;
+ }
+
+ /*
+ * Preallocate and store tnodes before the actual work so we
+ * don't get into an inconsistent state if memory allocation
+ * fails. In case of failure we return the oldnode and inflate
+ * of tnode is ignored.
+ */
+
+ for(i = 0; i < olen; i++) {
+ struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
+
+ if (inode &&
+ IS_TNODE(inode) &&
+ inode->pos == oldtnode->pos + oldtnode->bits &&
+ inode->bits > 1) {
+ struct tnode *left, *right;
+
+ t_key m = TKEY_GET_MASK(inode->pos, 1);
+
+ left = tnode_new(inode->key&(~m), inode->pos + 1,
+ inode->bits - 1);
+
+ if(!left) {
+ *err = -ENOMEM;
+ break;
+ }
+
+ right = tnode_new(inode->key|m, inode->pos + 1,
+ inode->bits - 1);
+
+ if(!right) {
+ *err = -ENOMEM;
+ break;
+ }
+
+ put_child(t, tn, 2*i, (struct node *) left);
+ put_child(t, tn, 2*i+1, (struct node *) right);
+ }
+ }
+
+ if(*err) {
+ int size = tnode_child_length(tn);
+ int j;
+
+ for(j = 0; j < size; j++)
+ if( tn->child[j])
+ tnode_free((struct tnode *)tn->child[j]);
+
+ tnode_free(tn);
+
+ *err = -ENOMEM;
+ return oldtnode;
+ }
for(i = 0; i < olen; i++) {
struct node *node = tnode_get_child(oldtnode, i);
@@ -625,7 +722,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
if(IS_LEAF(node) || ((struct tnode *) node)->pos >
tn->pos + tn->bits - 1) {
- if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1,
+ if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
1) == 0)
put_child(t, tn, 2*i, node);
else
@@ -665,27 +762,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
* the position (inode->pos)
*/
- t_key m = TKEY_GET_MASK(inode->pos, 1);
-
/* Use the old key, but set the new significant
* bit to zero.
*/
- left = tnode_new(inode->key&(~m), inode->pos + 1,
- inode->bits - 1);
- if(!left)
- trie_bug("tnode_new failed");
-
-
- /* Use the old key, but set the new significant
- * bit to one.
- */
- right = tnode_new(inode->key|m, inode->pos + 1,
- inode->bits - 1);
+ left = (struct tnode *) tnode_get_child(tn, 2*i);
+ put_child(t, tn, 2*i, NULL);
+
+ if(!left)
+ BUG();
+
+ right = (struct tnode *) tnode_get_child(tn, 2*i+1);
+ put_child(t, tn, 2*i+1, NULL);
+
+ if(!right)
+ BUG();
- if(!right)
- trie_bug("tnode_new failed");
-
size = tnode_child_length(left);
for(j = 0; j < size; j++) {
put_child(t, left, j, inode->child[j]);
@@ -701,7 +793,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
return tn;
}
-static struct tnode *halve(struct trie *t, struct tnode *tn)
+static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
{
struct tnode *oldtnode = tn;
struct node *left, *right;
@@ -712,8 +804,48 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
- if(!tn)
- trie_bug("tnode_new failed");
+ if (!tn) {
+ *err = -ENOMEM;
+ return oldtnode;
+ }
+
+ /*
+ * Preallocate and store tnodes before the actual work so we
+ * don't get into an inconsistent state if memory allocation
+ * fails. In case of failure we return the oldnode and halve
+ * of tnode is ignored.
+ */
+
+ for(i = 0; i < olen; i += 2) {
+ left = tnode_get_child(oldtnode, i);
+ right = tnode_get_child(oldtnode, i+1);
+
+ /* Two nonempty children */
+ if( left && right) {
+ struct tnode *newBinNode =
+ tnode_new(left->key, tn->pos + tn->bits, 1);
+
+ if(!newBinNode) {
+ *err = -ENOMEM;
+ break;
+ }
+ put_child(t, tn, i/2, (struct node *)newBinNode);
+ }
+ }
+
+ if(*err) {
+ int size = tnode_child_length(tn);
+ int j;
+
+ for(j = 0; j < size; j++)
+ if( tn->child[j])
+ tnode_free((struct tnode *)tn->child[j]);
+
+ tnode_free(tn);
+
+ *err = -ENOMEM;
+ return oldtnode;
+ }
for(i = 0; i < olen; i += 2) {
left = tnode_get_child(oldtnode, i);
@@ -730,10 +862,11 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
/* Two nonempty children */
else {
struct tnode *newBinNode =
- tnode_new(left->key, tn->pos + tn->bits, 1);
+ (struct tnode *) tnode_get_child(tn, i/2);
+ put_child(t, tn, i/2, NULL);
if(!newBinNode)
- trie_bug("tnode_new failed");
+ BUG();
put_child(t, newBinNode, 0, left);
put_child(t, newBinNode, 1, right);
@@ -2301,6 +2434,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
+ seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
#ifdef CLEAR_STATS
memset(&(t->stats), 0, sizeof(t->stats));
#endif
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index cb759484979d..279f57abfecb 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -970,7 +970,8 @@ int icmp_rcv(struct sk_buff *skb)
* RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
* discarded if to broadcast/multicast.
*/
- if (icmph->type == ICMP_ECHO &&
+ if ((icmph->type == ICMP_ECHO ||
+ icmph->type == ICMP_TIMESTAMP) &&
sysctl_icmp_echo_ignore_broadcasts) {
goto error;
}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 1f3183168a90..5088f90835ae 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1615,9 +1615,10 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
{
int err;
u32 addr = imr->imr_multiaddr.s_addr;
- struct ip_mc_socklist *iml, *i;
+ struct ip_mc_socklist *iml=NULL, *i;
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
+ int ifindex;
int count = 0;
if (!MULTICAST(addr))
@@ -1633,37 +1634,30 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
goto done;
}
- iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
-
err = -EADDRINUSE;
+ ifindex = imr->imr_ifindex;
for (i = inet->mc_list; i; i = i->next) {
- if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) {
- /* New style additions are reference counted */
- if (imr->imr_address.s_addr == 0) {
- i->count++;
- err = 0;
- }
+ if (i->multi.imr_multiaddr.s_addr == addr &&
+ i->multi.imr_ifindex == ifindex)
goto done;
- }
count++;
}
err = -ENOBUFS;
- if (iml == NULL || count >= sysctl_igmp_max_memberships)
+ if (count >= sysctl_igmp_max_memberships)
+ goto done;
+ iml = (struct ip_mc_socklist *)sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL);
+ if (iml == NULL)
goto done;
+
memcpy(&iml->multi, imr, sizeof(*imr));
iml->next = inet->mc_list;
- iml->count = 1;
iml->sflist = NULL;
iml->sfmode = MCAST_EXCLUDE;
inet->mc_list = iml;
ip_mc_inc_group(in_dev, addr);
- iml = NULL;
err = 0;
-
done:
rtnl_shunlock();
- if (iml)
- sock_kfree_s(sk, iml, sizeof(*iml));
return err;
}
@@ -1693,30 +1687,25 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
{
struct inet_sock *inet = inet_sk(sk);
struct ip_mc_socklist *iml, **imlp;
+ struct in_device *in_dev;
+ u32 group = imr->imr_multiaddr.s_addr;
+ u32 ifindex;
rtnl_lock();
+ in_dev = ip_mc_find_dev(imr);
+ if (!in_dev) {
+ rtnl_unlock();
+ return -ENODEV;
+ }
+ ifindex = imr->imr_ifindex;
for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
- if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr &&
- iml->multi.imr_address.s_addr==imr->imr_address.s_addr &&
- (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) {
- struct in_device *in_dev;
-
- in_dev = inetdev_by_index(iml->multi.imr_ifindex);
- if (in_dev)
- (void) ip_mc_leave_src(sk, iml, in_dev);
- if (--iml->count) {
- rtnl_unlock();
- if (in_dev)
- in_dev_put(in_dev);
- return 0;
- }
+ if (iml->multi.imr_multiaddr.s_addr == group &&
+ iml->multi.imr_ifindex == ifindex) {
+ (void) ip_mc_leave_src(sk, iml, in_dev);
*imlp = iml->next;
- if (in_dev) {
- ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
- in_dev_put(in_dev);
- }
+ ip_mc_dec_group(in_dev, group);
rtnl_unlock();
sock_kfree_s(sk, iml, sizeof(*iml));
return 0;
@@ -1736,6 +1725,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
struct in_device *in_dev = NULL;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
+ int leavegroup = 0;
int i, j, rv;
if (!MULTICAST(addr))
@@ -1755,15 +1745,20 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
err = -EADDRNOTAVAIL;
for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
- if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0)
+ if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr
+ && pmc->multi.imr_ifindex == imr.imr_ifindex)
break;
}
- if (!pmc) /* must have a prior join */
+ if (!pmc) { /* must have a prior join */
+ err = -EINVAL;
goto done;
+ }
/* if a source filter was set, must be the same mode as before */
if (pmc->sflist) {
- if (pmc->sfmode != omode)
+ if (pmc->sfmode != omode) {
+ err = -EINVAL;
goto done;
+ }
} else if (pmc->sfmode != omode) {
/* allow mode switches for empty-set filters */
ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
@@ -1775,7 +1770,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
psl = pmc->sflist;
if (!add) {
if (!psl)
- goto done;
+ goto done; /* err = -EADDRNOTAVAIL */
rv = !0;
for (i=0; i<psl->sl_count; i++) {
rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
@@ -1784,7 +1779,13 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
break;
}
if (rv) /* source not found */
+ goto done; /* err = -EADDRNOTAVAIL */
+
+ /* special case - (INCLUDE, empty) == LEAVE_GROUP */
+ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
+ leavegroup = 1;
goto done;
+ }
/* update the interface filter */
ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
@@ -1842,18 +1843,21 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
&mreqs->imr_sourceaddr, 1);
done:
rtnl_shunlock();
+ if (leavegroup)
+ return ip_mc_leave_group(sk, &imr);
return err;
}
int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
{
- int err;
+ int err = 0;
struct ip_mreqn imr;
u32 addr = msf->imsf_multiaddr;
struct ip_mc_socklist *pmc;
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *newpsl, *psl;
+ int leavegroup = 0;
if (!MULTICAST(addr))
return -EINVAL;
@@ -1872,15 +1876,22 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
err = -ENODEV;
goto done;
}
- err = -EADDRNOTAVAIL;
+
+ /* special case - (INCLUDE, empty) == LEAVE_GROUP */
+ if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) {
+ leavegroup = 1;
+ goto done;
+ }
for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
pmc->multi.imr_ifindex == imr.imr_ifindex)
break;
}
- if (!pmc) /* must have a prior join */
+ if (!pmc) { /* must have a prior join */
+ err = -EINVAL;
goto done;
+ }
if (msf->imsf_numsrc) {
newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL);
@@ -1909,8 +1920,11 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
0, NULL, 0);
pmc->sflist = newpsl;
pmc->sfmode = msf->imsf_fmode;
+ err = 0;
done:
rtnl_shunlock();
+ if (leavegroup)
+ err = ip_mc_leave_group(sk, &imr);
return err;
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6ce5c3292f9f..9de83e6e0f1d 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -389,7 +389,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->pkt_type = from->pkt_type;
to->priority = from->priority;
to->protocol = from->protocol;
- to->security = from->security;
dst_release(to->dst);
to->dst = dst_clone(from->dst);
to->dev = from->dev;
@@ -1329,23 +1328,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
ip_rt_put(rt);
}
-/*
- * IP protocol layer initialiser
- */
-
-static struct packet_type ip_packet_type = {
- .type = __constant_htons(ETH_P_IP),
- .func = ip_rcv,
-};
-
-/*
- * IP registers the packet type and then calls the subprotocol initialisers
- */
-
void __init ip_init(void)
{
- dev_add_pack(&ip_packet_type);
-
ip_rt_init();
inet_initpeers();
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index f8b172f89811..fc7c481d0d79 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -677,11 +677,11 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
mreq.imr_address.s_addr = mreqs.imr_interface;
mreq.imr_ifindex = 0;
err = ip_mc_join_group(sk, &mreq);
- if (err)
+ if (err && err != -EADDRINUSE)
break;
omode = MCAST_INCLUDE;
add = 1;
- } else /*IP_DROP_SOURCE_MEMBERSHIP */ {
+ } else /* IP_DROP_SOURCE_MEMBERSHIP */ {
omode = MCAST_INCLUDE;
add = 0;
}
@@ -754,7 +754,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
mreq.imr_address.s_addr = 0;
mreq.imr_ifindex = greqs.gsr_interface;
err = ip_mc_join_group(sk, &mreq);
- if (err)
+ if (err && err != -EADDRINUSE)
break;
greqs.gsr_interface = mreq.imr_ifindex;
omode = MCAST_INCLUDE;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 12a1cf306f67..726ea5e8180a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -54,6 +54,7 @@
* Marc Boucher : routing by fwmark
* Robert Olsson : Added rt_cache statistics
* Arnaldo C. Melo : Convert proc stuff to seq_file
+ * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -70,6 +71,7 @@
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
@@ -201,8 +203,37 @@ __u8 ip_tos2prio[16] = {
struct rt_hash_bucket {
struct rtable *chain;
- spinlock_t lock;
-} __attribute__((__aligned__(8)));
+};
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+/*
+ * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
+ * The size of this table is a power of two and depends on the number of CPUS.
+ */
+#if NR_CPUS >= 32
+#define RT_HASH_LOCK_SZ 4096
+#elif NR_CPUS >= 16
+#define RT_HASH_LOCK_SZ 2048
+#elif NR_CPUS >= 8
+#define RT_HASH_LOCK_SZ 1024
+#elif NR_CPUS >= 4
+#define RT_HASH_LOCK_SZ 512
+#else
+#define RT_HASH_LOCK_SZ 256
+#endif
+
+static spinlock_t *rt_hash_locks;
+# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
+# define rt_hash_lock_init() { \
+ int i; \
+ rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
+ if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
+ for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
+ spin_lock_init(&rt_hash_locks[i]); \
+ }
+#else
+# define rt_hash_lock_addr(slot) NULL
+# define rt_hash_lock_init()
+#endif
static struct rt_hash_bucket *rt_hash_table;
static unsigned rt_hash_mask;
@@ -575,19 +606,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
- static int rover;
- int i = rover, t;
+ static unsigned int rover;
+ unsigned int i = rover, goal;
struct rtable *rth, **rthp;
unsigned long now = jiffies;
-
- for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
- t -= ip_rt_gc_timeout) {
+ u64 mult;
+
+ mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
+ if (ip_rt_gc_timeout > 1)
+ do_div(mult, ip_rt_gc_timeout);
+ goal = (unsigned int)mult;
+ if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
+ for (; goal > 0; goal--) {
unsigned long tmo = ip_rt_gc_timeout;
i = (i + 1) & rt_hash_mask;
rthp = &rt_hash_table[i].chain;
- spin_lock(&rt_hash_table[i].lock);
+ if (*rthp == 0)
+ continue;
+ spin_lock(rt_hash_lock_addr(i));
while ((rth = *rthp) != NULL) {
if (rth->u.dst.expires) {
/* Entry is expired even if it is in use */
@@ -620,14 +658,14 @@ static void rt_check_expire(unsigned long dummy)
rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
}
- spin_unlock(&rt_hash_table[i].lock);
+ spin_unlock(rt_hash_lock_addr(i));
/* Fallback loop breaker. */
if (time_after(jiffies, now))
break;
}
rover = i;
- mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
+ mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
}
/* This can run from both BH and non-BH contexts, the latter
@@ -643,11 +681,11 @@ static void rt_run_flush(unsigned long dummy)
get_random_bytes(&rt_hash_rnd, 4);
for (i = rt_hash_mask; i >= 0; i--) {
- spin_lock_bh(&rt_hash_table[i].lock);
+ spin_lock_bh(rt_hash_lock_addr(i));
rth = rt_hash_table[i].chain;
if (rth)
rt_hash_table[i].chain = NULL;
- spin_unlock_bh(&rt_hash_table[i].lock);
+ spin_unlock_bh(rt_hash_lock_addr(i));
for (; rth; rth = next) {
next = rth->u.rt_next;
@@ -780,7 +818,7 @@ static int rt_garbage_collect(void)
k = (k + 1) & rt_hash_mask;
rthp = &rt_hash_table[k].chain;
- spin_lock_bh(&rt_hash_table[k].lock);
+ spin_lock_bh(rt_hash_lock_addr(k));
while ((rth = *rthp) != NULL) {
if (!rt_may_expire(rth, tmo, expire)) {
tmo >>= 1;
@@ -812,7 +850,7 @@ static int rt_garbage_collect(void)
goal--;
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
}
- spin_unlock_bh(&rt_hash_table[k].lock);
+ spin_unlock_bh(rt_hash_lock_addr(k));
if (goal <= 0)
break;
}
@@ -882,7 +920,7 @@ restart:
rthp = &rt_hash_table[hash].chain;
- spin_lock_bh(&rt_hash_table[hash].lock);
+ spin_lock_bh(rt_hash_lock_addr(hash));
while ((rth = *rthp) != NULL) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
if (!(rth->u.dst.flags & DST_BALANCED) &&
@@ -908,7 +946,7 @@ restart:
rth->u.dst.__use++;
dst_hold(&rth->u.dst);
rth->u.dst.lastuse = now;
- spin_unlock_bh(&rt_hash_table[hash].lock);
+ spin_unlock_bh(rt_hash_lock_addr(hash));
rt_drop(rt);
*rp = rth;
@@ -949,7 +987,7 @@ restart:
if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
int err = arp_bind_neighbour(&rt->u.dst);
if (err) {
- spin_unlock_bh(&rt_hash_table[hash].lock);
+ spin_unlock_bh(rt_hash_lock_addr(hash));
if (err != -ENOBUFS) {
rt_drop(rt);
@@ -990,7 +1028,7 @@ restart:
}
#endif
rt_hash_table[hash].chain = rt;
- spin_unlock_bh(&rt_hash_table[hash].lock);
+ spin_unlock_bh(rt_hash_lock_addr(hash));
*rp = rt;
return 0;
}
@@ -1058,7 +1096,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
{
struct rtable **rthp;
- spin_lock_bh(&rt_hash_table[hash].lock);
+ spin_lock_bh(rt_hash_lock_addr(hash));
ip_rt_put(rt);
for (rthp = &rt_hash_table[hash].chain; *rthp;
rthp = &(*rthp)->u.rt_next)
@@ -1067,7 +1105,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
rt_free(rt);
break;
}
- spin_unlock_bh(&rt_hash_table[hash].lock);
+ spin_unlock_bh(rt_hash_lock_addr(hash));
}
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -3073,12 +3111,14 @@ __setup("rhash_entries=", set_rhash_entries);
int __init ip_rt_init(void)
{
- int i, order, goal, rc = 0;
+ int rc = 0;
rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
(jiffies ^ (jiffies >> 7)));
#ifdef CONFIG_NET_CLS_ROUTE
+ {
+ int order;
for (order = 0;
(PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
/* NOTHING */;
@@ -3086,6 +3126,7 @@ int __init ip_rt_init(void)
if (!ip_rt_acct)
panic("IP: failed to allocate ip_rt_acct\n");
memset(ip_rt_acct, 0, PAGE_SIZE << order);
+ }
#endif
ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
@@ -3096,36 +3137,19 @@ int __init ip_rt_init(void)
if (!ipv4_dst_ops.kmem_cachep)
panic("IP: failed to allocate ip_dst_cache\n");
- goal = num_physpages >> (26 - PAGE_SHIFT);
- if (rhash_entries)
- goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
- for (order = 0; (1UL << order) < goal; order++)
- /* NOTHING */;
-
- do {
- rt_hash_mask = (1UL << order) * PAGE_SIZE /
- sizeof(struct rt_hash_bucket);
- while (rt_hash_mask & (rt_hash_mask - 1))
- rt_hash_mask--;
- rt_hash_table = (struct rt_hash_bucket *)
- __get_free_pages(GFP_ATOMIC, order);
- } while (rt_hash_table == NULL && --order > 0);
-
- if (!rt_hash_table)
- panic("Failed to allocate IP route cache hash table\n");
-
- printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
- rt_hash_mask,
- (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
-
- for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
- /* NOTHING */;
-
- rt_hash_mask--;
- for (i = 0; i <= rt_hash_mask; i++) {
- spin_lock_init(&rt_hash_table[i].lock);
- rt_hash_table[i].chain = NULL;
- }
+ rt_hash_table = (struct rt_hash_bucket *)
+ alloc_large_system_hash("IP route cache",
+ sizeof(struct rt_hash_bucket),
+ rhash_entries,
+ (num_physpages >= 128 * 1024) ?
+ (27 - PAGE_SHIFT) :
+ (29 - PAGE_SHIFT),
+ HASH_HIGHMEM,
+ &rt_hash_log,
+ &rt_hash_mask,
+ 0);
+ memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
+ rt_hash_lock_init();
ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
ip_rt_max_size = (rt_hash_mask + 1) * 16;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 882436da9a3a..ddb6ce4ecff2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
size_t psize, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
- int mss_now;
+ int mss_now, size_goal;
int err;
ssize_t copied;
long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+ size_goal = tp->xmit_size_goal;
copied = 0;
err = -EPIPE;
@@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
int offset = poffset % PAGE_SIZE;
int size = min_t(size_t, psize, PAGE_SIZE - offset);
- if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
+ if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
@@ -652,7 +653,7 @@ new_segment:
goto wait_for_memory;
skb_entail(sk, tp, skb);
- copy = mss_now;
+ copy = size_goal;
}
if (copy > size)
@@ -693,7 +694,7 @@ new_segment:
if (!(psize -= copy))
goto out;
- if (skb->len != mss_now || (flags & MSG_OOB))
+ if (skb->len < mss_now || (flags & MSG_OOB))
continue;
if (forced_push(tp)) {
@@ -713,6 +714,7 @@ wait_for_memory:
goto do_error;
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+ size_goal = tp->xmit_size_goal;
}
out:
@@ -754,15 +756,20 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
static inline int select_size(struct sock *sk, struct tcp_sock *tp)
{
- int tmp = tp->mss_cache_std;
+ int tmp = tp->mss_cache;
if (sk->sk_route_caps & NETIF_F_SG) {
- int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
+ if (sk->sk_route_caps & NETIF_F_TSO)
+ tmp = 0;
+ else {
+ int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
- if (tmp >= pgbreak &&
- tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
- tmp = pgbreak;
+ if (tmp >= pgbreak &&
+ tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
+ tmp = pgbreak;
+ }
}
+
return tmp;
}
@@ -773,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int iovlen, flags;
- int mss_now;
+ int mss_now, size_goal;
int err, copied;
long timeo;
@@ -792,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+ size_goal = tp->xmit_size_goal;
/* Ok commence sending. */
iovlen = msg->msg_iovlen;
@@ -814,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
skb = sk->sk_write_queue.prev;
if (!sk->sk_send_head ||
- (copy = mss_now - skb->len) <= 0) {
+ (copy = size_goal - skb->len) <= 0) {
new_segment:
/* Allocate new segment. If the interface is SG,
@@ -837,7 +845,7 @@ new_segment:
skb->ip_summed = CHECKSUM_HW;
skb_entail(sk, tp, skb);
- copy = mss_now;
+ copy = size_goal;
}
/* Try to append data to the end of skb. */
@@ -872,11 +880,6 @@ new_segment:
tcp_mark_push(tp, skb);
goto new_segment;
} else if (page) {
- /* If page is cached, align
- * offset to L1 cache boundary
- */
- off = (off + L1_CACHE_BYTES - 1) &
- ~(L1_CACHE_BYTES - 1);
if (off == PAGE_SIZE) {
put_page(page);
TCP_PAGE(sk) = page = NULL;
@@ -937,7 +940,7 @@ new_segment:
if ((seglen -= copy) == 0 && iovlen == 0)
goto out;
- if (skb->len != mss_now || (flags & MSG_OOB))
+ if (skb->len < mss_now || (flags & MSG_OOB))
continue;
if (forced_push(tp)) {
@@ -957,6 +960,7 @@ wait_for_memory:
goto do_error;
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+ size_goal = tp->xmit_size_goal;
}
}
@@ -1101,7 +1105,7 @@ static void tcp_prequeue_process(struct sock *sk)
struct sk_buff *skb;
struct tcp_sock *tp = tcp_sk(sk);
- NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
+ NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
/* RX process wants to run with disabled BHs, though it is not
* necessary */
@@ -1365,7 +1369,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
* is not empty. It is more elegant, but eats cycles,
* unfortunately.
*/
- if (skb_queue_len(&tp->ucopy.prequeue))
+ if (!skb_queue_empty(&tp->ucopy.prequeue))
goto do_prequeue;
/* __ Set realtime policy in scheduler __ */
@@ -1390,7 +1394,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
}
if (tp->rcv_nxt == tp->copied_seq &&
- skb_queue_len(&tp->ucopy.prequeue)) {
+ !skb_queue_empty(&tp->ucopy.prequeue)) {
do_prequeue:
tcp_prequeue_process(sk);
@@ -1472,7 +1476,7 @@ skip_copy:
} while (len > 0);
if (user_recv) {
- if (skb_queue_len(&tp->ucopy.prequeue)) {
+ if (!skb_queue_empty(&tp->ucopy.prequeue)) {
int chunk;
tp->ucopy.len = copied > 0 ? len : 0;
@@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_rto = jiffies_to_usecs(tp->rto);
info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
- info->tcpi_snd_mss = tp->mss_cache_std;
+ info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_rcv_mss = tp->ack.rcv_mss;
info->tcpi_unacked = tp->packets_out;
@@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
switch (optname) {
case TCP_MAXSEG:
- val = tp->mss_cache_std;
+ val = tp->mss_cache;
if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
val = tp->rx_opt.user_mss;
break;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7bbbbc33eb4b..53a8a5399f1e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
if (!cwnd) {
- if (tp->mss_cache_std > 1460)
+ if (tp->mss_cache > 1460)
cwnd = 2;
else
- cwnd = (tp->mss_cache_std > 1095) ? 3 : 4;
+ cwnd = (tp->mss_cache > 1095) ? 3 : 4;
}
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
@@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
if (sk->sk_route_caps & NETIF_F_TSO) {
sk->sk_route_caps &= ~NETIF_F_TSO;
sock_set_flag(sk, SOCK_NO_LARGESEND);
- tp->mss_cache = tp->mss_cache_std;
+ tp->mss_cache = tp->mss_cache;
}
if (!tp->sacked_out)
@@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
(IsFack(tp) ||
!before(lost_retrans,
TCP_SKB_CB(skb)->ack_seq + tp->reordering *
- tp->mss_cache_std))) {
+ tp->mss_cache))) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
@@ -1957,15 +1957,6 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
}
}
-/* There is one downside to this scheme. Although we keep the
- * ACK clock ticking, adjusting packet counters and advancing
- * congestion window, we do not liberate socket send buffer
- * space.
- *
- * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
- * then making a write space wakeup callback is a possible
- * future enhancement. WARNING: it is not trivial to make.
- */
static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
__u32 now, __s32 *seq_rtt)
{
@@ -2047,7 +2038,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
* the other end.
*/
if (after(scb->end_seq, tp->snd_una)) {
- if (tcp_skb_pcount(skb) > 1)
+ if (tcp_skb_pcount(skb) > 1 &&
+ after(tp->snd_una, scb->seq))
acked |= tcp_tso_acked(sk, skb,
now, &seq_rtt);
break;
@@ -2810,7 +2802,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
int this_sack;
/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
- if (skb_queue_len(&tp->out_of_order_queue) == 0) {
+ if (skb_queue_empty(&tp->out_of_order_queue)) {
tp->rx_opt.num_sacks = 0;
tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
return;
@@ -2943,13 +2935,13 @@ queue_and_out:
if(th->fin)
tcp_fin(skb, sk, th);
- if (skb_queue_len(&tp->out_of_order_queue)) {
+ if (!skb_queue_empty(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
/* RFC2581. 4.2. SHOULD send immediate ACK, when
* gap in queue is filled.
*/
- if (!skb_queue_len(&tp->out_of_order_queue))
+ if (skb_queue_empty(&tp->out_of_order_queue))
tp->ack.pingpong = 0;
}
@@ -3257,9 +3249,8 @@ static int tcp_prune_queue(struct sock *sk)
* This must not ever occur. */
/* First, purge the out_of_order queue. */
- if (skb_queue_len(&tp->out_of_order_queue)) {
- NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED,
- skb_queue_len(&tp->out_of_order_queue));
+ if (!skb_queue_empty(&tp->out_of_order_queue)) {
+ NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
__skb_queue_purge(&tp->out_of_order_queue);
/* Reset SACK state. A conforming SACK implementation will
@@ -3308,6 +3299,28 @@ void tcp_cwnd_application_limited(struct sock *sk)
tp->snd_cwnd_stamp = tcp_time_stamp;
}
+static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
+{
+ /* If the user specified a specific send buffer setting, do
+ * not modify it.
+ */
+ if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+ return 0;
+
+ /* If we are under global TCP memory pressure, do not expand. */
+ if (tcp_memory_pressure)
+ return 0;
+
+ /* If we are under soft global TCP memory pressure, do not expand. */
+ if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
+ return 0;
+
+ /* If we filled the congestion window, do not expand. */
+ if (tp->packets_out >= tp->snd_cwnd)
+ return 0;
+
+ return 1;
+}
/* When incoming ACK allowed to free some skb from write_queue,
* we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
@@ -3319,11 +3332,8 @@ static void tcp_new_space(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->packets_out < tp->snd_cwnd &&
- !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
- !tcp_memory_pressure &&
- atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
- int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
+ if (tcp_should_expand_sndbuf(sk, tp)) {
+ int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
demanded = max_t(unsigned int, tp->snd_cwnd,
tp->reordering + 1);
@@ -3346,22 +3356,9 @@ static inline void tcp_check_space(struct sock *sk)
}
}
-static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
- tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
- tcp_write_xmit(sk, tp->nonagle))
- tcp_check_probe_timer(sk, tp);
-}
-
-static __inline__ void tcp_data_snd_check(struct sock *sk)
+static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
{
- struct sk_buff *skb = sk->sk_send_head;
-
- if (skb != NULL)
- __tcp_data_snd_check(sk, skb);
+ tcp_push_pending_frames(sk, tp);
tcp_check_space(sk);
}
@@ -3655,7 +3652,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
*/
tcp_ack(sk, skb, 0);
__kfree_skb(skb);
- tcp_data_snd_check(sk);
+ tcp_data_snd_check(sk, tp);
return 0;
} else { /* Header too small */
TCP_INC_STATS_BH(TCP_MIB_INERRS);
@@ -3721,7 +3718,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
/* Well, only one small jumplet in fast path... */
tcp_ack(sk, skb, FLAG_DATA);
- tcp_data_snd_check(sk);
+ tcp_data_snd_check(sk, tp);
if (!tcp_ack_scheduled(tp))
goto no_ack;
}
@@ -3799,7 +3796,7 @@ step5:
/* step 7: process the segment text */
tcp_data_queue(sk, skb);
- tcp_data_snd_check(sk);
+ tcp_data_snd_check(sk, tp);
tcp_ack_snd_check(sk);
return 0;
@@ -4109,7 +4106,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* Do step6 onward by hand. */
tcp_urg(sk, skb, th);
__kfree_skb(skb);
- tcp_data_snd_check(sk);
+ tcp_data_snd_check(sk, tp);
return 0;
}
@@ -4300,7 +4297,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* tcp_data could move socket to TIME-WAIT */
if (sk->sk_state != TCP_CLOSE) {
- tcp_data_snd_check(sk);
+ tcp_data_snd_check(sk, tp);
tcp_ack_snd_check(sk);
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ebf112347a97..62f62bb05c2a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk)
*/
tp->snd_ssthresh = 0x7fffffff; /* Infinity */
tp->snd_cwnd_clamp = ~0;
- tp->mss_cache_std = tp->mss_cache = 536;
+ tp->mss_cache = 536;
tp->reordering = sysctl_tcp_reordering;
tp->ca_ops = &tcp_init_congestion_ops;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0e17c244875c..e3f8ea1bfa9c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
* will allow a single TSO frame to consume. Building TSO frames
* which are too large can cause TCP streams to be bursty.
*/
-int sysctl_tcp_tso_win_divisor = 8;
+int sysctl_tcp_tso_win_divisor = 3;
static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
struct sk_buff *skb)
@@ -140,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
tp->ack.pingpong = 1;
}
-static __inline__ void tcp_event_ack_sent(struct sock *sk)
+static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
struct tcp_sock *tp = tcp_sk(sk);
- tcp_dec_quickack_mode(tp);
+ tcp_dec_quickack_mode(tp, pkts);
tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
}
@@ -355,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
tp->af_specific->send_check(sk, th, skb->len, skb);
if (tcb->flags & TCPCB_FLAG_ACK)
- tcp_event_ack_sent(sk);
+ tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
if (skb->len != tcp_header_size)
tcp_event_data_sent(tp, skb, sk);
@@ -403,42 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
sk->sk_send_head = skb;
}
-static inline void tcp_tso_set_push(struct sk_buff *skb)
-{
- /* Force push to be on for any TSO frames to workaround
- * problems with busted implementations like Mac OS-X that
- * hold off socket receive wakeups until push is seen.
- */
- if (tcp_skb_pcount(skb) > 1)
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
-}
-
-/* Send _single_ skb sitting at the send head. This function requires
- * true push pending frames to setup probe timer etc.
- */
-void tcp_push_one(struct sock *sk, unsigned cur_mss)
+static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb = sk->sk_send_head;
- if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) {
- /* Send it out now. */
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_tso_set_push(skb);
- if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
- sk->sk_send_head = NULL;
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- tcp_packets_out_inc(sk, tp, skb);
- return;
- }
- }
-}
-
-void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (skb->len <= tp->mss_cache_std ||
+ if (skb->len <= tp->mss_cache ||
!(sk->sk_route_caps & NETIF_F_TSO)) {
/* Avoid the costly divide in the normal
* non-TSO case.
@@ -448,10 +417,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
} else {
unsigned int factor;
- factor = skb->len + (tp->mss_cache_std - 1);
- factor /= tp->mss_cache_std;
+ factor = skb->len + (tp->mss_cache - 1);
+ factor /= tp->mss_cache;
skb_shinfo(skb)->tso_segs = factor;
- skb_shinfo(skb)->tso_size = tp->mss_cache_std;
+ skb_shinfo(skb)->tso_size = tp->mss_cache;
}
}
@@ -537,6 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
}
/* Link BUFF into the send queue. */
+ skb_header_release(buff);
__skb_append(skb, buff);
return 0;
@@ -657,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
/* And store cached results */
tp->pmtu_cookie = pmtu;
- tp->mss_cache = tp->mss_cache_std = mss_now;
+ tp->mss_cache = mss_now;
return mss_now;
}
@@ -669,57 +639,316 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
* cannot be large. However, taking into account rare use of URG, this
* is not a big flaw.
*/
-
-unsigned int tcp_current_mss(struct sock *sk, int large)
+unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
{
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
- unsigned int do_large, mss_now;
+ u32 mss_now;
+ u16 xmit_size_goal;
+ int doing_tso = 0;
+
+ mss_now = tp->mss_cache;
+
+ if (large_allowed &&
+ (sk->sk_route_caps & NETIF_F_TSO) &&
+ !tp->urg_mode)
+ doing_tso = 1;
- mss_now = tp->mss_cache_std;
if (dst) {
u32 mtu = dst_mtu(dst);
if (mtu != tp->pmtu_cookie)
mss_now = tcp_sync_mss(sk, mtu);
}
- do_large = (large &&
- (sk->sk_route_caps & NETIF_F_TSO) &&
- !tp->urg_mode);
+ if (tp->rx_opt.eff_sacks)
+ mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
+ (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
- if (do_large) {
- unsigned int large_mss, factor, limit;
+ xmit_size_goal = mss_now;
- large_mss = 65535 - tp->af_specific->net_header_len -
+ if (doing_tso) {
+ xmit_size_goal = 65535 -
+ tp->af_specific->net_header_len -
tp->ext_header_len - tp->tcp_header_len;
- if (tp->max_window && large_mss > (tp->max_window>>1))
- large_mss = max((tp->max_window>>1),
- 68U - tp->tcp_header_len);
+ if (tp->max_window &&
+ (xmit_size_goal > (tp->max_window >> 1)))
+ xmit_size_goal = max((tp->max_window >> 1),
+ 68U - tp->tcp_header_len);
+
+ xmit_size_goal -= (xmit_size_goal % mss_now);
+ }
+ tp->xmit_size_goal = xmit_size_goal;
- factor = large_mss / mss_now;
+ return mss_now;
+}
- /* Always keep large mss multiple of real mss, but
- * do not exceed 1/tso_win_divisor of the congestion window
- * so we can keep the ACK clock ticking and minimize
- * bursting.
- */
- limit = tp->snd_cwnd;
- if (sysctl_tcp_tso_win_divisor)
- limit /= sysctl_tcp_tso_win_divisor;
- limit = max(1U, limit);
- if (factor > limit)
- factor = limit;
+/* Congestion window validation. (RFC2861) */
- tp->mss_cache = mss_now * factor;
+static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
+{
+ __u32 packets_out = tp->packets_out;
+
+ if (packets_out >= tp->snd_cwnd) {
+ /* Network is feed fully. */
+ tp->snd_cwnd_used = 0;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ } else {
+ /* Network starves. */
+ if (tp->packets_out > tp->snd_cwnd_used)
+ tp->snd_cwnd_used = tp->packets_out;
- mss_now = tp->mss_cache;
+ if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
+ tcp_cwnd_application_limited(sk);
}
+}
- if (tp->rx_opt.eff_sacks)
- mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
- (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
- return mss_now;
+static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
+{
+ u32 window, cwnd_len;
+
+ window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
+ cwnd_len = mss_now * cwnd;
+ return min(window, cwnd_len);
+}
+
+/* Can at least one segment of SKB be sent right now, according to the
+ * congestion window rules? If so, return how many segments are allowed.
+ */
+static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ u32 in_flight, cwnd;
+
+ /* Don't be strict about the congestion window for the final FIN. */
+ if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+ return 1;
+
+ in_flight = tcp_packets_in_flight(tp);
+ cwnd = tp->snd_cwnd;
+ if (in_flight < cwnd)
+ return (cwnd - in_flight);
+
+ return 0;
+}
+
+/* This must be invoked the first time we consider transmitting
+ * SKB onto the wire.
+ */
+static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
+{
+ int tso_segs = tcp_skb_pcount(skb);
+
+ if (!tso_segs) {
+ tcp_set_skb_tso_segs(sk, skb);
+ tso_segs = tcp_skb_pcount(skb);
+ }
+ return tso_segs;
+}
+
+static inline int tcp_minshall_check(const struct tcp_sock *tp)
+{
+ return after(tp->snd_sml,tp->snd_una) &&
+ !after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Return 0, if packet can be sent now without violation Nagle's rules:
+ * 1. It is full sized.
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_NODELAY was set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ * With Minshall's modification: all sent small packets are ACKed.
+ */
+
+static inline int tcp_nagle_check(const struct tcp_sock *tp,
+ const struct sk_buff *skb,
+ unsigned mss_now, int nonagle)
+{
+ return (skb->len < mss_now &&
+ ((nonagle&TCP_NAGLE_CORK) ||
+ (!nonagle &&
+ tp->packets_out &&
+ tcp_minshall_check(tp))));
+}
+
+/* Return non-zero if the Nagle test allows this packet to be
+ * sent now.
+ */
+static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
+ unsigned int cur_mss, int nonagle)
+{
+ /* Nagle rule does not apply to frames, which sit in the middle of the
+ * write_queue (they have no chances to get new data).
+ *
+ * This is implemented in the callers, where they modify the 'nonagle'
+ * argument based upon the location of SKB in the send queue.
+ */
+ if (nonagle & TCP_NAGLE_PUSH)
+ return 1;
+
+ /* Don't use the nagle rule for urgent data (or for the final FIN). */
+ if (tp->urg_mode ||
+ (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
+ return 1;
+
+ if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
+ return 1;
+
+ return 0;
+}
+
+/* Does at least the first segment of SKB fit into the send window? */
+static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
+{
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (skb->len > cur_mss)
+ end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
+
+ return !after(end_seq, tp->snd_una + tp->snd_wnd);
+}
+
+/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
+ * should be put on the wire right now. If so, it returns the number of
+ * packets allowed by the congestion window.
+ */
+static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
+ unsigned int cur_mss, int nonagle)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int cwnd_quota;
+
+ tcp_init_tso_segs(sk, skb);
+
+ if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
+ return 0;
+
+ cwnd_quota = tcp_cwnd_test(tp, skb);
+ if (cwnd_quota &&
+ !tcp_snd_wnd_test(tp, skb, cur_mss))
+ cwnd_quota = 0;
+
+ return cwnd_quota;
+}
+
+static inline int tcp_skb_is_last(const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ return skb->next == (struct sk_buff *)&sk->sk_write_queue;
+}
+
+int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
+{
+ struct sk_buff *skb = sk->sk_send_head;
+
+ return (skb &&
+ tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
+ (tcp_skb_is_last(sk, skb) ?
+ TCP_NAGLE_PUSH :
+ tp->nonagle)));
+}
+
+/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
+ * which is put after SKB on the list. It is very much like
+ * tcp_fragment() except that it may make several kinds of assumptions
+ * in order to speed up the splitting operation. In particular, we
+ * know that all the data is in scatter-gather pages, and that the
+ * packet has never been sent out before (and thus is not cloned).
+ */
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
+{
+ struct sk_buff *buff;
+ int nlen = skb->len - len;
+ u16 flags;
+
+ /* All of a TSO frame must be composed of paged data. */
+ BUG_ON(skb->len != skb->data_len);
+
+ buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
+ if (unlikely(buff == NULL))
+ return -ENOMEM;
+
+ buff->truesize = nlen;
+ skb->truesize -= nlen;
+
+ /* Correct the sequence numbers. */
+ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+ /* PSH and FIN should only be set in the second packet. */
+ flags = TCP_SKB_CB(skb)->flags;
+ TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+ TCP_SKB_CB(buff)->flags = flags;
+
+ /* This packet was never sent out yet, so no SACK bits. */
+ TCP_SKB_CB(buff)->sacked = 0;
+
+ buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
+ skb_split(skb, buff, len);
+
+ /* Fix up tso_factor for both original and new SKB. */
+ tcp_set_skb_tso_segs(sk, skb);
+ tcp_set_skb_tso_segs(sk, buff);
+
+ /* Link BUFF into the send queue. */
+ skb_header_release(buff);
+ __skb_append(skb, buff);
+
+ return 0;
+}
+
+/* Try to defer sending, if possible, in order to minimize the amount
+ * of TSO splitting we do. View it as a kind of TSO Nagle test.
+ *
+ * This algorithm is from John Heffner.
+ */
+static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
+{
+ u32 send_win, cong_win, limit, in_flight;
+
+ if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+ return 0;
+
+ if (tp->ca_state != TCP_CA_Open)
+ return 0;
+
+ in_flight = tcp_packets_in_flight(tp);
+
+ BUG_ON(tcp_skb_pcount(skb) <= 1 ||
+ (tp->snd_cwnd <= in_flight));
+
+ send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
+
+ /* From in_flight test above, we know that cwnd > in_flight. */
+ cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
+
+ limit = min(send_win, cong_win);
+
+ /* If sk_send_head can be sent fully now, just do it. */
+ if (skb->len <= limit)
+ return 0;
+
+ if (sysctl_tcp_tso_win_divisor) {
+ u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
+
+ /* If at least some fraction of a window is available,
+ * just use it.
+ */
+ chunk /= sysctl_tcp_tso_win_divisor;
+ if (limit >= chunk)
+ return 0;
+ } else {
+ /* Different approach, try not to defer past a single
+ * ACK. Receiver should ACK every other full sized
+ * frame, so if we have space for more than 3 frames
+ * then send now.
+ */
+ if (limit > tcp_max_burst(tp) * tp->mss_cache)
+ return 0;
+ }
+
+ /* Ok, it looks like it is advisable to defer. */
+ return 1;
}
/* This routine writes packets to the network. It advances the
@@ -729,57 +958,158 @@ unsigned int tcp_current_mss(struct sock *sk, int large)
* Returns 1, if no segments are in flight and we have queued segments, but
* cannot send anything now because of SWS or another problem.
*/
-int tcp_write_xmit(struct sock *sk, int nonagle)
+static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
{
struct tcp_sock *tp = tcp_sk(sk);
- unsigned int mss_now;
+ struct sk_buff *skb;
+ unsigned int tso_segs, sent_pkts;
+ int cwnd_quota;
/* If we are closed, the bytes will have to remain here.
* In time closedown will finish, we empty the write queue and all
* will be happy.
*/
- if (sk->sk_state != TCP_CLOSE) {
- struct sk_buff *skb;
- int sent_pkts = 0;
+ if (unlikely(sk->sk_state == TCP_CLOSE))
+ return 0;
+
+ skb = sk->sk_send_head;
+ if (unlikely(!skb))
+ return 0;
+
+ tso_segs = tcp_init_tso_segs(sk, skb);
+ cwnd_quota = tcp_cwnd_test(tp, skb);
+ if (unlikely(!cwnd_quota))
+ goto out;
+
+ sent_pkts = 0;
+ while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
+ BUG_ON(!tso_segs);
+
+ if (tso_segs == 1) {
+ if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
+ (tcp_skb_is_last(sk, skb) ?
+ nonagle : TCP_NAGLE_PUSH))))
+ break;
+ } else {
+ if (tcp_tso_should_defer(sk, tp, skb))
+ break;
+ }
- /* Account for SACKS, we may need to fragment due to this.
- * It is just like the real MSS changing on us midstream.
- * We also handle things correctly when the user adds some
- * IP options mid-stream. Silly to do, but cover it.
- */
- mss_now = tcp_current_mss(sk, 1);
-
- while ((skb = sk->sk_send_head) &&
- tcp_snd_test(sk, skb, mss_now,
- tcp_skb_is_last(sk, skb) ? nonagle :
- TCP_NAGLE_PUSH)) {
- if (skb->len > mss_now) {
- if (tcp_fragment(sk, skb, mss_now))
+ if (tso_segs > 1) {
+ u32 limit = tcp_window_allows(tp, skb,
+ mss_now, cwnd_quota);
+
+ if (skb->len < limit) {
+ unsigned int trim = skb->len % mss_now;
+
+ if (trim)
+ limit = skb->len - trim;
+ }
+ if (skb->len > limit) {
+ if (tso_fragment(sk, skb, limit))
break;
}
-
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_tso_set_push(skb);
- if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
+ } else if (unlikely(skb->len > mss_now)) {
+ if (unlikely(tcp_fragment(sk, skb, mss_now)))
break;
+ }
- /* Advance the send_head. This one is sent out.
- * This call will increment packets_out.
- */
- update_send_head(sk, tp, skb);
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+
+ if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
+ break;
+
+ /* Advance the send_head. This one is sent out.
+ * This call will increment packets_out.
+ */
+ update_send_head(sk, tp, skb);
+
+ tcp_minshall_update(tp, mss_now, skb);
+ sent_pkts++;
+
+ /* Do not optimize this to use tso_segs. If we chopped up
+ * the packet above, tso_segs will no longer be valid.
+ */
+ cwnd_quota -= tcp_skb_pcount(skb);
+
+ BUG_ON(cwnd_quota < 0);
+ if (!cwnd_quota)
+ break;
+
+ skb = sk->sk_send_head;
+ if (!skb)
+ break;
+ tso_segs = tcp_init_tso_segs(sk, skb);
+ }
+
+ if (likely(sent_pkts)) {
+ tcp_cwnd_validate(sk, tp);
+ return 0;
+ }
+out:
+ return !tp->packets_out && sk->sk_send_head;
+}
+
+/* Push out any pending frames which were held back due to
+ * TCP_CORK or attempt at coalescing tiny packets.
+ * The socket must be locked by the caller.
+ */
+void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
+ unsigned int cur_mss, int nonagle)
+{
+ struct sk_buff *skb = sk->sk_send_head;
- tcp_minshall_update(tp, mss_now, skb);
- sent_pkts = 1;
+ if (skb) {
+ if (tcp_write_xmit(sk, cur_mss, nonagle))
+ tcp_check_probe_timer(sk, tp);
+ }
+}
+
+/* Send _single_ skb sitting at the send head. This function requires
+ * true push pending frames to setup probe timer etc.
+ */
+void tcp_push_one(struct sock *sk, unsigned int mss_now)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb = sk->sk_send_head;
+ unsigned int tso_segs, cwnd_quota;
+
+ BUG_ON(!skb || skb->len < mss_now);
+
+ tso_segs = tcp_init_tso_segs(sk, skb);
+ cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
+
+ if (likely(cwnd_quota)) {
+ BUG_ON(!tso_segs);
+
+ if (tso_segs > 1) {
+ u32 limit = tcp_window_allows(tp, skb,
+ mss_now, cwnd_quota);
+
+ if (skb->len < limit) {
+ unsigned int trim = skb->len % mss_now;
+
+ if (trim)
+ limit = skb->len - trim;
+ }
+ if (skb->len > limit) {
+ if (unlikely(tso_fragment(sk, skb, limit)))
+ return;
+ }
+ } else if (unlikely(skb->len > mss_now)) {
+ if (unlikely(tcp_fragment(sk, skb, mss_now)))
+ return;
}
- if (sent_pkts) {
+ /* Send it out now. */
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+
+ if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
+ update_send_head(sk, tp, skb);
tcp_cwnd_validate(sk, tp);
- return 0;
+ return;
}
-
- return !tp->packets_out && sk->sk_send_head;
}
- return 0;
}
/* This function returns the amount that we can raise the
@@ -1039,7 +1369,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (sk->sk_route_caps & NETIF_F_TSO) {
sk->sk_route_caps &= ~NETIF_F_TSO;
sock_set_flag(sk, SOCK_NO_LARGESEND);
- tp->mss_cache = tp->mss_cache_std;
}
if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
@@ -1101,7 +1430,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
* is still in somebody's hands, else make a clone.
*/
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_tso_set_push(skb);
err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
pskb_copy(skb, GFP_ATOMIC):
@@ -1285,7 +1613,7 @@ void tcp_send_fin(struct sock *sk)
* was unread data in the receive queue. This behavior is recommended
* by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
*/
-void tcp_send_active_reset(struct sock *sk, int priority)
+void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -1670,14 +1998,12 @@ int tcp_write_wakeup(struct sock *sk)
if (sk->sk_route_caps & NETIF_F_TSO) {
sock_set_flag(sk, SOCK_NO_LARGESEND);
sk->sk_route_caps &= ~NETIF_F_TSO;
- tp->mss_cache = tp->mss_cache_std;
}
} else if (!tcp_skb_pcount(skb))
tcp_set_skb_tso_segs(sk, skb);
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_tso_set_push(skb);
err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
if (!err) {
update_send_head(sk, tp, skb);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b127b4498565..0084227438c2 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -231,11 +231,10 @@ static void tcp_delack_timer(unsigned long data)
}
tp->ack.pending &= ~TCP_ACK_TIMER;
- if (skb_queue_len(&tp->ucopy.prequeue)) {
+ if (!skb_queue_empty(&tp->ucopy.prequeue)) {
struct sk_buff *skb;
- NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED,
- skb_queue_len(&tp->ucopy.prequeue));
+ NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);
while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
sk->sk_backlog_rcv(sk, skb);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2b193e3df49a..28d9bcab0970 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -774,7 +774,6 @@ static int __init inet6_init(void)
if (if6_proc_init())
goto proc_if6_fail;
#endif
- ipv6_packet_init();
ip6_route_init();
ip6_flowlabel_init();
err = addrconf_init();
@@ -791,6 +790,8 @@ static int __init inet6_init(void)
/* Init v6 transport protocols. */
udpv6_init();
tcpv6_init();
+
+ ipv6_packet_init();
err = 0;
out:
return err;
@@ -798,7 +799,6 @@ out:
addrconf_fail:
ip6_flowlabel_cleanup();
ip6_route_cleanup();
- ipv6_packet_cleanup();
#ifdef CONFIG_PROC_FS
if6_proc_exit();
proc_if6_fail:
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 06e7cdaeedc5..1f2c2f9e353f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -465,7 +465,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->pkt_type = from->pkt_type;
to->priority = from->priority;
to->protocol = from->protocol;
- to->security = from->security;
dst_release(to->dst);
to->dst = dst_clone(from->dst);
to->dev = from->dev;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 562fcd14fdea..29fed6e58d0a 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -281,7 +281,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr)
}
write_unlock_bh(&ipv6_sk_mc_lock);
- return -ENOENT;
+ return -EADDRNOTAVAIL;
}
static struct inet6_dev *ip6_mc_find_dev(struct in6_addr *group, int ifindex)
@@ -386,12 +386,16 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
if (ipv6_addr_equal(&pmc->addr, group))
break;
}
- if (!pmc) /* must have a prior join */
+ if (!pmc) { /* must have a prior join */
+ err = -EINVAL;
goto done;
+ }
/* if a source filter was set, must be the same mode as before */
if (pmc->sflist) {
- if (pmc->sfmode != omode)
+ if (pmc->sfmode != omode) {
+ err = -EINVAL;
goto done;
+ }
} else if (pmc->sfmode != omode) {
/* allow mode switches for empty-set filters */
ip6_mc_add_src(idev, group, omode, 0, NULL, 0);
@@ -402,7 +406,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
psl = pmc->sflist;
if (!add) {
if (!psl)
- goto done;
+ goto done; /* err = -EADDRNOTAVAIL */
rv = !0;
for (i=0; i<psl->sl_count; i++) {
rv = memcmp(&psl->sl_addr[i], source,
@@ -411,7 +415,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
break;
}
if (rv) /* source not found */
- goto done;
+ goto done; /* err = -EADDRNOTAVAIL */
/* special case - (INCLUDE, empty) == LEAVE_GROUP */
if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
@@ -488,6 +492,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
struct inet6_dev *idev;
struct ipv6_pinfo *inet6 = inet6_sk(sk);
struct ip6_sf_socklist *newpsl, *psl;
+ int leavegroup = 0;
int i, err;
group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr;
@@ -503,7 +508,12 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
if (!idev)
return -ENODEV;
dev = idev->dev;
- err = -EADDRNOTAVAIL;
+
+ err = 0;
+ if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) {
+ leavegroup = 1;
+ goto done;
+ }
for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
if (pmc->ifindex != gsf->gf_interface)
@@ -511,8 +521,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
if (ipv6_addr_equal(&pmc->addr, group))
break;
}
- if (!pmc) /* must have a prior join */
+ if (!pmc) { /* must have a prior join */
+ err = -EINVAL;
goto done;
+ }
if (gsf->gf_numsrc) {
newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk,
IP6_SFLSIZE(gsf->gf_numsrc), GFP_ATOMIC);
@@ -544,10 +556,13 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
(void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
pmc->sflist = newpsl;
pmc->sfmode = gsf->gf_fmode;
+ err = 0;
done:
read_unlock_bh(&idev->lock);
in6_dev_put(idev);
dev_put(dev);
+ if (leavegroup)
+ err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group);
return err;
}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 9dac7fdf4726..f6e288dc116e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2018,7 +2018,7 @@ static int tcp_v6_init_sock(struct sock *sk)
*/
tp->snd_ssthresh = 0x7fffffff;
tp->snd_cwnd_clamp = ~0;
- tp->mss_cache_std = tp->mss_cache = 536;
+ tp->mss_cache = 536;
tp->reordering = sysctl_tcp_reordering;
diff --git a/net/irda/irlap.c b/net/irda/irlap.c
index 046ad0750e48..7029618f5719 100644
--- a/net/irda/irlap.c
+++ b/net/irda/irlap.c
@@ -445,9 +445,8 @@ void irlap_disconnect_request(struct irlap_cb *self)
IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
/* Don't disconnect until all data frames are successfully sent */
- if (skb_queue_len(&self->txq) > 0) {
+ if (!skb_queue_empty(&self->txq)) {
self->disconnect_pending = TRUE;
-
return;
}
diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c
index 1cd89f5f3b75..a505b5457608 100644
--- a/net/irda/irlap_event.c
+++ b/net/irda/irlap_event.c
@@ -191,7 +191,7 @@ static void irlap_start_poll_timer(struct irlap_cb *self, int timeout)
* Send out the RR frames faster if our own transmit queue is empty, or
* if the peer is busy. The effect is a much faster conversation
*/
- if ((skb_queue_len(&self->txq) == 0) || (self->remote_busy)) {
+ if (skb_queue_empty(&self->txq) || self->remote_busy) {
if (self->fast_RR == TRUE) {
/*
* Assert that the fast poll timer has not reached the
@@ -263,7 +263,7 @@ void irlap_do_event(struct irlap_cb *self, IRLAP_EVENT event,
IRDA_DEBUG(2, "%s() : queue len = %d\n", __FUNCTION__,
skb_queue_len(&self->txq));
- if (skb_queue_len(&self->txq)) {
+ if (!skb_queue_empty(&self->txq)) {
/* Prevent race conditions with irlap_data_request() */
self->local_busy = TRUE;
@@ -1074,7 +1074,7 @@ static int irlap_state_xmit_p(struct irlap_cb *self, IRLAP_EVENT event,
#else /* CONFIG_IRDA_DYNAMIC_WINDOW */
/* Window has been adjusted for the max packet
* size, so much simpler... - Jean II */
- nextfit = (skb_queue_len(&self->txq) > 0);
+ nextfit = !skb_queue_empty(&self->txq);
#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
/*
* Send data with poll bit cleared only if window > 1
@@ -1814,7 +1814,7 @@ static int irlap_state_xmit_s(struct irlap_cb *self, IRLAP_EVENT event,
#else /* CONFIG_IRDA_DYNAMIC_WINDOW */
/* Window has been adjusted for the max packet
* size, so much simpler... - Jean II */
- nextfit = (skb_queue_len(&self->txq) > 0);
+ nextfit = !skb_queue_empty(&self->txq);
#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
/*
* Send data with final bit cleared only if window > 1
@@ -1937,7 +1937,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
irlap_data_indication(self, skb, FALSE);
/* Any pending data requests? */
- if ((skb_queue_len(&self->txq) > 0) &&
+ if (!skb_queue_empty(&self->txq) &&
(self->window > 0))
{
self->ack_required = TRUE;
@@ -2038,7 +2038,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
/*
* Any pending data requests?
*/
- if ((skb_queue_len(&self->txq) > 0) &&
+ if (!skb_queue_empty(&self->txq) &&
(self->window > 0) && !self->remote_busy)
{
irlap_data_indication(self, skb, TRUE);
@@ -2069,7 +2069,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
*/
nr_status = irlap_validate_nr_received(self, info->nr);
if (nr_status == NR_EXPECTED) {
- if ((skb_queue_len( &self->txq) > 0) &&
+ if (!skb_queue_empty(&self->txq) &&
(self->window > 0)) {
self->remote_busy = FALSE;
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
index 040abe714aa3..6dafbb43b529 100644
--- a/net/irda/irlap_frame.c
+++ b/net/irda/irlap_frame.c
@@ -1018,11 +1018,10 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
/*
* We can now fill the window with additional data frames
*/
- while (skb_queue_len( &self->txq) > 0) {
+ while (!skb_queue_empty(&self->txq)) {
IRDA_DEBUG(0, "%s(), sending additional frames!\n", __FUNCTION__);
- if ((skb_queue_len( &self->txq) > 0) &&
- (self->window > 0)) {
+ if (self->window > 0) {
skb = skb_dequeue( &self->txq);
IRDA_ASSERT(skb != NULL, return;);
@@ -1031,8 +1030,7 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
* bit cleared
*/
if ((self->window > 1) &&
- skb_queue_len(&self->txq) > 0)
- {
+ !skb_queue_empty(&self->txq)) {
irlap_send_data_primary(self, skb);
} else {
irlap_send_data_primary_poll(self, skb);
diff --git a/net/irda/irttp.c b/net/irda/irttp.c
index d091ccf773b3..6602d901f8b1 100644
--- a/net/irda/irttp.c
+++ b/net/irda/irttp.c
@@ -1513,7 +1513,7 @@ int irttp_disconnect_request(struct tsap_cb *self, struct sk_buff *userdata,
/*
* Check if there is still data segments in the transmit queue
*/
- if (skb_queue_len(&self->tx_queue) > 0) {
+ if (!skb_queue_empty(&self->tx_queue)) {
if (priority == P_HIGH) {
/*
* No need to send the queued data, if we are
diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c
index cd130c3b72bc..d5bdb53a348f 100644
--- a/net/llc/llc_c_ev.c
+++ b/net/llc/llc_c_ev.c
@@ -84,7 +84,7 @@ static u16 llc_util_nr_inside_tx_window(struct sock *sk, u8 nr)
if (llc->dev->flags & IFF_LOOPBACK)
goto out;
rc = 1;
- if (!skb_queue_len(&llc->pdu_unack_q))
+ if (skb_queue_empty(&llc->pdu_unack_q))
goto out;
skb = skb_peek(&llc->pdu_unack_q);
pdu = llc_pdu_sn_hdr(skb);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index fc456a7aaec3..3405fdf41b93 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -858,7 +858,7 @@ static inline void netlink_rcv_wake(struct sock *sk)
{
struct netlink_sock *nlk = nlk_sk(sk);
- if (!skb_queue_len(&sk->sk_receive_queue))
+ if (skb_queue_empty(&sk->sk_receive_queue))
clear_bit(0, &nlk->state);
if (!test_bit(0, &nlk->state))
wake_up_interruptible(&nlk->wait);
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8f58cecd6266..e48d0d456b3e 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -4,7 +4,7 @@
obj-y := sch_generic.o
-obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o
+obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o
obj-$(CONFIG_NET_CLS) += cls_api.o
obj-$(CONFIG_NET_CLS_ACT) += act_api.o
obj-$(CONFIG_NET_ACT_POLICE) += police.o
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 48bb23c2a35a..53d98f8d3d80 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -205,11 +205,6 @@ META_COLLECTOR(int_protocol)
dst->value = skb->protocol;
}
-META_COLLECTOR(int_security)
-{
- dst->value = skb->security;
-}
-
META_COLLECTOR(int_pkttype)
{
dst->value = skb->pkt_type;
@@ -524,7 +519,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
[META_ID(REALDEV)] = META_FUNC(int_realdev),
[META_ID(PRIORITY)] = META_FUNC(int_priority),
[META_ID(PROTOCOL)] = META_FUNC(int_protocol),
- [META_ID(SECURITY)] = META_FUNC(int_security),
[META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
[META_ID(PKTLEN)] = META_FUNC(int_pktlen),
[META_ID(DATALEN)] = META_FUNC(int_datalen),
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 05e6e0a799da..b9a069af4a02 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -399,10 +399,8 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
int err;
struct rtattr *kind = tca[TCA_KIND-1];
- void *p = NULL;
struct Qdisc *sch;
struct Qdisc_ops *ops;
- int size;
ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
@@ -437,64 +435,55 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
if (ops == NULL)
goto err_out;
- /* ensure that the Qdisc and the private data are 32-byte aligned */
- size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
- size += ops->priv_size + QDISC_ALIGN_CONST;
-
- p = kmalloc(size, GFP_KERNEL);
- err = -ENOBUFS;
- if (!p)
+ sch = qdisc_alloc(dev, ops);
+ if (IS_ERR(sch)) {
+ err = PTR_ERR(sch);
goto err_out2;
- memset(p, 0, size);
- sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
- & ~QDISC_ALIGN_CONST);
- sch->padded = (char *)sch - (char *)p;
-
- INIT_LIST_HEAD(&sch->list);
- skb_queue_head_init(&sch->q);
+ }
- if (handle == TC_H_INGRESS)
+ if (handle == TC_H_INGRESS) {
sch->flags |= TCQ_F_INGRESS;
-
- sch->ops = ops;
- sch->enqueue = ops->enqueue;
- sch->dequeue = ops->dequeue;
- sch->dev = dev;
- dev_hold(dev);
- atomic_set(&sch->refcnt, 1);
- sch->stats_lock = &dev->queue_lock;
- if (handle == 0) {
+ handle = TC_H_MAKE(TC_H_INGRESS, 0);
+ } else if (handle == 0) {
handle = qdisc_alloc_handle(dev);
err = -ENOMEM;
if (handle == 0)
goto err_out3;
}
- if (handle == TC_H_INGRESS)
- sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
- else
- sch->handle = handle;
+ sch->handle = handle;
if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
+#ifdef CONFIG_NET_ESTIMATOR
+ if (tca[TCA_RATE-1]) {
+ err = gen_new_estimator(&sch->bstats, &sch->rate_est,
+ sch->stats_lock,
+ tca[TCA_RATE-1]);
+ if (err) {
+ /*
+ * Any broken qdiscs that would require
+ * a ops->reset() here? The qdisc was never
+ * in action so it shouldn't be necessary.
+ */
+ if (ops->destroy)
+ ops->destroy(sch);
+ goto err_out3;
+ }
+ }
+#endif
qdisc_lock_tree(dev);
list_add_tail(&sch->list, &dev->qdisc_list);
qdisc_unlock_tree(dev);
-#ifdef CONFIG_NET_ESTIMATOR
- if (tca[TCA_RATE-1])
- gen_new_estimator(&sch->bstats, &sch->rate_est,
- sch->stats_lock, tca[TCA_RATE-1]);
-#endif
return sch;
}
err_out3:
dev_put(dev);
+ kfree((char *) sch - sch->padded);
err_out2:
module_put(ops->owner);
err_out:
*errp = err;
- if (p)
- kfree(p);
return NULL;
}
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
new file mode 100644
index 000000000000..81f0b8346d17
--- /dev/null
+++ b/net/sched/sch_blackhole.c
@@ -0,0 +1,54 @@
+/*
+ * net/sched/sch_blackhole.c Black hole queue
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ *
+ * Note: Quantum tunneling is not supported.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ qdisc_drop(skb, sch);
+ return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *blackhole_dequeue(struct Qdisc *sch)
+{
+ return NULL;
+}
+
+static struct Qdisc_ops blackhole_qdisc_ops = {
+ .id = "blackhole",
+ .priv_size = 0,
+ .enqueue = blackhole_enqueue,
+ .dequeue = blackhole_dequeue,
+ .owner = THIS_MODULE,
+};
+
+static int __init blackhole_module_init(void)
+{
+ return register_qdisc(&blackhole_qdisc_ops);
+}
+
+static void __exit blackhole_module_exit(void)
+{
+ unregister_qdisc(&blackhole_qdisc_ops);
+}
+
+module_init(blackhole_module_init)
+module_exit(blackhole_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 7683b34dc6a9..73e218e646ac 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -395,24 +395,23 @@ static struct Qdisc_ops pfifo_fast_ops = {
.owner = THIS_MODULE,
};
-struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
+struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
void *p;
struct Qdisc *sch;
- int size;
+ unsigned int size;
+ int err = -ENOBUFS;
/* ensure that the Qdisc and the private data are 32-byte aligned */
- size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
- size += ops->priv_size + QDISC_ALIGN_CONST;
+ size = QDISC_ALIGN(sizeof(*sch));
+ size += ops->priv_size + (QDISC_ALIGNTO - 1);
p = kmalloc(size, GFP_KERNEL);
if (!p)
- return NULL;
+ goto errout;
memset(p, 0, size);
-
- sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
- & ~QDISC_ALIGN_CONST);
- sch->padded = (char *)sch - (char *)p;
+ sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
+ sch->padded = (char *) sch - (char *) p;
INIT_LIST_HEAD(&sch->list);
skb_queue_head_init(&sch->q);
@@ -423,11 +422,24 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
dev_hold(dev);
sch->stats_lock = &dev->queue_lock;
atomic_set(&sch->refcnt, 1);
+
+ return sch;
+errout:
+ return ERR_PTR(-err);
+}
+
+struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
+{
+ struct Qdisc *sch;
+
+ sch = qdisc_alloc(dev, ops);
+ if (IS_ERR(sch))
+ goto errout;
+
if (!ops->init || ops->init(sch, NULL) == 0)
return sch;
- dev_put(dev);
- kfree(p);
+errout:
return NULL;
}
@@ -591,6 +603,7 @@ EXPORT_SYMBOL(__netdev_watchdog_up);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
+EXPORT_SYMBOL(qdisc_alloc);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_restart);
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 664d0e47374f..7845d045eec4 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -385,7 +385,7 @@ static int red_change(struct Qdisc *sch, struct rtattr *opt)
memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
q->qcount = -1;
- if (skb_queue_len(&sch->q) == 0)
+ if (skb_queue_empty(&sch->q))
PSCHED_SET_PASTPERFECT(q->qidlestart);
sch_tree_unlock(sch);
return 0;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 7ae6aa772dab..4b47dd6f2485 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -203,7 +203,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
*/
asoc->addip_serial = asoc->c.initial_tsn;
- skb_queue_head_init(&asoc->addip_chunks);
+ INIT_LIST_HEAD(&asoc->addip_chunk_list);
/* Make an empty list of remote transport addresses. */
INIT_LIST_HEAD(&asoc->peer.transport_addr_list);
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 339f7acfdb64..5e085e041a6e 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -115,6 +115,17 @@ static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk)
atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc);
}
+struct sctp_input_cb {
+ union {
+ struct inet_skb_parm h4;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ struct inet6_skb_parm h6;
+#endif
+ } header;
+ struct sctp_chunk *chunk;
+};
+#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0]))
+
/*
* This is the routine which IP calls when receiving an SCTP packet.
*/
@@ -243,6 +254,7 @@ int sctp_rcv(struct sk_buff *skb)
ret = -ENOMEM;
goto discard_release;
}
+ SCTP_INPUT_CB(skb)->chunk = chunk;
sctp_rcv_set_owner_r(skb,sk);
@@ -265,9 +277,9 @@ int sctp_rcv(struct sk_buff *skb)
sctp_bh_lock_sock(sk);
if (sock_owned_by_user(sk))
- sk_add_backlog(sk, (struct sk_buff *) chunk);
+ sk_add_backlog(sk, skb);
else
- sctp_backlog_rcv(sk, (struct sk_buff *) chunk);
+ sctp_backlog_rcv(sk, skb);
/* Release the sock and any reference counts we took in the
* lookup calls.
@@ -302,14 +314,8 @@ discard_release:
*/
int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
- struct sctp_chunk *chunk;
- struct sctp_inq *inqueue;
-
- /* One day chunk will live inside the skb, but for
- * now this works.
- */
- chunk = (struct sctp_chunk *) skb;
- inqueue = &chunk->rcvr->inqueue;
+ struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
+ struct sctp_inq *inqueue = &chunk->rcvr->inqueue;
sctp_inq_push(inqueue, chunk);
return 0;
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index cedf4351556c..2d33922c044b 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -50,7 +50,7 @@
/* Initialize an SCTP inqueue. */
void sctp_inq_init(struct sctp_inq *queue)
{
- skb_queue_head_init(&queue->in);
+ INIT_LIST_HEAD(&queue->in_chunk_list);
queue->in_progress = NULL;
/* Create a task for delivering data. */
@@ -62,11 +62,13 @@ void sctp_inq_init(struct sctp_inq *queue)
/* Release the memory associated with an SCTP inqueue. */
void sctp_inq_free(struct sctp_inq *queue)
{
- struct sctp_chunk *chunk;
+ struct sctp_chunk *chunk, *tmp;
/* Empty the queue. */
- while ((chunk = (struct sctp_chunk *) skb_dequeue(&queue->in)) != NULL)
+ list_for_each_entry_safe(chunk, tmp, &queue->in_chunk_list, list) {
+ list_del_init(&chunk->list);
sctp_chunk_free(chunk);
+ }
/* If there is a packet which is currently being worked on,
* free it as well.
@@ -92,7 +94,7 @@ void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *packet)
* Eventually, we should clean up inqueue to not rely
* on the BH related data structures.
*/
- skb_queue_tail(&(q->in), (struct sk_buff *) packet);
+ list_add_tail(&packet->list, &q->in_chunk_list);
q->immediate.func(q->immediate.data);
}
@@ -131,12 +133,16 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue)
/* Do we need to take the next packet out of the queue to process? */
if (!chunk) {
+ struct list_head *entry;
+
/* Is the queue empty? */
- if (skb_queue_empty(&queue->in))
+ if (list_empty(&queue->in_chunk_list))
return NULL;
+ entry = queue->in_chunk_list.next;
chunk = queue->in_progress =
- (struct sctp_chunk *) skb_dequeue(&queue->in);
+ list_entry(entry, struct sctp_chunk, list);
+ list_del_init(entry);
/* This is the first chunk in the packet. */
chunk->singleton = 1;
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 84b5b370b09d..931371633464 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -108,7 +108,7 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
packet->transport = transport;
packet->source_port = sport;
packet->destination_port = dport;
- skb_queue_head_init(&packet->chunks);
+ INIT_LIST_HEAD(&packet->chunk_list);
if (asoc) {
struct sctp_sock *sp = sctp_sk(asoc->base.sk);
overhead = sp->pf->af->net_header_len;
@@ -129,12 +129,14 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
/* Free a packet. */
void sctp_packet_free(struct sctp_packet *packet)
{
- struct sctp_chunk *chunk;
+ struct sctp_chunk *chunk, *tmp;
SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet);
- while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL)
+ list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
+ list_del_init(&chunk->list);
sctp_chunk_free(chunk);
+ }
if (packet->malloced)
kfree(packet);
@@ -276,7 +278,7 @@ append:
packet->has_sack = 1;
/* It is OK to send this chunk. */
- __skb_queue_tail(&packet->chunks, (struct sk_buff *)chunk);
+ list_add_tail(&chunk->list, &packet->chunk_list);
packet->size += chunk_len;
chunk->transport = packet->transport;
finish:
@@ -295,7 +297,7 @@ int sctp_packet_transmit(struct sctp_packet *packet)
struct sctphdr *sh;
__u32 crc32;
struct sk_buff *nskb;
- struct sctp_chunk *chunk;
+ struct sctp_chunk *chunk, *tmp;
struct sock *sk;
int err = 0;
int padding; /* How much padding do we need? */
@@ -305,11 +307,11 @@ int sctp_packet_transmit(struct sctp_packet *packet)
SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet);
/* Do NOT generate a chunkless packet. */
- chunk = (struct sctp_chunk *)skb_peek(&packet->chunks);
- if (unlikely(!chunk))
+ if (list_empty(&packet->chunk_list))
return err;
/* Set up convenience variables... */
+ chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
sk = chunk->skb->sk;
/* Allocate the new skb. */
@@ -370,7 +372,8 @@ int sctp_packet_transmit(struct sctp_packet *packet)
* [This whole comment explains WORD_ROUND() below.]
*/
SCTP_DEBUG_PRINTK("***sctp_transmit_packet***\n");
- while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) {
+ list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
+ list_del_init(&chunk->list);
if (sctp_chunk_is_data(chunk)) {
if (!chunk->has_tsn) {
@@ -511,7 +514,8 @@ err:
* will get resent or dropped later.
*/
- while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) {
+ list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
+ list_del_init(&chunk->list);
if (!sctp_chunk_is_data(chunk))
sctp_chunk_free(chunk);
}
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 4eb81a1407b7..efb72faba20c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -75,7 +75,7 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
static inline void sctp_outq_head_data(struct sctp_outq *q,
struct sctp_chunk *ch)
{
- __skb_queue_head(&q->out, (struct sk_buff *)ch);
+ list_add(&ch->list, &q->out_chunk_list);
q->out_qlen += ch->skb->len;
return;
}
@@ -83,17 +83,22 @@ static inline void sctp_outq_head_data(struct sctp_outq *q,
/* Take data from the front of the queue. */
static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q)
{
- struct sctp_chunk *ch;
- ch = (struct sctp_chunk *)__skb_dequeue(&q->out);
- if (ch)
+ struct sctp_chunk *ch = NULL;
+
+ if (!list_empty(&q->out_chunk_list)) {
+ struct list_head *entry = q->out_chunk_list.next;
+
+ ch = list_entry(entry, struct sctp_chunk, list);
+ list_del_init(entry);
q->out_qlen -= ch->skb->len;
+ }
return ch;
}
/* Add data chunk to the end of the queue. */
static inline void sctp_outq_tail_data(struct sctp_outq *q,
struct sctp_chunk *ch)
{
- __skb_queue_tail(&q->out, (struct sk_buff *)ch);
+ list_add_tail(&ch->list, &q->out_chunk_list);
q->out_qlen += ch->skb->len;
return;
}
@@ -197,8 +202,8 @@ static inline int sctp_cacc_skip(struct sctp_transport *primary,
void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q)
{
q->asoc = asoc;
- skb_queue_head_init(&q->out);
- skb_queue_head_init(&q->control);
+ INIT_LIST_HEAD(&q->out_chunk_list);
+ INIT_LIST_HEAD(&q->control_chunk_list);
INIT_LIST_HEAD(&q->retransmit);
INIT_LIST_HEAD(&q->sacked);
INIT_LIST_HEAD(&q->abandoned);
@@ -217,7 +222,7 @@ void sctp_outq_teardown(struct sctp_outq *q)
{
struct sctp_transport *transport;
struct list_head *lchunk, *pos, *temp;
- struct sctp_chunk *chunk;
+ struct sctp_chunk *chunk, *tmp;
/* Throw away unacknowledged chunks. */
list_for_each(pos, &q->asoc->peer.transport_addr_list) {
@@ -269,8 +274,10 @@ void sctp_outq_teardown(struct sctp_outq *q)
q->error = 0;
/* Throw away any leftover control chunks. */
- while ((chunk = (struct sctp_chunk *) skb_dequeue(&q->control)) != NULL)
+ list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
+ list_del_init(&chunk->list);
sctp_chunk_free(chunk);
+ }
}
/* Free the outqueue structure and any related pending chunks. */
@@ -333,7 +340,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
break;
};
} else {
- __skb_queue_tail(&q->control, (struct sk_buff *) chunk);
+ list_add_tail(&chunk->list, &q->control_chunk_list);
SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
}
@@ -650,10 +657,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
__u16 sport = asoc->base.bind_addr.port;
__u16 dport = asoc->peer.port;
__u32 vtag = asoc->peer.i.init_tag;
- struct sk_buff_head *queue;
struct sctp_transport *transport = NULL;
struct sctp_transport *new_transport;
- struct sctp_chunk *chunk;
+ struct sctp_chunk *chunk, *tmp;
sctp_xmit_t status;
int error = 0;
int start_timer = 0;
@@ -675,8 +681,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
* ...
*/
- queue = &q->control;
- while ((chunk = (struct sctp_chunk *)skb_dequeue(queue)) != NULL) {
+ list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
+ list_del_init(&chunk->list);
+
/* Pick the right transport to use. */
new_transport = chunk->transport;
@@ -814,8 +821,6 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
/* Finally, transmit new packets. */
start_timer = 0;
- queue = &q->out;
-
while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
/* RFC 2960 6.5 Every DATA chunk MUST carry a valid
* stream identifier.
@@ -1149,8 +1154,9 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack)
/* See if all chunks are acked.
* Make sure the empty queue handler will get run later.
*/
- q->empty = skb_queue_empty(&q->out) && skb_queue_empty(&q->control) &&
- list_empty(&q->retransmit);
+ q->empty = (list_empty(&q->out_chunk_list) &&
+ list_empty(&q->control_chunk_list) &&
+ list_empty(&q->retransmit));
if (!q->empty)
goto finish;
@@ -1679,9 +1685,9 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
if (TSN_lte(tsn, ctsn)) {
list_del_init(lchunk);
if (!chunk->tsn_gap_acked) {
- chunk->transport->flight_size -=
- sctp_data_size(chunk);
- q->outstanding_bytes -= sctp_data_size(chunk);
+ chunk->transport->flight_size -=
+ sctp_data_size(chunk);
+ q->outstanding_bytes -= sctp_data_size(chunk);
}
sctp_chunk_free(chunk);
} else {
@@ -1729,7 +1735,7 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
nskips, &ftsn_skip_arr[0]);
if (ftsn_chunk) {
- __skb_queue_tail(&q->control, (struct sk_buff *)ftsn_chunk);
+ list_add_tail(&ftsn_chunk->list, &q->control_chunk_list);
SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
}
}
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 5baed9bb7de5..773cd93fa3d0 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1003,6 +1003,7 @@ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb,
SCTP_DEBUG_PRINTK("chunkifying skb %p w/o an sk\n", skb);
}
+ INIT_LIST_HEAD(&retval->list);
retval->skb = skb;
retval->asoc = (struct sctp_association *)asoc;
retval->resent = 0;
@@ -1116,8 +1117,7 @@ static void sctp_chunk_destroy(struct sctp_chunk *chunk)
/* Possibly, free the chunk. */
void sctp_chunk_free(struct sctp_chunk *chunk)
{
- /* Make sure that we are not on any list. */
- skb_unlink((struct sk_buff *) chunk);
+ BUG_ON(!list_empty(&chunk->list));
list_del_init(&chunk->transmitted_list);
/* Release our reference on the message tracker. */
@@ -2739,8 +2739,12 @@ int sctp_process_asconf_ack(struct sctp_association *asoc,
asoc->addip_last_asconf = NULL;
/* Send the next asconf chunk from the addip chunk queue. */
- asconf = (struct sctp_chunk *)__skb_dequeue(&asoc->addip_chunks);
- if (asconf) {
+ if (!list_empty(&asoc->addip_chunk_list)) {
+ struct list_head *entry = asoc->addip_chunk_list.next;
+ asconf = list_entry(entry, struct sctp_chunk, list);
+
+ list_del_init(entry);
+
/* Hold the chunk until an ASCONF_ACK is received. */
sctp_chunk_hold(asconf);
if (sctp_primitive_ASCONF(asoc, asconf))
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index aad55dc3792b..091a66f06a35 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -406,7 +406,7 @@ static int sctp_send_asconf(struct sctp_association *asoc,
* transmission.
*/
if (asoc->addip_last_asconf) {
- __skb_queue_tail(&asoc->addip_chunks, (struct sk_buff *)chunk);
+ list_add_tail(&chunk->list, &asoc->addip_chunk_list);
goto out;
}
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 269f217918a3..3c654e06b084 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -145,8 +145,6 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) {
if (task == xprt->snd_task)
return 1;
- if (task == NULL)
- return 0;
goto out_sleep;
}
if (xprt->nocong || __xprt_get_cong(xprt, task)) {
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c420eba4876b..d403e34088ad 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -302,7 +302,7 @@ static void unix_write_space(struct sock *sk)
* may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
- if (skb_queue_len(&sk->sk_receive_queue)) {
+ if (!skb_queue_empty(&sk->sk_receive_queue)) {
skb_queue_purge(&sk->sk_receive_queue);
wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
@@ -1619,7 +1619,7 @@ static long unix_stream_data_wait(struct sock * sk, long timeo)
for (;;) {
prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
- if (skb_queue_len(&sk->sk_receive_queue) ||
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
sk->sk_err ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
signal_pending(current) ||