From fc6415bcb0f58f03adb910e56d7e1df6368794e0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:17:45 -0700 Subject: [TCP]: Fix quick-ack decrementing with TSO. On each packet output, we call tcp_dec_quickack_mode() if the ACK flag is set. It drops tp->ack.quick until it hits zero, at which time we deflate the ATO value. When doing TSO, we are emitting multiple packets with ACK set, so we should decrement tp->ack.quick that many segments. Note that, unlike this case, tcp_enter_cwr() should not take the tcp_skb_pcount(skb) into consideration. That function, one time, readjusts tp->snd_cwnd and moves into TCP_CA_CWR state. Signed-off-by: David S. Miller --- include/net/tcp.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index ec9e20c27179..afe41c5de2f2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -721,11 +721,16 @@ static inline int tcp_ack_scheduled(struct tcp_sock *tp) return tp->ack.pending&TCP_ACK_SCHED; } -static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp) +static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp, unsigned int pkts) { - if (tp->ack.quick && --tp->ack.quick == 0) { - /* Leaving quickack mode we deflate ATO. */ - tp->ack.ato = TCP_ATO_MIN; + if (tp->ack.quick) { + if (pkts >= tp->ack.quick) { + tp->ack.quick = 0; + + /* Leaving quickack mode we deflate ATO. */ + tp->ack.ato = TCP_ATO_MIN; + } else + tp->ack.quick -= pkts; } } -- cgit v1.2.3 From f6302d1d78f77c2d4c8bd32b0afc2df7fdf5f281 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:18:03 -0700 Subject: [TCP]: Move send test logic out of net/tcp.h This just moves the code into tcp_output.c, no code logic changes are made by this patch. Using this as a baseline, we can begin to untangle the mess of comparisons for the Nagle test et al. We will also be able to reduce all of the redundant computation that occurs when outputting data packets. Signed-off-by: David S. Miller --- include/net/tcp.h | 113 +------------------------------------ net/ipv4/tcp_output.c | 150 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 132 insertions(+), 131 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index afe41c5de2f2..f2b104532de1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -849,6 +849,9 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, /* tcp_output.c */ extern int tcp_write_xmit(struct sock *, int nonagle); +extern void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, + unsigned cur_mss, int nonagle); +extern int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp); extern int tcp_retransmit_skb(struct sock *, struct sk_buff *); extern void tcp_xmit_retransmit_queue(struct sock *); extern void tcp_simple_retransmit(struct sock *); @@ -1284,12 +1287,6 @@ static __inline__ __u32 tcp_max_burst(const struct tcp_sock *tp) return 3; } -static __inline__ int tcp_minshall_check(const struct tcp_sock *tp) -{ - return after(tp->snd_sml,tp->snd_una) && - !after(tp->snd_sml, tp->snd_nxt); -} - static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, const struct sk_buff *skb) { @@ -1297,122 +1294,18 @@ static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, tp->snd_sml = TCP_SKB_CB(skb)->end_seq; } -/* Return 0, if packet can be sent now without violation Nagle's rules: - 1. It is full sized. - 2. Or it contains FIN. - 3. Or TCP_NODELAY was set. - 4. Or TCP_CORK is not set, and all sent packets are ACKed. - With Minshall's modification: all sent small packets are ACKed. - */ - -static __inline__ int -tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb, - unsigned mss_now, int nonagle) -{ - return (skb->len < mss_now && - !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && - ((nonagle&TCP_NAGLE_CORK) || - (!nonagle && - tp->packets_out && - tcp_minshall_check(tp)))); -} - -extern void tcp_set_skb_tso_segs(struct sock *, struct sk_buff *); - -/* This checks if the data bearing packet SKB (usually sk->sk_send_head) - * should be put on the wire right now. - */ -static __inline__ int tcp_snd_test(struct sock *sk, - struct sk_buff *skb, - unsigned cur_mss, int nonagle) -{ - struct tcp_sock *tp = tcp_sk(sk); - int pkts = tcp_skb_pcount(skb); - - if (!pkts) { - tcp_set_skb_tso_segs(sk, skb); - pkts = tcp_skb_pcount(skb); - } - - /* RFC 1122 - section 4.2.3.4 - * - * We must queue if - * - * a) The right edge of this frame exceeds the window - * b) There are packets in flight and we have a small segment - * [SWS avoidance and Nagle algorithm] - * (part of SWS is done on packetization) - * Minshall version sounds: there are no _small_ - * segments in flight. (tcp_nagle_check) - * c) We have too many packets 'in flight' - * - * Don't use the nagle rule for urgent data (or - * for the final FIN -DaveM). - * - * Also, Nagle rule does not apply to frames, which - * sit in the middle of queue (they have no chances - * to get new data) and if room at tail of skb is - * not enough to save something seriously (<32 for now). - */ - - /* Don't be strict about the congestion window for the - * final FIN frame. -DaveM - */ - return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode - || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) && - (((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) || - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) && - !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd)); -} - static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp) { if (!tp->packets_out && !tp->pending) tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto); } -static __inline__ int tcp_skb_is_last(const struct sock *sk, - const struct sk_buff *skb) -{ - return skb->next == (struct sk_buff *)&sk->sk_write_queue; -} - -/* Push out any pending frames which were held back due to - * TCP_CORK or attempt at coalescing tiny packets. - * The socket must be locked by the caller. - */ -static __inline__ void __tcp_push_pending_frames(struct sock *sk, - struct tcp_sock *tp, - unsigned cur_mss, - int nonagle) -{ - struct sk_buff *skb = sk->sk_send_head; - - if (skb) { - if (!tcp_skb_is_last(sk, skb)) - nonagle = TCP_NAGLE_PUSH; - if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || - tcp_write_xmit(sk, nonagle)) - tcp_check_probe_timer(sk, tp); - } - tcp_cwnd_validate(sk, tp); -} - static __inline__ void tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp) { __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle); } -static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) -{ - struct sk_buff *skb = sk->sk_send_head; - - return (skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), - tcp_skb_is_last(sk, skb) ? TCP_NAGLE_PUSH : tp->nonagle)); -} - static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq) { tp->snd_wl1 = seq; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 389deeb2a457..2cbe879ee16a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -413,6 +413,135 @@ static inline void tcp_tso_set_push(struct sk_buff *skb) TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; } +static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (skb->len <= tp->mss_cache_std || + !(sk->sk_route_caps & NETIF_F_TSO)) { + /* Avoid the costly divide in the normal + * non-TSO case. + */ + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; + } else { + unsigned int factor; + + factor = skb->len + (tp->mss_cache_std - 1); + factor /= tp->mss_cache_std; + skb_shinfo(skb)->tso_segs = factor; + skb_shinfo(skb)->tso_size = tp->mss_cache_std; + } +} + +static inline int tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml,tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Return 0, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. + * 2. Or it contains FIN. + * 3. Or TCP_NODELAY was set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. + */ + +static inline int tcp_nagle_check(const struct tcp_sock *tp, + const struct sk_buff *skb, + unsigned mss_now, int nonagle) +{ + return (skb->len < mss_now && + !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + ((nonagle&TCP_NAGLE_CORK) || + (!nonagle && + tp->packets_out && + tcp_minshall_check(tp)))); +} + +/* This checks if the data bearing packet SKB (usually sk->sk_send_head) + * should be put on the wire right now. + */ +static int tcp_snd_test(struct sock *sk, struct sk_buff *skb, + unsigned cur_mss, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + int pkts = tcp_skb_pcount(skb); + + if (!pkts) { + tcp_set_skb_tso_segs(sk, skb); + pkts = tcp_skb_pcount(skb); + } + + /* RFC 1122 - section 4.2.3.4 + * + * We must queue if + * + * a) The right edge of this frame exceeds the window + * b) There are packets in flight and we have a small segment + * [SWS avoidance and Nagle algorithm] + * (part of SWS is done on packetization) + * Minshall version sounds: there are no _small_ + * segments in flight. (tcp_nagle_check) + * c) We have too many packets 'in flight' + * + * Don't use the nagle rule for urgent data (or + * for the final FIN -DaveM). + * + * Also, Nagle rule does not apply to frames, which + * sit in the middle of queue (they have no chances + * to get new data) and if room at tail of skb is + * not enough to save something seriously (<32 for now). + */ + + /* Don't be strict about the congestion window for the + * final FIN frame. -DaveM + */ + return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode + || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) && + (((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) || + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) && + !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd)); +} + +static inline int tcp_skb_is_last(const struct sock *sk, + const struct sk_buff *skb) +{ + return skb->next == (struct sk_buff *)&sk->sk_write_queue; +} + +/* Push out any pending frames which were held back due to + * TCP_CORK or attempt at coalescing tiny packets. + * The socket must be locked by the caller. + */ +void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, + unsigned cur_mss, int nonagle) +{ + struct sk_buff *skb = sk->sk_send_head; + + if (skb) { + if (!tcp_skb_is_last(sk, skb)) + nonagle = TCP_NAGLE_PUSH; + if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || + tcp_write_xmit(sk, nonagle)) + tcp_check_probe_timer(sk, tp); + } + tcp_cwnd_validate(sk, tp); +} + +int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) +{ + struct sk_buff *skb = sk->sk_send_head; + + return (skb && + tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), + (tcp_skb_is_last(sk, skb) ? + TCP_NAGLE_PUSH : + tp->nonagle))); +} + + /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. */ @@ -434,27 +563,6 @@ void tcp_push_one(struct sock *sk, unsigned cur_mss) } } -void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (skb->len <= tp->mss_cache_std || - !(sk->sk_route_caps & NETIF_F_TSO)) { - /* Avoid the costly divide in the normal - * non-TSO case. - */ - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; - } else { - unsigned int factor; - - factor = skb->len + (tp->mss_cache_std - 1); - factor /= tp->mss_cache_std; - skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = tp->mss_cache_std; - } -} - /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. -- cgit v1.2.3 From 84d3e7b9573291a1ea845bdd51b74bb484597661 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:18:18 -0700 Subject: [TCP]: Move __tcp_data_snd_check into tcp_output.c It reimplements portions of tcp_snd_check(), so it we move it to tcp_output.c we can consolidate it's logic much easier in a later change. Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp_input.c | 10 ---------- net/ipv4/tcp_output.c | 10 ++++++++++ 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index f2b104532de1..4888f9d3f56b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -849,6 +849,7 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, /* tcp_output.c */ extern int tcp_write_xmit(struct sock *, int nonagle); +extern void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb); extern void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, unsigned cur_mss, int nonagle); extern int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7bbbbc33eb4b..577424323d59 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3346,16 +3346,6 @@ static inline void tcp_check_space(struct sock *sk) } } -static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || - tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tp->nonagle)) - tcp_check_probe_timer(sk, tp); -} - static __inline__ void tcp_data_snd_check(struct sock *sk) { struct sk_buff *skb = sk->sk_send_head; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2cbe879ee16a..362b811a2460 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -530,6 +530,16 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, tcp_cwnd_validate(sk, tp); } +void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || + tcp_packets_in_flight(tp) >= tp->snd_cwnd || + tcp_write_xmit(sk, tp->nonagle)) + tcp_check_probe_timer(sk, tp); +} + int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) { struct sk_buff *skb = sk->sk_send_head; -- cgit v1.2.3 From a762a9800752f05fa8768bb0ac35d0e7f1bcfe7f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:18:51 -0700 Subject: [TCP]: Kill extra cwnd validate in __tcp_push_pending_frames(). The tcp_cwnd_validate() function should only be invoked if we actually send some frames, yet __tcp_push_pending_frames() will always invoke it. tcp_write_xmit() does the call for us, so the call here can simply be removed. Also, tcp_write_xmit() can be marked static. Signed-off-by: David S. Miller --- include/net/tcp.h | 26 ++--------------- net/ipv4/tcp_output.c | 79 ++++++++++++++++++++++++++++++++------------------- 2 files changed, 52 insertions(+), 53 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 4888f9d3f56b..f32e7aed2c75 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -848,7 +848,6 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, /* tcp_output.c */ -extern int tcp_write_xmit(struct sock *, int nonagle); extern void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb); extern void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, unsigned cur_mss, int nonagle); @@ -868,6 +867,9 @@ extern void tcp_push_one(struct sock *, unsigned mss_now); extern void tcp_send_ack(struct sock *sk); extern void tcp_send_delayed_ack(struct sock *sk); +/* tcp_input.c */ +extern void tcp_cwnd_application_limited(struct sock *sk); + /* tcp_timer.c */ extern void tcp_init_xmit_timers(struct sock *); extern void tcp_clear_xmit_timers(struct sock *); @@ -1234,28 +1236,6 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp) tp->left_out = tp->sacked_out + tp->lost_out; } -extern void tcp_cwnd_application_limited(struct sock *sk); - -/* Congestion window validation. (RFC2861) */ - -static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) -{ - __u32 packets_out = tp->packets_out; - - if (packets_out >= tp->snd_cwnd) { - /* Network is feed fully. */ - tp->snd_cwnd_used = 0; - tp->snd_cwnd_stamp = tcp_time_stamp; - } else { - /* Network starves. */ - if (tp->packets_out > tp->snd_cwnd_used) - tp->snd_cwnd_used = tp->packets_out; - - if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) - tcp_cwnd_application_limited(sk); - } -} - /* Set slow start threshould and cwnd not falling to slow start */ static inline void __tcp_enter_cwr(struct tcp_sock *tp) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5e63ed09658d..a6375ca2a59e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -511,35 +511,6 @@ static inline int tcp_skb_is_last(const struct sock *sk, return skb->next == (struct sk_buff *)&sk->sk_write_queue; } -/* Push out any pending frames which were held back due to - * TCP_CORK or attempt at coalescing tiny packets. - * The socket must be locked by the caller. - */ -void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, - unsigned cur_mss, int nonagle) -{ - struct sk_buff *skb = sk->sk_send_head; - - if (skb) { - if (!tcp_skb_is_last(sk, skb)) - nonagle = TCP_NAGLE_PUSH; - if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || - tcp_write_xmit(sk, nonagle)) - tcp_check_probe_timer(sk, tp); - } - tcp_cwnd_validate(sk, tp); -} - -void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || - tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tp->nonagle)) - tcp_check_probe_timer(sk, tp); -} - int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) { struct sk_buff *skb = sk->sk_send_head; @@ -841,6 +812,26 @@ unsigned int tcp_current_mss(struct sock *sk, int large) return mss_now; } +/* Congestion window validation. (RFC2861) */ + +static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) +{ + __u32 packets_out = tp->packets_out; + + if (packets_out >= tp->snd_cwnd) { + /* Network is feed fully. */ + tp->snd_cwnd_used = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + } else { + /* Network starves. */ + if (tp->packets_out > tp->snd_cwnd_used) + tp->snd_cwnd_used = tp->packets_out; + + if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) + tcp_cwnd_application_limited(sk); + } +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -848,7 +839,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large) * Returns 1, if no segments are in flight and we have queued segments, but * cannot send anything now because of SWS or another problem. */ -int tcp_write_xmit(struct sock *sk, int nonagle) +static int tcp_write_xmit(struct sock *sk, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); unsigned int mss_now; @@ -901,6 +892,34 @@ int tcp_write_xmit(struct sock *sk, int nonagle) return 0; } +/* Push out any pending frames which were held back due to + * TCP_CORK or attempt at coalescing tiny packets. + * The socket must be locked by the caller. + */ +void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, + unsigned cur_mss, int nonagle) +{ + struct sk_buff *skb = sk->sk_send_head; + + if (skb) { + if (!tcp_skb_is_last(sk, skb)) + nonagle = TCP_NAGLE_PUSH; + if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || + tcp_write_xmit(sk, nonagle)) + tcp_check_probe_timer(sk, tp); + } +} + +void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || + tcp_packets_in_flight(tp) >= tp->snd_cwnd || + tcp_write_xmit(sk, tp->nonagle)) + tcp_check_probe_timer(sk, tp); +} + /* This function returns the amount that we can raise the * usable window based on the following constraints * -- cgit v1.2.3 From a2e2a59c93cc8ba39caa9011c2573f429e40ccd9 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:19:23 -0700 Subject: [TCP]: Fix redundant calculations of tcp_current_mss() tcp_write_xmit() uses tcp_current_mss(), but some of it's callers, namely __tcp_push_pending_frames(), already has this value available already. While we're here, fix the "cur_mss" argument to be "unsigned int" instead of plain "unsigned". Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp_output.c | 16 ++++------------ 2 files changed, 5 insertions(+), 13 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index f32e7aed2c75..9416236cc395 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -850,7 +850,7 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, extern void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb); extern void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, - unsigned cur_mss, int nonagle); + unsigned int cur_mss, int nonagle); extern int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp); extern int tcp_retransmit_skb(struct sock *, struct sk_buff *); extern void tcp_xmit_retransmit_queue(struct sock *); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2a8409c3af1a..e292e11c7319 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -839,11 +839,10 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) * Returns 1, if no segments are in flight and we have queued segments, but * cannot send anything now because of SWS or another problem. */ -static int tcp_write_xmit(struct sock *sk, int nonagle) +static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int mss_now; int sent_pkts; /* If we are closed, the bytes will have to remain here. @@ -853,13 +852,6 @@ static int tcp_write_xmit(struct sock *sk, int nonagle) if (unlikely(sk->sk_state == TCP_CLOSE)) return 0; - - /* Account for SACKS, we may need to fragment due to this. - * It is just like the real MSS changing on us midstream. - * We also handle things correctly when the user adds some - * IP options mid-stream. Silly to do, but cover it. - */ - mss_now = tcp_current_mss(sk, 1); sent_pkts = 0; while ((skb = sk->sk_send_head) && tcp_snd_test(sk, skb, mss_now, @@ -897,7 +889,7 @@ static int tcp_write_xmit(struct sock *sk, int nonagle) * The socket must be locked by the caller. */ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, - unsigned cur_mss, int nonagle) + unsigned int cur_mss, int nonagle) { struct sk_buff *skb = sk->sk_send_head; @@ -905,7 +897,7 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, if (!tcp_skb_is_last(sk, skb)) nonagle = TCP_NAGLE_PUSH; if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || - tcp_write_xmit(sk, nonagle)) + tcp_write_xmit(sk, cur_mss, nonagle)) tcp_check_probe_timer(sk, tp); } } @@ -916,7 +908,7 @@ void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tp->nonagle)) + tcp_write_xmit(sk, tcp_current_mss(sk, 1), tp->nonagle)) tcp_check_probe_timer(sk, tp); } -- cgit v1.2.3 From 55c97f3e990c1ff63957c64f6cb10711a09fd70e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:19:38 -0700 Subject: [TCP]: Fix __tcp_push_pending_frames() 'nonagle' handling. 'nonagle' should be passed to the tcp_snd_test() function as 'TCP_NAGLE_PUSH' if we are checking an SKB not at the tail of the write_queue. This is because Nagle does not apply to such frames since we cannot possibly tack more data onto them. However, while doing this __tcp_push_pending_frames() makes all of the packets in the write_queue use this modified 'nonagle' value. Fix the bug and simplify this function by just calling tcp_write_xmit() directly if sk_send_head is non-NULL. As a result, we can now make tcp_data_snd_check() just call tcp_push_pending_frames() instead of the specialized __tcp_data_snd_check(). Signed-off-by: David S. Miller --- include/net/tcp.h | 1 - net/ipv4/tcp_input.c | 17 +++++++---------- net/ipv4/tcp_output.c | 15 +-------------- 3 files changed, 8 insertions(+), 25 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 9416236cc395..b19238027da8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -848,7 +848,6 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, /* tcp_output.c */ -extern void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb); extern void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, unsigned int cur_mss, int nonagle); extern int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 577424323d59..b27be2f819ac 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3346,12 +3346,9 @@ static inline void tcp_check_space(struct sock *sk) } } -static __inline__ void tcp_data_snd_check(struct sock *sk) +static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) { - struct sk_buff *skb = sk->sk_send_head; - - if (skb != NULL) - __tcp_data_snd_check(sk, skb); + tcp_push_pending_frames(sk, tp); tcp_check_space(sk); } @@ -3645,7 +3642,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ tcp_ack(sk, skb, 0); __kfree_skb(skb); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); return 0; } else { /* Header too small */ TCP_INC_STATS_BH(TCP_MIB_INERRS); @@ -3711,7 +3708,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { /* Well, only one small jumplet in fast path... */ tcp_ack(sk, skb, FLAG_DATA); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); if (!tcp_ack_scheduled(tp)) goto no_ack; } @@ -3789,7 +3786,7 @@ step5: /* step 7: process the segment text */ tcp_data_queue(sk, skb); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); tcp_ack_snd_check(sk); return 0; @@ -4099,7 +4096,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Do step6 onward by hand. */ tcp_urg(sk, skb, th); __kfree_skb(skb); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); return 0; } @@ -4290,7 +4287,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* tcp_data could move socket to TIME-WAIT */ if (sk->sk_state != TCP_CLOSE) { - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); tcp_ack_snd_check(sk); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e292e11c7319..ce1d7cfbecfc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -894,24 +894,11 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb = sk->sk_send_head; if (skb) { - if (!tcp_skb_is_last(sk, skb)) - nonagle = TCP_NAGLE_PUSH; - if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || - tcp_write_xmit(sk, cur_mss, nonagle)) + if (tcp_write_xmit(sk, cur_mss, nonagle)) tcp_check_probe_timer(sk, tp); } } -void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || - tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tcp_current_mss(sk, 1), tp->nonagle)) - tcp_check_probe_timer(sk, tp); -} - /* This function returns the amount that we can raise the * usable window based on the following constraints * -- cgit v1.2.3 From c1b4a7e69576d65efc31a8cea0714173c2841244 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:24:38 -0700 Subject: [TCP]: Move to new TSO segmenting scheme. Make TSO segment transmit size decisions at send time not earlier. The basic scheme is that we try to build as large a TSO frame as possible when pulling in the user data, but the size of the TSO frame output to the card is determined at transmit time. This is guided by tp->xmit_size_goal. It is always set to a multiple of MSS and tells sendmsg/sendpage how large an SKB to try and build. Later, tcp_write_xmit() and tcp_push_one() chop up the packet if necessary and conditions warrant. These routines can also decide to "defer" in order to wait for more ACKs to arrive and thus allow larger TSO frames to be emitted. A general observation is that TSO elongates the pipe, thus requiring a larger congestion window and larger buffering especially at the sender side. Therefore, it is important that applications 1) get a large enough socket send buffer (this is accomplished by our dynamic send buffer expansion code) 2) do large enough writes. Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 +- include/net/tcp.h | 4 +- net/ipv4/tcp.c | 26 ++- net/ipv4/tcp_input.c | 10 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_output.c | 578 +++++++++++++++++++++++++++++++------------------- net/ipv6/tcp_ipv6.c | 2 +- 7 files changed, 384 insertions(+), 240 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index dfd93d03f5d2..e4fd82e42104 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -286,7 +286,7 @@ struct tcp_sock { __u32 max_window; /* Maximal window ever seen from peer */ __u32 pmtu_cookie; /* Last pmtu seen by socket */ __u32 mss_cache; /* Cached effective mss, not including SACKS */ - __u16 mss_cache_std; /* Like mss_cache, but without TSO */ + __u16 xmit_size_goal; /* Goal for segmenting output packets */ __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ __u8 ca_state; /* State of fast-retransmit machine */ __u8 retransmits; /* Number of unrecovered RTO timeouts. */ diff --git a/include/net/tcp.h b/include/net/tcp.h index b19238027da8..a166918ca56d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -862,7 +862,7 @@ extern int tcp_write_wakeup(struct sock *); extern void tcp_send_fin(struct sock *sk); extern void tcp_send_active_reset(struct sock *sk, int priority); extern int tcp_send_synack(struct sock *); -extern void tcp_push_one(struct sock *, unsigned mss_now); +extern void tcp_push_one(struct sock *, unsigned int mss_now); extern void tcp_send_ack(struct sock *sk); extern void tcp_send_delayed_ack(struct sock *sk); @@ -968,7 +968,7 @@ static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long static inline void tcp_initialize_rcv_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - unsigned int hint = min(tp->advmss, tp->mss_cache_std); + unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); hint = min(hint, tp->rcv_wnd/2); hint = min(hint, TCP_MIN_RCVMSS); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2ba73bf3a8f9..29894c749163 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse size_t psize, int flags) { struct tcp_sock *tp = tcp_sk(sk); - int mss_now; + int mss_now, size_goal; int err; ssize_t copied; long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); @@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; copied = 0; err = -EPIPE; @@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse int offset = poffset % PAGE_SIZE; int size = min_t(size_t, psize, PAGE_SIZE - offset); - if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { + if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -652,7 +653,7 @@ new_segment: goto wait_for_memory; skb_entail(sk, tp, skb); - copy = mss_now; + copy = size_goal; } if (copy > size) @@ -693,7 +694,7 @@ new_segment: if (!(psize -= copy)) goto out; - if (skb->len != mss_now || (flags & MSG_OOB)) + if (skb->len < mss_now || (flags & MSG_OOB)) continue; if (forced_push(tp)) { @@ -713,6 +714,7 @@ wait_for_memory: goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; } out: @@ -754,7 +756,7 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, static inline int select_size(struct sock *sk, struct tcp_sock *tp) { - int tmp = tp->mss_cache_std; + int tmp = tp->mss_cache; if (sk->sk_route_caps & NETIF_F_SG) { if (sk->sk_route_caps & NETIF_F_TSO) @@ -778,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int iovlen, flags; - int mss_now; + int mss_now, size_goal; int err, copied; long timeo; @@ -797,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; /* Ok commence sending. */ iovlen = msg->msg_iovlen; @@ -819,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, skb = sk->sk_write_queue.prev; if (!sk->sk_send_head || - (copy = mss_now - skb->len) <= 0) { + (copy = size_goal - skb->len) <= 0) { new_segment: /* Allocate new segment. If the interface is SG, @@ -842,7 +845,7 @@ new_segment: skb->ip_summed = CHECKSUM_HW; skb_entail(sk, tp, skb); - copy = mss_now; + copy = size_goal; } /* Try to append data to the end of skb. */ @@ -937,7 +940,7 @@ new_segment: if ((seglen -= copy) == 0 && iovlen == 0) goto out; - if (skb->len != mss_now || (flags & MSG_OOB)) + if (skb->len < mss_now || (flags & MSG_OOB)) continue; if (forced_push(tp)) { @@ -957,6 +960,7 @@ wait_for_memory: goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; } } @@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_rto = jiffies_to_usecs(tp->rto); info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); - info->tcpi_snd_mss = tp->mss_cache_std; + info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = tp->ack.rcv_mss; info->tcpi_unacked = tp->packets_out; @@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, switch (optname) { case TCP_MAXSEG: - val = tp->mss_cache_std; + val = tp->mss_cache; if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) val = tp->rx_opt.user_mss; break; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2ef2f355b8b8..8de2f1071c2b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) { - if (tp->mss_cache_std > 1460) + if (tp->mss_cache > 1460) cwnd = 2; else - cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; + cwnd = (tp->mss_cache > 1095) ? 3 : 4; } return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } @@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (sk->sk_route_caps & NETIF_F_TSO) { sk->sk_route_caps &= ~NETIF_F_TSO; sock_set_flag(sk, SOCK_NO_LARGESEND); - tp->mss_cache = tp->mss_cache_std; + tp->mss_cache = tp->mss_cache; } if (!tp->sacked_out) @@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ (IsFack(tp) || !before(lost_retrans, TCP_SKB_CB(skb)->ack_seq + tp->reordering * - tp->mss_cache_std))) { + tp->mss_cache))) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); @@ -3334,7 +3334,7 @@ static void tcp_new_space(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tcp_should_expand_sndbuf(sk, tp)) { - int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) + + int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), demanded = max_t(unsigned int, tp->snd_cwnd, tp->reordering + 1); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ebf112347a97..62f62bb05c2a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk) */ tp->snd_ssthresh = 0x7fffffff; /* Infinity */ tp->snd_cwnd_clamp = ~0; - tp->mss_cache_std = tp->mss_cache = 536; + tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; tp->ca_ops = &tcp_init_congestion_ops; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0a4cd24b6578..fd3ce38184ae 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1; * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. */ -int sysctl_tcp_tso_win_divisor = 8; +int sysctl_tcp_tso_win_divisor = 3; static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) @@ -403,21 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) sk->sk_send_head = skb; } -static inline void tcp_tso_set_push(struct sk_buff *skb) -{ - /* Force push to be on for any TSO frames to workaround - * problems with busted implementations like Mac OS-X that - * hold off socket receive wakeups until push is seen. - */ - if (tcp_skb_pcount(skb) > 1) - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; -} - static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - if (skb->len <= tp->mss_cache_std || + if (skb->len <= tp->mss_cache || !(sk->sk_route_caps & NETIF_F_TSO)) { /* Avoid the costly divide in the normal * non-TSO case. @@ -427,164 +417,10 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) } else { unsigned int factor; - factor = skb->len + (tp->mss_cache_std - 1); - factor /= tp->mss_cache_std; + factor = skb->len + (tp->mss_cache - 1); + factor /= tp->mss_cache; skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = tp->mss_cache_std; - } -} - -/* Does SKB fit into the send window? */ -static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) -{ - u32 end_seq = TCP_SKB_CB(skb)->end_seq; - - return !after(end_seq, tp->snd_una + tp->snd_wnd); -} - -/* Can at least one segment of SKB be sent right now, according to the - * congestion window rules? If so, return how many segments are allowed. - */ -static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) -{ - u32 in_flight, cwnd; - - /* Don't be strict about the congestion window for the final FIN. */ - if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) - return 1; - - in_flight = tcp_packets_in_flight(tp); - cwnd = tp->snd_cwnd; - if (in_flight < cwnd) - return (cwnd - in_flight); - - return 0; -} - -static inline int tcp_minshall_check(const struct tcp_sock *tp) -{ - return after(tp->snd_sml,tp->snd_una) && - !after(tp->snd_sml, tp->snd_nxt); -} - -/* Return 0, if packet can be sent now without violation Nagle's rules: - * 1. It is full sized. - * 2. Or it contains FIN. (already checked by caller) - * 3. Or TCP_NODELAY was set. - * 4. Or TCP_CORK is not set, and all sent packets are ACKed. - * With Minshall's modification: all sent small packets are ACKed. - */ - -static inline int tcp_nagle_check(const struct tcp_sock *tp, - const struct sk_buff *skb, - unsigned mss_now, int nonagle) -{ - return (skb->len < mss_now && - ((nonagle&TCP_NAGLE_CORK) || - (!nonagle && - tp->packets_out && - tcp_minshall_check(tp)))); -} - -/* Return non-zero if the Nagle test allows this packet to be - * sent now. - */ -static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) -{ - /* Nagle rule does not apply to frames, which sit in the middle of the - * write_queue (they have no chances to get new data). - * - * This is implemented in the callers, where they modify the 'nonagle' - * argument based upon the location of SKB in the send queue. - */ - if (nonagle & TCP_NAGLE_PUSH) - return 1; - - /* Don't use the nagle rule for urgent data (or for the final FIN). */ - if (tp->urg_mode || - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) - return 1; - - if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) - return 1; - - return 0; -} - -/* This must be invoked the first time we consider transmitting - * SKB onto the wire. - */ -static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb) -{ - int tso_segs = tcp_skb_pcount(skb); - - if (!tso_segs) { - tcp_set_skb_tso_segs(sk, skb); - tso_segs = tcp_skb_pcount(skb); - } - return tso_segs; -} - -/* This checks if the data bearing packet SKB (usually sk->sk_send_head) - * should be put on the wire right now. If so, it returns the number of - * packets allowed by the congestion window. - */ -static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) -{ - struct tcp_sock *tp = tcp_sk(sk); - unsigned int cwnd_quota; - - tcp_init_tso_segs(sk, skb); - - if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) - return 0; - - cwnd_quota = tcp_cwnd_test(tp, skb); - if (cwnd_quota && - !tcp_snd_wnd_test(tp, skb, cur_mss)) - cwnd_quota = 0; - - return cwnd_quota; -} - -static inline int tcp_skb_is_last(const struct sock *sk, - const struct sk_buff *skb) -{ - return skb->next == (struct sk_buff *)&sk->sk_write_queue; -} - -int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) -{ - struct sk_buff *skb = sk->sk_send_head; - - return (skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), - (tcp_skb_is_last(sk, skb) ? - TCP_NAGLE_PUSH : - tp->nonagle))); -} - - -/* Send _single_ skb sitting at the send head. This function requires - * true push pending frames to setup probe timer etc. - */ -void tcp_push_one(struct sock *sk, unsigned cur_mss) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = sk->sk_send_head; - - if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) { - /* Send it out now. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); - if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) { - sk->sk_send_head = NULL; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tcp_packets_out_inc(sk, tp, skb); - return; - } + skb_shinfo(skb)->tso_size = tp->mss_cache; } } @@ -791,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* And store cached results */ tp->pmtu_cookie = pmtu; - tp->mss_cache = tp->mss_cache_std = mss_now; + tp->mss_cache = mss_now; return mss_now; } @@ -803,56 +639,47 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) * cannot be large. However, taking into account rare use of URG, this * is not a big flaw. */ - -unsigned int tcp_current_mss(struct sock *sk, int large) +unsigned int tcp_current_mss(struct sock *sk, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); - unsigned int do_large, mss_now; + u32 mss_now; + u16 xmit_size_goal; + int doing_tso = 0; + + mss_now = tp->mss_cache; + + if (large_allowed && + (sk->sk_route_caps & NETIF_F_TSO) && + !tp->urg_mode) + doing_tso = 1; - mss_now = tp->mss_cache_std; if (dst) { u32 mtu = dst_mtu(dst); if (mtu != tp->pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } - do_large = (large && - (sk->sk_route_caps & NETIF_F_TSO) && - !tp->urg_mode); + if (tp->rx_opt.eff_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); - if (do_large) { - unsigned int large_mss, factor, limit; + xmit_size_goal = mss_now; - large_mss = 65535 - tp->af_specific->net_header_len - + if (doing_tso) { + xmit_size_goal = 65535 - + tp->af_specific->net_header_len - tp->ext_header_len - tp->tcp_header_len; - if (tp->max_window && large_mss > (tp->max_window>>1)) - large_mss = max((tp->max_window>>1), - 68U - tp->tcp_header_len); - - factor = large_mss / mss_now; + if (tp->max_window && + (xmit_size_goal > (tp->max_window >> 1))) + xmit_size_goal = max((tp->max_window >> 1), + 68U - tp->tcp_header_len); - /* Always keep large mss multiple of real mss, but - * do not exceed 1/tso_win_divisor of the congestion window - * so we can keep the ACK clock ticking and minimize - * bursting. - */ - limit = tp->snd_cwnd; - if (sysctl_tcp_tso_win_divisor) - limit /= sysctl_tcp_tso_win_divisor; - limit = max(1U, limit); - if (factor > limit) - factor = limit; - - tp->mss_cache = mss_now * factor; - - mss_now = tp->mss_cache; + xmit_size_goal -= (xmit_size_goal % mss_now); } + tp->xmit_size_goal = xmit_size_goal; - if (tp->rx_opt.eff_sacks) - mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); return mss_now; } @@ -876,6 +703,251 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) } } +static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd) +{ + u32 window, cwnd_len; + + window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq); + cwnd_len = mss_now * cwnd; + return min(window, cwnd_len); +} + +/* Can at least one segment of SKB be sent right now, according to the + * congestion window rules? If so, return how many segments are allowed. + */ +static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 in_flight, cwnd; + + /* Don't be strict about the congestion window for the final FIN. */ + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + return 1; + + in_flight = tcp_packets_in_flight(tp); + cwnd = tp->snd_cwnd; + if (in_flight < cwnd) + return (cwnd - in_flight); + + return 0; +} + +/* This must be invoked the first time we consider transmitting + * SKB onto the wire. + */ +static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb) +{ + int tso_segs = tcp_skb_pcount(skb); + + if (!tso_segs) { + tcp_set_skb_tso_segs(sk, skb); + tso_segs = tcp_skb_pcount(skb); + } + return tso_segs; +} + +static inline int tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml,tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Return 0, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. + * 2. Or it contains FIN. (already checked by caller) + * 3. Or TCP_NODELAY was set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. + */ + +static inline int tcp_nagle_check(const struct tcp_sock *tp, + const struct sk_buff *skb, + unsigned mss_now, int nonagle) +{ + return (skb->len < mss_now && + ((nonagle&TCP_NAGLE_CORK) || + (!nonagle && + tp->packets_out && + tcp_minshall_check(tp)))); +} + +/* Return non-zero if the Nagle test allows this packet to be + * sent now. + */ +static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + /* Nagle rule does not apply to frames, which sit in the middle of the + * write_queue (they have no chances to get new data). + * + * This is implemented in the callers, where they modify the 'nonagle' + * argument based upon the location of SKB in the send queue. + */ + if (nonagle & TCP_NAGLE_PUSH) + return 1; + + /* Don't use the nagle rule for urgent data (or for the final FIN). */ + if (tp->urg_mode || + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) + return 1; + + if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + return 1; + + return 0; +} + +/* Does at least the first segment of SKB fit into the send window? */ +static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) +{ + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (skb->len > cur_mss) + end_seq = TCP_SKB_CB(skb)->seq + cur_mss; + + return !after(end_seq, tp->snd_una + tp->snd_wnd); +} + +/* This checks if the data bearing packet SKB (usually sk->sk_send_head) + * should be put on the wire right now. If so, it returns the number of + * packets allowed by the congestion window. + */ +static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int cwnd_quota; + + tcp_init_tso_segs(sk, skb); + + if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) + return 0; + + cwnd_quota = tcp_cwnd_test(tp, skb); + if (cwnd_quota && + !tcp_snd_wnd_test(tp, skb, cur_mss)) + cwnd_quota = 0; + + return cwnd_quota; +} + +static inline int tcp_skb_is_last(const struct sock *sk, + const struct sk_buff *skb) +{ + return skb->next == (struct sk_buff *)&sk->sk_write_queue; +} + +int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) +{ + struct sk_buff *skb = sk->sk_send_head; + + return (skb && + tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), + (tcp_skb_is_last(sk, skb) ? + TCP_NAGLE_PUSH : + tp->nonagle))); +} + +/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet + * which is put after SKB on the list. It is very much like + * tcp_fragment() except that it may make several kinds of assumptions + * in order to speed up the splitting operation. In particular, we + * know that all the data is in scatter-gather pages, and that the + * packet has never been sent out before (and thus is not cloned). + */ +static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len) +{ + struct sk_buff *buff; + int nlen = skb->len - len; + u16 flags; + + /* All of a TSO frame must be composed of paged data. */ + BUG_ON(skb->len != skb->data_len); + + buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC); + if (unlikely(buff == NULL)) + return -ENOMEM; + + buff->truesize = nlen; + skb->truesize -= nlen; + + /* Correct the sequence numbers. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(buff)->flags = flags; + + /* This packet was never sent out yet, so no SACK bits. */ + TCP_SKB_CB(buff)->sacked = 0; + + buff->ip_summed = skb->ip_summed = CHECKSUM_HW; + skb_split(skb, buff, len); + + /* Fix up tso_factor for both original and new SKB. */ + tcp_set_skb_tso_segs(sk, skb); + tcp_set_skb_tso_segs(sk, buff); + + /* Link BUFF into the send queue. */ + skb_header_release(buff); + __skb_append(skb, buff); + + return 0; +} + +/* Try to defer sending, if possible, in order to minimize the amount + * of TSO splitting we do. View it as a kind of TSO Nagle test. + * + * This algorithm is from John Heffner. + */ +static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 send_win, cong_win, limit, in_flight; + + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + return 0; + + in_flight = tcp_packets_in_flight(tp); + + BUG_ON(tcp_skb_pcount(skb) <= 1 || + (tp->snd_cwnd <= in_flight)); + + send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq; + + /* From in_flight test above, we know that cwnd > in_flight. */ + cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; + + limit = min(send_win, cong_win); + + /* If sk_send_head can be sent fully now, just do it. */ + if (skb->len <= limit) + return 0; + + if (sysctl_tcp_tso_win_divisor) { + u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); + + /* If at least some fraction of a window is available, + * just use it. + */ + chunk /= sysctl_tcp_tso_win_divisor; + if (limit >= chunk) + return 0; + } else { + /* Different approach, try not to defer past a single + * ACK. Receiver should ACK every other full sized + * frame, so if we have space for more than 3 frames + * then send now. + */ + if (limit > tcp_max_burst(tp) * tp->mss_cache) + return 0; + } + + /* Ok, it looks like it is advisable to defer. */ + return 1; +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -887,8 +959,8 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int tso_segs, cwnd_quota; - int sent_pkts; + unsigned int tso_segs, sent_pkts; + int cwnd_quota; /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and all @@ -903,24 +975,44 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) tso_segs = tcp_init_tso_segs(sk, skb); cwnd_quota = tcp_cwnd_test(tp, skb); + if (unlikely(!cwnd_quota)) + goto out; + sent_pkts = 0; + while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) { + BUG_ON(!tso_segs); - while (cwnd_quota >= tso_segs) { - if (unlikely(!tcp_nagle_test(tp, skb, mss_now, - (tcp_skb_is_last(sk, skb) ? - nonagle : TCP_NAGLE_PUSH)))) - break; + if (tso_segs == 1) { + if (unlikely(!tcp_nagle_test(tp, skb, mss_now, + (tcp_skb_is_last(sk, skb) ? + nonagle : TCP_NAGLE_PUSH)))) + break; + } else { + if (tcp_tso_should_defer(sk, tp, skb)) + break; + } - if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) - break; + if (tso_segs > 1) { + u32 limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); + + if (skb->len < limit) { + unsigned int trim = skb->len % mss_now; - if (unlikely(skb->len > mss_now)) { + if (trim) + limit = skb->len - trim; + } + if (skb->len > limit) { + if (tso_fragment(sk, skb, limit)) + break; + } + } else if (unlikely(skb->len > mss_now)) { if (unlikely(tcp_fragment(sk, skb, mss_now))) break; } TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); + if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) break; @@ -936,6 +1028,11 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) * the packet above, tso_segs will no longer be valid. */ cwnd_quota -= tcp_skb_pcount(skb); + + BUG_ON(cwnd_quota < 0); + if (!cwnd_quota) + break; + skb = sk->sk_send_head; if (!skb) break; @@ -946,7 +1043,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) tcp_cwnd_validate(sk, tp); return 0; } - +out: return !tp->packets_out && sk->sk_send_head; } @@ -965,6 +1062,53 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, } } +/* Send _single_ skb sitting at the send head. This function requires + * true push pending frames to setup probe timer etc. + */ +void tcp_push_one(struct sock *sk, unsigned int mss_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = sk->sk_send_head; + unsigned int tso_segs, cwnd_quota; + + BUG_ON(!skb || skb->len < mss_now); + + tso_segs = tcp_init_tso_segs(sk, skb); + cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH); + + if (likely(cwnd_quota)) { + BUG_ON(!tso_segs); + + if (tso_segs > 1) { + u32 limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); + + if (skb->len < limit) { + unsigned int trim = skb->len % mss_now; + + if (trim) + limit = skb->len - trim; + } + if (skb->len > limit) { + if (unlikely(tso_fragment(sk, skb, limit))) + return; + } + } else if (unlikely(skb->len > mss_now)) { + if (unlikely(tcp_fragment(sk, skb, mss_now))) + return; + } + + /* Send it out now. */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + + if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) { + update_send_head(sk, tp, skb); + tcp_cwnd_validate(sk, tp); + return; + } + } +} + /* This function returns the amount that we can raise the * usable window based on the following constraints * @@ -1222,7 +1366,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (sk->sk_route_caps & NETIF_F_TSO) { sk->sk_route_caps &= ~NETIF_F_TSO; sock_set_flag(sk, SOCK_NO_LARGESEND); - tp->mss_cache = tp->mss_cache_std; } if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) @@ -1284,7 +1427,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * is still in somebody's hands, else make a clone. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); err = tcp_transmit_skb(sk, (skb_cloned(skb) ? pskb_copy(skb, GFP_ATOMIC): @@ -1853,14 +1995,12 @@ int tcp_write_wakeup(struct sock *sk) if (sk->sk_route_caps & NETIF_F_TSO) { sock_set_flag(sk, SOCK_NO_LARGESEND); sk->sk_route_caps &= ~NETIF_F_TSO; - tp->mss_cache = tp->mss_cache_std; } } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb); TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); if (!err) { update_send_head(sk, tp, skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9dac7fdf4726..f6e288dc116e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2018,7 +2018,7 @@ static int tcp_v6_init_sock(struct sock *sk) */ tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_clamp = ~0; - tp->mss_cache_std = tp->mss_cache = 536; + tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; -- cgit v1.2.3