From bdddbf6996c0b9299efc97b8f66e06286f3aa8c9 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 29 Apr 2015 08:42:44 +0800 Subject: xfrm: fix a race in xfrm_state_lookup_byspi The returned xfrm_state should be hold before unlock xfrm_state_lock, otherwise the returned xfrm_state maybe be released. Fixes: c454997e6[{pktgen, xfrm} Introduce xfrm_state_lookup_byspi..] Cc: Fan Du Signed-off-by: Li RongQing Acked-by: Fan Du Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index f5e39e35d73a..96688cd0f6f1 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -927,8 +927,8 @@ struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, x->id.spi != spi) continue; - spin_unlock_bh(&net->xfrm.xfrm_state_lock); xfrm_state_hold(x); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } spin_unlock_bh(&net->xfrm.xfrm_state_lock); -- cgit v1.2.3 From 64aa42338e9a88c139b89797163714f0f95f3c6b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 13 May 2015 15:26:10 +0800 Subject: esp4: Use high-order sequence number bits for IV generation I noticed we were only using the low-order bits for IV generation when ESN is enabled. This is very bad because it means that the IV can repeat. We must use the full 64 bits. Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 421a80b09b62..30b544f025ac 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -256,7 +256,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) aead_givcrypt_set_crypt(req, sg, sg, clen, iv); aead_givcrypt_set_assoc(req, asg, assoclen); aead_givcrypt_set_giv(req, esph->enc_data, - XFRM_SKB_CB(skb)->seq.output.low); + XFRM_SKB_CB(skb)->seq.output.low + + ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); ESP_SKB_CB(skb)->tmp = tmp; err = crypto_aead_givencrypt(req); -- cgit v1.2.3 From 6d7258ca937027ae86d6d5938d7ae10b6d68f4a4 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 13 May 2015 15:27:18 +0800 Subject: esp6: Use high-order sequence number bits for IV generation I noticed we were only using the low-order bits for IV generation when ESN is enabled. This is very bad because it means that the IV can repeat. We must use the full 64 bits. Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/ipv6/esp6.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 31f1b5d5e2ef..7c07ce36aae2 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -248,7 +248,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) aead_givcrypt_set_crypt(req, sg, sg, clen, iv); aead_givcrypt_set_assoc(req, asg, assoclen); aead_givcrypt_set_giv(req, esph->enc_data, - XFRM_SKB_CB(skb)->seq.output.low); + XFRM_SKB_CB(skb)->seq.output.low + + ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); ESP_SKB_CB(skb)->tmp = tmp; err = crypto_aead_givencrypt(req); -- cgit v1.2.3 From 22d3a3c829fa9ecdb493d1f1f2838d543f8d86a3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 May 2015 15:40:21 +0200 Subject: mac80211: don't use napi_gro_receive() outside NAPI context No matter how the driver manages its NAPI context, there's no way sending frames to it from a timer can be correct, since it would corrupt the internal GRO lists. To avoid that, always use the non-NAPI path when releasing frames from the timer. Cc: stable@vger.kernel.org Reported-by: Jean Trivelly Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 3 +++ net/mac80211/rx.c | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index ab46ab4a7249..cdc374296245 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -205,6 +205,8 @@ enum ieee80211_packet_rx_flags { * @IEEE80211_RX_CMNTR: received on cooked monitor already * @IEEE80211_RX_BEACON_REPORTED: This frame was already reported * to cfg80211_report_obss_beacon(). + * @IEEE80211_RX_REORDER_TIMER: this frame is released by the + * reorder buffer timeout timer, not the normal RX path * * These flags are used across handling multiple interfaces * for a single frame. @@ -212,6 +214,7 @@ enum ieee80211_packet_rx_flags { enum ieee80211_rx_flags { IEEE80211_RX_CMNTR = BIT(0), IEEE80211_RX_BEACON_REPORTED = BIT(1), + IEEE80211_RX_REORDER_TIMER = BIT(2), }; struct ieee80211_rx_data { diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 260eed45b6d2..5793f75c5ffd 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2121,7 +2121,8 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) /* deliver to local stack */ skb->protocol = eth_type_trans(skb, dev); memset(skb->cb, 0, sizeof(skb->cb)); - if (rx->local->napi) + if (!(rx->flags & IEEE80211_RX_REORDER_TIMER) && + rx->local->napi) napi_gro_receive(rx->local->napi, skb); else netif_receive_skb(skb); @@ -3231,7 +3232,7 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) /* This is OK -- must be QoS data frame */ .security_idx = tid, .seqno_idx = tid, - .flags = 0, + .flags = IEEE80211_RX_REORDER_TIMER, }; struct tid_ampdu_rx *tid_agg_rx; -- cgit v1.2.3 From 252ec2b3aa6f6a9ac29c6539027db600c11bf45e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 May 2015 13:36:38 +0200 Subject: mac80211: don't split remain-on-channel for coalescing Due to remain-on-channel scheduling delays, when we split an ROC while coalescing, we'll usually get a picture like this: existing ROC: |------------------| current time: ^ new ROC: |------| |-------| If the expected response frames are then transmitted by the peer in the hole between the two fragments of the new ROC, we miss them and the process (e.g. ANQP query) fails. mac80211 expects that the window to miss something is small: existing ROC: |------------------| new ROC: |------||-------| but that's normally not the case. To avoid this problem, coalesce only if the new ROC's duration is <= the remaining time on the existing one: existing ROC: |------------------| new ROC: |-----| and never split a new one but schedule it afterwards instead: existing ROC: |------------------| new ROC: |-------------| type=bugfix bug=not-tracked fixes=unknown Reported-by: Matti Gottlieb Reviewed-by: EliadX Peller Reviewed-by: Matti Gottlieb Tested-by: Matti Gottlieb Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 59 +++++++++------------------------------------- net/mac80211/ieee80211_i.h | 6 ----- 2 files changed, 11 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 265e42721a66..ff347a0eebd4 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2495,51 +2495,22 @@ static bool ieee80211_coalesce_started_roc(struct ieee80211_local *local, struct ieee80211_roc_work *new_roc, struct ieee80211_roc_work *cur_roc) { - unsigned long j = jiffies; - unsigned long cur_roc_end = cur_roc->hw_start_time + - msecs_to_jiffies(cur_roc->duration); - struct ieee80211_roc_work *next_roc; - int new_dur; + unsigned long now = jiffies; + unsigned long remaining = cur_roc->hw_start_time + + msecs_to_jiffies(cur_roc->duration) - + now; if (WARN_ON(!cur_roc->started || !cur_roc->hw_begun)) return false; - if (time_after(j + IEEE80211_ROC_MIN_LEFT, cur_roc_end)) + /* if it doesn't fit entirely, schedule a new one */ + if (new_roc->duration > jiffies_to_msecs(remaining)) return false; ieee80211_handle_roc_started(new_roc); - new_dur = new_roc->duration - jiffies_to_msecs(cur_roc_end - j); - - /* cur_roc is long enough - add new_roc to the dependents list. */ - if (new_dur <= 0) { - list_add_tail(&new_roc->list, &cur_roc->dependents); - return true; - } - - new_roc->duration = new_dur; - - /* - * if cur_roc was already coalesced before, we might - * want to extend the next roc instead of adding - * a new one. - */ - next_roc = list_entry(cur_roc->list.next, - struct ieee80211_roc_work, list); - if (&next_roc->list != &local->roc_list && - next_roc->chan == new_roc->chan && - next_roc->sdata == new_roc->sdata && - !WARN_ON(next_roc->started)) { - list_add_tail(&new_roc->list, &next_roc->dependents); - next_roc->duration = max(next_roc->duration, - new_roc->duration); - next_roc->type = max(next_roc->type, new_roc->type); - return true; - } - - /* add right after cur_roc */ - list_add(&new_roc->list, &cur_roc->list); - + /* add to dependents so we send the expired event properly */ + list_add_tail(&new_roc->list, &cur_roc->dependents); return true; } @@ -2652,17 +2623,9 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, * In the offloaded ROC case, if it hasn't begun, add * this new one to the dependent list to be handled * when the master one begins. If it has begun, - * check that there's still a minimum time left and - * if so, start this one, transmitting the frame, but - * add it to the list directly after this one with - * a reduced time so we'll ask the driver to execute - * it right after finishing the previous one, in the - * hope that it'll also be executed right afterwards, - * effectively extending the old one. - * If there's no minimum time left, just add it to the - * normal list. - * TODO: the ROC type is ignored here, assuming that it - * is better to immediately use the current ROC. + * check if it fits entirely within the existing one, + * in which case it will just be dependent as well. + * Otherwise, schedule it by itself. */ if (!tmp->hw_begun) { list_add_tail(&roc->list, &tmp->dependents); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index cdc374296245..c0a9187bc3a9 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -328,12 +328,6 @@ struct mesh_preq_queue { u8 flags; }; -#if HZ/100 == 0 -#define IEEE80211_ROC_MIN_LEFT 1 -#else -#define IEEE80211_ROC_MIN_LEFT (HZ/100) -#endif - struct ieee80211_roc_work { struct list_head list; struct list_head dependents; -- cgit v1.2.3 From f9dca80b98caac8b4bfb43a2edf1e9f877ccf322 Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Wed, 13 May 2015 09:16:48 +0000 Subject: mac80211: fix AP_VLAN crypto tailroom calculation Some splats I was seeing: (a) WARNING: CPU: 1 PID: 0 at /devel/src/linux/net/mac80211/wep.c:102 ieee80211_wep_add_iv (b) WARNING: CPU: 1 PID: 0 at /devel/src/linux/net/mac80211/wpa.c:73 ieee80211_tx_h_michael_mic_add (c) WARNING: CPU: 3 PID: 0 at /devel/src/linux/net/mac80211/wpa.c:433 ieee80211_crypto_ccmp_encrypt I've seen (a) and (b) with ath9k hw crypto and (c) with ath9k sw crypto. All of them were related to insufficient skb tailroom and I was able to trigger these with ping6 program. AP_VLANs may inherit crypto keys from parent AP. This wasn't considered and yielded problems in some setups resulting in inability to transmit data because mac80211 wouldn't resize skbs when necessary and subsequently drop some packets due to insufficient tailroom. For efficiency purposes don't inspect both AP_VLAN and AP sdata looking for tailroom counter. Instead update AP_VLAN tailroom counters whenever their master AP tailroom counter changes. Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 6 ++++ net/mac80211/key.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++------ net/mac80211/key.h | 1 + net/mac80211/util.c | 3 ++ 4 files changed, 83 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index bab5c63c0bad..84cef600c573 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -522,6 +522,12 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) memcpy(sdata->vif.hw_queue, master->vif.hw_queue, sizeof(sdata->vif.hw_queue)); sdata->vif.bss_conf.chandef = master->vif.bss_conf.chandef; + + mutex_lock(&local->key_mtx); + sdata->crypto_tx_tailroom_needed_cnt += + master->crypto_tx_tailroom_needed_cnt; + mutex_unlock(&local->key_mtx); + break; } case NL80211_IFTYPE_AP: diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 2291cd730091..a907f2d5c12d 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -58,6 +58,22 @@ static void assert_key_lock(struct ieee80211_local *local) lockdep_assert_held(&local->key_mtx); } +static void +update_vlan_tailroom_need_count(struct ieee80211_sub_if_data *sdata, int delta) +{ + struct ieee80211_sub_if_data *vlan; + + if (sdata->vif.type != NL80211_IFTYPE_AP) + return; + + mutex_lock(&sdata->local->mtx); + + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) + vlan->crypto_tx_tailroom_needed_cnt += delta; + + mutex_unlock(&sdata->local->mtx); +} + static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata) { /* @@ -79,6 +95,8 @@ static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata) * http://mid.gmane.org/1308590980.4322.19.camel@jlt3.sipsolutions.net */ + update_vlan_tailroom_need_count(sdata, 1); + if (!sdata->crypto_tx_tailroom_needed_cnt++) { /* * Flush all XMIT packets currently using HW encryption or no @@ -88,6 +106,15 @@ static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata) } } +static void decrease_tailroom_need_count(struct ieee80211_sub_if_data *sdata, + int delta) +{ + WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt < delta); + + update_vlan_tailroom_need_count(sdata, -delta); + sdata->crypto_tx_tailroom_needed_cnt -= delta; +} + static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) { struct ieee80211_sub_if_data *sdata; @@ -144,7 +171,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) || (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) - sdata->crypto_tx_tailroom_needed_cnt--; + decrease_tailroom_need_count(sdata, 1); WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) && (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)); @@ -541,7 +568,7 @@ static void __ieee80211_key_destroy(struct ieee80211_key *key, schedule_delayed_work(&sdata->dec_tailroom_needed_wk, HZ/2); } else { - sdata->crypto_tx_tailroom_needed_cnt--; + decrease_tailroom_need_count(sdata, 1); } } @@ -631,6 +658,7 @@ void ieee80211_key_free(struct ieee80211_key *key, bool delay_tailroom) void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata) { struct ieee80211_key *key; + struct ieee80211_sub_if_data *vlan; ASSERT_RTNL(); @@ -639,7 +667,14 @@ void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata) mutex_lock(&sdata->local->key_mtx); - sdata->crypto_tx_tailroom_needed_cnt = 0; + WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt || + sdata->crypto_tx_tailroom_pending_dec); + + if (sdata->vif.type == NL80211_IFTYPE_AP) { + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) + WARN_ON_ONCE(vlan->crypto_tx_tailroom_needed_cnt || + vlan->crypto_tx_tailroom_pending_dec); + } list_for_each_entry(key, &sdata->key_list, list) { increment_tailroom_need_count(sdata); @@ -649,6 +684,22 @@ void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata) mutex_unlock(&sdata->local->key_mtx); } +void ieee80211_reset_crypto_tx_tailroom(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_sub_if_data *vlan; + + mutex_lock(&sdata->local->key_mtx); + + sdata->crypto_tx_tailroom_needed_cnt = 0; + + if (sdata->vif.type == NL80211_IFTYPE_AP) { + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) + vlan->crypto_tx_tailroom_needed_cnt = 0; + } + + mutex_unlock(&sdata->local->key_mtx); +} + void ieee80211_iter_keys(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void (*iter)(struct ieee80211_hw *hw, @@ -688,8 +739,8 @@ static void ieee80211_free_keys_iface(struct ieee80211_sub_if_data *sdata, { struct ieee80211_key *key, *tmp; - sdata->crypto_tx_tailroom_needed_cnt -= - sdata->crypto_tx_tailroom_pending_dec; + decrease_tailroom_need_count(sdata, + sdata->crypto_tx_tailroom_pending_dec); sdata->crypto_tx_tailroom_pending_dec = 0; ieee80211_debugfs_key_remove_mgmt_default(sdata); @@ -709,6 +760,7 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *vlan; + struct ieee80211_sub_if_data *master; struct ieee80211_key *key, *tmp; LIST_HEAD(keys); @@ -728,8 +780,20 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata, list_for_each_entry_safe(key, tmp, &keys, list) __ieee80211_key_destroy(key, false); - WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt || - sdata->crypto_tx_tailroom_pending_dec); + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + if (sdata->bss) { + master = container_of(sdata->bss, + struct ieee80211_sub_if_data, + u.ap); + + WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt != + master->crypto_tx_tailroom_needed_cnt); + } + } else { + WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt || + sdata->crypto_tx_tailroom_pending_dec); + } + if (sdata->vif.type == NL80211_IFTYPE_AP) { list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) WARN_ON_ONCE(vlan->crypto_tx_tailroom_needed_cnt || @@ -793,8 +857,8 @@ void ieee80211_delayed_tailroom_dec(struct work_struct *wk) */ mutex_lock(&sdata->local->key_mtx); - sdata->crypto_tx_tailroom_needed_cnt -= - sdata->crypto_tx_tailroom_pending_dec; + decrease_tailroom_need_count(sdata, + sdata->crypto_tx_tailroom_pending_dec); sdata->crypto_tx_tailroom_pending_dec = 0; mutex_unlock(&sdata->local->key_mtx); } diff --git a/net/mac80211/key.h b/net/mac80211/key.h index c5a31835be0e..96557dd1e77d 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -161,6 +161,7 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata, void ieee80211_free_sta_keys(struct ieee80211_local *local, struct sta_info *sta); void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata); +void ieee80211_reset_crypto_tx_tailroom(struct ieee80211_sub_if_data *sdata); #define key_mtx_dereference(local, ref) \ rcu_dereference_protected(ref, lockdep_is_held(&((local)->key_mtx))) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 79412f16b61d..b864ebc6ab8f 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2022,6 +2022,9 @@ int ieee80211_reconfig(struct ieee80211_local *local) mutex_unlock(&local->sta_mtx); /* add back keys */ + list_for_each_entry(sdata, &local->interfaces, list) + ieee80211_reset_crypto_tx_tailroom(sdata); + list_for_each_entry(sdata, &local->interfaces, list) if (ieee80211_sdata_running(sdata)) ieee80211_enable_keys(sdata); -- cgit v1.2.3 From b0494532214bdfbf241e94fabab5dd46f7b82631 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 11 May 2015 17:53:10 +0300 Subject: libceph: request a new osdmap if lingering request maps to no osd This commit does two things. First, if there are any homeless lingering requests, we now request a new osdmap even if the osdmap that is being processed brought no changes, i.e. if a given lingering request turned homeless in one of the previous epochs and remained homeless in the current epoch. Not doing so leaves us with a stale osdmap and as a result we may miss our window for reestablishing the watch and lose notifies. MON=1 OSD=1: # cat linger-needmap.sh #!/bin/bash rbd create --size 1 test DEV=$(rbd map test) ceph osd out 0 rbd map dne/dne # obtain a new osdmap as a side effect (!) sleep 1 ceph osd in 0 rbd resize --size 2 test # rbd info test | grep size -> 2M # blockdev --getsize $DEV -> 1M N.B.: Not obtaining a new osdmap in between "osd out" and "osd in" above is enough to make it miss that resize notify, but that is a bug^Wlimitation of ceph watch/notify v1. Second, homeless lingering requests are now kicked just like those lingering requests whose mapping has changed. This is mainly to recognize that a homeless lingering request makes no sense and to preserve the invariant that a registered lingering request is not sitting on any of r_req_lru_item lists. This spares us a WARN_ON, which commit ba9d114ec557 ("libceph: clear r_req_lru_item in __unregister_linger_request()") tried to fix the _wrong_ way. Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 41a4abc7e98e..31d4b1ebff01 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2017,20 +2017,29 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, err = __map_request(osdc, req, force_resend || force_resend_writes); dout("__map_request returned %d\n", err); - if (err == 0) - continue; /* no change and no osd was specified */ if (err < 0) continue; /* hrm! */ - if (req->r_osd == NULL) { - dout("tid %llu maps to no valid osd\n", req->r_tid); - needmap++; /* request a newer map */ - continue; - } + if (req->r_osd == NULL || err > 0) { + if (req->r_osd == NULL) { + dout("lingering %p tid %llu maps to no osd\n", + req, req->r_tid); + /* + * A homeless lingering request makes + * no sense, as it's job is to keep + * a particular OSD connection open. + * Request a newer map and kick the + * request, knowing that it won't be + * resent until we actually get a map + * that can tell us where to send it. + */ + needmap++; + } - dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - __register_request(osdc, req); - __unregister_linger_request(osdc, req); + dout("kicking lingering %p tid %llu osd%d\n", req, + req->r_tid, req->r_osd ? req->r_osd->o_osd : -1); + __register_request(osdc, req); + __unregister_linger_request(osdc, req); + } } reset_changed_osds(osdc); mutex_unlock(&osdc->request_mutex); -- cgit v1.2.3 From 521a04d06a729e5971cdee7f84080387ed320527 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 11 May 2015 17:53:34 +0300 Subject: Revert "libceph: clear r_req_lru_item in __unregister_linger_request()" This reverts commit ba9d114ec5578e6e99a4dfa37ff8ae688040fd64. .. which introduced a regression that prevented all lingering requests requeued in kick_requests() from ever being sent to the OSDs, resulting in a lot of missed notifies. In retrospect it's pretty obvious that r_req_lru_item item in the case of lingering requests can be used not only for notarget, but also for unsent linkage due to how tightly actual map and enqueue operations are coupled in __map_request(). The assertion that was being silenced is taken care of in the previous ("libceph: request a new osdmap if lingering request maps to no osd") commit: by always kicking homeless lingering requests we ensure that none of them ends up on the notarget list outside of the critical section guarded by request_mutex. Cc: stable@vger.kernel.org # 3.18+, needs b0494532214b "libceph: request a new osdmap if lingering request maps to no osd" Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 31d4b1ebff01..c4ec9239249a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1306,8 +1306,6 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc, if (list_empty(&req->r_osd_item)) req->r_osd = NULL; } - - list_del_init(&req->r_req_lru_item); /* can be on notarget */ ceph_osdc_put_request(req); } -- cgit v1.2.3 From 407d34ef294727bdc200934c38d9a8241f4a5547 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 21 May 2015 00:38:12 +0800 Subject: xfrm: Always zero high-order sequence number bits As we're now always including the high bits of the sequence number in the IV generation process we need to ensure that they don't contain crap. This patch ensures that the high sequence bits are always zeroed so that we don't leak random data into the IV. Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_replay.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index dab57daae408..4fd725a0c500 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -99,6 +99,7 @@ static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { XFRM_SKB_CB(skb)->seq.output.low = ++x->replay.oseq; + XFRM_SKB_CB(skb)->seq.output.hi = 0; if (unlikely(x->replay.oseq == 0)) { x->replay.oseq--; xfrm_audit_state_replay_overflow(x, skb); @@ -177,6 +178,7 @@ static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb) if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { XFRM_SKB_CB(skb)->seq.output.low = ++replay_esn->oseq; + XFRM_SKB_CB(skb)->seq.output.hi = 0; if (unlikely(replay_esn->oseq == 0)) { replay_esn->oseq--; xfrm_audit_state_replay_overflow(x, skb); -- cgit v1.2.3 From b48732e4a48d80ed4a14812f0bab09560846514e Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Tue, 26 May 2015 08:22:19 -0700 Subject: unix/caif: sk_socket can disappear when state is unlocked got a rare NULL pointer dereference in clear_bit Signed-off-by: Mark Salyzyn Acked-by: Hannes Frederic Sowa ---- v2: switch to sock_flag(sk, SOCK_DEAD) and added net/caif/caif_socket.c v3: return -ECONNRESET in upstream caller of wait function for SOCK_DEAD Signed-off-by: David S. Miller --- net/caif/caif_socket.c | 8 ++++++++ net/unix/af_unix.c | 8 ++++++++ 2 files changed, 16 insertions(+) (limited to 'net') diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 4ec0c803aef1..112ad784838a 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -330,6 +330,10 @@ static long caif_stream_data_wait(struct sock *sk, long timeo) release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); + + if (sock_flag(sk, SOCK_DEAD)) + break; + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); } @@ -373,6 +377,10 @@ static int caif_stream_recvmsg(struct socket *sock, struct msghdr *msg, struct sk_buff *skb; lock_sock(sk); + if (sock_flag(sk, SOCK_DEAD)) { + err = -ECONNRESET; + goto unlock; + } skb = skb_dequeue(&sk->sk_receive_queue); caif_check_flow_release(sk); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5266ea7b922b..06430598cf51 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1880,6 +1880,10 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, unix_state_unlock(sk); timeo = freezable_schedule_timeout(timeo); unix_state_lock(sk); + + if (sock_flag(sk, SOCK_DEAD)) + break; + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); } @@ -1939,6 +1943,10 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, struct sk_buff *skb, *last; unix_state_lock(sk); + if (sock_flag(sk, SOCK_DEAD)) { + err = -ECONNRESET; + goto unlock; + } last = skb = skb_peek(&sk->sk_receive_queue); again: if (skb == NULL) { -- cgit v1.2.3 From 86e363dc3b50bfd50a1f315934583fbda673ab8d Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 26 May 2015 16:08:48 -0700 Subject: net_sched: invoke ->attach() after setting dev->qdisc For mq qdisc, we add per tx queue qdisc to root qdisc for display purpose, however, that happens too early, before the new dev->qdisc is finally set, this causes q->list points to an old root qdisc which is going to be freed right before assigning with a new one. Fix this by moving ->attach() after setting dev->qdisc. For the record, this fixes the following crash: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 975 at lib/list_debug.c:59 __list_del_entry+0x5a/0x98() list_del corruption. prev->next should be ffff8800d1998ae8, but was 6b6b6b6b6b6b6b6b CPU: 1 PID: 975 Comm: tc Not tainted 4.1.0-rc4+ #1019 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 0000000000000009 ffff8800d73fb928 ffffffff81a44e7f 0000000047574756 ffff8800d73fb978 ffff8800d73fb968 ffffffff810790da ffff8800cfc4cd20 ffffffff814e725b ffff8800d1998ae8 ffffffff82381250 0000000000000000 Call Trace: [] dump_stack+0x4c/0x65 [] warn_slowpath_common+0x9c/0xb6 [] ? __list_del_entry+0x5a/0x98 [] warn_slowpath_fmt+0x46/0x48 [] ? dev_graft_qdisc+0x5e/0x6a [] __list_del_entry+0x5a/0x98 [] list_del+0xe/0x2d [] qdisc_list_del+0x1e/0x20 [] qdisc_destroy+0x30/0xd6 [] qdisc_graft+0x11d/0x243 [] tc_get_qdisc+0x1a6/0x1d4 [] ? mark_lock+0x2e/0x226 [] rtnetlink_rcv_msg+0x181/0x194 [] ? rtnl_lock+0x17/0x19 [] ? rtnl_lock+0x17/0x19 [] ? __rtnl_unlock+0x17/0x17 [] netlink_rcv_skb+0x4d/0x93 [] rtnetlink_rcv+0x26/0x2d [] netlink_unicast+0xcb/0x150 [] ? might_fault+0x59/0xa9 [] netlink_sendmsg+0x4fa/0x51c [] sock_sendmsg_nosec+0x12/0x1d [] sock_sendmsg+0x29/0x2e [] ___sys_sendmsg+0x1b4/0x23a [] ? native_sched_clock+0x35/0x37 [] ? sched_clock_local+0x12/0x72 [] ? sched_clock_cpu+0x9e/0xb7 [] ? current_kernel_time+0xe/0x32 [] ? lock_release_holdtime.part.29+0x71/0x7f [] ? read_seqcount_begin.constprop.27+0x5f/0x76 [] ? trace_hardirqs_on_caller+0x17d/0x199 [] ? __fget_light+0x50/0x78 [] __sys_sendmsg+0x42/0x60 [] SyS_sendmsg+0x12/0x1c [] system_call_fastpath+0x12/0x6f ---[ end trace ef29d3fb28e97ae7 ]--- For long term, we probably need to clean up the qdisc_graft() code in case it hides other bugs like this. Fixes: 95dc19299f74 ("pkt_sched: give visibility to mq slave qdiscs") Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_api.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index ad9eed70bc8f..1e1c89e51a11 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -815,10 +815,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, if (dev->flags & IFF_UP) dev_deactivate(dev); - if (new && new->ops->attach) { - new->ops->attach(new); - num_q = 0; - } + if (new && new->ops->attach) + goto skip; for (i = 0; i < num_q; i++) { struct netdev_queue *dev_queue = dev_ingress_queue(dev); @@ -834,12 +832,16 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, qdisc_destroy(old); } +skip: if (!ingress) { notify_and_destroy(net, skb, n, classid, dev->qdisc, new); if (new && !new->ops->attach) atomic_inc(&new->refcnt); dev->qdisc = new ? : &noop_qdisc; + + if (new && new->ops->attach) + new->ops->attach(new); } else { notify_and_destroy(net, skb, n, classid, old, new); } -- cgit v1.2.3 From cd5279c194f89c9b97c294af4aaf4ea8c5e3c704 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 27 May 2015 07:16:43 -0700 Subject: ip_vti/ip6_vti: Do not touch skb->mark on xmit Instead of modifying skb->mark we can simply modify the flowi_mark that is generated as a result of the xfrm_decode_session. By doing this we don't need to actually touch the skb->mark and it can be preserved as it passes out through the tunnel. Signed-off-by: Alexander Duyck Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 5 +++-- net/ipv6/ip6_vti.c | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 9f7269f3c54a..4c318e1c13c8 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -216,8 +216,6 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(&fl, 0, sizeof(fl)); - skb->mark = be32_to_cpu(tunnel->parms.o_key); - switch (skb->protocol) { case htons(ETH_P_IP): xfrm_decode_session(skb, &fl, AF_INET); @@ -233,6 +231,9 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } + /* override mark with tunnel output key */ + fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key); + return vti_xmit(skb, dev, &fl); } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index ed9d681207fa..104de4da3ff3 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -495,7 +495,6 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) int ret; memset(&fl, 0, sizeof(fl)); - skb->mark = be32_to_cpu(t->parms.o_key); switch (skb->protocol) { case htons(ETH_P_IPV6): @@ -516,6 +515,9 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_err; } + /* override mark with tunnel output key */ + fl.flowi_mark = be32_to_cpu(t->parms.o_key); + ret = vti6_xmit(skb, dev, &fl); if (ret < 0) goto tx_err; -- cgit v1.2.3 From 049f8e2e28d9c3dac0744cc2f19d3157c7fb5646 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 27 May 2015 07:16:49 -0700 Subject: xfrm: Override skb->mark with tunnel->parm.i_key in xfrm_input This change makes it so that if a tunnel is defined we just use the mark from the tunnel instead of the mark from the skb header. By doing this we can avoid the need to set skb->mark inside of the tunnel receive functions. Signed-off-by: Alexander Duyck Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 526c4feb3b50..b58286ecd156 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include static struct kmem_cache *secpath_cachep __read_mostly; @@ -186,6 +188,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) struct xfrm_state *x = NULL; xfrm_address_t *daddr; struct xfrm_mode *inner_mode; + u32 mark = skb->mark; unsigned int family; int decaps = 0; int async = 0; @@ -203,6 +206,18 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_SPI_SKB_CB(skb)->daddroff); family = XFRM_SPI_SKB_CB(skb)->family; + /* if tunnel is present override skb->mark value with tunnel i_key */ + if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) { + switch (family) { + case AF_INET: + mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4->parms.i_key); + break; + case AF_INET6: + mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6->parms.i_key); + break; + } + } + /* Allocate new secpath or COW existing one. */ if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { struct sec_path *sp; @@ -229,7 +244,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } - x = xfrm_state_lookup(net, skb->mark, daddr, spi, nexthdr, family); + x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family); if (x == NULL) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); xfrm_audit_state_notfound(skb, family, spi, seq); -- cgit v1.2.3 From d55c670cbc54b2270a465cdc382ce71adae45785 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 27 May 2015 07:16:54 -0700 Subject: ip_vti/ip6_vti: Preserve skb->mark after rcv_cb call The vti6_rcv_cb and vti_rcv_cb calls were leaving the skb->mark modified after completing the function. This resulted in the original skb->mark value being lost. Since we only need skb->mark to be set for xfrm_policy_check we can pull the assignment into the rcv_cb calls and then just restore the original mark after xfrm_policy_check has been completed. Signed-off-by: Alexander Duyck Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 9 +++++++-- net/ipv6/ip6_vti.c | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 4c318e1c13c8..0c152087ca15 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -65,7 +65,6 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, goto drop; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; - skb->mark = be32_to_cpu(tunnel->parms.i_key); return xfrm_input(skb, nexthdr, spi, encap_type); } @@ -91,6 +90,8 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) struct pcpu_sw_netstats *tstats; struct xfrm_state *x; struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; + u32 orig_mark = skb->mark; + int ret; if (!tunnel) return 1; @@ -107,7 +108,11 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) x = xfrm_input_state(skb); family = x->inner_mode->afinfo->family; - if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + skb->mark = be32_to_cpu(tunnel->parms.i_key); + ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); + skb->mark = orig_mark; + + if (!ret) return -EPERM; skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev))); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 104de4da3ff3..ff3bd863fa03 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -322,7 +322,6 @@ static int vti6_rcv(struct sk_buff *skb) } XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; - skb->mark = be32_to_cpu(t->parms.i_key); rcu_read_unlock(); @@ -342,6 +341,8 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) struct pcpu_sw_netstats *tstats; struct xfrm_state *x; struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6; + u32 orig_mark = skb->mark; + int ret; if (!t) return 1; @@ -358,7 +359,11 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) x = xfrm_input_state(skb); family = x->inner_mode->afinfo->family; - if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + skb->mark = be32_to_cpu(t->parms.i_key); + ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); + skb->mark = orig_mark; + + if (!ret) return -EPERM; skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev))); -- cgit v1.2.3 From 71d9f6149cac8fc6646adfb2a6f3b0de6ddd23f6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 May 2015 04:42:54 -0700 Subject: bridge: fix br_multicast_query_expired() bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit br_multicast_query_expired() querier argument is a pointer to a struct bridge_mcast_querier : struct bridge_mcast_querier { struct br_ip addr; struct net_bridge_port __rcu *port; }; Intent of the code was to clear port field, not the pointer to querier. Fixes: 2cd4143192e8 ("bridge: memorize and export selected IGMP/MLD querier port") Signed-off-by: Eric Dumazet Acked-by: Thadeu Lima de Souza Cascardo Acked-by: Linus Lüssing Cc: Linus Lüssing Cc: Steinar H. Gunderson Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index a3abe6ed111e..22fd0419b314 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1822,7 +1822,7 @@ static void br_multicast_query_expired(struct net_bridge *br, if (query->startup_sent < br->multicast_startup_query_count) query->startup_sent++; - RCU_INIT_POINTER(querier, NULL); + RCU_INIT_POINTER(querier->port, NULL); br_multicast_send_query(br, NULL, query); spin_unlock(&br->multicast_lock); } -- cgit v1.2.3 From beb39db59d14990e401e235faf66a6b9b31240b0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 30 May 2015 09:16:53 -0700 Subject: udp: fix behavior of wrong checksums We have two problems in UDP stack related to bogus checksums : 1) We return -EAGAIN to application even if receive queue is not empty. This breaks applications using edge trigger epoll() 2) Under UDP flood, we can loop forever without yielding to other processes, potentially hanging the host, especially on non SMP. This patch is an attempt to make things better. We might in the future add extra support for rt applications wanting to better control time spent doing a recv() in a hostile environment. For example we could validate checksums before queuing packets in socket receive queue. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 6 ++---- net/ipv6/udp.c | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d10b7e0112eb..1c92ea67baef 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1345,10 +1345,8 @@ csum_copy_err: } unlock_sock_fast(sk, slow); - if (noblock) - return -EAGAIN; - - /* starting over for a new packet */ + /* starting over for a new packet, but check if we need to yield */ + cond_resched(); msg->msg_flags &= ~MSG_TRUNC; goto try_again; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c2ec41617a35..e51fc3eee6db 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -525,10 +525,8 @@ csum_copy_err: } unlock_sock_fast(sk, slow); - if (noblock) - return -EAGAIN; - - /* starting over for a new packet */ + /* starting over for a new packet, but check if we need to yield */ + cond_resched(); msg->msg_flags &= ~MSG_TRUNC; goto try_again; } -- cgit v1.2.3 From 9f950415e4e28e7cfae2e416b43e862e8101d996 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 29 May 2015 13:47:07 -0400 Subject: tcp: fix child sockets to use system default congestion control if not set Linux 3.17 and earlier are explicitly engineered so that if the app doesn't specifically request a CC module on a listener before the SYN arrives, then the child gets the system default CC when the connection is established. See tcp_init_congestion_control() in 3.17 or earlier, which says "if no choice made yet assign the current value set as default". The change ("net: tcp: assign tcp cong_ops when tcp sk is created") altered these semantics, so that children got their parent listener's congestion control even if the system default had changed after the listener was created. This commit returns to those original semantics from 3.17 and earlier, since they are the original semantics from 2007 in 4d4d3d1e8 ("[TCP]: Congestion control initialization."), and some Linux congestion control workflows depend on that. In summary, if a listener socket specifically sets TCP_CONGESTION to "x", or the route locks the CC module to "x", then the child gets "x". Otherwise the child gets current system default from net.ipv4.tcp_congestion_control. That's the behavior in 3.17 and earlier, and this commit restores that. Fixes: 55d8694fa82c ("net: tcp: assign tcp cong_ops when tcp sk is created") Cc: Florian Westphal Cc: Daniel Borkmann Cc: Glenn Judd Cc: Stephen Hemminger Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 3 ++- net/ipv4/tcp_cong.c | 5 ++++- net/ipv4/tcp_minisocks.c | 5 ++++- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 497bc14cdb85..0320bbb7d7b5 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -98,7 +98,8 @@ struct inet_connection_sock { const struct tcp_congestion_ops *icsk_ca_ops; const struct inet_connection_sock_af_ops *icsk_af_ops; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); - __u8 icsk_ca_state:7, + __u8 icsk_ca_state:6, + icsk_ca_setsockopt:1, icsk_ca_dst_locked:1; __u8 icsk_retransmits; __u8 icsk_pending; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 7a5ae50c80c8..84be008c945c 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -187,6 +187,7 @@ static void tcp_reinit_congestion_control(struct sock *sk, tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = ca; + icsk->icsk_ca_setsockopt = 1; if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); @@ -335,8 +336,10 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) rcu_read_lock(); ca = __tcp_ca_find_autoload(name); /* No change asking for existing value */ - if (ca == icsk->icsk_ca_ops) + if (ca == icsk->icsk_ca_ops) { + icsk->icsk_ca_setsockopt = 1; goto out; + } if (!ca) err = -ENOENT; else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b5732a54f2ad..17e7339ee5ca 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -420,7 +420,10 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) rcu_read_unlock(); } - if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner)) + /* If no valid choice made yet, assign current system default ca. */ + if (!ca_got_dst && + (!icsk->icsk_ca_setsockopt || + !try_module_get(icsk->icsk_ca_ops->owner))) tcp_assign_congestion_control(sk); tcp_set_ca_state(sk, TCP_CA_Open); -- cgit v1.2.3 From 24595346d79b6bd98a77d24c493e8490639788fc Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 29 May 2015 10:29:46 -0700 Subject: net: dsa: Properly propagate errors from dsa_switch_setup_one While shuffling some code around, dsa_switch_setup_one() was introduced, and it was modified to return either an error code using ERR_PTR() or a NULL pointer when running out of memory or failing to setup a switch. This is a problem for its caler: dsa_switch_setup() which uses IS_ERR() and expects to find an error code, not a NULL pointer, so we still try to proceed with dsa_switch_setup() and operate on invalid memory addresses. This can be easily reproduced by having e.g: the bcm_sf2 driver built-in, but having no such switch, such that drv->setup will fail. Fix this by using PTR_ERR() consistently which is both more informative and avoids for the caller to use IS_ERR_OR_NULL(). Fixes: df197195a5248 ("net: dsa: split dsa_switch_setup into two functions") Reported-by: Andrew Lunn Signed-off-by: Florian Fainelli Tested-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/dsa.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index e6f6cc3a1bcf..392e29a0227d 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -359,7 +359,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, */ ds = kzalloc(sizeof(*ds) + drv->priv_size, GFP_KERNEL); if (ds == NULL) - return NULL; + return ERR_PTR(-ENOMEM); ds->dst = dst; ds->index = index; @@ -370,7 +370,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, ret = dsa_switch_setup_one(ds, parent); if (ret) - return NULL; + return ERR_PTR(ret); return ds; } -- cgit v1.2.3 From d26e2c9ffa385dd1b646f43c1397ba12af9ed431 Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Thu, 28 May 2015 10:26:18 +0200 Subject: Revert "netfilter: ensure number of counters is >0 in do_replace()" This partially reverts commit 1086bbe97a07 ("netfilter: ensure number of counters is >0 in do_replace()") in net/bridge/netfilter/ebtables.c. Setting rules with ebtables does not work any more with 1086bbe97a07 place. There is an error message and no rules set in the end. e.g. ~# ebtables -t nat -A POSTROUTING --src 12:34:56:78:9a:bc -j DROP Unable to update the kernel. Two possible causes: 1. Multiple ebtables programs were executing simultaneously. The ebtables userspace tool doesn't by default support multiple ebtables programs running Reverting the ebtables part of 1086bbe97a07 makes this work again. Signed-off-by: Bernhard Thaler Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 24c7c96bf5f8..91180a7fc943 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1117,8 +1117,6 @@ static int do_replace(struct net *net, const void __user *user, return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter)) return -ENOMEM; - if (tmp.num_counters == 0) - return -EINVAL; tmp.name[sizeof(tmp.name) - 1] = 0; @@ -2161,8 +2159,6 @@ static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl, return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter)) return -ENOMEM; - if (tmp.num_counters == 0) - return -EINVAL; memcpy(repl, &tmp, offsetof(struct ebt_replace, hook_entry)); -- cgit v1.2.3 From 18ec898ee54e03a9aab8b54db50cb2b36209d313 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 1 Jun 2015 14:43:50 -0700 Subject: Revert "net: core: 'ethtool' issue with querying phy settings" This reverts commit f96dee13b8e10f00840124255bed1d8b4c6afd6f. It isn't right, ethtool is meant to manage one PHY instance per netdevice at a time, and this is selected by the SET command. Therefore by definition the GET command must only return the settings for the configured and selected PHY. Reported-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/ethtool.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 1347e11f5cc9..1d00b8922902 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -359,15 +359,7 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) int err; struct ethtool_cmd cmd; - if (!dev->ethtool_ops->get_settings) - return -EOPNOTSUPP; - - if (copy_from_user(&cmd, useraddr, sizeof(cmd))) - return -EFAULT; - - cmd.cmd = ETHTOOL_GSET; - - err = dev->ethtool_ops->get_settings(dev, &cmd); + err = __ethtool_get_settings(dev, &cmd); if (err < 0) return err; -- cgit v1.2.3 From ccd740cbc6e01b2a08baa341867063ed2887f4b9 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 29 May 2015 11:28:26 -0700 Subject: vti6: Add pmtu handling to vti6_xmit. We currently rely on the PMTU discovery of xfrm. However if a packet is localy sent, the PMTU mechanism of xfrm tries to to local socket notification what might not work for applications like ping that don't check for this. So add pmtu handling to vti6_xmit to report MTU changes immediately. Signed-off-by: Steffen Klassert Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv6/ip6_vti.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index ff3bd863fa03..0224c032dca5 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -435,6 +435,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct net_device *tdev; struct xfrm_state *x; int err = -1; + int mtu; if (!dst) goto tx_err_link_failure; @@ -468,6 +469,19 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; + mtu = dst_mtu(dst); + if (!skb->ignore_df && skb->len > mtu) { + skb_dst(skb)->ops->update_pmtu(dst, NULL, skb, mtu); + + if (skb->protocol == htons(ETH_P_IPV6)) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + else + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + + return -EMSGSIZE; + } + err = dst_output(skb); if (net_xmit_eval(err) == 0) { struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); -- cgit v1.2.3