From b9a3b1102bc80b4044224494100f67de132d5448 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 13 Aug 2008 15:18:38 -0700 Subject: pkt_sched: Fix queue quiescence testing in dev_deactivate(). Based upon discussions with Jarek P. and Herbert Xu. First, we're testing the wrong qdisc. We just reset the device queue qdiscs to &noop_qdisc and checking it's state is completely pointless here. We want to wait until the previous qdisc that was sitting at the ->qdisc pointer is not busy any more. And that would be ->qdisc_sleeping. Because of how we propagate the samples qdisc pointer down into qdisc_run and friends via per-cpu ->output_queue and netif_schedule, we have to wait also for the __QDISC_STATE_SCHED bit to clear as well. Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net/sched/sch_generic.c') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7cf83b37459d..468574682caa 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -647,7 +647,7 @@ static void dev_deactivate_queue(struct net_device *dev, } } -static bool some_qdisc_is_running(struct net_device *dev, int lock) +static bool some_qdisc_is_busy(struct net_device *dev, int lock) { unsigned int i; @@ -658,13 +658,14 @@ static bool some_qdisc_is_running(struct net_device *dev, int lock) int val; dev_queue = netdev_get_tx_queue(dev, i); - q = dev_queue->qdisc; + q = dev_queue->qdisc_sleeping; root_lock = qdisc_lock(q); if (lock) spin_lock_bh(root_lock); - val = test_bit(__QDISC_STATE_RUNNING, &q->state); + val = (test_bit(__QDISC_STATE_RUNNING, &q->state) || + test_bit(__QDISC_STATE_SCHED, &q->state)); if (lock) spin_unlock_bh(root_lock); @@ -689,14 +690,14 @@ void dev_deactivate(struct net_device *dev) /* Wait for outstanding qdisc_run calls. */ do { - while (some_qdisc_is_running(dev, 0)) + while (some_qdisc_is_busy(dev, 0)) yield(); /* * Double-check inside queue lock to ensure that all effects * of the queue run are visible when we return. */ - running = some_qdisc_is_running(dev, 1); + running = some_qdisc_is_busy(dev, 1); /* * The running flag should never be set at this point because -- cgit v1.2.3 From a9312ae89324438b0edc554eb36c3ec6bf927d04 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 17 Aug 2008 21:51:03 -0700 Subject: pkt_sched: Add 'deactivated' state. This new state lets dev_deactivate() mark a qdisc as having been deactivated. dev_queue_xmit() and ing_filter() check for this bit and do not try to process the qdisc if the bit is set. dev_deactivate() polls the qdisc after setting the bit, waiting for both __QDISC_STATE_RUNNING and __QDISC_STATE_SCHED to clear. This isn't perfect yet, but subsequent changesets will make it so. This part is just one piece of the puzzle. Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + net/core/dev.c | 9 ++++++++- net/sched/sch_generic.c | 6 ++++++ 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'net/sched/sch_generic.c') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index a7abfda3e447..757ab087adbf 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -27,6 +27,7 @@ enum qdisc_state_t { __QDISC_STATE_RUNNING, __QDISC_STATE_SCHED, + __QDISC_STATE_DEACTIVATED, }; struct qdisc_size_table { diff --git a/net/core/dev.c b/net/core/dev.c index 600bb23c4c2e..d9e31f63aded 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1800,6 +1800,12 @@ gso: spin_lock(root_lock); + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + spin_unlock(root_lock); + rc = NET_XMIT_DROP; + goto out_kfree_skb; + } + rc = qdisc_enqueue_root(skb, q); qdisc_run(q); @@ -2084,7 +2090,8 @@ static int ing_filter(struct sk_buff *skb) q = rxq->qdisc; if (q != &noop_qdisc) { spin_lock(qdisc_lock(q)); - result = qdisc_enqueue_root(skb, q); + if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) + result = qdisc_enqueue_root(skb, q); spin_unlock(qdisc_lock(q)); } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 468574682caa..ff1c4557e5f8 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -597,6 +597,9 @@ static void transition_one_qdisc(struct net_device *dev, struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping; int *need_watchdog_p = _need_watchdog; + if (!(new_qdisc->flags & TCQ_F_BUILTIN)) + clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); + rcu_assign_pointer(dev_queue->qdisc, new_qdisc); if (need_watchdog_p && new_qdisc != &noqueue_qdisc) *need_watchdog_p = 1; @@ -640,6 +643,9 @@ static void dev_deactivate_queue(struct net_device *dev, if (qdisc) { spin_lock_bh(qdisc_lock(qdisc)); + if (!(qdisc->flags & TCQ_F_BUILTIN)) + set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); + dev_queue->qdisc = qdisc_default; qdisc_reset(qdisc); -- cgit v1.2.3 From 4335cd2da1e8986fa8aff21a91144d986cb0a5fc Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 17 Aug 2008 21:58:07 -0700 Subject: pkt_sched: Simplify dev_deactivate() polling loop. The condition under which the previous qdisc has no more references after we've attached &noop_qdisc is that both RUNNING and SCHED are both seen clear while holding the root lock. So just make specifically that check in the polling loop, instead of this overly complex "check without then check with lock held" sequence. Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 31 +++++-------------------------- 1 file changed, 5 insertions(+), 26 deletions(-) (limited to 'net/sched/sch_generic.c') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ff1c4557e5f8..30b76aec723b 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -653,7 +653,7 @@ static void dev_deactivate_queue(struct net_device *dev, } } -static bool some_qdisc_is_busy(struct net_device *dev, int lock) +static bool some_qdisc_is_busy(struct net_device *dev) { unsigned int i; @@ -667,14 +667,12 @@ static bool some_qdisc_is_busy(struct net_device *dev, int lock) q = dev_queue->qdisc_sleeping; root_lock = qdisc_lock(q); - if (lock) - spin_lock_bh(root_lock); + spin_lock_bh(root_lock); val = (test_bit(__QDISC_STATE_RUNNING, &q->state) || test_bit(__QDISC_STATE_SCHED, &q->state)); - if (lock) - spin_unlock_bh(root_lock); + spin_unlock_bh(root_lock); if (val) return true; @@ -684,8 +682,6 @@ static bool some_qdisc_is_busy(struct net_device *dev, int lock) void dev_deactivate(struct net_device *dev) { - bool running; - netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc); dev_deactivate_queue(dev, &dev->rx_queue, &noop_qdisc); @@ -695,25 +691,8 @@ void dev_deactivate(struct net_device *dev) synchronize_rcu(); /* Wait for outstanding qdisc_run calls. */ - do { - while (some_qdisc_is_busy(dev, 0)) - yield(); - - /* - * Double-check inside queue lock to ensure that all effects - * of the queue run are visible when we return. - */ - running = some_qdisc_is_busy(dev, 1); - - /* - * The running flag should never be set at this point because - * we've already set dev->qdisc to noop_qdisc *inside* the same - * pair of spin locks. That is, if any qdisc_run starts after - * our initial test it should see the noop_qdisc and then - * clear the RUNNING bit before dropping the queue lock. So - * if it is set here then we've found a bug. - */ - } while (WARN_ON_ONCE(running)); + while (some_qdisc_is_busy(dev)) + yield(); } static void dev_init_scheduler_queue(struct net_device *dev, -- cgit v1.2.3 From 1e0d5a5747772182d1bb2525d8153da640fdfb58 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 17 Aug 2008 22:31:26 -0700 Subject: pkt_sched: No longer destroy qdiscs from RCU. We can now kill them synchronously with all of the previous dev_deactivate() cures. This makes netdev destruction and shutdown saner as the qdiscs hold references to the device. Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 - net/sched/sch_generic.c | 27 +++++++++------------------ 2 files changed, 9 insertions(+), 19 deletions(-) (limited to 'net/sched/sch_generic.c') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 757ab087adbf..84d25f2e6188 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -61,7 +61,6 @@ struct Qdisc struct gnet_stats_basic bstats; struct gnet_stats_queue qstats; struct gnet_stats_rate_est rate_est; - struct rcu_head q_rcu; int (*reshape_fail)(struct sk_buff *skb, struct Qdisc *q); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 30b76aec723b..6f96b7bc0809 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -518,14 +518,19 @@ void qdisc_reset(struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_reset); -/* this is the rcu callback function to clean up a qdisc when there - * are no further references to it */ +/* Under qdisc_lock(qdisc) and BH! */ -static void __qdisc_destroy(struct rcu_head *head) +void qdisc_destroy(struct Qdisc *qdisc) { - struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu); const struct Qdisc_ops *ops = qdisc->ops; + if (qdisc->flags & TCQ_F_BUILTIN || + !atomic_dec_and_test(&qdisc->refcnt)) + return; + + if (qdisc->parent) + list_del(&qdisc->list); + #ifdef CONFIG_NET_SCHED qdisc_put_stab(qdisc->stab); #endif @@ -542,20 +547,6 @@ static void __qdisc_destroy(struct rcu_head *head) kfree((char *) qdisc - qdisc->padded); } - -/* Under qdisc_lock(qdisc) and BH! */ - -void qdisc_destroy(struct Qdisc *qdisc) -{ - if (qdisc->flags & TCQ_F_BUILTIN || - !atomic_dec_and_test(&qdisc->refcnt)) - return; - - if (qdisc->parent) - list_del(&qdisc->list); - - call_rcu(&qdisc->q_rcu, __qdisc_destroy); -} EXPORT_SYMBOL(qdisc_destroy); static bool dev_all_qdisc_sleeping_noop(struct net_device *dev) -- cgit v1.2.3 From 4d8863a29c4755a0461cd31b6865026187d6c43a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 18 Aug 2008 21:03:15 -0700 Subject: pkt_sched: Don't hold qdisc lock over qdisc_destroy(). Based upon reports by Denys Fedoryshchenko, and feedback and help from Jarek Poplawski and Herbert Xu. We always either: 1) Never made an external reference to this qdisc. or 2) Did a dev_deactivate() which purged all asynchronous references. So do not lock the qdisc when we call qdisc_destroy(), it's illegal anyways as when we drop the lock this is free'd memory. Signed-off-by: David S. Miller --- net/sched/sch_api.c | 13 ++----------- net/sched/sch_generic.c | 6 ------ 2 files changed, 2 insertions(+), 17 deletions(-) (limited to 'net/sched/sch_generic.c') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 7d7070b1eebd..d91a2338877c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -638,11 +638,8 @@ static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid if (new || old) qdisc_notify(skb, n, clid, old, new); - if (old) { - sch_tree_lock(old); + if (old) qdisc_destroy(old); - sch_tree_unlock(old); - } } /* Graft qdisc "new" to class "classid" of qdisc "parent" or @@ -1092,16 +1089,10 @@ create_n_graft: graft: if (1) { - spinlock_t *root_lock; - err = qdisc_graft(dev, p, skb, n, clid, q, NULL); if (err) { - if (q) { - root_lock = qdisc_root_lock(q); - spin_lock_bh(root_lock); + if (q) qdisc_destroy(q); - spin_unlock_bh(root_lock); - } return err; } } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 6f96b7bc0809..c3ed4d44fc14 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -518,8 +518,6 @@ void qdisc_reset(struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_reset); -/* Under qdisc_lock(qdisc) and BH! */ - void qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; @@ -712,14 +710,10 @@ static void shutdown_scheduler_queue(struct net_device *dev, struct Qdisc *qdisc_default = _qdisc_default; if (qdisc) { - spinlock_t *root_lock = qdisc_lock(qdisc); - dev_queue->qdisc = qdisc_default; dev_queue->qdisc_sleeping = qdisc_default; - spin_lock_bh(root_lock); qdisc_destroy(qdisc); - spin_unlock_bh(root_lock); } } -- cgit v1.2.3 From f6e0b239a2657ea8cb67f0d83d0bfdbfd19a481b Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Fri, 22 Aug 2008 03:24:05 -0700 Subject: pkt_sched: Fix qdisc list locking Since some qdiscs call qdisc_tree_decrease_qlen() (so qdisc_lookup()) without rtnl_lock(), adding and deleting from a qdisc list needs additional locking. This patch adds global spinlock qdisc_list_lock and wrapper functions for modifying the list. It is considered as a temporary solution until hfsc_dequeue(), netem_dequeue() and tbf_dequeue() (or qdisc_tree_decrease_qlen()) are redone. With feedback from Herbert Xu and David S. Miller. Signed-off-by: Jarek Poplawski Acked-by: Herbert Xu Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 1 + net/sched/sch_api.c | 44 +++++++++++++++++++++++++++++++++++++++----- net/sched/sch_generic.c | 5 ++--- 3 files changed, 42 insertions(+), 8 deletions(-) (limited to 'net/sched/sch_generic.c') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 853fe83d9f37..b786a5b09253 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -78,6 +78,7 @@ extern struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, extern int register_qdisc(struct Qdisc_ops *qops); extern int unregister_qdisc(struct Qdisc_ops *qops); +extern void qdisc_list_del(struct Qdisc *q); extern struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); extern struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle); extern struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 45f442d7de47..e7fb9e0d21b4 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -199,19 +199,53 @@ struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) return NULL; } +/* + * This lock is needed until some qdiscs stop calling qdisc_tree_decrease_qlen() + * without rtnl_lock(); currently hfsc_dequeue(), netem_dequeue(), tbf_dequeue() + */ +static DEFINE_SPINLOCK(qdisc_list_lock); + +static void qdisc_list_add(struct Qdisc *q) +{ + if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { + spin_lock_bh(&qdisc_list_lock); + list_add_tail(&q->list, &qdisc_root_sleeping(q)->list); + spin_unlock_bh(&qdisc_list_lock); + } +} + +void qdisc_list_del(struct Qdisc *q) +{ + if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { + spin_lock_bh(&qdisc_list_lock); + list_del(&q->list); + spin_unlock_bh(&qdisc_list_lock); + } +} +EXPORT_SYMBOL(qdisc_list_del); + struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) { unsigned int i; + struct Qdisc *q; + + spin_lock_bh(&qdisc_list_lock); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - struct Qdisc *q, *txq_root = txq->qdisc_sleeping; + struct Qdisc *txq_root = txq->qdisc_sleeping; q = qdisc_match_from_root(txq_root, handle); if (q) - return q; + goto unlock; } - return qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle); + + q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle); + +unlock: + spin_unlock_bh(&qdisc_list_lock); + + return q; } static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) @@ -810,8 +844,8 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, goto err_out3; } } - if ((parent != TC_H_ROOT) && !(sch->flags & TCQ_F_INGRESS)) - list_add_tail(&sch->list, &dev_queue->qdisc_sleeping->list); + + qdisc_list_add(sch); return sch; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index c3ed4d44fc14..5f0ade7806a7 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -526,10 +526,9 @@ void qdisc_destroy(struct Qdisc *qdisc) !atomic_dec_and_test(&qdisc->refcnt)) return; - if (qdisc->parent) - list_del(&qdisc->list); - #ifdef CONFIG_NET_SCHED + qdisc_list_del(qdisc); + qdisc_put_stab(qdisc->stab); #endif gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); -- cgit v1.2.3 From f7a54c13c7b072d9426bd5cec1cdb8306df5ef55 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Wed, 27 Aug 2008 02:22:07 -0700 Subject: pkt_sched: Use rcu_assign_pointer() to change dev_queue->qdisc These pointers are RCU protected, so proper primitives should be used. Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/sched/sch_api.c | 2 +- net/sched/sch_generic.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/sched/sch_generic.c') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 341d558b6e39..ad9cda1b8c0a 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -635,7 +635,7 @@ static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, if (qdisc == NULL) qdisc = &noop_qdisc; dev_queue->qdisc_sleeping = qdisc; - dev_queue->qdisc = &noop_qdisc; + rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); spin_unlock_bh(root_lock); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 5f0ade7806a7..9634091ee2f0 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -634,7 +634,7 @@ static void dev_deactivate_queue(struct net_device *dev, if (!(qdisc->flags & TCQ_F_BUILTIN)) set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); - dev_queue->qdisc = qdisc_default; + rcu_assign_pointer(dev_queue->qdisc, qdisc_default); qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); @@ -709,7 +709,7 @@ static void shutdown_scheduler_queue(struct net_device *dev, struct Qdisc *qdisc_default = _qdisc_default; if (qdisc) { - dev_queue->qdisc = qdisc_default; + rcu_assign_pointer(dev_queue->qdisc, qdisc_default); dev_queue->qdisc_sleeping = qdisc_default; qdisc_destroy(qdisc); -- cgit v1.2.3