rework the quarantine: it doesn't really need to be accounted.
authorPierre Habouzit <pierre.habouzit@intersec.com>
Sat, 14 Jan 2012 20:36:45 +0000 (21:36 +0100)
committerPierre Habouzit <pierre.habouzit@intersec.com>
Sat, 14 Jan 2012 20:39:07 +0000 (21:39 +0100)
The notion of quarantine is purely virtual and we only care about it at
WAKE time. It significantly simplifies the fastpath of our code, namely
__pwqr_sb_update_state.

Also allow the WAKE command to directly unpark threads if that fits with
the concurrency level. I don't really expect userland to really use that,
but it doesn't break anything and makes sense.

Signed-off-by: Pierre Habouzit <pierre.habouzit@intersec.com>
Documentation/pwqr.adoc
kernel/pwqr.c

index 067b2ae..0c04746 100644 (file)
@@ -30,7 +30,8 @@ quarantined::
 +
 This state avoids waking a thread to force userland to "park" the thread, this
 is racy, make the scheduler work for nothing useful.  Though if `PWQR_WAKE` is
 +
 This state avoids waking a thread to force userland to "park" the thread, this
 is racy, make the scheduler work for nothing useful.  Though if `PWQR_WAKE` is
-called, quarantined threads are woken but with a `EDQUOT` errno set.
+called, quarantined threads are woken but with a `EDQUOT` errno set, and only
+one by one, no matter how wakes have been asked.
 +
 This state actually has only one impact: when `PWQR_WAKE` is called for more
 than one threads, for example 4, and that userland knows that there is 5
 +
 This state actually has only one impact: when `PWQR_WAKE` is called for more
 than one threads, for example 4, and that userland knows that there is 5
@@ -226,13 +227,27 @@ Valid values for the `op` argument are:
 
 `PWQR_WAKE`::
        Tries to wake `val` threads from the pool. This is done according to
 
 `PWQR_WAKE`::
        Tries to wake `val` threads from the pool. This is done according to
-       the current concurrency level not to overcommit. On success, the
-       number of woken threads is returned, it can be 0.
+       the current concurrency level not to overcommit. On success, a hint of
+       the number of woken threads is returned, it can be 0.
++
+This is only a hint of the number of threads woken up for two reasons. First,
+the kernel could really have woken up a thread, but when it becomes scheduled,
+it could *then* decide that it would overcommit (because some other thread
+unblocked inbetween for example), and block it again.
++
+But it can also lie in the other direction: userland is supposed to account
+for waiting threads. So when we're overcommiting and userland want a waiting
+thread to be unblocked, we actually say we woke none, but still unblock one
+(the famous quarantined threads we talk about above). This allow the userland
+counter of waiting threads to decrease, but we know the thread won't be usable
+so we return 0.
 
 `PWQR_WAKE_OC`::
        Tries to wake `val` threads from the pool. This is done bypassing the
        current concurrency level (`OC` stands for `OVERCOMMIT`). On success,
 
 `PWQR_WAKE_OC`::
        Tries to wake `val` threads from the pool. This is done bypassing the
        current concurrency level (`OC` stands for `OVERCOMMIT`). On success,
-       the number of woken threads is returned, it can be 0.
+       the number of woken threads is returned, it can be 0, but it's the
+       real count that has been (or will soon be) woken up. If it's less than
+       required, it's because there aren't enough parked threads.
 
 `PWQR_WAIT`::
        Puts the thread to wait for a future `PWQR_WAKE` command. If this
 
 `PWQR_WAIT`::
        Puts the thread to wait for a future `PWQR_WAKE` command. If this
index e4e3f58..8171596 100644 (file)
@@ -60,7 +60,6 @@ struct pwqr_sb {
 
        unsigned                running;
        unsigned                waiting;
 
        unsigned                running;
        unsigned                waiting;
-       unsigned                quarantined;
        unsigned                parked;
        unsigned                overcommit_wakes;
 
        unsigned                parked;
        unsigned                overcommit_wakes;
 
@@ -96,36 +95,16 @@ static struct preempt_ops   pwqr_preempt_noop_ops;
 
 static inline void __pwqr_sb_update_state(struct pwqr_sb *sb, int running_delta)
 {
 
 static inline void __pwqr_sb_update_state(struct pwqr_sb *sb, int running_delta)
 {
-       int overcommit;
-
        sb->running += running_delta;
        sb->running += running_delta;
-       overcommit = sb->running + sb->waiting - sb->concurrency;
-       if (overcommit == 0) {
+       if (sb->running > sb->concurrency) {
+               /* TODO see ../Documentation/pwqr.adoc */
+       } else if (sb->running == sb->concurrency) {
                /* do nothing */
                /* do nothing */
-       } else if (overcommit > 0) {
-               if (overcommit > sb->waiting) {
-                       sb->quarantined += sb->waiting;
-                       sb->waiting      = 0;
-               } else {
-                       sb->quarantined += overcommit;
-                       sb->waiting     -= overcommit;
-               }
-       } else {
-               unsigned undercommit = -overcommit;
-
-               if (undercommit < sb->quarantined) {
-                       sb->waiting     += undercommit;
-                       sb->quarantined -= undercommit;
-               } else if (sb->quarantined) {
-                       sb->waiting     += sb->quarantined;
-                       sb->quarantined  = 0;
-               } else if (sb->waiting == 0 && sb->parked) {
-                       if (!timer_pending(&sb->timer)) {
-                               mod_timer(&sb->timer, jiffies +
-                                         PWQR_UNPARK_DELAY);
-                       }
-                       return;
+       } else if (sb->waiting == 0 && sb->parked) {
+               if (!timer_pending(&sb->timer)) {
+                       mod_timer(&sb->timer, jiffies + PWQR_UNPARK_DELAY);
                }
                }
+               return;
        }
 
        if (timer_pending(&sb->timer))
        }
 
        if (timer_pending(&sb->timer))
@@ -380,7 +359,7 @@ static int pwqr_release(struct inode *inode, struct file *filp)
 
 static long
 do_pwqr_wait(struct pwqr_sb *sb, struct pwqr_task *pwqt,
 
 static long
 do_pwqr_wait(struct pwqr_sb *sb, struct pwqr_task *pwqt,
-           int in_pool, struct pwqr_ioc_wait __user *arg)
+           int is_wait, struct pwqr_ioc_wait __user *arg)
 {
        unsigned long flags;
        struct pwqr_ioc_wait wait;
 {
        unsigned long flags;
        struct pwqr_ioc_wait wait;
@@ -389,14 +368,14 @@ do_pwqr_wait(struct pwqr_sb *sb, struct pwqr_task *pwqt,
 
        preempt_notifier_unregister(&pwqt->notifier);
 
 
        preempt_notifier_unregister(&pwqt->notifier);
 
-       if (in_pool && copy_from_user(&wait, arg, sizeof(wait))) {
+       if (is_wait && copy_from_user(&wait, arg, sizeof(wait))) {
                rc = -EFAULT;
                goto out;
        }
 
        pwqr_sb_lock_irqsave(sb, flags);
        if (sb->running + sb->waiting <= sb->concurrency) {
                rc = -EFAULT;
                goto out;
        }
 
        pwqr_sb_lock_irqsave(sb, flags);
        if (sb->running + sb->waiting <= sb->concurrency) {
-               if (in_pool) {
+               if (is_wait) {
                        while (probe_kernel_address(wait.pwqr_uaddr, uval)) {
                                pwqr_sb_unlock_irqrestore(sb, flags);
                                rc = get_user(uval, (u32 *)wait.pwqr_uaddr);
                        while (probe_kernel_address(wait.pwqr_uaddr, uval)) {
                                pwqr_sb_unlock_irqrestore(sb, flags);
                                rc = get_user(uval, (u32 *)wait.pwqr_uaddr);
@@ -410,7 +389,6 @@ do_pwqr_wait(struct pwqr_sb *sb, struct pwqr_task *pwqt,
                                goto out_unlock;
                        }
                } else {
                                goto out_unlock;
                        }
                } else {
-                       BUG_ON(sb->quarantined != 0);
                        goto out_unlock;
                }
        }
                        goto out_unlock;
                }
        }
@@ -421,7 +399,7 @@ do_pwqr_wait(struct pwqr_sb *sb, struct pwqr_task *pwqt,
 
                __wait.flags |= WQ_FLAG_EXCLUSIVE;
 
 
                __wait.flags |= WQ_FLAG_EXCLUSIVE;
 
-               if (in_pool) {
+               if (is_wait) {
                        sb->waiting++;
                        __add_wait_queue(&sb->wqh, &__wait);
                } else {
                        sb->waiting++;
                        __add_wait_queue(&sb->wqh, &__wait);
                } else {
@@ -441,7 +419,7 @@ do_pwqr_wait(struct pwqr_sb *sb, struct pwqr_task *pwqt,
                        spin_unlock_irq(&sb->wqh.lock);
                        schedule();
                        spin_lock_irq(&sb->wqh.lock);
                        spin_unlock_irq(&sb->wqh.lock);
                        schedule();
                        spin_lock_irq(&sb->wqh.lock);
-                       if (in_pool && sb->waiting)
+                       if (is_wait)
                                break;
                        if (sb->running + sb->waiting < sb->concurrency)
                                break;
                                break;
                        if (sb->running + sb->waiting < sb->concurrency)
                                break;
@@ -450,13 +428,8 @@ do_pwqr_wait(struct pwqr_sb *sb, struct pwqr_task *pwqt,
                __remove_wait_queue(&sb->wqh, &__wait);
                __set_current_state(TASK_RUNNING);
 
                __remove_wait_queue(&sb->wqh, &__wait);
                __set_current_state(TASK_RUNNING);
 
-               if (in_pool) {
-                       if (sb->waiting) {
-                               sb->waiting--;
-                       } else {
-                               BUG_ON(!sb->quarantined);
-                               sb->quarantined--;
-                       }
+               if (is_wait) {
+                       sb->waiting--;
                } else {
                        sb->parked--;
                }
                } else {
                        sb->parked--;
                }
@@ -515,7 +488,7 @@ static long do_pwqr_wake(struct pwqr_sb *sb, int oc, int count)
        pwqr_sb_lock_irqsave(sb, flags);
 
        if (oc) {
        pwqr_sb_lock_irqsave(sb, flags);
 
        if (oc) {
-               nwake = sb->waiting + sb->quarantined + sb->parked - sb->overcommit_wakes;
+               nwake = sb->waiting + sb->parked - sb->overcommit_wakes;
                if (count > nwake) {
                        count = nwake;
                } else {
                if (count > nwake) {
                        count = nwake;
                } else {
@@ -524,6 +497,10 @@ static long do_pwqr_wake(struct pwqr_sb *sb, int oc, int count)
                sb->overcommit_wakes += count;
        } else if (sb->running + sb->overcommit_wakes < sb->concurrency) {
                nwake = sb->concurrency - sb->overcommit_wakes - sb->running;
                sb->overcommit_wakes += count;
        } else if (sb->running + sb->overcommit_wakes < sb->concurrency) {
                nwake = sb->concurrency - sb->overcommit_wakes - sb->running;
+               if (nwake > sb->waiting + sb->parked - sb->overcommit_wakes) {
+                       nwake = sb->waiting + sb->parked -
+                               sb->overcommit_wakes;
+               }
                if (count > nwake) {
                        count = nwake;
                } else {
                if (count > nwake) {
                        count = nwake;
                } else {
@@ -531,23 +508,25 @@ static long do_pwqr_wake(struct pwqr_sb *sb, int oc, int count)
                }
        } else {
                /*
                }
        } else {
                /*
-                * This codepath deserves an explanation: when the thread is
-                * quarantined, for us, really, it's already "parked". Though
-                * userland doesn't know about, so wake as many threads as
-                * userlands would have liked to, and let the wakeup tell
-                * userland those should be parked.
+                * This codepath deserves an explanation: waking the thread
+                * "for real" would overcommit, though userspace KNOWS there
+                * is at least one waiting thread. Such threads are threads
+                * that are "quarantined".
+                *
+                * Quarantined threads are woken up one by one, to allow a
+                * slow ramp down, trying to minimize "waiting" <-> "parked"
+                * flip-flops, no matter how many wakes have been asked.
                 *
                 *
-                * That's why we lie about the number of woken threads,
-                * really, userlandwise we woke up a thread so that it could
-                * be parked for real and avoid spurious syscalls. So it's as
-                * if we woke up 0 threads.
+                * Since releasing one quarantined thread will wake up a
+                * thread that will (almost) straight go to parked mode, lie
+                * to userland about the fact that we unblocked that thread,
+                * and return 0.
+                *
+                * Though if we're already waking all waiting threads for
+                * overcommitting jobs, well, we don't need that.
                 */
                 */
-               nwake = sb->quarantined;
-               if (sb->waiting < sb->overcommit_wakes)
-                       nwake -= sb->overcommit_wakes - sb->waiting;
-               if (nwake > count)
-                       nwake = count;
                count = 0;
                count = 0;
+               nwake = sb->waiting > sb->overcommit_wakes;
        }
        while (nwake-- > 0)
                wake_up_locked(&sb->wqh);
        }
        while (nwake-- > 0)
                wake_up_locked(&sb->wqh);