/*
 * Copyright (C) 2012   Pierre Habouzit <pierre.habouzit@intersec.com>
 * Copyright (C) 2012   Intersec SAS
 *
 * This file implements the Linux Pthread Workqueue Regulator, and is part
 * of the linux kernel.
 *
 * The Linux Kernel is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * The Linux Kernel is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License version 2
 * along with The Linux Kernel.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/kref.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <linux/wait.h>
#include <linux/version.h>

/*
 * The pthread workqueue regulator code is, for now, written as a
 * proof-of-concept module, meant to work with 2.6.23+ kernels or RedHat 5
 * ones.
 *
 * For now it uses a device, /dev/pwq, which spawns magic file descriptors
 * supporting a few ioctl operations (see Documentation/pwqr.adoc shipped in
 * the same git repository). An illustrative userspace usage sketch follows
 * this comment.
 *
 * This code is meant to be merged into mainline, but only after the
 * following changes, kept here as a todo list:
 *
 *   - get rid of the device stuff (which is 100% of the init code for 2.6.23
 *     kernels);
 *
 *   - resubmit the patch that makes it possible to call
 *     preempt_notifier_unregister from sched_in/sched_out (just a matter of
 *     using hlist_for_each_safe instead of hlist_for_each), and fix
 *     pwqr_task_release to not require RCU anymore. That makes
 *     pwqr_preempt_noop_ops go away.
 *
 *   - think about the possibility of adding a pwq_notifier pointer directly
 *     into the task_struct. Though it's not *that* necessary, as it grows
 *     the structure for a speed gain we don't really need (making pwqr_ctl
 *     faster). I think it's okay to crawl the preempt_notifier list instead.
 *     We may want to add nice "macros" for that though.
 *
 *   - replace the ioctl with a pwqr_ctl syscall;
 *
 *   - create a pwqr_create() syscall to create a pwqr file descriptor.
 *
 * Summary: most of the code should be untouched or only barely changed,
 * pwqr_ioctl adapted to become a syscall, and the module boilerplate
 * replaced with pwqr_create() and file-descriptor creation boilerplate
 * instead. Looking at fs/eventfd.c, this looks rather simple.
 */
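
/*
 * Illustrative userspace usage. This is a sketch only, not authoritative:
 * the exact PWQR_CTL_* encodings and the pwqr_ioc_wait field types live in
 * pwqr.h, and all error handling is elided.
 *
 *      int fd = open("/dev/pwq", O_RDWR);
 *
 *      ioctl(fd, PWQR_CTL_SET_CONC, 4);        // aim for 4 running threads
 *      ioctl(fd, PWQR_CTL_REGISTER);           // account the calling thread
 *
 *      struct pwqr_ioc_wait w = {
 *              .pwqr_ticket = ticket,          // value read beforehand
 *              .pwqr_uaddr  = &futex_word,     // only blocks while
 *      };                                      //   *uaddr == ticket
 *      ioctl(fd, PWQR_CTL_WAIT, &w);           // block until work is due
 *
 *      ioctl(fd, PWQR_CTL_PARK);               // idle-pool variant of WAIT
 *      ioctl(fd, PWQR_CTL_UNREGISTER);
 *
 * read() and poll(POLLIN) report overcommit: read() fills a u32 with how
 * many more threads are running than the configured concurrency.
 */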

#ifndef CONFIG_PREEMPT_NOTIFIERS
#  error PWQ module requires CONFIG_PREEMPT_NOTIFIERS
#else

#include "pwqr.h"

#define PWQR_UC_DELAY           (HZ / 10)
#define PWQR_OC_DELAY           (HZ / 20)

#define PWQR_STATE_NONE         0
#define PWQR_STATE_UC           1
#define PWQR_STATE_OC           2
#define PWQR_STATE_DEAD         (-1)

/*
 * CONFIG_PREEMPT_NOTIFIERS was first included in kernel 2.6.23.
 *
 * I want this to work on older RedHat 5 kernels too. Those emulate the
 * feature, but implement it differently: instead of linking the
 * preempt_notifiers from the task_struct directly, they keep a private
 * hash table I don't have access to, so I need my own as well.
 *
 * For vanilla kernels we crawl the task_struct::preempt_notifiers hlist
 * until we find our entry. This list is usually very short, and it's no
 * slower than the global hash table, which also ends up crawling a list.
 */
#define IS_PRE_2_6_23    (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23))

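/*
 * Scoreboard: one per open pwqr file descriptor. All the counters below are
 * protected by wqh.lock (see the pwqr_sb_lock_irqsave() macros):
 *
 *   - concurrency:      target number of running threads, defaults to
 *                       num_online_cpus();
 *   - registered:       number of threads attached to this scoreboard;
 *   - running/waiting/parked: threads currently in each of those states;
 *   - overcommit_wakes: wakes granted on top of the concurrency target;
 *   - state:            PWQR_STATE_{NONE,UC,OC,DEAD};
 *   - has_pollin:       an overcommit event is pending for read()/poll().
 */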
struct pwqr_sb {
        struct kref             kref;
        struct rcu_head         rcu;
        struct timer_list       timer;
        wait_queue_head_t       wqh;
        wait_queue_head_t       wqh_poll;

        unsigned                concurrency;
        unsigned                registered;

        unsigned                running;
        unsigned                waiting;
        unsigned                parked;
        unsigned                overcommit_wakes;

        int                     state;
        unsigned                has_pollin;
};

struct pwqr_task {
        struct preempt_notifier notifier;
        struct pwqr_sb         *sb;
        struct rcu_head         rcu;
#if IS_PRE_2_6_23
        struct hlist_node       link;
        struct task_struct     *task;
#endif
};

#if IS_PRE_2_6_23

#define PWQR_HASH_BITS          5
#define PWQR_HASH_SIZE          (1 << PWQR_HASH_BITS)

struct pwqr_task_bucket {
        spinlock_t              lock;
        struct hlist_head       tasks;
};

static struct pwqr_task_bucket  pwqr_tasks_hash[PWQR_HASH_SIZE];
#endif

/*
 * Global variables
 */
static struct class            *pwqr_class;
static int                      pwqr_major;
static struct preempt_ops       pwqr_preempt_running_ops;
static struct preempt_ops       pwqr_preempt_blocked_ops;
static struct preempt_ops       pwqr_preempt_noop_ops;

/*****************************************************************************
 * Scoreboards
 */

#define pwqr_sb_lock_irqsave(sb, flags) \
        spin_lock_irqsave(&(sb)->wqh.lock, flags)
#define pwqr_sb_unlock_irqrestore(sb, flags) \
        spin_unlock_irqrestore(&(sb)->wqh.lock, flags)

static inline void pwqr_arm_timer(struct pwqr_sb *sb, int how, int delay)
{
        if (timer_pending(&sb->timer) && sb->state == how)
                return;
        mod_timer(&sb->timer, jiffies + delay);
        sb->state = how;
}

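/*
 * Core accounting helper; the caller must hold sb->wqh.lock. Applies the
 * delta to sb->running, then arms the undercommit (UC) or overcommit (OC)
 * timer, or cancels any pending timer when the scoreboard is balanced.
 */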
static inline void __pwqr_sb_update_state(struct pwqr_sb *sb, int running_delta)
{
        sb->running += running_delta;

        if (sb->running < sb->concurrency && sb->waiting == 0 && sb->parked) {
                pwqr_arm_timer(sb, PWQR_STATE_UC, PWQR_UC_DELAY);
        } else if (sb->running > sb->concurrency) {
                pwqr_arm_timer(sb, PWQR_STATE_OC, PWQR_OC_DELAY);
        } else {
                sb->state = PWQR_STATE_NONE;
                del_timer(&sb->timer);
        }
}

static void pwqr_sb_timer_cb(unsigned long arg)
{
        struct pwqr_sb *sb = (struct pwqr_sb *)arg;
        unsigned long flags;

        pwqr_sb_lock_irqsave(sb, flags);
        if (sb->running < sb->concurrency && sb->waiting == 0 && sb->parked) {
                if (sb->overcommit_wakes == 0)
                        wake_up_locked(&sb->wqh);
        }
        if (sb->running > sb->concurrency) {
                printk(KERN_DEBUG "pwqr: wake up poll\n");
                wake_up_poll(&sb->wqh_poll, POLLIN);
                sb->has_pollin = 1;
        }
        pwqr_sb_unlock_irqrestore(sb, flags);
}

static struct pwqr_sb *pwqr_sb_create(void)
{
        struct pwqr_sb *sb;

        sb = kzalloc(sizeof(struct pwqr_sb), GFP_KERNEL);
        if (sb == NULL)
                return ERR_PTR(-ENOMEM);

        kref_init(&sb->kref);
        init_waitqueue_head(&sb->wqh);
        init_waitqueue_head(&sb->wqh_poll);
        sb->concurrency    = num_online_cpus();
        init_timer(&sb->timer);
        sb->timer.function = pwqr_sb_timer_cb;
        sb->timer.data     = (unsigned long)sb;

        __module_get(THIS_MODULE);
        return sb;
}

static inline void pwqr_sb_get(struct pwqr_sb *sb)
{
        kref_get(&sb->kref);
}

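/*
 * The scoreboard is freed through RCU: a preempt notifier racing with
 * pwqr_task_detach() may still dereference its sb pointer after the last
 * reference is dropped, so the actual kfree() waits for a grace period.
 */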
static void pwqr_sb_finalize(struct rcu_head *rcu)
{
        struct pwqr_sb *sb = container_of(rcu, struct pwqr_sb, rcu);

        module_put(THIS_MODULE);
        kfree(sb);
}

static void pwqr_sb_release(struct kref *kref)
{
        struct pwqr_sb *sb = container_of(kref, struct pwqr_sb, kref);

        del_timer_sync(&sb->timer);
        wake_up_poll(&sb->wqh_poll, POLLHUP);
        call_rcu(&sb->rcu, pwqr_sb_finalize);
}

static inline void pwqr_sb_put(struct pwqr_sb *sb)
{
        kref_put(&sb->kref, pwqr_sb_release);
}

/*****************************************************************************
 * tasks
 */
#if IS_PRE_2_6_23
static inline struct pwqr_task_bucket *task_hbucket(struct task_struct *task)
{
        return &pwqr_tasks_hash[hash_ptr(task, PWQR_HASH_BITS)];
}

static struct pwqr_task *pwqr_task_find(struct task_struct *task)
{
        struct pwqr_task_bucket *b = task_hbucket(task);
        struct hlist_node *node;
        struct pwqr_task *it, *pwqt = NULL;

        spin_lock(&b->lock);
        hlist_for_each_entry(it, node, &b->tasks, link) {
                if (it->task == task) {
                        pwqt = it;
                        break;
                }
        }
        spin_unlock(&b->lock);
        return pwqt;
}
#else
static struct pwqr_task *pwqr_task_find(struct task_struct *task)
{
        struct hlist_node       *node;
        struct preempt_notifier *it;
        struct pwqr_task        *pwqt = NULL;

        hlist_for_each_entry(it, node, &task->preempt_notifiers, link) {
                if (it->ops == &pwqr_preempt_running_ops ||
                    it->ops == &pwqr_preempt_blocked_ops ||
                    it->ops == &pwqr_preempt_noop_ops)
                {
                        pwqt = container_of(it, struct pwqr_task, notifier);
                        break;
                }
        }

        return pwqt;
}
#endif

static struct pwqr_task *pwqr_task_create(struct task_struct *task)
{
        struct pwqr_task *pwqt;

        pwqt = kmalloc(sizeof(*pwqt), GFP_KERNEL);
        if (pwqt == NULL)
                return ERR_PTR(-ENOMEM);

        /* Start with the noop ops: pwqt->sb is only valid once
         * pwqr_task_attach() has run, and we may be preempted before then.
         */
        pwqt->sb = NULL;
        preempt_notifier_init(&pwqt->notifier, &pwqr_preempt_noop_ops);
        preempt_notifier_register(&pwqt->notifier);
#if IS_PRE_2_6_23
        {
                struct pwqr_task_bucket *b = task_hbucket(task);

                pwqt->task = task;
                spin_lock(&b->lock);
                hlist_add_head(&pwqt->link, &b->tasks);
                spin_unlock(&b->lock);
        }
#endif
        return pwqt;
}

__cold
static void pwqr_task_detach(struct pwqr_task *pwqt, struct pwqr_sb *sb)
{
        unsigned long flags;

        pwqr_sb_lock_irqsave(sb, flags);
        sb->registered--;
        if (pwqt->notifier.ops == &pwqr_preempt_running_ops) {
                __pwqr_sb_update_state(sb, -1);
        } else {
                __pwqr_sb_update_state(sb, 0);
        }
        pwqt->notifier.ops = &pwqr_preempt_noop_ops;
        pwqr_sb_unlock_irqrestore(sb, flags);
        pwqr_sb_put(sb);
        pwqt->sb = NULL;
}

__cold
static void pwqr_task_attach(struct pwqr_task *pwqt, struct pwqr_sb *sb)
{
        unsigned long flags;

        pwqr_sb_lock_irqsave(sb, flags);
        pwqr_sb_get(pwqt->sb = sb);
        sb->registered++;
        /* the task is running: switch away from the noop ops set at
         * creation (or by a previous detach) */
        pwqt->notifier.ops = &pwqr_preempt_running_ops;
        __pwqr_sb_update_state(sb, 1);
        pwqr_sb_unlock_irqrestore(sb, flags);
}

__cold
static void pwqr_task_release(struct pwqr_task *pwqt, bool from_notifier)
{
#if IS_PRE_2_6_23
        struct pwqr_task_bucket *b = task_hbucket(pwqt->task);

        spin_lock(&b->lock);
        hlist_del(&pwqt->link);
        spin_unlock(&b->lock);
#endif

        if (from_notifier) {
                /* When called from sched_{in,out}, we may not call
                 * preempt_notifier_unregister(), much less kfree().
                 *
                 * kfree_rcu() on a still-registered notifier is only safe
                 * because the task is dying and will never be scheduled
                 * again; on a live task, the next sched_{in,out} would walk
                 * freed memory and panic.
                 */
                BUG_ON(!(current->state & TASK_DEAD));
                kfree_rcu(pwqt, rcu);
        } else {
                preempt_notifier_unregister(&pwqt->notifier);
                kfree(pwqt);
        }
}

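/*
 * A registered task is in one of three states, encoded by its notifier ops:
 * noop (detached, scheduling events are ignored), running (counted in
 * sb->running, a blocking sched_out moves it to blocked), and blocked (not
 * counted, the next sched_in moves it back to running).
 */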
static void pwqr_task_noop_sched_in(struct preempt_notifier *notifier, int cpu)
{
}

static void pwqr_task_noop_sched_out(struct preempt_notifier *notifier,
                                     struct task_struct *next)
{
}

static void pwqr_task_blocked_sched_in(struct preempt_notifier *notifier, int cpu)
{
        struct pwqr_task *pwqt = container_of(notifier, struct pwqr_task, notifier);
        struct pwqr_sb   *sb   = pwqt->sb;
        unsigned long flags;

        if (unlikely(sb->state < 0)) {
                pwqr_task_detach(pwqt, sb);
                return;
        }

        pwqt->notifier.ops = &pwqr_preempt_running_ops;
        pwqr_sb_lock_irqsave(sb, flags);
        __pwqr_sb_update_state(sb, 1);
        pwqr_sb_unlock_irqrestore(sb, flags);
}

static void pwqr_task_sched_out(struct preempt_notifier *notifier,
                                struct task_struct *next)
{
        struct pwqr_task   *pwqt = container_of(notifier, struct pwqr_task, notifier);
        struct pwqr_sb     *sb   = pwqt->sb;
        struct task_struct *p    = current;

        if (unlikely(p->state & TASK_DEAD) || unlikely(sb->state < 0)) {
                pwqr_task_detach(pwqt, sb);
                if (p->state & TASK_DEAD)
                        pwqr_task_release(pwqt, true);
                return;
        }
        /* preempted while runnable, or stopped/traced: not a blocking
         * context switch, keep the running accounting untouched */
        if (p->state == 0 || (p->state & (__TASK_STOPPED | __TASK_TRACED)))
                return;

        pwqt->notifier.ops = &pwqr_preempt_blocked_ops;
        /* see preempt.h: IRQs are disabled around sched_out */
        spin_lock(&sb->wqh.lock);
        __pwqr_sb_update_state(sb, -1);
        spin_unlock(&sb->wqh.lock);
}

static struct preempt_ops __read_mostly pwqr_preempt_noop_ops = {
        .sched_in       = pwqr_task_noop_sched_in,
        .sched_out      = pwqr_task_noop_sched_out,
};

static struct preempt_ops __read_mostly pwqr_preempt_running_ops = {
        .sched_in       = pwqr_task_noop_sched_in,
        .sched_out      = pwqr_task_sched_out,
};

static struct preempt_ops __read_mostly pwqr_preempt_blocked_ops = {
        .sched_in       = pwqr_task_blocked_sched_in,
        .sched_out      = pwqr_task_sched_out,
};

/*****************************************************************************
 * file descriptor
 */
static int pwqr_open(struct inode *inode, struct file *filp)
{
        struct pwqr_sb *sb;

        sb = pwqr_sb_create();
        if (IS_ERR(sb))
                return PTR_ERR(sb);
        filp->private_data = sb;
        return 0;
}

static int pwqr_release(struct inode *inode, struct file *filp)
{
        struct pwqr_sb *sb = filp->private_data;
        unsigned long flags;

        pwqr_sb_lock_irqsave(sb, flags);
        sb->state = PWQR_STATE_DEAD;
        pwqr_sb_unlock_irqrestore(sb, flags);
        wake_up_all(&sb->wqh);
        pwqr_sb_put(sb);
        return 0;
}

static unsigned int pwqr_poll(struct file *filp, poll_table *wait)
{
        struct pwqr_sb *sb = filp->private_data;
        unsigned int events = 0;
        unsigned long flags;

        poll_wait(filp, &sb->wqh_poll, wait);

        pwqr_sb_lock_irqsave(sb, flags);
        if (sb->has_pollin)
                events |= POLLIN;
        if (sb->state < 0)
                events |= POLLHUP;
        pwqr_sb_unlock_irqrestore(sb, flags);

        return events;
}

static inline ssize_t pwqr_sb_read(struct pwqr_sb *sb, int no_wait, u32 *cnt)
{
        DECLARE_WAITQUEUE(wait, current);
        ssize_t rc = -EAGAIN;

        spin_lock_irq(&sb->wqh.lock);
        if (sb->running > sb->concurrency) {
                rc = 0;
        } else if (!no_wait) {
                add_wait_queue(&sb->wqh_poll, &wait);
                for (;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (sb->running > sb->concurrency) {
                                rc = 0;
                                break;
                        }
                        if (signal_pending(current)) {
                                rc = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&sb->wqh.lock);
                        schedule();
                        spin_lock_irq(&sb->wqh.lock);
                }
                remove_wait_queue(&sb->wqh_poll, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(rc == 0)) {
                *cnt = sb->running - sb->concurrency;
                sb->has_pollin = 0;
        }
        spin_unlock_irq(&sb->wqh.lock);

        return rc;
}

static ssize_t
pwqr_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
        struct pwqr_sb *sb = filp->private_data;
        u32 cnt = 0;
        ssize_t rc;

        if (count < sizeof(cnt))
                return -EINVAL;
        rc = pwqr_sb_read(sb, filp->f_flags & O_NONBLOCK, &cnt);
        if (rc < 0)
                return rc;
        return put_user(cnt, (u32 __user *)buf) ? -EFAULT : sizeof(cnt);
}

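/*
 * Common implementation of PWQR_CTL_WAIT (is_wait != 0) and PWQR_CTL_PARK.
 * When the pool is not saturated, WAIT re-checks the ticket at pwqr_uaddr
 * futex-style and fails with -EWOULDBLOCK if it changed, while PARK returns
 * at once. Blocked threads report -EDQUOT when woken into an overcommitted
 * scoreboard, and -EBADFD once the scoreboard is dead.
 */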
static long
do_pwqr_wait(struct pwqr_sb *sb, struct pwqr_task *pwqt,
             int is_wait, struct pwqr_ioc_wait __user *arg)
{
        unsigned long flags;
        struct pwqr_ioc_wait wait;
        long rc = 0;
        u32 uval;

        preempt_notifier_unregister(&pwqt->notifier);

        if (is_wait) {
                if (copy_from_user(&wait, arg, sizeof(wait))) {
                        rc = -EFAULT;
                        goto out;
                }
                if (unlikely((long)wait.pwqr_uaddr % sizeof(int) != 0)) {
                        rc = -EINVAL;
                        goto out;
                }
        }

        pwqr_sb_lock_irqsave(sb, flags);
        if (sb->running + sb->waiting <= sb->concurrency) {
                if (is_wait) {
                        /* The probed page may not be resident: drop the
                         * lock, fault it in with get_user(), then retry.
                         */
                        while (probe_kernel_address(wait.pwqr_uaddr, uval)) {
                                pwqr_sb_unlock_irqrestore(sb, flags);
                                rc = get_user(uval, (u32 __user *)wait.pwqr_uaddr);
                                if (rc)
                                        goto out;
                                pwqr_sb_lock_irqsave(sb, flags);
                        }

                        if (uval != (u32)wait.pwqr_ticket) {
                                rc = -EWOULDBLOCK;
                                goto out_unlock;
                        }
                } else {
                        goto out_unlock;
                }
        }

        /* see wait_event_interruptible_exclusive_locked_irq() */
        if (likely(sb->state >= 0)) {
                DEFINE_WAIT(__wait);
                __wait.flags |= WQ_FLAG_EXCLUSIVE;

                if (is_wait) {
                        sb->waiting++;
                        __add_wait_queue(&sb->wqh, &__wait);
                } else {
                        sb->parked++;
                        __add_wait_queue_tail(&sb->wqh, &__wait);
                }
                __pwqr_sb_update_state(sb, -1);

                do {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (sb->overcommit_wakes)
                                break;
                        if (signal_pending(current)) {
                                rc = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&sb->wqh.lock);
                        schedule();
                        spin_lock_irq(&sb->wqh.lock);
                        if (is_wait)
                                break;
                        if (sb->running + sb->waiting < sb->concurrency)
                                break;
                } while (likely(sb->state >= 0));

                __remove_wait_queue(&sb->wqh, &__wait);
                __set_current_state(TASK_RUNNING);
                if (is_wait) {
                        sb->waiting--;
                } else {
                        sb->parked--;
                }
                __pwqr_sb_update_state(sb, 1);

                if (sb->overcommit_wakes)
                        sb->overcommit_wakes--;
                if (sb->waiting + sb->running > sb->concurrency)
                        rc = -EDQUOT;
        }

out_unlock:
        if (unlikely(sb->state < 0))
                rc = -EBADFD;
        pwqr_sb_unlock_irqrestore(sb, flags);
out:
        preempt_notifier_register(&pwqt->notifier);
        return rc;
}

static long do_pwqr_unregister(struct pwqr_sb *sb, struct pwqr_task *pwqt)
{
        if (!pwqt)
                return -EINVAL;
        if (pwqt->sb != sb)
                return -ENOENT;
        pwqr_task_detach(pwqt, sb);
        pwqr_task_release(pwqt, false);
        return 0;
}

static long do_pwqr_set_conc(struct pwqr_sb *sb, int conc)
{
        long old_conc;
        unsigned long flags;

        pwqr_sb_lock_irqsave(sb, flags);
        old_conc = sb->concurrency;
        if (conc <= 0)
                conc = num_online_cpus();
        if (conc != old_conc) {
                sb->concurrency = conc;
                __pwqr_sb_update_state(sb, 0);
        }
        pwqr_sb_unlock_irqrestore(sb, flags);

        return old_conc;
}

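/*
 * Wake up to `count` blocked threads. In overcommit mode (PWQR_CTL_WAKE_OC)
 * the wakes are granted unconditionally and accounted in overcommit_wakes;
 * otherwise only enough threads are woken to reach the concurrency target,
 * except for the quarantine case documented below. Returns the number of
 * wakes userspace should consider effective.
 */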
static long do_pwqr_wake(struct pwqr_sb *sb, int oc, int count)
{
        unsigned long flags;
        int nwake;

        if (count < 0)
                return -EINVAL;

        pwqr_sb_lock_irqsave(sb, flags);

        if (oc) {
                nwake = sb->waiting + sb->parked - sb->overcommit_wakes;
                if (count > nwake) {
                        count = nwake;
                } else {
                        nwake = count;
                }
                sb->overcommit_wakes += count;
        } else if (sb->running + sb->overcommit_wakes < sb->concurrency) {
                nwake = sb->concurrency - sb->overcommit_wakes - sb->running;
                if (nwake > sb->waiting + sb->parked - sb->overcommit_wakes) {
                        nwake = sb->waiting + sb->parked -
                                sb->overcommit_wakes;
                }
                if (count > nwake) {
                        count = nwake;
                } else {
                        nwake = count;
                }
        } else {
                /*
                 * This codepath deserves an explanation: waking the thread
                 * "for real" would overcommit, though userspace KNOWS there
                 * is at least one waiting thread. Such threads are
                 * "quarantined".
                 *
                 * Quarantined threads are woken up one by one, to allow a
                 * slow ramp down, minimizing "waiting" <-> "parked"
                 * flip-flops, no matter how many wakes have been asked for.
                 *
                 * Since releasing one quarantined thread wakes up a thread
                 * that will go (almost) straight to parked mode, lie to
                 * userland about the fact that we unblocked that thread,
                 * and return 0.
                 *
                 * Though if we're already waking all waiting threads for
                 * overcommitting jobs, we don't even need that.
                 */
                count = 0;
                nwake = sb->waiting > sb->overcommit_wakes;
        }
        while (nwake-- > 0)
                wake_up_locked(&sb->wqh);
        pwqr_sb_unlock_irqrestore(sb, flags);

        return count;
}

static long pwqr_ioctl(struct file *filp, unsigned command, unsigned long arg)
{
        struct pwqr_sb     *sb   = filp->private_data;
        struct task_struct *task = current;
        struct pwqr_task   *pwqt;
        long rc = 0;

        switch (command) {
        case PWQR_CTL_GET_CONC:
                return sb->concurrency;
        case PWQR_CTL_SET_CONC:
                return do_pwqr_set_conc(sb, (int)arg);

        case PWQR_CTL_WAKE:
        case PWQR_CTL_WAKE_OC:
                return do_pwqr_wake(sb, command == PWQR_CTL_WAKE_OC, (int)arg);

        case PWQR_CTL_WAIT:
        case PWQR_CTL_PARK:
        case PWQR_CTL_REGISTER:
        case PWQR_CTL_UNREGISTER:
                break;
        default:
                return -EINVAL;
        }

        pwqt = pwqr_task_find(task);
        if (command == PWQR_CTL_UNREGISTER)
                return do_pwqr_unregister(sb, pwqt);

        if (pwqt == NULL) {
                pwqt = pwqr_task_create(task);
                if (IS_ERR(pwqt))
                        return PTR_ERR(pwqt);
                pwqr_task_attach(pwqt, sb);
        } else if (unlikely(pwqt->sb != sb)) {
                pwqr_task_detach(pwqt, pwqt->sb);
                pwqr_task_attach(pwqt, sb);
        }

        switch (command) {
        case PWQR_CTL_WAIT:
                rc = do_pwqr_wait(sb, pwqt, true, (struct pwqr_ioc_wait __user *)arg);
                break;
        case PWQR_CTL_PARK:
                rc = do_pwqr_wait(sb, pwqt, false, NULL);
                break;
        }
        return rc;
}

static const struct file_operations pwqr_dev_fops = {
        .owner          = THIS_MODULE,
        .open           = pwqr_open,
        .release        = pwqr_release,
        .poll           = pwqr_poll,
        .read           = pwqr_read,
        .llseek         = noop_llseek,
        .unlocked_ioctl = pwqr_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = pwqr_ioctl,
#endif
};

/*****************************************************************************
 * module
 */
static int __init pwqr_start(void)
{
#if IS_PRE_2_6_23
        int i;

        for (i = 0; i < PWQR_HASH_SIZE; i++) {
                spin_lock_init(&pwqr_tasks_hash[i].lock);
                INIT_HLIST_HEAD(&pwqr_tasks_hash[i].tasks);
        }
#endif

        /* Register as a character device */
        pwqr_major = register_chrdev(0, "pwqr", &pwqr_dev_fops);
        if (pwqr_major < 0) {
                printk(KERN_ERR "pwqr: register_chrdev() failed\n");
                return pwqr_major;
        }

        /* Create a device node */
        pwqr_class = class_create(THIS_MODULE, PWQR_DEVICE_NAME);
        if (IS_ERR(pwqr_class)) {
                printk(KERN_ERR "pwqr: error creating pwqr class\n");
                unregister_chrdev(pwqr_major, PWQR_DEVICE_NAME);
                return PTR_ERR(pwqr_class);
        }
        device_create(pwqr_class, NULL, MKDEV(pwqr_major, 0), NULL, PWQR_DEVICE_NAME);
        printk(KERN_INFO "pwqr: PThreads Work Queues Regulator v1 loaded\n");
        return 0;
}

static void __exit pwqr_end(void)
{
        rcu_barrier();
        device_destroy(pwqr_class, MKDEV(pwqr_major, 0));
        class_destroy(pwqr_class);
        unregister_chrdev(pwqr_major, PWQR_DEVICE_NAME);
}

module_init(pwqr_start);
module_exit(pwqr_end);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pierre Habouzit <pierre.habouzit@intersec.com>");
MODULE_DESCRIPTION("PThreads Work Queues Regulator");
#endif

// vim:noet:sw=8:cinoptions+=\:0,L-1,=1s: