/*
 * Copyright (C) 2012   Pierre Habouzit <pierre.habouzit@intersec.com>
 * Copyright (C) 2012   Intersec SAS
 *
 * This file implements the Linux Pthread Workqueue Regulator, and is part
 * of the linux kernel.
 *
 * The Linux Kernel is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * The Linux Kernel is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License version 2
 * along with The Linux Kernel.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/kref.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <linux/wait.h>
#include <linux/version.h>

/*
 * The pthread workqueue regulator code is, for now, written as a
 * proof-of-concept module, meant to work with 2.6.23+ kernels or RedHat 5
 * ones.
 *
 * For now it uses a device, /dev/pwq, which spawns magic file descriptors
 * supporting a few ioctl operations (see Documentation/pwqr.adoc shipped in
 * the same git repository, and the userspace usage sketch below).
 *
 * This code is meant to be merged into mainline, but only after the
 * following changes, kept here as a "todolist":
 *
 *   - get rid of the device stuff (which is 100% of the init code for 2.6.23
 *     kernels);
 *
 *   - resubmit the patch that makes it possible to call
 *     preempt_notifier_unregister from sched_in/sched_out (just a matter of
 *     using hlist_for_each_safe instead of hlist_for_each), and fix
 *     pwqr_task_release so that it no longer requires RCU. That makes
 *     pwqr_preempt_noop_ops go away.
 *
 *   - think about adding a pwq_notifier pointer directly into the
 *     task_struct, though it's not *that* necessary: it grows the structure
 *     for a speed gain we don't really need (it would only make pwqr_ctl
 *     faster). I think it's okay to crawl the preempt_notifier list instead.
 *     We may want to add nice "macros" for that, though.
 *
 *   - replace the ioctl with a pwqr_ctl syscall;
 *
 *   - create a pwqr_create() syscall to create a pwqr file-descriptor.
 *
 * Summary: most of the code should remain untouched or almost unchanged,
 * pwqr_ioctl adapted to become a syscall, and the module boilerplate
 * replaced with pwqr_create() and file-descriptor creation boilerplate.
 * Judging from fs/eventfd.c, this looks rather simple.
 */
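
/*
 * Minimal userspace usage sketch (illustration only, not part of this
 * module): it assumes the device node is /dev/pwqr and that pwqr.h exposes
 * the PWQR_CTL_* ioctl numbers used below; the worker_register()/job_posted()
 * helpers are hypothetical. See Documentation/pwqr.adoc for the
 * authoritative interface description.
 *
 *     #include <fcntl.h>
 *     #include <stdint.h>
 *     #include <sys/ioctl.h>
 *     #include <unistd.h>
 *     #include "pwqr.h"
 *
 *     static int pwqr_fd;
 *
 *     // run once by every worker thread of the pool
 *     static void worker_register(void)
 *     {
 *             ioctl(pwqr_fd, PWQR_CTL_REGISTER);
 *     }
 *
 *     // called by whoever posts a job into the userland job queue
 *     static void job_posted(void)
 *     {
 *             ioctl(pwqr_fd, PWQR_CTL_WAKE, 1);
 *     }
 *
 *     int main(void)
 *     {
 *             pwqr_fd = open("/dev/pwqr", O_RDWR);
 *             if (pwqr_fd < 0)
 *                     return 1;
 *             ioctl(pwqr_fd, PWQR_CTL_SET_CONC, 0);  // 0: one per online CPU
 *             // ... spawn workers, each calling worker_register(), then
 *             // job_posted() / PWQR_CTL_WAIT as jobs come and go ...
 *             return 0;
 *     }
 */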

#ifndef CONFIG_PREEMPT_NOTIFIERS
#  error PWQ module requires CONFIG_PREEMPT_NOTIFIERS
#else

#include "pwqr.h"

#define PWQR_UC_DELAY           (HZ / 10)
#define PWQR_OC_DELAY           (HZ / 20)

#define PWQR_STATE_NONE         0
#define PWQR_STATE_UC           1
#define PWQR_STATE_OC           2
#define PWQR_STATE_DEAD         (-1)

/*
 * This code relies on the CONFIG_PREEMPT_NOTIFIERS infrastructure.
 *
 * I also want it to work on older RedHat 5 kernels, which emulate the
 * feature but implement it differently: instead of linking the
 * preempt_notifiers from the task_struct directly, they keep them in a
 * private h-table I don't have access to, so I need my own one too.
 *
 * On vanilla kernels we crawl the task_struct::preempt_notifiers hlist
 * until we find our entry; this list is usually very short, and it's no
 * slower than the global h-table, which also ends up crawling a list.
 */
#define IS_PRE_2_6_23    (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23))

struct pwqr_sb {
        struct kref             kref;
        struct rcu_head         rcu;
        struct timer_list       timer;
        wait_queue_head_t       wqh;
        wait_queue_head_t       wqh_poll;

        unsigned                concurrency;      /* target number of running threads      */
        unsigned                registered;       /* threads attached to this scoreboard    */

        unsigned                running;          /* registered threads currently runnable  */
        unsigned                waiting;          /* threads blocked in PWQR_CTL_WAIT        */
        unsigned                parked;           /* threads blocked in PWQR_CTL_PARK        */
        unsigned                overcommit_wakes; /* wakes granted above 'concurrency'       */

        int                     state;            /* PWQR_STATE_{NONE,UC,OC,DEAD}            */
};

struct pwqr_task {
        struct preempt_notifier notifier;
        struct pwqr_sb         *sb;
        struct rcu_head         rcu;
#if IS_PRE_2_6_23
        struct hlist_node       link;
        struct task_struct     *task;
#endif
};

#if IS_PRE_2_6_23

#define PWQR_HASH_BITS          5
#define PWQR_HASH_SIZE          (1 << PWQR_HASH_BITS)

struct pwqr_task_bucket {
        spinlock_t              lock;
        struct hlist_head       tasks;
};

static struct pwqr_task_bucket  pwqr_tasks_hash[PWQR_HASH_SIZE];
#endif

/*
 * Global variables
 */
static struct class            *pwqr_class;
static int                      pwqr_major;
static struct preempt_ops       pwqr_preempt_running_ops;
static struct preempt_ops       pwqr_preempt_blocked_ops;
static struct preempt_ops       pwqr_preempt_noop_ops;

/*****************************************************************************
 * Scoreboards
 */

#define pwqr_sb_lock_irqsave(sb, flags) \
        spin_lock_irqsave(&(sb)->wqh.lock, flags)
#define pwqr_sb_unlock_irqrestore(sb, flags) \
        spin_unlock_irqrestore(&(sb)->wqh.lock, flags)

static inline void pwqr_arm_timer(struct pwqr_sb *sb, int how, int delay)
{
        if (timer_pending(&sb->timer) && sb->state == how)
                return;
        mod_timer(&sb->timer, jiffies + delay);
        sb->state = how;
}

static inline void __pwqr_sb_update_state(struct pwqr_sb *sb, int running_delta)
{
        sb->running += running_delta;

        if (sb->running < sb->concurrency && sb->waiting == 0 && sb->parked) {
                pwqr_arm_timer(sb, PWQR_STATE_UC, PWQR_UC_DELAY);
        } else if (sb->running > sb->concurrency) {
                pwqr_arm_timer(sb, PWQR_STATE_OC, PWQR_OC_DELAY);
        } else {
                sb->state = PWQR_STATE_NONE;
                /* del_timer() is a no-op if the timer isn't pending */
                del_timer(&sb->timer);
        }
}

static void pwqr_sb_timer_cb(unsigned long arg)
{
        struct pwqr_sb *sb = (struct pwqr_sb *)arg;
        unsigned long flags;

        pwqr_sb_lock_irqsave(sb, flags);
        if (sb->running < sb->concurrency && sb->waiting == 0 && sb->parked) {
                if (sb->overcommit_wakes == 0)
                        wake_up_locked(&sb->wqh);
        }
        if (sb->running > sb->concurrency) {
                wake_up_poll(&sb->wqh_poll, POLLIN);
        }
        pwqr_sb_unlock_irqrestore(sb, flags);
}

static struct pwqr_sb *pwqr_sb_create(void)
{
        struct pwqr_sb *sb;

        sb = kzalloc(sizeof(struct pwqr_sb), GFP_KERNEL);
        if (sb == NULL)
                return ERR_PTR(-ENOMEM);

        kref_init(&sb->kref);
        init_waitqueue_head(&sb->wqh);
        init_waitqueue_head(&sb->wqh_poll);
        sb->concurrency    = num_online_cpus();
        init_timer(&sb->timer);
        sb->timer.function = pwqr_sb_timer_cb;
        sb->timer.data     = (unsigned long)sb;

        __module_get(THIS_MODULE);
        return sb;
}

static inline void pwqr_sb_get(struct pwqr_sb *sb)
{
        kref_get(&sb->kref);
}

static void pwqr_sb_finalize(struct rcu_head *rcu)
{
        struct pwqr_sb *sb = container_of(rcu, struct pwqr_sb, rcu);

        module_put(THIS_MODULE);
        kfree(sb);
}

static void pwqr_sb_release(struct kref *kref)
{
        struct pwqr_sb *sb = container_of(kref, struct pwqr_sb, kref);

        del_timer_sync(&sb->timer);
        wake_up_poll(&sb->wqh_poll, POLLHUP);
        call_rcu(&sb->rcu, pwqr_sb_finalize);
}

static inline void pwqr_sb_put(struct pwqr_sb *sb)
{
        kref_put(&sb->kref, pwqr_sb_release);
}

/*****************************************************************************
 * tasks
 */
#if IS_PRE_2_6_23
static inline struct pwqr_task_bucket *task_hbucket(struct task_struct *task)
{
        return &pwqr_tasks_hash[hash_ptr(task, PWQR_HASH_BITS)];
}

static struct pwqr_task *pwqr_task_find(struct task_struct *task)
{
        struct pwqr_task_bucket *b = task_hbucket(task);
        struct hlist_node *node;
        struct pwqr_task *it, *pwqt = NULL;

        spin_lock(&b->lock);
        hlist_for_each_entry(it, node, &b->tasks, link) {
                if (it->task == task) {
                        pwqt = it;
                        break;
                }
        }
        spin_unlock(&b->lock);
        return pwqt;
}
#else
static struct pwqr_task *pwqr_task_find(struct task_struct *task)
{
        struct hlist_node       *node;
        struct preempt_notifier *it;
        struct pwqr_task        *pwqt = NULL;

        hlist_for_each_entry(it, node, &task->preempt_notifiers, link) {
                if (it->ops == &pwqr_preempt_running_ops ||
                    it->ops == &pwqr_preempt_blocked_ops ||
                    it->ops == &pwqr_preempt_noop_ops)
                {
                        pwqt = container_of(it, struct pwqr_task, notifier);
                        break;
                }
        }

        return pwqt;
}
#endif

static struct pwqr_task *pwqr_task_create(struct task_struct *task)
{
        struct pwqr_task *pwqt;

        pwqt = kmalloc(sizeof(*pwqt), GFP_KERNEL);
        if (pwqt == NULL)
                return ERR_PTR(-ENOMEM);

        preempt_notifier_init(&pwqt->notifier, &pwqr_preempt_running_ops);
        preempt_notifier_register(&pwqt->notifier);
#if IS_PRE_2_6_23
        {
                struct pwqr_task_bucket *b = task_hbucket(task);

                pwqt->task = task;
                spin_lock(&b->lock);
                hlist_add_head(&pwqt->link, &b->tasks);
                spin_unlock(&b->lock);
        }
#endif
        return pwqt;
}

__cold
static void pwqr_task_detach(struct pwqr_task *pwqt, struct pwqr_sb *sb)
{
        unsigned long flags;

        pwqr_sb_lock_irqsave(sb, flags);
        sb->registered--;
        if (pwqt->notifier.ops == &pwqr_preempt_running_ops) {
                __pwqr_sb_update_state(sb, -1);
        } else {
                __pwqr_sb_update_state(sb, 0);
        }
        pwqr_sb_unlock_irqrestore(sb, flags);
        pwqr_sb_put(sb);
        pwqt->sb = NULL;
}

__cold
static void pwqr_task_attach(struct pwqr_task *pwqt, struct pwqr_sb *sb)
{
        unsigned long flags;

        pwqr_sb_lock_irqsave(sb, flags);
        pwqr_sb_get(pwqt->sb = sb);
        sb->registered++;
        __pwqr_sb_update_state(sb, 1);
        pwqr_sb_unlock_irqrestore(sb, flags);
}

__cold
static void pwqr_task_release(struct pwqr_task *pwqt, bool from_notifier)
{
#if IS_PRE_2_6_23
        struct pwqr_task_bucket *b = task_hbucket(pwqt->task);

        spin_lock(&b->lock);
        hlist_del(&pwqt->link);
        spin_unlock(&b->lock);
#endif
        pwqt->notifier.ops = &pwqr_preempt_noop_ops;

        if (from_notifier) {
                /* When called from sched_{out,in}, we are not allowed to
                 * call preempt_notifier_unregister (or worse, kfree()).
                 *
                 * kfree()-ing a still registered notifier is only safe
                 * because the task is dying: otherwise the next
                 * sched_{in,out} call would run on freed memory and panic.
                 */
                BUG_ON(!(current->state & TASK_DEAD));
                kfree_rcu(pwqt, rcu);
        } else {
                preempt_notifier_unregister(&pwqt->notifier);
                kfree(pwqt);
        }
}

static void pwqr_task_noop_sched_in(struct preempt_notifier *notifier, int cpu)
{
}

static void pwqr_task_noop_sched_out(struct preempt_notifier *notifier,
                                     struct task_struct *next)
{
}

static void pwqr_task_blocked_sched_in(struct preempt_notifier *notifier, int cpu)
{
        struct pwqr_task *pwqt = container_of(notifier, struct pwqr_task, notifier);
        struct pwqr_sb   *sb   = pwqt->sb;
        unsigned long flags;

        if (unlikely(sb->state < 0)) {
                pwqr_task_detach(pwqt, sb);
                pwqr_task_release(pwqt, true);
                return;
        }

        pwqt->notifier.ops = &pwqr_preempt_running_ops;
        pwqr_sb_lock_irqsave(sb, flags);
        __pwqr_sb_update_state(sb, 1);
        pwqr_sb_unlock_irqrestore(sb, flags);
}

static void pwqr_task_sched_out(struct preempt_notifier *notifier,
                                struct task_struct *next)
{
        struct pwqr_task   *pwqt = container_of(notifier, struct pwqr_task, notifier);
        struct pwqr_sb     *sb   = pwqt->sb;
        struct task_struct *p    = current;

        if (unlikely(p->state & TASK_DEAD) || unlikely(sb->state < 0)) {
                pwqr_task_detach(pwqt, sb);
                pwqr_task_release(pwqt, true);
                return;
        }
        /* task merely preempted (TASK_RUNNING) or stopped/traced:
         * don't account it as blocked */
        if (p->state == 0 || (p->state & (__TASK_STOPPED | __TASK_TRACED)))
                return;

        pwqt->notifier.ops = &pwqr_preempt_blocked_ops;
        /* see preempt.h: IRQs are disabled for sched_out */
        spin_lock(&sb->wqh.lock);
        __pwqr_sb_update_state(sb, -1);
        spin_unlock(&sb->wqh.lock);
}

static struct preempt_ops __read_mostly pwqr_preempt_noop_ops = {
        .sched_in       = pwqr_task_noop_sched_in,
        .sched_out      = pwqr_task_noop_sched_out,
};

static struct preempt_ops __read_mostly pwqr_preempt_running_ops = {
        .sched_in       = pwqr_task_noop_sched_in,
        .sched_out      = pwqr_task_sched_out,
};

static struct preempt_ops __read_mostly pwqr_preempt_blocked_ops = {
        .sched_in       = pwqr_task_blocked_sched_in,
        .sched_out      = pwqr_task_sched_out,
};

/*****************************************************************************
 * file descriptor
 */
static int pwqr_open(struct inode *inode, struct file *filp)
{
        struct pwqr_sb *sb;

        sb = pwqr_sb_create();
        if (IS_ERR(sb))
                return PTR_ERR(sb);
        filp->private_data = sb;
        return 0;
}

static int pwqr_release(struct inode *inode, struct file *filp)
{
        struct pwqr_sb *sb = filp->private_data;
        unsigned long flags;

        pwqr_sb_lock_irqsave(sb, flags);
        sb->state = PWQR_STATE_DEAD;
        pwqr_sb_unlock_irqrestore(sb, flags);
        wake_up_all(&sb->wqh);
        pwqr_sb_put(sb);
        return 0;
}

static unsigned int pwqr_poll(struct file *filp, poll_table *wait)
{
        struct pwqr_sb *sb = filp->private_data;
        unsigned int events = 0;
        unsigned long flags;

        poll_wait(filp, &sb->wqh_poll, wait);

        pwqr_sb_lock_irqsave(sb, flags);
        if (sb->running > sb->concurrency)
                events |= POLLIN;
        if (sb->state < 0)
                events |= POLLHUP;
        pwqr_sb_unlock_irqrestore(sb, flags);

        return events;
}

static inline ssize_t pwqr_sb_read(struct pwqr_sb *sb, int no_wait, u32 *cnt)
{
        DECLARE_WAITQUEUE(wait, current);
        ssize_t rc = -EAGAIN;

        spin_lock_irq(&sb->wqh.lock);
        if (sb->running > sb->concurrency) {
                rc = 0;
        } else if (!no_wait) {
                add_wait_queue(&sb->wqh_poll, &wait);
                for (;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (sb->running > sb->concurrency) {
                                rc = 0;
                                break;
                        }
                        if (signal_pending(current)) {
                                rc = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&sb->wqh.lock);
                        schedule();
                        spin_lock_irq(&sb->wqh.lock);
                }
                remove_wait_queue(&sb->wqh_poll, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(rc == 0))
                *cnt = sb->running - sb->concurrency;
        spin_unlock_irq(&sb->wqh.lock);

        return rc;
}

static ssize_t
pwqr_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
        struct pwqr_sb *sb = filp->private_data;
        u32 cnt = 0;
        ssize_t rc;

        if (count < sizeof(cnt))
                return -EINVAL;
        rc = pwqr_sb_read(sb, filp->f_flags & O_NONBLOCK, &cnt);
        if (rc < 0)
                return rc;
        return put_user(cnt, (u32 __user *)buf) ? -EFAULT : sizeof(cnt);
}
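
/*
 * Illustrative monitor loop (a sketch only, not shipped with this module):
 * a userspace "manager" thread can block in read(2) on the pwqr fd and get
 * a u32 telling it by how many threads the pool currently exceeds the
 * concurrency target, then ask that many workers to park. The
 * request_park() helper is hypothetical.
 *
 *     static void pwqr_monitor(int pwqr_fd)
 *     {
 *             uint32_t overcommit;
 *
 *             for (;;) {
 *                     // blocks until running > concurrency (POLLIN)
 *                     if (read(pwqr_fd, &overcommit, sizeof(overcommit))
 *                         != sizeof(overcommit))
 *                             break;
 *                     while (overcommit-- > 0)
 *                             request_park();  // hypothetical userland hook
 *             }
 *     }
 */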

static long
do_pwqr_wait(struct pwqr_sb *sb, struct pwqr_task *pwqt,
             int is_wait, struct pwqr_ioc_wait __user *arg)
{
        unsigned long flags;
        struct pwqr_ioc_wait wait;
        long rc = 0;
        u32 uval;

        preempt_notifier_unregister(&pwqt->notifier);

        if (is_wait) {
                if (copy_from_user(&wait, arg, sizeof(wait))) {
                        rc = -EFAULT;
                        goto out;
                }
                if (unlikely((long)wait.pwqr_uaddr % sizeof(int) != 0)) {
                        rc = -EINVAL;
                        goto out;
                }
        }

        pwqr_sb_lock_irqsave(sb, flags);
        if (sb->running + sb->waiting <= sb->concurrency) {
                if (is_wait) {
                        while (probe_kernel_address(wait.pwqr_uaddr, uval)) {
                                pwqr_sb_unlock_irqrestore(sb, flags);
                                rc = get_user(uval, (u32 __user *)wait.pwqr_uaddr);
                                if (rc)
                                        goto out;
                                pwqr_sb_lock_irqsave(sb, flags);
                        }

                        if (uval != (u32)wait.pwqr_ticket) {
                                rc = -EWOULDBLOCK;
                                goto out_unlock;
                        }
                } else {
                        goto out_unlock;
                }
        }

        /* cf. <wait_event_interruptible_exclusive_locked_irq> */
        if (likely(sb->state >= 0)) {
                DEFINE_WAIT(__wait);
                __wait.flags |= WQ_FLAG_EXCLUSIVE;

                if (is_wait) {
                        sb->waiting++;
                        __add_wait_queue(&sb->wqh, &__wait);
                } else {
                        sb->parked++;
                        __add_wait_queue_tail(&sb->wqh, &__wait);
                }
                __pwqr_sb_update_state(sb, -1);

                do {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (sb->overcommit_wakes)
                                break;
                        if (signal_pending(current)) {
                                rc = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&sb->wqh.lock);
                        schedule();
                        spin_lock_irq(&sb->wqh.lock);
                        if (is_wait)
                                break;
                        if (sb->running + sb->waiting < sb->concurrency)
                                break;
                } while (likely(sb->state >= 0));

                __remove_wait_queue(&sb->wqh, &__wait);
                __set_current_state(TASK_RUNNING);
                if (is_wait) {
                        sb->waiting--;
                } else {
                        sb->parked--;
                }
                __pwqr_sb_update_state(sb, 1);

                if (sb->overcommit_wakes)
                        sb->overcommit_wakes--;
                if (sb->waiting + sb->running > sb->concurrency)
                        rc = -EDQUOT;
        }

out_unlock:
        if (unlikely(sb->state < 0))
                rc = -EBADFD;
        pwqr_sb_unlock_irqrestore(sb, flags);
out:
        preempt_notifier_register(&pwqt->notifier);
        return rc;
}
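
/*
 * The WAIT protocol above works like a futex: userland keeps a shared u32
 * and passes its address plus the value it last saw (the "ticket"); the
 * kernel only blocks the caller if the value is still equal to the ticket,
 * otherwise it returns -EWOULDBLOCK. Continuing the userland sketch near
 * the top of this file (plus <errno.h>), and assuming the pwqr_ioc_wait
 * field names match what pwqr.h declares; job_seq is a hypothetical counter
 * bumped by userland whenever a job is posted, and reacting to EDQUOT by
 * parking is just one plausible policy (see Documentation/pwqr.adoc):
 *
 *     static int worker_wait_for_jobs(int pwqr_fd, uint32_t *job_seq)
 *     {
 *             struct pwqr_ioc_wait w = {
 *                     .pwqr_ticket = *job_seq,
 *                     .pwqr_uaddr  = job_seq,
 *             };
 *
 *             if (ioctl(pwqr_fd, PWQR_CTL_WAIT, &w) < 0) {
 *                     if (errno == EWOULDBLOCK)
 *                             return 0;   // a job was posted meanwhile
 *                     if (errno == EDQUOT)
 *                             return ioctl(pwqr_fd, PWQR_CTL_PARK);
 *                     return -1;          // EBADFD, EINTR, ...
 *             }
 *             return 0;
 *     }
 */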

static long do_pwqr_unregister(struct pwqr_sb *sb, struct pwqr_task *pwqt)
{
        if (!pwqt)
                return -EINVAL;
        if (pwqt->sb != sb)
                return -ENOENT;
        pwqr_task_detach(pwqt, sb);
        pwqr_task_release(pwqt, false);
        return 0;
}

static long do_pwqr_set_conc(struct pwqr_sb *sb, int conc)
{
        long old_conc = sb->concurrency;
        unsigned long flags;

        pwqr_sb_lock_irqsave(sb, flags);
        if (conc <= 0)
                conc = num_online_cpus();
        if (conc != old_conc) {
                sb->concurrency = conc;
                __pwqr_sb_update_state(sb, 0);
        }
        pwqr_sb_unlock_irqrestore(sb, flags);

        return old_conc;
}

static long do_pwqr_wake(struct pwqr_sb *sb, int oc, int count)
{
        unsigned long flags;
        int nwake;

        if (count < 0)
                return -EINVAL;

        pwqr_sb_lock_irqsave(sb, flags);

        if (oc) {
                nwake = sb->waiting + sb->parked - sb->overcommit_wakes;
                if (count > nwake) {
                        count = nwake;
                } else {
                        nwake = count;
                }
                sb->overcommit_wakes += count;
        } else if (sb->running + sb->overcommit_wakes < sb->concurrency) {
                nwake = sb->concurrency - sb->overcommit_wakes - sb->running;
                if (nwake > sb->waiting + sb->parked - sb->overcommit_wakes) {
                        nwake = sb->waiting + sb->parked -
                                sb->overcommit_wakes;
                }
                if (count > nwake) {
                        count = nwake;
                } else {
                        nwake = count;
                }
        } else {
                /*
                 * This codepath deserves an explanation: waking a thread
                 * "for real" would overcommit, yet userspace KNOWS there is
                 * at least one waiting thread. Such threads are
                 * "quarantined".
                 *
                 * Quarantined threads are woken up one by one, to allow a
                 * slow ramp down, trying to minimize "waiting" <-> "parked"
                 * flip-flops, no matter how many wakes have been asked for.
                 *
                 * Since releasing one quarantined thread will wake up a
                 * thread that will (almost) immediately go back to parked
                 * mode, lie to userland about the fact that we unblocked a
                 * thread, and return 0.
                 *
                 * Though if we're already waking all waiting threads for
                 * overcommitting jobs, we don't even need that.
                 */
                count = 0;
                /* wake at most one quarantined thread */
                nwake = sb->waiting > sb->overcommit_wakes;
        }
        while (nwake-- > 0)
                wake_up_locked(&sb->wqh);
        pwqr_sb_unlock_irqrestore(sb, flags);

        return count;
}
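
/*
 * Worked example of the quarantine path above (numbers are illustrative):
 * with concurrency = 4, running = 4, waiting = 2, parked = 0 and
 * overcommit_wakes = 0, a plain PWQR_CTL_WAKE(1) takes the "else" branch
 * since running + overcommit_wakes >= concurrency: it wakes exactly one
 * quarantined waiter (nwake = 1) but reports 0 to userland, because that
 * waiter will re-evaluate the scoreboard and most likely park itself.
 * A PWQR_CTL_WAKE_OC(1) on the same scoreboard would instead bump
 * overcommit_wakes to 1 and report 1; the woken waiter then leaves
 * PWQR_CTL_WAIT with -EDQUOT, since the pool now runs above its
 * concurrency target.
 */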

static long pwqr_ioctl(struct file *filp, unsigned command, unsigned long arg)
{
        struct pwqr_sb     *sb   = filp->private_data;
        struct task_struct *task = current;
        struct pwqr_task   *pwqt;
        int rc = 0;

        switch (command) {
        case PWQR_CTL_GET_CONC:
                return sb->concurrency;
        case PWQR_CTL_SET_CONC:
                return do_pwqr_set_conc(sb, (int)arg);

        case PWQR_CTL_WAKE:
        case PWQR_CTL_WAKE_OC:
                return do_pwqr_wake(sb, command == PWQR_CTL_WAKE_OC, (int)arg);

        case PWQR_CTL_WAIT:
        case PWQR_CTL_PARK:
        case PWQR_CTL_REGISTER:
        case PWQR_CTL_UNREGISTER:
                break;
        default:
                return -EINVAL;
        }

        pwqt = pwqr_task_find(task);
        if (command == PWQR_CTL_UNREGISTER)
                return do_pwqr_unregister(sb, pwqt);

        if (pwqt == NULL) {
                pwqt = pwqr_task_create(task);
                if (IS_ERR(pwqt))
                        return PTR_ERR(pwqt);
                pwqr_task_attach(pwqt, sb);
        } else if (unlikely(pwqt->sb != sb)) {
                pwqr_task_detach(pwqt, pwqt->sb);
                pwqr_task_attach(pwqt, sb);
        }

        switch (command) {
        case PWQR_CTL_WAIT:
                rc = do_pwqr_wait(sb, pwqt, true, (struct pwqr_ioc_wait __user *)arg);
                break;
        case PWQR_CTL_PARK:
                rc = do_pwqr_wait(sb, pwqt, false, NULL);
                break;
        }
        return rc;
}

static const struct file_operations pwqr_dev_fops = {
        .owner          = THIS_MODULE,
        .open           = pwqr_open,
        .release        = pwqr_release,
        .poll           = pwqr_poll,
        .read           = pwqr_read,
        .llseek         = noop_llseek,
        .unlocked_ioctl = pwqr_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = pwqr_ioctl,
#endif
};

/*****************************************************************************
 * module
 */
static int __init pwqr_start(void)
{
#if IS_PRE_2_6_23
        int i;

        for (i = 0; i < PWQR_HASH_SIZE; i++) {
                spin_lock_init(&pwqr_tasks_hash[i].lock);
                INIT_HLIST_HEAD(&pwqr_tasks_hash[i].tasks);
        }
#endif

        /* Register as a character device */
        pwqr_major = register_chrdev(0, PWQR_DEVICE_NAME, &pwqr_dev_fops);
        if (pwqr_major < 0) {
                printk(KERN_ERR "pwqr: register_chrdev() failed\n");
                return pwqr_major;
        }

        /* Create a device node */
        pwqr_class = class_create(THIS_MODULE, PWQR_DEVICE_NAME);
        if (IS_ERR(pwqr_class)) {
                printk(KERN_ERR "pwqr: error creating pwqr class\n");
                unregister_chrdev(pwqr_major, PWQR_DEVICE_NAME);
                return PTR_ERR(pwqr_class);
        }
        device_create(pwqr_class, NULL, MKDEV(pwqr_major, 0), NULL, PWQR_DEVICE_NAME);
        printk(KERN_INFO "pwqr: PThreads Work Queues Regulator v1 loaded\n");
        return 0;
}

static void __exit pwqr_end(void)
{
        rcu_barrier();
        device_destroy(pwqr_class, MKDEV(pwqr_major, 0));
        class_destroy(pwqr_class);
        unregister_chrdev(pwqr_major, PWQR_DEVICE_NAME);
}

module_init(pwqr_start);
module_exit(pwqr_end);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pierre Habouzit <pierre.habouzit@intersec.com>");
MODULE_DESCRIPTION("PThreads Work Queues Regulator");
#endif

// vim:noet:sw=8:cinoptions+=\:0,L-1,=1s: