/*
 * Copyright (C) 2012   Pierre Habouzit <pierre.habouzit@intersec.com>
 * Copyright (C) 2012   Intersec SAS
 *
 * This file implements the Linux Pthread Workqueue Regulator, and is part
 * of the linux kernel.
 *
 * The Linux Kernel is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * The Linux Kernel is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License version 2
 * along with The Linux Kernel.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/kref.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <linux/wait.h>
#include <linux/version.h>

/*
 * The pthread workqueue regulator code is for now written as a proof of
 * concept module, meant to work with 2.6.23+ kernels or RedHat 5 ones.
 *
 * For now it uses a device /dev/pwq, which spawns magic file-descriptors
 * supporting a few ioctl operations (see Documentation/pwqr.adoc shipped in
 * the same git repository); a rough usage sketch follows this comment.
 *
 * This code is meant to be merged into mainline, but only after the
 * following changes, kept here as a "todolist":
 *
 *   - get rid of the device stuff (which is 100% of the init code for 2.6.23
 *     kernels);
 *
 *   - resubmit the patch that makes it possible to call
 *     preempt_notifier_unregister from sched_in/sched_out (just a matter of a
 *     hlist_for_each_safe instead of hlist_for_each), and fix
 *     pwqr_task_release to not require RCU anymore. It makes
 *     pwqr_preempt_noop_ops go away.
 *
 *   - think about the possibility of adding a pwq_notifier pointer directly
 *     into the task_struct; though it's not *that* necessary, it grows the
 *     structure for a speed gain we don't really need (making pwqr_ctl
 *     faster). I think it's okay to crawl the preempt_notifier list instead.
 *     We may want to add nice "macros" for that though.
 *
 *   - replace the ioctl with a pwqr_ctl syscall;
 *
 *   - create a pwqr_create() syscall to create a pwqr file-descriptor.
 *
 * Summary: most of the code should remain untouched or almost unchanged;
 * pwqr_ioctl would be adapted to become a syscall, and the module boilerplate
 * replaced with pwqr_create() and file-descriptor creation boilerplate
 * instead. Judging from fs/eventfd.c, this looks rather simple.
 */
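
/*
 * Illustrative usage sketch (not part of the module): a minimal, hedged
 * example of how a userland thread pool might drive the regulator through
 * the ioctl interface declared in pwqr.h. It assumes the device node shows
 * up as /dev/pwqr (PWQR_DEVICE_NAME) and uses the PWQR_CTL_* commands and
 * struct pwqr_ioc_wait as declared there (a cast may be needed for
 * pwqr_uaddr depending on its declared type); see Documentation/pwqr.adoc
 * for the authoritative description.
 *
 *      #include <fcntl.h>
 *      #include <sys/ioctl.h>
 *      #include <unistd.h>
 *      #include "pwqr.h"
 *
 *      static unsigned wq_ticket;      // bumped each time work is posted
 *
 *      static void worker(int pwqr_fd)
 *      {
 *              struct pwqr_ioc_wait wait;
 *
 *              // account this thread against the scoreboard
 *              ioctl(pwqr_fd, PWQR_CTL_REGISTER);
 *
 *              for (;;) {
 *                      // ... run pending work items, then prepare to sleep
 *                      wait.pwqr_ticket = wq_ticket;
 *                      wait.pwqr_uaddr  = &wq_ticket;
 *
 *                      // 0: sleep ended; -EWOULDBLOCK: new work was posted
 *                      // meanwhile; -EDQUOT: running would overcommit
 *                      ioctl(pwqr_fd, PWQR_CTL_WAIT, &wait);
 *              }
 *      }
 *
 *      int main(void)
 *      {
 *              int fd = open("/dev/pwqr", O_RDWR);
 *
 *              ioctl(fd, PWQR_CTL_SET_CONC, 0);  // <= 0: num_online_cpus()
 *              // ... spawn workers, then each time work is posted:
 *              __sync_fetch_and_add(&wq_ticket, 1);
 *              ioctl(fd, PWQR_CTL_WAKE, 1);      // release at most one waiter
 *              return 0;
 *      }
 */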

#ifndef CONFIG_PREEMPT_NOTIFIERS
#  error PWQ module requires CONFIG_PREEMPT_NOTIFIERS
#else

#include "pwqr.h"

#define PWQR_UC_DELAY           (HZ / 10)
#define PWQR_OC_DELAY           (HZ / 20)

#define PWQR_STATE_NONE         0
#define PWQR_STATE_UC           1
#define PWQR_STATE_OC           2
#define PWQR_STATE_DEAD         (-1)

/*
 * This is the first inclusion of CONFIG_PREEMPT_NOTIFIERS in the kernel.
 *
 * However, I also want it to work on older RedHat 5 kernels, which emulate
 * the feature but implement it differently: instead of linking the
 * preempt_notifiers from the task_struct directly, they keep a private
 * hash table I don't have access to, so I need my own too.
 *
 * For vanilla kernels we crawl the task_struct::preempt_notifiers hlist
 * until we find our entry; this list is usually very short, so it's no
 * slower than the global hash table, which crawls a list anyway.
 */
#define IS_PRE_2_6_23    (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23))

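/*
 * Per file-descriptor scoreboard. The kref/rcu pair drives its lifetime,
 * the timer delays under/over-commit notifications (PWQR_UC_DELAY and
 * PWQR_OC_DELAY), threads blocked in PWQR_CTL_WAIT/PARK sleep on wqh,
 * and wqh_poll backs the read()/poll() overcommit notifications.
 *
 * "concurrency" is the target number of running threads, "registered" the
 * number of attached pwqr_tasks; "running", "waiting" and "parked" count
 * the registered threads per state, and "overcommit_wakes" tracks wake-ups
 * granted beyond the concurrency target (PWQR_CTL_WAKE_OC). "state" holds
 * one of the PWQR_STATE_* values above.
 */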
struct pwqr_sb {
        struct kref             kref;
        struct rcu_head         rcu;
        struct timer_list       timer;
        wait_queue_head_t       wqh;
        wait_queue_head_t       wqh_poll;

        unsigned                concurrency;
        unsigned                registered;

        unsigned                running;
        unsigned                waiting;
        unsigned                parked;
        unsigned                overcommit_wakes;

        int                     state;
};

struct pwqr_task {
        struct preempt_notifier notifier;
        struct pwqr_sb         *sb;
        struct rcu_head         rcu;
#if IS_PRE_2_6_23
        struct hlist_node       link;
        struct task_struct     *task;
#endif
};

#if IS_PRE_2_6_23

#define PWQR_HASH_BITS          5
#define PWQR_HASH_SIZE          (1 << PWQR_HASH_BITS)

struct pwqr_task_bucket {
        spinlock_t              lock;
        struct hlist_head       tasks;
};

static struct pwqr_task_bucket  pwqr_tasks_hash[PWQR_HASH_SIZE];
#endif

/*
 * Global variables
 */
static struct class            *pwqr_class;
static int                      pwqr_major;
static struct preempt_ops       pwqr_preempt_running_ops;
static struct preempt_ops       pwqr_preempt_blocked_ops;
static struct preempt_ops       pwqr_preempt_noop_ops;

/*****************************************************************************
 * Scoreboards
 */

#define pwqr_sb_lock_irqsave(sb, flags) \
        spin_lock_irqsave(&(sb)->wqh.lock, flags)
#define pwqr_sb_unlock_irqrestore(sb, flags) \
        spin_unlock_irqrestore(&(sb)->wqh.lock, flags)

static inline void pwqr_arm_timer(struct pwqr_sb *sb, int how, int delay)
{
        if (timer_pending(&sb->timer) && sb->state == how)
                return;
        mod_timer(&sb->timer, jiffies + delay);
        sb->state = how;
}

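/*
 * Recompute the scoreboard state after "running" changed by running_delta
 * (+1 when a registered thread becomes runnable, -1 when it blocks or
 * leaves). Running below the concurrency target with no waiter but parked
 * threads arms the under-commit timer; running above the target arms the
 * over-commit timer; otherwise the state goes back to PWQR_STATE_NONE.
 * Must be called with sb->wqh.lock held.
 */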
static inline void __pwqr_sb_update_state(struct pwqr_sb *sb, int running_delta)
{
        sb->running += running_delta;

        if (sb->running < sb->concurrency && sb->waiting == 0 && sb->parked) {
                pwqr_arm_timer(sb, PWQR_STATE_UC, PWQR_UC_DELAY);
        } else if (sb->running > sb->concurrency) {
                pwqr_arm_timer(sb, PWQR_STATE_OC, PWQR_OC_DELAY);
        } else {
                sb->state = PWQR_STATE_NONE;
                if (timer_pending(&sb->timer))
                        del_timer(&sb->timer);
        }
}

static void pwqr_sb_timer_cb(unsigned long arg)
{
        struct pwqr_sb *sb = (struct pwqr_sb *)arg;
        unsigned long flags;

        pwqr_sb_lock_irqsave(sb, flags);
        if (sb->running < sb->concurrency && sb->waiting == 0 && sb->parked) {
                if (sb->overcommit_wakes == 0)
                        wake_up_locked(&sb->wqh);
        }
        if (sb->running > sb->concurrency) {
                printk(KERN_DEBUG "pwqr: wake up poll\n");
                wake_up_poll(&sb->wqh_poll, POLLIN);
        }
        pwqr_sb_unlock_irqrestore(sb, flags);
}

static struct pwqr_sb *pwqr_sb_create(void)
{
        struct pwqr_sb *sb;

        sb = kzalloc(sizeof(struct pwqr_sb), GFP_KERNEL);
        if (sb == NULL)
                return ERR_PTR(-ENOMEM);

        kref_init(&sb->kref);
        init_waitqueue_head(&sb->wqh);
        init_waitqueue_head(&sb->wqh_poll);
        sb->concurrency    = num_online_cpus();
        init_timer(&sb->timer);
        sb->timer.function = pwqr_sb_timer_cb;
        sb->timer.data     = (unsigned long)sb;

        __module_get(THIS_MODULE);
        return sb;
}
static inline void pwqr_sb_get(struct pwqr_sb *sb)
{
        kref_get(&sb->kref);
}

static void pwqr_sb_finalize(struct rcu_head *rcu)
{
        struct pwqr_sb *sb = container_of(rcu, struct pwqr_sb, rcu);

        module_put(THIS_MODULE);
        kfree(sb);
}

static void pwqr_sb_release(struct kref *kref)
{
        struct pwqr_sb *sb = container_of(kref, struct pwqr_sb, kref);

        del_timer_sync(&sb->timer);
        wake_up_poll(&sb->wqh_poll, POLLHUP);
        call_rcu(&sb->rcu, pwqr_sb_finalize);
}
static inline void pwqr_sb_put(struct pwqr_sb *sb)
{
        kref_put(&sb->kref, pwqr_sb_release);
}

/*****************************************************************************
 * tasks
 */
#if IS_PRE_2_6_23
static inline struct pwqr_task_bucket *task_hbucket(struct task_struct *task)
{
        return &pwqr_tasks_hash[hash_ptr(task, PWQR_HASH_BITS)];
}

static struct pwqr_task *pwqr_task_find(struct task_struct *task)
{
        struct pwqr_task_bucket *b = task_hbucket(task);
        struct hlist_node *node;
        struct pwqr_task *it, *pwqt = NULL;

        spin_lock(&b->lock);
        hlist_for_each_entry(it, node, &b->tasks, link) {
                if (it->task == task) {
                        pwqt = it;
                        break;
                }
        }
        spin_unlock(&b->lock);
        return pwqt;
}
#else
static struct pwqr_task *pwqr_task_find(struct task_struct *task)
{
        struct hlist_node       *node;
        struct preempt_notifier *it;
        struct pwqr_task        *pwqt = NULL;

        hlist_for_each_entry(it, node, &task->preempt_notifiers, link) {
                if (it->ops == &pwqr_preempt_running_ops ||
                    it->ops == &pwqr_preempt_blocked_ops ||
                    it->ops == &pwqr_preempt_noop_ops)
                {
                        pwqt = container_of(it, struct pwqr_task, notifier);
                        break;
                }
        }

        return pwqt;
}
#endif

static struct pwqr_task *pwqr_task_create(struct task_struct *task)
{
        struct pwqr_task *pwqt;

        pwqt = kmalloc(sizeof(*pwqt), GFP_KERNEL);
        if (pwqt == NULL)
                return ERR_PTR(-ENOMEM);

        preempt_notifier_init(&pwqt->notifier, &pwqr_preempt_running_ops);
        preempt_notifier_register(&pwqt->notifier);
#if IS_PRE_2_6_23
        {
                struct pwqr_task_bucket *b = task_hbucket(task);

                pwqt->task = task;
                spin_lock(&b->lock);
                hlist_add_head(&pwqt->link, &b->tasks);
                spin_unlock(&b->lock);
        }
#endif
        return pwqt;
}

__cold
static void pwqr_task_detach(struct pwqr_task *pwqt, struct pwqr_sb *sb)
{
        unsigned long flags;

        pwqr_sb_lock_irqsave(sb, flags);
        sb->registered--;
        if (pwqt->notifier.ops == &pwqr_preempt_running_ops) {
                __pwqr_sb_update_state(sb, -1);
        } else {
                __pwqr_sb_update_state(sb, 0);
        }
        pwqr_sb_unlock_irqrestore(sb, flags);
        pwqr_sb_put(sb);
        pwqt->sb = NULL;
}

__cold
static void pwqr_task_attach(struct pwqr_task *pwqt, struct pwqr_sb *sb)
{
        unsigned long flags;

        pwqr_sb_lock_irqsave(sb, flags);
        pwqr_sb_get(pwqt->sb = sb);
        sb->registered++;
        __pwqr_sb_update_state(sb, 1);
        pwqr_sb_unlock_irqrestore(sb, flags);
}

__cold
static void pwqr_task_release(struct pwqr_task *pwqt, bool from_notifier)
{
#if IS_PRE_2_6_23
        struct pwqr_task_bucket *b = task_hbucket(pwqt->task);

        spin_lock(&b->lock);
        hlist_del(&pwqt->link);
        spin_unlock(&b->lock);
#endif
        pwqt->notifier.ops = &pwqr_preempt_noop_ops;

        if (from_notifier) {
                /* When called from sched_{out,in}, it's not allowed to
                 * call preempt_notifier_unregister (or worse, kfree()).
                 *
                 * kfree()ing a still registered callback is only safe
                 * because we're dying; if we were not, it would panic on
                 * the next sched_{in,out} call.
                 */
                BUG_ON(!(current->state & TASK_DEAD));
                kfree_rcu(pwqt, rcu);
        } else {
                preempt_notifier_unregister(&pwqt->notifier);
                kfree(pwqt);
        }
}

static void pwqr_task_noop_sched_in(struct preempt_notifier *notifier, int cpu)
{
}

static void pwqr_task_noop_sched_out(struct preempt_notifier *notifier,
                                     struct task_struct *next)
{
}

static void pwqr_task_blocked_sched_in(struct preempt_notifier *notifier, int cpu)
{
        struct pwqr_task *pwqt = container_of(notifier, struct pwqr_task, notifier);
        struct pwqr_sb   *sb   = pwqt->sb;
        unsigned long flags;

        if (unlikely(sb->state < 0)) {
                pwqr_task_detach(pwqt, sb);
                pwqr_task_release(pwqt, true);
                return;
        }

        pwqt->notifier.ops = &pwqr_preempt_running_ops;
        pwqr_sb_lock_irqsave(sb, flags);
        __pwqr_sb_update_state(sb, 1);
        pwqr_sb_unlock_irqrestore(sb, flags);
}

static void pwqr_task_sched_out(struct preempt_notifier *notifier,
                                struct task_struct *next)
{
        struct pwqr_task   *pwqt = container_of(notifier, struct pwqr_task, notifier);
        struct pwqr_sb     *sb   = pwqt->sb;
        struct task_struct *p    = current;

        if (unlikely(p->state & TASK_DEAD) || unlikely(sb->state < 0)) {
                pwqr_task_detach(pwqt, sb);
                pwqr_task_release(pwqt, true);
                return;
        }
        if (p->state == 0 || (p->state & (__TASK_STOPPED | __TASK_TRACED)))
                return;

        pwqt->notifier.ops = &pwqr_preempt_blocked_ops;
        /* see preempt.h: irqs are disabled for sched_out */
        spin_lock(&sb->wqh.lock);
        __pwqr_sb_update_state(sb, -1);
        spin_unlock(&sb->wqh.lock);
}

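/*
 * Each registered thread carries exactly one of the preempt_ops below:
 * "running" while it is accounted as running (sched_out then decrements the
 * scoreboard and switches it to "blocked"), "blocked" while it sleeps
 * outside of pwqr_ctl (sched_in increments the scoreboard and switches it
 * back to "running"), and "noop" once the pwqr_task has been released but
 * the notifier could not be unregistered from the notifier callback itself.
 */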
static struct preempt_ops __read_mostly pwqr_preempt_noop_ops = {
        .sched_in       = pwqr_task_noop_sched_in,
        .sched_out      = pwqr_task_noop_sched_out,
};

static struct preempt_ops __read_mostly pwqr_preempt_running_ops = {
        .sched_in       = pwqr_task_noop_sched_in,
        .sched_out      = pwqr_task_sched_out,
};

static struct preempt_ops __read_mostly pwqr_preempt_blocked_ops = {
        .sched_in       = pwqr_task_blocked_sched_in,
        .sched_out      = pwqr_task_sched_out,
};

/*****************************************************************************
 * file descriptor
 */
static int pwqr_open(struct inode *inode, struct file *filp)
{
        struct pwqr_sb *sb;

        sb = pwqr_sb_create();
        if (IS_ERR(sb))
                return PTR_ERR(sb);
        filp->private_data = sb;
        return 0;
}

static int pwqr_release(struct inode *inode, struct file *filp)
{
        struct pwqr_sb *sb = filp->private_data;
        unsigned long flags;

        pwqr_sb_lock_irqsave(sb, flags);
        sb->state = PWQR_STATE_DEAD;
        pwqr_sb_unlock_irqrestore(sb, flags);
        wake_up_all(&sb->wqh);
        pwqr_sb_put(sb);
        return 0;
}

static unsigned int pwqr_poll(struct file *filp, poll_table *wait)
{
        struct pwqr_sb *sb = filp->private_data;
        unsigned int events = 0;
        unsigned long flags;

        poll_wait(filp, &sb->wqh_poll, wait);

        pwqr_sb_lock_irqsave(sb, flags);
        if (sb->running > sb->concurrency)
                events |= POLLIN;
        if (sb->state < 0)
                events |= POLLHUP;
        pwqr_sb_unlock_irqrestore(sb, flags);

        return events;
}

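/*
 * Backing helper for read(2) on the pwqr file descriptor: block (unless
 * O_NONBLOCK is set) until the scoreboard is overcommitting, then report
 * by how many threads "running" exceeds "concurrency". poll(2) signals
 * POLLIN under the same condition, so userland can notice when it should
 * park some workers.
 */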
static inline ssize_t pwqr_sb_read(struct pwqr_sb *sb, int no_wait, u32 *cnt)
{
        DECLARE_WAITQUEUE(wait, current);
        ssize_t rc = -EAGAIN;

        spin_lock_irq(&sb->wqh.lock);
        if (sb->running > sb->concurrency) {
                rc = 0;
        } else if (!no_wait) {
                add_wait_queue(&sb->wqh_poll, &wait);
                for (;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (sb->running > sb->concurrency) {
                                rc = 0;
                                break;
                        }
                        if (signal_pending(current)) {
                                rc = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&sb->wqh.lock);
                        schedule();
                        spin_lock_irq(&sb->wqh.lock);
                }
                remove_wait_queue(&sb->wqh_poll, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(rc == 0))
                *cnt = sb->running - sb->concurrency;
        spin_unlock_irq(&sb->wqh.lock);

        return rc;
}

static ssize_t
pwqr_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
        struct pwqr_sb *sb = filp->private_data;
        u32 cnt = 0;
        ssize_t rc;

        if (count < sizeof(cnt))
                return -EINVAL;
        rc = pwqr_sb_read(sb, filp->f_flags & O_NONBLOCK, &cnt);
        if (rc < 0)
                return rc;
        return put_user(cnt, (u32 __user *)buf) ? -EFAULT : sizeof(cnt);
}

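/*
 * Common implementation for PWQR_CTL_WAIT (is_wait != 0) and PWQR_CTL_PARK
 * (is_wait == 0). The calling thread's preempt notifier is unregistered for
 * the duration of the call, so the controlled sleep below is not accounted
 * through sched_out/sched_in; the scoreboard is adjusted explicitly instead.
 * For WAIT, the value at pwqr_uaddr is compared with pwqr_ticket and
 * -EWOULDBLOCK is returned when they differ, i.e. the ticket changed since
 * the caller sampled it. A thread woken only to find the pool still
 * overcommitting gets -EDQUOT.
 */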
static long
do_pwqr_wait(struct pwqr_sb *sb, struct pwqr_task *pwqt,
             int is_wait, struct pwqr_ioc_wait __user *arg)
{
        unsigned long flags;
        struct pwqr_ioc_wait wait;
        long rc = 0;
        u32 uval;

        preempt_notifier_unregister(&pwqt->notifier);

        if (is_wait) {
                if (copy_from_user(&wait, arg, sizeof(wait))) {
                        rc = -EFAULT;
                        goto out;
                }
                if (unlikely((long)wait.pwqr_uaddr % sizeof(int) != 0)) {
                        rc = -EINVAL;
                        goto out;
                }
        }

        pwqr_sb_lock_irqsave(sb, flags);
        if (sb->running + sb->waiting <= sb->concurrency) {
                if (is_wait) {
                        while (probe_kernel_address(wait.pwqr_uaddr, uval)) {
                                pwqr_sb_unlock_irqrestore(sb, flags);
                                rc = get_user(uval, (u32 __user *)wait.pwqr_uaddr);
                                if (rc)
                                        goto out;
                                pwqr_sb_lock_irqsave(sb, flags);
                        }

                        if (uval != (u32)wait.pwqr_ticket) {
                                rc = -EWOULDBLOCK;
                                goto out_unlock;
                        }
                } else {
                        goto out_unlock;
                }
        }

        /* @see <wait_event_interruptible_exclusive_locked_irq> */
        if (likely(sb->state >= 0)) {
                DEFINE_WAIT(__wait);
                __wait.flags |= WQ_FLAG_EXCLUSIVE;

                if (is_wait) {
                        sb->waiting++;
                        __add_wait_queue(&sb->wqh, &__wait);
                } else {
                        sb->parked++;
                        __add_wait_queue_tail(&sb->wqh, &__wait);
                }
                __pwqr_sb_update_state(sb, -1);

                do {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (sb->overcommit_wakes)
                                break;
                        if (signal_pending(current)) {
                                rc = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&sb->wqh.lock);
                        schedule();
                        spin_lock_irq(&sb->wqh.lock);
                        if (is_wait)
                                break;
                        if (sb->running + sb->waiting < sb->concurrency)
                                break;
                } while (likely(sb->state >= 0));

                __remove_wait_queue(&sb->wqh, &__wait);
                __set_current_state(TASK_RUNNING);
                if (is_wait) {
                        sb->waiting--;
                } else {
                        sb->parked--;
                }
                __pwqr_sb_update_state(sb, 1);

                if (sb->overcommit_wakes)
                        sb->overcommit_wakes--;
                if (sb->waiting + sb->running > sb->concurrency)
                        rc = -EDQUOT;
        }

out_unlock:
        if (unlikely(sb->state < 0))
                rc = -EBADFD;
        pwqr_sb_unlock_irqrestore(sb, flags);
out:
        preempt_notifier_register(&pwqt->notifier);
        return rc;
}

static long do_pwqr_unregister(struct pwqr_sb *sb, struct pwqr_task *pwqt)
{
        if (!pwqt)
                return -EINVAL;
        if (pwqt->sb != sb)
                return -ENOENT;
        pwqr_task_detach(pwqt, sb);
        pwqr_task_release(pwqt, false);
        return 0;
}

static long do_pwqr_set_conc(struct pwqr_sb *sb, int conc)
{
        long old_conc;
        unsigned long flags;

        pwqr_sb_lock_irqsave(sb, flags);
        old_conc = sb->concurrency;
        if (conc <= 0)
                conc = num_online_cpus();
        if (conc != old_conc) {
                sb->concurrency = conc;
                __pwqr_sb_update_state(sb, 0);
        }
        pwqr_sb_unlock_irqrestore(sb, flags);

        return old_conc;
}

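/*
 * PWQR_CTL_WAKE (oc == 0) wakes up to "count" sleeping threads, but never
 * more than would keep "running" within the concurrency target, while
 * PWQR_CTL_WAKE_OC (oc != 0) deliberately wakes up to "count" threads even
 * if that overcommits. Returns the number of wake-ups actually accounted
 * for (see the quarantine comment below for the one case where a thread is
 * woken but 0 is returned).
 */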
static long do_pwqr_wake(struct pwqr_sb *sb, int oc, int count)
{
        unsigned long flags;
        int nwake;

        if (count < 0)
                return -EINVAL;

        pwqr_sb_lock_irqsave(sb, flags);

        if (oc) {
                nwake = sb->waiting + sb->parked - sb->overcommit_wakes;
                if (count > nwake) {
                        count = nwake;
                } else {
                        nwake = count;
                }
                sb->overcommit_wakes += count;
        } else if (sb->running + sb->overcommit_wakes < sb->concurrency) {
                nwake = sb->concurrency - sb->overcommit_wakes - sb->running;
                if (nwake > sb->waiting + sb->parked - sb->overcommit_wakes) {
                        nwake = sb->waiting + sb->parked -
                                sb->overcommit_wakes;
                }
                if (count > nwake) {
                        count = nwake;
                } else {
                        nwake = count;
                }
        } else {
                /*
                 * This codepath deserves an explanation: waking a thread
                 * "for real" would overcommit, even though userspace KNOWS
                 * there is at least one waiting thread. Such threads are
                 * "quarantined".
                 *
                 * Quarantined threads are woken up one by one, to allow a
                 * slow ramp down, trying to minimize "waiting" <-> "parked"
                 * flip-flops, no matter how many wakes have been asked for.
                 *
                 * Since releasing one quarantined thread wakes up a thread
                 * that will (almost) immediately go back to parked mode, we
                 * lie to userland about having unblocked that thread, and
                 * return 0.
                 *
                 * Though if we're already waking all waiting threads for
                 * overcommitting jobs, well, we don't need that.
                 */
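                /*
                 * Worked example with made-up numbers: concurrency = 4,
                 * running = 4, waiting = 2, parked = 0, overcommit_wakes = 0.
                 * A non-OC wake lands here, nwake evaluates to 1 (waiting >
                 * overcommit_wakes), so a single quarantined thread is
                 * released while the returned count stays 0.
                 */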
                count = 0;
                nwake = sb->waiting > sb->overcommit_wakes;
        }
        while (nwake-- > 0)
                wake_up_locked(&sb->wqh);
        pwqr_sb_unlock_irqrestore(sb, flags);

        return count;
}

static long pwqr_ioctl(struct file *filp, unsigned command, unsigned long arg)
{
        struct pwqr_sb     *sb   = filp->private_data;
        struct task_struct *task = current;
        struct pwqr_task   *pwqt;
        int rc = 0;

        switch (command) {
        case PWQR_CTL_GET_CONC:
                return sb->concurrency;
        case PWQR_CTL_SET_CONC:
                return do_pwqr_set_conc(sb, (int)arg);

        case PWQR_CTL_WAKE:
        case PWQR_CTL_WAKE_OC:
                return do_pwqr_wake(sb, command == PWQR_CTL_WAKE_OC, (int)arg);

        case PWQR_CTL_WAIT:
        case PWQR_CTL_PARK:
        case PWQR_CTL_REGISTER:
        case PWQR_CTL_UNREGISTER:
                break;
        default:
                return -EINVAL;
        }

        pwqt = pwqr_task_find(task);
        if (command == PWQR_CTL_UNREGISTER)
                return do_pwqr_unregister(sb, pwqt);

        if (pwqt == NULL) {
                pwqt = pwqr_task_create(task);
                if (IS_ERR(pwqt))
                        return PTR_ERR(pwqt);
                pwqr_task_attach(pwqt, sb);
        } else if (unlikely(pwqt->sb != sb)) {
                pwqr_task_detach(pwqt, pwqt->sb);
                pwqr_task_attach(pwqt, sb);
        }

        switch (command) {
        case PWQR_CTL_WAIT:
                rc = do_pwqr_wait(sb, pwqt, true, (struct pwqr_ioc_wait __user *)arg);
                break;
        case PWQR_CTL_PARK:
                rc = do_pwqr_wait(sb, pwqt, false, NULL);
                break;
        }
        return rc;
}

static const struct file_operations pwqr_dev_fops = {
        .owner          = THIS_MODULE,
        .open           = pwqr_open,
        .release        = pwqr_release,
        .poll           = pwqr_poll,
        .read           = pwqr_read,
        .llseek         = noop_llseek,
        .unlocked_ioctl = pwqr_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = pwqr_ioctl,
#endif
};

/*****************************************************************************
 * module
 */
static int __init pwqr_start(void)
{
#if IS_PRE_2_6_23
        int i;

        for (i = 0; i < PWQR_HASH_SIZE; i++) {
                spin_lock_init(&pwqr_tasks_hash[i].lock);
                INIT_HLIST_HEAD(&pwqr_tasks_hash[i].tasks);
        }
#endif

        /* Register as a character device */
        pwqr_major = register_chrdev(0, "pwqr", &pwqr_dev_fops);
        if (pwqr_major < 0) {
                printk(KERN_ERR "pwqr: register_chrdev() failed\n");
                return pwqr_major;
        }

        /* Create a device node */
        pwqr_class = class_create(THIS_MODULE, PWQR_DEVICE_NAME);
        if (IS_ERR(pwqr_class)) {
                printk(KERN_ERR "pwqr: Error creating raw class\n");
                unregister_chrdev(pwqr_major, PWQR_DEVICE_NAME);
                return PTR_ERR(pwqr_class);
        }
        device_create(pwqr_class, NULL, MKDEV(pwqr_major, 0), NULL, PWQR_DEVICE_NAME);
        printk(KERN_INFO "pwqr: PThreads Work Queues Regulator v1 loaded\n");
        return 0;
}

static void __exit pwqr_end(void)
{
        rcu_barrier();
        device_destroy(pwqr_class, MKDEV(pwqr_major, 0));
        class_destroy(pwqr_class);
        unregister_chrdev(pwqr_major, PWQR_DEVICE_NAME);
}

module_init(pwqr_start);
module_exit(pwqr_end);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pierre Habouzit <pierre.habouzit@intersec.com>");
MODULE_DESCRIPTION("PThreads Work Queues Regulator");
#endif

// vim:noet:sw=8:cinoptions+=\:0,L-1,=1s: