X-Git-Url: http://git.madism.org/?p=~madcoder%2Fpwqr.git;a=blobdiff_plain;f=kernel%2Fpwqr.c;h=233bd7a7a9ca6d41d391bef4fda1f67ab889a46d;hp=7ce0700027f6d3c4a77275feccccd11b6a4924e4;hb=e8915e9a336e0e0cef3758e7e10dec23443de6dc;hpb=4d4f64ffd9acc95b7c4f816a537d2c1dff8df367 diff --git a/kernel/pwqr.c b/kernel/pwqr.c index 7ce0700..233bd7a 100644 --- a/kernel/pwqr.c +++ b/kernel/pwqr.c @@ -26,34 +26,85 @@ #include #include #include +#include #include #include #include #include #include #include +#include + +/* + * The pthread workqueue regulator code is for now written as a proof of + * concept module, meant to work with 2.6.23+ kernels or redhat5 ones. + * + * For now it uses a device /dev/pwq, which spawns magic file-descriptors + * supporting a few ioctl operations (see Documentation/pwqr.adoc shipped in + * the same git repository). + * + * This code is meant to be merged into mainline, but after the following + * changes, kept here as a "todolist": + * + * - get rid of the device stuff (which is 100% of the init code for 2.6.23 + * kernels); + * + * - resubmit the patch that makes it possible to call + * preempt_notifier_unregister from sched_in/sched_out (just a matter of a + * hlist_for_each_safe instead of hlist_for_each), and fix + * pwqr_task_release to not require RCU anymore. It makes + * pwqr_preempt_noop_ops go away. + * + * - think about the possibility to add a pwq_notifier pointer directly into + * the task_struct, though it's not *that* necessary, it grows the + * structure for a speed gain we don't really need (making pwqr_ctl + * faster). I think it's okay to crawl the preempt_notifier list instead. + * We may want to add nice "macros" for that though. + * + * - replace the ioctl with a pwqr_ctl syscall + * + * - create a pwqr_create() syscall to create a pwqr file-descriptor. 
+ * + * Summary: most of the code should be untouched or almost not changed, + * pwqr_ioctl adapted to become a syscall, and the module boilerplate replaced + * with pwqr_create() and file-descriptor creation boilerplate instead. But + * looking at fs/eventfd.c this looks rather simple. + */ #ifndef CONFIG_PREEMPT_NOTIFIERS # error PWQ module requires CONFIG_PREEMPT_NOTIFIERS -#endif +#else #include "pwqr.h" -#define PWQR_UNPARK_DELAY (HZ / 10) -#define PWQR_HASH_BITS 5 -#define PWQR_HASH_SIZE (1 << PWQR_HASH_BITS) +#define PWQR_UC_DELAY (HZ / 10) +#define PWQR_OC_DELAY (HZ / 20) -struct pwqr_task_bucket { - spinlock_t lock; - struct hlist_head tasks; -}; +#define PWQR_STATE_NONE 0 +#define PWQR_STATE_UC 1 +#define PWQR_STATE_OC 2 +#define PWQR_STATE_DEAD (-1) + +/* + * This is the first inclusion of CONFIG_PREEMPT_NOTIFIERS in the kernel. + * + * Though I want it to work on older redhat 5 kernels, that have an emulation + * of the feature but not implemented the same way, and instead of linking the + * preempt_notifiers from the task_struct directly, they have a private + * h-table I don't have access to, so I need my own too. + * + * For vanilla kernels we crawl through the task_struct::preempt_notifiers + * hlist until we find our entry, this list is often very short, and it's no + * slower than the global h-table which also crawls a list anyway. 
+ */ +#define IS_PRE_2_6_23 (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23)) struct pwqr_sb { struct kref kref; struct rcu_head rcu; struct timer_list timer; wait_queue_head_t wqh; - pid_t tgid; + wait_queue_head_t wqh_poll; unsigned concurrency; unsigned registered; @@ -63,23 +114,38 @@ struct pwqr_sb { unsigned parked; unsigned overcommit_wakes; - unsigned dead; + int state; + unsigned has_pollin; }; struct pwqr_task { struct preempt_notifier notifier; - struct hlist_node link; + struct pwqr_sb *sb; struct rcu_head rcu; +#if IS_PRE_2_6_23 + struct hlist_node link; struct task_struct *task; - struct pwqr_sb *sb; +#endif +}; + +#if IS_PRE_2_6_23 + +#define PWQR_HASH_BITS 5 +#define PWQR_HASH_SIZE (1 << PWQR_HASH_BITS) + +struct pwqr_task_bucket { + spinlock_t lock; + struct hlist_head tasks; }; +static struct pwqr_task_bucket pwqr_tasks_hash[PWQR_HASH_SIZE]; +#endif + /* * Global variables */ static struct class *pwqr_class; static int pwqr_major; -static struct pwqr_task_bucket pwqr_tasks_hash[PWQR_HASH_SIZE]; static struct preempt_ops pwqr_preempt_running_ops; static struct preempt_ops pwqr_preempt_blocked_ops; static struct preempt_ops pwqr_preempt_noop_ops; @@ -93,22 +159,27 @@ static struct preempt_ops pwqr_preempt_noop_ops; #define pwqr_sb_unlock_irqrestore(sb, flags) \ spin_unlock_irqrestore(&(sb)->wqh.lock, flags) +static inline void pwqr_arm_timer(struct pwqr_sb *sb, int how, int delay) +{ + if (timer_pending(&sb->timer) && sb->state == how) + return; + mod_timer(&sb->timer, jiffies + delay); + sb->state = how; +} + static inline void __pwqr_sb_update_state(struct pwqr_sb *sb, int running_delta) { sb->running += running_delta; - if (sb->running > sb->concurrency) { - /* TODO see ../Documentation/pwqr.adoc */ - } else if (sb->running == sb->concurrency) { - /* do nothing */ - } else if (sb->waiting == 0 && sb->parked) { - if (!timer_pending(&sb->timer)) { - mod_timer(&sb->timer, jiffies + PWQR_UNPARK_DELAY); - } - return; - } - if (timer_pending(&sb->timer)) 
- del_timer(&sb->timer); + if (sb->running < sb->concurrency && sb->waiting == 0 && sb->parked) { + pwqr_arm_timer(sb, PWQR_STATE_UC, PWQR_UC_DELAY); + } else if (sb->running > sb->concurrency) { + pwqr_arm_timer(sb, PWQR_STATE_OC, PWQR_OC_DELAY); + } else { + sb->state = PWQR_STATE_NONE; + if (!timer_pending(&sb->timer)) + del_timer(&sb->timer); + } } static void pwqr_sb_timer_cb(unsigned long arg) @@ -117,10 +188,15 @@ static void pwqr_sb_timer_cb(unsigned long arg) unsigned long flags; pwqr_sb_lock_irqsave(sb, flags); - if (sb->waiting == 0 && sb->parked && sb->running < sb->concurrency) { + if (sb->running < sb->concurrency && sb->waiting == 0 && sb->parked) { if (sb->overcommit_wakes == 0) wake_up_locked(&sb->wqh); } + if (sb->running > sb->concurrency) { + printk(KERN_DEBUG "wake up poll"); + wake_up_poll(&sb->wqh_poll, POLLIN); + sb->has_pollin = 1; + } pwqr_sb_unlock_irqrestore(sb, flags); } @@ -134,8 +210,8 @@ static struct pwqr_sb *pwqr_sb_create(void) kref_init(&sb->kref); init_waitqueue_head(&sb->wqh); - sb->tgid = current->tgid; - sb->concurrency = num_online_cpus(); + init_waitqueue_head(&sb->wqh_poll); + sb->concurrency = num_online_cpus(); init_timer(&sb->timer); sb->timer.function = pwqr_sb_timer_cb; sb->timer.data = (unsigned long)sb; @@ -161,6 +237,7 @@ static void pwqr_sb_release(struct kref *kref) struct pwqr_sb *sb = container_of(kref, struct pwqr_sb, kref); del_timer_sync(&sb->timer); + wake_up_poll(&sb->wqh_poll, POLLHUP); call_rcu(&sb->rcu, pwqr_sb_finalize); } static inline void pwqr_sb_put(struct pwqr_sb *sb) @@ -171,6 +248,7 @@ static inline void pwqr_sb_put(struct pwqr_sb *sb) /***************************************************************************** * tasks */ +#if IS_PRE_2_6_23 static inline struct pwqr_task_bucket *task_hbucket(struct task_struct *task) { return &pwqr_tasks_hash[hash_ptr(task, PWQR_HASH_BITS)]; @@ -190,10 +268,29 @@ static struct pwqr_task *pwqr_task_find(struct task_struct *task) spin_unlock(&b->lock); return 
pwqt; } +#else +static struct pwqr_task *pwqr_task_find(struct task_struct *task) +{ + struct hlist_node *node; + struct preempt_notifier *it; + struct pwqr_task *pwqt = NULL; + + hlist_for_each_entry(it, node, &task->preempt_notifiers, link) { + if (it->ops == &pwqr_preempt_running_ops || + it->ops == &pwqr_preempt_blocked_ops || + it->ops == &pwqr_preempt_noop_ops) + { + pwqt = container_of(it, struct pwqr_task, notifier); + break; + } + } + + return pwqt; +} +#endif static struct pwqr_task *pwqr_task_create(struct task_struct *task) { - struct pwqr_task_bucket *b = task_hbucket(task); struct pwqr_task *pwqt; pwqt = kmalloc(sizeof(*pwqt), GFP_KERNEL); @@ -202,12 +299,16 @@ static struct pwqr_task *pwqr_task_create(struct task_struct *task) preempt_notifier_init(&pwqt->notifier, &pwqr_preempt_running_ops); preempt_notifier_register(&pwqt->notifier); - pwqt->task = task; - - spin_lock(&b->lock); - hlist_add_head(&pwqt->link, &b->tasks); - spin_unlock(&b->lock); - +#if IS_PRE_2_6_23 + { + struct pwqr_task_bucket *b = task_hbucket(task); + + pwqt->task = task; + spin_lock(&b->lock); + hlist_add_head(&pwqt->link, &b->tasks); + spin_unlock(&b->lock); + } +#endif return pwqt; } @@ -243,11 +344,13 @@ static void pwqr_task_attach(struct pwqr_task *pwqt, struct pwqr_sb *sb) __cold static void pwqr_task_release(struct pwqr_task *pwqt, bool from_notifier) { +#if IS_PRE_2_6_23 struct pwqr_task_bucket *b = task_hbucket(pwqt->task); spin_lock(&b->lock); hlist_del(&pwqt->link); spin_unlock(&b->lock); +#endif pwqt->notifier.ops = &pwqr_preempt_noop_ops; if (from_notifier) { @@ -258,7 +361,7 @@ static void pwqr_task_release(struct pwqr_task *pwqt, bool from_notifier) * callbacks if we're not dying, it'll panic on the next * sched_{in,out} call. 
*/ - BUG_ON(!(pwqt->task->state & TASK_DEAD)); + BUG_ON(!(current->state & TASK_DEAD)); kfree_rcu(pwqt, rcu); } else { preempt_notifier_unregister(&pwqt->notifier); @@ -281,7 +384,7 @@ static void pwqr_task_blocked_sched_in(struct preempt_notifier *notifier, int cp struct pwqr_sb *sb = pwqt->sb; unsigned long flags; - if (unlikely(sb->dead)) { + if (unlikely(sb->state < 0)) { pwqr_task_detach(pwqt, sb); pwqr_task_release(pwqt, true); return; @@ -294,13 +397,13 @@ static void pwqr_task_blocked_sched_in(struct preempt_notifier *notifier, int cp } static void pwqr_task_sched_out(struct preempt_notifier *notifier, - struct task_struct *next) + struct task_struct *next) { - struct pwqr_task *pwqt = container_of(notifier, struct pwqr_task, notifier); - struct pwqr_sb *sb = pwqt->sb; - struct task_struct *p = pwqt->task; + struct pwqr_task *pwqt = container_of(notifier, struct pwqr_task, notifier); + struct pwqr_sb *sb = pwqt->sb; + struct task_struct *p = current; - if (unlikely(p->state & TASK_DEAD) || unlikely(sb->dead)) { + if (unlikely(p->state & TASK_DEAD) || unlikely(sb->state < 0)) { pwqr_task_detach(pwqt, sb); pwqr_task_release(pwqt, true); return; @@ -350,16 +453,85 @@ static int pwqr_release(struct inode *inode, struct file *filp) unsigned long flags; pwqr_sb_lock_irqsave(sb, flags); - sb->dead = true; + sb->state = PWQR_STATE_DEAD; pwqr_sb_unlock_irqrestore(sb, flags); wake_up_all(&sb->wqh); pwqr_sb_put(sb); return 0; } +static unsigned int pwqr_poll(struct file *filp, poll_table *wait) +{ + struct pwqr_sb *sb = filp->private_data; + unsigned int events = 0; + unsigned long flags; + + poll_wait(filp, &sb->wqh_poll, wait); + + pwqr_sb_lock_irqsave(sb, flags); + if (sb->has_pollin) + events |= POLLIN; + if (sb->state < 0) + events |= POLLHUP; + pwqr_sb_unlock_irqrestore(sb, flags); + + return events; +} + +static inline ssize_t pwqr_sb_read(struct pwqr_sb *sb, int no_wait, u32 *cnt) +{ + DECLARE_WAITQUEUE(wait, current); + ssize_t rc = -EAGAIN; + + 
spin_lock_irq(&sb->wqh.lock); + if (sb->running > sb->concurrency) { + rc = 0; + } else if (!no_wait) { + add_wait_queue(&sb->wqh_poll, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (sb->running > sb->concurrency) { + rc = 0; + break; + } + if (signal_pending(current)) { + rc = -ERESTARTSYS; + break; + } + spin_unlock_irq(&sb->wqh.lock); + schedule(); + spin_lock_irq(&sb->wqh.lock); + } + remove_wait_queue(&sb->wqh_poll, &wait); + __set_current_state(TASK_RUNNING); + } + if (likely(rc == 0)) { + *cnt = sb->running - sb->concurrency; + sb->has_pollin = 0; + } + spin_unlock_irq(&sb->wqh.lock); + + return rc; +} + +static ssize_t +pwqr_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) +{ + struct pwqr_sb *sb = filp->private_data; + u32 cnt = 0; + ssize_t rc; + + if (count < sizeof(cnt)) + return -EINVAL; + rc = pwqr_sb_read(sb, filp->f_flags & O_NONBLOCK, &cnt); + if (rc < 0) + return rc; + return put_user(cnt, (u32 __user *)buf) ? -EFAULT : sizeof(cnt); +} + static long do_pwqr_wait(struct pwqr_sb *sb, struct pwqr_task *pwqt, - int is_wait, struct pwqr_ioc_wait __user *arg) + int is_wait, struct pwqr_ioc_wait __user *arg) { unsigned long flags; struct pwqr_ioc_wait wait; @@ -400,9 +572,8 @@ do_pwqr_wait(struct pwqr_sb *sb, struct pwqr_task *pwqt, } /* @ see */ - if (likely(!sb->dead)) { + if (likely(sb->state >= 0)) { DEFINE_WAIT(__wait); - __wait.flags |= WQ_FLAG_EXCLUSIVE; if (is_wait) { @@ -413,9 +584,9 @@ do_pwqr_wait(struct pwqr_sb *sb, struct pwqr_task *pwqt, __add_wait_queue_tail(&sb->wqh, &__wait); } __pwqr_sb_update_state(sb, -1); - set_current_state(TASK_INTERRUPTIBLE); do { + set_current_state(TASK_INTERRUPTIBLE); if (sb->overcommit_wakes) break; if (signal_pending(current)) { @@ -429,17 +600,17 @@ do_pwqr_wait(struct pwqr_sb *sb, struct pwqr_task *pwqt, break; if (sb->running + sb->waiting < sb->concurrency) break; - } while (likely(!sb->dead)); + } while (likely(sb->state >= 0)); __remove_wait_queue(&sb->wqh, 
&__wait); __set_current_state(TASK_RUNNING); - if (is_wait) { sb->waiting--; } else { sb->parked--; } __pwqr_sb_update_state(sb, 1); + if (sb->overcommit_wakes) sb->overcommit_wakes--; if (sb->waiting + sb->running > sb->concurrency) @@ -447,7 +618,7 @@ do_pwqr_wait(struct pwqr_sb *sb, struct pwqr_task *pwqt, } out_unlock: - if (unlikely(sb->dead)) + if (unlikely(sb->state < 0)) rc = -EBADFD; pwqr_sb_unlock_irqrestore(sb, flags); out: @@ -543,35 +714,32 @@ static long do_pwqr_wake(struct pwqr_sb *sb, int oc, int count) static long pwqr_ioctl(struct file *filp, unsigned command, unsigned long arg) { - struct pwqr_sb *sb = filp->private_data; + struct pwqr_sb *sb = filp->private_data; struct task_struct *task = current; - struct pwqr_task *pwqt; + struct pwqr_task *pwqt; int rc = 0; - if (sb->tgid != current->tgid) - return -EBADFD; - switch (command) { - case PWQR_GET_CONC: + case PWQR_CTL_GET_CONC: return sb->concurrency; - case PWQR_SET_CONC: + case PWQR_CTL_SET_CONC: return do_pwqr_set_conc(sb, (int)arg); - case PWQR_WAKE: - case PWQR_WAKE_OC: - return do_pwqr_wake(sb, command == PWQR_WAKE_OC, (int)arg); + case PWQR_CTL_WAKE: + case PWQR_CTL_WAKE_OC: + return do_pwqr_wake(sb, command == PWQR_CTL_WAKE_OC, (int)arg); - case PWQR_WAIT: - case PWQR_PARK: - case PWQR_REGISTER: - case PWQR_UNREGISTER: + case PWQR_CTL_WAIT: + case PWQR_CTL_PARK: + case PWQR_CTL_REGISTER: + case PWQR_CTL_UNREGISTER: break; default: return -EINVAL; } pwqt = pwqr_task_find(task); - if (command == PWQR_UNREGISTER) + if (command == PWQR_CTL_UNREGISTER) return do_pwqr_unregister(sb, pwqt); if (pwqt == NULL) { @@ -585,18 +753,13 @@ static long pwqr_ioctl(struct file *filp, unsigned command, unsigned long arg) } switch (command) { - case PWQR_WAIT: + case PWQR_CTL_WAIT: rc = do_pwqr_wait(sb, pwqt, true, (struct pwqr_ioc_wait __user *)arg); break; - case PWQR_PARK: + case PWQR_CTL_PARK: rc = do_pwqr_wait(sb, pwqt, false, NULL); break; } - - if (unlikely(sb->dead)) { - pwqr_task_detach(pwqt, 
pwqt->sb); - return -EBADFD; - } return rc; } @@ -604,6 +767,9 @@ static const struct file_operations pwqr_dev_fops = { .owner = THIS_MODULE, .open = pwqr_open, .release = pwqr_release, + .poll = pwqr_poll, + .read = pwqr_read, + .llseek = noop_llseek, .unlocked_ioctl = pwqr_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = pwqr_ioctl, @@ -615,12 +781,14 @@ static const struct file_operations pwqr_dev_fops = { */ static int __init pwqr_start(void) { +#if IS_PRE_2_6_23 int i; for (i = 0; i < PWQR_HASH_SIZE; i++) { spin_lock_init(&pwqr_tasks_hash[i].lock); INIT_HLIST_HEAD(&pwqr_tasks_hash[i].tasks); } +#endif /* Register as a character device */ pwqr_major = register_chrdev(0, "pwqr", &pwqr_dev_fops); @@ -655,5 +823,6 @@ module_exit(pwqr_end); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pierre Habouzit "); MODULE_DESCRIPTION("PThreads Work Queues Regulator"); +#endif // vim:noet:sw=8:cinoptions+=\:0,L-1,=1s: