Mailing List Archive

[patch 05/11] syslets: core code
From: Ingo Molnar <mingo@elte.hu>

the core syslet / async system calls infrastructure code.

Is built only if CONFIG_ASYNC_SUPPORT is enabled.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
kernel/Makefile | 1
kernel/async.c | 811 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 812 insertions(+)

Index: linux/kernel/Makefile
===================================================================
--- linux.orig/kernel/Makefile
+++ linux/kernel/Makefile
@@ -10,6 +10,7 @@ obj-y = sched.o fork.o exec_domain.o
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o latency.o nsproxy.o srcu.o

+obj-$(CONFIG_ASYNC_SUPPORT) += async.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
Index: linux/kernel/async.c
===================================================================
--- /dev/null
+++ linux/kernel/async.c
@@ -0,0 +1,811 @@
+/*
+ * kernel/async.c
+ *
+ * The syslet subsystem - asynchronous syscall execution support.
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * This file is released under the GPLv2.
+ *
+ * This code implements asynchronous syscalls via 'syslets'.
+ *
+ * Syslets consist of a set of 'syslet atoms' which are residing
+ * purely in user-space memory and have no kernel-space resource
+ * attached to them. These atoms can be linked to each other via
+ * pointers. Besides the fundamental ability to execute system
+ * calls, syslet atoms can also implement branches, loops and
+ * arithmetics.
+ *
+ * Thus syslets can be used to build small autonomous programs that
+ * the kernel can execute purely from kernel-space, without having
+ * to return to any user-space context. Syslets can be run by any
+ * unprivileged user-space application - they are executed safely
+ * by the kernel.
+ */
+#include <linux/syscalls.h>
+#include <linux/syslet.h>
+#include <linux/delay.h>
+#include <linux/async.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/err.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+
+#include "async.h"
+
+typedef asmlinkage long (*syscall_fn_t)(long, long, long, long, long, long);
+
+extern syscall_fn_t sys_call_table[NR_syscalls];
+
+static void
+__mark_async_thread_ready(struct async_thread *at, struct async_head *ah)
+{
+ list_del(&at->entry);
+ list_add_tail(&at->entry, &ah->ready_async_threads);
+ if (list_empty(&ah->busy_async_threads))
+ wake_up(&ah->wait);
+}
+
+static void
+mark_async_thread_ready(struct async_thread *at, struct async_head *ah)
+{
+ spin_lock(&ah->lock);
+ __mark_async_thread_ready(at, ah);
+ spin_unlock(&ah->lock);
+}
+
+static void
+__mark_async_thread_busy(struct async_thread *at, struct async_head *ah)
+{
+ list_del(&at->entry);
+ list_add_tail(&at->entry, &ah->busy_async_threads);
+}
+
+static void
+mark_async_thread_busy(struct async_thread *at, struct async_head *ah)
+{
+ spin_lock(&ah->lock);
+ __mark_async_thread_busy(at, ah);
+ spin_unlock(&ah->lock);
+}
+
+static void
+__async_thread_init(struct task_struct *t, struct async_thread *at,
+ struct async_head *ah)
+{
+ INIT_LIST_HEAD(&at->entry);
+ at->exit = 0;
+ at->task = t;
+ at->ah = ah;
+ at->work = NULL;
+
+ t->at = at;
+ ah->nr_threads++;
+}
+
+static void
+async_thread_init(struct task_struct *t, struct async_thread *at,
+ struct async_head *ah)
+{
+ spin_lock(&ah->lock);
+ __async_thread_init(t, at, ah);
+ __mark_async_thread_ready(at, ah);
+ spin_unlock(&ah->lock);
+}
+
+
+static void
+async_thread_exit(struct async_thread *at, struct task_struct *t)
+{
+ struct async_head *ah;
+
+ ah = at->ah;
+
+ spin_lock(&ah->lock);
+ list_del_init(&at->entry);
+ if (at->exit)
+ complete(&ah->exit_done);
+ t->at = NULL;
+ at->task = NULL;
+ WARN_ON(!ah->nr_threads);
+ ah->nr_threads--;
+ spin_unlock(&ah->lock);
+}
+
+static struct async_thread *
+pick_ready_cachemiss_thread(struct async_head *ah)
+{
+ struct list_head *head = &ah->ready_async_threads;
+ struct async_thread *at;
+
+ if (list_empty(head))
+ return NULL;
+
+ at = list_entry(head->next, struct async_thread, entry);
+
+ return at;
+}
+
+static void pick_new_async_head(struct async_head *ah,
+ struct task_struct *t, struct pt_regs *old_regs)
+{
+ struct async_thread *new_async_thread;
+ struct async_thread *async_ready;
+ struct task_struct *new_task;
+ struct pt_regs *new_regs;
+
+ spin_lock(&ah->lock);
+
+ new_async_thread = pick_ready_cachemiss_thread(ah);
+ if (!new_async_thread)
+ goto out_unlock;
+
+ async_ready = t->async_ready;
+ WARN_ON(!async_ready);
+ t->async_ready = NULL;
+
+ new_task = new_async_thread->task;
+ new_regs = task_pt_regs(new_task);
+ *new_regs = *old_regs;
+
+ new_task->at = NULL;
+ t->ah = NULL;
+ new_task->ah = ah;
+
+ wake_up_process(new_task);
+
+ __async_thread_init(t, async_ready, ah);
+ __mark_async_thread_busy(t->at, ah);
+
+ out_unlock:
+ spin_unlock(&ah->lock);
+}
+
+void __async_schedule(struct task_struct *t)
+{
+ struct async_head *ah = t->ah;
+ struct pt_regs *old_regs = task_pt_regs(t);
+
+ pick_new_async_head(ah, t, old_regs);
+}
+
+static void async_schedule(struct task_struct *t)
+{
+ if (t->async_ready)
+ __async_schedule(t);
+}
+
+static long __exec_atom(struct task_struct *t, struct syslet_atom *atom)
+{
+ struct async_thread *async_ready_save;
+ long ret;
+
+ /*
+ * If user-space expects the syscall to schedule then
+ * (try to) switch user-space to another thread straight
+ * away and execute the syscall asynchronously:
+ */
+ if (unlikely(atom->flags & SYSLET_ASYNC))
+ async_schedule(t);
+ /*
+ * Does user-space want synchronous execution for this atom?:
+ */
+ async_ready_save = t->async_ready;
+ if (unlikely(atom->flags & SYSLET_SYNC))
+ t->async_ready = NULL;
+
+ if (unlikely(atom->nr >= NR_syscalls))
+ return -ENOSYS;
+
+ ret = sys_call_table[atom->nr](atom->args[0], atom->args[1],
+ atom->args[2], atom->args[3],
+ atom->args[4], atom->args[5]);
+ if (atom->ret_ptr && put_user(ret, atom->ret_ptr))
+ return -EFAULT;
+
+ if (t->ah)
+ t->async_ready = async_ready_save;
+
+ return ret;
+}
+
+/*
+ * Arithmetics syscall, add a value to a user-space memory location.
+ *
+ * Generic C version - in case the architecture has not implemented it
+ * in assembly.
+ */
+asmlinkage __attribute__((weak)) long
+sys_umem_add(unsigned long __user *uptr, unsigned long inc)
+{
+ unsigned long val, new_val;
+
+ if (get_user(val, uptr))
+ return -EFAULT;
+ /*
+ * inc == 0 means 'read memory value':
+ */
+ if (!inc)
+ return val;
+
+ new_val = val + inc;
+ __put_user(new_val, uptr);
+
+ return new_val;
+}
+
+/*
+ * Open-coded because this is a very hot codepath during syslet
+ * execution and every cycle counts ...
+ *
+ * [. NOTE: it's an explicit fastcall because optimized assembly code
+ * might depend on this. There are some kernels that disable regparm,
+ * so lets not break those if possible. ]
+ */
+fastcall __attribute__((weak)) long
+copy_uatom(struct syslet_atom *atom, struct syslet_uatom __user *uatom)
+{
+ unsigned long __user *arg_ptr;
+ long ret = 0;
+
+ if (!access_ok(VERIFY_WRITE, uatom, sizeof(*uatom)))
+ return -EFAULT;
+
+ ret = __get_user(atom->nr, &uatom->nr);
+ ret |= __get_user(atom->ret_ptr, &uatom->ret_ptr);
+ ret |= __get_user(atom->flags, &uatom->flags);
+ ret |= __get_user(atom->next, &uatom->next);
+
+ memset(atom->args, 0, sizeof(atom->args));
+
+ ret |= __get_user(arg_ptr, &uatom->arg_ptr[0]);
+ if (!arg_ptr)
+ return ret;
+ if (!access_ok(VERIFY_WRITE, arg_ptr, sizeof(*arg_ptr)))
+ return -EFAULT;
+ ret |= __get_user(atom->args[0], arg_ptr);
+
+ ret |= __get_user(arg_ptr, &uatom->arg_ptr[1]);
+ if (!arg_ptr)
+ return ret;
+ if (!access_ok(VERIFY_WRITE, arg_ptr, sizeof(*arg_ptr)))
+ return -EFAULT;
+ ret |= __get_user(atom->args[1], arg_ptr);
+
+ ret |= __get_user(arg_ptr, &uatom->arg_ptr[2]);
+ if (!arg_ptr)
+ return ret;
+ if (!access_ok(VERIFY_WRITE, arg_ptr, sizeof(*arg_ptr)))
+ return -EFAULT;
+ ret |= __get_user(atom->args[2], arg_ptr);
+
+ ret |= __get_user(arg_ptr, &uatom->arg_ptr[3]);
+ if (!arg_ptr)
+ return ret;
+ if (!access_ok(VERIFY_WRITE, arg_ptr, sizeof(*arg_ptr)))
+ return -EFAULT;
+ ret |= __get_user(atom->args[3], arg_ptr);
+
+ ret |= __get_user(arg_ptr, &uatom->arg_ptr[4]);
+ if (!arg_ptr)
+ return ret;
+ if (!access_ok(VERIFY_WRITE, arg_ptr, sizeof(*arg_ptr)))
+ return -EFAULT;
+ ret |= __get_user(atom->args[4], arg_ptr);
+
+ ret |= __get_user(arg_ptr, &uatom->arg_ptr[5]);
+ if (!arg_ptr)
+ return ret;
+ if (!access_ok(VERIFY_WRITE, arg_ptr, sizeof(*arg_ptr)))
+ return -EFAULT;
+ ret |= __get_user(atom->args[5], arg_ptr);
+
+ return ret;
+}
+
+/*
+ * Should the next atom run, depending on the return value of
+ * the current atom - or should we stop execution?
+ */
+static int run_next_atom(struct syslet_atom *atom, long ret)
+{
+ switch (atom->flags & SYSLET_STOP_MASK) {
+ case SYSLET_STOP_ON_NONZERO:
+ if (!ret)
+ return 1;
+ return 0;
+ case SYSLET_STOP_ON_ZERO:
+ if (ret)
+ return 1;
+ return 0;
+ case SYSLET_STOP_ON_NEGATIVE:
+ if (ret >= 0)
+ return 1;
+ return 0;
+ case SYSLET_STOP_ON_NON_POSITIVE:
+ if (ret > 0)
+ return 1;
+ return 0;
+ }
+ return 1;
+}
+
+static struct syslet_uatom __user *
+next_uatom(struct syslet_atom *atom, struct syslet_uatom *uatom, long ret)
+{
+ /*
+ * If the stop condition is false then continue
+ * to atom->next:
+ */
+ if (run_next_atom(atom, ret))
+ return atom->next;
+ /*
+ * Special-case: if the stop condition is true and the atom
+ * has SKIP_TO_NEXT_ON_STOP set, then instead of
+ * stopping we skip to the atom directly after this atom
+ * (in linear address-space).
+ *
+ * This, combined with the atom->next pointer and the
+ * stop condition flags is what allows true branches and
+ * loops in syslets:
+ */
+ if (atom->flags & SYSLET_SKIP_TO_NEXT_ON_STOP)
+ return uatom + 1;
+
+ return NULL;
+}
+
+/*
+ * If user-space requested a completion event then put the last
+ * executed uatom into the completion ring:
+ */
+static long
+complete_uatom(struct async_head *ah, struct task_struct *t,
+ struct syslet_atom *atom, struct syslet_uatom __user *uatom)
+{
+ struct syslet_uatom __user **ring_slot, *slot_val = NULL;
+ long ret;
+
+ WARN_ON(!t->at);
+ WARN_ON(t->ah);
+
+ if (unlikely(atom->flags & SYSLET_NO_COMPLETE))
+ return 0;
+
+ /*
+ * Asynchron threads can complete in parallel so use the
+ * head-lock to serialize:
+ */
+ spin_lock(&ah->lock);
+ ring_slot = ah->completion_ring + ah->curr_ring_idx;
+ ret = __copy_from_user_inatomic(&slot_val, ring_slot, sizeof(slot_val));
+ /*
+ * User-space submitted more work than what fits into the
+ * completion ring - do not stomp over it silently and signal
+ * the error condition:
+ */
+ if (unlikely(slot_val)) {
+ spin_unlock(&ah->lock);
+ return -EFAULT;
+ }
+ slot_val = uatom;
+ ret |= __copy_to_user_inatomic(ring_slot, &slot_val, sizeof(slot_val));
+
+ ah->curr_ring_idx++;
+ if (unlikely(ah->curr_ring_idx == ah->max_ring_idx))
+ ah->curr_ring_idx = 0;
+
+ /*
+ * See whether the async-head is waiting and needs a wakeup:
+ */
+ if (ah->events_left) {
+ ah->events_left--;
+ if (!ah->events_left)
+ wake_up(&ah->wait);
+ }
+
+ spin_unlock(&ah->lock);
+
+ return ret;
+}
+
+/*
+ * This is the main syslet atom execution loop. This fetches atoms
+ * and executes them until it runs out of atoms or until the
+ * exit condition becomes false:
+ */
+static struct syslet_uatom __user *
+exec_atom(struct async_head *ah, struct task_struct *t,
+ struct syslet_uatom __user *uatom)
+{
+ struct syslet_uatom __user *last_uatom;
+ struct syslet_atom atom;
+ long ret;
+
+ run_next:
+ if (unlikely(copy_uatom(&atom, uatom)))
+ return ERR_PTR(-EFAULT);
+
+ last_uatom = uatom;
+ ret = __exec_atom(t, &atom);
+ if (unlikely(signal_pending(t) || need_resched()))
+ goto stop;
+
+ uatom = next_uatom(&atom, uatom, ret);
+ if (uatom)
+ goto run_next;
+ stop:
+ /*
+ * We do completion only in async context:
+ */
+ if (t->at && complete_uatom(ah, t, &atom, last_uatom))
+ return ERR_PTR(-EFAULT);
+
+ return last_uatom;
+}
+
+static void cachemiss_execute(struct async_thread *at, struct async_head *ah,
+ struct task_struct *t)
+{
+ struct syslet_uatom __user *uatom;
+
+ uatom = at->work;
+ WARN_ON(!uatom);
+ at->work = NULL;
+
+ exec_atom(ah, t, uatom);
+}
+
+static void
+cachemiss_loop(struct async_thread *at, struct async_head *ah,
+ struct task_struct *t)
+{
+ for (;;) {
+ schedule();
+ mark_async_thread_busy(at, ah);
+ set_task_state(t, TASK_INTERRUPTIBLE);
+ if (at->work)
+ cachemiss_execute(at, ah, t);
+ if (unlikely(t->ah || at->exit || signal_pending(t)))
+ break;
+ mark_async_thread_ready(at, ah);
+ }
+ t->state = TASK_RUNNING;
+
+ async_thread_exit(at, t);
+}
+
+static int cachemiss_thread(void *data)
+{
+ struct task_struct *t = current;
+ struct async_head *ah = data;
+ struct async_thread at;
+
+ async_thread_init(t, &at, ah);
+ complete(&ah->start_done);
+
+ cachemiss_loop(&at, ah, t);
+ if (at.exit)
+ do_exit(0);
+
+ if (!t->ah && signal_pending(t)) {
+ WARN_ON(1);
+ do_exit(0);
+ }
+
+ /*
+ * Return to user-space with NULL:
+ */
+ return 0;
+}
+
+static void __notify_async_thread_exit(struct async_thread *at,
+ struct async_head *ah)
+{
+ list_del_init(&at->entry);
+ at->exit = 1;
+ init_completion(&ah->exit_done);
+ wake_up_process(at->task);
+}
+
+static void stop_cachemiss_threads(struct async_head *ah)
+{
+ struct async_thread *at;
+
+repeat:
+ spin_lock(&ah->lock);
+ list_for_each_entry(at, &ah->ready_async_threads, entry) {
+
+ __notify_async_thread_exit(at, ah);
+ spin_unlock(&ah->lock);
+
+ wait_for_completion(&ah->exit_done);
+
+ goto repeat;
+ }
+
+ list_for_each_entry(at, &ah->busy_async_threads, entry) {
+
+ __notify_async_thread_exit(at, ah);
+ spin_unlock(&ah->lock);
+
+ wait_for_completion(&ah->exit_done);
+
+ goto repeat;
+ }
+ spin_unlock(&ah->lock);
+}
+
+static void async_head_exit(struct async_head *ah, struct task_struct *t)
+{
+ stop_cachemiss_threads(ah);
+ WARN_ON(!list_empty(&ah->ready_async_threads));
+ WARN_ON(!list_empty(&ah->busy_async_threads));
+ WARN_ON(ah->nr_threads);
+ WARN_ON(spin_is_locked(&ah->lock));
+ kfree(ah);
+ t->ah = NULL;
+}
+
+/*
+ * Pretty arbitrary for now. The kernel resource-controls the number
+ * of threads anyway.
+ */
+#define DEFAULT_THREAD_LIMIT 1024
+
+/*
+ * Initialize the in-kernel async head, based on the user-space async
+ * head:
+ */
+static long
+async_head_init(struct task_struct *t, struct async_head_user __user *uah)
+{
+ unsigned long max_nr_threads, ring_size_bytes, max_ring_idx;
+ struct syslet_uatom __user **completion_ring;
+ struct async_head *ah;
+ long ret;
+
+ if (get_user(max_nr_threads, &uah->max_nr_threads))
+ return -EFAULT;
+ if (get_user(completion_ring, &uah->completion_ring))
+ return -EFAULT;
+ if (get_user(ring_size_bytes, &uah->ring_size_bytes))
+ return -EFAULT;
+ if (!ring_size_bytes)
+ return -EINVAL;
+ /*
+ * We pre-check the ring pointer, so that in the fastpath
+ * we can use __put_user():
+ */
+ if (!access_ok(VERIFY_WRITE, completion_ring, ring_size_bytes))
+ return -EFAULT;
+
+ max_ring_idx = ring_size_bytes / sizeof(void *);
+ if (ring_size_bytes != max_ring_idx * sizeof(void *))
+ return -EINVAL;
+
+ /*
+ * Lock down the ring. Note: user-space should not munlock() this,
+ * because if the ring pages get swapped out then the async
+ * completion code might return a -EFAULT instead of the expected
+ * completion. (the kernel safely handles that case too, so this
+ * isnt a security problem.)
+ *
+ * mlock() is better here because it gets resource-accounted
+ * properly, and even unprivileged userspace has a few pages
+ * of mlock-able memory available. (which is more than enough
+ * for the completion-pointers ringbuffer)
+ */
+ ret = sys_mlock((unsigned long)completion_ring, ring_size_bytes);
+ if (ret)
+ return ret;
+
+ /*
+ * -1 means: the kernel manages the optimal size of the async pool.
+ * Simple static limit for now.
+ */
+ if (max_nr_threads == -1UL)
+ max_nr_threads = DEFAULT_THREAD_LIMIT;
+ /*
+ * If the ring is smaller than the number of threads requested
+ * then lower the thread count - otherwise we might lose
+ * syslet completion events:
+ */
+ max_nr_threads = min(max_ring_idx, max_nr_threads);
+
+ ah = kmalloc(sizeof(*ah), GFP_KERNEL);
+ if (!ah)
+ return -ENOMEM;
+
+ spin_lock_init(&ah->lock);
+ ah->nr_threads = 0;
+ ah->max_nr_threads = max_nr_threads;
+ INIT_LIST_HEAD(&ah->ready_async_threads);
+ INIT_LIST_HEAD(&ah->busy_async_threads);
+ init_waitqueue_head(&ah->wait);
+ ah->events_left = 0;
+ ah->uah = uah;
+ ah->curr_ring_idx = 0;
+ ah->max_ring_idx = max_ring_idx;
+ ah->completion_ring = completion_ring;
+ ah->ring_size_bytes = ring_size_bytes;
+
+ ah->user_task = t;
+ t->ah = ah;
+
+ return 0;
+}
+
+/**
+ * sys_async_register - enable async syscall support
+ */
+asmlinkage long
+sys_async_register(struct async_head_user __user *uah, unsigned int len)
+{
+ struct task_struct *t = current;
+
+ /*
+ * This 'len' check enables future extension of
+ * the async_head ABI:
+ */
+ if (len != sizeof(struct async_head_user))
+ return -EINVAL;
+ /*
+ * Already registered?
+ */
+ if (t->ah)
+ return -EEXIST;
+
+ return async_head_init(t, uah);
+}
+
+/**
+ * sys_async_unregister - disable async syscall support
+ */
+asmlinkage long
+sys_async_unregister(struct async_head_user __user *uah, unsigned int len)
+{
+ struct syslet_uatom __user **completion_ring;
+ struct task_struct *t = current;
+ struct async_head *ah = t->ah;
+ unsigned long ring_size_bytes;
+
+ if (len != sizeof(struct async_head_user))
+ return -EINVAL;
+ /*
+ * Already unregistered?
+ */
+ if (!ah)
+ return -EINVAL;
+
+ completion_ring = ah->completion_ring;
+ ring_size_bytes = ah->ring_size_bytes;
+
+ async_head_exit(ah, t);
+
+ /*
+ * Unpin the ring:
+ */
+ return sys_munlock((unsigned long)completion_ring, ring_size_bytes);
+}
+
+/*
+ * Simple limit and pool management mechanism for now:
+ */
+static void refill_cachemiss_pool(struct async_head *ah)
+{
+ int pid;
+
+ if (ah->nr_threads >= ah->max_nr_threads)
+ return;
+
+ init_completion(&ah->start_done);
+
+ pid = create_async_thread(cachemiss_thread, (void *)ah,
+ CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
+ CLONE_PTRACE | CLONE_THREAD | CLONE_SYSVSEM);
+ if (pid < 0)
+ return;
+
+ wait_for_completion(&ah->start_done);
+}
+
+/**
+ * sys_async_wait - wait for async completion events
+ *
+ * This syscall waits for @min_wait_events syslet completion events
+ * to finish or for all async processing to finish (whichever
+ * comes first).
+ */
+asmlinkage long sys_async_wait(unsigned long min_wait_events)
+{
+ struct async_head *ah = current->ah;
+
+ if (!ah)
+ return -EINVAL;
+
+ if (min_wait_events) {
+ spin_lock(&ah->lock);
+ ah->events_left = min_wait_events;
+ spin_unlock(&ah->lock);
+ }
+
+ return wait_event_interruptible(ah->wait,
+ list_empty(&ah->busy_async_threads) || !ah->events_left);
+}
+
+/**
+ * sys_async_exec - execute a syslet.
+ *
+ * returns the uatom that was last executed, if the kernel was able to
+ * execute the syslet synchronously, or NULL if the syslet became
+ * asynchronous. (in the latter case syslet completion will be notified
+ * via the completion ring)
+ *
+ * (Various errors might also be returned via the usual negative numbers.)
+ */
+asmlinkage struct syslet_uatom __user *
+sys_async_exec(struct syslet_uatom __user *uatom)
+{
+ struct syslet_uatom __user *ret;
+ struct task_struct *t = current;
+ struct async_head *ah = t->ah;
+ struct async_thread at;
+
+ if (unlikely(!ah))
+ return ERR_PTR(-EINVAL);
+
+ if (list_empty(&ah->ready_async_threads))
+ refill_cachemiss_pool(ah);
+
+ t->async_ready = &at;
+ ret = exec_atom(ah, t, uatom);
+
+ if (t->ah) {
+ WARN_ON(!t->async_ready);
+ t->async_ready = NULL;
+ return ret;
+ }
+ ret = ERR_PTR(-EINTR);
+ if (!at.exit && !signal_pending(t)) {
+ set_task_state(t, TASK_INTERRUPTIBLE);
+ mark_async_thread_ready(&at, ah);
+ cachemiss_loop(&at, ah, t);
+ }
+ if (t->ah)
+ return NULL;
+ else
+ do_exit(0);
+}
+
+/*
+ * fork()-time initialization:
+ */
+void async_init(struct task_struct *t)
+{
+ t->at = NULL;
+ t->async_ready = NULL;
+ t->ah = NULL;
+}
+
+/*
+ * do_exit()-time cleanup:
+ */
+void async_exit(struct task_struct *t)
+{
+ struct async_thread *at = t->at;
+ struct async_head *ah = t->ah;
+
+ WARN_ON(at && ah);
+ WARN_ON(t->async_ready);
+
+ if (unlikely(at))
+ async_thread_exit(at, t);
+
+ if (unlikely(ah))
+ async_head_exit(ah, t);
+}
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
Ingo Molnar <mingo@elte.hu> writes:

> +
> +static struct async_thread *
> +pick_ready_cachemiss_thread(struct async_head *ah)

The cachemiss names are confusing. I assume that's just a leftover
from Tux?
> +
> + memset(atom->args, 0, sizeof(atom->args));
> +
> + ret |= __get_user(arg_ptr, &uatom->arg_ptr[0]);
> + if (!arg_ptr)
> + return ret;
> + if (!access_ok(VERIFY_WRITE, arg_ptr, sizeof(*arg_ptr)))
> + return -EFAULT;

It's a little unclear why you do that many individual access_ok()s.
And why is the target constant-sized anyways?


+ /*
+ * Lock down the ring. Note: user-space should not munlock() this,
+ * because if the ring pages get swapped out then the async
+ * completion code might return a -EFAULT instead of the expected
+ * completion. (the kernel safely handles that case too, so this
+ * isnt a security problem.)
+ *
+ * mlock() is better here because it gets resource-accounted
+ * properly, and even unprivileged userspace has a few pages
+ * of mlock-able memory available. (which is more than enough
+ * for the completion-pointers ringbuffer)
+ */

If it's only a few pages you don't need any resource accounting.
If it's more, then it's nasty to steal the user's quota.
I think plain gup() would be better.


-Andi
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
* Andi Kleen <andi@firstfloor.org> wrote:

> Ingo Molnar <mingo@elte.hu> writes:
>
> > +
> > +static struct async_thread *
> > +pick_ready_cachemiss_thread(struct async_head *ah)
>
> The cachemiss names are confusing. I assume that's just a left over
> from Tux?

yeah. Although 'stuff goes async' is quite similar to a cachemiss: we
didn't have some resource available right now, so the syscall has to block
— i.e. some cache was not available.

> > +
> > + memset(atom->args, 0, sizeof(atom->args));
> > +
> > + ret |= __get_user(arg_ptr, &uatom->arg_ptr[0]);
> > + if (!arg_ptr)
> > + return ret;
> > + if (!access_ok(VERIFY_WRITE, arg_ptr, sizeof(*arg_ptr)))
> > + return -EFAULT;
>
> It's a little unclear why you do that many individual access_ok()s.
> And why is the target constant sized anyways?

Each indirect pointer has to be checked separately, before dereferencing
it. (Andrew pointed out that they should be VERIFY_READ; I fixed that in
my tree.)

it looks a bit scary in C but the assembly code is very fast and quite
straightforward.

> + /*
> + * Lock down the ring. Note: user-space should not munlock() this,
> + * because if the ring pages get swapped out then the async
> + * completion code might return a -EFAULT instead of the expected
> + * completion. (the kernel safely handles that case too, so this
> + * isnt a security problem.)
> + *
> + * mlock() is better here because it gets resource-accounted
> + * properly, and even unprivileged userspace has a few pages
> + * of mlock-able memory available. (which is more than enough
> + * for the completion-pointers ringbuffer)
> + */
>
> If it's only a few pages you don't need any resource accounting. If
> it's more then it's nasty to steal the users quota. I think plain
> gup() would be better.

get_user_pages() would have to be limited in some way - and I didn't want
to add yet another wacky limit thing - so I just used the already
existing mlock() infrastructure for this. If Oracle wants to set up a 10
MB ringbuffer, they can set the PAM resource limits to 11 MB and still
have enough stuff left. And I don't really expect GPG to start using
syslets - just yet ;-)

a single page is enough for 1024 completion pointers - that's more than
enough for most purposes - and the default mlock limit is 40K.

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
On Tue, Feb 13, 2007 at 11:24:43PM +0100, Ingo Molnar wrote:
> > > + memset(atom->args, 0, sizeof(atom->args));
> > > +
> > > + ret |= __get_user(arg_ptr, &uatom->arg_ptr[0]);
> > > + if (!arg_ptr)
> > > + return ret;
> > > + if (!access_ok(VERIFY_WRITE, arg_ptr, sizeof(*arg_ptr)))
> > > + return -EFAULT;
> >
> > It's a little unclear why you do that many individual access_ok()s.
> > And why is the target constant sized anyways?
>
> each indirect pointer has to be checked separately, before dereferencing
> it. (Andrew pointed out that they should be VERIFY_READ, i fixed that in
> my tree)

But why only constant-sized? It could be a variable-length object, couldn't it?

If it's an array it could be all checked together

(i must be missing something here)

> > If it's only a few pages you don't need any resource accounting. If
> > it's more then it's nasty to steal the users quota. I think plain
> > gup() would be better.
>
> get_user_pages() would have to be limited in some way - and i didnt want

If you only use it for a small ring buffer it is naturally limited.

Also beancounter will fix that eventually.

> a single page is enough for 1024 completion pointers - that's more than
> enough for most purposes - and the default mlock limit is 40K.

Then limit it to a single page and use gup

-Andi
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
* Andi Kleen <andi@firstfloor.org> wrote:

> > > > + if (!access_ok(VERIFY_WRITE, arg_ptr, sizeof(*arg_ptr)))
> > > > + return -EFAULT;
> > >
> > > It's a little unclear why you do that many individual access_ok()s.
> > > And why is the target constant sized anyways?
> >
> > each indirect pointer has to be checked separately, before dereferencing
> > it. (Andrew pointed out that they should be VERIFY_READ, i fixed that in
> > my tree)
>
> But why only constant sized? It could be a variable length object,
> couldn't it?

I think what you might be missing is that it's only the 6 syscall
arguments that are fetched via indirect pointers - security checks are
then done by the system calls themselves. It's a bit awkward to think
about, but it is surprisingly clean in the assembly, and it simplified
syslet programming too.

> > get_user_pages() would have to be limited in some way - and i didnt
> > want
>
> If you only use it for a small ring buffer it is naturally limited.

yeah, but 'small' is a dangerous word when it comes to adding IO
interfaces ;-)

> > a single page is enough for 1024 completion pointers - that's more
> > than enough for most purposes - and the default mlock limit is 40K.
>
> Then limit it to a single page and use gup

1024 (512 on 64-bit) is alot but not ALOT. It is also certainly not
ALOOOOT :-) Really, people will want to have more than 512
disks/spindles in the same box. I have used such a beast myself. For Tux
workloads and benchmarks we had parallelism levels of millions of
pending requests (!) on a single system - networking, socket limits,
disk IO combined with thousands of clients do create such scenarios. I
really think that such 'pinned pages' are a pretty natural fit for
sys_mlock() and RLIMIT_MEMLOCK, and since the kernel side is careful to
use the _inatomic() uaccess methods, it's safe (and fast) as well.

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
> On Tue, 13 Feb 2007 23:24:43 +0100 Ingo Molnar <mingo@elte.hu> wrote:
> > If it's only a few pages you don't need any resource accounting. If
> > it's more then it's nasty to steal the users quota. I think plain
> > gup() would be better.
>
> get_user_pages() would have to be limited in some way - and i didnt want
> to add yet another wacky limit thing - so i just used the already
> existing mlock() infrastructure for this. If Oracle wants to set up a 10
> MB ringbuffer, they can set the PAM resource limits to 11 MB and still
> have enough stuff left. And i dont really expect GPG to start using
> syslets - just yet ;-)
>
> a single page is enough for 1024 completion pointers - that's more than
> enough for most purposes - and the default mlock limit is 40K.

So if I have an application which instantiates a single mlocked page
for this purpose, I can only run ten of them at once, and any other
mlock-using process which I'm using starts to mysteriously fail.

It seems like a problem to me..
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
On Tue, Feb 13, 2007 at 11:41:31PM +0100, Ingo Molnar (mingo@elte.hu) wrote:
> > Then limit it to a single page and use gup
>
> 1024 (512 on 64-bit) is alot but not ALOT. It is also certainly not
> ALOOOOT :-) Really, people will want to have more than 512
> disks/spindles in the same box. I have used such a beast myself. For Tux
> workloads and benchmarks we had parallelism levels of millions of
> pending requests (!) on a single system - networking, socket limits,
> disk IO combined with thousands of clients do create such scenarios. I
> really think that such 'pinned pages' are a pretty natural fit for
> sys_mlock() and RLIMIT_MEMLOCK, and since the kernel side is careful to
> use the _inatomic() uaccess methods, it's safe (and fast) as well.

This will end up badly - I used the same approach in the early kevent
days and was proven to have swapable memory for the ring. I think it
would be much better to have userspace allocated ring and use
copy_to_user() there.

Btw, as a bit of advertisement, the whole completion part can be done
through kevent which already has ring buffer, queue operations and
non-racy updates... :)

> Ingo

--
Evgeniy Polyakov
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
* Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:

> This will end up badly - I used the same approach in the early kevent
> days and was proven to have swapable memory for the ring. I think it
> would be much better to have userspace allocated ring and use
> copy_to_user() there.

it is a userspace allocated ring - but pinned down by the kernel.

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
On Wed, Feb 14, 2007 at 10:46:29AM +0100, Ingo Molnar (mingo@elte.hu) wrote:
>
> * Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:
>
> > This will end up badly - I used the same approach in the early kevent
> > days and was proven to have swapable memory for the ring. I think it
> > would be much better to have userspace allocated ring and use
> > copy_to_user() there.
>
> it is a userspace allocated ring - but pinned down by the kernel.

That's a problem - 1000/512 pages per 'usual' thread ends up with the
whole memory locked by malicious/stupid application (at least on Debian
and Mandrake there is no locked memory limit by default). And if such
a limit exists, this will hurt big-iron applications, which want to use
high-order rings legitimately.

> Ingo

--
Evgeniy Polyakov
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
> (at least on Debian
> and Mandrake there is no locked memory limit by default).

that sounds like 2 very large bugtraq-worthy bugs in these distros.. so
bad a bug that I almost find it hard to believe...

--
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
On Wed, Feb 14, 2007 at 11:30:55AM +0100, Arjan van de Ven (arjan@infradead.org) wrote:
> > (at least on Debian
> > and Mandrake there is no locked memory limit by default).
>
> that sounds like 2 very large bugtraq-worthy bugs in these distros.. so
> bad a bug that I almost find it hard to believe...

Well:

$ ulimit -a
core file size (blocks, -c) 0
data seg size (kbytes, -d) unlimited
max nice (-e) 0
file size (blocks, -f) unlimited
pending signals (-i) unlimited
max locked memory (kbytes, -l) unlimited
max memory size (kbytes, -m) unlimited
open files (-n) 1024
pipe size (512 bytes, -p) 8
POSIX message queues (bytes, -q) unlimited
max rt priority (-r) 0
stack size (kbytes, -s) 8192
cpu time (seconds, -t) unlimited
max user processes (-u) unlimited
virtual memory (kbytes, -v) unlimited
file locks (-x) unlimited
$ cat /etc/debian_version
4.0

$ ulimit -a
core file size (blocks, -c) 0
data seg size (kbytes, -d) unlimited
file size (blocks, -f) unlimited
max locked memory (kbytes, -l) unlimited
max memory size (kbytes, -m) unlimited
open files (-n) 1024
pipe size (512 bytes, -p) 8
stack size (kbytes, -s) 8192
cpu time (seconds, -t) unlimited
max user processes (-u) 7168
virtual memory (kbytes, -v) unlimited
$ cat /etc/mandrake-release
Mandrake Linux release 10.0 (Community) for i586

Anyway, even if there is a limit like in fc5 - 32kb,
so I doubt any unprivileged userspace application
will ever run there.

--
Evgeniy Polyakov
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
Ingo Molnar a écrit :
> + if (unlikely(signal_pending(t) || need_resched()))
> + goto stop;
>

So, this is how you'll prevent me from running an infinite loop ;-)
The attached patch adds a cond_resched() instead, to allow infinite
loops without DoS. I dropped the unlikely() as it's already in the
definition of signal_pending().

> +asmlinkage long sys_async_wait(unsigned long min_wait_events)
>

Here I would expect:

sys_async_wait_for_all(struct syslet_atom *atoms, long nr_atoms)

and

sys_async_wait_for_any(struct syslet_atom *atoms, long nr_atoms).

This way syslets can be used by different parts of a program without
having them waiting for each other.

Thanks.

--
Guillaume
Re: [patch 05/11] syslets: core code [ In reply to ]
Hi Ingo,

On Tue, 13 Feb 2007 15:20:35 +0100 Ingo Molnar <mingo@elte.hu> wrote:
>
> From: Ingo Molnar <mingo@elte.hu>
>
> the core syslet / async system calls infrastructure code.

It occurred to me that the 32 compat code for 64 bit architectures for
all this could be very hairy ...

--
Cheers,
Stephen Rothwell sfr@canb.auug.org.au
http://www.canb.auug.org.au/~sfr/
Re: [patch 05/11] syslets: core code [ In reply to ]
On Tue, 13 Feb 2007, Ingo Molnar wrote:
>
> the core syslet / async system calls infrastructure code.

Ok, having now looked at it more, I can say:

- I hate it.

I dislike it intensely, because it's so _close_ to being usable. But the
programming interface looks absolutely horrid for any "casual" use, and
while the loops etc look like fun, I think they are likely to be less than
useful in practice. Yeah, you can do the "setup and teardown" just once,
but it ends up being "once per user", and it ends up being a lot of stuff
to do for somebody who wants to just do some simple async stuff.

And the whole "lock things down in memory" approach is bad. It's doing
expensive things like mlock(), making the overhead for _single_ system
calls much more expensive. Since I don't actually believe that the
non-single case is even all that interesting, I really don't like it.

I think it's clever and potentially useful to allow user mode to see the
data structures (and even allow user mode to *modify* them) while the
async thing is running, but it really seems to be a case of excessive
cleverness.

For example, how would you use this to emulate the *current* aio_read()
etc interfaces that don't have any user-level component except for the
actual call? And if you can't do that, the whole exercise is pointless.

Or how would you do the trivial example loop that I explained was a good
idea:

struct one_entry *prev = NULL;
struct dirent *de;

while ((de = readdir(dir)) != NULL) {
struct one_entry *entry = malloc(..);

/* Add it to the list, fill in the name */
entry->next = prev;
prev = entry;
strcpy(entry->name, de->d_name);

/* Do the stat lookup async */
async_stat(de->d_name, &entry->stat_buf);
}
wait_for_async();
.. Ta-daa! All done ..


Notice? This also "chains system calls together", but it does it using a
*much* more powerful entity called "user space". That's what user space
is. And yeah, it's a pretty complex sequencer, but happily we have
hardware support for accelerating it to the point that the kernel never
even needs to care.

The above is a *realistic* scenario, where you actually have things like
memory allocation etc going on. In contrast, just chaining system calls
together isn't a realistic scenario at all.

So I think we have one _known_ usage scenario:

- replacing the _existing_ aio_read() etc system calls (with not just
existing semantics, but actually binary-compatible)

- simple code use where people are willing to perhaps do something
Linux-specific, but because it's so _simple_, they'll do it.

In neither case does the "chaining atoms together" seem to really solve
the problem. It's clever, but it's not what people would actually do.

And yes, you can hide things like that behind an abstraction library, but
once you start doing that, I've got three questions for you:

- what's the point?
- we're adding overhead, so how are we getting it back
- how do we handle independent libraries each doing their own thing and
version skew between them?

In other words, the "let user space sort out the complexity" is not a good
answer. It just means that the interface is badly designed.

Linus
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> And the whole "lock things down in memory" approach is bad. It's doing
> expensive things like mlock(), making the overhead for _single_ system
> calls much more expensive. [...]

hm, there must be some misunderstanding here. That mlock is /only/ once
per the lifetime of the whole 'head' - i.e. per sys_async_register().
(And you can even forget i ever did it - it's 5 lines of code to turn
the completion ring into a swappable entity.)

never does any MMU trick ever enter the picture during the whole
operation of this thing, and that's very much intentional.

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
On Wed, 14 Feb 2007, Linus Torvalds wrote:

>
>
> On Tue, 13 Feb 2007, Ingo Molnar wrote:
> >
> > the core syslet / async system calls infrastructure code.
>
> Ok, having now looked at it more, I can say:
>
> - I hate it.
>
> I dislike it intensely, because it's so _close_ to being usable. But the
> programming interface looks absolutely horrid for any "casual" use, and
> while the loops etc look like fun, I think they are likely to be less than
> useful in practice. Yeah, you can do the "setup and teardown" just once,
> but it ends up being "once per user", and it ends up being a lot of stuff
> to do for somebody who wants to just do some simple async stuff.
>
> And the whole "lock things down in memory" approach is bad. It's doing
> expensive things like mlock(), making the overhead for _single_ system
> calls much more expensive. Since I don't actually believe that the
> non-single case is even all that interesting, I really don't like it.
>
> I think it's clever and potentially useful to allow user mode to see the
> data structures (and even allow user mode to *modify* them) while the
> async thing is running, but it really seems to be a case of excessive
> cleverness.

Ok, that makes the weirdo-count up to two :) I agree with you that the
chained API can be improved at least.



- Davide


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
* Ingo Molnar <mingo@elte.hu> wrote:

> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> > And the whole "lock things down in memory" approach is bad. It's
> > doing expensive things like mlock(), making the overhead for
> > _single_ system calls much more expensive. [...]
>
> hm, there must be some misunderstanding here. That mlock is /only/
> once per the lifetime of the whole 'head' - i.e. per
> sys_async_register(). (And you can even forget i ever did it - it's 5
> lines of code to turn the completion ring into a swappable entity.)
>
> never does any MMU trick ever enter the picture during the whole
> operation of this thing, and that's very much intentional.

to stress it: never does any mlocking or other lockdown happen of any
syslet atom - it is /only/ the completion ring of syslet pointers that i
made mlocked - but even that can be made generic memory no problem.

It's all about asynchronous system calls, and if you want you can have a
terabyte of syslets in user memory, half of it swapped out. They have
absolutely zero kernel context attached to them in the 'cached case' (be
that locked memory or some other kernel resource).

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
On Wed, 14 Feb 2007, Ingo Molnar wrote:
>
> hm, there must be some misunderstanding here. That mlock is /only/ once
> per the lifetime of the whole 'head' - i.e. per sys_async_register().
> (And you can even forget i ever did it - it's 5 lines of code to turn
> the completion ring into a swappable entity.)

But the whole point is that the notion of a "register" is wrong in the
first place. It's wrong because:

- it assumes we are going to make these complex state machines (which I
don't believe for a second that a real program will do)

- it assumes that we're going to make many async system calls that go
together (which breaks the whole notion of having different libraries
using this for their own internal reasons - they may not even *know*
about other libraries that _also_ do async IO for *their* reasons)

- it fundamentally is based on a broken notion that everything would use
this "AIO atom" in the first place, WHICH WE KNOW IS INCORRECT, since
current users use "aio_read()" that simply doesn't have that and
doesn't build up any such data structures.

So please answer my questions. The problem wasn't the mlock(), even though
that was just STUPID. The problem was much deeper. This is not a "prepare
to do a lot of very boutique linked list operations" problem. This is a
"people already use 'aio_read()' and want to extend on it" problem.

You didn't at all react to that fundamental issue: you have an overly
complex and clever thing that doesn't actually *match* what people do.

Linus
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> But the whole point is that the notion of a "register" is wrong in the
> first place. [...]

forget about it then. The thing we "register" is dead-simple:

struct async_head_user {
struct syslet_uatom __user **completion_ring;
unsigned long ring_size_bytes;
unsigned long max_nr_threads;
};

this can be passed in to sys_async_exec() as a second pointer, and the
kernel can put the expected-completion pointer (and the user ring idx
pointer) into its struct atom. It's just a few instructions, and only in
the cachemiss case.

that would make completions arbitrarily split-up-able. No registration
whatsoever. A waiter could specify which ring's events it is interested
in. A 'ring' could be a single-entry thing as well, for a single
instance of pending IO.

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> - it fundamentally is based on a broken notion that everything would
> use this "AIO atom" in the first place, WHICH WE KNOW IS INCORRECT,
> since current users use "aio_read()" that simply doesn't have that
> and doesn't build up any such data structures.

i'm not sure what you mean here either - aio_read()/write()/etc. could
very much be implemented using syslets - and in fact one goal of syslets
is to enable such use. struct aiocb is mostly shaped by glibc internals,
and it currently has 32 bytes of free space. Enough to put a single atom
there. (or a pointer to an atom)

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
> - it assumes we are going to make these complex state machines (which I
> don't believe for a second that a real program will do)

They've not had the chance before and there are certain chains of them
which make huge amounts of sense because you don't want to keep taking
completion hits. Not so much looping ones but stuff like

cork write sendfile uncork close

are very natural sequences.

There seem to be a lot of typical sequences it doesn't represent however
(consider the trivial copy case where you use the result one syscall into
the next)

> - it assumes that we're going to make many async system calls that go
> together (which breaks the whole notion of having different libraries
> using this for their own internal reasons - they may not even *know*
> about other libraries that _also_ do async IO for *their* reasons)

They can each register their own async objects. They need to do this
anyway so that the libraries can use asynchronous I/O and hide it from
applications.

> this "AIO atom" in the first place, WHICH WE KNOW IS INCORRECT, since
> current users use "aio_read()" that simply doesn't have that and
> doesn't build up any such data structures.

Do current users do this because that is all they have, because it is
hard, or because the current option is all that makes sense ?

The ability to avoid asynchronous completion waits and
complete/wake/despatch cycles is a good thing of itself. I don't know if
it justifies the rest but it has potential for excellent performance.

Alan
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> Or how would you do the trivial example loop that I explained was a
> good idea:
>
> struct one_entry *prev = NULL;
> struct dirent *de;
>
> while ((de = readdir(dir)) != NULL) {
> struct one_entry *entry = malloc(..);
>
> /* Add it to the list, fill in the name */
> entry->next = prev;
> prev = entry;
> strcpy(entry->name, de->d_name);
>
> /* Do the stat lookup async */
> async_stat(de->d_name, &entry->stat_buf);
> }
> wait_for_async();
> .. Ta-daa! All done ..

i think you are banging on open doors. That async_stat() call is very
much what i'd like to see glibc to provide, not really the raw syslet
interface. Nor do i want to see raw syscalls exposed to applications.
Plus the single-atom thing is what i think will be used mostly
initially, so all my optimizations went into that case.

while i agree with you that state machines are hard, it's all a function
of where the concentration of processing is. If most of the application
complexity happens in user-space, then the logic should live there. But
for infrastructure things (like the async_stat() calls, or aio_read(),
or other, future interfaces) i wouldnt mind at all if they were
implemented using syslets. Likewise, if someone wants to implement the
hottest accept loop in Apache or Samba via syslets, keeping them from
wasting time on writing in-kernel webservers (oops, did i really say
that?), it can be done. If a JVM wants to use syslets, sure - it's an
abstraction machine anyway so application programmers are not exposed to
it.

syslets are just a realization that /if/ the thing we want to do is
mostly on the kernel side, then we might as well put the logic to the
kernel side. It's more of a 'compound interface builder' than the place
for real program logic. It makes our interfaces usable more flexibly,
and it allows the kernel to provide 'atomic' APIs, instead of having to
provide the most common compounded uses as well.

and note that if you actually try to do an async_stat() sanely, you do
get quite close to the point of having syslets. You get basically up to
a one-shot atom concept and 90% of what i have in kernel/async.c. The
remaining 10% of further execution control is easy and still it opens up
these new things that were not possible before: compounding, vectoring,
simple program logic, etc.

The 'cost' of syslets is mostly the atom->next pointer in essence. The
whole async infrastructure only takes up 20 nsecs more in the cached
case. (but with some crazier hacks i got the one-shot atom overhead
[compared to a simple synchronous null syscall] to below 10 nsecs, so
there's room in there for further optimizations. Our current null
syscall latency is around ~150 nsecs.)

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
* Alan <alan@lxorguk.ukuu.org.uk> wrote:

> > this "AIO atom" in the first place, WHICH WE KNOW IS INCORRECT,
> > since current users use "aio_read()" that simply doesn't have
> > that and doesn't build up any such data structures.
>
> Do current users do this because that is all they have, because it is
> hard, or because the current option is all that makes sense ?
>
> The ability to avoid asynchronous completion waits and
> complete/wake/despatch cycles is a good thing of itself. [...]

yeah, that's another key thing. I do plan to provide a sys_upcall()
syscall as well which calls a 5-parameter user-space function with a
special stack. (it's like a lightweight signal/event handler, without
any of the signal handler legacies and overhead - it's like a reverse
system call - a "user call". Obviously pure userspace would never use
sys_upcall(), unless as an act of sheer masochism.)

[. that way say a full HTTP request could be done by an asynchronous
context, because the HTTP parser could be done as a sys_upcall(). ]

so if it's simpler/easier for a syslet to do a step in user-space - as
long as it's an 'atom' of processing - it can be done.

or if processing is so heavily in user-space that most of the logic
lives there then just use plain pthreads. There's just no point in
moving complex user-space code to the syslet side if it's easier/faster
to do it in user-space. Syslets are there for asynchronous /kernel/
execution, and is centered around how the kernel does stuff: system
calls.

besides sys_upcall() i also plan two other extensions:

- a CLONE_ASYNC_WORKER for user-space to be able use its pthread as an
optional worker thread in the async engine. A thread executing
user-space code qualifies as a 'busy' thread - it has to call into
sys_async_cachemiss_thread() to 'offer' itself as a ready thread that
the 'head' could switch to anytime.

- support for multiple heads sharing the async context pool. All the
locking logic is there already (because cachemiss threads can already
access the queue), it only needs a refcount in struct async_head
(only accessed during fork/exit), and an update to the teardown logic
(that too is a slowpath).

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
On Wed, 14 Feb 2007, Ingo Molnar wrote:
>
> i think you are banging on open doors. That async_stat() call is very
> much what i'd like to see glibc to provide, not really the raw syslet
> interface.

Right. Which is why I wrote (and you removed) the rest of my email.

If the "raw" interfaces aren't actually what you use, and you just expect
glibc to translate things into them, WHY DO WE HAVE THEM AT ALL?

> The 'cost' of syslets is mostly the atom->next pointer in essence.

No. The cost is:

- indirect interfaces are harder to follow and debug. It's a LOT easier
to debug things that go wrong when it just does what you ask it for,
instead of writing to memory and doing something totally obscure.

I don't know about you, but I use "strace" a lot. That's the kind of
cost we have.

- the cost is the extra and totally unnecessary setup for the
indirection, that nobody really is likely to use.

> The whole async infrastructure only takes up 20 nsecs more in the cached
> case. (but with some crazier hacks i got the one-shot atom overhead
> [compared to a simple synchronous null syscall] to below 10 nsecs, so
> there's room in there for further optimizations. Our current null
> syscall latency is around ~150 nsecs.)

You are not counting the whole setup cost there, then, because your setup
cost is going to be at a minimum more expensive than the null system call.

And yes, for benchmarks, it's going to be done just once, and then the
benchmark will loop a million times. But for other things like libraries,
that don't know whether they get called once, or a million times, this is
a big deal.

This is why I'd like a "async_stat()" to basically be the *same* cost as a
"stat()". To within nanoseconds. WITH ALL THE SETUP! Because otherwise, a
library may not be able to use it without thinking about it a lot, because
it simply doesn't know whether the caller is going to call it once or many
times.

THIS was why I wanted the "synchronous mode". Exactly because it removes
all the questions about "is it worth it". If the cost overhead is
basically zero, you know it's always worth it.

Now, if you make the "async_submit()" _include_ the setup itself (as you
alluded to in one of your emails), and the cost of that is basically
negligible, and it still allows people to do things simply and just submit
a single system call without any real overhead, then hey, it's may be a
complex interface, but at least you can _use_ it as a simple one.

At that point most of my arguments against it go away. It might still be
over-engineered, but if the costs aren't visible, and it's obvious enough
that the over-engineering doesn't result in subtle bugs, THEN (and only
then) is a more complex and generic interface worth it even if nobody
actually ends up using it.

Linus
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 05/11] syslets: core code [ In reply to ]
* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> > case. (but with some crazier hacks i got the one-shot atom overhead
> > [compared to a simple synchronous null syscall] to below 10 nsecs,
> > so there's room in there for further optimizations. Our current null
> > syscall latency is around ~150 nsecs.)
>
> You are not counting the whole setup cost there, then, because your
> setup cost is going to be at a minimum more expensive than the null
> system call.

hm, this one-time cost was never on my radar. [. It's really dwarfed by
other startup costs (a single fork() takes 100 usecs, an exec() takes
800 usecs.) ]

In any case, we can delay this cost into the first cachemiss, or can
eliminate it by making it a globally pooled thing. It does not seem like
a big issue.

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

1 2 3  View All