Mailing List Archive

[PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag
Add a flag to force processing a syscall as a compat syscall. This is
required so that in_compat_syscall() works for I/O submitted by io_uring
helper threads on behalf of compat syscalls.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
arch/sparc/include/asm/compat.h | 3 ++-
arch/x86/include/asm/compat.h | 2 +-
fs/io_uring.c | 9 +++++++++
include/linux/compat.h | 5 ++++-
include/linux/sched.h | 1 +
5 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h
index 40a267b3bd5208..fee6c51d36e869 100644
--- a/arch/sparc/include/asm/compat.h
+++ b/arch/sparc/include/asm/compat.h
@@ -211,7 +211,8 @@ static inline int is_compat_task(void)
static inline bool in_compat_syscall(void)
{
/* Vector 0x110 is LINUX_32BIT_SYSCALL_TRAP */
- return pt_regs_trap_type(current_pt_regs()) == 0x110;
+ return pt_regs_trap_type(current_pt_regs()) == 0x110 ||
+ (current->flags & PF_FORCE_COMPAT);
}
#define in_compat_syscall in_compat_syscall
#endif
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index d4edf281fff49d..fbab072d4e5b31 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -208,7 +208,7 @@ static inline bool in_32bit_syscall(void)
#ifdef CONFIG_COMPAT
static inline bool in_compat_syscall(void)
{
- return in_32bit_syscall();
+ return in_32bit_syscall() || (current->flags & PF_FORCE_COMPAT);
}
#define in_compat_syscall in_compat_syscall /* override the generic impl */
#endif
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3790c7fe9fee22..5755d557c3f7bc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5449,6 +5449,9 @@ static int io_req_defer_prep(struct io_kiocb *req,
if (unlikely(ret))
return ret;

+ if (req->ctx->compat)
+ current->flags |= PF_FORCE_COMPAT;
+
switch (req->opcode) {
case IORING_OP_NOP:
break;
@@ -5546,6 +5549,7 @@ static int io_req_defer_prep(struct io_kiocb *req,
break;
}

+ current->flags &= ~PF_FORCE_COMPAT;
return ret;
}

@@ -5669,6 +5673,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_ring_ctx *ctx = req->ctx;
int ret;

+ if (ctx->compat)
+ current->flags |= PF_FORCE_COMPAT;
+
switch (req->opcode) {
case IORING_OP_NOP:
ret = io_nop(req, cs);
@@ -5898,6 +5905,8 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break;
}

+ current->flags &= ~PF_FORCE_COMPAT;
+
if (ret)
return ret;

diff --git a/include/linux/compat.h b/include/linux/compat.h
index b354ce58966e2d..685066f7ad325f 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -891,7 +891,10 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args);
*/

#ifndef in_compat_syscall
-static inline bool in_compat_syscall(void) { return is_compat_task(); }
+static inline bool in_compat_syscall(void)
+{
+ return is_compat_task() || (current->flags & PF_FORCE_COMPAT);
+}
#endif

/**
diff --git a/include/linux/sched.h b/include/linux/sched.h
index afe01e232935fa..c8b183b5655a1e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1491,6 +1491,7 @@ extern struct pid *cad_pid;
*/
#define PF_IDLE 0x00000002 /* I am an IDLE thread */
#define PF_EXITING 0x00000004 /* Getting shut down */
+#define PF_FORCE_COMPAT 0x00000008 /* acting as compat task */
#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
#define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */
--
2.28.0
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Fri, Sep 18, 2020 at 02:45:25PM +0200, Christoph Hellwig wrote:
> Add a flag to force processing a syscall as a compat syscall. This is
> required so that in_compat_syscall() works for I/O submitted by io_uring
> helper threads on behalf of compat syscalls.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> arch/sparc/include/asm/compat.h | 3 ++-
> arch/x86/include/asm/compat.h | 2 +-
> fs/io_uring.c | 9 +++++++++
> include/linux/compat.h | 5 ++++-
> include/linux/sched.h | 1 +
> 5 files changed, 17 insertions(+), 3 deletions(-)
>
> diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h
> index 40a267b3bd5208..fee6c51d36e869 100644
> --- a/arch/sparc/include/asm/compat.h
> +++ b/arch/sparc/include/asm/compat.h
> @@ -211,7 +211,8 @@ static inline int is_compat_task(void)
> static inline bool in_compat_syscall(void)
> {
> /* Vector 0x110 is LINUX_32BIT_SYSCALL_TRAP */
> - return pt_regs_trap_type(current_pt_regs()) == 0x110;
> + return pt_regs_trap_type(current_pt_regs()) == 0x110 ||
> + (current->flags & PF_FORCE_COMPAT);

Can't say I like that approach ;-/ Reasoning about the behaviour is much
harder when it's controlled like that - witness set_fs() shite...
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Fri, Sep 18, 2020 at 02:40:12PM +0100, Al Viro wrote:
> > /* Vector 0x110 is LINUX_32BIT_SYSCALL_TRAP */
> > - return pt_regs_trap_type(current_pt_regs()) == 0x110;
> > + return pt_regs_trap_type(current_pt_regs()) == 0x110 ||
> > + (current->flags & PF_FORCE_COMPAT);
>
> Can't say I like that approach ;-/ Reasoning about the behaviour is much
> harder when it's controlled like that - witness set_fs() shite...

I don't particularly like it either. But do you have a better idea
how to deal with io_uring vs compat tasks?
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Fri, Sep 18, 2020 at 03:44:06PM +0200, Christoph Hellwig wrote:
> On Fri, Sep 18, 2020 at 02:40:12PM +0100, Al Viro wrote:
> > > /* Vector 0x110 is LINUX_32BIT_SYSCALL_TRAP */
> > > - return pt_regs_trap_type(current_pt_regs()) == 0x110;
> > > + return pt_regs_trap_type(current_pt_regs()) == 0x110 ||
> > > + (current->flags & PF_FORCE_COMPAT);
> >
> > Can't say I like that approach ;-/ Reasoning about the behaviour is much
> > harder when it's controlled like that - witness set_fs() shite...
>
> I don't particularly like it either. But do you have a better idea
> how to deal with io_uring vs compat tasks?

<wry> git rm fs/io_uring.c would make a good starting point </wry>
Yes, I know it's not going to happen, but one can dream...

Said that, why not provide a variant that would take an explicit
"is it compat" argument and use it there? And have the normal
one pass in_compat_syscall() to that...
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Fri, Sep 18, 2020 at 3:44 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Fri, Sep 18, 2020 at 02:40:12PM +0100, Al Viro wrote:
> > > /* Vector 0x110 is LINUX_32BIT_SYSCALL_TRAP */
> > > - return pt_regs_trap_type(current_pt_regs()) == 0x110;
> > > + return pt_regs_trap_type(current_pt_regs()) == 0x110 ||
> > > + (current->flags & PF_FORCE_COMPAT);
> >
> > Can't say I like that approach ;-/ Reasoning about the behaviour is much
> > harder when it's controlled like that - witness set_fs() shite...
>
> I don't particularly like it either. But do you have a better idea
> how to deal with io_uring vs compat tasks?

Do we need to worry about something other than the compat_iovec
struct for now? Regarding the code in io_import_iovec(), it would
seem that can easily be handled by exposing an internal helper.
Instead of

#ifdef CONFIG_COMPAT
if (req->ctx->compat)
return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
iovec, iter);
#endif
return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);

This could do

__import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec,
iter, req->ctx->compat);

With the normal import_iovec() becoming a trivial wrapper around
the same thing:

ssize_t import_iovec(int type, const struct iovec __user * uvector,
unsigned nr_segs, unsigned fast_segs,
struct iovec **iov, struct iov_iter *i)
{
return __import_iovec(type, uvector, nr_segs, fast_segs, iov,
i, in_compat_syscall());
}


Arnd
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Fri, Sep 18, 2020 at 02:58:22PM +0100, Al Viro wrote:
> Said that, why not provide a variant that would take an explicit
> "is it compat" argument and use it there? And have the normal
> one pass in_compat_syscall() to that...

That would help to not introduce a regression with this series yes.
But it wouldn't fix existing bugs when io_uring is used to access
read or write methods that use in_compat_syscall(). One example that
I recently ran into is drivers/scsi/sg.c.
RE: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
From: Al Viro
> Sent: 18 September 2020 14:58
>
> On Fri, Sep 18, 2020 at 03:44:06PM +0200, Christoph Hellwig wrote:
> > On Fri, Sep 18, 2020 at 02:40:12PM +0100, Al Viro wrote:
> > > > /* Vector 0x110 is LINUX_32BIT_SYSCALL_TRAP */
> > > > - return pt_regs_trap_type(current_pt_regs()) == 0x110;
> > > > + return pt_regs_trap_type(current_pt_regs()) == 0x110 ||
> > > > + (current->flags & PF_FORCE_COMPAT);
> > >
> > > Can't say I like that approach ;-/ Reasoning about the behaviour is much
> > > harder when it's controlled like that - witness set_fs() shite...
> >
> > I don't particularly like it either. But do you have a better idea
> > how to deal with io_uring vs compat tasks?
>
> <wry> git rm fs/io_uring.c would make a good starting point </wry>
> Yes, I know it's not going to happen, but one can dream...

Maybe the io_uring code needs some changes to make it vaguely safe.
- No support for 32-bit compat mixed working (or at all?).
Plausibly a special worker could do 32bit work.
- ring structure (I'm assuming mapped by mmap()) never mapped
in more than one process (not cloned by fork()).
- No implicit handover of files to another process.
Would need an munmap, handover, mmap sequence.

In any case the io_ring rather abuses the import_iovec() interface.

The canonical sequence is (types from memory):
struct iovec cache[8], *iov = cache;
struct iter iter;
...
rval = import_iovec(..., &iov, 8, &iter);
// Do read/write user using 'iter'
free(iov);

I don't think there is any strict requirement that iter.iov
is set to either 'cache' or 'iov' (it probably must point
into one of them.)
But the io_uring code will make that assumption because the
actual copies can be done much later and it doesn't save 'iter'.
It gets itself in a right mess because it doesn't separate
the 'address I need to free' from 'the iov[] for any transfers'.

io_uring is also the only code that relies on import_iovec()
returning the iter.count on success.
It would be much better to have:
iov = import_iovec(..., &cache, ...);
free(iov);
and use ERR_PTR() et al for error detectoion.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Fri, Sep 18, 2020 at 8:16 AM Christoph Hellwig <hch@lst.de> wrote:
>
> On Fri, Sep 18, 2020 at 02:58:22PM +0100, Al Viro wrote:
> > Said that, why not provide a variant that would take an explicit
> > "is it compat" argument and use it there? And have the normal
> > one pass in_compat_syscall() to that...
>
> That would help to not introduce a regression with this series yes.
> But it wouldn't fix existing bugs when io_uring is used to access
> read or write methods that use in_compat_syscall(). One example that
> I recently ran into is drivers/scsi/sg.c.

Aside from the potentially nasty use of per-task variables, one thing
I don't like about PF_FORCE_COMPAT is that it's one-way. If we're
going to have a generic mechanism for this, shouldn't we allow a full
override of the syscall arch instead of just allowing forcing compat
so that a compat syscall can do a non-compat operation?
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Sat, Sep 19, 2020 at 6:21 PM Andy Lutomirski <luto@kernel.org> wrote:
> On Fri, Sep 18, 2020 at 8:16 AM Christoph Hellwig <hch@lst.de> wrote:
> > On Fri, Sep 18, 2020 at 02:58:22PM +0100, Al Viro wrote:
> > > Said that, why not provide a variant that would take an explicit
> > > "is it compat" argument and use it there? And have the normal
> > > one pass in_compat_syscall() to that...
> >
> > That would help to not introduce a regression with this series yes.
> > But it wouldn't fix existing bugs when io_uring is used to access
> > read or write methods that use in_compat_syscall(). One example that
> > I recently ran into is drivers/scsi/sg.c.

Ah, so reading /dev/input/event* would suffer from the same issue,
and that one would in fact be broken by your patch in the hypothetical
case that someone tried to use io_uring to read /dev/input/event on x32...

For reference, I checked the socket timestamp handling that has a
number of corner cases with time32/time64 formats in compat mode,
but none of those appear to be affected by the problem.

> Aside from the potentially nasty use of per-task variables, one thing
> I don't like about PF_FORCE_COMPAT is that it's one-way. If we're
> going to have a generic mechanism for this, shouldn't we allow a full
> override of the syscall arch instead of just allowing forcing compat
> so that a compat syscall can do a non-compat operation?

The only reason it's needed here is that the caller is in a kernel
thread rather than a system call. Are there any possible scenarios
where one would actually need the opposite?

Arnd
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Sat, 19 Sep 2020, Arnd Bergmann wrote:

> On Sat, Sep 19, 2020 at 6:21 PM Andy Lutomirski <luto@kernel.org> wrote:
> > On Fri, Sep 18, 2020 at 8:16 AM Christoph Hellwig <hch@lst.de> wrote:
> > > On Fri, Sep 18, 2020 at 02:58:22PM +0100, Al Viro wrote:
> > > > Said that, why not provide a variant that would take an explicit
> > > > "is it compat" argument and use it there? And have the normal one
> > > > pass in_compat_syscall() to that...
> > >
> > > That would help to not introduce a regression with this series yes.
> > > But it wouldn't fix existing bugs when io_uring is used to access
> > > read or write methods that use in_compat_syscall(). One example
> > > that I recently ran into is drivers/scsi/sg.c.
>
> Ah, so reading /dev/input/event* would suffer from the same issue, and
> that one would in fact be broken by your patch in the hypothetical case
> that someone tried to use io_uring to read /dev/input/event on x32...
>
> For reference, I checked the socket timestamp handling that has a number
> of corner cases with time32/time64 formats in compat mode, but none of
> those appear to be affected by the problem.
>
> > Aside from the potentially nasty use of per-task variables, one thing
> > I don't like about PF_FORCE_COMPAT is that it's one-way. If we're
> > going to have a generic mechanism for this, shouldn't we allow a full
> > override of the syscall arch instead of just allowing forcing compat
> > so that a compat syscall can do a non-compat operation?
>
> The only reason it's needed here is that the caller is in a kernel
> thread rather than a system call. Are there any possible scenarios where
> one would actually need the opposite?
>

Quite possibly. The ext4 vs. compat getdents bug is still unresolved.
Please see,
https://lore.kernel.org/lkml/CAFEAcA9W+JK7_TrtTnL1P2ES1knNPJX9wcUvhfLwxLq9augq1w@mail.gmail.com/

> Arnd
>
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Fri, Sep 18, 2020 at 05:16:15PM +0200, Christoph Hellwig wrote:
> On Fri, Sep 18, 2020 at 02:58:22PM +0100, Al Viro wrote:
> > Said that, why not provide a variant that would take an explicit
> > "is it compat" argument and use it there? And have the normal
> > one pass in_compat_syscall() to that...
>
> That would help to not introduce a regression with this series yes.
> But it wouldn't fix existing bugs when io_uring is used to access
> read or write methods that use in_compat_syscall(). One example that
> I recently ran into is drivers/scsi/sg.c.

So screw such read/write methods - don't use them with io_uring.
That, BTW, is one of the reasons I'm sceptical about burying the
decisions deep into the callchain - we don't _want_ different
data layouts on read/write depending upon the 32bit vs. 64bit
caller, let alone the pointer-chasing garbage that is /dev/sg.
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
> On Sep 19, 2020, at 2:16 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>
> ?On Sat, Sep 19, 2020 at 6:21 PM Andy Lutomirski <luto@kernel.org> wrote:
>>> On Fri, Sep 18, 2020 at 8:16 AM Christoph Hellwig <hch@lst.de> wrote:
>>> On Fri, Sep 18, 2020 at 02:58:22PM +0100, Al Viro wrote:
>>>> Said that, why not provide a variant that would take an explicit
>>>> "is it compat" argument and use it there? And have the normal
>>>> one pass in_compat_syscall() to that...
>>>
>>> That would help to not introduce a regression with this series yes.
>>> But it wouldn't fix existing bugs when io_uring is used to access
>>> read or write methods that use in_compat_syscall(). One example that
>>> I recently ran into is drivers/scsi/sg.c.
>
> Ah, so reading /dev/input/event* would suffer from the same issue,
> and that one would in fact be broken by your patch in the hypothetical
> case that someone tried to use io_uring to read /dev/input/event on x32...
>
> For reference, I checked the socket timestamp handling that has a
> number of corner cases with time32/time64 formats in compat mode,
> but none of those appear to be affected by the problem.
>
>> Aside from the potentially nasty use of per-task variables, one thing
>> I don't like about PF_FORCE_COMPAT is that it's one-way. If we're
>> going to have a generic mechanism for this, shouldn't we allow a full
>> override of the syscall arch instead of just allowing forcing compat
>> so that a compat syscall can do a non-compat operation?
>
> The only reason it's needed here is that the caller is in a kernel
> thread rather than a system call. Are there any possible scenarios
> where one would actually need the opposite?
>

I can certainly imagine needing to force x32 mode from a kernel thread.

As for the other direction: what exactly are the desired bitness/arch semantics of io_uring? Is the operation bitness chosen by the io_uring creation or by the io_uring_enter() bitness?
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
> On Sep 19, 2020, at 3:09 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> ?On Fri, Sep 18, 2020 at 05:16:15PM +0200, Christoph Hellwig wrote:
>>> On Fri, Sep 18, 2020 at 02:58:22PM +0100, Al Viro wrote:
>>> Said that, why not provide a variant that would take an explicit
>>> "is it compat" argument and use it there? And have the normal
>>> one pass in_compat_syscall() to that...
>>
>> That would help to not introduce a regression with this series yes.
>> But it wouldn't fix existing bugs when io_uring is used to access
>> read or write methods that use in_compat_syscall(). One example that
>> I recently ran into is drivers/scsi/sg.c.
>
> So screw such read/write methods - don't use them with io_uring.
> That, BTW, is one of the reasons I'm sceptical about burying the
> decisions deep into the callchain - we don't _want_ different
> data layouts on read/write depending upon the 32bit vs. 64bit
> caller, let alone the pointer-chasing garbage that is /dev/sg.

Well, we could remove in_compat_syscall(), etc and instead have an implicit parameter in DEFINE_SYSCALL. Then everything would have to be explicit. This would probably be a win, although it could be quite a bit of work.
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Sat, Sep 19, 2020 at 03:23:54PM -0700, Andy Lutomirski wrote:
>
> > On Sep 19, 2020, at 3:09 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > ?On Fri, Sep 18, 2020 at 05:16:15PM +0200, Christoph Hellwig wrote:
> >>> On Fri, Sep 18, 2020 at 02:58:22PM +0100, Al Viro wrote:
> >>> Said that, why not provide a variant that would take an explicit
> >>> "is it compat" argument and use it there? And have the normal
> >>> one pass in_compat_syscall() to that...
> >>
> >> That would help to not introduce a regression with this series yes.
> >> But it wouldn't fix existing bugs when io_uring is used to access
> >> read or write methods that use in_compat_syscall(). One example that
> >> I recently ran into is drivers/scsi/sg.c.
> >
> > So screw such read/write methods - don't use them with io_uring.
> > That, BTW, is one of the reasons I'm sceptical about burying the
> > decisions deep into the callchain - we don't _want_ different
> > data layouts on read/write depending upon the 32bit vs. 64bit
> > caller, let alone the pointer-chasing garbage that is /dev/sg.
>
> Well, we could remove in_compat_syscall(), etc and instead have an implicit parameter in DEFINE_SYSCALL. Then everything would have to be explicit. This would probably be a win, although it could be quite a bit of work.

It would not be a win - most of the syscalls don't give a damn
about 32bit vs. 64bit...
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
> On Sep 19, 2020, at 3:41 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> ?On Sat, Sep 19, 2020 at 03:23:54PM -0700, Andy Lutomirski wrote:
>>
>>>> On Sep 19, 2020, at 3:09 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>>>
>>> ?On Fri, Sep 18, 2020 at 05:16:15PM +0200, Christoph Hellwig wrote:
>>>>> On Fri, Sep 18, 2020 at 02:58:22PM +0100, Al Viro wrote:
>>>>> Said that, why not provide a variant that would take an explicit
>>>>> "is it compat" argument and use it there? And have the normal
>>>>> one pass in_compat_syscall() to that...
>>>>
>>>> That would help to not introduce a regression with this series yes.
>>>> But it wouldn't fix existing bugs when io_uring is used to access
>>>> read or write methods that use in_compat_syscall(). One example that
>>>> I recently ran into is drivers/scsi/sg.c.
>>>
>>> So screw such read/write methods - don't use them with io_uring.
>>> That, BTW, is one of the reasons I'm sceptical about burying the
>>> decisions deep into the callchain - we don't _want_ different
>>> data layouts on read/write depending upon the 32bit vs. 64bit
>>> caller, let alone the pointer-chasing garbage that is /dev/sg.
>>
>> Well, we could remove in_compat_syscall(), etc and instead have an implicit parameter in DEFINE_SYSCALL. Then everything would have to be explicit. This would probably be a win, although it could be quite a bit of work.
>
> It would not be a win - most of the syscalls don't give a damn
> about 32bit vs. 64bit...

Any reasonable implementation would optimize it out for syscalls that don’t care. Or it could be explicit:

DEFINE_MULTIARCH_SYSCALL(...)
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Sat, Sep 19, 2020 at 03:53:40PM -0700, Andy Lutomirski wrote:

> > It would not be a win - most of the syscalls don't give a damn
> > about 32bit vs. 64bit...
>
> Any reasonable implementation would optimize it out for syscalls that don’t care. Or it could be explicit:
>
> DEFINE_MULTIARCH_SYSCALL(...)

1) what would that look like?
2) have you counted the syscalls that do and do not need that?
3) how many of those realistically *can* be unified with their
compat counterparts? [hint: ioctl(2) cannot]
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Sat, Sep 19, 2020 at 4:24 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On Sat, Sep 19, 2020 at 03:53:40PM -0700, Andy Lutomirski wrote:
>
> > > It would not be a win - most of the syscalls don't give a damn
> > > about 32bit vs. 64bit...
> >
> > Any reasonable implementation would optimize it out for syscalls that don’t care. Or it could be explicit:
> >
> > DEFINE_MULTIARCH_SYSCALL(...)
>
> 1) what would that look like?

In effect, it would work like this:

/* Arch-specific, but there's a generic case for sane architectures. */
enum syscall_arch {
SYSCALL_NATIVE,
SYSCALL_COMPAT,
SYSCALL_X32,
};

DEFINE_MULTIARCH_SYSCALLn(args, arch)
{
args are the args here, and arch is the arch.
}

> 2) have you counted the syscalls that do and do not need that?

No.

> 3) how many of those realistically *can* be unified with their
> compat counterparts? [hint: ioctl(2) cannot]

There would be no requirement to unify anything. The idea is that
we'd get rid of all the global state flags.

For ioctl, we'd have a new file_operation:

long ioctl(struct file *, unsigned int, unsigned long, enum syscall_arch);

I'm not saying this is easy, but I think it's possible and the result
would be more obviously correct than what we have now.
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Sat, Sep 19, 2020 at 05:14:41PM -0700, Andy Lutomirski wrote:

> > 2) have you counted the syscalls that do and do not need that?
>
> No.

Might be illuminating...

> > 3) how many of those realistically *can* be unified with their
> > compat counterparts? [hint: ioctl(2) cannot]
>
> There would be no requirement to unify anything. The idea is that
> we'd get rid of all the global state flags.

_What_ global state flags? When you have separate SYSCALL_DEFINE3(ioctl...)
and COMPAT_SYSCALL_DEFINE3(ioctl...), there's no flags at all, global or
local. They only come into the play when you try to share the same function
for both, right on the top level.

> For ioctl, we'd have a new file_operation:
>
> long ioctl(struct file *, unsigned int, unsigned long, enum syscall_arch);
>
> I'm not saying this is easy, but I think it's possible and the result
> would be more obviously correct than what we have now.

No, it would not. Seriously, from time to time a bit of RTFS before grand
proposals turns out to be useful.
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Sun, Sep 20, 2020 at 12:09 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Fri, Sep 18, 2020 at 05:16:15PM +0200, Christoph Hellwig wrote:
> > On Fri, Sep 18, 2020 at 02:58:22PM +0100, Al Viro wrote:
> > > Said that, why not provide a variant that would take an explicit
> > > "is it compat" argument and use it there? And have the normal
> > > one pass in_compat_syscall() to that...
> >
> > That would help to not introduce a regression with this series yes.
> > But it wouldn't fix existing bugs when io_uring is used to access
> > read or write methods that use in_compat_syscall(). One example that
> > I recently ran into is drivers/scsi/sg.c.
>
> So screw such read/write methods - don't use them with io_uring.
> That, BTW, is one of the reasons I'm sceptical about burying the
> decisions deep into the callchain - we don't _want_ different
> data layouts on read/write depending upon the 32bit vs. 64bit
> caller, let alone the pointer-chasing garbage that is /dev/sg.

Would it be too late to limit what kind of file descriptors we allow
io_uring to read/write on?

If io_uring can get changed to return -EINVAL on trying to
read/write something other than S_IFREG file descriptors,
that particular problem space gets a lot simpler, but this
is of course only possible if nobody actually relies on it yet.

Arnd
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Sun, Sep 20, 2020 at 03:55:47PM +0200, Arnd Bergmann wrote:
> On Sun, Sep 20, 2020 at 12:09 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
> > On Fri, Sep 18, 2020 at 05:16:15PM +0200, Christoph Hellwig wrote:
> > > On Fri, Sep 18, 2020 at 02:58:22PM +0100, Al Viro wrote:
> > > > Said that, why not provide a variant that would take an explicit
> > > > "is it compat" argument and use it there? And have the normal
> > > > one pass in_compat_syscall() to that...
> > >
> > > That would help to not introduce a regression with this series yes.
> > > But it wouldn't fix existing bugs when io_uring is used to access
> > > read or write methods that use in_compat_syscall(). One example that
> > > I recently ran into is drivers/scsi/sg.c.
> >
> > So screw such read/write methods - don't use them with io_uring.
> > That, BTW, is one of the reasons I'm sceptical about burying the
> > decisions deep into the callchain - we don't _want_ different
> > data layouts on read/write depending upon the 32bit vs. 64bit
> > caller, let alone the pointer-chasing garbage that is /dev/sg.
>
> Would it be too late to limit what kind of file descriptors we allow
> io_uring to read/write on?
>
> If io_uring can get changed to return -EINVAL on trying to
> read/write something other than S_IFREG file descriptors,
> that particular problem space gets a lot simpler, but this
> is of course only possible if nobody actually relies on it yet.

S_IFREG is almost certainly too heavy as a restriction. Looking through
the stuff sensitive to 32bit/64bit, we seem to have
* /dev/sg - pointer-chasing horror
* sysfs files for efivar - different layouts for compat and native,
shitty userland ABI design (
struct efi_variable {
efi_char16_t VariableName[EFI_VAR_NAME_LEN/sizeof(efi_char16_t)];
efi_guid_t VendorGuid;
unsigned long DataSize;
__u8 Data[1024];
efi_status_t Status;
__u32 Attributes;
} __attribute__((packed));
) is the piece of crap in question; 'DataSize' is where the headache comes
from. Regular files, BTW...
* uhid - character device, milder pointer-chasing horror. Trouble
comes from this:
/* Obsolete! Use UHID_CREATE2. */
struct uhid_create_req {
__u8 name[128];
__u8 phys[64];
__u8 uniq[64];
__u8 __user *rd_data;
__u16 rd_size;

__u16 bus;
__u32 vendor;
__u32 product;
__u32 version;
__u32 country;
} __attribute__((__packed__));
and suggested replacement doesn't do any pointer-chasing (rd_data is an
embedded array in the end of struct uhid_create2_req).
* evdev, uinput - bitness-sensitive layout, due to timestamps
* /proc/bus/input/devices - weird crap with printing bitmap, different
_text_ layouts seen by 32bit and 64bit readers. Binary structures are PITA,
but with sufficient effort you can screw the text just as hard... Oh, and it's
a regular file.
* similar in sysfs analogue

And AFAICS, that's it for read/write-related method instances.
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Fri, Sep 18, 2020 at 02:45:25PM +0200, Christoph Hellwig wrote:
> Add a flag to force processing a syscall as a compat syscall. This is
> required so that in_compat_syscall() works for I/O submitted by io_uring
> helper threads on behalf of compat syscalls.

Al doesn't like this much, but my suggestion is to introduce two new
opcodes -- IORING_OP_READV32 and IORING_OP_WRITEV32. The compat code
can translate IORING_OP_READV to IORING_OP_READV32 and then the core
code can know what that user pointer is pointing to.
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
I really like that as it’s self-documenting and anyone debugging it can see what is actually being used at a glance.

> On Sep 20, 2020, at 09:15, Matthew Wilcox <willy@infradead.org> wrote:
>
> ?On Fri, Sep 18, 2020 at 02:45:25PM +0200, Christoph Hellwig wrote:
>> Add a flag to force processing a syscall as a compat syscall. This is
>> required so that in_compat_syscall() works for I/O submitted by io_uring
>> helper threads on behalf of compat syscalls.
>
> Al doesn't like this much, but my suggestion is to introduce two new
> opcodes -- IORING_OP_READV32 and IORING_OP_WRITEV32. The compat code
> can translate IORING_OP_READV to IORING_OP_READV32 and then the core
> code can know what that user pointer is pointing to.
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Sun, Sep 20, 2020 at 5:15 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Fri, Sep 18, 2020 at 02:45:25PM +0200, Christoph Hellwig wrote:
> > Add a flag to force processing a syscall as a compat syscall. This is
> > required so that in_compat_syscall() works for I/O submitted by io_uring
> > helper threads on behalf of compat syscalls.
>
> Al doesn't like this much, but my suggestion is to introduce two new
> opcodes -- IORING_OP_READV32 and IORING_OP_WRITEV32. The compat code
> can translate IORING_OP_READV to IORING_OP_READV32 and then the core
> code can know what that user pointer is pointing to.

How is that different from the current approach of storing the ABI as
a flag in ctx->compat?

Arnd
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Sat, Sep 19, 2020 at 7:57 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On Sat, Sep 19, 2020 at 05:14:41PM -0700, Andy Lutomirski wrote:
>
> > > 2) have you counted the syscalls that do and do not need that?
> >
> > No.
>
> Might be illuminating...
>
> > > 3) how many of those realistically *can* be unified with their
> > > compat counterparts? [hint: ioctl(2) cannot]
> >
> > There would be no requirement to unify anything. The idea is that
> > we'd get rid of all the global state flags.
>
> _What_ global state flags? When you have separate SYSCALL_DEFINE3(ioctl...)
> and COMPAT_SYSCALL_DEFINE3(ioctl...), there's no flags at all, global or
> local. They only come into the play when you try to share the same function
> for both, right on the top level.

...

>
> > For ioctl, we'd have a new file_operation:
> >
> > long ioctl(struct file *, unsigned int, unsigned long, enum syscall_arch);
> >
> > I'm not saying this is easy, but I think it's possible and the result
> > would be more obviously correct than what we have now.
>
> No, it would not. Seriously, from time to time a bit of RTFS before grand
> proposals turns out to be useful.

As one example, look at __sys_setsockopt(). It's called for the
native and compat versions, and it contains an in_compat_syscall()
check. (This particularly check looks dubious to me, but that's
another story.) If this were to be done with equivalent semantics
without a separate COMPAT_DEFINE_SYSCALL and without
in_compat_syscall(), there would need to be some indication as to
whether this is compat or native setsockopt. There are other
setsockopt implementations in the net stack with more
legitimate-seeming uses of in_compat_syscall() that would need some
other mechanism if in_compat_syscall() were to go away.

setsockopt is (I hope!) out of scope for io_uring, but the situation
isn't fundamentally different from read and write.
Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag [ In reply to ]
On Sun, Sep 20, 2020 at 04:15:10PM +0100, Matthew Wilcox wrote:
> On Fri, Sep 18, 2020 at 02:45:25PM +0200, Christoph Hellwig wrote:
> > Add a flag to force processing a syscall as a compat syscall. This is
> > required so that in_compat_syscall() works for I/O submitted by io_uring
> > helper threads on behalf of compat syscalls.
>
> Al doesn't like this much, but my suggestion is to introduce two new
> opcodes -- IORING_OP_READV32 and IORING_OP_WRITEV32. The compat code
> can translate IORING_OP_READV to IORING_OP_READV32 and then the core
> code can know what that user pointer is pointing to.

Let's separate two issues:
1) compat syscalls want 32bit iovecs. Nothing to do with the
drivers, dealt with just fine.
2) a few drivers are really fucked in head. They use different
*DATA* layouts for reads/writes, depending upon the calling process.
IOW, if you fork/exec a 32bit binary and your stdin is one of those,
reads from stdin in parent and child will yield different data layouts.
On the same struct file.
That's what Christoph worries about (/dev/sg he'd mentioned is
one of those).

IMO we should simply have that dozen or so of pathological files
marked with FMODE_SHITTY_ABI; it's not about how they'd been opened -
it describes the userland ABI provided by those. And it's cast in stone.

Any in_compat_syscall() in ->read()/->write() instances is an ABI
bug, plain and simple. Some are unfixable for compatibility reasons, but
any new caller like that should be a big red flag.

How we import iovec array is none of the drivers' concern; we do
not need to mess with in_compat_syscall() reporting the matching value,
etc. for that. It's about the instances that want in_compat_syscall() to
decide between the 32bit and 64bit data layouts. And I believe that
we should simply have them marked as such and rejected by io_uring. With
any new occurences getting slapped down hard.

Current list of those turds:
/dev/sg (pointer-chasing, generally insane)
/sys/firmware/efi/vars/*/raw_var (fucked binary structure)
/sys/firmware/efi/vars/new_var (fucked binary structure)
/sys/firmware/efi/vars/del_var (fucked binary structure)
/dev/uhid (pointer-chasing for one obsolete command)
/dev/input/event* (timestamps)
/dev/uinput (timestamps)
/proc/bus/input/devices (fucked bitmap-to-text representation)
/sys/class/input/*/capabilities/* (fucked bitmap-to-text representation)

1 2 3  View All