diff mbox series

[v5,08/14] iommufd/viommu: Add iommufd_viommu_report_event helper

Message ID b9b4f3df990934cb80b2f5e1e49e555e50c1c857.1736237481.git.nicolinc@nvidia.com
State New
Headers show
Series iommufd: Add vIOMMU infrastructure (Part-3: vEVENTQ) | expand

Commit Message

Nicolin Chen Jan. 7, 2025, 5:10 p.m. UTC
Similar to iommu_report_device_fault, this allows IOMMU drivers to report
vIOMMU events from threaded IRQ handlers to user space hypervisors.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 include/linux/iommufd.h        | 11 +++++++++
 drivers/iommu/iommufd/driver.c | 43 ++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)
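
For context, a minimal, untested sketch of how a driver's threaded IRQ handler might call the new helper. The IOMMU_VEVENTQ_TYPE_ARM_SMMUV3 constant, the exact layout of struct iommu_vevent_arm_smmuv3, and how the driver obtains its viommu are assumptions here, not quotes from this series:

/*
 * Sketch only: the type constant, the vevent struct layout, and the viommu
 * lookup are assumed, not taken verbatim from the posted patches.
 */
static void arm_smmu_report_vevent_sketch(struct iommufd_viommu *viommu,
					  u64 *evt)
{
	struct iommu_vevent_arm_smmuv3 vevt;

	/* Assumes the raw event record fits the uAPI struct exactly */
	memcpy(&vevt, evt, sizeof(vevt));

	/* Failures (-EOPNOTSUPP, -ENOMEM, ...) are only worth a log here */
	if (iommufd_viommu_report_event(viommu, IOMMU_VEVENTQ_TYPE_ARM_SMMUV3,
					&vevt, sizeof(vevt)))
		pr_debug("vEVENT dropped\n");
}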

Comments

Tian, Kevin Jan. 10, 2025, 7:12 a.m. UTC | #1
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Wednesday, January 8, 2025 1:10 AM
> 
> +/*
> + * Typically called in driver's threaded IRQ handler.
> + * The @type and @event_data must be defined in include/uapi/linux/iommufd.h
> + */
> +int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
> +				enum iommu_veventq_type type, void *event_data,
> +				size_t data_len)
> +{
> +	struct iommufd_veventq *veventq;
> +	struct iommufd_vevent *vevent;
> +	int rc = 0;
> +
> +	if (!viommu)
> +		return -ENODEV;
> +	if (WARN_ON_ONCE(!viommu->ops || !viommu->ops->supports_veventq ||
> +			 !viommu->ops->supports_veventq(type)))
> +		return -EOPNOTSUPP;

Hmm the driver knows which type is supported by itself before
calling this helper. Why bother having the helper calling into
the driver again to verify?
Jason Gunthorpe Jan. 10, 2025, 2:51 p.m. UTC | #2
On Fri, Jan 10, 2025 at 07:12:46AM +0000, Tian, Kevin wrote:

> > +	if (!viommu)
> > +		return -ENODEV;
> > +	if (WARN_ON_ONCE(!viommu->ops || !viommu->ops->supports_veventq ||
> > +			 !viommu->ops->supports_veventq(type)))
> > +		return -EOPNOTSUPP;
> 
> Hmm the driver knows which type is supported by itself before
> calling this helper. Why bother having the helper calling into
> the driver again to verify?

Indeed, it might make sense to protect this with

        if (IS_ENABLED(CONFIG_IOMMUFD_TEST))

As a compiled out assertion

Or drop it

We shouldn't have unnecessary argument validation on fast paths,
!viommu should go too.

Jason
Jason Gunthorpe Jan. 10, 2025, 5:41 p.m. UTC | #3
On Tue, Jan 07, 2025 at 09:10:11AM -0800, Nicolin Chen wrote:
> +/*
> + * Typically called in driver's threaded IRQ handler.
> + * The @type and @event_data must be defined in include/uapi/linux/iommufd.h
> + */
> +int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
> +				enum iommu_veventq_type type, void *event_data,
> +				size_t data_len)
> +{
> +	struct iommufd_veventq *veventq;
> +	struct iommufd_vevent *vevent;
> +	int rc = 0;
> +
> +	if (!viommu)
> +		return -ENODEV;
> +	if (WARN_ON_ONCE(!viommu->ops || !viommu->ops->supports_veventq ||
> +			 !viommu->ops->supports_veventq(type)))
> +		return -EOPNOTSUPP;
> +	if (WARN_ON_ONCE(!data_len || !event_data))
> +		return -EINVAL;
> +
> +	down_read(&viommu->veventqs_rwsem);
> +
> +	veventq = iommufd_viommu_find_veventq(viommu, type);
> +	if (!veventq) {
> +		rc = -EOPNOTSUPP;
> +		goto out_unlock_veventqs;
> +	}
> +
> +	vevent = kmalloc(struct_size(vevent, event_data, data_len), GFP_KERNEL);
> +	if (!vevent) {
> +		rc = -ENOMEM;
> +		goto out_unlock_veventqs;
> +	}
> +	memcpy(vevent->event_data, event_data, data_len);

The page fault path is self limited because end point devices are only
able to issue a certain number of PRI's before they have to stop.

But the async events generated by something like the SMMU are not self
limiting and we can have a huge barrage of them. I think you need to
add some kind of limiting here otherwise we will OOM the kernel and
crash, eg if the VM spams protection errors.

The virtual event queue should behave the same as if the physical
event queue overflows, and that logic should be in the smmu driver -
this should return some Exxx to indicate the queue is filled.

I supposed we will need a way to indicate lost events to userspace on
top of this?

Presumably userspace should specify the max queue size.

Jason
Nicolin Chen Jan. 10, 2025, 6:38 p.m. UTC | #4
On Fri, Jan 10, 2025 at 01:41:32PM -0400, Jason Gunthorpe wrote:
> On Tue, Jan 07, 2025 at 09:10:11AM -0800, Nicolin Chen wrote:
> > +/*
> > + * Typically called in driver's threaded IRQ handler.
> > + * The @type and @event_data must be defined in include/uapi/linux/iommufd.h
> > + */
> > +int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
> > +				enum iommu_veventq_type type, void *event_data,
> > +				size_t data_len)
> > +{
> > +	struct iommufd_veventq *veventq;
> > +	struct iommufd_vevent *vevent;
> > +	int rc = 0;
> > +
> > +	if (!viommu)
> > +		return -ENODEV;
> > +	if (WARN_ON_ONCE(!viommu->ops || !viommu->ops->supports_veventq ||
> > +			 !viommu->ops->supports_veventq(type)))
> > +		return -EOPNOTSUPP;
> > +	if (WARN_ON_ONCE(!data_len || !event_data))
> > +		return -EINVAL;
> > +
> > +	down_read(&viommu->veventqs_rwsem);
> > +
> > +	veventq = iommufd_viommu_find_veventq(viommu, type);
> > +	if (!veventq) {
> > +		rc = -EOPNOTSUPP;
> > +		goto out_unlock_veventqs;
> > +	}
> > +
> > +	vevent = kmalloc(struct_size(vevent, event_data, data_len), GFP_KERNEL);
> > +	if (!vevent) {
> > +		rc = -ENOMEM;
> > +		goto out_unlock_veventqs;
> > +	}
> > +	memcpy(vevent->event_data, event_data, data_len);
> 
> The page fault path is self limited because end point devices are only
> able to issue a certain number of PRI's before they have to stop.
> 
> But the async events generated by something like the SMMU are not self
> limiting and we can have a huge barrage of them. I think you need to
> add some kind of limiting here otherwise we will OOM the kernel and
> crash, eg if the VM spams protection errors.

Ack. I think we can just use an atomic counter in the producer
and consumer functions.
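
A minimal sketch of that counting scheme (the num_events and veventq_depth field names are assumptions, not from the posted patch):

/* Producer side, e.g. in iommufd_viommu_report_event(): */
static bool iommufd_veventq_try_charge(struct iommufd_veventq *veventq)
{
	/* Refuse the event once the queue already holds veventq_depth entries */
	return atomic_add_unless(&veventq->num_events, 1,
				 veventq->veventq_depth);
}

/* Consumer side, when read() copies one vEVENT out to userspace: */
static void iommufd_veventq_uncharge(struct iommufd_veventq *veventq)
{
	atomic_dec(&veventq->num_events);
}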

> The virtual event queue should behave the same as if the physical
> event queue overflows, and that logic should be in the smmu driver -
> this should return some Exxx to indicate the queue is filled.

Hmm, the driver only screams...

static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
{
[...]
		/*
		 * Not much we can do on overflow, so scream and pretend we're
		 * trying harder.
		 */
		if (queue_sync_prod_in(q) == -EOVERFLOW)
			dev_err(smmu->dev, "EVTQ overflow detected -- events lost\n");

> I supposed we will need a way to indicate lost events to userspace on
> top of this?

Perhaps another u32 flag in the arm_smmuv3_vevent struct to report
an overflow. That said, what userspace/VMM will need to do with it?

> Presumably userspace should specify the max queue size.

Yes. Similarly, vCMDQ has a vcmdq_log2size in the driver structure
for that. For veventq, this piece is core managed, so we will need
a veventq_size or so in the common iommufd_veventq_alloc structure.
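
Roughly something like the following, where everything other than the struct name itself is just a placeholder, not final uAPI:

/* Placeholder sketch of the allocation request */
struct iommu_veventq_alloc {
	__u32 size;
	__u32 flags;
	__u32 viommu_id;	/* the vIOMMU this queue belongs to */
	__u32 type;		/* enum iommu_veventq_type */
	__u32 veventq_depth;	/* max buffered vEVENTs before overflow */
	__u32 out_veventq_id;
	__u32 out_veventq_fd;	/* read() this fd to consume vEVENTs */
	__u32 __reserved;
};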

Thanks!
Nicolin
Nicolin Chen Jan. 10, 2025, 6:40 p.m. UTC | #5
On Fri, Jan 10, 2025 at 10:51:49AM -0400, Jason Gunthorpe wrote:
> On Fri, Jan 10, 2025 at 07:12:46AM +0000, Tian, Kevin wrote:
> 
> > > +	if (!viommu)
> > > +		return -ENODEV;
> > > +	if (WARN_ON_ONCE(!viommu->ops || !viommu->ops->supports_veventq ||
> > > +			 !viommu->ops->supports_veventq(type)))
> > > +		return -EOPNOTSUPP;
> > 
> > Hmm the driver knows which type is supported by itself before
> > calling this helper. Why bother having the helper calling into
> > the driver again to verify?
> 
> Indeed, it might make sense to protect this with
> 
>         if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
> 
> As a compiled out assertion
> 
> Or drop it
> 
> We shouldn't have unnecessary argument validation on fast paths,
> !viommu should go too.

Ack. I will drop those two if-statements.

Nicolin
Jason Gunthorpe Jan. 10, 2025, 7:51 p.m. UTC | #6
On Fri, Jan 10, 2025 at 10:38:42AM -0800, Nicolin Chen wrote:
> > The virtual event queue should behave the same as if the physical
> > event queue overflows, and that logic should be in the smmu driver -
> > this should return some Exxx to indicate the queue is filled.
> 
> Hmm, the driver only screams...
> 
> static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
> {
> [...]
> 		/*
> 		 * Not much we can do on overflow, so scream and pretend we're
> 		 * trying harder.
> 		 */
> 		if (queue_sync_prod_in(q) == -EOVERFLOW)
> 			dev_err(smmu->dev, "EVTQ overflow detected -- events lost\n");

Well it must know from the HW somehow that the overflow has happened??

> > I supposed we will need a way to indicate lost events to userspace on
> > top of this?
> 
> Perhaps another u32 flag in the arm_smmuv3_vevent struct to report
> an overflow. That said, what userspace/VMM will need to do with it?

Trigger the above code in the VM somehow?

Jason
Nicolin Chen Jan. 10, 2025, 7:56 p.m. UTC | #7
On Fri, Jan 10, 2025 at 03:51:14PM -0400, Jason Gunthorpe wrote:
> On Fri, Jan 10, 2025 at 10:38:42AM -0800, Nicolin Chen wrote:
> > > The virtual event queue should behave the same as if the physical
> > > event queue overflows, and that logic should be in the smmu driver -
> > > this should return some Exxx to indicate the queue is filled.
> > 
> > Hmm, the driver only screams...
> > 
> > static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
> > {
> > [...]
> > 		/*
> > 		 * Not much we can do on overflow, so scream and pretend we're
> > 		 * trying harder.
> > 		 */
> > 		if (queue_sync_prod_in(q) == -EOVERFLOW)
> > 			dev_err(smmu->dev, "EVTQ overflow detected -- events lost\n");
> 
> Well it must know from the HW somehow that the overflow has happened??
> 
> > > I supposed we will need a way to indicate lost events to userspace on
> > > top of this?
> > 
> > Perhaps another u32 flag in the arm_smmuv3_vevent struct to report
> > an overflow. That said, what userspace/VMM will need to do with it?
> 
> Trigger the above code in the VM somehow?

Oh, I see. I misunderstood somehow..

Thanks
Nicolin
Jason Gunthorpe Jan. 13, 2025, 7:21 p.m. UTC | #8
On Sun, Jan 12, 2025 at 09:37:41PM -0800, Nicolin Chen wrote:

> > > > I supposed we will need a way to indicate lost events to userspace on
> > > > top of this?
> > > 
> > > Perhaps another u32 flag in the arm_smmuv3_vevent struct to report
> > > an overflow. That said, what userspace/VMM will need to do with it?
> > 
> > Trigger the above code in the VM somehow?
> 
> I found two ways of forwarding an overflow flag:
> 
> 1. Return -EOVERFLOW to read(). But it cannot return the read bytes
> any more:

You could not return any bytes, it would have to be 0 bytes read, ie
immediately return EOVERFLOW and do nothing else.

Returning EOVERFLOW from read would have to also clear the overflow
indicator.

The other approach would be to add a sequence number to each event and
let userspace detect the non-monotonicity. It would require adding a
header to the native ARM evt.

> 2. Return EPOLLERR via pollfd.revents. But it cannot use POLLERR
> for other errors any more:
> --------------------------------------------------
> @@ -420,2 +421,4 @@ static __poll_t iommufd_eventq_fops_poll(struct file *filep,
>         poll_wait(filep, &eventq->wait_queue, wait);
> +       if (test_bit(IOMMUFD_VEVENTQ_ERROR_OVERFLOW, veventq->errors))
> +               return EPOLLERR;
>         mutex_lock(&eventq->mutex);

But then how do you clear the error? I've only seen POLLERR used for
fatal conditions so there is no recovery, it is permanent.

Jason
Nicolin Chen Jan. 13, 2025, 7:47 p.m. UTC | #9
On Mon, Jan 13, 2025 at 03:21:44PM -0400, Jason Gunthorpe wrote:
> On Sun, Jan 12, 2025 at 09:37:41PM -0800, Nicolin Chen wrote:
> 
> > > > > I supposed we will need a way to indicate lost events to userspace on
> > > > > top of this?
> > > > 
> > > > Perhaps another u32 flag in the arm_smmuv3_vevent struct to report
> > > > an overflow. That said, what userspace/VMM will need to do with it?
> > > 
> > > Trigger the above code in the VM somehow?
> > 
> > I found two ways of forwarding an overflow flag:
> > 
> > 1. Return -EOVERFLOW to read(). But it cannot return the read bytes
> > any more:
> 
> You could not return any bytes, it would have to be 0 bytes read, ie
> immediately return EOVERFLOW and do nothing else.
> 
> Returning EOVERFLOW from read would have to also clear the overflow
> indicator.

OK. That means user space should read again for actual events in the
queue, after getting the first EOVERFLOW.

One concern is, if the report() keeps producing events to the queue,
it will always set the EOVERFLOW flag, then user space won't have a
chance to read the events out until the last report(). Wondering if
this would make sense, as I see SMMU driver's arm_smmu_evtq_thread()
reporting an OVERFLOW while allowing SW to continue reading the evtq.

> The other approach would be to add a sequence number to each event and
> let userspace detect the non-monotonicity. It would require adding a
> header to the native ARM evt.

Yea, I thought about that. The tricky thing is that the header will
be a core-level header pairing with a driver-level vEVENTQ type and
can never change its length, though we can define a 64-bit flag that
can reserve the other 63 bits for future use?

That being asked, this seems to be a better approach as user space
can get the overflow flag while doing the read() that can potentially
clear the overflow flag too, v.s. blocked until the last report().

> > 2. Return EPOLLERR via pollfd.revents. But it cannot use POLLERR
> > for other errors any more:
> > --------------------------------------------------
> > @@ -420,2 +421,4 @@ static __poll_t iommufd_eventq_fops_poll(struct file *filep,
> >         poll_wait(filep, &eventq->wait_queue, wait);
> > +       if (test_bit(IOMMUFD_VEVENTQ_ERROR_OVERFLOW, veventq->errors))
> > +               return EPOLLERR;
> >         mutex_lock(&eventq->mutex);
> 
> But then how do you clear the error? I've only seen POLLERR used for
> fatal conditions so there is no recovery, it is permanent.

Overflow means the queue has tons of events for user space to read(),
so user space should read() to clear the error.

I found this piece in the eventfd manual, so I was wondering if we can reuse it:
https://man7.org/linux/man-pages/man2/eventfd.2.html
              •  If an overflow of the counter value was detected, then
                 select(2) indicates the file descriptor as being both
                 readable and writable, and poll(2) returns a POLLERR
                 event.  As noted above, write(2) can never overflow the
                 counter.  However an overflow can occur if 2^64 eventfd
                 "signal posts" were performed by the KAIO subsystem
                 (theoretically possible, but practically unlikely).  If
                 an overflow has occurred, then read(2) will return that
                 maximum uint64_t value (i.e., 0xffffffffffffffff).

Thanks
Nicolin
Jason Gunthorpe Jan. 13, 2025, 7:54 p.m. UTC | #10
On Mon, Jan 13, 2025 at 11:47:52AM -0800, Nicolin Chen wrote:

> > You could not return any bytes, it would have to be 0 bytes read, ie
> > immediately return EOVERFLOW and do nothing else.
> > 
> > Returning EOVERFLOW from read would have to also clear the overflow
> > indicator.
> 
> OK. That means user space should read again for actual events in the
> queue, after getting the first EOVERFLOW.

Yes

> One concern is, if the report() keeps producing events to the queue,
> it will always set the EOVERFLOW flag, then user space won't have a
> chance to read the events out until the last report(). Wondering if
> this would make sense, as I see SMMU driver's arm_smmu_evtq_thread()
> reporting an OVERFLOW while allowing SW to continue reading the evtq.

Yes, this issue seems fatal to this idea. You need to report the
overflow at the right point in the queue so that userspace can read
the data out to free up the queue, otherwise it will livelock.

> > The other approach would be to add a sequence number to each event and
> > let userspace detect the non-monotonicity. It would require adding a
> > header to the native ARM evt.
> 
> Yea, I thought about that. The tricky thing is that the header will
> be a core-level header pairing with a driver-level vEVENTQ type and
> can never change its length, though we can define a 64-bit flag that
> can reserve the other 63 bits for future use?

The header format could be revised by changing the driver specific
format tag.

You'd want to push a special event when the first overflow happens and
probably also report a counter so userspace can know how many events
got lost.

This seems most robust and simplest to implement..

I think I'd implement it by having a static overflow list entry so no
memory allocation is needed and just keep moving that entry to the
back of the list every time an event is lost. This way it will cover
lost events due to memory outages too
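
In kernel-list terms that could look roughly like this; the deliver list, the embedded lost_events vevent, its node member, and the num_lost counter are all assumed names:

/* Sketch: caller holds the deliver-list lock; no allocation can fail here */
static void iommufd_veventq_note_lost(struct iommufd_veventq *veventq)
{
	veventq->num_lost++;	/* assumed counter of dropped events */
	/* Keep the single preallocated overflow vevent at the tail so a
	 * reader always finds it after the last real event */
	list_move_tail(&veventq->lost_events.node, &veventq->deliver);
}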

For old formats like the fault queue you could return EOVERFLOW
whenever the sequence number becomes discontiguous or it sees the
overflow event..

Jason
Nicolin Chen Jan. 17, 2025, 10:11 p.m. UTC | #11
On Tue, Jan 14, 2025 at 09:41:58AM -0400, Jason Gunthorpe wrote:
> On Mon, Jan 13, 2025 at 12:44:37PM -0800, Nicolin Chen wrote:
> > 	IOMMU_VEVENT_HEADER_FLAGS_OVERFLOW = (1 << 0),
> > };
> > 
> > struct iommufd_vevent_header_v1 {
> > 	__u64 flags;
> > 	__u32 num_events;
> > 	__u32 num_overflows; // valid if flag_overflow is set
> > };
> 
> num_overflows is hard, I'd just keep a counter.

Ack. How does this look overall?

@@ -1013,6 +1013,33 @@ struct iommu_ioas_change_process {
 #define IOMMU_IOAS_CHANGE_PROCESS \
        _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS)

+/**
+ * enum iommu_veventq_state - state for struct iommufd_vevent_header
+ * @IOMMU_VEVENTQ_STATE_OK: vEVENTQ is running
+ * @IOMMU_VEVENTQ_STATE_OVERFLOW: vEVENTQ is overflowed
+ */
+enum iommu_veventq_state {
+       IOMMU_VEVENTQ_STATE_OK = (1 << 0),
+       IOMMU_VEVENTQ_STATE_OVERFLOW = (1 << 1),
+};
+
+/**
+ * struct iommufd_vevent_header - Virtual Event Header for a vEVENTQ Status
+ * @state: One of enum iommu_veventq_state
+ * @counter: A counter reflecting the state of the vEVENTQ
+ *
+ * ----------------------------------------------------------------------------
+ * | @state                       | @counter                                  |
+ * ----------------------------------------------------------------------------
+ * | IOMMU_VEVENTQ_STATE_OK       | number of readable vEVENTs in the vEVENTQ |
+ * | IOMMU_VEVENTQ_STATE_OVERFLOW | number of missed vEVENTs since overflow   |
+ * ----------------------------------------------------------------------------
+ */
+struct iommufd_vevent_header {
+       __u32 state;
+       __u32 counter;
+};
+
 /**
  * enum iommu_veventq_type - Virtual Event Queue Type
  * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use
@@ -1050,6 +1077,13 @@ struct iommu_vevent_arm_smmuv3 {
  *
  * Explicitly allocate a virtual event queue interface for a vIOMMU. A vIOMMU
  * can have multiple FDs for different types, but is confined to one per @type.
+ * User space should open the @out_veventq_fd to read vEVENTs out of a vEVENTQ,
+ * if there are vEVENTs available. A vEVENTQ will overflow if the number of the
+ * vEVENTs hits @veventq_depth.
+ *
+ * Each vEVENT in a vEVENTQ encloses a struct iommufd_vevent_header followed by
+ * a type-specific data structure. The iommufd_vevent_header reports the status
+ * of the vEVENTQ when the vEVENT is added to the vEVENTQ.
  */
 struct iommu_veventq_alloc {
        __u32 size;
Jason Gunthorpe Jan. 20, 2025, 6:18 p.m. UTC | #12
On Fri, Jan 17, 2025 at 02:11:15PM -0800, Nicolin Chen wrote:
> +/**
> + * struct iommufd_vevent_header - Virtual Event Header for a vEVENTQ Status
> + * @state: One of enum iommu_veventq_state

I'd probably just make this a flag with overflow as the only current flag?

> + * @counter: A counter reflecting the state of the vEVENTQ

> + * ----------------------------------------------------------------------------
> + * | @state                       | @counter                                  |
> + * ----------------------------------------------------------------------------
> + * | IOMMU_VEVENTQ_STATE_OK       | number of readable vEVENTs in the vEVENTQ |
> + * | IOMMU_VEVENTQ_STATE_OVERFLOW | number of missed vEVENTs since overflow   |
> + * ----------------------------------------------------------------------------

When I said counter I literally meant a counter of the number of events
that were sent into the queue. So if events are dropped there is a
trivial gap in the count. Very easy to implement

IOMMU_VEVENTQ_STATE_OVERFLOW with a 0 length event is seen if events
have been lost and no subsequent events are present. It exists to
ensure timely delivery of the overflow event to userspace. counter
will be the sequence number of the next successful event.

If events are lost in the middle of the queue then flags will remain 0
but counter will become non-monotonic. A counter delta > 1 indicates
that many events have been lost.
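
With those semantics, userspace detection of lost events could look roughly like this; the header layout, the overflow flag value, and the fixed per-type payload size are assumptions based on the proposal above:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Assumed header: __u32 flags (bit 0 = overflow), __u32 sequence */
struct vevent_hdr { uint32_t flags; uint32_t sequence; };

/* Walk one read() buffer; return how many events were lost */
static size_t parse_vevents(const uint8_t *buf, size_t len, size_t evt_len,
			    uint32_t *last_seq)
{
	size_t off = 0, lost = 0;

	while (off + sizeof(struct vevent_hdr) <= len) {
		struct vevent_hdr hdr;

		memcpy(&hdr, buf + off, sizeof(hdr));
		off += sizeof(hdr);
		if (hdr.flags & 1) {	/* overflow: 0-length, last in buffer */
			lost += hdr.sequence - *last_seq - 1;
			*last_seq = hdr.sequence - 1;
			break;
		}
		if (hdr.sequence != *last_seq + 1)	/* gap: events were lost */
			lost += hdr.sequence - *last_seq - 1;
		*last_seq = hdr.sequence;
		off += evt_len;		/* driver-specific payload follows */
	}
	return lost;
}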

Jason
Nicolin Chen Jan. 20, 2025, 8:52 p.m. UTC | #13
On Mon, Jan 20, 2025 at 02:18:54PM -0400, Jason Gunthorpe wrote:
> On Fri, Jan 17, 2025 at 02:11:15PM -0800, Nicolin Chen wrote:
> > +/**
> > + * struct iommufd_vevent_header - Virtual Event Header for a vEVENTQ Status
> > + * @state: One of enum iommu_veventq_state
> 
> I'd probably just make this a flag with overflow as the only current flag?

Ack.

> > + * @counter: A counter reflecting the state of the vEVENTQ
> 
> > + * ----------------------------------------------------------------------------
> > + * | @state                       | @counter                                  |
> > + * ----------------------------------------------------------------------------
> > + * | IOMMU_VEVENTQ_STATE_OK       | number of readable vEVENTs in the vEVENTQ |
> > + * | IOMMU_VEVENTQ_STATE_OVERFLOW | number of missed vEVENTs since overflow   |
> > + * ----------------------------------------------------------------------------
> 
> When I said counter I literally meant a counter of the number of events
> that were sent into the queue. So if events are dropped there is a
> trivial gap in the count. Very easy to implement

The counter of the number of events in the vEVENTQ could decrease
when userspace reads the queue. But you were saying "the number of
events that were sent into the queue", which is like a PROD index
that would keep growing but reset to 0 after UINT_MAX?

> IOMMU_VEVENTQ_STATE_OVERFLOW with a 0 length event is seen if events
> have been lost and no subsequent events are present. It exists to
> ensure timely delivery of the overflow event to userspace. counter
> will be the sequence number of the next successful event.

So userspace should first read the header to decide whether or not
to read a vEVENT. If the header indicates an overflow, it should skip the vEVENT
struct and read the next header?

> If events are lost in the middle of the queue then flags will remain 0
> but counter will become non-monotonic. A counter delta > 1 indicates
> that many events have been lost.

I don't quite get the "no subsequent events" v.s. "in the middle of
the queue"..

The producer is the driver calling iommufd_viommu_report_event that
only produces a single vEVENT at a time. When the number of vEVENTs
in the vEVENTQ hits the @veventq_depth, it won't insert new vEVENTs
but add an overflow (or exception) node to the head of deliver list
and increase the producer index so the next vEVENT that can find an
empty space in the queue will have an index with a gap (delta >= 1)?

Thanks
Nicolin
Jason Gunthorpe Jan. 21, 2025, 6:36 p.m. UTC | #14
On Mon, Jan 20, 2025 at 12:52:09PM -0800, Nicolin Chen wrote:
> The counter of the number of events in the vEVENTQ could decrease
> when userspace reads the queue. But you were saying "the number of
> events that were sent into the queue", which is like a PROD index
> that would keep growing but reset to 0 after UINT_MAX?

yes

> > IOMMU_VEVENTQ_STATE_OVERFLOW with a 0 length event is seen if events
> > have been lost and no subsequent events are present. It exists to
> > ensure timely delivery of the overflow event to userspace. counter
> > will be the sequence number of the next successful event.
> 
> So userspace should first read the header to decide whether or not
> to read a vEVENT. If the header indicates an overflow, it should skip the vEVENT
> struct and read the next header?

Yes, but there won't be a next header. overflow would always be the
last thing in a read() response. If there is another event then
overflow is indicated by non-monotonic count.

> > If events are lost in the middle of the queue then flags will remain 0
> > but counter will become non-monotonic. A counter delta > 1 indicates
> > that many events have been lost.
> 
> I don't quite get the "no subsequent events" v.s. "in the middle of
> the queue"..

I mean to suppress specific overflow events to userspace if the counter already
fully indicates overflow.

The purpose of the overflow event is specifically, and only, to
indicate immediately that an overflow occurred at the end of the queue,
and no additional events have been pushed since the overflow.

Without this we could lose an event and userspace may not realize
it for a long time.

> The producer is the driver calling iommufd_viommu_report_event that
> only produces a single vEVENT at a time. When the number of vEVENTs
> in the vEVENTQ hits the @veventq_depth, it won't insert new vEVENTs
> but add an overflow (or exception) node to the head of deliver list
> and increase the producer index so the next vEVENT that can find an
> empty space in the queue will have an index with a gap (delta >= 1)?

Yes, but each new overflow should move the single preallocated
overflow node back to the end of the list, and the read side should
skip the overflow node if it is not the last entry in the list
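
On the read side that would be roughly the following, again with assumed names (deliver list, the preallocated lost_events vevent, and a node member in struct iommufd_vevent):

/* Sketch: read() side, deliver-list lock held */
static struct iommufd_vevent *
iommufd_veventq_next_vevent(struct iommufd_veventq *veventq)
{
	struct iommufd_vevent *cur, *next;

	list_for_each_entry_safe(cur, next, &veventq->deliver, node) {
		/* Ignore the overflow marker unless it is the last entry */
		if (cur == &veventq->lost_events &&
		    !list_is_last(&cur->node, &veventq->deliver))
			continue;
		list_del(&cur->node);
		return cur;
	}
	return NULL;
}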

Jason
Jason Gunthorpe Jan. 21, 2025, 8:09 p.m. UTC | #15
On Tue, Jan 21, 2025 at 11:55:16AM -0800, Nicolin Chen wrote:
> Ack. Then I think we should name it "index", beside a "counter"
> indicating the number of events in the queue. Or perhaps a pair
> of consumer and producer indexes that wrap at end of a limit.

sequence perhaps would be a good name

> > > > IOMMU_VEVENTQ_STATE_OVERFLOW with a 0 length event is seen if events
> > > > have been lost and no subsequent events are present. It exists to
> > > > ensure timely delivery of the overflow event to userspace. counter
> > > > will be the sequence number of the next successful event.
> > > 
> > > So userspace should first read the header to decide whether or not
> > > to read a vEVENT. If the header indicates an overflow, it should skip the vEVENT
> > > struct and read the next header?
> > 
> > Yes, but there won't be a next header. overflow would always be the
> > last thing in a read() response. If there is another event then
> > overflow is indicated by non-monotonic count.
> 
> I am not 100% sure why "overflow would always be the last thing
> in a read() response". I thought that kernel should immediately
> report an overflow to user space when the vEVENTQ is overflowed.

As below, if you observe overflow then it was at the end of the kernel
queue and there is no further events after it. So it should always end
up last.

Perhaps we could enforce this directly in the kernel's read by making
it the only, first and last, response to read.

> Yet, thinking about this once again: user space actually has its
> own queue. There's probably no point in letting it know about an
> overflow immediately when the kernel vEVENTQ overflows until its
> own user queue overflows after it reads the entire vEVENTQ so it
> can trigger a vHW event/irq to the VM?

The kernel has no idea what userspace is doing, the kernel's job
should be to ensure timely delivery of all events, if an event is lost
it should ensure timely delivery of the lost event notification. There
is little else it can do.

I suppose userspace has a choice, it could discard events from the
kernel when its virtual HW queue gets full, or it could backpressure
the kernel and stop reading hoping the kernel queue will buffer it
further.

> > Without this we could lose an event and userspace may not realize
> > it for a long time.
> 
> I see. Because there is no further new event, there would be no
> new index to indicate a gap. Thus, we need an overflow node.

yes

> If the number of events in the queue is below @veventq_depth as
> userspace consumed the events from the queue, I think a new
> iommufd_viommu_report_event call should delete the overflow node
> from the end of the list, right? 

You can do that, or the read side can ignore a non-end overflow node.

I'm not sure which option will turn out to be easier to implement..

Jason
Jason Gunthorpe Jan. 21, 2025, 9:14 p.m. UTC | #16
On Tue, Jan 21, 2025 at 01:02:16PM -0800, Nicolin Chen wrote:
> On Tue, Jan 21, 2025 at 04:09:24PM -0400, Jason Gunthorpe wrote:
> > On Tue, Jan 21, 2025 at 11:55:16AM -0800, Nicolin Chen wrote:
> > > > > > IOMMU_VEVENTQ_STATE_OVERFLOW with a 0 length event is seen if events
> > > > > > have been lost and no subsequent events are present. It exists to
> > > > > > ensure timely delivery of the overflow event to userspace. counter
> > > > > > will be the sequence number of the next successful event.
> > > > > 
> > > > > So userspace should first read the header to decide whether or not
> > > > > to read a vEVENT. If the header indicates an overflow, it should skip the vEVENT
> > > > > struct and read the next header?
> > > > 
> > > > Yes, but there won't be a next header. overflow would always be the
> > > > last thing in a read() response. If there is another event then
> > > > overflow is indicated by non-monotonic count.
> > > 
> > > I am not 100% sure why "overflow would always be the last thing
> > > in a read() response". I thought that kernel should immediately
> > > report an overflow to user space when the vEVENTQ is overflowed.
> > 
> > As below, if you observe overflow then it was at the end of the kernel
> > queue and there is no further events after it. So it should always end
> > up last.
> > 
> > Perhaps we could enforce this directly in the kernel's read by making
> > it the only, first and last, response to read.
> 
> Hmm, since the overflow node is the last node in the list, isn't
> it already an enforcement it's the only/first/last node to read?
> (Assuming we choose to delete the overflow node from the list if
>  new event can be inserted.)

Since we don't hold the spinlock the whole time there is a race where
we could pull the overflow off and then another entry could be pushed
while we do the copy_to_user.

> > > Yet, thinking about this once again: user space actually has its
> > > own queue. There's probably no point in letting it know about an
> > > overflow immediately when the kernel vEVENTQ overflows until its
> > > own user queue overflows after it reads the entire vEVENTQ so it
> > > can trigger a vHW event/irq to the VM?
> > 
> > The kernel has no idea what userspace is doing, the kernel's job
> > should be to ensure timely delivery of all events, if an event is lost
> > it should ensure timely delivery of the lost event notification. There
> > is little else it can do.
> 
> Yet, "timely" means still having an entire-queue-size-long delay
> since the overflow node is at the end of the queue, right?

Yes, but also in this case the vIOMMU isn't experiencing an overflow
so it doesn't need to know about it. 

The main point here is if we somehow loose an event the vIOMMU driver
may need to do something to recover from the lost event. It doesn't
become relevant until the lost event is present in the virtual HW
queue.

There is also the minor detail of what happens if the hypervisor HW
queue overflows - I don't know the answer here. It is security
concerning since the VM can spam DMA errors at high rate. :|

Jason
Nicolin Chen Jan. 21, 2025, 9:40 p.m. UTC | #17
On Tue, Jan 21, 2025 at 05:14:04PM -0400, Jason Gunthorpe wrote:
> On Tue, Jan 21, 2025 at 01:02:16PM -0800, Nicolin Chen wrote:
> > On Tue, Jan 21, 2025 at 04:09:24PM -0400, Jason Gunthorpe wrote:
> > > On Tue, Jan 21, 2025 at 11:55:16AM -0800, Nicolin Chen wrote:
> > > > > > > IOMMU_VEVENTQ_STATE_OVERFLOW with a 0 length event is seen if events
> > > > > > > have been lost and no subsequent events are present. It exists to
> > > > > > > ensure timely delivery of the overflow event to userspace. counter
> > > > > > > will be the sequence number of the next successful event.
> > > > > > 
> > > > > > So userspace should first read the header to decide whether or not
> > > > > to read a vEVENT. If the header indicates an overflow, it should skip the vEVENT
> > > > > > struct and read the next header?
> > > > > 
> > > > > Yes, but there won't be a next header. overflow would always be the
> > > > > last thing in a read() response. If there is another event then
> > > > > overflow is indicated by non-monotonic count.
> > > > 
> > > > I am not 100% sure why "overflow would always be the last thing
> > > > in a read() response". I thought that kernel should immediately
> > > > report an overflow to user space when the vEVENTQ is overflowed.
> > > 
> > > As below, if you observe overflow then it was at the end of the kernel
> > > queue and there is no further events after it. So it should always end
> > > up last.
> > > 
> > > Perhaps we could enforce this directly in the kernel's read by making
> > > it the only, first and last, response to read.
> > 
> > Hmm, since the overflow node is the last node in the list, isn't
> > it already an enforcement it's the only/first/last node to read?
> > (Assuming we choose to delete the overflow node from the list if
> >  new event can be inserted.)
> 
> Since we don't hold the spinlock the whole time there is a race where
> we could pull the overflow off and then another entry could be pushed
> while we do the copy_to_user.

I see. I'll be careful around that. I can imagine that one tricky
thing can be to restore the overflow node back to the list when a
copy_to_user fails..

> > > > Yet, thinking about this once again: user space actually has its
> > > > own queue. There's probably no point in letting it know about an
> > > > overflow immediately when the kernel vEVENTQ overflows until its
> > > > own user queue overflows after it reads the entire vEVENTQ so it
> > > > can trigger a vHW event/irq to the VM?
> > > 
> > > The kernel has no idea what userspace is doing, the kernel's job
> > > should be to ensure timely delivery of all events, if an event is lost
> > > it should ensure timely delivery of the lost event notification. There
> > > is little else it can do.
> > 
> > Yet, "timely" means still having an entire-queue-size-long delay
> > since the overflow node is at the end of the queue, right?
> 
> Yes, but also in this case the vIOMMU isn't experiencing an overflow
> so it doesn't need to know about it. 
> 
> The main point here is if we somehow loose an event the vIOMMU driver
> may need to do something to recover from the lost event. It doesn't
> become relevant until the lost event is present in the virtual HW
> queue.

Ack.

> There is also the minor detail of what happens if the hypervisor HW
> queue overflows - I don't know the answer here. It is security
> concerning since the VM can spam DMA errors at high rate. :|

In my view, the hypervisor queue is the vHW queue for the VM, so
it should act like a HW, which means it's up to the guest kernel
driver that handles the high rate DMA errors..

The entire thing is complicated since we have a double buffer: a
kernel queue and a hypervisor queue. I am not very sure if both 
queues should have the same depth or perhaps the kernel one might
have a slightly bigger size to buffer the hypervisor one. If only
kernel could directly add events to the hypervisor queue and also
set the overflow flag in the hypervisor..

Thanks
Nicolin
Nicolin Chen Jan. 22, 2025, 7:15 a.m. UTC | #18
On Tue, Jan 21, 2025 at 08:21:28PM -0400, Jason Gunthorpe wrote:
> On Tue, Jan 21, 2025 at 01:40:05PM -0800, Nicolin Chen wrote:
> > > There is also the minor detail of what happens if the hypervisor HW
> > > queue overflows - I don't know the answer here. It is security
> > > concerning since the VM can spam DMA errors at high rate. :|
> > 
> > In my view, the hypervisor queue is the vHW queue for the VM, so
> > it should act like a HW, which means it's up to the guest kernel
> > driver that handles the high rate DMA errors..
> 
> I'm mainly wondering what happens if the single physical kernel
> event queue overflows because it is DOS'd by a VM and the hypervisor
> cannot drain it fast enough?
> 
> I haven't looked closely but is there some kind of rate limiting or
> otherwise to mitigate DOS attacks on the shared event queue from VMs?

SMMUv3 reads the event out of the physical kernel event queue,
and adds that to faultq or veventq or prints it out. So, it'd
not overflow because of DOS? And all other drivers should do
the same?

Thanks
Nicolin
Nicolin Chen Jan. 22, 2025, 7:54 p.m. UTC | #19
On Wed, Jan 22, 2025 at 09:33:35AM +0000, Tian, Kevin wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Wednesday, January 22, 2025 3:16 PM
> > 
> > On Tue, Jan 21, 2025 at 08:21:28PM -0400, Jason Gunthorpe wrote:
> > > On Tue, Jan 21, 2025 at 01:40:05PM -0800, Nicolin Chen wrote:
> > > > > There is also the minor detail of what happens if the hypervisor HW
> > > > > queue overflows - I don't know the answer here. It is security
> > > > > concerning since the VM can spam DMA errors at high rate. :|
> > > >
> > > > In my view, the hypervisor queue is the vHW queue for the VM, so
> > > > it should act like a HW, which means it's up to the guest kernel
> > > > driver that handles the high rate DMA errors..
> > >
> > > I'm mainly wondering what happens if the single physical kernel
> > > event queue overflows because it is DOS'd by a VM and the hypervisor
> > > cannot drain it fast enough?
> > >
> > > I haven't looked closely but is there some kind of rate limiting or
> > > otherwise to mitigate DOS attacks on the shared event queue from VMs?
> > 
> > SMMUv3 reads the event out of the physical kernel event queue,
> > and adds that to faultq or veventq or prints it out. So, it'd
> > not overflow because of DOS? And all other drivers should do
> > the same?
> > 
> 
> "add that to faultq or eventq" could take time or the irqthread
> could be preempted for various reasons then there is always an
> window within which an overflow condition could occur due to
> the smmu driver incapable of fetching pending events timely.

Oh, I see..

> On VT-d the driver could disable reporting non-recoverable fault
> for a given device via a control bit in the PASID entry, but I didn't
> see a similar knob for PRQ.

ARM has an event-suppressing CD.R bit to disable event recording
for a device. However, the stage-1 CD is under the control of the
guest kernel or the VMM that owns the guest RAM..

ARM seems to also have an interesting event merging feature:
 STE.MEV, bit [83]
 Merge Events arising from terminated transactions from this stream.
   0b0 Do not merge similar fault records
   0b1 Permit similar fault records to be merged
 The SMMU might be able to reduce the usage of the Event queue by
 coalescing fault records that share the same page granule of address,
 access type and SubstreamID.
   Setting MEV == 1 does not guarantee that faults will be coalesced.
   Setting MEV == 0 causes a physical SMMU to prevent coalescing of
   fault records, however, a hypervisor might not honour this setting
   if it deems a guest to be too verbose.
 Note: Software must expect, and be able to deal with, coalesced fault
 records even when MEV == 0.

Yet, the driver doesn't seem to care setting it at this moment.

Thanks
Nicolin
Jason Gunthorpe Jan. 23, 2025, 1:42 p.m. UTC | #20
On Wed, Jan 22, 2025 at 11:54:52AM -0800, Nicolin Chen wrote:

> ARM seems to also have an interesting event merging feature:
>  STE.MEV, bit [83]
>  Merge Events arising from terminated transactions from this stream.
>    0b0 Do not merge similar fault records
>    0b1 Permit similar fault records to be merged
>  The SMMU might be able to reduce the usage of the Event queue by
>  coalescing fault records that share the same page granule of address,
>  access type and SubstreamID.
>    Setting MEV == 1 does not guarantee that faults will be coalesced.
>    Setting MEV == 0 causes a physical SMMU to prevent coalescing of
>    fault records, however, a hypervisor might not honour this setting
>    if it deems a guest to be too verbose.
>  Note: Software must expect, and be able to deal with, coalesced fault
>  records even when MEV == 0.
> 
> Yet, the driver doesn't seem to care setting it at this moment.

Yeah, we will eventually need to implement whatever DOS mitigations
are included in the IOMMU architectures..

I think DOS testing the event architecture should be part of the
testing/qualification.

It should be quite easy to make a DOS spammer using VFIO in userspace
in the VM to test it with the mlx5 vfio driver.. (though you need VFIO
to work in a VM which RMR will prevent, but that can be hacked around
I guess)

Jason
diff mbox series

Patch

diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index a6dd9f8edcf3..55e71dca3664 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -11,6 +11,7 @@ 
 #include <linux/refcount.h>
 #include <linux/types.h>
 #include <linux/xarray.h>
+#include <uapi/linux/iommufd.h>
 
 struct device;
 struct file;
@@ -194,6 +195,9 @@  struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
 				       unsigned long vdev_id);
 unsigned long iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu,
 					 struct device *dev);
+int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
+				enum iommu_veventq_type type, void *event_data,
+				size_t data_len);
 #else /* !CONFIG_IOMMUFD_DRIVER_CORE */
 static inline struct iommufd_object *
 _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size,
@@ -213,6 +217,13 @@  iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, struct device *dev)
 {
 	return 0;
 }
+
+static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
+					      enum iommu_veventq_type type,
+					      void *event_data, size_t data_len)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_IOMMUFD_DRIVER_CORE */
 
 /*
diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c
index e5d7397c0a6c..b03ceb016bee 100644
--- a/drivers/iommu/iommufd/driver.c
+++ b/drivers/iommu/iommufd/driver.c
@@ -69,5 +69,48 @@  unsigned long iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu,
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_viommu_get_vdev_id, "IOMMUFD");
 
+/*
+ * Typically called in driver's threaded IRQ handler.
+ * The @type and @event_data must be defined in include/uapi/linux/iommufd.h
+ */
+int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
+				enum iommu_veventq_type type, void *event_data,
+				size_t data_len)
+{
+	struct iommufd_veventq *veventq;
+	struct iommufd_vevent *vevent;
+	int rc = 0;
+
+	if (!viommu)
+		return -ENODEV;
+	if (WARN_ON_ONCE(!viommu->ops || !viommu->ops->supports_veventq ||
+			 !viommu->ops->supports_veventq(type)))
+		return -EOPNOTSUPP;
+	if (WARN_ON_ONCE(!data_len || !event_data))
+		return -EINVAL;
+
+	down_read(&viommu->veventqs_rwsem);
+
+	veventq = iommufd_viommu_find_veventq(viommu, type);
+	if (!veventq) {
+		rc = -EOPNOTSUPP;
+		goto out_unlock_veventqs;
+	}
+
+	vevent = kmalloc(struct_size(vevent, event_data, data_len), GFP_KERNEL);
+	if (!vevent) {
+		rc = -ENOMEM;
+		goto out_unlock_veventqs;
+	}
+	memcpy(vevent->event_data, event_data, data_len);
+	vevent->data_len = data_len;
+
+	iommufd_vevent_handler(veventq, vevent);
+out_unlock_veventqs:
+	up_read(&viommu->veventqs_rwsem);
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_viommu_report_event, "IOMMUFD");
+
 MODULE_DESCRIPTION("iommufd code shared with builtin modules");
 MODULE_LICENSE("GPL");