diff mbox series

[v6,05/13] iommufd/viommu: Add IOMMU_VIOMMU_ALLOC ioctl

Message ID 5d2efe229821f5db7a832d9d6be84217289ac3f4.1730313237.git.nicolinc@nvidia.com
State Superseded
Headers show
Series iommufd: Add vIOMMU infrastructure (Part-1) | expand

Commit Message

Nicolin Chen Oct. 30, 2024, 9:34 p.m. UTC
Add a new ioctl for user space to do a vIOMMU allocation. It must be based
on a nesting parent HWPT, so take its refcount.

IOMMU driver wanting to support vIOMMUs must define its IOMMU_VIOMMU_TYPE_
in the uAPI header and implement a viommu_alloc op in its iommu_ops.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/Makefile          |  3 +-
 drivers/iommu/iommufd/iommufd_private.h |  3 +
 include/uapi/linux/iommufd.h            | 40 ++++++++++++
 drivers/iommu/iommufd/main.c            |  6 ++
 drivers/iommu/iommufd/viommu.c          | 81 +++++++++++++++++++++++++
 5 files changed, 132 insertions(+), 1 deletion(-)
 create mode 100644 drivers/iommu/iommufd/viommu.c

Comments

Alexey Kardashevskiy Nov. 7, 2024, 1:37 a.m. UTC | #1
On 31/10/24 08:34, Nicolin Chen wrote:
> Add a new ioctl for user space to do a vIOMMU allocation. It must be based
> on a nesting parent HWPT, so take its refcount.
> 
> IOMMU driver wanting to support vIOMMUs must define its IOMMU_VIOMMU_TYPE_
> in the uAPI header and implement a viommu_alloc op in its iommu_ops.
> 
> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>   drivers/iommu/iommufd/Makefile          |  3 +-
>   drivers/iommu/iommufd/iommufd_private.h |  3 +
>   include/uapi/linux/iommufd.h            | 40 ++++++++++++
>   drivers/iommu/iommufd/main.c            |  6 ++
>   drivers/iommu/iommufd/viommu.c          | 81 +++++++++++++++++++++++++
>   5 files changed, 132 insertions(+), 1 deletion(-)
>   create mode 100644 drivers/iommu/iommufd/viommu.c
> 
> diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
> index 266e23d657ff..8f6f0dd038d0 100644
> --- a/drivers/iommu/iommufd/Makefile
> +++ b/drivers/iommu/iommufd/Makefile
> @@ -7,7 +7,8 @@ iommufd-y := \
>   	ioas.o \
>   	main.o \
>   	pages.o \
> -	vfio_compat.o
> +	vfio_compat.o \
> +	viommu.o
>   
>   iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
>   
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index be347f726fda..a8104d9d4cef 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -506,6 +506,9 @@ static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev,
>   	return iommu_group_replace_domain(idev->igroup->group, hwpt->domain);
>   }
>   
> +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
> +void iommufd_viommu_destroy(struct iommufd_object *obj);
> +
>   #ifdef CONFIG_IOMMUFD_TEST
>   int iommufd_test(struct iommufd_ucmd *ucmd);
>   void iommufd_selftest_destroy(struct iommufd_object *obj);
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index 41b1a01e9293..302844136b02 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -52,6 +52,7 @@ enum {
>   	IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d,
>   	IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e,
>   	IOMMUFD_CMD_IOAS_MAP_FILE = 0x8f,
> +	IOMMUFD_CMD_VIOMMU_ALLOC = 0x90,
>   };
>   
>   /**
> @@ -822,4 +823,43 @@ struct iommu_fault_alloc {
>   	__u32 out_fault_fd;
>   };
>   #define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC)
> +
> +/**
> + * enum iommu_viommu_type - Virtual IOMMU Type
> + * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use
> + */
> +enum iommu_viommu_type {
> +	IOMMU_VIOMMU_TYPE_DEFAULT = 0,
> +};
> +
> +/**
> + * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC)
> + * @size: sizeof(struct iommu_viommu_alloc)
> + * @flags: Must be 0
> + * @type: Type of the virtual IOMMU. Must be defined in enum iommu_viommu_type
> + * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU
> + * @hwpt_id: ID of a nesting parent HWPT to associate to
> + * @out_viommu_id: Output virtual IOMMU ID for the allocated object
> + *
> + * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's
> + * virtualization support that is a security-isolated slice of the real IOMMU HW
> + * that is unique to a specific VM. Operations global to the IOMMU are connected
> + * to the vIOMMU, such as:
> + * - Security namespace for guest owned ID, e.g. guest-controlled cache tags
> + * - Non-device-affiliated event reporting, e.g. invalidation queue errors
> + * - Access to a sharable nesting parent pagetable across physical IOMMUs
> + * - Virtualization of various platforms IDs, e.g. RIDs and others
> + * - Delivery of paravirtualized invalidation
> + * - Direct assigned invalidation queues
> + * - Direct assigned interrupts
> + */
> +struct iommu_viommu_alloc {
> +	__u32 size;
> +	__u32 flags;
> +	__u32 type;
> +	__u32 dev_id;
> +	__u32 hwpt_id;
> +	__u32 out_viommu_id;
> +};
> +#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC)
>   #endif
> diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
> index 30e6c2af3b45..cc514f9bc3e6 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -307,6 +307,7 @@ union ucmd_buffer {
>   	struct iommu_ioas_unmap unmap;
>   	struct iommu_option option;
>   	struct iommu_vfio_ioas vfio_ioas;
> +	struct iommu_viommu_alloc viommu;
>   #ifdef CONFIG_IOMMUFD_TEST
>   	struct iommu_test_cmd test;
>   #endif
> @@ -360,6 +361,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
>   		 val64),
>   	IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
>   		 __reserved),
> +	IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
> +		 struct iommu_viommu_alloc, out_viommu_id),
>   #ifdef CONFIG_IOMMUFD_TEST
>   	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
>   #endif
> @@ -495,6 +498,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
>   	[IOMMUFD_OBJ_FAULT] = {
>   		.destroy = iommufd_fault_destroy,
>   	},
> +	[IOMMUFD_OBJ_VIOMMU] = {
> +		.destroy = iommufd_viommu_destroy,
> +	},
>   #ifdef CONFIG_IOMMUFD_TEST
>   	[IOMMUFD_OBJ_SELFTEST] = {
>   		.destroy = iommufd_selftest_destroy,
> diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
> new file mode 100644
> index 000000000000..888239b78667
> --- /dev/null
> +++ b/drivers/iommu/iommufd/viommu.c
> @@ -0,0 +1,81 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
> + */
> +#include "iommufd_private.h"
> +
> +void iommufd_viommu_destroy(struct iommufd_object *obj)
> +{
> +	struct iommufd_viommu *viommu =
> +		container_of(obj, struct iommufd_viommu, obj);
> +
> +	if (viommu->ops && viommu->ops->destroy)
> +		viommu->ops->destroy(viommu);
> +	refcount_dec(&viommu->hwpt->common.obj.users);
> +}
> +
> +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_viommu_alloc *cmd = ucmd->cmd;
> +	struct iommufd_hwpt_paging *hwpt_paging;
> +	struct iommufd_viommu *viommu;
> +	struct iommufd_device *idev;
> +	const struct iommu_ops *ops;
> +	int rc;
> +
> +	if (cmd->flags || cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT)
> +		return -EOPNOTSUPP;
> +
> +	idev = iommufd_get_device(ucmd, cmd->dev_id);
> +	if (IS_ERR(idev))
> +		return PTR_ERR(idev);
> +
> +	ops = dev_iommu_ops(idev->dev);
> +	if (!ops->viommu_alloc) {
> +		rc = -EOPNOTSUPP;
> +		goto out_put_idev;
> +	}
> +
> +	hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
> +	if (IS_ERR(hwpt_paging)) {
> +		rc = PTR_ERR(hwpt_paging);
> +		goto out_put_idev;
> +	}
> +
> +	if (!hwpt_paging->nest_parent) {


I am trying to use this for my so-called viommu which does not have this 
so this fails. My viommu is only visible to the VM via paravirt 
interface to populate sDTE (secure device table) so no viommu device in 
the guest or nested paging, nothing like this just yet.

Is there a writeup somewhere about possible uses of this 
IOMMU_HWPT_ALLOC_NEST_PARENT? I'd think one pass through device equals 1 
IOAS, 1 HWPT, 1 domain, 1 viommu, even with guest visible vIOMMU but it 
is not the case, is it?

btw is there a way to get a snapshot of all current objects in iommufd? 
They all use "dev_id" and not file descriptors so cannot look at 
/proc/<pid>/fd, and there is nothing in debugfs either.

For my current hack, I can pass IOMMU_HWPT_ALLOC_NEST_PARENT to QEMU's 
iommufd_backend_alloc_hwpt() but it is... meh. Thanks,


> +		rc = -EINVAL;
> +		goto out_put_hwpt;
> +	}
> +
> +	viommu = ops->viommu_alloc(idev->dev, hwpt_paging->common.domain,
> +				   ucmd->ictx, cmd->type);
> +	if (IS_ERR(viommu)) {
> +		rc = PTR_ERR(viommu);
> +		goto out_put_hwpt;
> +	}
> +
> +	viommu->type = cmd->type;
> +	viommu->ictx = ucmd->ictx;
> +	viommu->hwpt = hwpt_paging;
> +	refcount_inc(&viommu->hwpt->common.obj.users);
> +	/*
> +	 * It is the most likely case that a physical IOMMU is unpluggable. A
> +	 * pluggable IOMMU instance (if exists) is responsible for refcounting
> +	 * on its own.
> +	 */
> +	viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev);
> +
> +	cmd->out_viommu_id = viommu->obj.id;
> +	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
> +	if (rc)
> +		goto out_abort;
> +	iommufd_object_finalize(ucmd->ictx, &viommu->obj);
> +	goto out_put_hwpt;
> +
> +out_abort:
> +	iommufd_object_abort_and_destroy(ucmd->ictx, &viommu->obj);
> +out_put_hwpt:
> +	iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj);
> +out_put_idev:
> +	iommufd_put_object(ucmd->ictx, &idev->obj);
> +	return rc;
> +}
Nicolin Chen Nov. 7, 2024, 2:03 a.m. UTC | #2
On Thu, Nov 07, 2024 at 12:37:59PM +1100, Alexey Kardashevskiy wrote:
> On 31/10/24 08:34, Nicolin Chen wrote:
> > +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
> > +{
> > +     struct iommu_viommu_alloc *cmd = ucmd->cmd;
> > +     struct iommufd_hwpt_paging *hwpt_paging;
> > +     struct iommufd_viommu *viommu;
> > +     struct iommufd_device *idev;
> > +     const struct iommu_ops *ops;
> > +     int rc;
> > +
> > +     if (cmd->flags || cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT)
> > +             return -EOPNOTSUPP;
> > +
> > +     idev = iommufd_get_device(ucmd, cmd->dev_id);
> > +     if (IS_ERR(idev))
> > +             return PTR_ERR(idev);
> > +
> > +     ops = dev_iommu_ops(idev->dev);
> > +     if (!ops->viommu_alloc) {
> > +             rc = -EOPNOTSUPP;
> > +             goto out_put_idev;
> > +     }
> > +
> > +     hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
> > +     if (IS_ERR(hwpt_paging)) {
> > +             rc = PTR_ERR(hwpt_paging);
> > +             goto out_put_idev;
> > +     }
> > +
> > +     if (!hwpt_paging->nest_parent) {
> 
> 
> I am trying to use this for my so-called viommu which does not have this
> so this fails. My viommu is only visible to the VM via paravirt
> interface to populate sDTE (secure device table) so no viommu device in
> the guest or nested paging, nothing like this just yet.
> 
> Is there a writeup somewhere about possible uses of this
> IOMMU_HWPT_ALLOC_NEST_PARENT? I'd think one pass through device equals 1
> IOAS, 1 HWPT, 1 domain, 1 viommu, even with guest visible vIOMMU but it
> is not the case, is it?

So far, the vIOMMU is exclusively designed for nesting, with the
only use case of ARM SMMUv3. So, a nesting parent HWPT/domain is
a must. So long as your use case is a good fit, we can drop this
when adding support for your case, assuming there would be some
new iommufd patches.

For nesting, there will be at least two HWPTs (i.e. two domains)
for 2 stages. And stage-2 HWPT/domain is the nesting parent one.

> btw is there a way to get a snapshot of all current objects in iommufd?
> They all use "dev_id" and not file descriptors so cannot look at
> /proc/<pid>/fd, and there is nothing in debugfs either.

I am not aware of anything like that. How would you like to use
it if there was one?

> For my current hack, I can pass IOMMU_HWPT_ALLOC_NEST_PARENT to QEMU's
> iommufd_backend_alloc_hwpt() but it is... meh. Thanks,
 
Again, assuming you have some other iommufd patches, I think you
could have a small patch dropping this check -- we might need an
additional flag or another HWPT type, but we can discuss later.

Thanks
Nicolin
Jason Gunthorpe Nov. 7, 2024, 6:46 p.m. UTC | #3
On Thu, Nov 07, 2024 at 12:37:59PM +1100, Alexey Kardashevskiy wrote:
> > +	hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
> > +	if (IS_ERR(hwpt_paging)) {
> > +		rc = PTR_ERR(hwpt_paging);
> > +		goto out_put_idev;
> > +	}
> > +
> > +	if (!hwpt_paging->nest_parent) {
> 
> 
> I am trying to use this for my so-called viommu which does not have this so
> this fails. My viommu is only visible to the VM via paravirt interface to
> populate sDTE (secure device table) so no viommu device in the guest or
> nested paging, nothing like this just yet.

The purpose of this is to define the translation underlying the
viommu and the VM.

If I understand AMD CC HW correctly, you still need to have a
translation to make the device work - so you should have a
nest_parent.

For AMD the nest_parent is simply a v1 domain and, with what is in
Joerg's tree the AMD driver can quickly be improved to support that
allocation option.

> Is there a writeup somewhere about possible uses of this
> IOMMU_HWPT_ALLOC_NEST_PARENT? I'd think one pass through device equals 1
> IOAS, 1 HWPT, 1 domain, 1 viommu, even with guest visible vIOMMU but it is
> not the case, is it?

It is intended for HW like AMD that requires selecting a special page
table format to be used on the hypervisor side. Ie select the v1 page
table format.

> btw is there a way to get a snapshot of all current objects in iommufd? They
> all use "dev_id" and not file descriptors so cannot look at /proc/<pid>/fd,
> and there is nothing in debugfs either.

Nothing has been done, a debugfs could be interesting

> For my current hack, I can pass IOMMU_HWPT_ALLOC_NEST_PARENT to QEMU's
> iommufd_backend_alloc_hwpt() but it is... meh. Thanks,

This is what I'd expect, yes. Qemu should be allocating domains that
are going to be used as part of advanced virtualization (viommu,
nesting, etc) using NEST_PARENT, AMD driver should accept this and
enforce the v1 format.

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 266e23d657ff..8f6f0dd038d0 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -7,7 +7,8 @@  iommufd-y := \
 	ioas.o \
 	main.o \
 	pages.o \
-	vfio_compat.o
+	vfio_compat.o \
+	viommu.o
 
 iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
 
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index be347f726fda..a8104d9d4cef 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -506,6 +506,9 @@  static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev,
 	return iommu_group_replace_domain(idev->igroup->group, hwpt->domain);
 }
 
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_viommu_destroy(struct iommufd_object *obj);
+
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
 void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 41b1a01e9293..302844136b02 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -52,6 +52,7 @@  enum {
 	IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d,
 	IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e,
 	IOMMUFD_CMD_IOAS_MAP_FILE = 0x8f,
+	IOMMUFD_CMD_VIOMMU_ALLOC = 0x90,
 };
 
 /**
@@ -822,4 +823,43 @@  struct iommu_fault_alloc {
 	__u32 out_fault_fd;
 };
 #define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC)
+
+/**
+ * enum iommu_viommu_type - Virtual IOMMU Type
+ * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use
+ */
+enum iommu_viommu_type {
+	IOMMU_VIOMMU_TYPE_DEFAULT = 0,
+};
+
+/**
+ * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC)
+ * @size: sizeof(struct iommu_viommu_alloc)
+ * @flags: Must be 0
+ * @type: Type of the virtual IOMMU. Must be defined in enum iommu_viommu_type
+ * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU
+ * @hwpt_id: ID of a nesting parent HWPT to associate to
+ * @out_viommu_id: Output virtual IOMMU ID for the allocated object
+ *
+ * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's
+ * virtualization support that is a security-isolated slice of the real IOMMU HW
+ * that is unique to a specific VM. Operations global to the IOMMU are connected
+ * to the vIOMMU, such as:
+ * - Security namespace for guest owned ID, e.g. guest-controlled cache tags
+ * - Non-device-affiliated event reporting, e.g. invalidation queue errors
+ * - Access to a sharable nesting parent pagetable across physical IOMMUs
+ * - Virtualization of various platforms IDs, e.g. RIDs and others
+ * - Delivery of paravirtualized invalidation
+ * - Direct assigned invalidation queues
+ * - Direct assigned interrupts
+ */
+struct iommu_viommu_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 type;
+	__u32 dev_id;
+	__u32 hwpt_id;
+	__u32 out_viommu_id;
+};
+#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC)
 #endif
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 30e6c2af3b45..cc514f9bc3e6 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -307,6 +307,7 @@  union ucmd_buffer {
 	struct iommu_ioas_unmap unmap;
 	struct iommu_option option;
 	struct iommu_vfio_ioas vfio_ioas;
+	struct iommu_viommu_alloc viommu;
 #ifdef CONFIG_IOMMUFD_TEST
 	struct iommu_test_cmd test;
 #endif
@@ -360,6 +361,8 @@  static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 val64),
 	IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
 		 __reserved),
+	IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
+		 struct iommu_viommu_alloc, out_viommu_id),
 #ifdef CONFIG_IOMMUFD_TEST
 	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
 #endif
@@ -495,6 +498,9 @@  static const struct iommufd_object_ops iommufd_object_ops[] = {
 	[IOMMUFD_OBJ_FAULT] = {
 		.destroy = iommufd_fault_destroy,
 	},
+	[IOMMUFD_OBJ_VIOMMU] = {
+		.destroy = iommufd_viommu_destroy,
+	},
 #ifdef CONFIG_IOMMUFD_TEST
 	[IOMMUFD_OBJ_SELFTEST] = {
 		.destroy = iommufd_selftest_destroy,
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
new file mode 100644
index 000000000000..888239b78667
--- /dev/null
+++ b/drivers/iommu/iommufd/viommu.c
@@ -0,0 +1,81 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#include "iommufd_private.h"
+
+void iommufd_viommu_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_viommu *viommu =
+		container_of(obj, struct iommufd_viommu, obj);
+
+	if (viommu->ops && viommu->ops->destroy)
+		viommu->ops->destroy(viommu);
+	refcount_dec(&viommu->hwpt->common.obj.users);
+}
+
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_viommu_alloc *cmd = ucmd->cmd;
+	struct iommufd_hwpt_paging *hwpt_paging;
+	struct iommufd_viommu *viommu;
+	struct iommufd_device *idev;
+	const struct iommu_ops *ops;
+	int rc;
+
+	if (cmd->flags || cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT)
+		return -EOPNOTSUPP;
+
+	idev = iommufd_get_device(ucmd, cmd->dev_id);
+	if (IS_ERR(idev))
+		return PTR_ERR(idev);
+
+	ops = dev_iommu_ops(idev->dev);
+	if (!ops->viommu_alloc) {
+		rc = -EOPNOTSUPP;
+		goto out_put_idev;
+	}
+
+	hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+	if (IS_ERR(hwpt_paging)) {
+		rc = PTR_ERR(hwpt_paging);
+		goto out_put_idev;
+	}
+
+	if (!hwpt_paging->nest_parent) {
+		rc = -EINVAL;
+		goto out_put_hwpt;
+	}
+
+	viommu = ops->viommu_alloc(idev->dev, hwpt_paging->common.domain,
+				   ucmd->ictx, cmd->type);
+	if (IS_ERR(viommu)) {
+		rc = PTR_ERR(viommu);
+		goto out_put_hwpt;
+	}
+
+	viommu->type = cmd->type;
+	viommu->ictx = ucmd->ictx;
+	viommu->hwpt = hwpt_paging;
+	refcount_inc(&viommu->hwpt->common.obj.users);
+	/*
+	 * It is the most likely case that a physical IOMMU is unpluggable. A
+	 * pluggable IOMMU instance (if exists) is responsible for refcounting
+	 * on its own.
+	 */
+	viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev);
+
+	cmd->out_viommu_id = viommu->obj.id;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_abort;
+	iommufd_object_finalize(ucmd->ictx, &viommu->obj);
+	goto out_put_hwpt;
+
+out_abort:
+	iommufd_object_abort_and_destroy(ucmd->ictx, &viommu->obj);
+out_put_hwpt:
+	iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj);
+out_put_idev:
+	iommufd_put_object(ucmd->ictx, &idev->obj);
+	return rc;
+}