diff mbox series

[v4,8/9] blk-mq: use hk cpus only when isolcpus=managed_irq is enabled

Message ID 20241217-isolcpus-io-queues-v4-8-5d355fbb1e14@kernel.org
State Superseded
Headers show
Series blk: honor isolcpus configuration | expand

Commit Message

Daniel Wagner Dec. 17, 2024, 6:29 p.m. UTC
When isolcpus=managed_irq is enabled, all hardware queues should run on
the housekeeping CPUs only. Thus ignore the affinity mask provided by
the driver. Also we can't use blk_mq_map_queues because it will map all
CPUs to the first hctx unless the CPU is the same as the one the hctx
has its affinity set to, e.g. 8 CPUs with isolcpus=managed_irq,2-3,6-7 config

  queue mapping for /dev/nvme0n1
        hctx0: default 2 3 4 6 7
        hctx1: default 5
        hctx2: default 0
        hctx3: default 1

  PCI name is 00:05.0: nvme0n1
        irq 57 affinity 0-1 effective 1 is_managed:0 nvme0q0
        irq 58 affinity 4 effective 4 is_managed:1 nvme0q1
        irq 59 affinity 5 effective 5 is_managed:1 nvme0q2
        irq 60 affinity 0 effective 0 is_managed:1 nvme0q3
        irq 61 affinity 1 effective 1 is_managed:1 nvme0q4

whereas with blk_mq_hk_map_queues we get

  queue mapping for /dev/nvme0n1
        hctx0: default 2 4
        hctx1: default 3 5
        hctx2: default 0 6
        hctx3: default 1 7

  PCI name is 00:05.0: nvme0n1
        irq 56 affinity 0-1 effective 1 is_managed:0 nvme0q0
        irq 61 affinity 4 effective 4 is_managed:1 nvme0q1
        irq 62 affinity 5 effective 5 is_managed:1 nvme0q2
        irq 63 affinity 0 effective 0 is_managed:1 nvme0q3
        irq 64 affinity 1 effective 1 is_managed:1 nvme0q4

Signed-off-by: Daniel Wagner <wagi@kernel.org>
---
 block/blk-mq-cpumap.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

Comments

Ming Lei Dec. 19, 2024, 9:20 a.m. UTC | #1
On Tue, Dec 17, 2024 at 07:29:42PM +0100, Daniel Wagner wrote:
> When isolcpus=managed_irq is enabled all hardware queues should run on
> the housekeeping CPUs only. Thus ignore the affinity mask provided by
> the driver. Also we can't use blk_mq_map_queues because it will map all
> CPUs to first hctx unless, the CPU is the same as the hctx has the
> affinity set to, e.g. 8 CPUs with isolcpus=managed_irq,2-3,6-7 config
> 
>   queue mapping for /dev/nvme0n1
>         hctx0: default 2 3 4 6 7
>         hctx1: default 5
>         hctx2: default 0
>         hctx3: default 1
> 
>   PCI name is 00:05.0: nvme0n1
>         irq 57 affinity 0-1 effective 1 is_managed:0 nvme0q0
>         irq 58 affinity 4 effective 4 is_managed:1 nvme0q1
>         irq 59 affinity 5 effective 5 is_managed:1 nvme0q2
>         irq 60 affinity 0 effective 0 is_managed:1 nvme0q3
>         irq 61 affinity 1 effective 1 is_managed:1 nvme0q4
> 
> where as with blk_mq_hk_map_queues we get
> 
>   queue mapping for /dev/nvme0n1
>         hctx0: default 2 4
>         hctx1: default 3 5
>         hctx2: default 0 6
>         hctx3: default 1 7
> 
>   PCI name is 00:05.0: nvme0n1
>         irq 56 affinity 0-1 effective 1 is_managed:0 nvme0q0
>         irq 61 affinity 4 effective 4 is_managed:1 nvme0q1
>         irq 62 affinity 5 effective 5 is_managed:1 nvme0q2
>         irq 63 affinity 0 effective 0 is_managed:1 nvme0q3
>         irq 64 affinity 1 effective 1 is_managed:1 nvme0q4
> 
> Signed-off-by: Daniel Wagner <wagi@kernel.org>
> ---
>  block/blk-mq-cpumap.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 66 insertions(+)
> 
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index b3a863c2db3231624685ab54a1810b22af4111f4..38016bf1be8af14ef368e68d3fd12416858e3da6 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -61,11 +61,74 @@ unsigned int blk_mq_num_online_queues(unsigned int max_queues)
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_num_online_queues);
>  
> +/*
> + * blk_mq_map_hk_queues - Create housekeeping CPU to hardware queue mapping
> + * @qmap:	CPU to hardware queue map
> + *
> + * Create a housekeeping CPU to hardware queue mapping in @qmap. If the
> + * isolcpus feature is enabled and blk_mq_map_hk_queues returns true,
> + * @qmap contains a valid configuration honoring the managed_irq
> + * configuration. If the isolcpus feature is disabled this function
> + * returns false.
> + */
> +static bool blk_mq_map_hk_queues(struct blk_mq_queue_map *qmap)
> +{
> +	struct cpumask *hk_masks;
> +	cpumask_var_t isol_mask;
> +	unsigned int queue, cpu, nr_masks;
> +
> +	if (!housekeeping_enabled(HK_TYPE_MANAGED_IRQ))
> +		return false;
> +
> +	/* map housekeeping cpus to matching hardware context */
> +	nr_masks = qmap->nr_queues;
> +	hk_masks = group_cpus_evenly(&nr_masks);
> +	if (!hk_masks)
> +		goto fallback;
> +
> +	for (queue = 0; queue < qmap->nr_queues; queue++) {
> +		for_each_cpu(cpu, &hk_masks[queue % nr_masks])
> +			qmap->mq_map[cpu] = qmap->queue_offset + queue;
> +	}
> +
> +	kfree(hk_masks);
> +
> +	/* map isolcpus to hardware context */
> +	if (!alloc_cpumask_var(&isol_mask, GFP_KERNEL))
> +		goto fallback;
> +
> +	queue = 0;
> +	cpumask_andnot(isol_mask,
> +		       cpu_possible_mask,
> +		       housekeeping_cpumask(HK_TYPE_MANAGED_IRQ));
> +
> +	for_each_cpu(cpu, isol_mask) {
> +		qmap->mq_map[cpu] = qmap->queue_offset + queue;
> +		queue = (queue + 1) % qmap->nr_queues;
> +	}

Looks the IO hang issue in V3 isn't addressed yet, is it?

https://lore.kernel.org/linux-block/ZrtX4pzqwVUEgIPS@fedora/


Thanks,
Ming
Ming Lei Dec. 20, 2024, 8:54 a.m. UTC | #2
On Thu, Dec 19, 2024 at 04:38:43PM +0100, Daniel Wagner wrote:

> When isolcpus=managed_irq is enabled all hardware queues should run on
> the housekeeping CPUs only. Thus ignore the affinity mask provided by
> the driver.

Compared with in-tree code, the above words are misleading.

- irq core code respects isolated CPUs by trying to exclude isolated
CPUs from effective masks

- blk-mq won't schedule blockd on isolated CPUs

If applications aren't run on isolated CPUs, IO interrupts usually won't
be triggered on isolated CPUs, so isolated CPUs are _not_ ignored.

> On Thu, Dec 19, 2024 at 05:20:44PM +0800, Ming Lei wrote:
> > > +	cpumask_andnot(isol_mask,
> > > +		       cpu_possible_mask,
> > > +		       housekeeping_cpumask(HK_TYPE_MANAGED_IRQ));
> > > +
> > > +	for_each_cpu(cpu, isol_mask) {
> > > +		qmap->mq_map[cpu] = qmap->queue_offset + queue;
> > > +		queue = (queue + 1) % qmap->nr_queues;
> > > +	}
> > 
> > Looks the IO hang issue in V3 isn't addressed yet, is it?
> > 
> > https://lore.kernel.org/linux-block/ZrtX4pzqwVUEgIPS@fedora/
> 
> I've added an explanation in the cover letter why this is not
> addressed. From the cover letter:
> 
> I've experimented for a while and all solutions I came up were horrible
> hacks (the hotpath needs to be touched) and I don't want to slow down all
> other users (which are almost everyone). IMO, it's just not worth trying

IMO, this patchset is an improvement on the existing best-effort approach, which
works fine most of the time, so why do you think it slows down everyone?

> to fix this corner case. If the user is using isolcpus and does CPU
> hotplug, we can expect that the user can also first offline the isolated
> CPUs. I've discussed this topic during ALPSS and the room came to the
> same conclusion. Thus I just added a patch which issues a warning that
> IOs are likely to hang.

If the change needs userspace cooperation for using 'managed_irq', the exact
behavior needs to be documented in both this commit and Documentation/admin-guide/kernel-parameters.txt,
instead of the cover letter only.

But this patch does cause regression for old applications which can't
follow the new introduced rule:

	```
	If the user is using isolcpus and does CPU hotplug, we can expect that the
	user can also first offline the isolated CPUs.
	```

Thanks,
Ming
Daniel Wagner Jan. 10, 2025, 9:21 a.m. UTC | #3
Hi Ming,

On Fri, Dec 20, 2024 at 04:54:21PM +0800, Ming Lei wrote:
> On Thu, Dec 19, 2024 at 04:38:43PM +0100, Daniel Wagner wrote:
> 
> > When isolcpus=managed_irq is enabled all hardware queues should run on
> > the housekeeping CPUs only. Thus ignore the affinity mask provided by
> > the driver.
> 
> Compared with in-tree code, the above words are misleading.
> 
> - irq core code respects isolated CPUs by trying to exclude isolated
> CPUs from effective masks
> 
> - blk-mq won't schedule blockd on isolated CPUs

I see your point, the commit should highlight the fact when an
application is issuing an I/O, this can lead to stalls.

What about a commit message like:

  When isolcpus=managed_irq is enabled, and the last housekeeping CPU for
  a given hardware context goes offline, there is no CPU left which
  handles the IOs anymore. If isolated CPUs mapped to this hardware
  context are online and an application running on these isolated CPUs
  issue an IO this will lead to stalls.

  The kernel will not schedule IO to isolated CPUs, thus avoiding IO
  stalls.

  Thus issue a warning when housekeeping CPUs are offlined for a hardware
  context while there are still isolated CPUs online.

> If application aren't run on isolated CPUs, IO interrupt usually won't
> be triggered on isolated CPUs, so isolated CPUs are _not_ ignored.

FWIW, the 'usually' part is what made our customers nervous. They saw
some IRQ noise on the isolated CPUs with their workload and reported
with these changes all was good. Unfortunately, we never got the hands
on the workload, hard to say what was causing it.

> > On Thu, Dec 19, 2024 at 05:20:44PM +0800, Ming Lei wrote:
> > > > +	cpumask_andnot(isol_mask,
> > > > +		       cpu_possible_mask,
> > > > +		       housekeeping_cpumask(HK_TYPE_MANAGED_IRQ));
> > > > +
> > > > +	for_each_cpu(cpu, isol_mask) {
> > > > +		qmap->mq_map[cpu] = qmap->queue_offset + queue;
> > > > +		queue = (queue + 1) % qmap->nr_queues;
> > > > +	}
> > > 
> > > Looks the IO hang issue in V3 isn't addressed yet, is it?
> > > 
> > > https://lore.kernel.org/linux-block/ZrtX4pzqwVUEgIPS@fedora/
> > 
> > I've added an explanation in the cover letter why this is not
> > addressed. From the cover letter:
> > 
> > I've experimented for a while and all solutions I came up were horrible
> > hacks (the hotpath needs to be touched) and I don't want to slow down all
> > other users (which are almost everyone). IMO, it's just not worth trying
> 
> IMO, this patchset is one improvement on existed best-effort approach, which
> works fine most of times, so why you do think it slows down everyone?

I was talking about implementing the feature which would remap the
isolated CPUs to online hardware context when the current hardware
context goes offline. I didn't find a solution which I think would be
worth presenting. All involved some sort of locking/refcounting in the
hotpath, which I think we should just avoid.

> > to fix this corner case. If the user is using isolcpus and does CPU
> > hotplug, we can expect that the user can also first offline the isolated
> > CPUs. I've discussed this topic during ALPSS and the room came to the
> > same conclusion. Thus I just added a patch which issues a warning that
> > IOs are likely to hang.
> 
> If the change need userspace cooperation for using 'managed_irq', the exact
> behavior need to be documented in both this commit and Documentation/admin-guide/kernel-parameters.txt,
> instead of cover-letter only.
> 
> But this patch does cause regression for old applications which can't
> follow the new introduced rule:
> 
> 	```
> 	If the user is using isolcpus and does CPU hotplug, we can expect that the
> 	user can also first offline the isolated CPUs.
> 	```

Indeed, I forgot to update the documentation. I'll update it accordingly.

Thanks,
Daniel
Ming Lei Jan. 11, 2025, 3:31 a.m. UTC | #4
On Fri, Jan 10, 2025 at 10:21:49AM +0100, Daniel Wagner wrote:
> Hi Ming,
> 
> On Fri, Dec 20, 2024 at 04:54:21PM +0800, Ming Lei wrote:
> > On Thu, Dec 19, 2024 at 04:38:43PM +0100, Daniel Wagner wrote:
> > 
> > > When isolcpus=managed_irq is enabled all hardware queues should run on
> > > the housekeeping CPUs only. Thus ignore the affinity mask provided by
> > > the driver.
> > 
> > Compared with in-tree code, the above words are misleading.
> > 
> > - irq core code respects isolated CPUs by trying to exclude isolated
> > CPUs from effective masks
> > 
> > - blk-mq won't schedule blockd on isolated CPUs
> 
> I see your point, the commit should highlight the fact when an
> application is issuing an I/O, this can lead to stalls.
> 
> What about a commit message like:
> 
>   When isolcpus=managed_irq is enabled, and the last housekeeping CPU for
>   a given hardware context goes offline, there is no CPU left which
>   handles the IOs anymore. If isolated CPUs mapped to this hardware
>   context are online and an application running on these isolated CPUs
>   issue an IO this will lead to stalls.

It isn't correct, the in-tree code doesn't have such stall, no matter if
IO is issued from HK or isolated CPUs since the managed irq is guaranteed to
live if any mapped CPU is online.

> 
>   The kernel will not schedule IO to isolated CPUS thus this avoids IO
>   stalls.
> 
>   Thus issue a warning when housekeeping CPUs are offlined for a hardware
>   context while there are still isolated CPUs online.
> 
> > If application aren't run on isolated CPUs, IO interrupt usually won't
> > be triggered on isolated CPUs, so isolated CPUs are _not_ ignored.
> 
> FWIW, the 'usually' part is what made our customers nervous. They saw
> some IRQ noise on the isolated CPUs with their workload and reported
> with these changes all was good. Unfortunately, we never got the hands
> on the workload, hard to say what was causing it.

Please see irq_do_set_affinity():

        if (irqd_affinity_is_managed(data) &&
              housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) {
                  const struct cpumask *hk_mask;

                  hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);

                  cpumask_and(&tmp_mask, mask, hk_mask);
                  if (!cpumask_intersects(&tmp_mask, cpu_online_mask))
                          prog_mask = mask;
                  else
                          prog_mask = &tmp_mask;
          } else {
                  prog_mask = mask;

The whole mask which may include isolated CPUs is only programmed to
hardware if there isn't any online CPU in `irq_mask & hk_mask`.

> 
> > > On Thu, Dec 19, 2024 at 05:20:44PM +0800, Ming Lei wrote:
> > > > > +	cpumask_andnot(isol_mask,
> > > > > +		       cpu_possible_mask,
> > > > > +		       housekeeping_cpumask(HK_TYPE_MANAGED_IRQ));
> > > > > +
> > > > > +	for_each_cpu(cpu, isol_mask) {
> > > > > +		qmap->mq_map[cpu] = qmap->queue_offset + queue;
> > > > > +		queue = (queue + 1) % qmap->nr_queues;
> > > > > +	}
> > > > 
> > > > Looks the IO hang issue in V3 isn't addressed yet, is it?
> > > > 
> > > > https://lore.kernel.org/linux-block/ZrtX4pzqwVUEgIPS@fedora/
> > > 
> > > I've added an explanation in the cover letter why this is not
> > > addressed. From the cover letter:
> > > 
> > > I've experimented for a while and all solutions I came up were horrible
> > > hacks (the hotpath needs to be touched) and I don't want to slow down all
> > > other users (which are almost everyone). IMO, it's just not worth trying
> > 
> > IMO, this patchset is one improvement on existed best-effort approach, which
> > works fine most of times, so why you do think it slows down everyone?
> 
> I was talking about implementing the feature which would remap the
> isolated CPUs to online hardware context when the current hardware
> context goes offline. I didn't find a solution which I think would be
> worth presenting. All involved some sort of locking/refcounting in the
> hotpath, which I think we should just avoid.

I understand the trouble, but it is still one improvement from user
viewpoint instead of feature since the interface of 'isolcpus=manage_irq'
isn't changed.

> 
> > > to fix this corner case. If the user is using isolcpus and does CPU
> > > hotplug, we can expect that the user can also first offline the isolated
> > > CPUs. I've discussed this topic during ALPSS and the room came to the
> > > same conclusion. Thus I just added a patch which issues a warning that
> > > IOs are likely to hang.
> > 
> > If the change need userspace cooperation for using 'managed_irq', the exact
> > behavior need to be documented in both this commit and Documentation/admin-guide/kernel-parameters.txt,
> > instead of cover-letter only.
> > 
> > But this patch does cause regression for old applications which can't
> > follow the new introduced rule:
> > 
> > 	```
> > 	If the user is using isolcpus and does CPU hotplug, we can expect that the
> > 	user can also first offline the isolated CPUs.
> > 	```
> 
> Indeed, I forgot to update the documentation. I'll update it accordingly.

It isn't documentation thing, it breaks the no-regression policy, which crosses
our red-line.

If you really want to move on, please add one new kernel command
line with documenting the new usage which requires applications to
offline CPU in order.


thanks,
Ming
diff mbox series

Patch

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index b3a863c2db3231624685ab54a1810b22af4111f4..38016bf1be8af14ef368e68d3fd12416858e3da6 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -61,11 +61,74 @@  unsigned int blk_mq_num_online_queues(unsigned int max_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_num_online_queues);
 
+/*
+ * blk_mq_map_hk_queues - Create housekeeping CPU to hardware queue mapping
+ * @qmap:	CPU to hardware queue map
+ *
+ * Create a housekeeping CPU to hardware queue mapping in @qmap. If the
+ * isolcpus feature is enabled and blk_mq_map_hk_queues returns true,
+ * @qmap contains a valid configuration honoring the managed_irq
+ * configuration. If the isolcpus feature is disabled this function
+ * returns false.
+ */
+static bool blk_mq_map_hk_queues(struct blk_mq_queue_map *qmap)
+{
+	struct cpumask *hk_masks;
+	cpumask_var_t isol_mask;
+	unsigned int queue, cpu, nr_masks;
+
+	if (!housekeeping_enabled(HK_TYPE_MANAGED_IRQ))
+		return false;
+
+	/* map housekeeping cpus to matching hardware context */
+	nr_masks = qmap->nr_queues;
+	hk_masks = group_cpus_evenly(&nr_masks);
+	if (!hk_masks)
+		goto fallback;
+
+	for (queue = 0; queue < qmap->nr_queues; queue++) {
+		for_each_cpu(cpu, &hk_masks[queue % nr_masks])
+			qmap->mq_map[cpu] = qmap->queue_offset + queue;
+	}
+
+	kfree(hk_masks);
+
+	/* map isolcpus to hardware context */
+	if (!alloc_cpumask_var(&isol_mask, GFP_KERNEL))
+		goto fallback;
+
+	queue = 0;
+	cpumask_andnot(isol_mask,
+		       cpu_possible_mask,
+		       housekeeping_cpumask(HK_TYPE_MANAGED_IRQ));
+
+	for_each_cpu(cpu, isol_mask) {
+		qmap->mq_map[cpu] = qmap->queue_offset + queue;
+		queue = (queue + 1) % qmap->nr_queues;
+	}
+
+	free_cpumask_var(isol_mask);
+
+	return true;
+
+fallback:
+	/* map all cpus to hardware context ignoring any affinity */
+	queue = 0;
+	for_each_possible_cpu(cpu) {
+		qmap->mq_map[cpu] = qmap->queue_offset + queue;
+		queue = (queue + 1) % qmap->nr_queues;
+	}
+	return true;
+}
+
 void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
 {
 	const struct cpumask *masks;
 	unsigned int queue, cpu, nr_masks;
 
+	if (blk_mq_map_hk_queues(qmap))
+		return;
+
 	nr_masks = qmap->nr_queues;
 	masks = group_cpus_evenly(&nr_masks);
 	if (!masks) {
@@ -121,6 +184,9 @@  void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap,
 	if (!dev->bus->irq_get_affinity)
 		goto fallback;
 
+	if (blk_mq_map_hk_queues(qmap))
+		return;
+
 	for (queue = 0; queue < qmap->nr_queues; queue++) {
 		mask = dev->bus->irq_get_affinity(dev, queue + offset);
 		if (!mask)