@@ -222,6 +222,109 @@ void spapr_set_associativity(uint32_t *assoc, int node_id, int cpu_index,
assoc[4] = cpu_to_be32(node_id);
}
+static void spapr_numa_assoc_assign_domain(SpaprMachineClass *smc,
+                                           uint8_t nodeA, uint8_t nodeB,
+                                           uint8_t numaLevel,
+                                           uint8_t curr_domain)
+{
+    /* Put nodeA and nodeB in one domain at numaLevel; a stored 0 means unset. */
+    uint8_t dom_a = smc->numa_assoc_domains[nodeA][numaLevel];
+    uint8_t dom_b = smc->numa_assoc_domains[nodeB][numaLevel];
+
+    if (dom_a != 0) {
+        /* nodeA's domain is kept, replacing whatever nodeB had. */
+        smc->numa_assoc_domains[nodeB][numaLevel] = dom_a;
+        return;
+    }
+
+    if (dom_b != 0) {
+        /* Only nodeB has a domain so far: nodeA joins it. */
+        smc->numa_assoc_domains[nodeA][numaLevel] = dom_b;
+        return;
+    }
+
+    /* Neither node has a domain yet: both take curr_domain. */
+    smc->numa_assoc_domains[nodeA][numaLevel] = curr_domain;
+    smc->numa_assoc_domains[nodeB][numaLevel] = curr_domain;
+}
+
+static void spapr_init_numa_assoc_domains(MachineState *machine)
+{
+    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
+    int nb_numa_nodes = machine->numa_state->num_nodes;
+    NodeInfo *numa_info = machine->numa_state->nodes;
+    /* Sized by MAX_NODES: no VLA, and valid even when num_nodes is zero. */
+    uint8_t existing_nodes[MAX_NODES];
+    int i, j, level, src_node, dst_node, nb_present = 0;
+
+    /*
+     * Fill smc->numa_assoc_domains from the user supplied distances: nodes
+     * paired within a distance band share a domain id at that band's level.
+     *
+     * Extra NUMA nodes created later (e.g. NVLink2 GPUs) are not known
+     * here. Starting the counter at MAX_NODES keeps domain ids from
+     * colliding with any NUMA id, such as a GPU numa_id.
+     * NOTE(review): the counter is 8 bits wide and can still wrap around
+     * if enough identifiers are handed out; confirm the supported limits.
+     */
+    uint8_t assoc_domain = MAX_NODES;
+
+    /* NUMA node ids may be sparse: collect the present ones once up front. */
+    for (i = 0; i < MAX_NODES && nb_present < nb_numa_nodes; i++) {
+        if (numa_info[i].present) {
+            existing_nodes[nb_present++] = i;
+        }
+    }
+
+    /* Pair up the present nodes to define associativity groups. */
+    for (i = 0; i < nb_present; i++) {
+        uint8_t distance = 20;
+        uint8_t lower_end = 10;
+
+        src_node = existing_nodes[i];
+        /* Bands (10,30], (30,60], (60,120] map to levels 2, 1 and 0. */
+        for (level = 2; level >= 0; level--) {
+            uint8_t upper_end = distance / 2 + distance;
+
+            for (j = i + 1; j < nb_present; j++) {
+                uint8_t node_dist;
+                int fresh;
+
+                dst_node = existing_nodes[j];
+                node_dist = numa_info[src_node].distance[dst_node];
+                if (node_dist <= lower_end || node_dist > upper_end) {
+                    continue;
+                }
+                /* A new id is consumed only when neither node has one yet. */
+                fresh = !smc->numa_assoc_domains[src_node][level] &&
+                        !smc->numa_assoc_domains[dst_node][level];
+                spapr_numa_assoc_assign_domain(smc, src_node, dst_node,
+                                               level, assoc_domain);
+                if (fresh) {
+                    assoc_domain++;
+                }
+            }
+
+            lower_end = upper_end;
+            distance *= 2;
+        }
+    }
+
+    /*
+     * Zero (0) is considered a valid associativity domain identifier, so
+     * give every slot still at zero a unique id to avoid unintended matches.
+     */
+    for (i = 0; i < nb_present; i++) {
+        src_node = existing_nodes[i];
+        for (j = 0; j < 3; j++) {
+            if (smc->numa_assoc_domains[src_node][j] == 0) {
+                smc->numa_assoc_domains[src_node][j] = assoc_domain;
+                assoc_domain++;
+            }
+        }
+    }
+}
+
static int spapr_fixup_cpu_numa_dt(void *fdt, int offset, PowerPCCPU *cpu,
MachineState *machine)
{
@@ -2887,6 +2990,12 @@ static void spapr_machine_init(MachineState *machine)
spapr->current_numa_id = 0;
spapr->extra_numa_nodes = 0;
+ /* We don't need to init the NUMA matrix if we're running in
+ * legacy NUMA mode. */
+ if (!spapr_machine_using_legacy_numa(spapr)) {
+ spapr_init_numa_assoc_domains(machine);
+ }
+
if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
spapr->max_compat_pvr)) {
This patch puts all the pieces together to finally allow user input when defining the NUMA topology of the spapr guest. The logic is centered in the new spapr_init_numa_assoc_domains() function. This is called once at machine_init(), if we're not using legacy_numa mode, to initialize the numa_assoc_domains matrix introduced in the previous patch. We can divide the logic into two stages that are interleaved in the body of this function. The first stage is to sanitize the user input from numa_state. Due to the nature of what ACPI allows the user to do (directly define the distances the guest will see in the DT) versus what PAPR allows (we can hint at associativity relations, the OS must decide what to do), we had to bake in kernel logic in here. The kernel allows 4 levels of NUMA, where the last one is always the node_id itself, with distance = 10. Each of the other levels doubles the distance of the previous level, meaning that the pSeries kernel will only show distances of 20, 40, 80 and 160 (in case no match is found). This first stage is then to get the distances defined by the user and approximate them to those discrete values: - user distance 11 to 30 will be interpreted as 20 - user distance 31 to 60 will be interpreted as 40 - user distance 61 to 120 will be interpreted as 80 - user distance 121 and beyond will be interpreted as 160 - user distance 10 stays 10 The other stage is defining the associativity domains based on the NUMA level match. Again, more than one strategy exists for this same problem, with different results. The approach taken is to re-use any existing associativity values for the new matches, instead of overwriting them with a new associativity match. This decision is necessary because neither we nor the pSeries kernel support multiple associativity domains for each resource, meaning that we have to decide what to preserve. With the current logic, the associativities established by the earlier nodes take precedence, i.e. 
associativities defined by the first node are retained along all other nodes. These decisions have direct impact on how the user will interact with the NUMA topology, and none of them are perfect. To keep this commit message no longer than it already is, let's update the existing documentation in ppc-spapr-numa.rst with more in depth details and design considerations/drawbacks in the next patch. Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com> --- hw/ppc/spapr.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+)