From a9c0fc5e0e136eb31ce2d5eab4b7332b99626aa8 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 11 Mar 2026 23:05:38 +0000 Subject: [PATCH 01/49] Revert "NVIDIA: SAUCE: untested: arm_mpam: resctrl: Allow resctrl to enable NUMA nid as MB domain-id" This reverts commit 448da3e0e4a3f779a02072996770318b9b8ecce5. The commit will be replaced by newer numa code. Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 49 ---------------------------------- include/linux/arm_mpam.h | 11 ++++++-- 2 files changed, 9 insertions(+), 51 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 8e87afa90656a..da079b343c1bd 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -1574,54 +1573,6 @@ static int mpam_resctrl_monitor_init_abmc(struct mpam_resctrl_mon *mon) return 0; } -bool resctrl_arch_get_mb_uses_numa_nid(void) -{ - return mb_uses_numa_nid; -} - -int resctrl_arch_set_mb_uses_numa_nid(bool enabled) -{ - struct rdt_resource *r; - struct mpam_resctrl_res *res; - struct mpam_resctrl_dom *dom; - struct rdt_ctrl_domain *ctrl_d; - - lockdep_assert_cpus_held(); - lockdep_assert_mems_held(); - - if (!mb_numa_nid_possible) - return -EOPNOTSUPP; - - if (mb_uses_numa_nid == enabled) - return 0; - - /* Domain IDs as NUMA nid is only defined for MBA */ - res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; - if (!res->class) - return -EOPNOTSUPP; - r = &res->resctrl_res; - - /* repaint the domain IDs */ - mb_uses_numa_nid = enabled; - list_for_each_entry(ctrl_d, &r->ctrl_domains, hdr.list) { - int cpu = cpumask_any(&ctrl_d->hdr.cpu_mask); - - dom = container_of(ctrl_d, struct mpam_resctrl_dom, resctrl_ctrl_dom); - ctrl_d->hdr.id = mpam_resctrl_pick_domain_id(cpu, dom->ctrl_comp); - } - - /* monitor domains are unaffected and should continue to use the L3 */ - - if (!enabled && mb_l3_cache_id_possible) - r->alloc_capable = true; - 
else if (enabled && mb_numa_nid_possible) - r->alloc_capable = true; - else - r->alloc_capable = false; - - return 0; -} - static void mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, enum resctrl_event_id type) { diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index aa7d6e1854741..b43494e734ded 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -78,8 +78,15 @@ struct rdt_resource; void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); -bool resctrl_arch_get_mb_uses_numa_nid(void); -int resctrl_arch_set_mb_uses_numa_nid(bool enabled); +static inline bool resctrl_arch_get_mb_uses_numa_nid(void) +{ + return false; +} + +static inline bool resctrl_arch_set_mb_uses_numa_nid(bool enabled) +{ + return false; +} /* * The CPU configuration for MPAM is cheap to write, and is only written if it From 49ac1c17256aa8b2cb7e497b876f1479fc859af4 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 11 Mar 2026 23:05:49 +0000 Subject: [PATCH 02/49] Revert "NVIDIA: SAUCE: arm_mpam: resctrl: Add NUMA node notifier for domain online/offline" This reverts commit c20b7f2cf107b170ab63bdf520efaf526c0b6dd1. The commit will be replaced by newer numa code. 
Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 108 --------------------------------- include/linux/memory.h | 1 - 2 files changed, 109 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index da079b343c1bd..cc7c69e2c3a7b 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -2119,26 +2118,6 @@ mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) return mpam_resctrl_get_mon_domain_from_cpu(cpu); } -static struct mpam_resctrl_dom * -mpam_get_domain_from_nid(int nid, struct mpam_resctrl_res *res) -{ - struct rdt_ctrl_domain *d; - struct mpam_resctrl_dom *dom; - - list_for_each_entry(d, &res->resctrl_res.ctrl_domains, hdr.list) { - dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); - - /* Only the memory class uses comp_id as nid */ - if (dom->ctrl_comp->class->type != MPAM_CLASS_MEMORY) - continue; - - if (dom->ctrl_comp->comp_id == nid) - return dom; - } - - return NULL; -} - int mpam_resctrl_online_cpu(unsigned int cpu) { int i, err = 0; @@ -2219,88 +2198,6 @@ int mpam_resctrl_offline_cpu(unsigned int cpu) return 0; } -static int mpam_resctrl_online_node(unsigned int nid) -{ - struct mpam_resctrl_dom *dom; - struct mpam_resctrl_res *res; - - /* Domain IDs as NUMA nid is only defined for MBA */ - res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; - if (!res->class) - return 0; // dummy_resource; - - dom = mpam_get_domain_from_nid(nid, res); - if (!dom) - dom = mpam_resctrl_alloc_domain_nid(nid, res); - if (IS_ERR(dom)) - return PTR_ERR(dom); - - return 0; -} - -static int mpam_resctrl_offline_node(unsigned int nid) -{ - struct mpam_resctrl_res *res; - struct mpam_resctrl_dom *dom; - struct rdt_mon_domain *mon_d; - struct rdt_ctrl_domain *ctrl_d; - - /* Domain IDs as NUMA nid is only defined for MBA */ - res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; - if (!res->class) - 
return 0; // dummy_resource; - - dom = mpam_get_domain_from_nid(nid, res); - if (WARN_ON_ONCE(!dom)) - return 0; - - ctrl_d = &dom->resctrl_ctrl_dom; - resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); - if (!mpam_resctrl_offline_domain_hdr(cpu_possible_mask, &ctrl_d->hdr)) - return 0; - - // TODO: skip monitor domains if there are no monitors for this resource - mon_d = &dom->resctrl_mon_dom; - resctrl_offline_mon_domain(&res->resctrl_res, mon_d); - if (!mpam_resctrl_offline_domain_hdr(cpu_possible_mask, &mon_d->hdr)) - return 0; - - kfree(dom); - - return 0; -} - -static int mpam_resctrl_node_notifier(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct node_notify *nn = arg; - - if (nn->nid < 0 || !mb_uses_numa_nid) - return NOTIFY_OK; - - /* - * Ignore nid that have CPUs. Resctrl needs to see the cpu offline - * call for each CPU to update the CPUs in control groups. Moving - * the overflow handler isn't an issue as only L3 can be mon_capable, - * and NUMA nid used as domain-id are only an option for MBA. 
- */ - if (!cpumask_empty(cpumask_of_node(nn->nid))) - return NOTIFY_OK; - - switch (action) { - case NODE_ADDED_FIRST_MEMORY: - mpam_resctrl_online_node(nn->nid); - break; - case NODE_REMOVED_LAST_MEMORY: - mpam_resctrl_offline_node(nn->nid); - break; - default: - /* don't care */ - } - - return NOTIFY_OK; -} - int mpam_resctrl_setup(void) { int err = 0; @@ -2347,11 +2244,6 @@ int mpam_resctrl_setup(void) mpam_resctrl_monitor_init(mon, j); } - if (mb_numa_nid_possible) { - hotplug_node_notifier(mpam_resctrl_node_notifier, - RESCTRL_CALLBACK_PRI); - } - cpus_read_unlock(); if (err || (!exposed_alloc_capable && !exposed_mon_capable)) { diff --git a/include/linux/memory.h b/include/linux/memory.h index 2a770e7c6ab1e..40eb70ccb09d5 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -126,7 +126,6 @@ struct mem_section; #define CPUSET_CALLBACK_PRI 10 #define MEMTIER_HOTPLUG_PRI 100 #define KSM_CALLBACK_PRI 100 -#define RESCTRL_CALLBACK_PRI 100 #ifndef CONFIG_MEMORY_HOTPLUG static inline void memory_dev_init(void) From 8f89d239e552bead1bc323b979d9a91bbf2adac4 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 11 Mar 2026 23:06:05 +0000 Subject: [PATCH 03/49] Revert "NVIDIA: SAUCE: untested: arm_mpam: resctrl: Split mpam_resctrl_alloc_domain() to have CPU and node" This reverts commit e6673d188bde917fb84c48ea2f1798efbef0882c. The commit will be replaced by newer numa code. 
Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 82 +++++++++------------------------- 1 file changed, 22 insertions(+), 60 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index cc7c69e2c3a7b..c32a49fea2a74 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -1913,19 +1912,36 @@ static void mpam_resctrl_domain_insert(struct list_head *list, } static struct mpam_resctrl_dom * -mpam_resctrl_alloc_domain(const struct cpumask *onlined_cpus, int nid, - struct mpam_component *ctrl_comp, +mpam_resctrl_alloc_domain(const struct cpumask *onlined_cpus, struct mpam_resctrl_res *res) { int err; struct mpam_resctrl_dom *dom; struct rdt_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; + int cpu = cpumask_any(onlined_cpus); + struct mpam_class *class = res->class; + struct mpam_component *comp_iter, *ctrl_comp; struct rdt_resource *r = &res->resctrl_res; lockdep_assert_held(&domain_list_lock); - dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, nid); + ctrl_comp = NULL; + idx = srcu_read_lock(&mpam_srcu); + list_for_each_entry_srcu(comp_iter, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_test_cpu(cpu, &comp_iter->affinity)) { + ctrl_comp = comp_iter; + break; + } + } + srcu_read_unlock(&mpam_srcu, idx); + + /* cpu with unknown exported component? 
*/ + if (WARN_ON_ONCE(!ctrl_comp)) + return ERR_PTR(-EINVAL); + + dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu)); if (!dom) return ERR_PTR(-ENOMEM); @@ -1933,6 +1949,7 @@ mpam_resctrl_alloc_domain(const struct cpumask *onlined_cpus, int nid, dom->ctrl_comp = ctrl_comp; ctrl_d = &dom->resctrl_ctrl_dom; + mpam_resctrl_domain_hdr_init(onlined_cpus, ctrl_comp, &ctrl_d->hdr); ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN; mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr); @@ -2040,61 +2057,6 @@ static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) * component. * For the monitors, we need to search the list of events... */ -static struct mpam_resctrl_dom * -mpam_resctrl_alloc_domain_cpu(int cpu, struct mpam_resctrl_res *res) -{ - struct mpam_component *comp_iter, *ctrl_comp; - struct mpam_class *class = res->class; - int idx; - - ctrl_comp = NULL; - idx = srcu_read_lock(&mpam_srcu); - list_for_each_entry_srcu(comp_iter, &class->components, class_list, - srcu_read_lock_held(&mpam_srcu)) { - if (cpumask_test_cpu(cpu, &comp_iter->affinity)) { - ctrl_comp = comp_iter; - break; - } - } - srcu_read_unlock(&mpam_srcu, idx); - - /* cpu with unknown exported component? 
*/ - if (WARN_ON_ONCE(!ctrl_comp)) - return ERR_PTR(-EINVAL); - - return mpam_resctrl_alloc_domain(cpumask_of(cpu), cpu_to_node(cpu), - ctrl_comp, res); -} - -static struct mpam_resctrl_dom * -mpam_resctrl_alloc_domain_nid(int nid, struct mpam_resctrl_res *res) -{ - struct mpam_component *comp_iter, *ctrl_comp; - struct mpam_class *class = res->class; - int idx; - - /* Only the memory class uses comp_id as nid */ - if (class->type != MPAM_CLASS_MEMORY) - return ERR_PTR(-EINVAL); - - ctrl_comp = NULL; - idx = srcu_read_lock(&mpam_srcu); - list_for_each_entry_srcu(comp_iter, &class->components, class_list, - srcu_read_lock_held(&mpam_srcu)) { - if (comp_iter->comp_id == nid) { - ctrl_comp = comp_iter; - break; - } - } - srcu_read_unlock(&mpam_srcu, idx); - - /* cpu with unknown exported component? */ - if (WARN_ON_ONCE(!ctrl_comp)) - return ERR_PTR(-EINVAL); - - return mpam_resctrl_alloc_domain(cpu_possible_mask, nid, ctrl_comp, res); -} - static struct mpam_resctrl_dom * mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) { @@ -2132,7 +2094,7 @@ int mpam_resctrl_online_cpu(unsigned int cpu) dom = mpam_resctrl_get_domain_from_cpu(cpu, res); if (!dom) - dom = mpam_resctrl_alloc_domain_cpu(cpu, res); + dom = mpam_resctrl_alloc_domain(cpumask_of(cpu), res); if (IS_ERR(dom)) { err = PTR_ERR(dom); break; From 09d98ec1e1d5331d445fdf1ed51c0e938faadf31 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 11 Mar 2026 23:06:15 +0000 Subject: [PATCH 04/49] Revert "NVIDIA: SAUCE: arm_mpam: resctrl: Change domain_hdr online/offline to work with a set of CPUs" This reverts commit 48a102d9ed9f77967098b028c09d48333d9d729f. The commit will be replaced by newer numa code. 
Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 61 ++++++++++------------------------ 1 file changed, 17 insertions(+), 44 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index c32a49fea2a74..e0ed713a52d0e 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1830,46 +1830,30 @@ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) mpam_reset_class_locked(res->class); } -/** - * mpam_resctrl_domain_hdr_init() - Bring a subset of a domain online. - * @onlined_cpus: The set of CPUs that are online from the domain's - * perspective. - * @comp: The mpam component being brought online. - * @hdr: The header representing the domain. - * - * Adds @onlined_cpus to @hdr's cpu_mask, and sets the @hdr id. - * For NUMA nodes, @onlined_cpus will be cpu_possible_mask. - */ -static void mpam_resctrl_domain_hdr_init(const struct cpumask *onlined_cpus, - struct mpam_component *comp, +static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp, struct rdt_domain_hdr *hdr) { - int cpu = cpumask_any(onlined_cpus); - lockdep_assert_cpus_held(); INIT_LIST_HEAD(&hdr->list); hdr->id = mpam_resctrl_pick_domain_id(cpu, comp); - cpumask_and(&hdr->cpu_mask, &hdr->cpu_mask, onlined_cpus); + cpumask_set_cpu(cpu, &hdr->cpu_mask); } /** - * mpam_resctrl_offline_domain_hdr() - Take a subset of a domain offline. - * @offlined_cpus: The set of CPUs that are offline from the domain's - * perspective. + * mpam_resctrl_offline_domain_hdr() - Update the domain header to remove a CPU. + * @cpu: The CPU to remove from the domain. * @hdr: The domain's header. * - * Removes @offlined_cpus from @hdr's cpu_mask. If the list is empty, + * Removes @cpu from the header mask. If this was the last CPU in the domain, * the domain header is removed from its parent list and true is returned, * indicating the parent structure can be freed. * If there are other CPUs in the domain, returns false. 
- * - * For NUMA nodes, @offlined_cpus will be cpu_possible_mask. */ -static bool mpam_resctrl_offline_domain_hdr(const struct cpumask *offlined_cpus, +static bool mpam_resctrl_offline_domain_hdr(unsigned int cpu, struct rdt_domain_hdr *hdr) { - cpumask_andnot(&hdr->cpu_mask, &hdr->cpu_mask, offlined_cpus); + cpumask_clear_cpu(cpu, &hdr->cpu_mask); if (cpumask_empty(&hdr->cpu_mask)) { list_del(&hdr->list); return true; @@ -1878,18 +1862,14 @@ static bool mpam_resctrl_offline_domain_hdr(const struct cpumask *offlined_cpus, return false; } -static struct mpam_component *find_component(struct mpam_class *victim, - const struct cpumask *onlined_cpus) +static struct mpam_component *find_component(struct mpam_class *victim, int cpu) { struct mpam_component *victim_comp; guard(srcu)(&mpam_srcu); list_for_each_entry_srcu(victim_comp, &victim->components, class_list, srcu_read_lock_held(&mpam_srcu)) { - struct cpumask tmp; - - cpumask_andnot(&tmp, onlined_cpus, &victim_comp->affinity); - if (cpumask_empty(&tmp)) + if (cpumask_test_cpu(cpu, &victim_comp->affinity)) return victim_comp; } @@ -1912,14 +1892,12 @@ static void mpam_resctrl_domain_insert(struct list_head *list, } static struct mpam_resctrl_dom * -mpam_resctrl_alloc_domain(const struct cpumask *onlined_cpus, - struct mpam_resctrl_res *res) +mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) { int err; struct mpam_resctrl_dom *dom; struct rdt_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; - int cpu = cpumask_any(onlined_cpus); struct mpam_class *class = res->class; struct mpam_component *comp_iter, *ctrl_comp; struct rdt_resource *r = &res->resctrl_res; @@ -1949,8 +1927,7 @@ mpam_resctrl_alloc_domain(const struct cpumask *onlined_cpus, dom->ctrl_comp = ctrl_comp; ctrl_d = &dom->resctrl_ctrl_dom; - - mpam_resctrl_domain_hdr_init(onlined_cpus, ctrl_comp, &ctrl_d->hdr); + mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, &ctrl_d->hdr); ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN; 
mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr); err = resctrl_online_ctrl_domain(r, ctrl_d); @@ -1980,7 +1957,7 @@ mpam_resctrl_alloc_domain(const struct cpumask *onlined_cpus, if (!mon->class) continue; // dummy resource - mon_comp = find_component(mon->class, onlined_cpus); + mon_comp = find_component(mon->class, cpu); dom->mon_comp[i] = mon_comp; if (mon_comp) any_mon_comp = mon_comp; @@ -1990,8 +1967,7 @@ mpam_resctrl_alloc_domain(const struct cpumask *onlined_cpus, dom->mbm_local_evt_cfg = MPAM_RESTRL_EVT_CONFIG_VALID; mon_d = &dom->resctrl_mon_dom; - mpam_resctrl_domain_hdr_init(onlined_cpus, any_mon_comp, - &mon_d->hdr); + mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, &mon_d->hdr); mon_d->hdr.type = RESCTRL_MON_DOMAIN; mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr); err = resctrl_online_mon_domain(r, mon_d); @@ -2005,8 +1981,7 @@ mpam_resctrl_alloc_domain(const struct cpumask *onlined_cpus, goto out; offline_mon_hdr: - mpam_resctrl_offline_domain_hdr(onlined_cpus, &ctrl_d->hdr); - + mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); offline_ctrl_domain: resctrl_offline_ctrl_domain(r, ctrl_d); out: @@ -2094,7 +2069,7 @@ int mpam_resctrl_online_cpu(unsigned int cpu) dom = mpam_resctrl_get_domain_from_cpu(cpu, res); if (!dom) - dom = mpam_resctrl_alloc_domain(cpumask_of(cpu), res); + dom = mpam_resctrl_alloc_domain(cpu, res); if (IS_ERR(dom)) { err = PTR_ERR(dom); break; @@ -2137,8 +2112,7 @@ int mpam_resctrl_offline_cpu(unsigned int cpu) mpam_reset_component_locked(dom->ctrl_comp); ctrl_d = &dom->resctrl_ctrl_dom; - ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpumask_of(cpu), - &ctrl_d->hdr); + ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); if (ctrl_dom_empty) resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); } @@ -2146,8 +2120,7 @@ int mpam_resctrl_offline_cpu(unsigned int cpu) mon_dom_empty = true; if (exposed_mon_capable) { mon_d = &dom->resctrl_mon_dom; - mon_dom_empty = 
mpam_resctrl_offline_domain_hdr(cpumask_of(cpu), - &mon_d->hdr); + mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); if (mon_dom_empty) resctrl_offline_mon_domain(&res->resctrl_res, mon_d); } From a85bdf2269db3931510bc7d16b2de3dddc1ea282 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 11 Mar 2026 23:06:39 +0000 Subject: [PATCH 05/49] Revert "NVIDIA: SAUCE: arm_mpam: resctrl: Pick whether MB can use NUMA nid instead of cache-id" This reverts commit bdc6890ccd05f9a9aa605ba6a008e083b1a2d4dd. The commit will be replaced by newer numa code. Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 57 +++------------------------------- 1 file changed, 5 insertions(+), 52 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index e0ed713a52d0e..a773433767361 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -54,14 +54,6 @@ static bool exposed_mon_capable; */ static bool cdp_enabled; -/* - * To support CPU-less NUMA nodes, user-space needs to opt in to the MB - * domain IDs being the NUMA nid instead of the corresponding CPU's L3 - * cache-id. - */ -static bool mb_uses_numa_nid; -static bool mb_numa_nid_possible; -static bool mb_l3_cache_id_possible; /* * If resctrl_init() succeeded, resctrl_exit() can be used to remove support * for the filesystem in the event of an error. @@ -980,15 +972,6 @@ static bool topology_matches_l3(struct mpam_class *victim) return true; } -static bool topology_matches_numa(struct mpam_class *victim) -{ - /* - * For now, check this is a memory class, in which case component - * id are already NUMA nid. - */ - return (victim->type == MPAM_CLASS_MEMORY); -} - /* Test whether we can export MPAM_CLASS_CACHE:{2,3}? 
*/ static void mpam_resctrl_pick_caches(void) { @@ -1058,8 +1041,6 @@ static void mpam_resctrl_pick_mba(void) list_for_each_entry_srcu(class, &mpam_classes, classes_list, srcu_read_lock_held(&mpam_srcu)) { struct mpam_props *cprops = &class->props; - bool l3_cache_id_possible = false; - bool numa_nid_possible = false; if (class->level < 3) { pr_debug("class %u is before L3\n", class->level); @@ -1076,18 +1057,8 @@ static void mpam_resctrl_pick_mba(void) continue; } - if (topology_matches_numa(class)) { - pr_debug("class %u topology matches NUMA domains\n", class->level); - numa_nid_possible = true; - } - - if (topology_matches_l3(class)) { - pr_debug("class %u topology matches L3\n", class->level); - l3_cache_id_possible = true; - } - - if (!l3_cache_id_possible && !numa_nid_possible) { - pr_debug("class %u has no matching topology for MB\n", class->level); + if (!topology_matches_l3(class)) { + pr_debug("class %u topology doesn't match L3\n", class->level); continue; } @@ -1096,17 +1067,8 @@ static void mpam_resctrl_pick_mba(void) * mbm_local is implicitly part of the L3, pick a resource to be MBA * that as close as possible to the L3. */ - if (!candidate_class || class->level < candidate_class->level) { - /* - * Refuse to pick a closer class if it would prevent cache-id - * being used as domain-id by default. 
- */ - if (!candidate_class || l3_cache_id_possible) { - candidate_class = class; - mb_l3_cache_id_possible = l3_cache_id_possible; - mb_numa_nid_possible = numa_nid_possible; - } - } + if (!candidate_class || class->level < candidate_class->level) + candidate_class = class; } if (candidate_class) { @@ -1483,10 +1445,7 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, break; case RDT_RESOURCE_MBA: - /* Domain ID is the L3 cache-id by default */ - if (mb_l3_cache_id_possible) - r->alloc_capable = true; - + r->alloc_capable = true; r->schema_fmt = RESCTRL_SCHEMA_PERCENT; r->ctrl_scope = RESCTRL_L3_CACHE; @@ -1508,14 +1467,8 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) { - bool is_mb; struct mpam_class *class = comp->class; - is_mb = (mpam_resctrl_controls[RDT_RESOURCE_MBA].class == class); - - if (is_mb && mb_uses_numa_nid && topology_matches_numa(class)) - return comp->comp_id; - if (class->type == MPAM_CLASS_CACHE) return comp->comp_id; From 94b11e83d11264bd93f7c339899956fc85258d5e Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 11 Mar 2026 23:06:48 +0000 Subject: [PATCH 06/49] Revert "NVIDIA: SAUCE: Fix unused variable warning" This reverts commit 4b4b38f5ebcb6f91c728fc7d3b7f78f1bd618622. The commit will be replaced by newer numa code. 
Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index a773433767361..5f8354a3b60a1 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1847,7 +1847,7 @@ static void mpam_resctrl_domain_insert(struct list_head *list, static struct mpam_resctrl_dom * mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) { - int err; + int err, idx; struct mpam_resctrl_dom *dom; struct rdt_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; From c31f19fec275494dce1f12812470efcc1e0d9ae8 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 11 Mar 2026 23:06:55 +0000 Subject: [PATCH 07/49] Revert "NVIDIA: SAUCE: fs/resctrl: Add mount option for mb_uses_numa_nid and arch stubs" This reverts commit 1e1f7a63321378f5d6fc272347cf36928d93287a. The commit will be replaced by newer numa code. Signed-off-by: Fenghua Yu --- arch/x86/include/asm/resctrl.h | 9 --------- fs/resctrl/internal.h | 1 - fs/resctrl/rdtgroup.c | 26 ++++---------------------- include/linux/arm_mpam.h | 10 ---------- 4 files changed, 4 insertions(+), 42 deletions(-) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 279aba8e97bf5..40a74a0617345 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -198,15 +198,6 @@ static inline bool resctrl_arch_mon_can_overflow(void) void resctrl_cpu_detect(struct cpuinfo_x86 *c); -static inline bool resctrl_arch_get_mb_uses_numa_nid(void) -{ - return false; -} - -static inline int resctrl_arch_set_mb_uses_numa_nid(bool enabled) -{ - return -EOPNOTSUPP; -} #else static inline void resctrl_arch_sched_in(struct task_struct *tsk) {} diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index f5f74342af317..24f340f5f4de0 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -42,7 +42,6 @@ struct rdt_fs_context { bool 
enable_cdpl3; bool enable_mba_mbps; bool enable_debug; - bool mb_uses_numa_nid; bool enable_abi_playground; }; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 3c9981f545017..e2451c93123fd 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2768,7 +2768,6 @@ static void rdt_disable_ctx(void) { resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); - resctrl_arch_set_mb_uses_numa_nid(false); set_mba_sc(false); resctrl_debug = false; @@ -2799,17 +2798,8 @@ static int rdt_enable_ctx(struct rdt_fs_context *ctx) if (ctx->enable_debug) resctrl_debug = true; - if (ctx->mb_uses_numa_nid) { - ret = resctrl_arch_set_mb_uses_numa_nid(true); - if (ret) - goto out_debug; - } - return 0; -out_debug: - resctrl_debug = false; - set_mba_sc(false); out_cdpl3: resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); out_cdpl2: @@ -3098,17 +3088,15 @@ enum rdt_param { Opt_cdpl2, Opt_mba_mbps, Opt_debug, - Opt_mb_uses_numa_nid, Opt_not_abi_playground, nr__rdt_params }; static const struct fs_parameter_spec rdt_fs_parameters[] = { - fsparam_flag("cdp", Opt_cdp), - fsparam_flag("cdpl2", Opt_cdpl2), - fsparam_flag("mba_MBps", Opt_mba_mbps), - fsparam_flag("debug", Opt_debug), - fsparam_flag("mb_uses_numa_nid", Opt_mb_uses_numa_nid), + fsparam_flag("cdp", Opt_cdp), + fsparam_flag("cdpl2", Opt_cdpl2), + fsparam_flag("mba_MBps", Opt_mba_mbps), + fsparam_flag("debug", Opt_debug), /* * Some of MPAM's out of tree code exposes things through resctrl @@ -3146,9 +3134,6 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_debug: ctx->enable_debug = true; return 0; - case Opt_mb_uses_numa_nid: - ctx->mb_uses_numa_nid = true; - return 0; case Opt_not_abi_playground: ctx->enable_abi_playground = true; return 0; @@ -4406,9 +4391,6 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) if (resctrl_debug) seq_puts(seq, ",debug"); - if 
(resctrl_arch_get_mb_uses_numa_nid()) - seq_puts(seq, ",mb_uses_numa_nid"); - if (static_branch_unlikely(&resctrl_abi_playground)) seq_puts(seq, ",this_is_not_abi"); diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index b43494e734ded..06827f240cf9e 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -78,16 +78,6 @@ struct rdt_resource; void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); -static inline bool resctrl_arch_get_mb_uses_numa_nid(void) -{ - return false; -} - -static inline bool resctrl_arch_set_mb_uses_numa_nid(bool enabled) -{ - return false; -} - /* * The CPU configuration for MPAM is cheap to write, and is only written if it * has changed. No need for fine grained enables. From ab79bc7624d5c9883d9b99849fcb231384df04d4 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 11 Mar 2026 23:07:39 +0000 Subject: [PATCH 08/49] Revert "NVIDIA: SAUCE: fs/resctrl: Take memory hotplug lock whenever taking CPU hotplug lock" This reverts commit c50be78a4e44848af0e95149a83c7fbbdef4df0f. Memory hotplug feature will not be supported. Revert it to avoid any issue. 
Signed-off-by: Fenghua Yu --- fs/resctrl/monitor.c | 5 ----- fs/resctrl/pseudo_lock.c | 3 --- fs/resctrl/rdtgroup.c | 17 ----------------- 3 files changed, 25 deletions(-) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index e62432467817f..2a50109cf7c91 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -18,7 +18,6 @@ #define pr_fmt(fmt) "resctrl: " fmt #include -#include #include #include #include @@ -764,7 +763,6 @@ void cqm_handle_limbo(struct work_struct *work) struct rdt_mon_domain *d; cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); @@ -779,7 +777,6 @@ void cqm_handle_limbo(struct work_struct *work) } mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); } @@ -813,7 +810,6 @@ void mbm_handle_overflow(struct work_struct *work) struct rdt_resource *r; cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); /* @@ -847,7 +843,6 @@ void mbm_handle_overflow(struct work_struct *work) out_unlock: mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); } diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c index 4086e61df3e1c..87bbc2605de12 100644 --- a/fs/resctrl/pseudo_lock.c +++ b/fs/resctrl/pseudo_lock.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -695,7 +694,6 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) int ret = -1; cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); if (rdtgrp->flags & RDT_DELETED) { @@ -743,7 +741,6 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) out: mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); return ret; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index e2451c93123fd..48ecde1ca9fd2 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include 
@@ -1156,7 +1155,6 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, u32 ctrl_val; cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); hw_shareable = r->cache.shareable_bits; list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { @@ -1217,7 +1215,6 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, } seq_putc(seq, '\n'); mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); return 0; } @@ -1722,7 +1719,6 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid bool sep = false; cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); list_for_each_entry(dom, &r->mon_domains, hdr.list) { @@ -1741,7 +1737,6 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid seq_puts(s, "\n"); mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); return 0; @@ -1886,7 +1881,6 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, return -EINVAL; cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1896,7 +1890,6 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); return ret ?: nbytes; @@ -1914,7 +1907,6 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, return -EINVAL; cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1924,7 +1916,6 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); return ret ?: nbytes; @@ -2736,7 +2727,6 @@ struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) rdtgroup_kn_get(rdtgrp, kn); cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); /* Was this group deleted 
while we waited? */ @@ -2754,7 +2744,6 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) return; mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); rdtgroup_kn_put(rdtgrp, kn); @@ -2973,7 +2962,6 @@ static int rdt_get_tree(struct fs_context *fc) enable_abi_playground(); cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); /* * resctrl file system can only be mounted once. @@ -3078,7 +3066,6 @@ static int rdt_get_tree(struct fs_context *fc) out: rdt_last_cmd_clear(); mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); return ret; } @@ -3363,7 +3350,6 @@ static void rdt_kill_sb(struct super_block *sb) struct rdt_resource *r; cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); rdt_disable_ctx(); @@ -3380,7 +3366,6 @@ static void rdt_kill_sb(struct super_block *sb) resctrl_mounted = false; kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); if (static_branch_unlikely(&resctrl_abi_playground)) @@ -4771,14 +4756,12 @@ static bool resctrl_online_domains_exist(void) void resctrl_exit(void) { cpus_read_lock(); - get_online_mems(); WARN_ON_ONCE(resctrl_online_domains_exist()); mutex_lock(&rdtgroup_mutex); resctrl_fs_teardown(); mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); debugfs_remove_recursive(debugfs_resctrl); From bea908d73f0179442d53bdf46249ffb03ea0bdaa Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 11 Mar 2026 23:07:47 +0000 Subject: [PATCH 09/49] Revert "NVIDIA: SAUCE: mm,memory_hotplug: Add lockdep assertion helper" This reverts commit de478cb2efa20550f8d5cf49cdea1449d7fc252a. The helpers are not used. Revert it to avoid any issue. 
Signed-off-by: Fenghua Yu --- include/linux/memory_hotplug.h | 6 ------ mm/memory_hotplug.c | 11 ----------- 2 files changed, 17 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index acc5ac1e92491..23f038a162319 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -334,10 +334,4 @@ extern int arch_create_linear_mapping(int nid, u64 start, u64 size, void arch_remove_linear_mapping(u64 start, u64 size); #endif /* CONFIG_MEMORY_HOTPLUG */ -#if defined(CONFIG_LOCKDEP) && defined(CONFIG_MEMORY_HOTPLUG) -void lockdep_assert_mems_held(void); -#else -static inline void lockdep_assert_mems_held(void) { } -#endif - #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 89ec5ed8c488b..74318c7877156 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -218,17 +218,6 @@ void put_online_mems(void) percpu_up_read(&mem_hotplug_lock); } -#ifdef CONFIG_LOCKDEP -void lockdep_assert_mems_held(void) -{ - /* See lockdep_assert_cpus_held() */ - if (system_state < SYSTEM_RUNNING) - return; - - percpu_rwsem_assert_held(&mem_hotplug_lock); -} -#endif - bool movable_node_enabled = false; static int mhp_default_online_type = -1; From 9566a3de8508c7cf692b8d6cde761a5a325514a8 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 12 Mar 2026 21:35:18 +0000 Subject: [PATCH 10/49] Revert "NVIDIA: SAUCE: untested: arm_mpam: resctrl: Allow monitors to be configured with filters" This reverts commit 27629a49640893aa971e110c6f502ab5d8568578. This feature is low priority but the commit may be buggy and hard to maintain. Remove it to avoid any potential issues. May support this feature in the future. 
Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 35 --------- drivers/resctrl/mpam_internal.h | 9 --- drivers/resctrl/mpam_resctrl.c | 127 ++++++-------------------------- 3 files changed, 24 insertions(+), 147 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index f0740b5d59b5b..711152b071ddf 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1579,41 +1579,6 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, return err; } -void mpam_msmon_reset_all_mbwu(struct mpam_component *comp) -{ - int idx, i; - struct mpam_msc *msc; - struct mpam_vmsc *vmsc; - struct mpam_msc_ris *ris; - - if (!mpam_is_enabled()) - return; - - idx = srcu_read_lock(&mpam_srcu); - list_for_each_entry_rcu(vmsc, &comp->vmsc, comp_list) { - if (!mpam_has_feature(mpam_feat_msmon_mbwu, &vmsc->props)) - continue; - - msc = vmsc->msc; - mpam_mon_sel_outer_lock(msc); - list_for_each_entry_rcu(ris, &msc->ris, vmsc_list) { - if (!mpam_has_feature(mpam_feat_msmon_mbwu, &ris->props)) - continue; - - if (WARN_ON_ONCE(!mpam_mon_sel_inner_lock(msc))) - continue; - - for (i = 0; i < ris->props.num_mbwu_mon; i++) { - ris->mbwu_state[i].correction = 0; - ris->mbwu_state[i].reset_on_next_read = true; - } - mpam_mon_sel_inner_unlock(msc); - } - mpam_mon_sel_outer_unlock(msc); - } - srcu_read_unlock(&mpam_srcu, idx); -} - void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx) { struct mpam_msc *msc; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index c2cb5129e3e21..1f82a2183c1c1 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -45,12 +45,6 @@ DECLARE_STATIC_KEY_FALSE(mpam_enabled); */ #define USE_PRE_ALLOCATED (U16_MAX + 1) -/* - * Only these event configuration bits are supported. MPAM can't know if - * data is being written back, these will show up as a write. 
- */ -#define MPAM_RESTRL_EVT_CONFIG_VALID (READS_TO_LOCAL_MEM | NON_TEMP_WRITE_TO_LOCAL_MEM) - static inline bool mpam_is_enabled(void) { return static_branch_likely(&mpam_enabled); @@ -456,8 +450,6 @@ struct mpam_resctrl_dom { struct rdt_ctrl_domain resctrl_ctrl_dom; struct rdt_mon_domain resctrl_mon_dom; - - u32 mbm_local_evt_cfg; }; struct mpam_resctrl_res { @@ -541,7 +533,6 @@ int mpam_apply_config(struct mpam_component *comp, u16 partid, int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, enum mpam_device_features, u64 *val); void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx); -void mpam_msmon_reset_all_mbwu(struct mpam_component *comp); int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 5f8354a3b60a1..5c5e9876e5ed2 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -100,6 +100,23 @@ bool resctrl_arch_mon_capable(void) return exposed_mon_capable; } +bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) +{ + return false; +} + +void resctrl_arch_mon_event_config_read(void *info) +{ +} + +void resctrl_arch_mon_event_config_write(void *info) +{ +} + +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) +{ +} + bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid) { switch (rid) { @@ -455,7 +472,7 @@ bool resctrl_arch_mon_can_overflow(void) static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, - enum mpam_device_features mon_type, enum mon_filter_options mon_opts, + enum mpam_device_features mon_type, int mon_idx, enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val) { @@ -484,7 +501,6 @@ __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, cfg.match_pmg = true; cfg.partid = closid; cfg.pmg = rmid; - cfg.opts = mon_opts; if (irqs_disabled()) { /* 
Check if we can access this domain without an IPI */ @@ -495,41 +511,29 @@ __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, } static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, - enum mpam_device_features mon_type, enum mon_filter_options mon_opts, + enum mpam_device_features mon_type, int mon_idx, u32 closid, u32 rmid, u64 *val) { if (cdp_enabled) { u64 cdp_val = 0; int err; - err = __read_mon(mon, mon_comp, mon_type, mon_opts, mon_idx, + err = __read_mon(mon, mon_comp, mon_type, mon_idx, CDP_CODE, closid, rmid, &cdp_val); if (err) return err; - err = __read_mon(mon, mon_comp, mon_type, mon_opts, mon_idx, + err = __read_mon(mon, mon_comp, mon_type, mon_idx, CDP_DATA, closid, rmid, &cdp_val); if (!err) *val += cdp_val; return err; } - return __read_mon(mon, mon_comp, mon_type, mon_idx, mon_opts, + return __read_mon(mon, mon_comp, mon_type, mon_idx, CDP_NONE, closid, rmid, val); } -static enum mon_filter_options resctrl_evt_config_to_mpam(u32 local_evt_cfg) -{ - switch (local_evt_cfg) { - case READS_TO_LOCAL_MEM: - return COUNT_READ; - case NON_TEMP_WRITE_TO_LOCAL_MEM: - return COUNT_WRITE; - default: - return COUNT_BOTH; - } -} - /* MBWU when not in ABMC mode, and CSU counters. 
*/ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid, @@ -537,7 +541,6 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, { struct mpam_resctrl_dom *l3_dom; struct mpam_component *mon_comp; - enum mon_filter_options mon_opts; u32 mon_idx = *(u32 *)arch_mon_ctx; enum mpam_device_features mon_type; struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; @@ -552,7 +555,6 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, l3_dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); mon_comp = l3_dom->mon_comp[eventid]; - mon_opts = resctrl_evt_config_to_mpam(l3_dom->mbm_local_evt_cfg); switch (eventid) { case QOS_L3_OCCUP_EVENT_ID: @@ -566,7 +568,7 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, return -EINVAL; } - return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_opts, mon_idx, + return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_idx, closid, rmid, val); } @@ -576,7 +578,6 @@ int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, enum resctrl_event_id eventid, u64 *val) { struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; - enum mon_filter_options mon_opts; struct mpam_resctrl_dom *l3_dom; struct mpam_component *mon_comp; @@ -588,10 +589,9 @@ int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, l3_dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); mon_comp = l3_dom->mon_comp[eventid]; - mon_opts = resctrl_evt_config_to_mpam(l3_dom->mbm_local_evt_cfg); return read_mon_cdp_safe(mon, mon_comp, mpam_feat_msmon_mbwu, mon_idx, - mon_opts, closid, rmid, val); + closid, rmid, val); } static void __reset_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, @@ -1262,82 +1262,6 @@ static void mpam_resctrl_pick_counters(void) mpam_resctrl_counters[QOS_L3_MBM_TOTAL_EVENT_ID].class); } -bool 
resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) -{ - struct mpam_class *class; - struct mpam_props *cprops; - - class = mpam_resctrl_counters[evt].class; - if (!class) - return false; - - cprops = &class->props; - - return mpam_has_feature(mpam_feat_msmon_mbwu_rwbw, cprops); -} - -void resctrl_arch_mon_event_config_read(void *info) -{ - struct mpam_resctrl_dom *dom; - struct resctrl_mon_config_info *mon_info = info; - - if (!mpam_is_enabled()) { - mon_info->mon_config = 0; - return; - } - - dom = container_of(mon_info->d, struct mpam_resctrl_dom, resctrl_mon_dom); - mon_info->mon_config = dom->mbm_local_evt_cfg & MAX_EVT_CONFIG_BITS; -} - -void resctrl_arch_mon_event_config_write(void *info) -{ - struct mpam_resctrl_dom *dom; - struct resctrl_mon_config_info *mon_info = info; - - WARN_ON_ONCE(mon_info->mon_config & ~MPAM_RESTRL_EVT_CONFIG_VALID); - - dom = container_of(mon_info->d, struct mpam_resctrl_dom, resctrl_mon_dom); - - if (!mpam_is_enabled()) { - dom->mbm_local_evt_cfg = 0; - return; - } - - dom->mbm_local_evt_cfg = mon_info->mon_config & MPAM_RESTRL_EVT_CONFIG_VALID; -} - -void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - int i; - struct mpam_resctrl_dom *dom; - struct mpam_resctrl_mon *mon; - struct mpam_component *mon_comp; - - dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); - if (!mpam_is_enabled()) { - dom->mbm_local_evt_cfg = 0; - return; - } - dom->mbm_local_evt_cfg = MPAM_RESTRL_EVT_CONFIG_VALID; - - /* - * Monitors may be backed by different classes of MSC, all - * possible components need to be reset... 
- */ - for (i = 0; i < QOS_NUM_EVENTS; i++) { - mon = &mpam_resctrl_counters[i]; - if (!mon->class) - continue; // dummy resource - - mon_comp = dom->mon_comp[i]; - if (!mon_comp) - continue; - - mpam_msmon_reset_all_mbwu(mon_comp); - } -} - static void __config_cntr(struct mpam_resctrl_mon *mon, u32 cntr_id, enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, bool assign) @@ -1575,7 +1499,6 @@ static void mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, case QOS_L3_MBM_LOCAL_EVENT_ID: case QOS_L3_MBM_TOTAL_EVENT_ID: mpam_resctrl_monitor_init_abmc(mon); - l3->mon.mbm_cfg_mask = MPAM_RESTRL_EVT_CONFIG_VALID; return; default: @@ -1917,7 +1840,6 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) } WARN_ON_ONCE(!any_mon_comp); - dom->mbm_local_evt_cfg = MPAM_RESTRL_EVT_CONFIG_VALID; - mon_d = &dom->resctrl_mon_dom; mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, &mon_d->hdr); mon_d->hdr.type = RESCTRL_MON_DOMAIN; From cea4bbc0cd54731b195a4320b2d1d404a52e6474 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 13 Mar 2026 18:44:04 +0000 Subject: [PATCH 11/49] Revert "NVIDIA: SAUCE: arm_mpam: Add workaround for T241-MPAM-4" This reverts commit 56038ccc4cf9453c50b2dad2c8dc78c57f7662d0. This workaround is buggy and will be replaced by a new workaround. Signed-off-by: Fenghua Yu --- Documentation/arch/arm64/silicon-errata.rst | 2 - drivers/resctrl/mpam_devices.c | 63 +++++---------------- drivers/resctrl/mpam_internal.h | 1 - 3 files changed, 14 insertions(+), 52 deletions(-) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index 3667650036fba..4b1076311455c 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -251,8 +251,6 @@ stable kernels. 
+----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 MPAM | T241-MPAM-1 | N/A | +----------------+-----------------+-----------------+-----------------------------+ -| NVIDIA | T241 MPAM | T241-MPAM-4 | N/A | -+----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 MPAM | T241-MPAM-6 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 711152b071ddf..7ca2a1c4a13a2 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -877,12 +877,6 @@ static const struct mpam_quirk mpam_quirks[] = { .workaround = T241_SCRUB_SHADOW_REGS, }, { - /* NVIDIA t241 erratum T241-MPAM-4 */ - .iidr = IIDR_PROD(0x241) | IIDR_VAR(0) | IIDR_REV(0) | IIDR_IMP(0x36b), - .iidr_mask = IIDR_MATCH_ONE, - .workaround = T241_FORCE_MBW_MIN_TO_ONE, - }, - { /* NVIDIA t241 erratum T241-MPAM-6 */ .iidr = IIDR_PROD(0x241) | IIDR_VAR(0) | IIDR_REV(0) | IIDR_IMP(0x36b), .iidr_mask = IIDR_MATCH_ONE, @@ -1888,22 +1882,6 @@ static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) bitmap_fill(reset_cfg->features, MPAM_FEATURE_LAST); } -/* - * This is not part of mpam_init_reset_cfg() as high level callers have the - * class, and low level callers a ris. - */ -static void mpam_wa_t241_force_mbw_min_to_one(struct mpam_config *cfg, - struct mpam_props *props) -{ - u16 max_hw_value, min_hw_granule, res0_bits; - - res0_bits = 16 - props->bwa_wd; - max_hw_value = ((1 << props->bwa_wd) - 1) << res0_bits; - min_hw_granule = ~max_hw_value; - - cfg->mbw_min = min_hw_granule + 1; -} - /* * Called via smp_call_on_cpu() to prevent migration, while still being * pre-emptible. Caller must hold mpam_srcu. 
@@ -1913,14 +1891,11 @@ static int mpam_reset_ris(void *arg) struct mpam_config reset_cfg; struct mpam_msc_ris *ris = arg; struct reprogram_ris reprogram_arg; - struct mpam_msc *msc = ris->vmsc->msc; if (ris->in_reset_state) return 0; mpam_init_reset_cfg(&reset_cfg); - if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, msc)) - mpam_wa_t241_force_mbw_min_to_one(&reset_cfg, &ris->props); reprogram_arg.ris = ris; reprogram_arg.cfg = &reset_cfg; @@ -2953,19 +2928,14 @@ static void __destroy_component_cfg(struct mpam_component *comp) static void mpam_reset_component_cfg(struct mpam_component *comp) { int i; - struct mpam_class *class = comp->class; mpam_assert_partid_sizes_fixed(); if (!comp->cfg) return; - for (i = 0; i < mpam_partid_max + 1; i++) { + for (i = 0; i < mpam_partid_max + 1; i++) mpam_init_reset_cfg(&comp->cfg[i]); - if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class)) - mpam_wa_t241_force_mbw_min_to_one(&comp->cfg[i], - &class->props); - } } static int __allocate_component_cfg(struct mpam_component *comp) @@ -3398,18 +3368,6 @@ static void mpam_extend_config(struct mpam_class *class, struct mpam_config *cfg u16 min, min_hw_granule, delta; u16 max_hw_value, res0_bits; - /* - * Calculate the values the 'min' control can hold. - * e.g. on a platform with bwa_wd = 8, min_hw_granule is 0x00ff because - * those bits are RES0. Configurations of this value are effectively - * zero. But configurations need to saturate at min_hw_granule on - * systems with mismatched bwa_wd, where the 'less than 0' values are - * implemented on some MSC, but not others. - */ - res0_bits = 16 - cprops->bwa_wd; - max_hw_value = ((1 << cprops->bwa_wd) - 1) << res0_bits; - min_hw_granule = ~max_hw_value; - /* * MAX and MIN should be set together. If only one is provided, * generate a configuration for the other. 
If only one control @@ -3419,6 +3377,19 @@ static void mpam_extend_config(struct mpam_class *class, struct mpam_config *cfg */ if (mpam_has_feature(mpam_feat_mbw_max, cfg) && !mpam_has_feature(mpam_feat_mbw_min, cfg)) { + /* + * Calculate the values the 'min' control can hold. + * e.g. on a platform with bwa_wd = 8, min_hw_granule is 0x00ff + * because those bits are RES0. Configurations of this value + * are effectively zero. But configurations need to saturate + * at min_hw_granule on systems with mismatched bwa_wd, where + * the 'less than 0' values are implemented on some MSC, but + * not others. + */ + res0_bits = 16 - cprops->bwa_wd; + max_hw_value = ((1 << cprops->bwa_wd) - 1) << res0_bits; + min_hw_granule = ~max_hw_value; + delta = ((5 * MPAMCFG_MBW_MAX_MAX) / 100) - 1; if (cfg->mbw_max > delta) min = cfg->mbw_max - delta; @@ -3428,12 +3399,6 @@ static void mpam_extend_config(struct mpam_class *class, struct mpam_config *cfg cfg->mbw_min = max(min, min_hw_granule); mpam_set_feature(mpam_feat_mbw_min, cfg); } - - if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class) && - cfg->mbw_min <= min_hw_granule) { - cfg->mbw_min = min_hw_granule + 1; - mpam_set_feature(mpam_feat_mbw_min, cfg); - } } int mpam_apply_config(struct mpam_component *comp, u16 partid, diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 1f82a2183c1c1..8147ce91860ee 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -259,7 +259,6 @@ struct mpam_props { /* Workaround bits for msc->quirks */ enum mpam_device_quirks { T241_SCRUB_SHADOW_REGS, - T241_FORCE_MBW_MIN_TO_ONE, T241_MBW_COUNTER_SCALE_64, IGNORE_CSU_NRDY, MPAM_QUIRK_LAST, From 294734f29a1533f8a9cf913647ff6a89c7869304 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 5 Dec 2025 21:58:59 +0000 Subject: [PATCH 12/49] NVIDIA: SAUCE: arm_mpam: Add workaround for T241-MPAM-4 In the T241 implementation of memory-bandwidth partitioning, in the absence of contention for 
bandwidth, the minimum bandwidth setting can affect the amount of achieved bandwidth. Specifically, the achieved bandwidth in the absence of contention can settle to any value between the values of MPAMCFG_MBW_MIN and MPAMCFG_MBW_MAX. Also, if MPAMCFG_MBW_MIN is set to zero (below 0.78125%), once a core enters a throttled state, it will never leave that state. The first issue is not a concern if the MPAM software allows programming MPAMCFG_MBW_MIN through the sysfs interface. This patch ensures MBW_MIN=1 (0.78125%) is programmed whenever MPAMCFG_MBW_MIN=0 is requested. In the scenario where the resctrl doesn't support the MBW_MIN interface via sysfs, to achieve bandwidth closer to MBW_MAX in the absence of contention, software should configure a relatively narrow gap between MBW_MIN and MBW_MAX. The recommendation is to use a 5% gap to mitigate the problem. Clear the MBW_MIN feature from the class to ensure we don't accidentally change behaviour when resctrl adds support for an MBW_MIN interface. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Signed-off-by: Shanker Donthineni Signed-off-by: James Morse Signed-off-by: Ben Horgan (backported from de0a00982d0aefb3d94828e908179aca02feaa85 https://gitlab.arm.com/linux-arm/linux-bh/-/tree/mpam_resctrl_glue_v6?ref_type=heads) [fenghuay: Changes are: 1. Fix minor conflicts in struct mpam_quirks, enum mpam_device_quirks, mpam_reprogram_ris_partid(), and silicon-errata.rst. 2. 
Fix compilation issues in iidr and iidr_mask in mpam_quirks[]] Signed-off-by: Fenghua Yu --- Documentation/arch/arm64/silicon-errata.rst | 2 + drivers/resctrl/mpam_devices.c | 55 +++++++++++++++++++-- drivers/resctrl/mpam_internal.h | 1 + 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index 4b1076311455c..3667650036fba 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -251,6 +251,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 MPAM | T241-MPAM-1 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-4 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 MPAM | T241-MPAM-6 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 7ca2a1c4a13a2..b7643c76025e7 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -877,6 +877,12 @@ static const struct mpam_quirk mpam_quirks[] = { .workaround = T241_SCRUB_SHADOW_REGS, }, { + /* NVIDIA t241 erratum T241-MPAM-4 */ + .iidr = IIDR_PROD(0x241) | IIDR_VAR(0) | IIDR_REV(0) | IIDR_IMP(0x36b), + .iidr_mask = IIDR_MATCH_ONE, + .workaround = T241_FORCE_MBW_MIN_TO_ONE, + }, + { /* NVIDIA t241 erratum T241-MPAM-6 */ .iidr = IIDR_PROD(0x241) | IIDR_VAR(0) | IIDR_REV(0) | IIDR_IMP(0x36b), .iidr_mask = IIDR_MATCH_ONE, @@ -1672,6 +1678,37 @@ static void mpam_quirk_post_config_change(struct mpam_msc_ris *ris, u16 partid, mpam_apply_t241_erratum(ris, partid); } +static u16 mpam_wa_t241_force_mbw_min_to_one(struct mpam_props *props) +{ + u16 
max_hw_value, min_hw_granule, res0_bits; + + res0_bits = 16 - props->bwa_wd; + max_hw_value = ((1 << props->bwa_wd) - 1) << res0_bits; + min_hw_granule = ~max_hw_value; + + return min_hw_granule + 1; +} + +static u16 mpam_wa_t241_calc_min_from_max(struct mpam_props *props, + struct mpam_config *cfg) +{ + u16 val = 0; + u16 max; + u16 delta = ((5 * MPAMCFG_MBW_MAX_MAX) / 100) - 1; + + if (mpam_has_feature(mpam_feat_mbw_max, cfg)) { + max = cfg->mbw_max; + } else { + /* Resetting. Hence, use the ris specific default. */ + max = GENMASK(15, 16 - props->bwa_wd); + } + + if (max > delta) + val = max - delta; + + return val; +} + /* Called via IPI. Call while holding an SRCU reference */ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, struct mpam_config *cfg) @@ -1716,9 +1753,18 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm); } - if (mpam_has_feature(mpam_feat_mbw_min, rprops) && - mpam_has_feature(mpam_feat_mbw_min, cfg)) - mpam_write_partsel_reg(msc, MBW_MIN, cfg->mbw_min); + if (mpam_has_feature(mpam_feat_mbw_min, rprops)) { + u16 val = 0; + + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, msc)) { + u16 min = mpam_wa_t241_force_mbw_min_to_one(rprops); + + val = mpam_wa_t241_calc_min_from_max(rprops, cfg); + val = max(val, min); + } + + mpam_write_partsel_reg(msc, MBW_MIN, val); + } if (mpam_has_feature(mpam_feat_mbw_max, rprops) && mpam_has_feature(mpam_feat_mbw_max, cfg)) @@ -2699,6 +2745,9 @@ static void mpam_enable_merge_class_features(struct mpam_component *comp) list_for_each_entry(vmsc, &comp->vmsc, comp_list) __class_props_mismatch(class, vmsc); + + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class)) + mpam_clear_feature(mpam_feat_mbw_min, &class->props); } /* diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 8147ce91860ee..1f82a2183c1c1 100644 --- a/drivers/resctrl/mpam_internal.h +++ 
b/drivers/resctrl/mpam_internal.h @@ -259,6 +259,7 @@ struct mpam_props { /* Workaround bits for msc->quirks */ enum mpam_device_quirks { T241_SCRUB_SHADOW_REGS, + T241_FORCE_MBW_MIN_TO_ONE, T241_MBW_COUNTER_SCALE_64, IGNORE_CSU_NRDY, MPAM_QUIRK_LAST, From 1b51f4e1fa0192d02514bec93a7b40b6358142b3 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:48 -0800 Subject: [PATCH 13/49] x86,fs/resctrl: Improve domain type checking Every resctrl resource has a list of domain structures. struct rdt_ctrl_domain and struct rdt_mon_domain both begin with struct rdt_domain_hdr with rdt_domain_hdr::type used in validity checks before accessing the domain of a particular type. Add the resource id to struct rdt_domain_hdr in preparation for a new monitoring domain structure that will be associated with a new monitoring resource. Improve existing domain validity checks with a new helper domain_header_is_valid() that checks both domain type and resource id. domain_header_is_valid() should be used before every call to container_of() that accesses a domain structure. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Shanker Donthineni (backported from commit 03eb578b37659e10bed14c2d9e7cc45dfe24123b) [fenghuay: fix a minor conflict in struct rdt_domain_hdr] Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 10 ++++++---- fs/resctrl/ctrlmondata.c | 2 +- include/linux/resctrl.h | 9 +++++++++ 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 42fcc9d7ff7a2..1df5b8205ef4d 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -459,7 +459,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos); if (hdr) { - if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) + if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid)) return; d = container_of(hdr, struct rdt_ctrl_domain, hdr); @@ -476,6 +476,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; d->hdr.type = RESCTRL_CTRL_DOMAIN; + d->hdr.rid = r->rid; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); rdt_domain_reconfigure_cdp(r); @@ -515,7 +516,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos); if (hdr) { - if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, r->rid)) return; d = container_of(hdr, struct rdt_mon_domain, hdr); @@ -533,6 +534,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; d->hdr.type = RESCTRL_MON_DOMAIN; + d->hdr.rid = r->rid; ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); if (!ci) { pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name); @@ -593,7 +595,7 @@ static 
void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r) return; } - if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) + if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid)) return; d = container_of(hdr, struct rdt_ctrl_domain, hdr); @@ -639,7 +641,7 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) return; } - if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, r->rid)) return; d = container_of(hdr, struct rdt_mon_domain, hdr); diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index c3688cbe0ff5c..8944295c4030c 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -647,7 +647,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) * the resource to find the domain with "domid". */ hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); - if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { + if (!hdr || !domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, resid)) { ret = -ENOENT; goto out; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 055f27045b4da..3b9c158d150d9 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -134,15 +134,24 @@ enum resctrl_domain_type { * @list: all instances of this resource * @id: unique id for this instance * @type: type of this instance + * @rid: resource id for this instance * @cpu_mask: which CPUs share this resource */ struct rdt_domain_hdr { struct list_head list; u32 id; enum resctrl_domain_type type; + enum resctrl_res_level rid; struct cpumask cpu_mask; }; +static inline bool domain_header_is_valid(struct rdt_domain_hdr *hdr, + enum resctrl_domain_type type, + enum resctrl_res_level rid) +{ + return !WARN_ON_ONCE(hdr->type != type || hdr->rid != rid); +} + /** * struct rdt_ctrl_domain - group of CPUs sharing a resctrl control resource * @hdr: common header for different domain types From 6b650718b057bde34ee90c178b4fc6110271d81a Mon Sep 17 00:00:00 2001 
From: Tony Luck Date: Wed, 17 Dec 2025 09:20:49 -0800 Subject: [PATCH 14/49] x86/resctrl: Move L3 initialization into new helper function Carve out the resource monitoring domain init code into a separate helper in order to be able to initialize new types of monitoring domains besides the usual L3 ones. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit 0d6447623d788806b5504182032a0837ffa2174c) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 64 ++++++++++++++++-------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 1df5b8205ef4d..c3b2e77622512 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -496,37 +496,13 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) } } -static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) +static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct list_head *add_pos) { - int id = get_domain_id_from_scope(cpu, r->mon_scope); - struct list_head *add_pos = NULL; struct rdt_hw_mon_domain *hw_dom; - struct rdt_domain_hdr *hdr; struct rdt_mon_domain *d; struct cacheinfo *ci; int err; - lockdep_assert_held(&domain_list_lock); - - if (id < 0) { - pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n", - cpu, r->mon_scope, r->name); - return; - } - - hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos); - if (hdr) { - if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, r->rid)) - return; - d = container_of(hdr, struct rdt_mon_domain, hdr); - - cpumask_set_cpu(cpu, &d->hdr.cpu_mask); - /* Update the mbm_assign_mode state for the CPU if supported */ - if (r->mon.mbm_cntr_assignable) - resctrl_arch_mbm_cntr_assign_set_one(r); - return; - } - 
hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu)); if (!hw_dom) return; @@ -534,7 +510,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; d->hdr.type = RESCTRL_MON_DOMAIN; - d->hdr.rid = r->rid; + d->hdr.rid = RDT_RESOURCE_L3; ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); if (!ci) { pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name); @@ -544,10 +520,6 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d->ci_id = ci->id; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); - /* Update the mbm_assign_mode state for the CPU if supported */ - if (r->mon.mbm_cntr_assignable) - resctrl_arch_mbm_cntr_assign_set_one(r); - arch_mon_domain_online(r, d); if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { @@ -565,6 +537,38 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) } } +static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) +{ + int id = get_domain_id_from_scope(cpu, r->mon_scope); + struct list_head *add_pos = NULL; + struct rdt_domain_hdr *hdr; + + lockdep_assert_held(&domain_list_lock); + + if (id < 0) { + pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->mon_scope, r->name); + return; + } + + hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos); + if (hdr) + cpumask_set_cpu(cpu, &hdr->cpu_mask); + + switch (r->rid) { + case RDT_RESOURCE_L3: + /* Update the mbm_assign_mode state for the CPU if supported */ + if (r->mon.mbm_cntr_assignable) + resctrl_arch_mbm_cntr_assign_set_one(r); + if (!hdr) + l3_mon_domain_setup(cpu, id, r, add_pos); + break; + default: + pr_warn_once("Unknown resource rid=%d\n", r->rid); + break; + } +} + static void domain_add_cpu(int cpu, struct rdt_resource *r) { if (r->alloc_capable) From 2f17b934ee8e7d48d09486403c4a96d4a213bc74 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:50 -0800 Subject: [PATCH 15/49] x86/resctrl: Refactor 
domain_remove_cpu_mon() ready for new domain types New telemetry events will be associated with a new package scoped resource with a new domain structure. Refactor domain_remove_cpu_mon() so all the L3 domain processing is separate from the general domain action of clearing the CPU bit in the mask. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit 6396fc5351ea9130a72f6a2fc58eb7298ce6c15a) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index c3b2e77622512..66aab504882f0 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -626,9 +626,7 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r) static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) { int id = get_domain_id_from_scope(cpu, r->mon_scope); - struct rdt_hw_mon_domain *hw_dom; struct rdt_domain_hdr *hdr; - struct rdt_mon_domain *d; lockdep_assert_held(&domain_list_lock); @@ -645,20 +643,29 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) return; } - if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, r->rid)) + cpumask_clear_cpu(cpu, &hdr->cpu_mask); + if (!cpumask_empty(&hdr->cpu_mask)) return; - d = container_of(hdr, struct rdt_mon_domain, hdr); - hw_dom = resctrl_to_arch_mon_dom(d); + switch (r->rid) { + case RDT_RESOURCE_L3: { + struct rdt_hw_mon_domain *hw_dom; + struct rdt_mon_domain *d; - cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); - if (cpumask_empty(&d->hdr.cpu_mask)) { + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return; + + d = container_of(hdr, struct rdt_mon_domain, hdr); + hw_dom = resctrl_to_arch_mon_dom(d); 
resctrl_offline_mon_domain(r, d); - list_del_rcu(&d->hdr.list); + list_del_rcu(&hdr->list); synchronize_rcu(); mon_domain_free(hw_dom); - - return; + break; + } + default: + pr_warn_once("Unknown resource rid=%d\n", r->rid); + break; } } From 1eac33a0609634fd8b08c2ea55379212821044ea Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:51 -0800 Subject: [PATCH 16/49] x86/resctrl: Clean up domain_remove_cpu_ctrl() For symmetry with domain_remove_cpu_mon() refactor domain_remove_cpu_ctrl() to take an early return when removing a CPU does not empty the domain. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit c1b630573c8ca51a89bd480f7eeaf8754c7609f2) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 66aab504882f0..3aa89ef34a69f 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -599,28 +599,27 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r) return; } + cpumask_clear_cpu(cpu, &hdr->cpu_mask); + if (!cpumask_empty(&hdr->cpu_mask)) + return; + if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid)) return; d = container_of(hdr, struct rdt_ctrl_domain, hdr); hw_dom = resctrl_to_arch_ctrl_dom(d); - cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); - if (cpumask_empty(&d->hdr.cpu_mask)) { - resctrl_offline_ctrl_domain(r, d); - list_del_rcu(&d->hdr.list); - synchronize_rcu(); - - /* - * rdt_ctrl_domain "d" is going to be freed below, so clear - * its pointer from pseudo_lock_region struct. 
- */ - if (d->plr) - d->plr->d = NULL; - ctrl_domain_free(hw_dom); + resctrl_offline_ctrl_domain(r, d); + list_del_rcu(&hdr->list); + synchronize_rcu(); - return; - } + /* + * rdt_ctrl_domain "d" is going to be freed below, so clear + * its pointer from pseudo_lock_region struct. + */ + if (d->plr) + d->plr->d = NULL; + ctrl_domain_free(hw_dom); } static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) From a0d06954a456e9755f03ddae83e05cd879876253 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:52 -0800 Subject: [PATCH 17/49] x86,fs/resctrl: Refactor domain create/remove using struct rdt_domain_hdr Up until now, all monitoring events were associated with the L3 resource and it made sense to use the L3 specific "struct rdt_mon_domain *" argument to functions operating on domains. Telemetry events will be tied to a new resource with its instances represented by a new domain structure that, just like struct rdt_mon_domain, starts with the generic struct rdt_domain_hdr. Prepare to support domains belonging to different resources by changing the calling convention of functions operating on domains. Pass the generic header and use that to find the domain specific structure where needed. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit 97fec06d35b2c1ce6d80cf3b01bfddd82c720a2d) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 4 +- fs/resctrl/ctrlmondata.c | 14 ++++-- fs/resctrl/internal.h | 2 +- fs/resctrl/rdtgroup.c | 69 +++++++++++++++++++++--------- include/linux/resctrl.h | 4 +- 5 files changed, 63 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 3aa89ef34a69f..a232ddd09d03f 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -529,7 +529,7 @@ static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct list_add_tail_rcu(&d->hdr.list, add_pos); - err = resctrl_online_mon_domain(r, d); + err = resctrl_online_mon_domain(r, &d->hdr); if (err) { list_del_rcu(&d->hdr.list); synchronize_rcu(); @@ -656,7 +656,7 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) d = container_of(hdr, struct rdt_mon_domain, hdr); hw_dom = resctrl_to_arch_mon_dom(d); - resctrl_offline_mon_domain(r, d); + resctrl_offline_mon_domain(r, hdr); list_del_rcu(&hdr->list); synchronize_rcu(); mon_domain_free(hw_dom); diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 8944295c4030c..ba24374587345 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -545,14 +545,21 @@ struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, } void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + struct rdt_domain_hdr *hdr, struct rdtgroup *rdtgrp, cpumask_t *cpumask, int evtid, int first) { + struct rdt_mon_domain *d = NULL; int cpu; /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); + if (hdr) 
{ + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return; + d = container_of(hdr, struct rdt_mon_domain, hdr); + } + /* * Setup the parameters to pass to mon_event_count() to read the data. */ @@ -647,12 +654,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) * the resource to find the domain with "domid". */ hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); - if (!hdr || !domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, resid)) { + if (!hdr) { ret = -ENOENT; goto out; } - d = container_of(hdr, struct rdt_mon_domain, hdr); - mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); + mon_event_read(&rr, r, hdr, rdtgrp, &hdr->cpu_mask, evtid, false); } checkresult: diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 24f340f5f4de0..b43c5807dcb82 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -370,7 +370,7 @@ void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + struct rdt_domain_hdr *hdr, struct rdtgroup *rdtgrp, cpumask_t *cpumask, int evtid, int first); int resctrl_mon_resource_init(void); diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 48ecde1ca9fd2..4d0d562a85efc 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -3422,17 +3422,22 @@ static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subn * when last domain being summed is removed. 
*/ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) + struct rdt_domain_hdr *hdr) { struct rdtgroup *prgrp, *crgrp; + struct rdt_mon_domain *d; char subname[32]; bool snc_mode; char name[32]; + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return; + + d = container_of(hdr, struct rdt_mon_domain, hdr); snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id); + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : hdr->id); if (snc_mode) - sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); + sprintf(subname, "mon_sub_%s_%02d", r->name, hdr->id); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); @@ -3442,15 +3447,20 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, } } -static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, +static int mon_add_all_files(struct kernfs_node *kn, struct rdt_domain_hdr *hdr, struct rdt_resource *r, struct rdtgroup *prgrp, bool do_sum) { struct rmid_read rr = {0}; + struct rdt_mon_domain *d; struct mon_data *priv; struct mon_evt *mevt; int ret, domid; + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return -EINVAL; + + d = container_of(hdr, struct rdt_mon_domain, hdr); for_each_mon_event(mevt) { if (mevt->rid != r->rid || !mevt->enabled) continue; @@ -3464,23 +3474,28 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, return ret; if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) - mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); + mon_event_read(&rr, r, hdr, prgrp, &hdr->cpu_mask, mevt->evtid, true); } return 0; } static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, - struct rdt_mon_domain *d, + struct rdt_domain_hdr *hdr, struct rdt_resource *r, struct rdtgroup *prgrp) { struct kernfs_node *kn, 
*ckn; + struct rdt_mon_domain *d; char name[32]; bool snc_mode; int ret = 0; lockdep_assert_held(&rdtgroup_mutex); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return -EINVAL; + + d = container_of(hdr, struct rdt_mon_domain, hdr); snc_mode = r->mon_scope == RESCTRL_L3_NODE; sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id); kn = kernfs_find_and_get(parent_kn, name); @@ -3498,13 +3513,13 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, ret = rdtgroup_kn_set_ugid(kn); if (ret) goto out_destroy; - ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); + ret = mon_add_all_files(kn, hdr, r, prgrp, snc_mode); if (ret) goto out_destroy; } if (snc_mode) { - sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); + sprintf(name, "mon_sub_%s_%02d", r->name, hdr->id); ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); if (IS_ERR(ckn)) { ret = -EINVAL; @@ -3515,7 +3530,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, if (ret) goto out_destroy; - ret = mon_add_all_files(ckn, d, r, prgrp, false); + ret = mon_add_all_files(ckn, hdr, r, prgrp, false); if (ret) goto out_destroy; } @@ -3533,7 +3548,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, * and "monitor" groups with given domain id. 
*/ static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) + struct rdt_domain_hdr *hdr) { struct kernfs_node *parent_kn; struct rdtgroup *prgrp, *crgrp; @@ -3541,12 +3556,12 @@ static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { parent_kn = prgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, prgrp); + mkdir_mondata_subdir(parent_kn, hdr, r, prgrp); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) { parent_kn = crgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, crgrp); + mkdir_mondata_subdir(parent_kn, hdr, r, crgrp); } } } @@ -3555,14 +3570,14 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, struct rdt_resource *r, struct rdtgroup *prgrp) { - struct rdt_mon_domain *dom; + struct rdt_domain_hdr *hdr; int ret; /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - list_for_each_entry(dom, &r->mon_domains, hdr.list) { - ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + list_for_each_entry(hdr, &r->mon_domains, list) { + ret = mkdir_mondata_subdir(parent_kn, hdr, r, prgrp); if (ret) return ret; } @@ -4448,16 +4463,23 @@ void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain mutex_unlock(&rdtgroup_mutex); } -void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr) { + struct rdt_mon_domain *d; + mutex_lock(&rdtgroup_mutex); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + goto out_unlock; + + d = container_of(hdr, struct rdt_mon_domain, hdr); + /* * If resctrl is mounted, remove all the * per domain monitor data directories. 
*/ if (resctrl_mounted && resctrl_arch_mon_capable()) - rmdir_mondata_subdir_allrdtgrp(r, d); + rmdir_mondata_subdir_allrdtgrp(r, hdr); if (resctrl_is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); @@ -4475,7 +4497,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d } domain_destroy_mon_state(d); - +out_unlock: mutex_unlock(&rdtgroup_mutex); } @@ -4548,12 +4570,17 @@ int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d return err; } -int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr) { - int err; + struct rdt_mon_domain *d; + int err = -EINVAL; mutex_lock(&rdtgroup_mutex); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + goto out_unlock; + + d = container_of(hdr, struct rdt_mon_domain, hdr); err = domain_setup_mon_state(r, d); if (err) goto out_unlock; @@ -4574,7 +4601,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) * If resctrl is mounted, add per domain monitor data directories. 
*/ if (resctrl_mounted && resctrl_arch_mon_capable()) - mkdir_mondata_subdir_allrdtgrp(r, d); + mkdir_mondata_subdir_allrdtgrp(r, hdr); out_unlock: mutex_unlock(&rdtgroup_mutex); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 3b9c158d150d9..3bcd3a9a099d2 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -550,9 +550,9 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type); int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d); -int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d); +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr); void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d); -void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d); +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr); void resctrl_online_cpu(unsigned int cpu); void resctrl_offline_cpu(unsigned int cpu); From 08acebe0926eddc043aee440bb2fb6cfcca1d59c Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:53 -0800 Subject: [PATCH 18/49] fs/resctrl: Split L3 dependent parts out of __mon_event_count() Carve out the L3 resource specific event reading code into a separate helper to support reading event data from a new monitoring resource. 
Suggested-by: Reinette Chatre Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit ad5c2ff75e0c53d2588dfc10eb87458e759b6bbe) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- fs/resctrl/monitor.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 2a50109cf7c91..047967532ce12 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -440,7 +440,7 @@ static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id) memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg)); } -static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) +static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { int cpu = smp_processor_id(); u32 closid = rdtgrp->closid; @@ -521,6 +521,17 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) return ret; } +static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) +{ + switch (rr->r->rid) { + case RDT_RESOURCE_L3: + return __l3_mon_event_count(rdtgrp, rr); + default: + rr->err = -EINVAL; + return -EINVAL; + } +} + /* * mbm_bw_count() - Update bw count from values previously read by * __mon_event_count(). From 7e11828cc8dcd41818c01ca85a197d8129675a4d Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:54 -0800 Subject: [PATCH 19/49] x86,fs/resctrl: Use struct rdt_domain_hdr when reading counters Convert the whole call sequence from mon_event_read() to resctrl_arch_rmid_read() to pass resource independent struct rdt_domain_hdr instead of an L3 specific domain structure to prepare for monitoring events in other resources. This additional layer of indirection obscures which aspects of event counting depend on a valid domain. 
Event initialization, support for assignable counters, and normal event counting implicitly depend on a valid domain while summing of domains does not. Split summing domains from the core event counting handling to make their respective dependencies obvious. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Shanker Donthineni (backported from commit 6b10cf7b6ea857cdf9570e21c077a05803f60575) [fenghuay: fix a minor conflict in __check_limbo()] Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/monitor.c | 12 +++- fs/resctrl/ctrlmondata.c | 9 +-- fs/resctrl/internal.h | 18 +++--- fs/resctrl/monitor.c | 85 ++++++++++++++++++--------- include/linux/resctrl.h | 4 +- 5 files changed, 78 insertions(+), 50 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index fe1a2aa53c16a..982dcf23183cb 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -238,19 +238,25 @@ static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d, return chunks * hw_res->mon_scale; } -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 unused, u32 rmid, enum resctrl_event_id eventid, u64 *val, void *ignored) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - int cpu = cpumask_any(&d->hdr.cpu_mask); + struct rdt_hw_mon_domain *hw_dom; struct arch_mbm_state *am; + struct rdt_mon_domain *d; u64 msr_val; u32 prmid; + int cpu; int ret; resctrl_arch_rmid_read_context_check(); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return -EINVAL; + d = container_of(hdr, struct rdt_mon_domain, hdr); + hw_dom = resctrl_to_arch_mon_dom(d); + cpu = cpumask_any(&hdr->cpu_mask); prmid = logical_rmid_to_physical_rmid(cpu, 
rmid); ret = __rmid_read_phys(prmid, eventid, &msr_val); diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index ba24374587345..7df4bf1835296 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -548,25 +548,18 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain_hdr *hdr, struct rdtgroup *rdtgrp, cpumask_t *cpumask, int evtid, int first) { - struct rdt_mon_domain *d = NULL; int cpu; /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - if (hdr) { - if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) - return; - d = container_of(hdr, struct rdt_mon_domain, hdr); - } - /* * Setup the parameters to pass to mon_event_count() to read the data. */ rr->rgrp = rdtgrp; rr->evtid = evtid; rr->r = r; - rr->d = d; + rr->hdr = hdr; rr->first = first; if (resctrl_arch_mbm_cntr_assign_enabled(r) && resctrl_is_mbm_event(evtid)) { diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index b43c5807dcb82..141031ebb706d 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -107,24 +107,26 @@ struct mon_data { * resource group then its event count is summed with the count from all * its child resource groups. * @r: Resource describing the properties of the event being read. - * @d: Domain that the counter should be read from. If NULL then sum all - * domains in @r sharing L3 @ci.id + * @hdr: Header of domain that the counter should be read from. If NULL then + * sum all domains in @r sharing L3 @ci.id * @evtid: Which monitor event to read. * @first: Initialize MBM counter when true. - * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @ci: Cacheinfo for L3. Only set when @hdr is NULL. Used when summing + * domains. * @is_mbm_cntr: true if "mbm_event" counter assignment mode is enabled and it * is an MBM event. * @err: Error encountered when reading counter. - * @val: Returned value of event counter. 
If @rgrp is a parent resource group, - * @val includes the sum of event counts from its child resource groups. - * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, - * (summed across child resource groups if @rgrp is a parent resource group). + * @val: Returned value of event counter. If @rgrp is a parent resource + * group, @val includes the sum of event counts from its child + * resource groups. If @hdr is NULL, @val includes the sum of all + * domains in @r sharing @ci.id, (summed across child resource groups + * if @rgrp is a parent resource group). * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). */ struct rmid_read { struct rdtgroup *rgrp; struct rdt_resource *r; - struct rdt_mon_domain *d; + struct rdt_domain_hdr *hdr; enum resctrl_event_id evtid; bool first; struct cacheinfo *ci; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 047967532ce12..dcc4d9ea720f5 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -180,7 +180,7 @@ void __check_limbo(struct rdt_mon_domain *d, bool force_free) entry = __rmid_entry(idx); if (!entry) break; - if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, + if (resctrl_arch_rmid_read(r, &d->hdr, entry->closid, entry->rmid, QOS_L3_OCCUP_EVENT_ID, &val, arch_mon_ctx)) { rmid_dirty = true; @@ -448,11 +448,16 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) struct rdt_mon_domain *d; int cntr_id = -ENOENT; struct mbm_state *m; - int err, ret; u64 tval = 0; + if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) { + rr->err = -EIO; + return -EINVAL; + } + d = container_of(rr->hdr, struct rdt_mon_domain, hdr); + if (rr->is_mbm_cntr) { - cntr_id = mbm_cntr_get(rr->r, rr->d, rdtgrp, rr->evtid); + cntr_id = mbm_cntr_get(rr->r, d, rdtgrp, rr->evtid); if (cntr_id < 0) { rr->err = -ENOENT; return -EINVAL; @@ -461,31 +466,50 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read 
*rr) if (rr->first) { if (rr->is_mbm_cntr) - resctrl_arch_reset_cntr(rr->r, rr->d, closid, rmid, cntr_id, rr->evtid); + resctrl_arch_reset_cntr(rr->r, d, closid, rmid, cntr_id, rr->evtid); else - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + resctrl_arch_reset_rmid(rr->r, d, closid, rmid, rr->evtid); + m = get_mbm_state(d, closid, rmid, rr->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); return 0; } - if (rr->d) { - /* Reading a single domain, must be on a CPU in that domain. */ - if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) - return -EINVAL; - if (rr->is_mbm_cntr) - rr->err = resctrl_arch_cntr_read(rr->r, rr->d, closid, rmid, cntr_id, - rr->evtid, &tval); - else - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); - if (rr->err) - return rr->err; + /* Reading a single domain, must be on a CPU in that domain. */ + if (!cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) + return -EINVAL; + if (rr->is_mbm_cntr) + rr->err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id, + rr->evtid, &tval); + else + rr->err = resctrl_arch_rmid_read(rr->r, rr->hdr, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->err) + return rr->err; - rr->val += tval; + rr->val += tval; - return 0; + return 0; +} + +static int __l3_mon_event_count_sum(struct rdtgroup *rdtgrp, struct rmid_read *rr) +{ + int cpu = smp_processor_id(); + u32 closid = rdtgrp->closid; + u32 rmid = rdtgrp->mon.rmid; + struct rdt_mon_domain *d; + u64 tval = 0; + int err, ret; + + /* + * Summing across domains is only done for systems that implement + * Sub-NUMA Cluster. There is no overlap with systems that support + * assignable counters. + */ + if (rr->is_mbm_cntr) { + pr_warn_once("Summing domains using assignable counters is not supported\n"); + rr->err = -EINVAL; + return -EINVAL; } /* Summing domains that share a cache, must be on a CPU for that cache. 
*/ @@ -503,12 +527,8 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { if (d->ci_id != rr->ci->id) continue; - if (rr->is_mbm_cntr) - err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id, - rr->evtid, &tval); - else - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + err = resctrl_arch_rmid_read(rr->r, &d->hdr, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); if (!err) { rr->val += tval; ret = 0; @@ -525,7 +545,10 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { switch (rr->r->rid) { case RDT_RESOURCE_L3: - return __l3_mon_event_count(rdtgrp, rr); + if (rr->hdr) + return __l3_mon_event_count(rdtgrp, rr); + else + return __l3_mon_event_count_sum(rdtgrp, rr); default: rr->err = -EINVAL; return -EINVAL; @@ -549,9 +572,13 @@ static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) u64 cur_bw, bytes, cur_bytes; u32 closid = rdtgrp->closid; u32 rmid = rdtgrp->mon.rmid; + struct rdt_mon_domain *d; struct mbm_state *m; - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return; + d = container_of(rr->hdr, struct rdt_mon_domain, hdr); + m = get_mbm_state(d, closid, rmid, rr->evtid); if (WARN_ON_ONCE(!m)) return; @@ -724,7 +751,7 @@ static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain * struct rmid_read rr = {0}; rr.r = r; - rr.d = d; + rr.hdr = &d->hdr; rr.evtid = evtid; if (resctrl_arch_mbm_cntr_assign_enabled(r)) { rr.is_mbm_cntr = true; diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 3bcd3a9a099d2..7d24f92e91c5a 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -560,7 +560,7 @@ void resctrl_offline_cpu(unsigned int cpu); * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid * for this resource and domain. 
* @r: resource that the counter should be read from. - * @d: domain that the counter should be read from. + * @hdr: Header of domain that the counter should be read from. * @closid: closid that matches the rmid. Depending on the architecture, the * counter may match traffic of both @closid and @rmid, or @rmid * only. @@ -581,7 +581,7 @@ void resctrl_offline_cpu(unsigned int cpu); * Return: * 0 on success, or -EIO, -EINVAL etc on error. */ -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 closid, u32 rmid, enum resctrl_event_id eventid, u64 *val, void *arch_mon_ctx); From a1dffa925aa79ed98b9bd7dc6dbb3bf105b88f3b Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:55 -0800 Subject: [PATCH 20/49] x86,fs/resctrl: Rename struct rdt_mon_domain and rdt_hw_mon_domain The upcoming telemetry event monitoring is not tied to the L3 resource and will have a new domain structure. Rename the L3 resource specific domain data structures to include "l3_" in their names to avoid confusion between the different resource specific domain structures: rdt_mon_domain -> rdt_l3_mon_domain rdt_hw_mon_domain -> rdt_hw_l3_mon_domain No functional change. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit 4bc3ef46ff41d5e7ba557e56e9cd2031527cd7f8) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 14 +++--- arch/x86/kernel/cpu/resctrl/internal.h | 16 +++--- arch/x86/kernel/cpu/resctrl/monitor.c | 36 ++++++------- fs/resctrl/ctrlmondata.c | 2 +- fs/resctrl/internal.h | 8 +-- fs/resctrl/monitor.c | 70 +++++++++++++------------- fs/resctrl/rdtgroup.c | 40 +++++++-------- include/linux/resctrl.h | 22 ++++---- 8 files changed, 104 insertions(+), 104 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index a232ddd09d03f..f11b981882ef4 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -363,7 +363,7 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) kfree(hw_dom); } -static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom) +static void mon_domain_free(struct rdt_hw_l3_mon_domain *hw_dom) { int idx; @@ -400,7 +400,7 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain * * @num_rmid: The size of the MBM counter array * @hw_dom: The domain that owns the allocated arrays */ -static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) +static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_l3_mon_domain *hw_dom) { size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]); enum resctrl_event_id eventid; @@ -498,8 +498,8 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct list_head *add_pos) { - struct rdt_hw_mon_domain *hw_dom; - struct rdt_mon_domain *d; + struct rdt_hw_l3_mon_domain *hw_dom; + struct rdt_l3_mon_domain *d; struct cacheinfo *ci; int err; @@ -648,13 +648,13 @@ static void 
domain_remove_cpu_mon(int cpu, struct rdt_resource *r) switch (r->rid) { case RDT_RESOURCE_L3: { - struct rdt_hw_mon_domain *hw_dom; - struct rdt_mon_domain *d; + struct rdt_hw_l3_mon_domain *hw_dom; + struct rdt_l3_mon_domain *d; if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return; - d = container_of(hdr, struct rdt_mon_domain, hdr); + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); hw_dom = resctrl_to_arch_mon_dom(d); resctrl_offline_mon_domain(r, hdr); list_del_rcu(&hdr->list); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 9f4c2f0aaf5c8..c9871b34995f9 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -60,17 +60,17 @@ struct rdt_hw_ctrl_domain { }; /** - * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share - * a resource for a monitor function - * @d_resctrl: Properties exposed to the resctrl file system + * struct rdt_hw_l3_mon_domain - Arch private attributes of a set of CPUs sharing + * RDT_RESOURCE_L3 monitoring + * @d_resctrl: Properties exposed to the resctrl file system * @arch_mbm_states: Per-event pointer to the MBM event's saved state. * An MBM event's state is an array of struct arch_mbm_state * indexed by RMID on x86. * * Members of this structure are accessed via helpers that provide abstraction. 
*/ -struct rdt_hw_mon_domain { - struct rdt_mon_domain d_resctrl; +struct rdt_hw_l3_mon_domain { + struct rdt_l3_mon_domain d_resctrl; struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS]; }; @@ -79,9 +79,9 @@ static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctr return container_of(r, struct rdt_hw_ctrl_domain, d_resctrl); } -static inline struct rdt_hw_mon_domain *resctrl_to_arch_mon_dom(struct rdt_mon_domain *r) +static inline struct rdt_hw_l3_mon_domain *resctrl_to_arch_mon_dom(struct rdt_l3_mon_domain *r) { - return container_of(r, struct rdt_hw_mon_domain, d_resctrl); + return container_of(r, struct rdt_hw_l3_mon_domain, d_resctrl); } /** @@ -135,7 +135,7 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r extern struct rdt_hw_resource rdt_resources_all[]; -void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d); +void arch_mon_domain_online(struct rdt_resource *r, struct rdt_l3_mon_domain *d); /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ union cpuid_0x10_1_eax { diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 982dcf23183cb..8b293fc4e9461 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -109,7 +109,7 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) * * In RMID sharing mode there are fewer "logical RMID" values available * to accumulate data ("physical RMIDs" are divided evenly between SNC - * nodes that share an L3 cache). Linux creates an rdt_mon_domain for + * nodes that share an L3 cache). Linux creates an rdt_l3_mon_domain for * each SNC node. * * The value loaded into IA32_PQR_ASSOC is the "logical RMID". 
@@ -157,7 +157,7 @@ static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) return 0; } -static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom, +static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_l3_mon_domain *hw_dom, u32 rmid, enum resctrl_event_id eventid) { @@ -171,11 +171,11 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do return state ? &state[rmid] : NULL; } -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 unused, u32 rmid, enum resctrl_event_id eventid) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); int cpu = cpumask_any(&d->hdr.cpu_mask); struct arch_mbm_state *am; u32 prmid; @@ -194,9 +194,9 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, * Assumes that hardware counters are also reset and thus that there is * no need to record initial non-zero counts. 
*/ -void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); enum resctrl_event_id eventid; int idx; @@ -217,10 +217,10 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) return chunks >> shift; } -static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d, +static u64 get_corrected_val(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 rmid, enum resctrl_event_id eventid, u64 msr_val) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct arch_mbm_state *am; u64 chunks; @@ -242,9 +242,9 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 unused, u32 rmid, enum resctrl_event_id eventid, u64 *val, void *ignored) { - struct rdt_hw_mon_domain *hw_dom; + struct rdt_hw_l3_mon_domain *hw_dom; + struct rdt_l3_mon_domain *d; struct arch_mbm_state *am; - struct rdt_mon_domain *d; u64 msr_val; u32 prmid; int cpu; @@ -254,7 +254,7 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return -EINVAL; - d = container_of(hdr, struct rdt_mon_domain, hdr); + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); hw_dom = resctrl_to_arch_mon_dom(d); cpu = cpumask_any(&hdr->cpu_mask); prmid = logical_rmid_to_physical_rmid(cpu, rmid); @@ -308,11 +308,11 @@ static int __cntr_id_read(u32 cntr_id, u64 *val) return 0; } -void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 unused, u32 rmid, int cntr_id, enum 
resctrl_event_id eventid) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); struct arch_mbm_state *am; am = get_arch_mbm_state(hw_dom, rmid, eventid); @@ -324,7 +324,7 @@ void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, } } -int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 unused, u32 rmid, int cntr_id, enum resctrl_event_id eventid, u64 *val) { @@ -354,7 +354,7 @@ int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, * must adjust RMID counter numbers based on SNC node. See * logical_rmid_to_physical_rmid() for code that does this. */ -void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d) +void arch_mon_domain_online(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { if (snc_nodes_per_l3_cache > 1) msr_clear_bit(MSR_RMID_SNC_CONFIG, 0); @@ -515,7 +515,7 @@ static void resctrl_abmc_set_one_amd(void *arg) */ static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; lockdep_assert_cpus_held(); @@ -554,11 +554,11 @@ static void resctrl_abmc_config_one_amd(void *info) /* * Send an IPI to the domain to assign the counter to RMID, event pair. 
*/ -void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); union l3_qos_abmc_cfg abmc_cfg = { 0 }; struct arch_mbm_state *am; diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 7df4bf1835296..1d5a0d3bcd5fd 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -594,9 +594,9 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) struct kernfs_open_file *of = m->private; enum resctrl_res_level resid; enum resctrl_event_id evtid; + struct rdt_l3_mon_domain *d; struct rdt_domain_hdr *hdr; struct rmid_read rr = {0}; - struct rdt_mon_domain *d; struct rdtgroup *rdtgrp; int domid, cpu, ret = 0; struct rdt_resource *r; diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 141031ebb706d..a04e2b8174da9 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -377,7 +377,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, int resctrl_mon_resource_init(void); -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, +void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu); @@ -385,14 +385,14 @@ void mbm_handle_overflow(struct work_struct *work); bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, +void cqm_setup_limbo_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu); void cqm_handle_limbo(struct work_struct *work); -bool has_busy_rmid(struct rdt_mon_domain *d); +bool has_busy_rmid(struct rdt_l3_mon_domain *d); -void __check_limbo(struct rdt_mon_domain *d, bool force_free); +void __check_limbo(struct rdt_l3_mon_domain *d, bool 
force_free); void resctrl_file_fflags_init(const char *config, unsigned long fflags); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index dcc4d9ea720f5..b857e382b6383 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -149,7 +149,7 @@ static void limbo_release_entry(struct rmid_entry *entry) * decrement the count. If the busy count gets to zero on an RMID, we * free the RMID */ -void __check_limbo(struct rdt_mon_domain *d, bool force_free) +void __check_limbo(struct rdt_l3_mon_domain *d, bool force_free) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); u32 idx_limit = resctrl_arch_system_num_rmid_idx(); @@ -209,7 +209,7 @@ void __check_limbo(struct rdt_mon_domain *d, bool force_free) resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); } -bool has_busy_rmid(struct rdt_mon_domain *d) +bool has_busy_rmid(struct rdt_l3_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); @@ -310,7 +310,7 @@ int alloc_rmid(u32 closid) static void add_rmid_to_limbo(struct rmid_entry *entry) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; u32 idx; lockdep_assert_held(&rdtgroup_mutex); @@ -369,7 +369,7 @@ void free_rmid(u32 closid, u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, +static struct mbm_state *get_mbm_state(struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id evtid) { u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); @@ -389,7 +389,7 @@ static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, * Return: * Valid counter ID on success, or -ENOENT on failure. 
*/ -static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d, +static int mbm_cntr_get(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { int cntr_id; @@ -416,7 +416,7 @@ static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d, * Return: * Valid counter ID on success, or -ENOSPC on failure. */ -static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, +static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { int cntr_id; @@ -435,7 +435,7 @@ static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, /* * mbm_cntr_free() - Clear the counter ID configuration details in the domain @d. */ -static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id) +static void mbm_cntr_free(struct rdt_l3_mon_domain *d, int cntr_id) { memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg)); } @@ -445,7 +445,7 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) int cpu = smp_processor_id(); u32 closid = rdtgrp->closid; u32 rmid = rdtgrp->mon.rmid; - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; int cntr_id = -ENOENT; struct mbm_state *m; u64 tval = 0; @@ -454,7 +454,7 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) rr->err = -EIO; return -EINVAL; } - d = container_of(rr->hdr, struct rdt_mon_domain, hdr); + d = container_of(rr->hdr, struct rdt_l3_mon_domain, hdr); if (rr->is_mbm_cntr) { cntr_id = mbm_cntr_get(rr->r, d, rdtgrp, rr->evtid); @@ -497,7 +497,7 @@ static int __l3_mon_event_count_sum(struct rdtgroup *rdtgrp, struct rmid_read *r int cpu = smp_processor_id(); u32 closid = rdtgrp->closid; u32 rmid = rdtgrp->mon.rmid; - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; u64 tval = 0; int err, ret; @@ -572,12 +572,12 @@ static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) u64 
cur_bw, bytes, cur_bytes; u32 closid = rdtgrp->closid; u32 rmid = rdtgrp->mon.rmid; - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; struct mbm_state *m; if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return; - d = container_of(rr->hdr, struct rdt_mon_domain, hdr); + d = container_of(rr->hdr, struct rdt_l3_mon_domain, hdr); m = get_mbm_state(d, closid, rmid, rr->evtid); if (WARN_ON_ONCE(!m)) return; @@ -677,7 +677,7 @@ static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, * throttle MSRs already have low percentage values. To avoid * unnecessarily restricting such rdtgroups, we also increase the bandwidth. */ -static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) +static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_l3_mon_domain *dom_mbm) { u32 closid, rmid, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; @@ -745,7 +745,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); } -static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, +static void mbm_update_one_event(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { struct rmid_read rr = {0}; @@ -777,7 +777,7 @@ static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain * resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } -static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, +static void mbm_update(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp) { /* @@ -798,12 +798,12 @@ static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, void cqm_handle_limbo(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; cpus_read_lock(); 
mutex_lock(&rdtgroup_mutex); - d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); + d = container_of(work, struct rdt_l3_mon_domain, cqm_limbo.work); __check_limbo(d, false); @@ -826,7 +826,7 @@ void cqm_handle_limbo(struct work_struct *work) * @exclude_cpu: Which CPU the handler should not run on, * RESCTRL_PICK_ANY_CPU to pick any CPU. */ -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, +void cqm_setup_limbo_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); @@ -843,7 +843,7 @@ void mbm_handle_overflow(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); struct rdtgroup *prgrp, *crgrp; - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; struct list_head *head; struct rdt_resource *r; @@ -858,7 +858,7 @@ void mbm_handle_overflow(struct work_struct *work) goto out_unlock; r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - d = container_of(work, struct rdt_mon_domain, mbm_over.work); + d = container_of(work, struct rdt_l3_mon_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { mbm_update(r, d, prgrp); @@ -892,7 +892,7 @@ void mbm_handle_overflow(struct work_struct *work) * @exclude_cpu: Which CPU the handler should not run on, * RESCTRL_PICK_ANY_CPU to pick any CPU. */ -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, +void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); @@ -1150,7 +1150,7 @@ ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf * mbm_cntr_free_all() - Clear all the counter ID configuration details in the * domain @d. Called when mbm_assign_mode is changed. 
*/ -static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d) +static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { memset(d->cntr_cfg, 0, sizeof(*d->cntr_cfg) * r->mon.num_mbm_cntrs); } @@ -1159,7 +1159,7 @@ static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d) * resctrl_reset_rmid_all() - Reset all non-architecture states for all the * supported RMIDs. */ -static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) +static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); enum resctrl_event_id evt; @@ -1180,7 +1180,7 @@ static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * Assign the counter if @assign is true else unassign the counter. Reset the * associated non-architectural state. */ -static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign) { @@ -1200,7 +1200,7 @@ static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain * * Return: * 0 on success, < 0 on failure. */ -static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { int cntr_id; @@ -1235,7 +1235,7 @@ static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_dom * Return: * 0 on success, < 0 on failure. 
*/ -static int rdtgroup_assign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, +static int rdtgroup_assign_cntr_event(struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); @@ -1285,7 +1285,7 @@ void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp) * rdtgroup_free_unassign_cntr() - Unassign and reset the counter ID configuration * for the event pointed to by @mevt within the domain @d and resctrl group @rdtgrp. */ -static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { int cntr_id; @@ -1306,7 +1306,7 @@ static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_d * the event structure @mevt from the domain @d and the group @rdtgrp. Unassign * the counters from all the domains if @d is NULL else unassign from @d. 
*/ -static void rdtgroup_unassign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, +static void rdtgroup_unassign_cntr_event(struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); @@ -1381,7 +1381,7 @@ static int resctrl_parse_mem_transactions(char *tok, u32 *val) static void rdtgroup_update_cntr_event(struct rdt_resource *r, struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; int cntr_id; list_for_each_entry(d, &r->mon_domains, hdr.list) { @@ -1487,7 +1487,7 @@ ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; int ret = 0; bool enable; @@ -1560,7 +1560,7 @@ int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - struct rdt_mon_domain *dom; + struct rdt_l3_mon_domain *dom; bool sep = false; cpus_read_lock(); @@ -1584,7 +1584,7 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - struct rdt_mon_domain *dom; + struct rdt_l3_mon_domain *dom; bool sep = false; u32 cntrs, i; int ret = 0; @@ -1625,7 +1625,7 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; struct rdtgroup *rdtgrp; struct mon_evt *mevt; int ret = 0; @@ -1688,7 +1688,7 @@ static struct mon_evt *mbm_get_mon_event_by_name(struct rdt_resource *r, char *n return NULL; } -static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d, +static int 
rdtgroup_modify_assign_state(char *assign, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { int ret = 0; @@ -1714,7 +1714,7 @@ static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d, static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup *rdtgrp, char *event, char *tok) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; unsigned long dom_id = 0; char *dom_str, *id_str; struct mon_evt *mevt; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 4d0d562a85efc..551d009ceed40 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1715,7 +1715,7 @@ static void mondata_config_read(struct resctrl_mon_config_info *mon_info) static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) { struct resctrl_mon_config_info mon_info; - struct rdt_mon_domain *dom; + struct rdt_l3_mon_domain *dom; bool sep = false; cpus_read_lock(); @@ -1787,7 +1787,7 @@ static int resctrl_schema_format_show(struct kernfs_open_file *of, } static void mbm_config_write_domain(struct rdt_resource *r, - struct rdt_mon_domain *d, u32 evtid, u32 val) + struct rdt_l3_mon_domain *d, u32 evtid, u32 val) { struct resctrl_mon_config_info mon_info = {0}; @@ -1828,8 +1828,8 @@ static void mbm_config_write_domain(struct rdt_resource *r, static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) { char *dom_str = NULL, *id_str; + struct rdt_l3_mon_domain *d; unsigned long dom_id, val; - struct rdt_mon_domain *d; /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); @@ -2954,7 +2954,7 @@ static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); unsigned long flags = RFTYPE_CTRL_BASE; - struct rdt_mon_domain *dom; + struct rdt_l3_mon_domain *dom; struct rdt_resource *r; int ret; @@ -3425,7 +3425,7 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, struct rdt_domain_hdr *hdr) { 
struct rdtgroup *prgrp, *crgrp; - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; char subname[32]; bool snc_mode; char name[32]; @@ -3433,7 +3433,7 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return; - d = container_of(hdr, struct rdt_mon_domain, hdr); + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); snc_mode = r->mon_scope == RESCTRL_L3_NODE; sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : hdr->id); if (snc_mode) @@ -3451,8 +3451,8 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_domain_hdr *hdr, struct rdt_resource *r, struct rdtgroup *prgrp, bool do_sum) { + struct rdt_l3_mon_domain *d; struct rmid_read rr = {0}; - struct rdt_mon_domain *d; struct mon_data *priv; struct mon_evt *mevt; int ret, domid; @@ -3460,7 +3460,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_domain_hdr *hdr, if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return -EINVAL; - d = container_of(hdr, struct rdt_mon_domain, hdr); + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); for_each_mon_event(mevt) { if (mevt->rid != r->rid || !mevt->enabled) continue; @@ -3485,7 +3485,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, struct rdt_resource *r, struct rdtgroup *prgrp) { struct kernfs_node *kn, *ckn; - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; char name[32]; bool snc_mode; int ret = 0; @@ -3495,7 +3495,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return -EINVAL; - d = container_of(hdr, struct rdt_mon_domain, hdr); + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); snc_mode = r->mon_scope == RESCTRL_L3_NODE; sprintf(name, "mon_%s_%02d", r->name, snc_mode ? 
d->ci_id : d->hdr.id); kn = kernfs_find_and_get(parent_kn, name); @@ -4441,7 +4441,7 @@ static void rdtgroup_setup_default(void) mutex_unlock(&rdtgroup_mutex); } -static void domain_destroy_mon_state(struct rdt_mon_domain *d) +static void domain_destroy_mon_state(struct rdt_l3_mon_domain *d) { int idx; @@ -4465,14 +4465,14 @@ void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; mutex_lock(&rdtgroup_mutex); if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) goto out_unlock; - d = container_of(hdr, struct rdt_mon_domain, hdr); + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); /* * If resctrl is mounted, remove all the @@ -4514,7 +4514,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h * * Returns 0 for success, or -ENOMEM. */ -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize = sizeof(*d->mbm_states[0]); @@ -4572,7 +4572,7 @@ int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; int err = -EINVAL; mutex_lock(&rdtgroup_mutex); @@ -4580,7 +4580,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) goto out_unlock; - d = container_of(hdr, struct rdt_mon_domain, hdr); + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); err = domain_setup_mon_state(r, d); if (err) goto out_unlock; @@ -4627,10 +4627,10 @@ static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) } } -static 
struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, - struct rdt_resource *r) +static struct rdt_l3_mon_domain *get_mon_domain_from_cpu(int cpu, + struct rdt_resource *r) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; lockdep_assert_cpus_held(); @@ -4646,7 +4646,7 @@ static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, void resctrl_offline_cpu(unsigned int cpu) { struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; struct rdtgroup *rdtgrp; mutex_lock(&rdtgroup_mutex); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 7d24f92e91c5a..35dedef903388 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -181,7 +181,7 @@ struct mbm_cntr_cfg { }; /** - * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource + * struct rdt_l3_mon_domain - group of CPUs sharing RDT_RESOURCE_L3 monitoring * @hdr: common header for different domain types * @ci_id: cache info id for this domain * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold @@ -195,7 +195,7 @@ struct mbm_cntr_cfg { * @cntr_cfg: array of assignable counters' configuration (indexed * by counter ID) */ -struct rdt_mon_domain { +struct rdt_l3_mon_domain { struct rdt_domain_hdr hdr; unsigned int ci_id; unsigned long *rmid_busy_llc; @@ -387,10 +387,10 @@ struct resctrl_cpu_defaults { }; struct resctrl_mon_config_info { - struct rdt_resource *r; - struct rdt_mon_domain *d; - u32 evtid; - u32 mon_config; + struct rdt_resource *r; + struct rdt_l3_mon_domain *d; + u32 evtid; + u32 mon_config; }; /** @@ -628,7 +628,7 @@ struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, * * This can be called from any CPU. 
*/ -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid); @@ -641,7 +641,7 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, * * This can be called from any CPU. */ -void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d); +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d); /** * resctrl_arch_reset_all_ctrls() - Reset the control for each CLOSID to its @@ -667,7 +667,7 @@ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r); * * This can be called from any CPU. */ -void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign); @@ -690,7 +690,7 @@ void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, * Return: * 0 on success, or -EIO, -EINVAL etc on error. */ -int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, int cntr_id, enum resctrl_event_id eventid, u64 *val); @@ -705,7 +705,7 @@ int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, * * This can be called from any CPU. 
*/ -void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, int cntr_id, enum resctrl_event_id eventid); From 6d2df1f9c7a01e53d1137791165ec4162627edd9 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:56 -0800 Subject: [PATCH 21/49] x86,fs/resctrl: Rename some L3 specific functions With the arrival of monitor events tied to new domains associated with a different resource it would be clearer if the L3 resource specific functions are more accurately named. Rename three groups of functions: Functions that allocate/free architecture per-RMID MBM state information: arch_domain_mbm_alloc() -> l3_mon_domain_mbm_alloc() mon_domain_free() -> l3_mon_domain_free() Functions that allocate/free filesystem per-RMID MBM state information: domain_setup_mon_state() -> domain_setup_l3_mon_state() domain_destroy_mon_state() -> domain_destroy_l3_mon_state() Initialization/exit: rdt_get_mon_l3_config() -> rdt_get_l3_mon_config() resctrl_mon_resource_init() -> resctrl_l3_mon_resource_init() resctrl_mon_resource_exit() -> resctrl_l3_mon_resource_exit() Ensure kernel-doc descriptions of these functions' return values are present and correctly formatted. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Shanker Donthineni (backported from commit 9c214d10c50990c7a61b95887493df9ae713eec5) [fenghuay: fix a minor conflict in resctrl_init()] Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 20 +++++++++++--------- arch/x86/kernel/cpu/resctrl/internal.h | 2 +- arch/x86/kernel/cpu/resctrl/monitor.c | 2 +- fs/resctrl/internal.h | 6 +++--- fs/resctrl/monitor.c | 8 ++++---- fs/resctrl/rdtgroup.c | 24 ++++++++++++------------ 6 files changed, 32 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index f11b981882ef4..4c2072fc0078c 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -363,7 +363,7 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) kfree(hw_dom); } -static void mon_domain_free(struct rdt_hw_l3_mon_domain *hw_dom) +static void l3_mon_domain_free(struct rdt_hw_l3_mon_domain *hw_dom) { int idx; @@ -396,11 +396,13 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain * } /** - * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters + * l3_mon_domain_mbm_alloc() - Allocate arch private storage for the MBM counters * @num_rmid: The size of the MBM counter array * @hw_dom: The domain that owns the allocated arrays + * + * Return: 0 for success, or -ENOMEM. 
*/ -static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_l3_mon_domain *hw_dom) +static int l3_mon_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_l3_mon_domain *hw_dom) { size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]); enum resctrl_event_id eventid; @@ -514,7 +516,7 @@ static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); if (!ci) { pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name); - mon_domain_free(hw_dom); + l3_mon_domain_free(hw_dom); return; } d->ci_id = ci->id; @@ -522,8 +524,8 @@ static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct arch_mon_domain_online(r, d); - if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { - mon_domain_free(hw_dom); + if (l3_mon_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { + l3_mon_domain_free(hw_dom); return; } @@ -533,7 +535,7 @@ static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct if (err) { list_del_rcu(&d->hdr.list); synchronize_rcu(); - mon_domain_free(hw_dom); + l3_mon_domain_free(hw_dom); } } @@ -659,7 +661,7 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) resctrl_offline_mon_domain(r, hdr); list_del_rcu(&hdr->list); synchronize_rcu(); - mon_domain_free(hw_dom); + l3_mon_domain_free(hw_dom); break; } default: @@ -908,7 +910,7 @@ static __init bool get_rdt_mon_resources(void) if (!ret) return false; - return !rdt_get_mon_l3_config(r); + return !rdt_get_l3_mon_config(r); } static __init void __check_quirks_intel(void) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index c9871b34995f9..ed5c044c56d3b 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -208,7 +208,7 @@ union l3_qos_abmc_cfg { void rdt_ctrl_update(void *arg); -int rdt_get_mon_l3_config(struct rdt_resource *r); +int rdt_get_l3_mon_config(struct rdt_resource *r); bool rdt_cpu_has(int 
flag); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 8b293fc4e9461..2d1453c905bc6 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -423,7 +423,7 @@ static __init int snc_get_config(void) return ret; } -int __init rdt_get_mon_l3_config(struct rdt_resource *r) +int __init rdt_get_l3_mon_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index a04e2b8174da9..42963acf0dc92 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -365,7 +365,9 @@ int alloc_rmid(u32 closid); void free_rmid(u32 closid, u32 rmid); -void resctrl_mon_resource_exit(void); +int resctrl_l3_mon_resource_init(void); + +void resctrl_l3_mon_resource_exit(void); void mon_event_count(void *info); @@ -375,8 +377,6 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain_hdr *hdr, struct rdtgroup *rdtgrp, cpumask_t *cpumask, int evtid, int first); -int resctrl_mon_resource_init(void); - void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index b857e382b6383..9260be6232ee8 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1810,7 +1810,7 @@ ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, } /** - * resctrl_mon_resource_init() - Initialise global monitoring structures. + * resctrl_l3_mon_resource_init() - Initialise global monitoring structures. * * Allocate and initialise global monitor resources that do not belong to a * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists. 
@@ -1819,9 +1819,9 @@ ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, * Resctrl's cpuhp callbacks may be called before this point to bring a domain * online. * - * Returns 0 for success, or -ENOMEM. + * Return: 0 for success, or -ENOMEM. */ -int resctrl_mon_resource_init(void) +int resctrl_l3_mon_resource_init(void) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); int ret; @@ -1871,7 +1871,7 @@ int resctrl_mon_resource_init(void) return 0; } -void resctrl_mon_resource_exit(void) +void resctrl_l3_mon_resource_exit(void) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 551d009ceed40..e5572b1d401f1 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -4441,7 +4441,7 @@ static void rdtgroup_setup_default(void) mutex_unlock(&rdtgroup_mutex); } -static void domain_destroy_mon_state(struct rdt_l3_mon_domain *d) +static void domain_destroy_l3_mon_state(struct rdt_l3_mon_domain *d) { int idx; @@ -4496,13 +4496,13 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h cancel_delayed_work(&d->cqm_limbo); } - domain_destroy_mon_state(d); + domain_destroy_l3_mon_state(d); out_unlock: mutex_unlock(&rdtgroup_mutex); } /** - * domain_setup_mon_state() - Initialise domain monitoring structures. + * domain_setup_l3_mon_state() - Initialise domain monitoring structures. * @r: The resource for the newly online domain. * @d: The newly online domain. * @@ -4510,11 +4510,11 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h * Called when the first CPU of a domain comes online, regardless of whether * the filesystem is mounted. * During boot this may be called before global allocations have been made by - * resctrl_mon_resource_init(). + * resctrl_l3_mon_resource_init(). * - * Returns 0 for success, or -ENOMEM. + * Return: 0 for success, or -ENOMEM. 
*/ -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_l3_mon_domain *d) +static int domain_setup_l3_mon_state(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize = sizeof(*d->mbm_states[0]); @@ -4581,7 +4581,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr goto out_unlock; d = container_of(hdr, struct rdt_l3_mon_domain, hdr); - err = domain_setup_mon_state(r, d); + err = domain_setup_l3_mon_state(r, d); if (err) goto out_unlock; @@ -4696,13 +4696,13 @@ int resctrl_init(void) thread_throttle_mode_init(); - ret = resctrl_mon_resource_init(); + ret = resctrl_l3_mon_resource_init(); if (ret) return ret; ret = sysfs_create_mount_point(fs_kobj, "resctrl"); if (ret) { - resctrl_mon_resource_exit(); + resctrl_l3_mon_resource_exit(); return ret; } @@ -4737,7 +4737,7 @@ int resctrl_init(void) cleanup_mountpoint: sysfs_remove_mount_point(fs_kobj, "resctrl"); - resctrl_mon_resource_exit(); + resctrl_l3_mon_resource_exit(); return ret; } @@ -4773,7 +4773,7 @@ static bool resctrl_online_domains_exist(void) * When called by the architecture code, all CPUs and resctrl domains must be * offline. This ensures the limbo and overflow handlers are not scheduled to * run, meaning the data structures they access can be freed by - * resctrl_mon_resource_exit(). + * resctrl_l3_mon_resource_exit(). * * After resctrl_exit() returns, the architecture code should return an * error from all resctrl_arch_ functions that can do this. @@ -4800,5 +4800,5 @@ void resctrl_exit(void) * it can be used to umount resctrl. 
*/ - resctrl_mon_resource_exit(); + resctrl_l3_mon_resource_exit(); } From bfc5d53471d3a0e95b97bbd6a0ec1f3fc5cae797 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:57 -0800 Subject: [PATCH 22/49] fs/resctrl: Make event details accessible to functions when reading events Reading monitoring event data from MMIO requires more context than the event id to be able to read the correct memory location. struct mon_evt is the appropriate place for this event specific context. Prepare for addition of extra fields to struct mon_evt by changing the calling conventions to pass a pointer to the mon_evt structure instead of just the event id. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit dd110880e80d35ad07e460e7a8da007c8058e7bf) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- fs/resctrl/ctrlmondata.c | 18 +++++++++--------- fs/resctrl/internal.h | 10 +++++----- fs/resctrl/monitor.c | 22 +++++++++++----------- fs/resctrl/rdtgroup.c | 6 +++--- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 1d5a0d3bcd5fd..53a6e156144bf 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -546,7 +546,7 @@ struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain_hdr *hdr, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first) + cpumask_t *cpumask, struct mon_evt *evt, int first) { int cpu; @@ -557,15 +557,15 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, * Setup the parameters to pass to mon_event_count() to read the data. 
*/ rr->rgrp = rdtgrp; - rr->evtid = evtid; + rr->evt = evt; rr->r = r; rr->hdr = hdr; rr->first = first; if (resctrl_arch_mbm_cntr_assign_enabled(r) && - resctrl_is_mbm_event(evtid)) { + resctrl_is_mbm_event(evt->evtid)) { rr->is_mbm_cntr = true; } else { - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evt->evtid); if (IS_ERR(rr->arch_mon_ctx)) { rr->err = -EINVAL; return; @@ -586,14 +586,13 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); if (rr->arch_mon_ctx) - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); + resctrl_arch_mon_ctx_free(r, evt->evtid, rr->arch_mon_ctx); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; enum resctrl_res_level resid; - enum resctrl_event_id evtid; struct rdt_l3_mon_domain *d; struct rdt_domain_hdr *hdr; struct rmid_read rr = {0}; @@ -601,6 +600,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) int domid, cpu, ret = 0; struct rdt_resource *r; struct cacheinfo *ci; + struct mon_evt *evt; struct mon_data *md; rdtgrp = rdtgroup_kn_lock_live(of->kn); @@ -617,7 +617,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) resid = md->rid; domid = md->domid; - evtid = md->evtid; + evt = md->evt; r = resctrl_arch_get_resource(resid); if (md->sum) { @@ -635,7 +635,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) continue; rr.ci = ci; mon_event_read(&rr, r, NULL, rdtgrp, - &ci->shared_cpu_map, evtid, false); + &ci->shared_cpu_map, evt, false); goto checkresult; } } @@ -651,7 +651,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) ret = -ENOENT; goto out; } - mon_event_read(&rr, r, hdr, rdtgrp, &hdr->cpu_mask, evtid, false); + mon_event_read(&rr, r, hdr, rdtgrp, &hdr->cpu_mask, evt, false); } checkresult: diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 42963acf0dc92..92aadf639daeb 100644 --- 
a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -82,7 +82,7 @@ extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; * struct mon_data - Monitoring details for each event file. * @list: Member of the global @mon_data_kn_priv_list list. * @rid: Resource id associated with the event file. - * @evtid: Event id associated with the event file. + * @evt: Event structure associated with the event file. * @sum: Set when event must be summed across multiple * domains. * @domid: When @sum is zero this is the domain to which @@ -96,7 +96,7 @@ extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; struct mon_data { struct list_head list; enum resctrl_res_level rid; - enum resctrl_event_id evtid; + struct mon_evt *evt; int domid; bool sum; }; @@ -109,7 +109,7 @@ struct mon_data { * @r: Resource describing the properties of the event being read. * @hdr: Header of domain that the counter should be read from. If NULL then * sum all domains in @r sharing L3 @ci.id - * @evtid: Which monitor event to read. + * @evt: Which monitor event to read. * @first: Initialize MBM counter when true. * @ci: Cacheinfo for L3. Only set when @hdr is NULL. Used when summing * domains. 
@@ -127,7 +127,7 @@ struct rmid_read { struct rdtgroup *rgrp; struct rdt_resource *r; struct rdt_domain_hdr *hdr; - enum resctrl_event_id evtid; + struct mon_evt *evt; bool first; struct cacheinfo *ci; bool is_mbm_cntr; @@ -375,7 +375,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg); void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain_hdr *hdr, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first); + cpumask_t *cpumask, struct mon_evt *evt, int first); void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 9260be6232ee8..d44c4b900f733 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -457,7 +457,7 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) d = container_of(rr->hdr, struct rdt_l3_mon_domain, hdr); if (rr->is_mbm_cntr) { - cntr_id = mbm_cntr_get(rr->r, d, rdtgrp, rr->evtid); + cntr_id = mbm_cntr_get(rr->r, d, rdtgrp, rr->evt->evtid); if (cntr_id < 0) { rr->err = -ENOENT; return -EINVAL; @@ -466,10 +466,10 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) if (rr->first) { if (rr->is_mbm_cntr) - resctrl_arch_reset_cntr(rr->r, d, closid, rmid, cntr_id, rr->evtid); + resctrl_arch_reset_cntr(rr->r, d, closid, rmid, cntr_id, rr->evt->evtid); else - resctrl_arch_reset_rmid(rr->r, d, closid, rmid, rr->evtid); - m = get_mbm_state(d, closid, rmid, rr->evtid); + resctrl_arch_reset_rmid(rr->r, d, closid, rmid, rr->evt->evtid); + m = get_mbm_state(d, closid, rmid, rr->evt->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); return 0; @@ -480,10 +480,10 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) return -EINVAL; if (rr->is_mbm_cntr) rr->err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id, - rr->evtid, &tval); + rr->evt->evtid, &tval); else rr->err = resctrl_arch_rmid_read(rr->r, rr->hdr, closid, 
rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + rr->evt->evtid, &tval, rr->arch_mon_ctx); if (rr->err) return rr->err; @@ -528,7 +528,7 @@ static int __l3_mon_event_count_sum(struct rdtgroup *rdtgrp, struct rmid_read *r if (d->ci_id != rr->ci->id) continue; err = resctrl_arch_rmid_read(rr->r, &d->hdr, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + rr->evt->evtid, &tval, rr->arch_mon_ctx); if (!err) { rr->val += tval; ret = 0; @@ -578,7 +578,7 @@ static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return; d = container_of(rr->hdr, struct rdt_l3_mon_domain, hdr); - m = get_mbm_state(d, closid, rmid, rr->evtid); + m = get_mbm_state(d, closid, rmid, rr->evt->evtid); if (WARN_ON_ONCE(!m)) return; @@ -752,11 +752,11 @@ static void mbm_update_one_event(struct rdt_resource *r, struct rdt_l3_mon_domai rr.r = r; rr.hdr = &d->hdr; - rr.evtid = evtid; + rr.evt = &mon_event_all[evtid]; if (resctrl_arch_mbm_cntr_assign_enabled(r)) { rr.is_mbm_cntr = true; } else { - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, evtid); if (IS_ERR(rr.arch_mon_ctx)) { pr_warn_ratelimited("Failed to allocate monitor context: %ld", PTR_ERR(rr.arch_mon_ctx)); @@ -774,7 +774,7 @@ static void mbm_update_one_event(struct rdt_resource *r, struct rdt_l3_mon_domai mbm_bw_count(rdtgrp, &rr); if (rr.arch_mon_ctx) - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); + resctrl_arch_mon_ctx_free(rr.r, evtid, rr.arch_mon_ctx); } static void mbm_update(struct rdt_resource *r, struct rdt_l3_mon_domain *d, diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index e5572b1d401f1..6a503729ff353 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -3293,7 +3293,7 @@ static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, list_for_each_entry(priv, &mon_data_kn_priv_list, list) { if (priv->rid == rid 
&& priv->domid == domid && - priv->sum == do_sum && priv->evtid == mevt->evtid) + priv->sum == do_sum && priv->evt == mevt) return priv; } @@ -3304,7 +3304,7 @@ static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, priv->rid = rid; priv->domid = domid; priv->sum = do_sum; - priv->evtid = mevt->evtid; + priv->evt = mevt; list_add_tail(&priv->list, &mon_data_kn_priv_list); return priv; @@ -3474,7 +3474,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_domain_hdr *hdr, return ret; if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) - mon_event_read(&rr, r, hdr, prgrp, &hdr->cpu_mask, mevt->evtid, true); + mon_event_read(&rr, r, hdr, prgrp, &hdr->cpu_mask, mevt, true); } return 0; From 945d10ef5e4cc2415c2b23dd3bbe5a1f6c99fd2c Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:58 -0800 Subject: [PATCH 23/49] x86,fs/resctrl: Handle events that can be read from any CPU resctrl assumes that monitor events can only be read from a CPU in the cpumask_t set of each domain. This is true for x86 events accessed with an MSR interface, but may not be true for other access methods such as MMIO. Introduce and use flag mon_evt::any_cpu, settable by architecture, that indicates there are no restrictions on which CPU can read that event. This flag is not supported by the L3 event reading that requires to be run on a CPU that belongs to the L3 domain of the event being read. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit ab0308aee3819a3eccde42f9eb5bb01d6733be38) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 6 +++--- fs/resctrl/ctrlmondata.c | 6 ++++++ fs/resctrl/internal.h | 2 ++ fs/resctrl/monitor.c | 4 +++- include/linux/resctrl.h | 2 +- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 4c2072fc0078c..bb0b4c2292dc5 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -893,15 +893,15 @@ static __init bool get_rdt_mon_resources(void) bool ret = false; if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { - resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID); + resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false); ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { - resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false); ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { - resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false); ret = true; } if (rdt_cpu_has(X86_FEATURE_ABMC)) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 53a6e156144bf..dee8a4df154fb 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -572,6 +572,11 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, } } + if (evt->any_cpu) { + mon_event_count(rr); + goto out_ctx_free; + } + cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); /* @@ -585,6 +590,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, else smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); +out_ctx_free: if (rr->arch_mon_ctx) resctrl_arch_mon_ctx_free(r, evt->evtid, 
rr->arch_mon_ctx); } diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 92aadf639daeb..a19f6dc009914 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -62,6 +62,7 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) * READS_TO_REMOTE_MEM) being tracked by @evtid. * Only valid if @evtid is an MBM event. * @configurable: true if the event is configurable + * @any_cpu: true if the event can be read from any CPU * @enabled: true if the event is enabled */ struct mon_evt { @@ -70,6 +71,7 @@ struct mon_evt { char *name; u32 evt_cfg; bool configurable; + bool any_cpu; bool enabled; }; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index d44c4b900f733..8f98a7a45e043 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -545,6 +545,7 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { switch (rr->r->rid) { case RDT_RESOURCE_L3: + WARN_ON_ONCE(rr->evt->any_cpu); if (rr->hdr) return __l3_mon_event_count(rdtgrp, rr); else @@ -1017,7 +1018,7 @@ struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { }, }; -void resctrl_enable_mon_event(enum resctrl_event_id eventid) +void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu) { if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS)) return; @@ -1026,6 +1027,7 @@ void resctrl_enable_mon_event(enum resctrl_event_id eventid) return; } + mon_event_all[eventid].any_cpu = any_cpu; mon_event_all[eventid].enabled = true; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 35dedef903388..9bbe30737c541 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -453,7 +453,7 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); -void resctrl_enable_mon_event(enum resctrl_event_id eventid); +void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu); 
bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); From 87a79d9bd4bea1b75adb7b15529a1c1d61c91983 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:59 -0800 Subject: [PATCH 24/49] x86,fs/resctrl: Support binary fixed point event counters resctrl assumes that all monitor events can be displayed as unsigned decimal integers. Hardware architecture counters may provide some telemetry events with greater precision where the event is not a simple count, but is a measurement of some sort (e.g. Joules for energy consumed). Add a new argument to resctrl_enable_mon_event() for architecture code to inform the file system that the value for a counter is a fixed-point value with a specific number of binary places. Only allow architecture to use floating point format on events that the file system has marked with mon_evt::is_floating_point which reflects the contract with user space on how the event values are displayed. Display fixed point values with values rounded to ceil(binary_bits * log10(2)) decimal places. Special case for zero binary bits to print "{value}.0". 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit e37c9a3dc9f9645532780d5ef34ea3b8fcf9ddef) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 6 +-- fs/resctrl/ctrlmondata.c | 74 ++++++++++++++++++++++++++++++ fs/resctrl/internal.h | 8 ++++ fs/resctrl/monitor.c | 10 +++- include/linux/resctrl.h | 3 +- 5 files changed, 95 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index bb0b4c2292dc5..dcd0a4b07fc25 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -893,15 +893,15 @@ static __init bool get_rdt_mon_resources(void) bool ret = false; if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { - resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false); + resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false, 0); ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { - resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false); + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0); ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { - resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false); + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false, 0); ret = true; } if (rdt_cpu_has(X86_FEATURE_ABMC)) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index dee8a4df154fb..8396f7b05d712 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -595,6 +596,77 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, resctrl_arch_mon_ctx_free(r, evt->evtid, rr->arch_mon_ctx); } +/* + * Decimal place precision to use for each number of fixed-point + * binary bits computed from ceil(binary_bits * log10(2)) except + * binary_bits == 0 which will print 
"value.0" + */ +static const unsigned int decplaces[MAX_BINARY_BITS + 1] = { + [0] = 1, + [1] = 1, + [2] = 1, + [3] = 1, + [4] = 2, + [5] = 2, + [6] = 2, + [7] = 3, + [8] = 3, + [9] = 3, + [10] = 4, + [11] = 4, + [12] = 4, + [13] = 4, + [14] = 5, + [15] = 5, + [16] = 5, + [17] = 6, + [18] = 6, + [19] = 6, + [20] = 7, + [21] = 7, + [22] = 7, + [23] = 7, + [24] = 8, + [25] = 8, + [26] = 8, + [27] = 9 +}; + +static void print_event_value(struct seq_file *m, unsigned int binary_bits, u64 val) +{ + unsigned long long frac = 0; + + if (binary_bits) { + /* Mask off the integer part of the fixed-point value. */ + frac = val & GENMASK_ULL(binary_bits - 1, 0); + + /* + * Multiply by 10^{desired decimal places}. The integer part of + * the fixed point value is now almost what is needed. + */ + frac *= int_pow(10ull, decplaces[binary_bits]); + + /* + * Round to nearest by adding a value that would be a "1" in the + * binary_bits + 1 place. Integer part of fixed point value is + * now the needed value. + */ + frac += 1ull << (binary_bits - 1); + + /* + * Extract the integer part of the value. This is the decimal + * representation of the original fixed-point fractional value. + */ + frac >>= binary_bits; + } + + /* + * "frac" is now in the range [0 .. 10^decplaces). I.e. string + * representation will fit into chosen number of decimal places. 
+ */ + seq_printf(m, "%llu.%0*llu\n", val >> binary_bits, decplaces[binary_bits], frac); +} + int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; @@ -672,6 +744,8 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) seq_puts(m, "Unavailable\n"); else if (rr.err == -ENOENT) seq_puts(m, "Unassigned\n"); + else if (evt->is_floating_point) + print_event_value(m, evt->binary_bits, rr.val); else seq_printf(m, "%llu\n", rr.val); diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index a19f6dc009914..8c89fb9b2b417 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -63,6 +63,9 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) * Only valid if @evtid is an MBM event. * @configurable: true if the event is configurable * @any_cpu: true if the event can be read from any CPU + * @is_floating_point: event values are displayed in floating point format + * @binary_bits: number of fixed-point binary bits from architecture, + * only valid if @is_floating_point is true * @enabled: true if the event is enabled */ struct mon_evt { @@ -72,6 +75,8 @@ struct mon_evt { u32 evt_cfg; bool configurable; bool any_cpu; + bool is_floating_point; + unsigned int binary_bits; bool enabled; }; @@ -80,6 +85,9 @@ extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; #define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT]; \ mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++) +/* Limit for mon_evt::binary_bits */ +#define MAX_BINARY_BITS 27 + /** * struct mon_data - Monitoring details for each event file. * @list: Member of the global @mon_data_kn_priv_list list. 
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 8f98a7a45e043..6dd054746e8c5 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1018,16 +1018,22 @@ struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { }, }; -void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu) +void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, unsigned int binary_bits) { - if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS)) + if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS || + binary_bits > MAX_BINARY_BITS)) return; if (mon_event_all[eventid].enabled) { pr_warn("Duplicate enable for event %d\n", eventid); return; } + if (binary_bits && !mon_event_all[eventid].is_floating_point) { + pr_warn("Event %d may not be floating point\n", eventid); + return; + } mon_event_all[eventid].any_cpu = any_cpu; + mon_event_all[eventid].binary_bits = binary_bits; mon_event_all[eventid].enabled = true; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 9bbe30737c541..b8ac94285d9b7 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -453,7 +453,8 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); -void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu); +void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, + unsigned int binary_bits); bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); From c4d5709411cf7b3bda4c0842b91c2d0dec45ace1 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 8 Jan 2026 09:42:25 -0800 Subject: [PATCH 25/49] x86,fs/resctrl: Add an architectural hook called for first mount Enumeration of Intel telemetry events is an asynchronous process involving several mutually dependent drivers added as auxiliary devices during the device_initcall() phase of Linux boot. 
The process finishes after the probe functions of these drivers completes. But this happens after resctrl_arch_late_init() is executed. Tracing the enumeration process shows that it does complete a full seven seconds before the earliest possible mount of the resctrl file system (when included in /etc/fstab for automatic mount by systemd). Add a hook for use by telemetry event enumeration and initialization and run it once at the beginning of resctrl mount without any locks held. The architecture is responsible for any required locking. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20260105191711.GBaVwON5nZn-uO6Sqg@fat_crate.local Signed-off-by: Shanker Donthineni (backported from commit 39208e73a40e0e81a5b12ddc11157c0a414df307) [fenghuay: fix minor conflicts in include headers and rdt_get_tree()] Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++++ fs/resctrl/rdtgroup.c | 3 +++ include/linux/resctrl.h | 6 ++++++ 3 files changed, 13 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index dcd0a4b07fc25..634e14a1e4c2e 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -721,6 +721,10 @@ static int resctrl_arch_offline_cpu(unsigned int cpu) return 0; } +void resctrl_arch_pre_mount(void) +{ +} + enum { RDT_FLAG_CMT, RDT_FLAG_MBM_TOTAL, diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 6a503729ff353..a2dabf0d6a411 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -2958,6 +2959,8 @@ static int rdt_get_tree(struct fs_context *fc) struct rdt_resource *r; int ret; + DO_ONCE_SLEEPABLE(resctrl_arch_pre_mount); + if (ctx->enable_abi_playground) enable_abi_playground(); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index b8ac94285d9b7..fd90152836c2c 100644 --- a/include/linux/resctrl.h +++ 
b/include/linux/resctrl.h @@ -557,6 +557,12 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h void resctrl_online_cpu(unsigned int cpu); void resctrl_offline_cpu(unsigned int cpu); +/* + * Architecture hook called at beginning of first file system mount attempt. + * No locks are held. + */ +void resctrl_arch_pre_mount(void); + /** * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid * for this resource and domain. From 4b2e85d405761918dbf5cdc4f0df4a2439e7846a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:01 -0800 Subject: [PATCH 26/49] x86,fs/resctrl: Add and initialize a resource for package scope monitoring Add a new PERF_PKG resource and introduce package level scope for monitoring telemetry events so that CPU hotplug notifiers can build domains at the package granularity. Use the physical package ID available via topology_physical_package_id() to identify the monitoring domains with package level scope. This enables user space to use: /sys/devices/system/cpu/cpuX/topology/physical_package_id to identify the monitoring domain a CPU is associated with. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Shanker Donthineni (backported from commit 2e53ad66686a46b141c3395719afeee3057ffe2f) [fenghuay: fix minor conflicts in definitions of rdt_resources_all[], struct rdtgroup, fflags_from_resource(), and enum resctrl_res_level] Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 10 ++++++++++ fs/resctrl/internal.h | 2 ++ fs/resctrl/rdtgroup.c | 2 ++ include/linux/resctrl.h | 2 ++ 4 files changed, 16 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 634e14a1e4c2e..42dd2a8c37c6b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -98,6 +98,14 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA), }, }, + [RDT_RESOURCE_PERF_PKG] = + { + .r_resctrl = { + .name = "PERF_PKG", + .mon_scope = RESCTRL_PACKAGE, + .mon_domains = mon_domain_init(RDT_RESOURCE_PERF_PKG), + }, + }, }; u32 resctrl_arch_system_num_rmid_idx(void) @@ -435,6 +443,8 @@ static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) return get_cpu_cacheinfo_id(cpu, scope); case RESCTRL_L3_NODE: return cpu_to_node(cpu); + case RESCTRL_PACKAGE: + return topology_physical_package_id(cpu); default: break; } diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 8c89fb9b2b417..8cc0372c3fb76 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -261,6 +261,8 @@ struct rdtgroup { #define RFTYPE_SCHEMA_PERCENT BIT(12) #define RFTYPE_SCHEMA_MBPS BIT(13) +#define RFTYPE_RES_PERF_PKG BIT(12) + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index a2dabf0d6a411..1684c8d730865 100644 --- a/fs/resctrl/rdtgroup.c +++ 
b/fs/resctrl/rdtgroup.c @@ -2488,6 +2488,8 @@ static unsigned long fflags_from_resource(struct rdt_resource *r) case RDT_RESOURCE_MBA: case RDT_RESOURCE_SMBA: return RFTYPE_RES_MB; + case RDT_RESOURCE_PERF_PKG: + return RFTYPE_RES_PERF_PKG; } return 0; diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index fd90152836c2c..e3953387b3ff6 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -54,6 +54,7 @@ enum resctrl_res_level { RDT_RESOURCE_L2, RDT_RESOURCE_MBA, RDT_RESOURCE_SMBA, + RDT_RESOURCE_PERF_PKG, RDT_RESOURCE_L3_MAX, RDT_RESOURCE_L2_MAX, @@ -276,6 +277,7 @@ enum resctrl_scope { RESCTRL_L2_CACHE = 2, RESCTRL_L3_CACHE = 3, RESCTRL_L3_NODE, + RESCTRL_PACKAGE, }; /** From f8a97b4035c275262068fe81ddbe946546f4459e Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:02 -0800 Subject: [PATCH 27/49] fs/resctrl: Emphasize that L3 monitoring resource is required for summing domains The feature to sum event data across multiple domains supports systems with Sub-NUMA Cluster (SNC) mode enabled. The top-level monitoring files in each "mon_L3_XX" directory provide the sum of data across all SNC nodes sharing an L3 cache instance while the "mon_sub_L3_YY" sub-directories provide the event data of the individual nodes. SNC is only associated with the L3 resource and domains and as a result the flow handling the sum of event data implicitly assumes it is working with the L3 resource and domains. Reading of telemetry events does not require to sum event data so this feature can remain dedicated to SNC and keep the implicit assumption of working with the L3 resource and domains. Add a WARN to where the implicit assumption of working with the L3 resource is made and add comments on how the structure controlling the event sum feature is used. 
Suggested-by: Reinette Chatre Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit db64994d115e7c2cd72fec11b854467e97169379) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- fs/resctrl/ctrlmondata.c | 8 +++++++- fs/resctrl/internal.h | 4 ++-- fs/resctrl/rdtgroup.c | 3 ++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 8396f7b05d712..e04b8a5f76c3d 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -671,7 +671,6 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; enum resctrl_res_level resid; - struct rdt_l3_mon_domain *d; struct rdt_domain_hdr *hdr; struct rmid_read rr = {0}; struct rdtgroup *rdtgrp; @@ -699,6 +698,13 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) r = resctrl_arch_get_resource(resid); if (md->sum) { + struct rdt_l3_mon_domain *d; + + if (WARN_ON_ONCE(resid != RDT_RESOURCE_L3)) { + ret = -EINVAL; + goto out; + } + /* * This file requires summing across all domains that share * the L3 cache id that was provided in the "domid" field of the diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 8cc0372c3fb76..6e391118cf4ea 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -93,8 +93,8 @@ extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; * @list: Member of the global @mon_data_kn_priv_list list. * @rid: Resource id associated with the event file. * @evt: Event structure associated with the event file. - * @sum: Set when event must be summed across multiple - * domains. + * @sum: Set for RDT_RESOURCE_L3 when event must be summed + * across multiple domains. * @domid: When @sum is zero this is the domain to which * the event file belongs. 
When @sum is one this * is the id of the L3 cache that all domains to be diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 1684c8d730865..f1dfeb42d2e4b 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -3286,7 +3286,8 @@ static void rmdir_all_sub(void) * @rid: The resource id for the event file being created. * @domid: The domain id for the event file being created. * @mevt: The type of event file being created. - * @do_sum: Whether SNC summing monitors are being created. + * @do_sum: Whether SNC summing monitors are being created. Only set + * when @rid == RDT_RESOURCE_L3. */ static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, struct mon_evt *mevt, From 1203adf85a1843813e0f8550037eb1d0ebca6b1a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 8 Jan 2026 09:42:26 -0800 Subject: [PATCH 28/49] x86/resctrl: Discover hardware telemetry events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each CPU collects data for telemetry events that it sends to the nearest telemetry event aggregator either when the value of MSR_IA32_PQR_ASSOC.RMID changes, or when a two millisecond timer expires. There is a feature type ("energy" or "perf"), GUID, and MMIO region associated with each aggregator. This combination links to an XML description of the set of telemetry events tracked by the aggregator. XML files are published by Intel in a GitHub repository¹. The telemetry event aggregators maintain per-RMID per-event counts of the total seen for all the CPUs. There may be multiple telemetry event aggregators per package. There are separate sets of aggregators for each feature type. Aggregators in a set may have different GUIDs. All aggregators with the same feature type and GUID are symmetric keeping counts for the same set of events for the CPUs that provide data to them. 
The XML file for each aggregator provides the following information: 0) Feature type of the events ("perf" or "energy") 1) Which telemetry events are tracked by the aggregator. 2) The order in which the event counters appear for each RMID. 3) The value type of each event counter (integer or fixed-point). 4) The number of RMIDs supported. 5) Which additional aggregator status registers are included. 6) The total size of the MMIO region for an aggregator. Introduce struct event_group that condenses the relevant information from an XML file. Hereafter an "event group" refers to a group of events of a particular feature type (event_group::pfname set to "energy" or "perf") with a particular GUID. Use event_group::pfname to determine the feature id needed to obtain the aggregator details. It will later be used in console messages and with the rdt= boot parameter. The INTEL_PMT_TELEMETRY driver enumerates support for telemetry events. This driver provides intel_pmt_get_regions_by_feature() to list all available telemetry event aggregators of a given feature type. The list includes the "guid", the base address in MMIO space for the region where the event counters are exposed, and the package id where the all the CPUs that report to this aggregator are located. Call INTEL_PMT_TELEMETRY's intel_pmt_get_regions_by_feature() for each event group to obtain a private copy of that event group's aggregator data. Duplicate the aggregator data between event groups that have the same feature type but different GUID. Further processing on this private copy will be unique to the event group. 
¹https://github.com/intel/Intel-PMT [ bp: Zap text explaining the code, s/guid/GUID/g ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit 1fb2daa60de640efb13f907d43d72d28763f696c) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- arch/x86/Kconfig | 13 +++ arch/x86/kernel/cpu/resctrl/Makefile | 1 + arch/x86/kernel/cpu/resctrl/core.c | 4 + arch/x86/kernel/cpu/resctrl/intel_aet.c | 109 ++++++++++++++++++++++++ arch/x86/kernel/cpu/resctrl/internal.h | 8 ++ 5 files changed, 135 insertions(+) create mode 100644 arch/x86/kernel/cpu/resctrl/intel_aet.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5902dde9f4477..75766f3da5ac4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -525,6 +525,19 @@ config X86_CPU_RESCTRL Say N if unsure. +config X86_CPU_RESCTRL_INTEL_AET + bool "Intel Application Energy Telemetry" + depends on X86_CPU_RESCTRL && CPU_SUP_INTEL && INTEL_PMT_TELEMETRY=y && INTEL_TPMI=y + help + Enable per-RMID telemetry events in resctrl. + + Intel feature that collects per-RMID execution data + about energy consumption, measure of frequency independent + activity and other performance metrics. Data is aggregated + per package. + + Say N if unsure. 
+ config X86_FRED bool "Flexible Return and Event Delivery" depends on X86_64 diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile index d8a04b195da21..273ddfa308366 100644 --- a/arch/x86/kernel/cpu/resctrl/Makefile +++ b/arch/x86/kernel/cpu/resctrl/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o +obj-$(CONFIG_X86_CPU_RESCTRL_INTEL_AET) += intel_aet.o obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o # To allow define_trace.h's recursive include: diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 42dd2a8c37c6b..e27c26be74a47 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -733,6 +733,8 @@ static int resctrl_arch_offline_cpu(unsigned int cpu) void resctrl_arch_pre_mount(void) { + if (!intel_aet_get_events()) + return; } enum { @@ -1090,6 +1092,8 @@ late_initcall(resctrl_arch_late_init); static void __exit resctrl_arch_exit(void) { + intel_aet_exit(); + cpuhp_remove_state(rdt_online); resctrl_exit(); diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c new file mode 100644 index 0000000000000..404564739befe --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resource Director Technology(RDT) + * - Intel Application Energy Telemetry + * + * Copyright (C) 2025 Intel Corporation + * + * Author: + * Tony Luck + */ + +#define pr_fmt(fmt) "resctrl: " fmt + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/** + * struct event_group - Events with the same feature type ("energy" or "perf") and GUID. + * @pfname: PMT feature name ("energy" or "perf") of this event group. 
+ * @pfg: Points to the aggregated telemetry space information + * returned by the intel_pmt_get_regions_by_feature() + * call to the INTEL_PMT_TELEMETRY driver that contains + * data for all telemetry regions of type @pfname. + * Valid if the system supports the event group, + * NULL otherwise. + */ +struct event_group { + /* Data fields for additional structures to manage this group. */ + const char *pfname; + struct pmt_feature_group *pfg; +}; + +static struct event_group *known_event_groups[] = { +}; + +#define for_each_event_group(_peg) \ + for (_peg = known_event_groups; \ + _peg < &known_event_groups[ARRAY_SIZE(known_event_groups)]; \ + _peg++) + +/* Stub for now */ +static bool enable_events(struct event_group *e, struct pmt_feature_group *p) +{ + return false; +} + +static enum pmt_feature_id lookup_pfid(const char *pfname) +{ + if (!strcmp(pfname, "energy")) + return FEATURE_PER_RMID_ENERGY_TELEM; + else if (!strcmp(pfname, "perf")) + return FEATURE_PER_RMID_PERF_TELEM; + + pr_warn("Unknown PMT feature name '%s'\n", pfname); + + return FEATURE_INVALID; +} + +/* + * Request a copy of struct pmt_feature_group for each event group. If there is + * one, the returned structure has an array of telemetry_region structures, + * each element of the array describes one telemetry aggregator. The + * telemetry aggregators may have different GUIDs so obtain duplicate struct + * pmt_feature_group for event groups with same feature type but different + * GUID. Post-processing ensures an event group can only use the telemetry + * aggregators that match its GUID. An event group keeps a pointer to its + * struct pmt_feature_group to indicate that its events are successfully + * enabled. 
+ */ +bool intel_aet_get_events(void) +{ + struct pmt_feature_group *p; + enum pmt_feature_id pfid; + struct event_group **peg; + bool ret = false; + + for_each_event_group(peg) { + pfid = lookup_pfid((*peg)->pfname); + p = intel_pmt_get_regions_by_feature(pfid); + if (IS_ERR_OR_NULL(p)) + continue; + if (enable_events(*peg, p)) { + (*peg)->pfg = p; + ret = true; + } else { + intel_pmt_put_feature_group(p); + } + } + + return ret; +} + +void __exit intel_aet_exit(void) +{ + struct event_group **peg; + + for_each_event_group(peg) { + if ((*peg)->pfg) { + intel_pmt_put_feature_group((*peg)->pfg); + (*peg)->pfg = NULL; + } + } +} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index ed5c044c56d3b..ad8b787827abf 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -217,4 +217,12 @@ void __init intel_rdt_mbm_apply_quirk(void); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r); +#ifdef CONFIG_X86_CPU_RESCTRL_INTEL_AET +bool intel_aet_get_events(void); +void __exit intel_aet_exit(void); +#else +static inline bool intel_aet_get_events(void) { return false; } +static inline void __exit intel_aet_exit(void) { } +#endif + #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ From c445f3b7bd8e373aa9e87a1726f13c138f11ae84 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:04 -0800 Subject: [PATCH 29/49] x86,fs/resctrl: Fill in details of events for performance and energy GUIDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The telemetry event aggregators of the Intel Clearwater Forest CPU support two RMID-based feature types: "energy" with GUID 0x26696143¹, and "perf" with GUID 0x26557651². The event counter offsets in an aggregator's MMIO space are arranged in groups for each RMID. 
E.g., the "energy" counters for GUID 0x26696143 are arranged like this: MMIO offset:0x0000 Counter for RMID 0 PMT_EVENT_ENERGY MMIO offset:0x0008 Counter for RMID 0 PMT_EVENT_ACTIVITY MMIO offset:0x0010 Counter for RMID 1 PMT_EVENT_ENERGY MMIO offset:0x0018 Counter for RMID 1 PMT_EVENT_ACTIVITY ... MMIO offset:0x23F0 Counter for RMID 575 PMT_EVENT_ENERGY MMIO offset:0x23F8 Counter for RMID 575 PMT_EVENT_ACTIVITY After all counters there are three status registers that provide indications of how many times an aggregator was unable to process event counts, the time stamp for the most recent loss of data, and the time stamp of the most recent successful update. MMIO offset:0x2400 AGG_DATA_LOSS_COUNT MMIO offset:0x2408 AGG_DATA_LOSS_TIMESTAMP MMIO offset:0x2410 LAST_UPDATE_TIMESTAMP Define event_group structures for both of these aggregator types and define the events tracked by the aggregators in the file system code. PMT_EVENT_ENERGY and PMT_EVENT_ACTIVITY are produced in fixed point format. File system code must output as floating point values. ¹https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-ENERGY/cwf_aggregator.xml ²https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-PERF/cwf_aggregator.xml [ bp: Massage commit message. 
] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit 8f6b6ad69b50bf16bb762ffafbfa44a4884f9a17) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/intel_aet.c | 66 +++++++++++++++++++++++++ fs/resctrl/monitor.c | 35 +++++++------ include/linux/resctrl_types.h | 11 +++++ 3 files changed, 97 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c index 404564739befe..8e042b530c914 100644 --- a/arch/x86/kernel/cpu/resctrl/intel_aet.c +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -11,15 +11,33 @@ #define pr_fmt(fmt) "resctrl: " fmt +#include #include #include #include #include #include +#include #include +#include #include "internal.h" +/** + * struct pmt_event - Telemetry event. + * @id: Resctrl event id. + * @idx: Counter index within each per-RMID block of counters. + * @bin_bits: Zero for integer valued events, else number bits in fraction + * part of fixed-point. + */ +struct pmt_event { + enum resctrl_event_id id; + unsigned int idx; + unsigned int bin_bits; +}; + +#define EVT(_id, _idx, _bits) { .id = _id, .idx = _idx, .bin_bits = _bits } + /** * struct event_group - Events with the same feature type ("energy" or "perf") and GUID. * @pfname: PMT feature name ("energy" or "perf") of this event group. @@ -29,14 +47,62 @@ * data for all telemetry regions of type @pfname. * Valid if the system supports the event group, * NULL otherwise. + * @guid: Unique number per XML description file. + * @mmio_size: Number of bytes of MMIO registers for this group. + * @num_events: Number of events in this group. + * @evts: Array of event descriptors. */ struct event_group { /* Data fields for additional structures to manage this group. 
*/ const char *pfname; struct pmt_feature_group *pfg; + + /* Remaining fields initialized from XML file. */ + u32 guid; + size_t mmio_size; + unsigned int num_events; + struct pmt_event evts[] __counted_by(num_events); +}; + +#define XML_MMIO_SIZE(num_rmids, num_events, num_extra_status) \ + (((num_rmids) * (num_events) + (num_extra_status)) * sizeof(u64)) + +/* + * Link: https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-ENERGY/cwf_aggregator.xml + */ +static struct event_group energy_0x26696143 = { + .pfname = "energy", + .guid = 0x26696143, + .mmio_size = XML_MMIO_SIZE(576, 2, 3), + .num_events = 2, + .evts = { + EVT(PMT_EVENT_ENERGY, 0, 18), + EVT(PMT_EVENT_ACTIVITY, 1, 18), + } +}; + +/* + * Link: https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-PERF/cwf_aggregator.xml + */ +static struct event_group perf_0x26557651 = { + .pfname = "perf", + .guid = 0x26557651, + .mmio_size = XML_MMIO_SIZE(576, 7, 3), + .num_events = 7, + .evts = { + EVT(PMT_EVENT_STALLS_LLC_HIT, 0, 0), + EVT(PMT_EVENT_C1_RES, 1, 0), + EVT(PMT_EVENT_UNHALTED_CORE_CYCLES, 2, 0), + EVT(PMT_EVENT_STALLS_LLC_MISS, 3, 0), + EVT(PMT_EVENT_AUTO_C6_RES, 4, 0), + EVT(PMT_EVENT_UNHALTED_REF_CYCLES, 5, 0), + EVT(PMT_EVENT_UOPS_RETIRED, 6, 0), + } }; static struct event_group *known_event_groups[] = { + &energy_0x26696143, + &perf_0x26557651, }; #define for_each_event_group(_peg) \ diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 6dd054746e8c5..40215d1135868 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -995,27 +995,32 @@ static void dom_data_exit(struct rdt_resource *r) mutex_unlock(&rdtgroup_mutex); } +#define MON_EVENT(_eventid, _name, _res, _fp) \ + [_eventid] = { \ + .name = _name, \ + .evtid = _eventid, \ + .rid = _res, \ + .is_floating_point = _fp, \ +} + /* * All available events. Architecture code marks the ones that * are supported by a system using resctrl_enable_mon_event() * to set .enabled. 
*/ struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { - [QOS_L3_OCCUP_EVENT_ID] = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, - .rid = RDT_RESOURCE_L3, - }, - [QOS_L3_MBM_TOTAL_EVENT_ID] = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, - .rid = RDT_RESOURCE_L3, - }, - [QOS_L3_MBM_LOCAL_EVENT_ID] = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, - .rid = RDT_RESOURCE_L3, - }, + MON_EVENT(QOS_L3_OCCUP_EVENT_ID, "llc_occupancy", RDT_RESOURCE_L3, false), + MON_EVENT(QOS_L3_MBM_TOTAL_EVENT_ID, "mbm_total_bytes", RDT_RESOURCE_L3, false), + MON_EVENT(QOS_L3_MBM_LOCAL_EVENT_ID, "mbm_local_bytes", RDT_RESOURCE_L3, false), + MON_EVENT(PMT_EVENT_ENERGY, "core_energy", RDT_RESOURCE_PERF_PKG, true), + MON_EVENT(PMT_EVENT_ACTIVITY, "activity", RDT_RESOURCE_PERF_PKG, true), + MON_EVENT(PMT_EVENT_STALLS_LLC_HIT, "stalls_llc_hit", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_C1_RES, "c1_res", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_UNHALTED_CORE_CYCLES, "unhalted_core_cycles", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_STALLS_LLC_MISS, "stalls_llc_miss", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_AUTO_C6_RES, "c6_res", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_UNHALTED_REF_CYCLES, "unhalted_ref_cycles", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_UOPS_RETIRED, "uops_retired", RDT_RESOURCE_PERF_PKG, false), }; void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, unsigned int binary_bits) diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index acfe07860b346..a5f56faa18d22 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -50,6 +50,17 @@ enum resctrl_event_id { QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, + /* Intel Telemetry Events */ + PMT_EVENT_ENERGY, + PMT_EVENT_ACTIVITY, + PMT_EVENT_STALLS_LLC_HIT, + PMT_EVENT_C1_RES, + PMT_EVENT_UNHALTED_CORE_CYCLES, + 
PMT_EVENT_STALLS_LLC_MISS, + PMT_EVENT_AUTO_C6_RES, + PMT_EVENT_UNHALTED_REF_CYCLES, + PMT_EVENT_UOPS_RETIRED, + /* Must be the last */ QOS_NUM_EVENTS, }; From b5817464cde51751f69f129dcbe39ccff5018ebb Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:05 -0800 Subject: [PATCH 30/49] x86,fs/resctrl: Add architectural event pointer The resctrl file system layer passes the domain, RMID, and event id to the architecture to fetch an event counter. Fetching a telemetry event counter requires additional information that is private to the architecture, for example, the offset into MMIO space from where the counter should be read. Add mon_evt::arch_priv that architecture can use for any private data related to the event. The resctrl filesystem initializes mon_evt::arch_priv when the architecture enables the event and passes it back to architecture when needing to fetch an event counter. Suggested-by: Reinette Chatre Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Shanker Donthineni (backported from commit 8ccb1f8fa6a3dfde32cf33e7ded3558014e6cca2) [fenghuay: fix minor conflicts in __check_limbo()] Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 6 +++--- arch/x86/kernel/cpu/resctrl/monitor.c | 2 +- fs/resctrl/internal.h | 4 ++++ fs/resctrl/monitor.c | 14 ++++++++++---- include/linux/resctrl.h | 7 +++++-- 5 files changed, 23 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index e27c26be74a47..fc6281f5812f5 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -909,15 +909,15 @@ static __init bool get_rdt_mon_resources(void) bool ret = false; if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { - resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false, 0); + resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false, 0, 
NULL); ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { - resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0); + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0, NULL); ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { - resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false, 0); + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false, 0, NULL); ret = true; } if (rdt_cpu_has(X86_FEATURE_ABMC)) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 2d1453c905bc6..2f62a834787d1 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -240,7 +240,7 @@ static u64 get_corrected_val(struct rdt_resource *r, struct rdt_l3_mon_domain *d int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 unused, u32 rmid, enum resctrl_event_id eventid, - u64 *val, void *ignored) + void *arch_priv, u64 *val, void *ignored) { struct rdt_hw_l3_mon_domain *hw_dom; struct rdt_l3_mon_domain *d; diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 6e391118cf4ea..706310a5dcfaa 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -67,6 +67,9 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) * @binary_bits: number of fixed-point binary bits from architecture, * only valid if @is_floating_point is true * @enabled: true if the event is enabled + * @arch_priv: Architecture private data for this event. + * The @arch_priv provided by the architecture via + * resctrl_enable_mon_event(). 
*/ struct mon_evt { enum resctrl_event_id evtid; @@ -78,6 +81,7 @@ struct mon_evt { bool is_floating_point; unsigned int binary_bits; bool enabled; + void *arch_priv; }; extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 40215d1135868..d46ff736db5a8 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -156,9 +156,11 @@ void __check_limbo(struct rdt_l3_mon_domain *d, bool force_free) struct rmid_entry *entry; u32 idx, cur_idx = 1; void *arch_mon_ctx; + void *arch_priv; bool rmid_dirty; u64 val = 0; + arch_priv = mon_event_all[QOS_L3_OCCUP_EVENT_ID].arch_priv; arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); if (IS_ERR(arch_mon_ctx)) { pr_warn_ratelimited("Failed to allocate monitor context: %ld", @@ -181,7 +183,7 @@ void __check_limbo(struct rdt_l3_mon_domain *d, bool force_free) if (!entry) break; if (resctrl_arch_rmid_read(r, &d->hdr, entry->closid, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val, + QOS_L3_OCCUP_EVENT_ID, arch_priv, &val, arch_mon_ctx)) { rmid_dirty = true; } else { @@ -483,7 +485,8 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) rr->evt->evtid, &tval); else rr->err = resctrl_arch_rmid_read(rr->r, rr->hdr, closid, rmid, - rr->evt->evtid, &tval, rr->arch_mon_ctx); + rr->evt->evtid, rr->evt->arch_priv, + &tval, rr->arch_mon_ctx); if (rr->err) return rr->err; @@ -528,7 +531,8 @@ static int __l3_mon_event_count_sum(struct rdtgroup *rdtgrp, struct rmid_read *r if (d->ci_id != rr->ci->id) continue; err = resctrl_arch_rmid_read(rr->r, &d->hdr, closid, rmid, - rr->evt->evtid, &tval, rr->arch_mon_ctx); + rr->evt->evtid, rr->evt->arch_priv, + &tval, rr->arch_mon_ctx); if (!err) { rr->val += tval; ret = 0; @@ -1023,7 +1027,8 @@ struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { MON_EVENT(PMT_EVENT_UOPS_RETIRED, "uops_retired", RDT_RESOURCE_PERF_PKG, false), }; -void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, 
unsigned int binary_bits) +void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, + unsigned int binary_bits, void *arch_priv) { if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS || binary_bits > MAX_BINARY_BITS)) @@ -1039,6 +1044,7 @@ void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, unsig mon_event_all[eventid].any_cpu = any_cpu; mon_event_all[eventid].binary_bits = binary_bits; + mon_event_all[eventid].arch_priv = arch_priv; mon_event_all[eventid].enabled = true; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index e3953387b3ff6..3004573c7df83 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -456,7 +456,7 @@ u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, - unsigned int binary_bits); + unsigned int binary_bits, void *arch_priv); bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); @@ -575,6 +575,9 @@ void resctrl_arch_pre_mount(void); * only. * @rmid: rmid of the counter to read. * @eventid: eventid to read, e.g. L3 occupancy. + * @arch_priv: Architecture private data for this event. + * The @arch_priv provided by the architecture via + * resctrl_enable_mon_event(). * @val: result of the counter read in bytes. 
* @arch_mon_ctx: An architecture specific value from * resctrl_arch_mon_ctx_alloc(), for MPAM this identifies @@ -592,7 +595,7 @@ void resctrl_arch_pre_mount(void); */ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 closid, u32 rmid, enum resctrl_event_id eventid, - u64 *val, void *arch_mon_ctx); + void *arch_priv, u64 *val, void *arch_mon_ctx); /** * resctrl_arch_rmid_read_context_check() - warn about invalid contexts From 62e408c04f3824346407db9bc89e52b6a7576b1f Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:06 -0800 Subject: [PATCH 31/49] x86/resctrl: Find and enable usable telemetry events Every event group has a private copy of the data of all telemetry event aggregators (aka "telemetry regions") tracking its feature type. Included may be regions that have the same feature type but tracking different GUID from the event group's. Traverse the event group's telemetry region data and mark all regions that are not usable by the event group as unusable by clearing those regions' MMIO addresses. A region is considered unusable if: 1) GUID does not match the GUID of the event group. 2) Package ID is invalid. 3) The enumerated size of the MMIO region does not match the expected value from the XML description file. Hereafter any telemetry region with an MMIO address is considered valid for the event group it is associated with. Enable all the event group's events as long as there is at least one usable region from where data for its events can be read. Enabling of an event can fail if the same event has already been enabled as part of another event group. It should never happen that the same event is described by different GUID supported by the same system so just WARN (via resctrl_enable_mon_event()) and skip the event. Note that it is architecturally possible that some telemetry events are only supported by a subset of the packages in the system. It is not expected that systems will ever do this. 
If they do the user will see event files in resctrl that always return "Unavailable". Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit 7e6df9614546ae7eb1f1b2074d7b6039bb01540d) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/intel_aet.c | 63 ++++++++++++++++++++++++- fs/resctrl/monitor.c | 10 ++-- include/linux/resctrl.h | 2 +- 3 files changed, 68 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c index 8e042b530c914..7d0bd7b070a79 100644 --- a/arch/x86/kernel/cpu/resctrl/intel_aet.c +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -16,9 +16,11 @@ #include #include #include +#include #include #include #include +#include #include #include "internal.h" @@ -110,12 +112,69 @@ static struct event_group *known_event_groups[] = { _peg < &known_event_groups[ARRAY_SIZE(known_event_groups)]; \ _peg++) -/* Stub for now */ -static bool enable_events(struct event_group *e, struct pmt_feature_group *p) +static bool skip_telem_region(struct telemetry_region *tr, struct event_group *e) { + if (tr->guid != e->guid) + return true; + if (tr->plat_info.package_id >= topology_max_packages()) { + pr_warn("Bad package %u in guid 0x%x\n", tr->plat_info.package_id, + tr->guid); + return true; + } + if (tr->size != e->mmio_size) { + pr_warn("MMIO space wrong size (%zu bytes) for guid 0x%x. 
Expected %zu bytes.\n", + tr->size, e->guid, e->mmio_size); + return true; + } + return false; } +static bool group_has_usable_regions(struct event_group *e, struct pmt_feature_group *p) +{ + bool usable_regions = false; + + for (int i = 0; i < p->count; i++) { + if (skip_telem_region(&p->regions[i], e)) { + /* + * Clear the address field of regions that did not pass the checks in + * skip_telem_region() so they will not be used by intel_aet_read_event(). + * This is safe to do because intel_pmt_get_regions_by_feature() allocates + * a new pmt_feature_group structure to return to each caller and only makes + * use of the pmt_feature_group::kref field when intel_pmt_put_feature_group() + * returns the structure. + */ + p->regions[i].addr = NULL; + + continue; + } + usable_regions = true; + } + + return usable_regions; +} + +static bool enable_events(struct event_group *e, struct pmt_feature_group *p) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl; + int skipped_events = 0; + + if (!group_has_usable_regions(e, p)) + return false; + + for (int j = 0; j < e->num_events; j++) { + if (!resctrl_enable_mon_event(e->evts[j].id, true, + e->evts[j].bin_bits, &e->evts[j])) + skipped_events++; + } + if (e->num_events == skipped_events) { + pr_info("No events enabled in %s %s:0x%x\n", r->name, e->pfname, e->guid); + return false; + } + + return true; +} + static enum pmt_feature_id lookup_pfid(const char *pfname) { if (!strcmp(pfname, "energy")) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index d46ff736db5a8..1bb7c60a84516 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1027,25 +1027,27 @@ struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { MON_EVENT(PMT_EVENT_UOPS_RETIRED, "uops_retired", RDT_RESOURCE_PERF_PKG, false), }; -void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, +bool resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, unsigned int binary_bits, void *arch_priv) { 
if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS || binary_bits > MAX_BINARY_BITS)) - return; + return false; if (mon_event_all[eventid].enabled) { pr_warn("Duplicate enable for event %d\n", eventid); - return; + return false; } if (binary_bits && !mon_event_all[eventid].is_floating_point) { pr_warn("Event %d may not be floating point\n", eventid); - return; + return false; } mon_event_all[eventid].any_cpu = any_cpu; mon_event_all[eventid].binary_bits = binary_bits; mon_event_all[eventid].arch_priv = arch_priv; mon_event_all[eventid].enabled = true; + + return true; } bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 3004573c7df83..6ada78d6d441f 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -455,7 +455,7 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); -void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, +bool resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, unsigned int binary_bits, void *arch_priv); bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); From 826e8b6201be68e1bbe2f078cf7cbda66060abdc Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:07 -0800 Subject: [PATCH 32/49] x86/resctrl: Read telemetry events Introduce intel_aet_read_event() to read telemetry events for resource RDT_RESOURCE_PERF_PKG. There may be multiple aggregators tracking each package, so scan all of them and add up all counters. Aggregators may return an invalid data indication if they have received no records for a given RMID. The user will see "Unavailable" if none of the aggregators on a package provide valid counts. Resctrl now uses readq() so depends on X86_64. Update Kconfig. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit 51541f6ca7718d8278e12fe80af80033268743b2) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- arch/x86/Kconfig | 2 +- arch/x86/kernel/cpu/resctrl/intel_aet.c | 51 +++++++++++++++++++++++++ arch/x86/kernel/cpu/resctrl/internal.h | 5 +++ arch/x86/kernel/cpu/resctrl/monitor.c | 4 ++ fs/resctrl/monitor.c | 14 +++++++ 5 files changed, 75 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 75766f3da5ac4..61d86219d4719 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -527,7 +527,7 @@ config X86_CPU_RESCTRL config X86_CPU_RESCTRL_INTEL_AET bool "Intel Application Energy Telemetry" - depends on X86_CPU_RESCTRL && CPU_SUP_INTEL && INTEL_PMT_TELEMETRY=y && INTEL_TPMI=y + depends on X86_64 && X86_CPU_RESCTRL && CPU_SUP_INTEL && INTEL_PMT_TELEMETRY=y && INTEL_TPMI=y help Enable per-RMID telemetry events in resctrl. diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c index 7d0bd7b070a79..96d627e2c52d0 100644 --- a/arch/x86/kernel/cpu/resctrl/intel_aet.c +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -11,11 +11,15 @@ #define pr_fmt(fmt) "resctrl: " fmt +#include #include +#include #include +#include #include #include #include +#include #include #include #include @@ -232,3 +236,50 @@ void __exit intel_aet_exit(void) } } } + +#define DATA_VALID BIT_ULL(63) +#define DATA_BITS GENMASK_ULL(62, 0) + +/* + * Read counter for an event on a domain (summing all aggregators on the + * domain). If an aggregator hasn't received any data for a specific RMID, + * the MMIO read indicates that data is not valid. Return success if at + * least one aggregator has valid data. 
+ */ +int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val) +{ + struct pmt_event *pevt = arch_priv; + struct event_group *e; + bool valid = false; + u64 total = 0; + u64 evtcount; + void *pevt0; + u32 idx; + + pevt0 = pevt - pevt->idx; + e = container_of(pevt0, struct event_group, evts); + idx = rmid * e->num_events; + idx += pevt->idx; + + if (idx * sizeof(u64) + sizeof(u64) > e->mmio_size) { + pr_warn_once("MMIO index %u out of range\n", idx); + return -EIO; + } + + for (int i = 0; i < e->pfg->count; i++) { + if (!e->pfg->regions[i].addr) + continue; + if (e->pfg->regions[i].plat_info.package_id != domid) + continue; + evtcount = readq(e->pfg->regions[i].addr + idx * sizeof(u64)); + if (!(evtcount & DATA_VALID)) + continue; + total += evtcount & DATA_BITS; + valid = true; + } + + if (valid) + *val = total; + + return valid ? 0 : -EINVAL; +} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index ad8b787827abf..7e8b40f85373b 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -220,9 +220,14 @@ void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r); #ifdef CONFIG_X86_CPU_RESCTRL_INTEL_AET bool intel_aet_get_events(void); void __exit intel_aet_exit(void); +int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val); #else static inline bool intel_aet_get_events(void) { return false; } static inline void __exit intel_aet_exit(void) { } +static inline int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val) +{ + return -EINVAL; +} #endif #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 2f62a834787d1..c10e1ae5ee8a7 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -251,6 +251,10 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, int ret; 
resctrl_arch_rmid_read_context_check(); + + if (r->rid == RDT_RESOURCE_PERF_PKG) + return intel_aet_read_event(hdr->id, rmid, arch_priv, val); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return -EINVAL; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 1bb7c60a84516..2f436c868e87c 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -554,6 +554,20 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) return __l3_mon_event_count(rdtgrp, rr); else return __l3_mon_event_count_sum(rdtgrp, rr); + case RDT_RESOURCE_PERF_PKG: { + u64 tval = 0; + + rr->err = resctrl_arch_rmid_read(rr->r, rr->hdr, rdtgrp->closid, + rdtgrp->mon.rmid, rr->evt->evtid, + rr->evt->arch_priv, + &tval, rr->arch_mon_ctx); + if (rr->err) + return rr->err; + + rr->val += tval; + + return 0; + } default: rr->err = -EINVAL; return -EINVAL; From dd474e1ab6459d27a86eb130083a40cc82586a1c Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:08 -0800 Subject: [PATCH 33/49] fs/resctrl: Refactor mkdir_mondata_subdir() Population of a monitor group's mon_data directory is unreasonably complicated because of the support for Sub-NUMA Cluster (SNC) mode. Split out the SNC code into a helper function to make it easier to add support for a new telemetry resource. Move all the duplicated code to make and set owner of domain directories into the mon_add_all_files() helper and rename to _mkdir_mondata_subdir(). 
Suggested-by: Reinette Chatre Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit 0ec1db4cac8239bb32da87586c3638200b65dd8c) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- fs/resctrl/rdtgroup.c | 108 +++++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 50 deletions(-) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index f1dfeb42d2e4b..b0dac91bb5537 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -3453,57 +3453,65 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, } } -static int mon_add_all_files(struct kernfs_node *kn, struct rdt_domain_hdr *hdr, - struct rdt_resource *r, struct rdtgroup *prgrp, - bool do_sum) +/* + * Create a directory for a domain and populate it with monitor files. Create + * summing monitors when @hdr is NULL. No need to initialize summing monitors. + */ +static struct kernfs_node *_mkdir_mondata_subdir(struct kernfs_node *parent_kn, char *name, + struct rdt_domain_hdr *hdr, + struct rdt_resource *r, + struct rdtgroup *prgrp, int domid) { - struct rdt_l3_mon_domain *d; struct rmid_read rr = {0}; + struct kernfs_node *kn; struct mon_data *priv; struct mon_evt *mevt; - int ret, domid; + int ret; - if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) - return -EINVAL; + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return kn; + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; - d = container_of(hdr, struct rdt_l3_mon_domain, hdr); for_each_mon_event(mevt) { if (mevt->rid != r->rid || !mevt->enabled) continue; - domid = do_sum ? 
d->ci_id : d->hdr.id; - priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); - if (WARN_ON_ONCE(!priv)) - return -EINVAL; + priv = mon_get_kn_priv(r->rid, domid, mevt, !hdr); + if (WARN_ON_ONCE(!priv)) { + ret = -EINVAL; + goto out_destroy; + } ret = mon_addfile(kn, mevt->name, priv); if (ret) - return ret; + goto out_destroy; - if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) + if (hdr && resctrl_is_mbm_event(mevt->evtid)) mon_event_read(&rr, r, hdr, prgrp, &hdr->cpu_mask, mevt, true); } - return 0; + return kn; +out_destroy: + kernfs_remove(kn); + return ERR_PTR(ret); } -static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, - struct rdt_domain_hdr *hdr, - struct rdt_resource *r, struct rdtgroup *prgrp) +static int mkdir_mondata_subdir_snc(struct kernfs_node *parent_kn, + struct rdt_domain_hdr *hdr, + struct rdt_resource *r, struct rdtgroup *prgrp) { - struct kernfs_node *kn, *ckn; + struct kernfs_node *ckn, *kn; struct rdt_l3_mon_domain *d; char name[32]; - bool snc_mode; - int ret = 0; - - lockdep_assert_held(&rdtgroup_mutex); if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return -EINVAL; d = container_of(hdr, struct rdt_l3_mon_domain, hdr); - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? 
d->ci_id : d->hdr.id); + sprintf(name, "mon_%s_%02d", r->name, d->ci_id); kn = kernfs_find_and_get(parent_kn, name); if (kn) { /* @@ -3512,41 +3520,41 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, */ kernfs_put(kn); } else { - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + kn = _mkdir_mondata_subdir(parent_kn, name, NULL, r, prgrp, d->ci_id); if (IS_ERR(kn)) return PTR_ERR(kn); + } - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - ret = mon_add_all_files(kn, hdr, r, prgrp, snc_mode); - if (ret) - goto out_destroy; + sprintf(name, "mon_sub_%s_%02d", r->name, hdr->id); + ckn = _mkdir_mondata_subdir(kn, name, hdr, r, prgrp, hdr->id); + if (IS_ERR(ckn)) { + kernfs_remove(kn); + return PTR_ERR(ckn); } - if (snc_mode) { - sprintf(name, "mon_sub_%s_%02d", r->name, hdr->id); - ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); - if (IS_ERR(ckn)) { - ret = -EINVAL; - goto out_destroy; - } + kernfs_activate(kn); + return 0; +} - ret = rdtgroup_kn_set_ugid(ckn); - if (ret) - goto out_destroy; +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_domain_hdr *hdr, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + struct kernfs_node *kn; + char name[32]; - ret = mon_add_all_files(ckn, hdr, r, prgrp, false); - if (ret) - goto out_destroy; - } + lockdep_assert_held(&rdtgroup_mutex); + + if (r->rid == RDT_RESOURCE_L3 && r->mon_scope == RESCTRL_L3_NODE) + return mkdir_mondata_subdir_snc(parent_kn, hdr, r, prgrp); + + sprintf(name, "mon_%s_%02d", r->name, hdr->id); + kn = _mkdir_mondata_subdir(parent_kn, name, hdr, r, prgrp, hdr->id); + if (IS_ERR(kn)) + return PTR_ERR(kn); kernfs_activate(kn); return 0; - -out_destroy: - kernfs_remove(kn); - return ret; } /* From c9b4ae20cf1bd634ded576fbde397b7cd20611f5 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:09 -0800 Subject: [PATCH 34/49] fs/resctrl: Refactor rmdir_mondata_subdir_allrdtgrp() Clearing a monitor group's mon_data 
directory is complicated because of the support for Sub-NUMA Cluster (SNC) mode. Refactor the SNC case into a helper function to make it easier to add support for a new telemetry resource. Suggested-by: Reinette Chatre Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit 93d9fd89995181d7ff420752328cc8b4b228f100) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- fs/resctrl/rdtgroup.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index b0dac91bb5537..30f01c4195c1b 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -3422,28 +3422,24 @@ static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subn } /* - * Remove all subdirectories of mon_data of ctrl_mon groups - * and monitor groups for the given domain. - * Remove files and directories containing "sum" of domain data - * when last domain being summed is removed. + * Remove files and directories for one SNC node. If it is the last node + * sharing an L3 cache, then remove the upper level directory containing + * the "sum" files too. */ -static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_domain_hdr *hdr) +static void rmdir_mondata_subdir_allrdtgrp_snc(struct rdt_resource *r, + struct rdt_domain_hdr *hdr) { struct rdtgroup *prgrp, *crgrp; struct rdt_l3_mon_domain *d; char subname[32]; - bool snc_mode; char name[32]; if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return; d = container_of(hdr, struct rdt_l3_mon_domain, hdr); - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? 
d->ci_id : hdr->id); - if (snc_mode) - sprintf(subname, "mon_sub_%s_%02d", r->name, hdr->id); + sprintf(name, "mon_%s_%02d", r->name, d->ci_id); + sprintf(subname, "mon_sub_%s_%02d", r->name, hdr->id); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); @@ -3453,6 +3449,30 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, } } +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups for the given domain. + */ +static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain_hdr *hdr) +{ + struct rdtgroup *prgrp, *crgrp; + char name[32]; + + if (r->rid == RDT_RESOURCE_L3 && r->mon_scope == RESCTRL_L3_NODE) { + rmdir_mondata_subdir_allrdtgrp_snc(r, hdr); + return; + } + + sprintf(name, "mon_%s_%02d", r->name, hdr->id); + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); + } +} + /* * Create a directory for a domain and populate it with monitor files. Create * summing monitors when @hdr is NULL. No need to initialize summing monitors. From e09285f9e81686590b83fbc54f80663373e54085 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:10 -0800 Subject: [PATCH 35/49] x86,fs/resctrl: Handle domain creation/deletion for RDT_RESOURCE_PERF_PKG The L3 resource has several requirements for domains. There are per-domain structures that hold the 64-bit values of counters, and elements to keep track of the overflow and limbo threads. None of these are needed for the PERF_PKG resource. The hardware counters are wide enough that they do not wrap around for decades. Define a new rdt_perf_pkg_mon_domain structure which just consists of the standard rdt_domain_hdr to keep track of domain id and CPU mask. 
Update resctrl_online_mon_domain() for RDT_RESOURCE_PERF_PKG. The only action needed for this resource is to create and populate domain directories if a domain is added while resctrl is mounted. Similarly resctrl_offline_mon_domain() only needs to remove domain directories. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit f4e0cd80d3e7c31327459008b01d63804838a89d) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 17 +++++++++++++++ arch/x86/kernel/cpu/resctrl/intel_aet.c | 29 +++++++++++++++++++++++++ arch/x86/kernel/cpu/resctrl/internal.h | 13 +++++++++++ fs/resctrl/rdtgroup.c | 17 ++++++++++----- 4 files changed, 71 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index fc6281f5812f5..9d9ea0bf2802b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -575,6 +575,10 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) if (!hdr) l3_mon_domain_setup(cpu, id, r, add_pos); break; + case RDT_RESOURCE_PERF_PKG: + if (!hdr) + intel_aet_mon_domain_setup(cpu, id, r, add_pos); + break; default: pr_warn_once("Unknown resource rid=%d\n", r->rid); break; @@ -674,6 +678,19 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) l3_mon_domain_free(hw_dom); break; } + case RDT_RESOURCE_PERF_PKG: { + struct rdt_perf_pkg_mon_domain *pkgd; + + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_PERF_PKG)) + return; + + pkgd = container_of(hdr, struct rdt_perf_pkg_mon_domain, hdr); + resctrl_offline_mon_domain(r, hdr); + list_del_rcu(&hdr->list); + synchronize_rcu(); + kfree(pkgd); + break; + } default: pr_warn_once("Unknown resource rid=%d\n", r->rid); break; diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c 
b/arch/x86/kernel/cpu/resctrl/intel_aet.c index 96d627e2c52d0..9351fe5b645af 100644 --- a/arch/x86/kernel/cpu/resctrl/intel_aet.c +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -14,15 +14,20 @@ #include #include #include +#include #include #include +#include #include #include #include #include #include +#include +#include #include #include +#include #include #include #include @@ -283,3 +288,27 @@ int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val) return valid ? 0 : -EINVAL; } + +void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r, + struct list_head *add_pos) +{ + struct rdt_perf_pkg_mon_domain *d; + int err; + + d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu)); + if (!d) + return; + + d->hdr.id = id; + d->hdr.type = RESCTRL_MON_DOMAIN; + d->hdr.rid = RDT_RESOURCE_PERF_PKG; + cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + list_add_tail_rcu(&d->hdr.list, add_pos); + + err = resctrl_online_mon_domain(r, &d->hdr); + if (err) { + list_del_rcu(&d->hdr.list); + synchronize_rcu(); + kfree(d); + } +} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 7e8b40f85373b..d1642d820bf90 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -84,6 +84,14 @@ static inline struct rdt_hw_l3_mon_domain *resctrl_to_arch_mon_dom(struct rdt_l3 return container_of(r, struct rdt_hw_l3_mon_domain, d_resctrl); } +/** + * struct rdt_perf_pkg_mon_domain - CPUs sharing an package scoped resctrl monitor resource + * @hdr: common header for different domain types + */ +struct rdt_perf_pkg_mon_domain { + struct rdt_domain_hdr hdr; +}; + /** * struct msr_param - set a range of MSRs from a domain * @res: The resource to use @@ -221,6 +229,8 @@ void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r); bool intel_aet_get_events(void); void __exit intel_aet_exit(void); int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val); +void 
intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r, + struct list_head *add_pos); #else static inline bool intel_aet_get_events(void) { return false; } static inline void __exit intel_aet_exit(void) { } @@ -228,6 +238,9 @@ static inline int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 { return -EINVAL; } + +static inline void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r, + struct list_head *add_pos) { } #endif #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 30f01c4195c1b..07fa1b756dad7 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -4503,11 +4503,6 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h mutex_lock(&rdtgroup_mutex); - if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) - goto out_unlock; - - d = container_of(hdr, struct rdt_l3_mon_domain, hdr); - /* * If resctrl is mounted, remove all the * per domain monitor data directories. 
@@ -4515,6 +4510,13 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h if (resctrl_mounted && resctrl_arch_mon_capable()) rmdir_mondata_subdir_allrdtgrp(r, hdr); + if (r->rid != RDT_RESOURCE_L3) + goto out_unlock; + + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + goto out_unlock; + + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); if (resctrl_is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && has_busy_rmid(d)) { @@ -4611,6 +4613,9 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr mutex_lock(&rdtgroup_mutex); + if (r->rid != RDT_RESOURCE_L3) + goto mkdir; + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) goto out_unlock; @@ -4628,6 +4633,8 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); +mkdir: + err = 0; /* * If the filesystem is not mounted then only the default resource group * exists. Creation of its directories is deferred until mount time From 568c9cb8ec9f108b756f79a70926ca4399258fc6 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:11 -0800 Subject: [PATCH 36/49] x86/resctrl: Add energy/perf choices to rdt boot option Legacy resctrl features are enumerated by X86_FEATURE_* flags. These may be overridden by quirks to disable features in the case of errata. Users can use kernel command line options to either disable a feature, or to force enable a feature that was disabled by a quirk. A different approach is needed for hardware features that do not have an X86_FEATURE_* flag. Update parsing of the "rdt=" boot parameter to call the telemetry driver directly to handle new "perf" and "energy" options that controls activation of telemetry monitoring of the named type. 
By itself a "perf" or "energy" option controls the forced enabling or disabling (with ! prefix) of all event groups of the named type. A ":guid" suffix allows for fine grained control per event group. [ bp: s/intel_aet_option/intel_handle_aet_option/g ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Shanker Donthineni (backported from commit 842e7f97d71a4116a650ec0045d6444b4377b512) [fenghuay: fix a minor conflict in kernel-parameters.txt doc] Signed-off-by: Fenghua Yu --- .../admin-guide/kernel-parameters.txt | 6 +++ arch/x86/kernel/cpu/resctrl/core.c | 2 + arch/x86/kernel/cpu/resctrl/intel_aet.c | 38 +++++++++++++++++++ arch/x86/kernel/cpu/resctrl/internal.h | 2 + 4 files changed, 48 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 44a70e1ab59a2..6cd934ae385d6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6174,8 +6174,14 @@ Turn on/off individual RDT features. List is: cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp, mba, smba, bmec, abmc. + mba, smba, bmec, abmc, sdciae, energy[:guid], + perf[:guid]. E.g. 
to turn on cmt and turn off mba use: rdt=cmt,!mba + To turn off all energy telemetry monitoring and ensure that + perf telemetry monitoring associated with guid 0x12345 + is enabled use: + rdt=!energy,perf:0x12345 reboot= [KNL] Format (x86 or x86_64): diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 9d9ea0bf2802b..3c29a4d133c8d 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -807,6 +807,8 @@ static int __init set_rdt_options(char *str) force_off = *tok == '!'; if (force_off) tok++; + if (intel_handle_aet_option(force_off, tok)) + continue; for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { if (strcmp(tok, o->name) == 0) { if (force_off) diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c index 9351fe5b645af..dc25e8d2527dc 100644 --- a/arch/x86/kernel/cpu/resctrl/intel_aet.c +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -52,12 +52,17 @@ struct pmt_event { /** * struct event_group - Events with the same feature type ("energy" or "perf") and GUID. * @pfname: PMT feature name ("energy" or "perf") of this event group. + * Used by boot rdt= option. * @pfg: Points to the aggregated telemetry space information * returned by the intel_pmt_get_regions_by_feature() * call to the INTEL_PMT_TELEMETRY driver that contains * data for all telemetry regions of type @pfname. * Valid if the system supports the event group, * NULL otherwise. + * @force_off: True when "rdt" command line or architecture code disables + * this event group. + * @force_on: True when "rdt" command line overrides disable of this + * event group. * @guid: Unique number per XML description file. * @mmio_size: Number of bytes of MMIO registers for this group. * @num_events: Number of events in this group. @@ -67,6 +72,7 @@ struct event_group { /* Data fields for additional structures to manage this group. 
*/ const char *pfname; struct pmt_feature_group *pfg; + bool force_off, force_on; /* Remaining fields initialized from XML file. */ u32 guid; @@ -121,6 +127,35 @@ static struct event_group *known_event_groups[] = { _peg < &known_event_groups[ARRAY_SIZE(known_event_groups)]; \ _peg++) +bool intel_handle_aet_option(bool force_off, char *tok) +{ + struct event_group **peg; + bool ret = false; + u32 guid = 0; + char *name; + + if (!tok) + return false; + + name = strsep(&tok, ":"); + if (tok && kstrtou32(tok, 16, &guid)) + return false; + + for_each_event_group(peg) { + if (strcmp(name, (*peg)->pfname)) + continue; + if (guid && (*peg)->guid != guid) + continue; + if (force_off) + (*peg)->force_off = true; + else + (*peg)->force_on = true; + ret = true; + } + + return ret; +} + static bool skip_telem_region(struct telemetry_region *tr, struct event_group *e) { if (tr->guid != e->guid) @@ -168,6 +203,9 @@ static bool enable_events(struct event_group *e, struct pmt_feature_group *p) struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl; int skipped_events = 0; + if (e->force_off) + return false; + if (!group_has_usable_regions(e, p)) return false; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index d1642d820bf90..eb923cd978e08 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -231,6 +231,7 @@ void __exit intel_aet_exit(void); int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val); void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct list_head *add_pos); +bool intel_handle_aet_option(bool force_off, char *tok); #else static inline bool intel_aet_get_events(void) { return false; } static inline void __exit intel_aet_exit(void) { } @@ -241,6 +242,7 @@ static inline int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 static inline void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource 
*r, struct list_head *add_pos) { } +static inline bool intel_handle_aet_option(bool force_off, char *tok) { return false; } #endif #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ From 04d5b8138de75b47d3a46f1276aaa7d96d858440 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:12 -0800 Subject: [PATCH 37/49] x86/resctrl: Handle number of RMIDs supported by RDT_RESOURCE_PERF_PKG There are now three meanings for "number of RMIDs": 1) The number for legacy features enumerated by CPUID leaf 0xF. This is the maximum number of distinct values that can be loaded into MSR_IA32_PQR_ASSOC. Note that systems with Sub-NUMA Cluster mode enabled will force scaling down the CPUID enumerated value by the number of SNC nodes per L3-cache. 2) The number of registers in MMIO space for each event. This is enumerated in the XML files and is the value initialized into event_group::num_rmid. 3) The number of "hardware counters" (this isn't a strictly accurate description of how things work, but serves as a useful analogy that does describe the limitations) feeding to those MMIO registers. This is enumerated in telemetry_region::num_rmids returned by intel_pmt_get_regions_by_feature(). Event groups with insufficient "hardware counters" to track all RMIDs are difficult for users to use, since the system may reassign "hardware counters" at any time. This means that users cannot reliably collect two consecutive event counts to compute the rate at which events are occurring. Disable such event groups by default. The user may override this with a command line "rdt=" option. In this case limit an under-resourced event group's number of possible monitor resource groups to the lowest number of "hardware counters". 
Scan all enabled event groups and assign the RDT_RESOURCE_PERF_PKG resource "num_rmid" value to the smallest of these values as this value will be used later to compare against the number of RMIDs supported by other resources to determine how many monitoring resource groups are supported. N.B. Change type of resctrl_mon::num_rmid to u32 to match its usage and the type of event_group::num_rmid so that min(r->num_rmid, e->num_rmid) won't complain about mixing signed and unsigned types. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit 67640e333b983298be624a41c43e3a8ed4713a73) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/intel_aet.c | 53 ++++++++++++++++++++++++- fs/resctrl/rdtgroup.c | 2 +- include/linux/resctrl.h | 2 +- 3 files changed, 54 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c index dc25e8d2527dc..aba9971350031 100644 --- a/arch/x86/kernel/cpu/resctrl/intel_aet.c +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -60,10 +61,14 @@ struct pmt_event { * Valid if the system supports the event group, * NULL otherwise. * @force_off: True when "rdt" command line or architecture code disables - * this event group. + * this event group due to insufficient RMIDs. * @force_on: True when "rdt" command line overrides disable of this * event group. * @guid: Unique number per XML description file. + * @num_rmid: Number of RMIDs supported by this group. May be + * adjusted downwards if enumeration from + * intel_pmt_get_regions_by_feature() indicates fewer + * RMIDs can be tracked simultaneously. * @mmio_size: Number of bytes of MMIO registers for this group. * @num_events: Number of events in this group. 
* @evts: Array of event descriptors. @@ -76,6 +81,7 @@ struct event_group { /* Remaining fields initialized from XML file. */ u32 guid; + u32 num_rmid; size_t mmio_size; unsigned int num_events; struct pmt_event evts[] __counted_by(num_events); @@ -90,6 +96,7 @@ struct event_group { static struct event_group energy_0x26696143 = { .pfname = "energy", .guid = 0x26696143, + .num_rmid = 576, .mmio_size = XML_MMIO_SIZE(576, 2, 3), .num_events = 2, .evts = { @@ -104,6 +111,7 @@ static struct event_group energy_0x26696143 = { static struct event_group perf_0x26557651 = { .pfname = "perf", .guid = 0x26557651, + .num_rmid = 576, .mmio_size = XML_MMIO_SIZE(576, 7, 3), .num_events = 7, .evts = { @@ -198,6 +206,23 @@ static bool group_has_usable_regions(struct event_group *e, struct pmt_feature_g return usable_regions; } +static bool all_regions_have_sufficient_rmid(struct event_group *e, struct pmt_feature_group *p) +{ + struct telemetry_region *tr; + + for (int i = 0; i < p->count; i++) { + if (!p->regions[i].addr) + continue; + tr = &p->regions[i]; + if (tr->num_rmids < e->num_rmid) { + e->force_off = true; + return false; + } + } + + return true; +} + static bool enable_events(struct event_group *e, struct pmt_feature_group *p) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl; @@ -209,6 +234,27 @@ static bool enable_events(struct event_group *e, struct pmt_feature_group *p) if (!group_has_usable_regions(e, p)) return false; + /* + * Only enable event group with insufficient RMIDs if the user requested + * it from the kernel command line. 
+ */ + if (!all_regions_have_sufficient_rmid(e, p) && !e->force_on) { + pr_info("%s %s:0x%x monitoring not enabled due to insufficient RMIDs\n", + r->name, e->pfname, e->guid); + return false; + } + + for (int i = 0; i < p->count; i++) { + if (!p->regions[i].addr) + continue; + /* + * e->num_rmid only adjusted lower if user (via rdt= kernel + * parameter) forces an event group with insufficient RMID + * to be enabled. + */ + e->num_rmid = min(e->num_rmid, p->regions[i].num_rmids); + } + for (int j = 0; j < e->num_events; j++) { if (!resctrl_enable_mon_event(e->evts[j].id, true, e->evts[j].bin_bits, &e->evts[j])) @@ -219,6 +265,11 @@ static bool enable_events(struct event_group *e, struct pmt_feature_group *p) return false; } + if (r->mon.num_rmid) + r->mon.num_rmid = min(r->mon.num_rmid, e->num_rmid); + else + r->mon.num_rmid = e->num_rmid; + return true; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 07fa1b756dad7..54478a648b482 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1234,7 +1234,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of, { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - seq_printf(seq, "%d\n", r->mon.num_rmid); + seq_printf(seq, "%u\n", r->mon.num_rmid); return 0; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 6ada78d6d441f..2901cbd34459c 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -305,7 +305,7 @@ enum resctrl_schema_fmt { * events of monitor groups created via mkdir. */ struct resctrl_mon { - int num_rmid; + u32 num_rmid; unsigned int mbm_cfg_mask; int num_mbm_cntrs; bool mbm_cntr_assignable; From 897660cdb37c5a51d73cb86205683952ef6f8575 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:13 -0800 Subject: [PATCH 38/49] fs/resctrl: Move allocation/free of closid_num_dirty_rmid[] closid_num_dirty_rmid[] and rmid_ptrs[] are allocated together during resctrl initialization and freed together during resctrl exit. 
Telemetry events are enumerated on resctrl mount so only at resctrl mount will the number of RMID supported by all monitoring resources and needed as size for rmid_ptrs[] be known. Separate closid_num_dirty_rmid[] and rmid_ptrs[] allocation and free in preparation for rmid_ptrs[] to be allocated on resctrl mount. Keep the rdtgroup_mutex protection around the allocation and free of closid_num_dirty_rmid[] as ARM needs this to guarantee memory ordering. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit ee7f6af79f0916b6c49e15edd4cba020b3e4c4ac) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- fs/resctrl/monitor.c | 79 ++++++++++++++++++++++++++++---------------- 1 file changed, 51 insertions(+), 28 deletions(-) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 2f436c868e87c..3c37402f217f3 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -935,36 +935,14 @@ void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long del static int dom_data_init(struct rdt_resource *r) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - u32 num_closid = resctrl_arch_get_num_closid(r); struct rmid_entry *entry = NULL; int err = 0, i; u32 idx; mutex_lock(&rdtgroup_mutex); - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - u32 *tmp; - - /* - * If the architecture hasn't provided a sanitised value here, - * this may result in larger arrays than necessary. Resctrl will - * use a smaller system wide value based on the resources in - * use. 
- */ - tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); - if (!tmp) { - err = -ENOMEM; - goto out_unlock; - } - - closid_num_dirty_rmid = tmp; - } rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); if (!rmid_ptrs) { - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } err = -ENOMEM; goto out_unlock; } @@ -1001,11 +979,6 @@ static void dom_data_exit(struct rdt_resource *r) if (!r->mon_capable) goto out_unlock; - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } - kfree(rmid_ptrs); rmid_ptrs = NULL; @@ -1844,6 +1817,45 @@ ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +static int closid_num_dirty_rmid_alloc(struct rdt_resource *r) +{ + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + u32 num_closid = resctrl_arch_get_num_closid(r); + u32 *tmp; + + /* For ARM memory ordering access to closid_num_dirty_rmid */ + mutex_lock(&rdtgroup_mutex); + + /* + * If the architecture hasn't provided a sanitised value here, + * this may result in larger arrays than necessary. Resctrl will + * use a smaller system wide value based on the resources in + * use. + */ + tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); + if (!tmp) { + mutex_unlock(&rdtgroup_mutex); + return -ENOMEM; + } + + closid_num_dirty_rmid = tmp; + + mutex_unlock(&rdtgroup_mutex); + } + + return 0; +} + +static void closid_num_dirty_rmid_free(void) +{ + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + mutex_lock(&rdtgroup_mutex); + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + mutex_unlock(&rdtgroup_mutex); + } +} + /** * resctrl_l3_mon_resource_init() - Initialise global monitoring structures. 
* @@ -1864,10 +1876,16 @@ int resctrl_l3_mon_resource_init(void) if (!r->mon_capable) return 0; - ret = dom_data_init(r); + ret = closid_num_dirty_rmid_alloc(r); if (ret) return ret; + ret = dom_data_init(r); + if (ret) { + closid_num_dirty_rmid_free(); + return ret; + } + if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_total_bytes_config", @@ -1910,5 +1928,10 @@ void resctrl_l3_mon_resource_exit(void) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + if (!r->mon_capable) + return; + + closid_num_dirty_rmid_free(); + dom_data_exit(r); } From 3ce2fa24448547aed2a076220f2959b49293426e Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:14 -0800 Subject: [PATCH 39/49] x86,fs/resctrl: Compute number of RMIDs as minimum across resources resctrl assumes that only the L3 resource supports monitor events, so it simply takes the rdt_resource::num_rmid from RDT_RESOURCE_L3 as the system's number of RMIDs. The addition of telemetry events in a different resource breaks that assumption. Compute the number of available RMIDs as the minimum value across all mon_capable resources (analogous to how the number of CLOSIDs is computed across alloc_capable resources). Note that mount time enumeration of the telemetry resource means that this number can be reduced. If this happens, then some memory will be wasted as the allocations for rdt_l3_mon_domain::mbm_states[] and rdt_l3_mon_domain::rmid_busy_llc created during resctrl initialization will be larger than needed. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit 0ecc988b0232259cbdb2b7e452bda74f550f0911) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 15 +++++++++++++-- fs/resctrl/rdtgroup.c | 6 ++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 3c29a4d133c8d..7fe323865a223 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -108,12 +108,23 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { }, }; +/** + * resctrl_arch_system_num_rmid_idx - Compute number of supported RMIDs + * (minimum across all mon_capable resource) + * + * Return: Number of supported RMIDs at time of call. Note that mount time + * enumeration of resources may reduce the number. + */ u32 resctrl_arch_system_num_rmid_idx(void) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + u32 num_rmids = U32_MAX; + struct rdt_resource *r; + + for_each_mon_capable_rdt_resource(r) + num_rmids = min(num_rmids, r->mon.num_rmid); /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ - return r->mon.num_rmid; + return num_rmids == U32_MAX ? 0 : num_rmids; } struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 54478a648b482..c5b81c9e107d6 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -4548,6 +4548,12 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h * During boot this may be called before global allocations have been made by * resctrl_l3_mon_resource_init(). * + * Called during CPU online that may run as soon as CPU online callbacks + * are set up during resctrl initialization. 
The number of supported RMIDs + * may be reduced if additional mon_capable resources are enumerated + * at mount time. This means the rdt_l3_mon_domain::mbm_states[] and + * rdt_l3_mon_domain::rmid_busy_llc allocations may be larger than needed. + * * Return: 0 for success, or -ENOMEM. */ static int domain_setup_l3_mon_state(struct rdt_resource *r, struct rdt_l3_mon_domain *d) From 9c75c47cbc8bd85003269aede57f6fc070706250 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:15 -0800 Subject: [PATCH 40/49] fs/resctrl: Move RMID initialization to first mount L3 monitor features are enumerated during resctrl initialization and rmid_ptrs[] that tracks all RMIDs and depends on the number of supported RMIDs is allocated during this time. Telemetry monitor features are enumerated during first resctrl mount and may support a different number of RMIDs compared to L3 monitor features. Delay allocation and initialization of rmid_ptrs[] until first mount. Since the number of RMIDs cannot change on later mounts, keep the same set of rmid_ptrs[] until resctrl_exit(). This is required because the limbo handler keeps running after resctrl is unmounted and needs to access rmid_ptrs[] as it keeps tracking busy RMIDs after unmount. 
Rename routines to match what they now do: dom_data_init() -> setup_rmid_lru_list() dom_data_exit() -> free_rmid_lru_list() Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (backported from commit d0891647fbc6e931f27517364cbc4ee1811d76db) Signed-off-by: Shanker Donthineni [fenghuay: fix minor conflicts in setup_rmid_lru_list() and dom_data_exit()] Signed-off-by: Fenghua Yu --- fs/resctrl/internal.h | 4 ++++ fs/resctrl/monitor.c | 54 ++++++++++++++++++++----------------------- fs/resctrl/rdtgroup.c | 5 ++++ 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 706310a5dcfaa..ee32965eedb5b 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -377,6 +377,10 @@ int closids_supported(void); void closid_free(int closid); +int setup_rmid_lru_list(void); + +void free_rmid_lru_list(void); + int alloc_rmid(u32 closid); void free_rmid(u32 closid, u32 rmid); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 3c37402f217f3..7ed420cc47504 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -932,20 +932,29 @@ void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long del schedule_delayed_work_on(cpu, &dom->mbm_over, delay); } -static int dom_data_init(struct rdt_resource *r) +int setup_rmid_lru_list(void) { - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); struct rmid_entry *entry = NULL; - int err = 0, i; + u32 idx_limit; u32 idx; + int i; - mutex_lock(&rdtgroup_mutex); + if (!resctrl_arch_mon_capable()) + return 0; + + /* + * Called on every mount, but the number of RMIDs cannot change + * after the first mount, so keep using the same set of rmid_ptrs[] + * until resctrl_exit(). Note that the limbo handler continues to + * access rmid_ptrs[] after resctrl is unmounted. 
+ */ + if (rmid_ptrs) + return 0; + idx_limit = resctrl_arch_system_num_rmid_idx(); rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); - if (!rmid_ptrs) { - err = -ENOMEM; - goto out_unlock; - } + if (!rmid_ptrs) + return -ENOMEM; for (i = 0; i < idx_limit; i++) { entry = &rmid_ptrs[i]; @@ -958,7 +967,7 @@ static int dom_data_init(struct rdt_resource *r) /* * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and * are always allocated. These are used for the rdtgroup_default - * control group, which will be setup later in resctrl_init(). + * control group, which was setup earlier in rdtgroup_setup_default(). */ idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, RESCTRL_RESERVED_RMID); @@ -966,23 +975,17 @@ static int dom_data_init(struct rdt_resource *r) WARN_ON_ONCE(!entry); list_del(&entry->list); -out_unlock: - mutex_unlock(&rdtgroup_mutex); - - return err; + return 0; } -static void dom_data_exit(struct rdt_resource *r) +void free_rmid_lru_list(void) { + if (!resctrl_arch_mon_capable()) + return; + mutex_lock(&rdtgroup_mutex); - - if (!r->mon_capable) - goto out_unlock; - kfree(rmid_ptrs); rmid_ptrs = NULL; - -out_unlock: mutex_unlock(&rdtgroup_mutex); } @@ -1860,7 +1863,8 @@ static void closid_num_dirty_rmid_free(void) * resctrl_l3_mon_resource_init() - Initialise global monitoring structures. * * Allocate and initialise global monitor resources that do not belong to a - * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists. + * specific domain. i.e. the closid_num_dirty_rmid[] used to find the CLOSID + * with the cleanest set of RMIDs. * Called once during boot after the struct rdt_resource's have been configured * but before the filesystem is mounted. 
* Resctrl's cpuhp callbacks may be called before this point to bring a domain @@ -1880,12 +1884,6 @@ int resctrl_l3_mon_resource_init(void) if (ret) return ret; - ret = dom_data_init(r); - if (ret) { - closid_num_dirty_rmid_free(); - return ret; - } - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_total_bytes_config", @@ -1932,6 +1930,4 @@ void resctrl_l3_mon_resource_exit(void) return; closid_num_dirty_rmid_free(); - - dom_data_exit(r); } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index c5b81c9e107d6..a1721a672ee89 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2976,6 +2976,10 @@ static int rdt_get_tree(struct fs_context *fc) goto out; } + ret = setup_rmid_lru_list(); + if (ret) + goto out; + ret = rdtgroup_setup_root(ctx); if (ret) goto out; @@ -4848,4 +4852,5 @@ void resctrl_exit(void) */ resctrl_l3_mon_resource_exit(); + free_rmid_lru_list(); } From 3e029a827c8a777eb3de3c336b200128325f11b0 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 8 Jan 2026 09:42:27 -0800 Subject: [PATCH 41/49] x86/resctrl: Enable RDT_RESOURCE_PERF_PKG Since telemetry events are enumerated on resctrl mount the RDT_RESOURCE_PERF_PKG resource is not considered "monitoring capable" during early resctrl initialization. This means that the domain list for RDT_RESOURCE_PERF_PKG is not built when the CPU hotplug notifiers are registered and run for the first time right after resctrl initialization. Mark the RDT_RESOURCE_PERF_PKG as "monitoring capable" upon successful telemetry event enumeration to ensure future CPU hotplug events include this resource and initialize its domain list for CPUs that are already online. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit 4bbfc90122e974ccbd9aa80c964413052b9519f3) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 16 ++++++++++++++++ arch/x86/kernel/cpu/resctrl/intel_aet.c | 6 ++++++ 2 files changed, 22 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 7fe323865a223..ed1d4c6e50f37 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -761,8 +761,24 @@ static int resctrl_arch_offline_cpu(unsigned int cpu) void resctrl_arch_pre_mount(void) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl; + int cpu; + if (!intel_aet_get_events()) return; + + /* + * Late discovery of telemetry events means the domains for the + * resource were not built. Do that now. 
+ */ + cpus_read_lock(); + mutex_lock(&domain_list_lock); + r->mon_capable = true; + rdt_mon_capable = true; + for_each_online_cpu(cpu) + domain_add_cpu_mon(cpu, r); + mutex_unlock(&domain_list_lock); + cpus_read_unlock(); } enum { diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c index aba9971350031..89b8b619d5d53 100644 --- a/arch/x86/kernel/cpu/resctrl/intel_aet.c +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -270,6 +270,12 @@ static bool enable_events(struct event_group *e, struct pmt_feature_group *p) else r->mon.num_rmid = e->num_rmid; + if (skipped_events) + pr_info("%s %s:0x%x monitoring detected (skipped %d events)\n", r->name, + e->pfname, e->guid, skipped_events); + else + pr_info("%s %s:0x%x monitoring detected\n", r->name, e->pfname, e->guid); + return true; } From 7aa78a79625a7d5ff38f869d71eb9f1442c0a9a2 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:19 -0800 Subject: [PATCH 42/49] x86,fs/resctrl: Update documentation for telemetry events Update resctrl filesystem documentation with the details about the resctrl files that support telemetry events. [ bp: Drop the debugfs hunk of the documentation until a better debugging solution is found. 
] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com (cherry picked from commit a8848c4b43ad00c8a18db080206e3ffa53a08b91) Signed-off-by: Shanker Donthineni Signed-off-by: Fenghua Yu --- Documentation/filesystems/resctrl.rst | 66 ++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index b9f6aa44fc4d7..c38a14331cd79 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -167,13 +167,12 @@ with respect to allocation: bandwidth percentages are directly applied to the threads running on the core -If RDT monitoring is available there will be an "L3_MON" directory +If L3 monitoring is available there will be an "L3_MON" directory with the following files: "num_rmids": - The number of RMIDs available. This is the - upper bound for how many "CTRL_MON" + "MON" - groups can be created. + The number of RMIDs supported by hardware for + L3 monitoring events. "mon_features": Lists the monitoring events if @@ -399,6 +398,24 @@ with the following files: bytes) at which a previously used LLC_occupancy counter can be considered for re-use. +If telemetry monitoring is available there will be a "PERF_PKG_MON" directory +with the following files: + +"num_rmids": + The number of RMIDs for telemetry monitoring events. + + On Intel resctrl will not enable telemetry events if the number of + RMIDs that can be tracked concurrently is lower than the total number + of RMIDs supported. Telemetry events can be force-enabled with the + "rdt=" kernel parameter, but this may reduce the number of + monitoring groups that can be created. + +"mon_features": + Lists the telemetry monitoring events that are enabled on this system. 
+ +The upper bound for how many "CTRL_MON" + "MON" can be created +is the smaller of the L3_MON and PERF_PKG_MON "num_rmids" values. + Finally, in the top level of the "info" directory there is a file named "last_cmd_status". This is reset with every "command" issued via the file system (making new directories or writing to any of the @@ -504,15 +521,40 @@ When control is enabled all CTRL_MON groups will also contain: When monitoring is enabled all MON groups will also contain: "mon_data": - This contains a set of files organized by L3 domain and by - RDT event. E.g. on a system with two L3 domains there will - be subdirectories "mon_L3_00" and "mon_L3_01". Each of these - directories have one file per event (e.g. "llc_occupancy", - "mbm_total_bytes", and "mbm_local_bytes"). In a MON group these - files provide a read out of the current value of the event for - all tasks in the group. In CTRL_MON groups these files provide - the sum for all tasks in the CTRL_MON group and all tasks in + This contains directories for each monitor domain. + + If L3 monitoring is enabled, there will be a "mon_L3_XX" directory for + each instance of an L3 cache. Each directory contains files for the enabled + L3 events (e.g. "llc_occupancy", "mbm_total_bytes", and "mbm_local_bytes"). + + If telemetry monitoring is enabled, there will be a "mon_PERF_PKG_YY" + directory for each physical processor package. Each directory contains + files for the enabled telemetry events (e.g. "core_energy". "activity", + "uops_retired", etc.) + + The info/`*`/mon_features files provide the full list of enabled + event/file names. + + "core energy" reports a floating point number for the energy (in Joules) + consumed by cores (registers, arithmetic units, TLB and L1/L2 caches) + during execution of instructions summed across all logical CPUs on a + package for the current monitoring group. + + "activity" also reports a floating point value (in Farads). 
This provides + an estimate of work done independent of the frequency that the CPUs used + for execution. + + Note that "core energy" and "activity" only measure energy/activity in the + "core" of the CPU (arithmetic units, TLB, L1 and L2 caches, etc.). They + do not include L3 cache, memory, I/O devices etc. + + All other events report decimal integer values. + + In a MON group these files provide a read out of the current value of + the event for all tasks in the group. In CTRL_MON groups these files + provide the sum for all tasks in the CTRL_MON group and all tasks in MON groups. Please see example section for more details on usage. + On systems with Sub-NUMA Cluster (SNC) enabled there are extra directories for each node (located within the "mon_L3_XX" directory for the L3 cache they occupy). These are named "mon_sub_L3_YY" From 9aee1a040a7389762cd9acb056d78af0bbda46b4 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 12 Mar 2026 01:57:21 +0000 Subject: [PATCH 43/49] NVIDIA: VR: SAUCE: arm_mpam: Fix compilation errors to adapt to resctrl L3 domain and arch API updates Upstream resctrl renamed the L3 monitor domain type and extended the arch hooks: 1. Use struct rdt_l3_mon_domain in MPAM's resctrl integration, 2. Pass struct rdt_domain_hdr * into resctrl_online_mon_domain() / resctrl_offline_mon_domain(), 3. Match the new resctrl_arch_rmid_read() prototype (header pointer + arch_priv). 4. Update resctrl_arch_cntr_read(), resctrl_arch_reset_rmid(), resctrl_arch_reset_cntr(), and resctrl_arch_config_cntr() to take struct rdt_l3_mon_domain *. 5. Call the new resctrl_enable_mon_event() signature when wiring monitor events and set mon_capable from its return value. 6. Add a no-op resctrl_arch_pre_mount() so MPAM builds with the generic resctrl mount path. 
Fixes: a42549e64ce0 ("NVIDIA: SAUCE: arm_mpam: resctrl: Add boilerplate cpuhp and domain allocation") Fixes: ae2a29c5ebb8 ("NVIDIA: SAUCE: arm_mpam: resctrl: Add support for csu counters") Fixes: 1cbc0f2c3d5d ("NVIDIA: SAUCE: arm_mpam: resctrl: Add resctrl_arch_config_cntr() for ABMC use") Fixes: dd44394e2b41 ("NVIDIA: SAUCE: arm_mpam: resctrl: Add resctrl_arch_rmid_read() and resctrl_arch_reset_rmid()") Fixes: 842967000721 ("NVIDIA: SAUCE: arm_mpam: resctrl: Add resctrl_arch_cntr_read() & resctrl_arch_reset_cntr()") Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_internal.h | 2 +- drivers/resctrl/mpam_resctrl.c | 38 ++++++++++++++++++++++----------- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 1f82a2183c1c1..d51d0ad7c8208 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -449,7 +449,7 @@ struct mpam_resctrl_dom { struct mpam_component *mon_comp[QOS_NUM_EVENTS]; struct rdt_ctrl_domain resctrl_ctrl_dom; - struct rdt_mon_domain resctrl_mon_dom; + struct rdt_l3_mon_domain resctrl_mon_dom; }; struct mpam_resctrl_res { diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 5c5e9876e5ed2..9b2e11a27c7a2 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -100,6 +100,10 @@ bool resctrl_arch_mon_capable(void) return exposed_mon_capable; } +/* + * Provide empty implementations for compilation. The features are not + * needed on MPAM platforms.
+ */ bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) { return false; @@ -117,6 +121,10 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domai { } +void resctrl_arch_pre_mount(void) +{ +} + bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid) { switch (rid) { @@ -535,9 +543,9 @@ static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component } /* MBWU when not in ABMC mode, and CSU counters. */ -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 closid, u32 rmid, enum resctrl_event_id eventid, - u64 *val, void *arch_mon_ctx) + void *arch_priv, u64 *val, void *arch_mon_ctx) { struct mpam_resctrl_dom *l3_dom; struct mpam_component *mon_comp; @@ -553,7 +561,7 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, if (eventid >= QOS_NUM_EVENTS || !mon->class) return -EINVAL; - l3_dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); + l3_dom = container_of(hdr, struct mpam_resctrl_dom, resctrl_mon_dom.hdr); mon_comp = l3_dom->mon_comp[eventid]; switch (eventid) { @@ -573,7 +581,7 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, } /* MBWU counters when in ABMC mode */ -int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, int mon_idx, enum resctrl_event_id eventid, u64 *val) { @@ -629,7 +637,7 @@ static void reset_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_compone } /* Called via IPI. Call with read_cpus_lock() held. 
*/ -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid) { struct mpam_resctrl_dom *l3_dom; @@ -650,7 +658,7 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, } /* Reset an assigned counter */ -void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, int cntr_id, enum resctrl_event_id eventid) { @@ -1278,7 +1286,7 @@ static void __config_cntr(struct mpam_resctrl_mon *mon, u32 cntr_id, mon->mbwu_idx_to_mon[mbwu_idx] = -1; } -void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign) { @@ -1453,6 +1461,9 @@ static void mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, { struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; struct rdt_resource *l3 = &res->resctrl_res; + struct rdt_resource *r; + + r = &res->resctrl_res; lockdep_assert_cpus_held(); @@ -1478,7 +1489,8 @@ static void mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, l3->name = "L3"; l3->mon_scope = RESCTRL_L3_CACHE; - resctrl_enable_mon_event(type); + if (resctrl_enable_mon_event(type, false, 0, NULL)) + r->mon_capable = true; /* * Unfortunately, num_rmid doesn't mean anything for @@ -1772,7 +1784,7 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) { int err, idx; struct mpam_resctrl_dom *dom; - struct rdt_mon_domain *mon_d; + struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; struct mpam_class *class = res->class; struct mpam_component *comp_iter, *ctrl_comp; @@ -1844,7 +1856,7 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct 
mpam_resctrl_res *res) mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, &mon_d->hdr); mon_d->hdr.type = RESCTRL_MON_DOMAIN; mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr); - err = resctrl_online_mon_domain(r, mon_d); + err = resctrl_online_mon_domain(r, &mon_d->hdr); if (err) { dom = ERR_PTR(err); goto offline_mon_hdr; } @@ -1872,7 +1884,7 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) { u32 cache_id; - struct rdt_mon_domain *mon_d; + struct rdt_l3_mon_domain *mon_d; struct mpam_resctrl_dom *dom; struct mpam_resctrl_res *l3 = &mpam_resctrl_controls[RDT_RESOURCE_L3]; @@ -1965,7 +1977,7 @@ int mpam_resctrl_offline_cpu(unsigned int cpu) int i; struct mpam_resctrl_res *res; struct mpam_resctrl_dom *dom; - struct rdt_mon_domain *mon_d; + struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; bool ctrl_dom_empty, mon_dom_empty; @@ -1996,7 +2008,7 @@ int mpam_resctrl_offline_cpu(unsigned int cpu) mon_d = &dom->resctrl_mon_dom; mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); if (mon_dom_empty) - resctrl_offline_mon_domain(&res->resctrl_res, mon_d); + resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr); } if (ctrl_dom_empty && mon_dom_empty) From cdbf3f00209f8fd338cc9141e74ca689917fea3c Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Mon, 24 Nov 2025 17:14:47 -0600 Subject: [PATCH 44/49] NVIDIA: SAUCE: arm_mpam: Avoid MSC teardown for the SW programming errors No need to destroy the MSC instance for user/admin programming errors since they do not cause any functional issues.
Signed-off-by: Shanker Donthineni (cherry picked from 316e5833ccb2ef66f50290e48c45b70bf286c8fd dev/dev-main-nvidia-pset-linux-6.19.6) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index b7643c76025e7..f107c745f2a41 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2844,6 +2844,12 @@ static irqreturn_t __mpam_irq_handler(int irq, struct mpam_msc *msc) msc->id, mpam_errcode_names[errcode], partid, pmg, ris); + /* No action is required for the MPAM programming errors */ + if ((errcode != MPAM_ERRCODE_REQ_PARTID_RANGE) && + (errcode != MPAM_ERRCODE_REQ_PMG_RANGE)) { + return IRQ_HANDLED; + } + /* Disable this interrupt. */ mpam_disable_msc_ecr(msc); From 14e7ba549a1ba4ca023884af6f6d530528399b1b Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Tue, 13 May 2025 11:44:23 -0500 Subject: [PATCH 45/49] NVIDIA: VR: SAUCE: arm_mpam: Handle CPU-less numa nodes In a NUMA system, each node may include CPUs, memory, MPAM MSC instances, or any combination thereof. Some high-end servers may have NUMA nodes that include MPAM MSC but no CPUs. In such cases, associate all possible CPUs for those MSCs. 
Signed-off-by: Shanker Donthineni (cherry picked from f902b5abf39fe10a50b7062dc9ae9d2cfc723248 dev/dev-main-nvidia-pset-linux-6.19.6) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index f107c745f2a41..a59bf8a6c6fd1 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -723,6 +723,10 @@ static int mpam_ris_get_affinity(struct mpam_msc *msc, cpumask_t *affinity, case MPAM_CLASS_MEMORY: get_cpumask_from_node_id(comp->comp_id, affinity); /* affinity may be empty for CPU-less memory nodes */ + if (cpumask_empty(affinity)) { + dev_warn_once(&msc->pdev->dev, "CPU-less numa node"); + cpumask_copy(affinity, cpu_possible_mask); + } break; case MPAM_CLASS_UNKNOWN: return 0; From 879eb1b0ad2ff1eb50cff8cb62434500cba2a208 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Mon, 24 Nov 2025 15:04:47 -0600 Subject: [PATCH 46/49] NVIDIA: VR: SAUCE: arm_mpam: Include all associated MSC components during domain setup The current MPAM driver only considers the first component associated with an online/offline CPU during domain creation and teardown. This is insufficient, as CPU-initiated traffic may traverse multiple MSCs before reaching the target, and each MSC must be programmed consistently for proper resource partitioning. Update the MPAM driver to include all components associated with a given CPU during domain setup/teardown to expose expected schemata to userspace for effective resource control. 
Signed-off-by: Shanker Donthineni (backported from 4309ce9856f87170670c9db40546d9f2fc9dbb86 dev/dev-main-nvidia-pset-linux-6.19.6) [fenghuay: In addition to the core change, this backport includes the following adaptations to bridge the gap between the 24.04 (6.17) MPAM driver and the 6.19.6 base the original was written against: - Add for_each_mpam_resctrl_control() and for_each_mpam_resctrl_mon() iteration macros (from pset c15c0662b23a2 and 4f42221abdc92) - Add MPAM_MAX_EVENT constant to bound the monitor event array - Add traffic_matches_l3() to validate that a memory-class MSC's traffic matches L3 egress topology (from pset ebc07609cda08) Remove redundant if (class->type != MPAM_CLASS_MEMORY) - Replace exposed_alloc_capable/exposed_mon_capable static bools with dynamic resctrl_arch_alloc_capable()/resctrl_arch_mon_capable() that iterate over resources - Change mpam_resctrl_offline_cpu() return type from int to void - Change mpam_resctrl_monitor_init() return type from void to int and propagate errors - Change num_rmid from mpam_pmg_max + 1 to resctrl_arch_system_num_rmid_idx() - Use guard(mutex) for domain_list_lock - Use INIT_LIST_HEAD_RCU for domain lists - Fix not found mba issue on GMEM by only checking traffic_matches_l3() in mpam_resctrl_pick_mba() on class that doesn't have NUMA node] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 3 +- drivers/resctrl/mpam_internal.h | 4 +- drivers/resctrl/mpam_resctrl.c | 553 ++++++++++++++++++++------------ fs/resctrl/internal.h | 9 +- fs/resctrl/monitor.c | 95 ++++-- fs/resctrl/rdtgroup.c | 26 +- 6 files changed, 442 insertions(+), 248 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index a59bf8a6c6fd1..3f3f31a200a09 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -726,7 +726,8 @@ static int mpam_ris_get_affinity(struct mpam_msc *msc, cpumask_t *affinity, if (cpumask_empty(affinity)) { dev_warn_once(&msc->pdev->dev, 
"CPU-less numa node"); cpumask_copy(affinity, cpu_possible_mask); - } + } else if (class->level > 3) + cpumask_copy(affinity, cpu_possible_mask); break; case MPAM_CLASS_UNKNOWN: return 0; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index d51d0ad7c8208..8a18a0c377b33 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -545,13 +545,13 @@ void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg); int mpam_resctrl_setup(void); void mpam_resctrl_exit(void); int mpam_resctrl_online_cpu(unsigned int cpu); -int mpam_resctrl_offline_cpu(unsigned int cpu); +void mpam_resctrl_offline_cpu(unsigned int cpu); void mpam_resctrl_teardown_class(struct mpam_class *class); #else static inline int mpam_resctrl_setup(void) { return 0; } static inline void mpam_resctrl_exit(void) { } static inline int mpam_resctrl_online_cpu(unsigned int cpu) { return 0; } -static inline int mpam_resctrl_offline_cpu(unsigned int cpu) { return 0; } +static inline void mpam_resctrl_offline_cpu(unsigned int cpu) { } static inline void mpam_resctrl_teardown_class(struct mpam_class *class) { } #endif /* CONFIG_RESCTRL_FS */ diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 9b2e11a27c7a2..a89f4765051d9 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -32,6 +32,11 @@ DECLARE_WAIT_QUEUE_HEAD(resctrl_mon_ctx_waiters); */ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; +#define for_each_mpam_resctrl_control(res, rid) \ + for (rid = 0, res = &mpam_resctrl_controls[rid]; \ + rid < RDT_NUM_RESOURCES; \ + rid++, res = &mpam_resctrl_controls[rid]) + /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ static DEFINE_MUTEX(domain_list_lock); @@ -43,10 +48,13 @@ static DEFINE_MUTEX(domain_list_lock); * make use of them, we pretend they are on L3. * Class pointer may be NULL. 
*/ -static struct mpam_resctrl_mon mpam_resctrl_counters[QOS_NUM_EVENTS]; +#define MPAM_MAX_EVENT QOS_L3_MBM_LOCAL_EVENT_ID +static struct mpam_resctrl_mon mpam_resctrl_counters[MPAM_MAX_EVENT + 1]; -static bool exposed_alloc_capable; -static bool exposed_mon_capable; +#define for_each_mpam_resctrl_mon(mon, eventid) \ + for (eventid = QOS_FIRST_EVENT, mon = &mpam_resctrl_counters[eventid]; \ + eventid <= MPAM_MAX_EVENT; \ + eventid++, mon = &mpam_resctrl_counters[eventid]) /* * MPAM emulates CDP by setting different PARTID in the I/D fields of MPAM0_EL1. @@ -92,12 +100,24 @@ static bool mpam_resctrl_abmc_enabled(void) bool resctrl_arch_alloc_capable(void) { - return exposed_alloc_capable; + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + for_each_mpam_resctrl_control(res, rid) { + if (res->resctrl_res.alloc_capable) + return true; + } + + return false; } bool resctrl_arch_mon_capable(void) { - return exposed_mon_capable; + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; + + /* All monitors are presented as being on the L3 cache */ + return l3->mon_capable; } /* @@ -159,18 +179,48 @@ static void resctrl_reset_task_closids(void) read_unlock(&tasklist_lock); } -static void mpam_resctrl_monitor_sync_abmc_vals(struct rdt_resource *l3) +static struct mpam_resctrl_mon *mpam_resctrl_mon_from_res(struct mpam_resctrl_res *res) +{ + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; + + if (!res->class) + return NULL; + + for_each_mpam_resctrl_mon(mon, eventid) { + if (mon->class == res->class) + return mon; + } + return NULL; +} + +static struct mpam_resctrl_res *mpam_resctrl_res_from_mon(struct mpam_resctrl_mon *mon) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + if (!mon->class) + return NULL; + + for_each_mpam_resctrl_control(res, rid) { + if (res->class == mon->class) + return res; + } + return NULL; +} + +static void 
mpam_resctrl_monitor_sync_abmc_vals(struct rdt_resource *r) { - l3->mon.num_mbm_cntrs = l3_num_allocated_mbwu; + r->mon.num_mbm_cntrs = l3_num_allocated_mbwu; if (cdp_enabled) - l3->mon.num_mbm_cntrs /= 2; + r->mon.num_mbm_cntrs /= 2; - if (l3->mon.num_mbm_cntrs) { - l3->mon.mbm_cntr_assignable = mpam_resctrl_abmc_enabled(); - l3->mon.mbm_assign_on_mkdir = mpam_resctrl_abmc_enabled(); + if (r->mon.num_mbm_cntrs) { + r->mon.mbm_cntr_assignable = mpam_resctrl_abmc_enabled(); + r->mon.mbm_assign_on_mkdir = mpam_resctrl_abmc_enabled(); } else { - l3->mon.mbm_cntr_assignable = false; - l3->mon.mbm_assign_on_mkdir = false; + r->mon.mbm_cntr_assignable = false; + r->mon.mbm_assign_on_mkdir = false; } } @@ -939,7 +989,8 @@ static bool topology_matches_l3(struct mpam_class *victim) { int cpu, err; struct mpam_component *victim_iter; - cpumask_var_t __free(free_cpumask_var) tmp_cpumask; + bool matched_once = false; + cpumask_var_t __free(free_cpumask_var) tmp_cpumask = NULL; if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) return false; @@ -953,7 +1004,10 @@ static bool topology_matches_l3(struct mpam_class *victim) return false; } - cpu = cpumask_any(&victim_iter->affinity); + cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask); + if (matched_once && (cpu >= nr_cpu_ids)) + continue; + if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) return false; @@ -975,6 +1029,60 @@ static bool topology_matches_l3(struct mpam_class *victim) return false; } + matched_once = true; + } + + return true; +} + +/* + * Test if the traffic for a class matches that at egress from the L3. For + * MSC at memory controllers this is only possible if there is a single L3 + * as otherwise the counters at the memory can include bandwidth from the + * non-local L3. 
+ */ +static bool traffic_matches_l3(struct mpam_class *class) +{ + int err, cpu; + + lockdep_assert_cpus_held(); + + if (class->type == MPAM_CLASS_CACHE && class->level == 3) + return true; + + if (class->type == MPAM_CLASS_CACHE && class->level != 3) { + pr_debug("class %u is a different cache from L3\n", class->level); + return false; + } + + cpumask_var_t __free(free_cpumask_var) tmp_cpumask = NULL; + if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) { + pr_debug("cpumask allocation failed\n"); + return false; + } + + if (class->type != MPAM_CLASS_MEMORY) { + pr_debug("class %u is neither of type cache or memory\n", + class->level); + return false; + } + + cpu = cpumask_any_and(&class->affinity, cpu_online_mask); + err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); + if (err) { + pr_debug("Failed to find L3 downstream to cpu %d\n", cpu); + return false; + } + + if (!cpumask_equal(tmp_cpumask, cpu_possible_mask)) { + pr_debug("There is more than one L3\n"); + return false; + } + + /* Be strict; the traffic might stop in the intermediate cache. 
*/ + if (get_cpu_cacheinfo_id(cpu, 4) != -1) { + pr_debug("L3 isn't the last level of cache\n"); + return false; } return true; @@ -1024,7 +1132,6 @@ static void mpam_resctrl_pick_caches(void) else res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; res->class = class; - exposed_alloc_capable = true; } if (has_cmax) { pr_debug("pick_caches: Class has CMAX\n"); @@ -1033,7 +1140,6 @@ static void mpam_resctrl_pick_caches(void) else res = &mpam_resctrl_controls[RDT_RESOURCE_L3_MAX]; res->class = class; - exposed_alloc_capable = true; } } } @@ -1065,11 +1171,19 @@ static void mpam_resctrl_pick_mba(void) continue; } - if (!topology_matches_l3(class)) { + if ((class->level == 3) && (!topology_matches_l3(class))) { pr_debug("class %u topology doesn't match L3\n", class->level); continue; } + /* Check memory at egress from L3 for MSC with L3 */ + if (!cpumask_equal(&class->affinity, cpu_possible_mask) && + !traffic_matches_l3(class)) { + pr_debug("class %u traffic doesn't match L3 egress\n", + class->level); + continue; + } + /* * mba_sc reads the mbm_local counter, and waggles the MBA controls. 
* mbm_local is implicitly part of the L3, pick a resource to be MBA @@ -1083,7 +1197,6 @@ static void mpam_resctrl_pick_mba(void) pr_debug("selected class %u to back MBA\n", candidate_class->level); res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; res->class = candidate_class; - exposed_alloc_capable = true; } } @@ -1155,7 +1268,6 @@ static void counter_update_class(enum resctrl_event_id evt_id, pr_debug("Updating event %u to use class %u\n", evt_id, class->level); mon->class = class; - exposed_mon_capable = true; if (evt_id == QOS_L3_OCCUP_EVENT_ID) return; @@ -1239,7 +1351,10 @@ static void mpam_resctrl_pick_counters(void) } has_mbwu = class_has_usable_mbwu(class); - if (has_mbwu && topology_matches_l3(class)) { + if (has_mbwu && + ((class->type == MPAM_CLASS_MEMORY) || + (topology_matches_l3(class) && + traffic_matches_l3(class)))) { pr_debug("class %u has usable MBWU, and matches L3 topology", class->level); /* @@ -1309,10 +1424,16 @@ void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain * bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) { - if (r != &mpam_resctrl_controls[RDT_RESOURCE_L3].resctrl_res) + struct mpam_resctrl_res *res; + struct mpam_resctrl_mon *mon; + + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + + mon = mpam_resctrl_mon_from_res(res); + if (!mon) return false; - return mpam_resctrl_abmc_enabled(); + return mon->assigned_counters ? 
true : false; } int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) @@ -1380,7 +1501,6 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, r->alloc_capable = true; r->schema_fmt = RESCTRL_SCHEMA_PERCENT; r->ctrl_scope = RESCTRL_L3_CACHE; - r->mba.delay_linear = true; r->mba.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = get_mba_min(cprops); @@ -1404,6 +1524,9 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) if (class->type == MPAM_CLASS_CACHE) return comp->comp_id; + if ((class->type == MPAM_CLASS_MEMORY) && (class->level > 3)) + return comp->comp_id; + if (topology_matches_l3(class)) { /* Use the corresponding L3 component ID as the domain ID */ int id = get_cpu_cacheinfo_id(cpu, 3); @@ -1427,10 +1550,10 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) */ static int mpam_resctrl_monitor_init_abmc(struct mpam_resctrl_mon *mon) { - struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct mpam_resctrl_res *res = mpam_resctrl_res_from_mon(mon); size_t array_size = resctrl_arch_system_num_rmid_idx() * sizeof(int); int *rmid_array __free(kfree) = kmalloc(array_size, GFP_KERNEL); - struct rdt_resource *l3 = &res->resctrl_res; + struct rdt_resource *r = &res->resctrl_res; struct mpam_class *class = mon->class; u16 num_mbwu_mon; @@ -1451,25 +1574,33 @@ static int mpam_resctrl_monitor_init_abmc(struct mpam_resctrl_mon *mon) return PTR_ERR(mon->assigned_counters); mon->mbwu_idx_to_mon = no_free_ptr(rmid_array); - mpam_resctrl_monitor_sync_abmc_vals(l3); + mpam_resctrl_monitor_sync_abmc_vals(r); return 0; } -static void mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, - enum resctrl_event_id type) +static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, + enum resctrl_event_id type) { - struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; - struct rdt_resource *l3 = &res->resctrl_res; + struct 
mpam_resctrl_res *res; struct rdt_resource *r; + if ((mon->class->type == MPAM_CLASS_MEMORY) && (mon->class->level > 3)) + res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; + else + res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + r = &res->resctrl_res; lockdep_assert_cpus_held(); - /* There also needs to be an L3 cache present */ + /* + * There also needs to be an L3 cache present. + * The check just requires any online CPU and it can't go offline as we + * hold the cpu lock. + */ if (get_cpu_cacheinfo_id(smp_processor_id(), 3) == -1) - return; + return 0; /* * If there are no MPAM resources on L3, force it into existence. @@ -1481,42 +1612,43 @@ static void mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, res->class = mpam_resctrl_counters[type].class; } - /* Called multiple times!, once per event type */ - if (exposed_mon_capable) { - l3->mon_capable = true; - - /* Setting name is necessary on monitor only platforms */ - l3->name = "L3"; - l3->mon_scope = RESCTRL_L3_CACHE; + /* + * Called multiple times!, once per event type that has a + * monitoring class. + * Setting name is necessary on monitor only platforms. + */ + if ((mon->class->type == MPAM_CLASS_MEMORY) && (mon->class->level > 3)) { + r->name = "MB"; + } else { + r->name = "L3"; + } + r->mon_scope = RESCTRL_L3_CACHE; - if (resctrl_enable_mon_event(type, false, 0, NULL)) - r->mon_capable = true; + /* + * num-rmid is the upper bound for the number of monitoring + * groups that can exist simultaneously, including the + * default monitoring group for each control group. Hence, + * advertise the whole rmid_idx space even though each + * control group has its own pmg/rmid space. Unfortunately, + * this does mean userspace needs to know the architecture + * to correctly interpret this value. + */ + r->mon.num_rmid = resctrl_arch_system_num_rmid_idx(); - /* - * Unfortunately, num_rmid doesn't mean anything for - * mpam, and its exposed to user-space! 
- * - * num-rmid is supposed to mean the minimum number of - * monitoring groups that can exist simultaneously, including - * the default monitoring group for each control group. - * - * For mpam, each control group has its own pmg/rmid space, so - * it is not appropriate to advertise the whole rmid_idx space - * here. But the pmgs corresponding to the parent control - * group can be allocated freely: - */ - l3->mon.num_rmid = mpam_pmg_max + 1;; + if (resctrl_enable_mon_event(type, false, 0, NULL)) + r->mon_capable = true; - switch (type) { - case QOS_L3_MBM_LOCAL_EVENT_ID: - case QOS_L3_MBM_TOTAL_EVENT_ID: - mpam_resctrl_monitor_init_abmc(mon); + switch (type) { + case QOS_L3_MBM_LOCAL_EVENT_ID: + case QOS_L3_MBM_TOTAL_EVENT_ID: + mpam_resctrl_monitor_init_abmc(mon); - return; - default: - return; - } + return 0; + default: + return 0; } + + return 0; } u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, @@ -1719,12 +1851,22 @@ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) } static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp, + enum resctrl_res_level rid, struct rdt_domain_hdr *hdr) { lockdep_assert_cpus_held(); INIT_LIST_HEAD(&hdr->list); hdr->id = mpam_resctrl_pick_domain_id(cpu, comp); + hdr->rid = rid; + cpumask_set_cpu(cpu, &hdr->cpu_mask); +} + +static void mpam_resctrl_online_domain_hdr(unsigned int cpu, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_cpus_held(); + cpumask_set_cpu(cpu, &hdr->cpu_mask); } @@ -1780,56 +1922,40 @@ static void mpam_resctrl_domain_insert(struct list_head *list, } static struct mpam_resctrl_dom * -mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) +mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res, + struct mpam_component *comp) { - int err, idx; + int err; struct mpam_resctrl_dom *dom; struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; - struct mpam_class *class = res->class; - struct mpam_component 
*comp_iter, *ctrl_comp; struct rdt_resource *r = &res->resctrl_res; lockdep_assert_held(&domain_list_lock); - ctrl_comp = NULL; - idx = srcu_read_lock(&mpam_srcu); - list_for_each_entry_srcu(comp_iter, &class->components, class_list, - srcu_read_lock_held(&mpam_srcu)) { - if (cpumask_test_cpu(cpu, &comp_iter->affinity)) { - ctrl_comp = comp_iter; - break; - } - } - srcu_read_unlock(&mpam_srcu, idx); - - /* cpu with unknown exported component? */ - if (WARN_ON_ONCE(!ctrl_comp)) - return ERR_PTR(-EINVAL); - dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu)); if (!dom) return ERR_PTR(-ENOMEM); - if (exposed_alloc_capable) { - dom->ctrl_comp = ctrl_comp; + if (resctrl_arch_alloc_capable()) { + dom->ctrl_comp = comp; ctrl_d = &dom->resctrl_ctrl_dom; - mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, &ctrl_d->hdr); + mpam_resctrl_domain_hdr_init(cpu, comp, r->rid, &ctrl_d->hdr); ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN; - mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr); err = resctrl_online_ctrl_domain(r, ctrl_d); - if (err) { - dom = ERR_PTR(err); - goto offline_ctrl_domain; - } + if (err) + goto free_domain; + + mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr); } else { pr_debug("Skipped control domain online - no controls\n"); } - if (exposed_mon_capable) { + if (resctrl_arch_mon_capable()) { int i; - struct mpam_component *mon_comp, *any_mon_comp; + struct mpam_component *any_mon_comp; + struct mpam_resctrl_mon *mon; /* * Even if the monitor domain is backed by a different component, @@ -1838,39 +1964,47 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) * Search each event's class list for a component with overlapping * CPUs and set up the dom->mon_comp array. 
*/ - for (i = 0; i < QOS_NUM_EVENTS; i++) { - struct mpam_resctrl_mon *mon; - mon = &mpam_resctrl_counters[i]; + for_each_mpam_resctrl_mon(mon, i) { + struct mpam_component *mon_comp; + if (!mon->class) continue; // dummy resource - mon_comp = find_component(mon->class, cpu); + mon_comp = comp ? comp: find_component(mon->class, cpu); dom->mon_comp[i] = mon_comp; if (mon_comp) any_mon_comp = mon_comp; } - WARN_ON_ONCE(!any_mon_comp); + if (!any_mon_comp) { + WARN_ON_ONCE(0); + err = -EFAULT; + goto offline_ctrl_domain; + } mon_d = &dom->resctrl_mon_dom; - mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, &mon_d->hdr); + mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, r->rid, &mon_d->hdr); mon_d->hdr.type = RESCTRL_MON_DOMAIN; - mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr); err = resctrl_online_mon_domain(r, &mon_d->hdr); - if (err) { - dom = ERR_PTR(err); - goto offline_mon_hdr; - } + if (err) + goto offline_ctrl_domain; + + mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr); } else { pr_debug("Skipped monitor domain online - no monitors\n"); } - goto out; -offline_mon_hdr: - mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); + return dom; + offline_ctrl_domain: - resctrl_offline_ctrl_domain(r, ctrl_d); -out: + if (resctrl_arch_alloc_capable()) { + mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + resctrl_offline_ctrl_domain(r, ctrl_d); + } +free_domain: + kfree(dom); + dom = ERR_PTR(err); + return dom; } @@ -1881,10 +2015,10 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) * This relies on mpam_resctrl_pick_domain_id() using the L3 cache-id * for anything that is not a cache. 
*/ -static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) +static struct mpam_resctrl_dom * +mpam_resctrl_get_mon_domain_from_cpu(int cpu, struct mpam_component *comp) { u32 cache_id; - struct rdt_l3_mon_domain *mon_d; struct mpam_resctrl_dom *dom; struct mpam_resctrl_res *l3 = &mpam_resctrl_controls[RDT_RESOURCE_L3]; @@ -1895,10 +2029,10 @@ static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) if (cache_id == ~0) return NULL; - list_for_each_entry(mon_d, &l3->resctrl_res.mon_domains, hdr.list) { - dom = container_of(mon_d, struct mpam_resctrl_dom, resctrl_mon_dom); - - if (mon_d->hdr.id == cache_id) + list_for_each_entry(dom, &l3->resctrl_res.mon_domains, resctrl_mon_dom.hdr.list) { + if (comp && (dom->ctrl_comp != comp)) + continue; + if (dom->resctrl_mon_dom.hdr.id == cache_id) return dom; } @@ -1919,17 +2053,17 @@ static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) * For the monitors, we need to search the list of events... */ static struct mpam_resctrl_dom * -mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) +mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res, + struct mpam_component *comp) { struct mpam_resctrl_dom *dom; - struct rdt_ctrl_domain *ctrl_d; struct rdt_resource *r = &res->resctrl_res; lockdep_assert_cpus_held(); - list_for_each_entry(ctrl_d, &r->ctrl_domains, hdr.list) { - dom = container_of(ctrl_d, struct mpam_resctrl_dom, resctrl_ctrl_dom); - + list_for_each_entry_rcu(dom, &r->ctrl_domains, resctrl_ctrl_dom.hdr.list) { + if (comp && (dom->ctrl_comp != comp)) + continue; if (cpumask_test_cpu(cpu, &dom->ctrl_comp->affinity)) return dom; } @@ -1938,85 +2072,103 @@ mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) return NULL; /* Search the mon domain list too - needed on monitor only platforms. 
*/ - return mpam_resctrl_get_mon_domain_from_cpu(cpu); + return mpam_resctrl_get_mon_domain_from_cpu(cpu, comp); } int mpam_resctrl_online_cpu(unsigned int cpu) { - int i, err = 0; - struct mpam_resctrl_dom *dom; + struct rdt_l3_mon_domain *mon_d; + struct rdt_ctrl_domain *ctrl_d; struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + struct mpam_component *comp; + + guard(mutex)(&domain_list_lock); + for_each_mpam_resctrl_control(res, rid) { + struct mpam_resctrl_dom *dom; - mutex_lock(&domain_list_lock); - for (i = 0; i < RDT_NUM_RESOURCES; i++) { - res = &mpam_resctrl_controls[i]; if (!res->class) continue; // dummy_resource; - dom = mpam_resctrl_get_domain_from_cpu(cpu, res); - if (!dom) - dom = mpam_resctrl_alloc_domain(cpu, res); - if (IS_ERR(dom)) { - err = PTR_ERR(dom); - break; - } + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &res->class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &comp->affinity)) + continue; + + dom = mpam_resctrl_get_domain_from_cpu(cpu, res, comp); + if (!dom) { + dom = mpam_resctrl_alloc_domain(cpu, res, comp); + } else { + if (resctrl_arch_alloc_capable()) { + ctrl_d = &dom->resctrl_ctrl_dom; + mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr); + } + if (resctrl_arch_mon_capable()) { + mon_d = &dom->resctrl_mon_dom; + mpam_resctrl_online_domain_hdr(cpu, &mon_d->hdr); + } + } - cpumask_set_cpu(cpu, &dom->resctrl_ctrl_dom.hdr.cpu_mask); - cpumask_set_cpu(cpu, &dom->resctrl_mon_dom.hdr.cpu_mask); + if (IS_ERR(dom)) { + return PTR_ERR(dom); + } + } } - mutex_unlock(&domain_list_lock); - if (!err) - resctrl_online_cpu(cpu); + resctrl_online_cpu(cpu); - return err; + return 0; } -int mpam_resctrl_offline_cpu(unsigned int cpu) +void mpam_resctrl_offline_cpu(unsigned int cpu) { - int i; + struct mpam_component *comp; struct mpam_resctrl_res *res; - struct mpam_resctrl_dom *dom; - struct rdt_l3_mon_domain *mon_d; - struct rdt_ctrl_domain *ctrl_d; - bool ctrl_dom_empty, 
mon_dom_empty; + enum resctrl_res_level rid; resctrl_offline_cpu(cpu); - mutex_lock(&domain_list_lock); - for (i = 0; i < RDT_NUM_RESOURCES; i++) { - res = &mpam_resctrl_controls[i]; + guard(mutex)(&domain_list_lock); + for_each_mpam_resctrl_control(res, rid) { + struct mpam_resctrl_dom *dom; + struct rdt_l3_mon_domain *mon_d; + struct rdt_ctrl_domain *ctrl_d; + bool ctrl_dom_empty, mon_dom_empty; + if (!res->class) continue; // dummy resource - dom = mpam_resctrl_get_domain_from_cpu(cpu, res); - if (WARN_ON_ONCE(!dom)) - continue; + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &res->class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &comp->affinity)) + continue; - ctrl_dom_empty = true; - if (exposed_alloc_capable) { - mpam_reset_component_locked(dom->ctrl_comp); + dom = mpam_resctrl_get_domain_from_cpu(cpu, res, comp); + if (WARN_ON_ONCE(!dom)) + continue; - ctrl_d = &dom->resctrl_ctrl_dom; - ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); - if (ctrl_dom_empty) - resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); - } + ctrl_dom_empty = true; + if (resctrl_arch_alloc_capable()) { + ctrl_d = &dom->resctrl_ctrl_dom; + ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + if (ctrl_dom_empty) + resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); + } - mon_dom_empty = true; - if (exposed_mon_capable) { - mon_d = &dom->resctrl_mon_dom; - mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); - if (mon_dom_empty) - resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr); - } + mon_dom_empty = true; + if (resctrl_arch_mon_capable()) { + mon_d = &dom->resctrl_mon_dom; + mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); + if (mon_dom_empty) + resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr); + } - if (ctrl_dom_empty && mon_dom_empty) - kfree(dom); + if (ctrl_dom_empty && mon_dom_empty) + kfree(dom); + } } - 
mutex_unlock(&domain_list_lock); - - return 0; } int mpam_resctrl_setup(void) @@ -2030,10 +2182,9 @@ int mpam_resctrl_setup(void) wait_event(wait_cacheinfo_ready, cacheinfo_ready); cpus_read_lock(); - for (i = 0; i < RDT_NUM_RESOURCES; i++) { - res = &mpam_resctrl_controls[i]; - INIT_LIST_HEAD(&res->resctrl_res.ctrl_domains); - INIT_LIST_HEAD(&res->resctrl_res.mon_domains); + for_each_mpam_resctrl_control(res, i) { + INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains); + INIT_LIST_HEAD_RCU(&res->resctrl_res.mon_domains); res->resctrl_res.rid = i; } @@ -2042,55 +2193,49 @@ int mpam_resctrl_setup(void) mpam_resctrl_pick_mba(); /* Initialise the resctrl structures from the classes */ - for (i = 0; i < RDT_NUM_RESOURCES; i++) { - res = &mpam_resctrl_controls[i]; + for_each_mpam_resctrl_control(res, i) { if (!res->class) continue; // dummy resource err = mpam_resctrl_control_init(res, i); if (err) { pr_debug("Failed to initialise rid %u\n", i); - break; + goto internal_error; } } /* Find some classes to use for monitors */ mpam_resctrl_pick_counters(); - for (j = 0; j < QOS_NUM_EVENTS; j++) { - mon = &mpam_resctrl_counters[j]; + for_each_mpam_resctrl_mon(mon, j) { if (!mon->class) continue; // dummy resource - mpam_resctrl_monitor_init(mon, j); + err = mpam_resctrl_monitor_init(mon, j); + if (err) { + pr_debug("Failed to initialise event %u\n", j); + goto internal_error; + } } cpus_read_unlock(); - if (err || (!exposed_alloc_capable && !exposed_mon_capable)) { - if (err) - pr_debug("Internal error %d - resctrl not supported\n", err); - else - pr_debug("No alloc(%u) or monitor(%u) found - resctrl not supported\n", - exposed_alloc_capable, exposed_mon_capable); - err = -EOPNOTSUPP; + if (!resctrl_arch_alloc_capable() && !resctrl_arch_mon_capable()) { + pr_debug("No alloc(%u) or monitor(%u) found - resctrl not supported\n", + resctrl_arch_alloc_capable(), resctrl_arch_mon_capable()); + return -EOPNOTSUPP; } - if (!err) { - if (!is_power_of_2(mpam_pmg_max + 1)) { - /* - * If 
not all the partid*pmg values are valid indexes, - * resctrl may allocate pmg that don't exist. This - * should cause an error interrupt. - */ - pr_warn("Number of PMG is not a power of 2! resctrl may misbehave"); - } + err = resctrl_init(); + if (err) + return err; + WRITE_ONCE(resctrl_enabled, true); - err = resctrl_init(); - if (!err) - WRITE_ONCE(resctrl_enabled, true); - } + return 0; +internal_error: + cpus_read_unlock(); + pr_debug("Internal error %d - resctrl not supported\n", err); return err; } @@ -2132,16 +2277,14 @@ void mpam_resctrl_teardown_class(struct mpam_class *class) might_sleep(); - for (i = 0; i < RDT_NUM_RESOURCES; i++) { - res = &mpam_resctrl_controls[i]; + for_each_mpam_resctrl_control(res, i) { if (res->class == class) { mpam_resctrl_exit(); res->class = NULL; break; } } - for (i = 0; i < QOS_NUM_EVENTS; i++) { - mon = &mpam_resctrl_counters[i]; + for_each_mpam_resctrl_mon(mon, i) { if (mon->class == class) { mpam_resctrl_exit(); mon->class = NULL; diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index ee32965eedb5b..08cdc8546a8fd 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -385,9 +385,9 @@ int alloc_rmid(u32 closid); void free_rmid(u32 closid, u32 rmid); -int resctrl_l3_mon_resource_init(void); +int resctrl_mon_init(void); -void resctrl_l3_mon_resource_exit(void); +void resctrl_mon_exit(void); void mon_event_count(void *info); @@ -457,6 +457,11 @@ int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, voi ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); +int mbm_MB_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +ssize_t mbm_MB_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 7ed420cc47504..47a6651aecfb9 100644 
--- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -452,7 +452,7 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) struct mbm_state *m; u64 tval = 0; - if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) { + if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, rr->r->rid)) { rr->err = -EIO; return -EINVAL; } @@ -549,6 +549,7 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { switch (rr->r->rid) { case RDT_RESOURCE_L3: + case RDT_RESOURCE_MBA: WARN_ON_ONCE(rr->evt->any_cpu); if (rr->hdr) return __l3_mon_event_count(rdtgrp, rr); @@ -594,7 +595,7 @@ static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) struct rdt_l3_mon_domain *d; struct mbm_state *m; - if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, rr->r->rid)) return; d = container_of(rr->hdr, struct rdt_l3_mon_domain, hdr); m = get_mbm_state(d, closid, rmid, rr->evt->evtid); @@ -1004,7 +1005,7 @@ void free_rmid_lru_list(void) */ struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { MON_EVENT(QOS_L3_OCCUP_EVENT_ID, "llc_occupancy", RDT_RESOURCE_L3, false), - MON_EVENT(QOS_L3_MBM_TOTAL_EVENT_ID, "mbm_total_bytes", RDT_RESOURCE_L3, false), + MON_EVENT(QOS_L3_MBM_TOTAL_EVENT_ID, "mbm_total_bytes", RDT_RESOURCE_MBA, false), MON_EVENT(QOS_L3_MBM_LOCAL_EVENT_ID, "mbm_local_bytes", RDT_RESOURCE_L3, false), MON_EVENT(PMT_EVENT_ENERGY, "core_energy", RDT_RESOURCE_PERF_PKG, true), MON_EVENT(PMT_EVENT_ACTIVITY, "activity", RDT_RESOURCE_PERF_PKG, true), @@ -1633,9 +1634,9 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, return ret; } -int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +static int mbm_assignments_show(struct kernfs_open_file *of, struct seq_file *s, + void *v, struct rdt_resource *r) { - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); struct 
rdt_l3_mon_domain *d; struct rdtgroup *rdtgrp; struct mon_evt *mevt; @@ -1681,6 +1682,18 @@ int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, voi return ret; } +int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +{ + return mbm_assignments_show(of, s, v, + resctrl_arch_get_resource(RDT_RESOURCE_L3)); +} + +int mbm_MB_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +{ + return mbm_assignments_show(of, s, v, + resctrl_arch_get_resource(RDT_RESOURCE_MBA)); +} + /* * mbm_get_mon_event_by_name() - Return the mon_evt entry for the matching * event name. @@ -1775,10 +1788,10 @@ static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup return -EINVAL; } -ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) +static ssize_t mbm_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, + struct rdt_resource *r) { - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); struct rdtgroup *rdtgrp; char *token, *event; int ret = 0; @@ -1820,6 +1833,20 @@ ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + return mbm_assignments_write(of, buf, nbytes, off, + resctrl_arch_get_resource(RDT_RESOURCE_L3)); +} + +ssize_t mbm_MB_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + return mbm_assignments_write(of, buf, nbytes, off, + resctrl_arch_get_resource(RDT_RESOURCE_MBA)); +} + static int closid_num_dirty_rmid_alloc(struct rdt_resource *r) { if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { @@ -1860,7 +1887,7 @@ static void closid_num_dirty_rmid_free(void) } /** - * resctrl_l3_mon_resource_init() - Initialise global monitoring structures. 
+ * resctrl_mon_resource_init() - Initialise global monitoring structures. * * Allocate and initialise global monitor resources that do not belong to a * specific domain. i.e. the closid_num_dirty_rmid[] used to find the CLOSID @@ -1872,27 +1899,21 @@ static void closid_num_dirty_rmid_free(void) * * Return: 0 for success, or -ENOMEM. */ -int resctrl_l3_mon_resource_init(void) +static void resctrl_mon_resource_init(struct rdt_resource *r) { - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - int ret; + unsigned long fflags; - if (!r->mon_capable) - return 0; - - ret = closid_num_dirty_rmid_alloc(r); - if (ret) - return ret; + fflags = (r->rid == RDT_RESOURCE_MBA) ? RFTYPE_RES_MB :RFTYPE_RES_CACHE; if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_total_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + RFTYPE_MON_INFO | fflags); } if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_local_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + RFTYPE_MON_INFO | fflags); } if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) @@ -1910,19 +1931,43 @@ int resctrl_l3_mon_resource_init(void) NON_TEMP_WRITE_TO_LOCAL_MEM); r->mon.mbm_assign_on_mkdir = true; resctrl_file_fflags_init("num_mbm_cntrs", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + RFTYPE_MON_INFO | fflags); resctrl_file_fflags_init("available_mbm_cntrs", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + RFTYPE_MON_INFO | fflags); resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO | - RFTYPE_RES_CACHE); - resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE); + fflags); + if (r->rid == RDT_RESOURCE_MBA) + resctrl_file_fflags_init("mbm_MB_assignments", RFTYPE_MON_BASE); + else + 
resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE); + resctrl_file_fflags_init("mbm_assign_mode", RFTYPE_MON_INFO | + fflags); } +} + +int resctrl_mon_init(void) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + int ret; + + if (!r->mon_capable) + return 0; + + ret = closid_num_dirty_rmid_alloc(r); + if (ret) + return ret; + + resctrl_mon_resource_init(r); + + r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + if (r) + resctrl_mon_resource_init(r); return 0; } -void resctrl_l3_mon_resource_exit(void) +void resctrl_mon_exit(void) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index a1721a672ee89..c2aed590ad897 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2133,6 +2133,13 @@ static struct rftype res_common_files[] = { .seq_show = mbm_L3_assignments_show, .write = mbm_L3_assignments_write, }, + { + .name = "mbm_MB_assignments", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_MB_assignments_show, + .write = mbm_MB_assignments_write, + }, { .name = "mbm_assign_mode", .mode = 0644, @@ -4514,10 +4521,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h if (resctrl_mounted && resctrl_arch_mon_capable()) rmdir_mondata_subdir_allrdtgrp(r, hdr); - if (r->rid != RDT_RESOURCE_L3) - goto out_unlock; - - if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, r->rid)) goto out_unlock; d = container_of(hdr, struct rdt_l3_mon_domain, hdr); @@ -4623,10 +4627,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr mutex_lock(&rdtgroup_mutex); - if (r->rid != RDT_RESOURCE_L3) - goto mkdir; - - if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, r->rid)) goto out_unlock; d = container_of(hdr, struct rdt_l3_mon_domain, 
hdr); @@ -4643,7 +4644,6 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); -mkdir: err = 0; /* * If the filesystem is not mounted then only the default resource group @@ -4747,13 +4747,13 @@ int resctrl_init(void) thread_throttle_mode_init(); - ret = resctrl_l3_mon_resource_init(); + ret = resctrl_mon_init(); if (ret) return ret; ret = sysfs_create_mount_point(fs_kobj, "resctrl"); if (ret) { - resctrl_l3_mon_resource_exit(); + resctrl_mon_exit(); return ret; } @@ -4788,7 +4788,7 @@ int resctrl_init(void) cleanup_mountpoint: sysfs_remove_mount_point(fs_kobj, "resctrl"); - resctrl_l3_mon_resource_exit(); + resctrl_mon_exit(); return ret; } @@ -4851,6 +4851,6 @@ void resctrl_exit(void) * it can be used to umount resctrl. */ - resctrl_l3_mon_resource_exit(); + resctrl_mon_exit(); free_rmid_lru_list(); } From a01fa4a6e5a08c2053d7b235dd7c9be53fe5f4d7 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Wed, 4 Mar 2026 11:53:32 -0600 Subject: [PATCH 47/49] NVIDIA: SAUCE: resctrl/mpam: reset RIS by applying explicit default config Reset an RIS by building a default mpam_config and applying it via mpam_reprogram_ris_partid(), like any other config. - mpam_init_reset_cfg(): set features and default values only for controls supported by the RIS (cpor_part, mbw_part, mbw_max, mbw_prop, cmax_cmax, cmax_cmin). Use full masks for CPBM/MBW_PBM and MPAMCFG_* defaults for MBW_MAX, CMAX, CMIN. - mpam_reprogram_ris_partid(): apply cfg for all supported controls (no separate reset path). Signed-off-by: Shanker Donthineni (backported from c076b208842db87ed50b1c63cff302975a9c8f67 dev/dev-main-nvidia-pset-linux-6.19.6) [fenghuay: Fix porting conflicts and compilaton errors. 
Remove this sentence in the commit message to avoid confusion because MBW_PROP feature is not supported on Vera/Grace: "Include mpam_feat_mbw_prop when supported so MBW_PROP is written to 0 on reset."] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 128 ++++++++++++++------------------ drivers/resctrl/mpam_internal.h | 3 - drivers/resctrl/mpam_resctrl.c | 18 +---- 3 files changed, 59 insertions(+), 90 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 3f3f31a200a09..20dc9a0fb2142 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1617,34 +1617,6 @@ void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx) } } -static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) -{ - u32 num_words, msb; - u32 bm = ~0; - int i; - - lockdep_assert_held(&msc->part_sel_lock); - - if (wd == 0) - return; - - /* - * Write all ~0 to all but the last 32bit-word, which may - * have fewer bits... - */ - num_words = DIV_ROUND_UP(wd, 32); - for (i = 0; i < num_words - 1; i++, reg += sizeof(bm)) - __mpam_write_reg(msc, reg, bm); - - /* - * ....and then the last (maybe) partial 32bit word. When wd is a - * multiple of 32, msb should be 31 to write a full 32bit word. 
- */ - msb = (wd - 1) % 32; - bm = GENMASK(msb, 0); - __mpam_write_reg(msc, reg, bm); -} - static void mpam_apply_t241_erratum(struct mpam_msc_ris *ris, u16 partid) { int sidx, i, lcount = 1000; @@ -1719,7 +1691,6 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, struct mpam_config *cfg) { u32 pri_val = 0; - u16 cmax = MPAMCFG_CMAX_CMAX; struct mpam_msc *msc = ris->vmsc->msc; struct mpam_props *rprops = &ris->props; u16 dspri = GENMASK(rprops->dspri_wd, 0); @@ -1741,22 +1712,12 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, } if (mpam_has_feature(mpam_feat_cpor_part, rprops) && - mpam_has_feature(mpam_feat_cpor_part, cfg)) { - if (cfg->reset_cpbm) - mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, - rprops->cpbm_wd); - else - mpam_write_partsel_reg(msc, CPBM, cfg->cpbm); - } + mpam_has_feature(mpam_feat_cpor_part, cfg)) + mpam_write_partsel_reg(msc, CPBM, cfg->cpbm); if (mpam_has_feature(mpam_feat_mbw_part, rprops) && - mpam_has_feature(mpam_feat_mbw_part, cfg)) { - if (cfg->reset_mbw_pbm) - mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, - rprops->mbw_pbm_bits); - else - mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm); - } + mpam_has_feature(mpam_feat_mbw_part, cfg)) + mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm); if (mpam_has_feature(mpam_feat_mbw_min, rprops)) { u16 val = 0; @@ -1779,25 +1740,18 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_has_feature(mpam_feat_mbw_prop, cfg)) mpam_write_partsel_reg(msc, MBW_PROP, 0); - if (mpam_has_feature(mpam_feat_cmax_cmax, rprops)) { - if (mpam_has_feature(mpam_feat_cmax_cmax, cfg)) { - u32 cmax_val = cfg->cmax; + if (mpam_has_feature(mpam_feat_cmax_cmax, rprops) && + mpam_has_feature(mpam_feat_cmax_cmax, cfg)) { + u32 cmax = cfg->cmax; - if (cfg->cmax_softlim) - cmax_val |= MPAMCFG_CMAX_SOFTLIM; - mpam_write_partsel_reg(msc, CMAX, cmax_val); - } else { - mpam_write_partsel_reg(msc, CMAX, cmax); - } + if (cfg->cmax_softlim) + 
cmax |= MPAMCFG_CMAX_SOFTLIM; + mpam_write_partsel_reg(msc, CMAX, cmax); } - if (mpam_has_feature(mpam_feat_cmax_cmin, rprops)) { - if (mpam_has_feature(mpam_feat_cmax_cmin, cfg)) { - mpam_write_partsel_reg(msc, CMIN, cfg->cmin); - } else { - mpam_write_partsel_reg(msc, CMIN, 0); - } - } + if (mpam_has_feature(mpam_feat_cmax_cmin, rprops) && + mpam_has_feature(mpam_feat_cmax_cmin, cfg)) + mpam_write_partsel_reg(msc, CMIN, cfg->cmin); if (mpam_has_feature(mpam_feat_cmax_cassoc, rprops)) mpam_write_partsel_reg(msc, CASSOC, MPAMCFG_CASSOC_CASSOC); @@ -1920,17 +1874,32 @@ static int mpam_save_mbwu_state(void *arg) return 0; } -static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) +static void mpam_init_reset_cfg(struct mpam_config *reset_cfg, + const struct mpam_props *props) { - *reset_cfg = (struct mpam_config) { - .cpbm = ~0, - .mbw_pbm = ~0, - .mbw_max = MPAMCFG_MBW_MAX_MAX, + memset(reset_cfg, 0, sizeof(*reset_cfg)); - .reset_cpbm = true, - .reset_mbw_pbm = true, - }; - bitmap_fill(reset_cfg->features, MPAM_FEATURE_LAST); + /* Set features and explicit default values for controls supported by this RIS. 
*/ + if (mpam_has_feature(mpam_feat_cpor_part, props)) { + mpam_set_feature(mpam_feat_cpor_part, reset_cfg); + reset_cfg->cpbm = GENMASK(props->cpbm_wd - 1, 0); + } + if (mpam_has_feature(mpam_feat_mbw_part, props)) { + mpam_set_feature(mpam_feat_mbw_part, reset_cfg); + reset_cfg->mbw_pbm = GENMASK(props->mbw_pbm_bits - 1, 0); + } + if (mpam_has_feature(mpam_feat_mbw_max, props)) { + mpam_set_feature(mpam_feat_mbw_max, reset_cfg); + reset_cfg->mbw_max = MPAMCFG_MBW_MAX_MAX; + } + if (mpam_has_feature(mpam_feat_cmax_cmax, props)) { + mpam_set_feature(mpam_feat_cmax_cmax, reset_cfg); + reset_cfg->cmax = MPAMCFG_CMAX_CMAX; + } + if (mpam_has_feature(mpam_feat_cmax_cmin, props)) { + mpam_set_feature(mpam_feat_cmax_cmin, reset_cfg); + reset_cfg->cmin = 0; + } } /* @@ -1946,7 +1915,7 @@ static int mpam_reset_ris(void *arg) if (ris->in_reset_state) return 0; - mpam_init_reset_cfg(&reset_cfg); + mpam_init_reset_cfg(&reset_cfg, &ris->props); reprogram_arg.ris = ris; reprogram_arg.cfg = &reset_cfg; @@ -2988,14 +2957,31 @@ static void __destroy_component_cfg(struct mpam_component *comp) static void mpam_reset_component_cfg(struct mpam_component *comp) { int i; + struct mpam_props *cprops = &comp->class->props; mpam_assert_partid_sizes_fixed(); if (!comp->cfg) return; - for (i = 0; i < mpam_partid_max + 1; i++) - mpam_init_reset_cfg(&comp->cfg[i]); + for (i = 0; i < mpam_partid_max + 1; i++) { + if (cprops->cpbm_wd) { + comp->cfg[i].cpbm = GENMASK(cprops->cpbm_wd - 1, 0); + mpam_set_feature(mpam_feat_cpor_part, &comp->cfg[i]); + } + if (cprops->mbw_pbm_bits) { + comp->cfg[i].mbw_pbm = GENMASK(cprops->mbw_pbm_bits - 1, 0); + mpam_set_feature(mpam_feat_mbw_part, &comp->cfg[i]); + } + if (cprops->bwa_wd) { + comp->cfg[i].mbw_max = MPAMCFG_MBW_MAX_MAX; + mpam_set_feature(mpam_feat_mbw_max, &comp->cfg[i]); + } + if (cprops->cmax_wd) { + comp->cfg[i].cmax = MPAMCFG_CMAX_CMAX; + mpam_set_feature(mpam_feat_cmax_cmax, &comp->cfg[i]); + } + } } static int __allocate_component_cfg(struct 
mpam_component *comp) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 8a18a0c377b33..0206fa67be21e 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -363,9 +363,6 @@ struct mpam_config { bool cmax_softlim; - bool reset_cpbm; - bool reset_mbw_pbm; - struct mpam_garbage garbage; }; diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index a89f4765051d9..0accede8cc09c 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -890,14 +890,7 @@ static u32 percent_to_mbw_pbm(u8 pc, struct mpam_props *cprops) */ static u32 fract16_to_percent(u16 fract, u8 wd) { - u32 val = fract; - - val >>= 16 - wd; - val += 1; - val *= MAX_MBA_BW; - val = DIV_ROUND_CLOSEST(val, 1 << wd); - - return val; + return DIV_ROUND_CLOSEST((fract + 1) * 100, 65536); } /* @@ -912,14 +905,7 @@ static u32 fract16_to_percent(u16 fract, u8 wd) */ static u16 percent_to_fract16(u8 pc, u8 wd) { - u32 val = pc; - - val <<= wd; - val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW); - val = max(val, 1) - 1; - val <<= 16 - wd; - - return val; + return pc ? (((pc * 65536) / 100) - 1) : 0; } static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) From 10ede13d6ba209f0599d1c53410a580ae3d39500 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Tue, 6 May 2025 21:02:21 -0500 Subject: [PATCH 48/49] NVIDIA: SAUCE: iommu/arm-smmu-v3: Fix MPAM for indentity-mappings There is no struct arm_smmu_domain context for domains configured with identity mappings. Use the device to obtain the necessary information to program PARTID and PMGID. 
Signed-off-by: Shanker Donthineni (backported from e5020b38475ef58c5bb3d1a92028d4e0dd7aff4d dev/dev-main-nvidia-pset-linux-6.19.6) [fenghuay: Koba Ko fixes a typo in iommu_group_get_qos_params(): s/!ops->set_group_qos_params/!ops->get_group_qos_params/] Signed-off-by: Fenghua Yu --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 73 +++++++-------------- drivers/iommu/iommu.c | 52 ++++++--------- include/linux/iommu.h | 4 +- 3 files changed, 45 insertions(+), 84 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 394c69e5ed838..39e8ac236b90a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3733,94 +3733,67 @@ static int arm_smmu_def_domain_type(struct device *dev) return 0; } -static int arm_smmu_group_set_mpam(struct iommu_group *group, u16 partid, +static int arm_smmu_group_set_mpam(struct device *dev, u16 partid, u8 pmg) { int i; u32 sid; - unsigned long flags; struct arm_smmu_ste *step; - struct iommu_domain *domain; struct arm_smmu_device *smmu; struct arm_smmu_master *master; struct arm_smmu_cmdq_batch cmds; - struct arm_smmu_domain *smmu_domain; struct arm_smmu_cmdq_ent cmd = { .opcode = CMDQ_OP_CFGI_STE, .cfgi = { .leaf = true, }, }; - struct arm_smmu_master_domain *master_domain; - domain = iommu_get_domain_for_group(group); - smmu_domain = to_smmu_domain(domain); - if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_MPAM)) + master = dev_iommu_priv_get(dev); + if (!(master->smmu->features & ARM_SMMU_FEAT_MPAM)) return -EIO; - smmu = smmu_domain->smmu; + smmu = master->smmu; arm_smmu_cmdq_batch_init(smmu, &cmds, &cmd); - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master_domain, &smmu_domain->devices, - devices_elm) { - master = master_domain->master; - - for (i = 0; i < master->num_streams; i++) { - sid = master->streams[i].id; - step = arm_smmu_get_step_for_sid(smmu, sid); - - /* These need 
locking if the VMSPtr is ever used */ - step->data[4] = FIELD_PREP(STRTAB_STE_4_PARTID, partid); - step->data[5] = FIELD_PREP(STRTAB_STE_5_PMG, pmg); + for (i = 0; i < master->num_streams; i++) { + sid = master->streams[i].id; + step = arm_smmu_get_step_for_sid(smmu, sid); - cmd.cfgi.sid = sid; - arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); - } + /* These need locking if the VMSPtr is ever used */ + step->data[4] = FIELD_PREP(STRTAB_STE_4_PARTID, partid); + step->data[5] = FIELD_PREP(STRTAB_STE_5_PMG, pmg); - master->partid = partid; - master->pmg = pmg; + cmd.cfgi.sid = sid; + arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); } - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + + master->partid = partid; + master->pmg = pmg; arm_smmu_cmdq_batch_submit(smmu, &cmds); return 0; } -static int arm_smmu_group_get_mpam(struct iommu_group *group, u16 *partid, +static int arm_smmu_group_get_mpam(struct device *dev, u16 *partid, u8 *pmg) { - int err = -EINVAL; - unsigned long flags; - struct iommu_domain *domain; struct arm_smmu_master *master; - struct arm_smmu_domain *smmu_domain; - struct arm_smmu_master_domain *master_domain; - domain = iommu_get_domain_for_group(group); - smmu_domain = to_smmu_domain(domain); - if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_MPAM)) + master = dev_iommu_priv_get(dev); + if (!(master->smmu->features & ARM_SMMU_FEAT_MPAM)) return -EIO; if (!partid && !pmg) return 0; - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master_domain, &smmu_domain->devices, - devices_elm) { - master = master_domain->master; - if (master) { - if (partid) - *partid = master->partid; - if (pmg) - *pmg = master->pmg; - err = 0; - } - } - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + if (partid) + *partid = master->partid; + if (pmg) + *pmg = master->pmg; - return err; + return 0; } static const struct iommu_ops arm_smmu_ops = { diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 
db770b73e3a8f..4c243f6000c3a 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3914,25 +3914,19 @@ int iommu_group_set_qos_params(struct iommu_group *group, { const struct iommu_ops *ops; struct group_device *device; - int ret; + int ret = -ENODEV; mutex_lock(&group->mutex); - device = list_first_entry_or_null(&group->devices, typeof(*device), - list); - if (!device) { - ret = -ENODEV; - goto out_unlock; - } - - ops = dev_iommu_ops(device->dev); - if (!ops->set_group_qos_params) { - ret = -EOPNOTSUPP; - goto out_unlock; + for_each_group_device(group, device) { + ops = dev_iommu_ops(device->dev); + if (!ops->set_group_qos_params) { + ret = -EOPNOTSUPP; + break; + } + ret = ops->set_group_qos_params(device->dev, partition, perf_mon_grp); + if (ret < 0) + break; } - - ret = ops->set_group_qos_params(group, partition, perf_mon_grp); - -out_unlock: mutex_unlock(&group->mutex); return ret; @@ -3952,25 +3946,19 @@ int iommu_group_get_qos_params(struct iommu_group *group, { const struct iommu_ops *ops; struct group_device *device; - int ret; + int ret = -ENODEV; mutex_lock(&group->mutex); - device = list_first_entry_or_null(&group->devices, typeof(*device), - list); - if (!device) { - ret = -ENODEV; - goto out_unlock; - } - - ops = dev_iommu_ops(device->dev); - if (!ops->get_group_qos_params) { - ret = -EOPNOTSUPP; - goto out_unlock; + for_each_group_device(group, device) { + ops = dev_iommu_ops(device->dev); + if (!ops->get_group_qos_params) { + ret = -EOPNOTSUPP; + break; + } + ret = ops->get_group_qos_params(device->dev, partition, perf_mon_grp); + if (!ret) + break; } - - ret = ops->get_group_qos_params(group, partition, perf_mon_grp); - -out_unlock: mutex_unlock(&group->mutex); return ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b74228f9f1ce0..a22521af2d242 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -704,9 +704,9 @@ struct iommu_ops { const struct iommu_user_data *user_data); /* Per group IOMMU features 
*/ - int (*get_group_qos_params)(struct iommu_group *group, u16 *partition, + int (*get_group_qos_params)(struct device *dev, u16 *partition, u8 *perf_mon_grp); - int (*set_group_qos_params)(struct iommu_group *group, u16 partition, + int (*set_group_qos_params)(struct device *dev, u16 partition, u8 perf_mon_grp); const struct iommu_domain_ops *default_domain_ops; From ecd11fd63137254963ef98113e2a74834531e573 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 11 Mar 2026 23:05:38 +0000 Subject: [PATCH 49/49] NVIDIA: VR: SAUCE: arm_mpam: Resolve MBWU type before feature check in mpam_msmon_read Resolve mpam_feat_msmon_mbwu to the concrete counter type (31/44/63) before mpam_has_feature() and before filling the mon_read arg. This avoids -EOPNOTSUPP when only a specific MBWU feature is set, and ensures _msmon_read() gets the resolved type in arg.type. Fixes: 5b910050e654 ("NVIDIA: SAUCE: arm_mpam: Use long MBWU counters if supported") Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 20dc9a0fb2142..7ee8925aec9ef 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1552,6 +1552,9 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, if (!mpam_has_feature(type, cprops)) return -EOPNOTSUPP; + if (type == mpam_feat_msmon_mbwu) + type = mpam_msmon_choose_counter(class); + arg = (struct mon_read) { .ctx = ctx, .type = type, @@ -1559,9 +1562,6 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, }; *val = 0; - if (type == mpam_feat_msmon_mbwu) - type = mpam_msmon_choose_counter(class); - err = _msmon_read(comp, &arg); if (err == -EBUSY && class->nrdy_usec) wait_jiffies = usecs_to_jiffies(class->nrdy_usec);