diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 44a70e1ab59a2..6cd934ae385d6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6174,8 +6174,14 @@ Turn on/off individual RDT features. List is: cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp, mba, smba, bmec, abmc. + mba, smba, bmec, abmc, sdciae, energy[:guid], + perf[:guid]. E.g. to turn on cmt and turn off mba use: rdt=cmt,!mba + To turn off all energy telemetry monitoring and ensure that + perf telemetry monitoring associated with guid 0x12345 + is enabled use: + rdt=!energy,perf:0x12345 reboot= [KNL] Format (x86 or x86_64): diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index b9f6aa44fc4d7..c38a14331cd79 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -167,13 +167,12 @@ with respect to allocation: bandwidth percentages are directly applied to the threads running on the core -If RDT monitoring is available there will be an "L3_MON" directory +If L3 monitoring is available there will be an "L3_MON" directory with the following files: "num_rmids": - The number of RMIDs available. This is the - upper bound for how many "CTRL_MON" + "MON" - groups can be created. + The number of RMIDs supported by hardware for + L3 monitoring events. "mon_features": Lists the monitoring events if @@ -399,6 +398,24 @@ with the following files: bytes) at which a previously used LLC_occupancy counter can be considered for re-use. +If telemetry monitoring is available there will be a "PERF_PKG_MON" directory +with the following files: + +"num_rmids": + The number of RMIDs for telemetry monitoring events. + + On Intel resctrl will not enable telemetry events if the number of + RMIDs that can be tracked concurrently is lower than the total number + of RMIDs supported. 
Telemetry events can be force-enabled with the + "rdt=" kernel parameter, but this may reduce the number of + monitoring groups that can be created. + +"mon_features": + Lists the telemetry monitoring events that are enabled on this system. + +The upper bound for how many "CTRL_MON" + "MON" groups can be created +is the smaller of the L3_MON and PERF_PKG_MON "num_rmids" values. + Finally, in the top level of the "info" directory there is a file named "last_cmd_status". This is reset with every "command" issued via the file system (making new directories or writing to any of the @@ -504,15 +521,40 @@ When control is enabled all CTRL_MON groups will also contain: When monitoring is enabled all MON groups will also contain: "mon_data": - This contains a set of files organized by L3 domain and by - RDT event. E.g. on a system with two L3 domains there will - be subdirectories "mon_L3_00" and "mon_L3_01". Each of these - directories have one file per event (e.g. "llc_occupancy", - "mbm_total_bytes", and "mbm_local_bytes"). In a MON group these - files provide a read out of the current value of the event for - all tasks in the group. In CTRL_MON groups these files provide - the sum for all tasks in the CTRL_MON group and all tasks in + This contains directories for each monitor domain. + + If L3 monitoring is enabled, there will be a "mon_L3_XX" directory for + each instance of an L3 cache. Each directory contains files for the enabled + L3 events (e.g. "llc_occupancy", "mbm_total_bytes", and "mbm_local_bytes"). + + If telemetry monitoring is enabled, there will be a "mon_PERF_PKG_YY" + directory for each physical processor package. Each directory contains + files for the enabled telemetry events (e.g. "core_energy", "activity", + "uops_retired", etc.). + + The info/`*`/mon_features files provide the full list of enabled + event/file names. 
+ + "core_energy" reports a floating point number for the energy (in Joules) + consumed by cores (registers, arithmetic units, TLB and L1/L2 caches) + during execution of instructions summed across all logical CPUs on a + package for the current monitoring group. + + "activity" also reports a floating point value (in Farads). This provides + an estimate of work done independent of the frequency that the CPUs used + for execution. + + Note that "core_energy" and "activity" only measure energy/activity in the + "core" of the CPU (arithmetic units, TLB, L1 and L2 caches, etc.). They + do not include L3 cache, memory, I/O devices etc. + + All other events report decimal integer values. + + In a MON group these files provide a read out of the current value of + the event for all tasks in the group. In CTRL_MON groups these files + provide the sum for all tasks in the CTRL_MON group and all tasks in MON groups. Please see example section for more details on usage. + On systems with Sub-NUMA Cluster (SNC) enabled there are extra directories for each node (located within the "mon_L3_XX" directory for the L3 cache they occupy). These are named "mon_sub_L3_YY" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5902dde9f4477..61d86219d4719 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -525,6 +525,19 @@ config X86_CPU_RESCTRL Say N if unsure. +config X86_CPU_RESCTRL_INTEL_AET + bool "Intel Application Energy Telemetry" + depends on X86_64 && X86_CPU_RESCTRL && CPU_SUP_INTEL && INTEL_PMT_TELEMETRY=y && INTEL_TPMI=y + help + Enable per-RMID telemetry events in resctrl. + + Intel feature that collects per-RMID execution data + about energy consumption, a measure of frequency-independent + activity and other performance metrics. Data is aggregated + per package. + + Say N if unsure. 
+ config X86_FRED bool "Flexible Return and Event Delivery" depends on X86_64 diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 279aba8e97bf5..40a74a0617345 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -198,15 +198,6 @@ static inline bool resctrl_arch_mon_can_overflow(void) void resctrl_cpu_detect(struct cpuinfo_x86 *c); -static inline bool resctrl_arch_get_mb_uses_numa_nid(void) -{ - return false; -} - -static inline int resctrl_arch_set_mb_uses_numa_nid(bool enabled) -{ - return -EOPNOTSUPP; -} #else static inline void resctrl_arch_sched_in(struct task_struct *tsk) {} diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile index d8a04b195da21..273ddfa308366 100644 --- a/arch/x86/kernel/cpu/resctrl/Makefile +++ b/arch/x86/kernel/cpu/resctrl/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o +obj-$(CONFIG_X86_CPU_RESCTRL_INTEL_AET) += intel_aet.o obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o # To allow define_trace.h's recursive include: diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 42fcc9d7ff7a2..ed1d4c6e50f37 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -98,14 +98,33 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA), }, }, + [RDT_RESOURCE_PERF_PKG] = + { + .r_resctrl = { + .name = "PERF_PKG", + .mon_scope = RESCTRL_PACKAGE, + .mon_domains = mon_domain_init(RDT_RESOURCE_PERF_PKG), + }, + }, }; +/** + * resctrl_arch_system_num_rmid_idx - Compute number of supported RMIDs + * (minimum across all mon_capable resource) + * + * Return: Number of supported RMIDs at time of call. Note that mount time + * enumeration of resources may reduce the number. 
+ */ u32 resctrl_arch_system_num_rmid_idx(void) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + u32 num_rmids = U32_MAX; + struct rdt_resource *r; + + for_each_mon_capable_rdt_resource(r) + num_rmids = min(num_rmids, r->mon.num_rmid); /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ - return r->mon.num_rmid; + return num_rmids == U32_MAX ? 0 : num_rmids; } struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) @@ -363,7 +382,7 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) kfree(hw_dom); } -static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom) +static void l3_mon_domain_free(struct rdt_hw_l3_mon_domain *hw_dom) { int idx; @@ -396,11 +415,13 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain * } /** - * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters + * l3_mon_domain_mbm_alloc() - Allocate arch private storage for the MBM counters * @num_rmid: The size of the MBM counter array * @hw_dom: The domain that owns the allocated arrays + * + * Return: 0 for success, or -ENOMEM. 
*/ -static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) +static int l3_mon_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_l3_mon_domain *hw_dom) { size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]); enum resctrl_event_id eventid; @@ -433,6 +454,8 @@ static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) return get_cpu_cacheinfo_id(cpu, scope); case RESCTRL_L3_NODE: return cpu_to_node(cpu); + case RESCTRL_PACKAGE: + return topology_physical_package_id(cpu); default: break; } @@ -459,7 +482,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos); if (hdr) { - if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) + if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid)) return; d = container_of(hdr, struct rdt_ctrl_domain, hdr); @@ -476,6 +499,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; d->hdr.type = RESCTRL_CTRL_DOMAIN; + d->hdr.rid = r->rid; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); rdt_domain_reconfigure_cdp(r); @@ -495,37 +519,13 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) } } -static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) +static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct list_head *add_pos) { - int id = get_domain_id_from_scope(cpu, r->mon_scope); - struct list_head *add_pos = NULL; - struct rdt_hw_mon_domain *hw_dom; - struct rdt_domain_hdr *hdr; - struct rdt_mon_domain *d; + struct rdt_hw_l3_mon_domain *hw_dom; + struct rdt_l3_mon_domain *d; struct cacheinfo *ci; int err; - lockdep_assert_held(&domain_list_lock); - - if (id < 0) { - pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n", - cpu, r->mon_scope, r->name); - return; - } - - hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos); - if (hdr) { - if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) - return; - d = 
container_of(hdr, struct rdt_mon_domain, hdr); - - cpumask_set_cpu(cpu, &d->hdr.cpu_mask); - /* Update the mbm_assign_mode state for the CPU if supported */ - if (r->mon.mbm_cntr_assignable) - resctrl_arch_mbm_cntr_assign_set_one(r); - return; - } - hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu)); if (!hw_dom) return; @@ -533,33 +533,66 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; d->hdr.type = RESCTRL_MON_DOMAIN; + d->hdr.rid = RDT_RESOURCE_L3; ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); if (!ci) { pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name); - mon_domain_free(hw_dom); + l3_mon_domain_free(hw_dom); return; } d->ci_id = ci->id; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); - /* Update the mbm_assign_mode state for the CPU if supported */ - if (r->mon.mbm_cntr_assignable) - resctrl_arch_mbm_cntr_assign_set_one(r); - arch_mon_domain_online(r, d); - if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { - mon_domain_free(hw_dom); + if (l3_mon_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { + l3_mon_domain_free(hw_dom); return; } list_add_tail_rcu(&d->hdr.list, add_pos); - err = resctrl_online_mon_domain(r, d); + err = resctrl_online_mon_domain(r, &d->hdr); if (err) { list_del_rcu(&d->hdr.list); synchronize_rcu(); - mon_domain_free(hw_dom); + l3_mon_domain_free(hw_dom); + } +} + +static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) +{ + int id = get_domain_id_from_scope(cpu, r->mon_scope); + struct list_head *add_pos = NULL; + struct rdt_domain_hdr *hdr; + + lockdep_assert_held(&domain_list_lock); + + if (id < 0) { + pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->mon_scope, r->name); + return; + } + + hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos); + if (hdr) + cpumask_set_cpu(cpu, &hdr->cpu_mask); + + switch (r->rid) { + case RDT_RESOURCE_L3: + /* Update the mbm_assign_mode state for the CPU 
if supported */ + if (r->mon.mbm_cntr_assignable) + resctrl_arch_mbm_cntr_assign_set_one(r); + if (!hdr) + l3_mon_domain_setup(cpu, id, r, add_pos); + break; + case RDT_RESOURCE_PERF_PKG: + if (!hdr) + intel_aet_mon_domain_setup(cpu, id, r, add_pos); + break; + default: + pr_warn_once("Unknown resource rid=%d\n", r->rid); + break; } } @@ -593,36 +626,33 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r) return; } - if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) + cpumask_clear_cpu(cpu, &hdr->cpu_mask); + if (!cpumask_empty(&hdr->cpu_mask)) + return; + + if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid)) return; d = container_of(hdr, struct rdt_ctrl_domain, hdr); hw_dom = resctrl_to_arch_ctrl_dom(d); - cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); - if (cpumask_empty(&d->hdr.cpu_mask)) { - resctrl_offline_ctrl_domain(r, d); - list_del_rcu(&d->hdr.list); - synchronize_rcu(); - - /* - * rdt_ctrl_domain "d" is going to be freed below, so clear - * its pointer from pseudo_lock_region struct. - */ - if (d->plr) - d->plr->d = NULL; - ctrl_domain_free(hw_dom); + resctrl_offline_ctrl_domain(r, d); + list_del_rcu(&hdr->list); + synchronize_rcu(); - return; - } + /* + * rdt_ctrl_domain "d" is going to be freed below, so clear + * its pointer from pseudo_lock_region struct. 
+ */ + if (d->plr) + d->plr->d = NULL; + ctrl_domain_free(hw_dom); } static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) { int id = get_domain_id_from_scope(cpu, r->mon_scope); - struct rdt_hw_mon_domain *hw_dom; struct rdt_domain_hdr *hdr; - struct rdt_mon_domain *d; lockdep_assert_held(&domain_list_lock); @@ -639,20 +669,42 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) return; } - if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) + cpumask_clear_cpu(cpu, &hdr->cpu_mask); + if (!cpumask_empty(&hdr->cpu_mask)) return; - d = container_of(hdr, struct rdt_mon_domain, hdr); - hw_dom = resctrl_to_arch_mon_dom(d); + switch (r->rid) { + case RDT_RESOURCE_L3: { + struct rdt_hw_l3_mon_domain *hw_dom; + struct rdt_l3_mon_domain *d; - cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); - if (cpumask_empty(&d->hdr.cpu_mask)) { - resctrl_offline_mon_domain(r, d); - list_del_rcu(&d->hdr.list); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return; + + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); + hw_dom = resctrl_to_arch_mon_dom(d); + resctrl_offline_mon_domain(r, hdr); + list_del_rcu(&hdr->list); synchronize_rcu(); - mon_domain_free(hw_dom); + l3_mon_domain_free(hw_dom); + break; + } + case RDT_RESOURCE_PERF_PKG: { + struct rdt_perf_pkg_mon_domain *pkgd; - return; + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_PERF_PKG)) + return; + + pkgd = container_of(hdr, struct rdt_perf_pkg_mon_domain, hdr); + resctrl_offline_mon_domain(r, hdr); + list_del_rcu(&hdr->list); + synchronize_rcu(); + kfree(pkgd); + break; + } + default: + pr_warn_once("Unknown resource rid=%d\n", r->rid); + break; } } @@ -707,6 +759,28 @@ static int resctrl_arch_offline_cpu(unsigned int cpu) return 0; } +void resctrl_arch_pre_mount(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl; + int cpu; + + if (!intel_aet_get_events()) + return; + + /* + * Late discovery of telemetry events means 
the domains for the + * resource were not built. Do that now. + */ + cpus_read_lock(); + mutex_lock(&domain_list_lock); + r->mon_capable = true; + rdt_mon_capable = true; + for_each_online_cpu(cpu) + domain_add_cpu_mon(cpu, r); + mutex_unlock(&domain_list_lock); + cpus_read_unlock(); +} + enum { RDT_FLAG_CMT, RDT_FLAG_MBM_TOTAL, @@ -760,6 +834,8 @@ static int __init set_rdt_options(char *str) force_off = *tok == '!'; if (force_off) tok++; + if (intel_handle_aet_option(force_off, tok)) + continue; for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { if (strcmp(tok, o->name) == 0) { if (force_off) @@ -879,15 +955,15 @@ static __init bool get_rdt_mon_resources(void) bool ret = false; if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { - resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID); + resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false, 0, NULL); ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { - resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0, NULL); ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { - resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false, 0, NULL); ret = true; } if (rdt_cpu_has(X86_FEATURE_ABMC)) @@ -896,7 +972,7 @@ static __init bool get_rdt_mon_resources(void) if (!ret) return false; - return !rdt_get_mon_l3_config(r); + return !rdt_get_l3_mon_config(r); } static __init void __check_quirks_intel(void) @@ -1062,6 +1138,8 @@ late_initcall(resctrl_arch_late_init); static void __exit resctrl_arch_exit(void) { + intel_aet_exit(); + cpuhp_remove_state(rdt_online); resctrl_exit(); diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c new file mode 100644 index 0000000000000..89b8b619d5d53 --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -0,0 +1,409 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resource Director Technology(RDT) + * - Intel 
Application Energy Telemetry + * + * Copyright (C) 2025 Intel Corporation + * + * Author: + * Tony Luck + */ + +#define pr_fmt(fmt) "resctrl: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/** + * struct pmt_event - Telemetry event. + * @id: Resctrl event id. + * @idx: Counter index within each per-RMID block of counters. + * @bin_bits: Zero for integer valued events, else number bits in fraction + * part of fixed-point. + */ +struct pmt_event { + enum resctrl_event_id id; + unsigned int idx; + unsigned int bin_bits; +}; + +#define EVT(_id, _idx, _bits) { .id = _id, .idx = _idx, .bin_bits = _bits } + +/** + * struct event_group - Events with the same feature type ("energy" or "perf") and GUID. + * @pfname: PMT feature name ("energy" or "perf") of this event group. + * Used by boot rdt= option. + * @pfg: Points to the aggregated telemetry space information + * returned by the intel_pmt_get_regions_by_feature() + * call to the INTEL_PMT_TELEMETRY driver that contains + * data for all telemetry regions of type @pfname. + * Valid if the system supports the event group, + * NULL otherwise. + * @force_off: True when "rdt" command line or architecture code disables + * this event group due to insufficient RMIDs. + * @force_on: True when "rdt" command line overrides disable of this + * event group. + * @guid: Unique number per XML description file. + * @num_rmid: Number of RMIDs supported by this group. May be + * adjusted downwards if enumeration from + * intel_pmt_get_regions_by_feature() indicates fewer + * RMIDs can be tracked simultaneously. + * @mmio_size: Number of bytes of MMIO registers for this group. + * @num_events: Number of events in this group. + * @evts: Array of event descriptors. 
+ */ +struct event_group { + /* Data fields for additional structures to manage this group. */ + const char *pfname; + struct pmt_feature_group *pfg; + bool force_off, force_on; + + /* Remaining fields initialized from XML file. */ + u32 guid; + u32 num_rmid; + size_t mmio_size; + unsigned int num_events; + struct pmt_event evts[] __counted_by(num_events); +}; + +#define XML_MMIO_SIZE(num_rmids, num_events, num_extra_status) \ + (((num_rmids) * (num_events) + (num_extra_status)) * sizeof(u64)) + +/* + * Link: https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-ENERGY/cwf_aggregator.xml + */ +static struct event_group energy_0x26696143 = { + .pfname = "energy", + .guid = 0x26696143, + .num_rmid = 576, + .mmio_size = XML_MMIO_SIZE(576, 2, 3), + .num_events = 2, + .evts = { + EVT(PMT_EVENT_ENERGY, 0, 18), + EVT(PMT_EVENT_ACTIVITY, 1, 18), + } +}; + +/* + * Link: https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-PERF/cwf_aggregator.xml + */ +static struct event_group perf_0x26557651 = { + .pfname = "perf", + .guid = 0x26557651, + .num_rmid = 576, + .mmio_size = XML_MMIO_SIZE(576, 7, 3), + .num_events = 7, + .evts = { + EVT(PMT_EVENT_STALLS_LLC_HIT, 0, 0), + EVT(PMT_EVENT_C1_RES, 1, 0), + EVT(PMT_EVENT_UNHALTED_CORE_CYCLES, 2, 0), + EVT(PMT_EVENT_STALLS_LLC_MISS, 3, 0), + EVT(PMT_EVENT_AUTO_C6_RES, 4, 0), + EVT(PMT_EVENT_UNHALTED_REF_CYCLES, 5, 0), + EVT(PMT_EVENT_UOPS_RETIRED, 6, 0), + } +}; + +static struct event_group *known_event_groups[] = { + &energy_0x26696143, + &perf_0x26557651, +}; + +#define for_each_event_group(_peg) \ + for (_peg = known_event_groups; \ + _peg < &known_event_groups[ARRAY_SIZE(known_event_groups)]; \ + _peg++) + +bool intel_handle_aet_option(bool force_off, char *tok) +{ + struct event_group **peg; + bool ret = false; + u32 guid = 0; + char *name; + + if (!tok) + return false; + + name = strsep(&tok, ":"); + if (tok && kstrtou32(tok, 16, &guid)) + return false; + + for_each_event_group(peg) { + if (strcmp(name, 
(*peg)->pfname)) + continue; + if (guid && (*peg)->guid != guid) + continue; + if (force_off) + (*peg)->force_off = true; + else + (*peg)->force_on = true; + ret = true; + } + + return ret; +} + +static bool skip_telem_region(struct telemetry_region *tr, struct event_group *e) +{ + if (tr->guid != e->guid) + return true; + if (tr->plat_info.package_id >= topology_max_packages()) { + pr_warn("Bad package %u in guid 0x%x\n", tr->plat_info.package_id, + tr->guid); + return true; + } + if (tr->size != e->mmio_size) { + pr_warn("MMIO space wrong size (%zu bytes) for guid 0x%x. Expected %zu bytes.\n", + tr->size, e->guid, e->mmio_size); + return true; + } + + return false; +} + +static bool group_has_usable_regions(struct event_group *e, struct pmt_feature_group *p) +{ + bool usable_regions = false; + + for (int i = 0; i < p->count; i++) { + if (skip_telem_region(&p->regions[i], e)) { + /* + * Clear the address field of regions that did not pass the checks in + * skip_telem_region() so they will not be used by intel_aet_read_event(). + * This is safe to do because intel_pmt_get_regions_by_feature() allocates + * a new pmt_feature_group structure to return to each caller and only makes + * use of the pmt_feature_group::kref field when intel_pmt_put_feature_group() + * returns the structure. 
+ */ + p->regions[i].addr = NULL; + + continue; + } + usable_regions = true; + } + + return usable_regions; +} + +static bool all_regions_have_sufficient_rmid(struct event_group *e, struct pmt_feature_group *p) +{ + struct telemetry_region *tr; + + for (int i = 0; i < p->count; i++) { + if (!p->regions[i].addr) + continue; + tr = &p->regions[i]; + if (tr->num_rmids < e->num_rmid) { + e->force_off = true; + return false; + } + } + + return true; +} + +static bool enable_events(struct event_group *e, struct pmt_feature_group *p) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl; + int skipped_events = 0; + + if (e->force_off) + return false; + + if (!group_has_usable_regions(e, p)) + return false; + + /* + * Only enable event group with insufficient RMIDs if the user requested + * it from the kernel command line. + */ + if (!all_regions_have_sufficient_rmid(e, p) && !e->force_on) { + pr_info("%s %s:0x%x monitoring not enabled due to insufficient RMIDs\n", + r->name, e->pfname, e->guid); + return false; + } + + for (int i = 0; i < p->count; i++) { + if (!p->regions[i].addr) + continue; + /* + * e->num_rmid only adjusted lower if user (via rdt= kernel + * parameter) forces an event group with insufficient RMID + * to be enabled. 
+ */ + e->num_rmid = min(e->num_rmid, p->regions[i].num_rmids); + } + + for (int j = 0; j < e->num_events; j++) { + if (!resctrl_enable_mon_event(e->evts[j].id, true, + e->evts[j].bin_bits, &e->evts[j])) + skipped_events++; + } + if (e->num_events == skipped_events) { + pr_info("No events enabled in %s %s:0x%x\n", r->name, e->pfname, e->guid); + return false; + } + + if (r->mon.num_rmid) + r->mon.num_rmid = min(r->mon.num_rmid, e->num_rmid); + else + r->mon.num_rmid = e->num_rmid; + + if (skipped_events) + pr_info("%s %s:0x%x monitoring detected (skipped %d events)\n", r->name, + e->pfname, e->guid, skipped_events); + else + pr_info("%s %s:0x%x monitoring detected\n", r->name, e->pfname, e->guid); + + return true; +} + +static enum pmt_feature_id lookup_pfid(const char *pfname) +{ + if (!strcmp(pfname, "energy")) + return FEATURE_PER_RMID_ENERGY_TELEM; + else if (!strcmp(pfname, "perf")) + return FEATURE_PER_RMID_PERF_TELEM; + + pr_warn("Unknown PMT feature name '%s'\n", pfname); + + return FEATURE_INVALID; +} + +/* + * Request a copy of struct pmt_feature_group for each event group. If there is + * one, the returned structure has an array of telemetry_region structures, + * each element of the array describes one telemetry aggregator. The + * telemetry aggregators may have different GUIDs so obtain duplicate struct + * pmt_feature_group for event groups with same feature type but different + * GUID. Post-processing ensures an event group can only use the telemetry + * aggregators that match its GUID. An event group keeps a pointer to its + * struct pmt_feature_group to indicate that its events are successfully + * enabled. 
+ */ +bool intel_aet_get_events(void) +{ + struct pmt_feature_group *p; + enum pmt_feature_id pfid; + struct event_group **peg; + bool ret = false; + + for_each_event_group(peg) { + pfid = lookup_pfid((*peg)->pfname); + p = intel_pmt_get_regions_by_feature(pfid); + if (IS_ERR_OR_NULL(p)) + continue; + if (enable_events(*peg, p)) { + (*peg)->pfg = p; + ret = true; + } else { + intel_pmt_put_feature_group(p); + } + } + + return ret; +} + +void __exit intel_aet_exit(void) +{ + struct event_group **peg; + + for_each_event_group(peg) { + if ((*peg)->pfg) { + intel_pmt_put_feature_group((*peg)->pfg); + (*peg)->pfg = NULL; + } + } +} + +#define DATA_VALID BIT_ULL(63) +#define DATA_BITS GENMASK_ULL(62, 0) + +/* + * Read counter for an event on a domain (summing all aggregators on the + * domain). If an aggregator hasn't received any data for a specific RMID, + * the MMIO read indicates that data is not valid. Return success if at + * least one aggregator has valid data. + */ +int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val) +{ + struct pmt_event *pevt = arch_priv; + struct event_group *e; + bool valid = false; + u64 total = 0; + u64 evtcount; + void *pevt0; + u32 idx; + + pevt0 = pevt - pevt->idx; + e = container_of(pevt0, struct event_group, evts); + idx = rmid * e->num_events; + idx += pevt->idx; + + if (idx * sizeof(u64) + sizeof(u64) > e->mmio_size) { + pr_warn_once("MMIO index %u out of range\n", idx); + return -EIO; + } + + for (int i = 0; i < e->pfg->count; i++) { + if (!e->pfg->regions[i].addr) + continue; + if (e->pfg->regions[i].plat_info.package_id != domid) + continue; + evtcount = readq(e->pfg->regions[i].addr + idx * sizeof(u64)); + if (!(evtcount & DATA_VALID)) + continue; + total += evtcount & DATA_BITS; + valid = true; + } + + if (valid) + *val = total; + + return valid ? 
0 : -EINVAL; +} + +void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r, + struct list_head *add_pos) +{ + struct rdt_perf_pkg_mon_domain *d; + int err; + + d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu)); + if (!d) + return; + + d->hdr.id = id; + d->hdr.type = RESCTRL_MON_DOMAIN; + d->hdr.rid = RDT_RESOURCE_PERF_PKG; + cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + list_add_tail_rcu(&d->hdr.list, add_pos); + + err = resctrl_online_mon_domain(r, &d->hdr); + if (err) { + list_del_rcu(&d->hdr.list); + synchronize_rcu(); + kfree(d); + } +} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 9f4c2f0aaf5c8..eb923cd978e08 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -60,17 +60,17 @@ struct rdt_hw_ctrl_domain { }; /** - * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share - * a resource for a monitor function - * @d_resctrl: Properties exposed to the resctrl file system + * struct rdt_hw_l3_mon_domain - Arch private attributes of a set of CPUs sharing + * RDT_RESOURCE_L3 monitoring + * @d_resctrl: Properties exposed to the resctrl file system * @arch_mbm_states: Per-event pointer to the MBM event's saved state. * An MBM event's state is an array of struct arch_mbm_state * indexed by RMID on x86. * * Members of this structure are accessed via helpers that provide abstraction. 
*/ -struct rdt_hw_mon_domain { - struct rdt_mon_domain d_resctrl; +struct rdt_hw_l3_mon_domain { + struct rdt_l3_mon_domain d_resctrl; struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS]; }; @@ -79,11 +79,19 @@ static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctr return container_of(r, struct rdt_hw_ctrl_domain, d_resctrl); } -static inline struct rdt_hw_mon_domain *resctrl_to_arch_mon_dom(struct rdt_mon_domain *r) +static inline struct rdt_hw_l3_mon_domain *resctrl_to_arch_mon_dom(struct rdt_l3_mon_domain *r) { - return container_of(r, struct rdt_hw_mon_domain, d_resctrl); + return container_of(r, struct rdt_hw_l3_mon_domain, d_resctrl); } +/** + * struct rdt_perf_pkg_mon_domain - CPUs sharing an package scoped resctrl monitor resource + * @hdr: common header for different domain types + */ +struct rdt_perf_pkg_mon_domain { + struct rdt_domain_hdr hdr; +}; + /** * struct msr_param - set a range of MSRs from a domain * @res: The resource to use @@ -135,7 +143,7 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r extern struct rdt_hw_resource rdt_resources_all[]; -void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d); +void arch_mon_domain_online(struct rdt_resource *r, struct rdt_l3_mon_domain *d); /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ union cpuid_0x10_1_eax { @@ -208,7 +216,7 @@ union l3_qos_abmc_cfg { void rdt_ctrl_update(void *arg); -int rdt_get_mon_l3_config(struct rdt_resource *r); +int rdt_get_l3_mon_config(struct rdt_resource *r); bool rdt_cpu_has(int flag); @@ -217,4 +225,24 @@ void __init intel_rdt_mbm_apply_quirk(void); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r); +#ifdef CONFIG_X86_CPU_RESCTRL_INTEL_AET +bool intel_aet_get_events(void); +void __exit intel_aet_exit(void); +int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val); +void intel_aet_mon_domain_setup(int 
cpu, int id, struct rdt_resource *r, + struct list_head *add_pos); +bool intel_handle_aet_option(bool force_off, char *tok); +#else +static inline bool intel_aet_get_events(void) { return false; } +static inline void __exit intel_aet_exit(void) { } +static inline int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val) +{ + return -EINVAL; +} + +static inline void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r, + struct list_head *add_pos) { } +static inline bool intel_handle_aet_option(bool force_off, char *tok) { return false; } +#endif + #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index fe1a2aa53c16a..c10e1ae5ee8a7 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -109,7 +109,7 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) * * In RMID sharing mode there are fewer "logical RMID" values available * to accumulate data ("physical RMIDs" are divided evenly between SNC - * nodes that share an L3 cache). Linux creates an rdt_mon_domain for + * nodes that share an L3 cache). Linux creates an rdt_l3_mon_domain for * each SNC node. * * The value loaded into IA32_PQR_ASSOC is the "logical RMID". @@ -157,7 +157,7 @@ static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) return 0; } -static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom, +static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_l3_mon_domain *hw_dom, u32 rmid, enum resctrl_event_id eventid) { @@ -171,11 +171,11 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do return state ? 
&state[rmid] : NULL; } -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 unused, u32 rmid, enum resctrl_event_id eventid) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); int cpu = cpumask_any(&d->hdr.cpu_mask); struct arch_mbm_state *am; u32 prmid; @@ -194,9 +194,9 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, * Assumes that hardware counters are also reset and thus that there is * no need to record initial non-zero counts. */ -void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); enum resctrl_event_id eventid; int idx; @@ -217,10 +217,10 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) return chunks >> shift; } -static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d, +static u64 get_corrected_val(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 rmid, enum resctrl_event_id eventid, u64 msr_val) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct arch_mbm_state *am; u64 chunks; @@ -238,19 +238,29 @@ static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d, return chunks * hw_res->mon_scale; } -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 unused, u32 rmid, enum resctrl_event_id eventid, - u64 *val, void *ignored) + void *arch_priv, u64 *val, void 
*ignored) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - int cpu = cpumask_any(&d->hdr.cpu_mask); + struct rdt_hw_l3_mon_domain *hw_dom; + struct rdt_l3_mon_domain *d; struct arch_mbm_state *am; u64 msr_val; u32 prmid; + int cpu; int ret; resctrl_arch_rmid_read_context_check(); + if (r->rid == RDT_RESOURCE_PERF_PKG) + return intel_aet_read_event(hdr->id, rmid, arch_priv, val); + + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return -EINVAL; + + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); + hw_dom = resctrl_to_arch_mon_dom(d); + cpu = cpumask_any(&hdr->cpu_mask); prmid = logical_rmid_to_physical_rmid(cpu, rmid); ret = __rmid_read_phys(prmid, eventid, &msr_val); @@ -302,11 +312,11 @@ static int __cntr_id_read(u32 cntr_id, u64 *val) return 0; } -void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 unused, u32 rmid, int cntr_id, enum resctrl_event_id eventid) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); struct arch_mbm_state *am; am = get_arch_mbm_state(hw_dom, rmid, eventid); @@ -318,7 +328,7 @@ void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, } } -int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 unused, u32 rmid, int cntr_id, enum resctrl_event_id eventid, u64 *val) { @@ -348,7 +358,7 @@ int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, * must adjust RMID counter numbers based on SNC node. See * logical_rmid_to_physical_rmid() for code that does this. 
*/ -void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d) +void arch_mon_domain_online(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { if (snc_nodes_per_l3_cache > 1) msr_clear_bit(MSR_RMID_SNC_CONFIG, 0); @@ -417,7 +427,7 @@ static __init int snc_get_config(void) return ret; } -int __init rdt_get_mon_l3_config(struct rdt_resource *r) +int __init rdt_get_l3_mon_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); @@ -509,7 +519,7 @@ static void resctrl_abmc_set_one_amd(void *arg) */ static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; lockdep_assert_cpus_held(); @@ -548,11 +558,11 @@ static void resctrl_abmc_config_one_amd(void *info) /* * Send an IPI to the domain to assign the counter to RMID, event pair. */ -void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); union l3_qos_abmc_cfg abmc_cfg = { 0 }; struct arch_mbm_state *am; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 394c69e5ed838..39e8ac236b90a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3733,94 +3733,67 @@ static int arm_smmu_def_domain_type(struct device *dev) return 0; } -static int arm_smmu_group_set_mpam(struct iommu_group *group, u16 partid, +static int arm_smmu_group_set_mpam(struct device *dev, u16 partid, u8 pmg) { int i; u32 sid; - unsigned long flags; struct arm_smmu_ste *step; - struct iommu_domain *domain; struct arm_smmu_device *smmu; 
struct arm_smmu_master *master; struct arm_smmu_cmdq_batch cmds; - struct arm_smmu_domain *smmu_domain; struct arm_smmu_cmdq_ent cmd = { .opcode = CMDQ_OP_CFGI_STE, .cfgi = { .leaf = true, }, }; - struct arm_smmu_master_domain *master_domain; - domain = iommu_get_domain_for_group(group); - smmu_domain = to_smmu_domain(domain); - if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_MPAM)) + master = dev_iommu_priv_get(dev); + if (!(master->smmu->features & ARM_SMMU_FEAT_MPAM)) return -EIO; - smmu = smmu_domain->smmu; + smmu = master->smmu; arm_smmu_cmdq_batch_init(smmu, &cmds, &cmd); - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master_domain, &smmu_domain->devices, - devices_elm) { - master = master_domain->master; - - for (i = 0; i < master->num_streams; i++) { - sid = master->streams[i].id; - step = arm_smmu_get_step_for_sid(smmu, sid); - - /* These need locking if the VMSPtr is ever used */ - step->data[4] = FIELD_PREP(STRTAB_STE_4_PARTID, partid); - step->data[5] = FIELD_PREP(STRTAB_STE_5_PMG, pmg); + for (i = 0; i < master->num_streams; i++) { + sid = master->streams[i].id; + step = arm_smmu_get_step_for_sid(smmu, sid); - cmd.cfgi.sid = sid; - arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); - } + /* These need locking if the VMSPtr is ever used */ + step->data[4] = FIELD_PREP(STRTAB_STE_4_PARTID, partid); + step->data[5] = FIELD_PREP(STRTAB_STE_5_PMG, pmg); - master->partid = partid; - master->pmg = pmg; + cmd.cfgi.sid = sid; + arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); } - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + + master->partid = partid; + master->pmg = pmg; arm_smmu_cmdq_batch_submit(smmu, &cmds); return 0; } -static int arm_smmu_group_get_mpam(struct iommu_group *group, u16 *partid, +static int arm_smmu_group_get_mpam(struct device *dev, u16 *partid, u8 *pmg) { - int err = -EINVAL; - unsigned long flags; - struct iommu_domain *domain; struct arm_smmu_master *master; - struct arm_smmu_domain *smmu_domain; - 
struct arm_smmu_master_domain *master_domain; - domain = iommu_get_domain_for_group(group); - smmu_domain = to_smmu_domain(domain); - if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_MPAM)) + master = dev_iommu_priv_get(dev); + if (!(master->smmu->features & ARM_SMMU_FEAT_MPAM)) return -EIO; if (!partid && !pmg) return 0; - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master_domain, &smmu_domain->devices, - devices_elm) { - master = master_domain->master; - if (master) { - if (partid) - *partid = master->partid; - if (pmg) - *pmg = master->pmg; - err = 0; - } - } - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + if (partid) + *partid = master->partid; + if (pmg) + *pmg = master->pmg; - return err; + return 0; } static const struct iommu_ops arm_smmu_ops = { diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index db770b73e3a8f..4c243f6000c3a 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3914,25 +3914,19 @@ int iommu_group_set_qos_params(struct iommu_group *group, { const struct iommu_ops *ops; struct group_device *device; - int ret; + int ret = -ENODEV; mutex_lock(&group->mutex); - device = list_first_entry_or_null(&group->devices, typeof(*device), - list); - if (!device) { - ret = -ENODEV; - goto out_unlock; - } - - ops = dev_iommu_ops(device->dev); - if (!ops->set_group_qos_params) { - ret = -EOPNOTSUPP; - goto out_unlock; + for_each_group_device(group, device) { + ops = dev_iommu_ops(device->dev); + if (!ops->set_group_qos_params) { + ret = -EOPNOTSUPP; + break; + } + ret = ops->set_group_qos_params(device->dev, partition, perf_mon_grp); + if (ret < 0) + break; } - - ret = ops->set_group_qos_params(group, partition, perf_mon_grp); - -out_unlock: mutex_unlock(&group->mutex); return ret; @@ -3952,25 +3946,19 @@ int iommu_group_get_qos_params(struct iommu_group *group, { const struct iommu_ops *ops; struct group_device *device; - int ret; + int ret = -ENODEV; mutex_lock(&group->mutex); - 
device = list_first_entry_or_null(&group->devices, typeof(*device), - list); - if (!device) { - ret = -ENODEV; - goto out_unlock; - } - - ops = dev_iommu_ops(device->dev); - if (!ops->get_group_qos_params) { - ret = -EOPNOTSUPP; - goto out_unlock; + for_each_group_device(group, device) { + ops = dev_iommu_ops(device->dev); + if (!ops->get_group_qos_params) { + ret = -EOPNOTSUPP; + break; + } + ret = ops->get_group_qos_params(device->dev, partition, perf_mon_grp); + if (!ret) + break; } - - ret = ops->get_group_qos_params(group, partition, perf_mon_grp); - -out_unlock: mutex_unlock(&group->mutex); return ret; diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index f0740b5d59b5b..7ee8925aec9ef 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -723,6 +723,11 @@ static int mpam_ris_get_affinity(struct mpam_msc *msc, cpumask_t *affinity, case MPAM_CLASS_MEMORY: get_cpumask_from_node_id(comp->comp_id, affinity); /* affinity may be empty for CPU-less memory nodes */ + if (cpumask_empty(affinity)) { + dev_warn_once(&msc->pdev->dev, "CPU-less numa node"); + cpumask_copy(affinity, cpu_possible_mask); + } else if (class->level > 3) + cpumask_copy(affinity, cpu_possible_mask); break; case MPAM_CLASS_UNKNOWN: return 0; @@ -1547,6 +1552,9 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, if (!mpam_has_feature(type, cprops)) return -EOPNOTSUPP; + if (type == mpam_feat_msmon_mbwu) + type = mpam_msmon_choose_counter(class); + arg = (struct mon_read) { .ctx = ctx, .type = type, @@ -1554,9 +1562,6 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, }; *val = 0; - if (type == mpam_feat_msmon_mbwu) - type = mpam_msmon_choose_counter(class); - err = _msmon_read(comp, &arg); if (err == -EBUSY && class->nrdy_usec) wait_jiffies = usecs_to_jiffies(class->nrdy_usec); @@ -1579,41 +1584,6 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, return err; } -void 
mpam_msmon_reset_all_mbwu(struct mpam_component *comp) -{ - int idx, i; - struct mpam_msc *msc; - struct mpam_vmsc *vmsc; - struct mpam_msc_ris *ris; - - if (!mpam_is_enabled()) - return; - - idx = srcu_read_lock(&mpam_srcu); - list_for_each_entry_rcu(vmsc, &comp->vmsc, comp_list) { - if (!mpam_has_feature(mpam_feat_msmon_mbwu, &vmsc->props)) - continue; - - msc = vmsc->msc; - mpam_mon_sel_outer_lock(msc); - list_for_each_entry_rcu(ris, &msc->ris, vmsc_list) { - if (!mpam_has_feature(mpam_feat_msmon_mbwu, &ris->props)) - continue; - - if (WARN_ON_ONCE(!mpam_mon_sel_inner_lock(msc))) - continue; - - for (i = 0; i < ris->props.num_mbwu_mon; i++) { - ris->mbwu_state[i].correction = 0; - ris->mbwu_state[i].reset_on_next_read = true; - } - mpam_mon_sel_inner_unlock(msc); - } - mpam_mon_sel_outer_unlock(msc); - } - srcu_read_unlock(&mpam_srcu, idx); -} - void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx) { struct mpam_msc *msc; @@ -1647,34 +1617,6 @@ void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx) } } -static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) -{ - u32 num_words, msb; - u32 bm = ~0; - int i; - - lockdep_assert_held(&msc->part_sel_lock); - - if (wd == 0) - return; - - /* - * Write all ~0 to all but the last 32bit-word, which may - * have fewer bits... - */ - num_words = DIV_ROUND_UP(wd, 32); - for (i = 0; i < num_words - 1; i++, reg += sizeof(bm)) - __mpam_write_reg(msc, reg, bm); - - /* - * ....and then the last (maybe) partial 32bit word. When wd is a - * multiple of 32, msb should be 31 to write a full 32bit word. 
- */ - msb = (wd - 1) % 32; - bm = GENMASK(msb, 0); - __mpam_write_reg(msc, reg, bm); -} - static void mpam_apply_t241_erratum(struct mpam_msc_ris *ris, u16 partid) { int sidx, i, lcount = 1000; @@ -1713,12 +1655,42 @@ static void mpam_quirk_post_config_change(struct mpam_msc_ris *ris, u16 partid, mpam_apply_t241_erratum(ris, partid); } +static u16 mpam_wa_t241_force_mbw_min_to_one(struct mpam_props *props) +{ + u16 max_hw_value, min_hw_granule, res0_bits; + + res0_bits = 16 - props->bwa_wd; + max_hw_value = ((1 << props->bwa_wd) - 1) << res0_bits; + min_hw_granule = ~max_hw_value; + + return min_hw_granule + 1; +} + +static u16 mpam_wa_t241_calc_min_from_max(struct mpam_props *props, + struct mpam_config *cfg) +{ + u16 val = 0; + u16 max; + u16 delta = ((5 * MPAMCFG_MBW_MAX_MAX) / 100) - 1; + + if (mpam_has_feature(mpam_feat_mbw_max, cfg)) { + max = cfg->mbw_max; + } else { + /* Resetting. Hence, use the ris specific default. */ + max = GENMASK(15, 16 - props->bwa_wd); + } + + if (max > delta) + val = max - delta; + + return val; +} + /* Called via IPI. 
Call while holding an SRCU reference */ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, struct mpam_config *cfg) { u32 pri_val = 0; - u16 cmax = MPAMCFG_CMAX_CMAX; struct mpam_msc *msc = ris->vmsc->msc; struct mpam_props *rprops = &ris->props; u16 dspri = GENMASK(rprops->dspri_wd, 0); @@ -1740,26 +1712,25 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, } if (mpam_has_feature(mpam_feat_cpor_part, rprops) && - mpam_has_feature(mpam_feat_cpor_part, cfg)) { - if (cfg->reset_cpbm) - mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, - rprops->cpbm_wd); - else - mpam_write_partsel_reg(msc, CPBM, cfg->cpbm); - } + mpam_has_feature(mpam_feat_cpor_part, cfg)) + mpam_write_partsel_reg(msc, CPBM, cfg->cpbm); if (mpam_has_feature(mpam_feat_mbw_part, rprops) && - mpam_has_feature(mpam_feat_mbw_part, cfg)) { - if (cfg->reset_mbw_pbm) - mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, - rprops->mbw_pbm_bits); - else - mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm); - } + mpam_has_feature(mpam_feat_mbw_part, cfg)) + mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm); + + if (mpam_has_feature(mpam_feat_mbw_min, rprops)) { + u16 val = 0; - if (mpam_has_feature(mpam_feat_mbw_min, rprops) && - mpam_has_feature(mpam_feat_mbw_min, cfg)) - mpam_write_partsel_reg(msc, MBW_MIN, cfg->mbw_min); + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, msc)) { + u16 min = mpam_wa_t241_force_mbw_min_to_one(rprops); + + val = mpam_wa_t241_calc_min_from_max(rprops, cfg); + val = max(val, min); + } + + mpam_write_partsel_reg(msc, MBW_MIN, val); + } if (mpam_has_feature(mpam_feat_mbw_max, rprops) && mpam_has_feature(mpam_feat_mbw_max, cfg)) @@ -1769,25 +1740,18 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_has_feature(mpam_feat_mbw_prop, cfg)) mpam_write_partsel_reg(msc, MBW_PROP, 0); - if (mpam_has_feature(mpam_feat_cmax_cmax, rprops)) { - if (mpam_has_feature(mpam_feat_cmax_cmax, cfg)) { - u32 cmax_val = cfg->cmax; + if 
(mpam_has_feature(mpam_feat_cmax_cmax, rprops) && + mpam_has_feature(mpam_feat_cmax_cmax, cfg)) { + u32 cmax = cfg->cmax; - if (cfg->cmax_softlim) - cmax_val |= MPAMCFG_CMAX_SOFTLIM; - mpam_write_partsel_reg(msc, CMAX, cmax_val); - } else { - mpam_write_partsel_reg(msc, CMAX, cmax); - } + if (cfg->cmax_softlim) + cmax |= MPAMCFG_CMAX_SOFTLIM; + mpam_write_partsel_reg(msc, CMAX, cmax); } - if (mpam_has_feature(mpam_feat_cmax_cmin, rprops)) { - if (mpam_has_feature(mpam_feat_cmax_cmin, cfg)) { - mpam_write_partsel_reg(msc, CMIN, cfg->cmin); - } else { - mpam_write_partsel_reg(msc, CMIN, 0); - } - } + if (mpam_has_feature(mpam_feat_cmax_cmin, rprops) && + mpam_has_feature(mpam_feat_cmax_cmin, cfg)) + mpam_write_partsel_reg(msc, CMIN, cfg->cmin); if (mpam_has_feature(mpam_feat_cmax_cassoc, rprops)) mpam_write_partsel_reg(msc, CASSOC, MPAMCFG_CASSOC_CASSOC); @@ -1910,33 +1874,32 @@ static int mpam_save_mbwu_state(void *arg) return 0; } -static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) -{ - *reset_cfg = (struct mpam_config) { - .cpbm = ~0, - .mbw_pbm = ~0, - .mbw_max = MPAMCFG_MBW_MAX_MAX, - - .reset_cpbm = true, - .reset_mbw_pbm = true, - }; - bitmap_fill(reset_cfg->features, MPAM_FEATURE_LAST); -} - -/* - * This is not part of mpam_init_reset_cfg() as high level callers have the - * class, and low level callers a ris. - */ -static void mpam_wa_t241_force_mbw_min_to_one(struct mpam_config *cfg, - struct mpam_props *props) +static void mpam_init_reset_cfg(struct mpam_config *reset_cfg, + const struct mpam_props *props) { - u16 max_hw_value, min_hw_granule, res0_bits; + memset(reset_cfg, 0, sizeof(*reset_cfg)); - res0_bits = 16 - props->bwa_wd; - max_hw_value = ((1 << props->bwa_wd) - 1) << res0_bits; - min_hw_granule = ~max_hw_value; - - cfg->mbw_min = min_hw_granule + 1; + /* Set features and explicit default values for controls supported by this RIS. 
*/ + if (mpam_has_feature(mpam_feat_cpor_part, props)) { + mpam_set_feature(mpam_feat_cpor_part, reset_cfg); + reset_cfg->cpbm = GENMASK(props->cpbm_wd - 1, 0); + } + if (mpam_has_feature(mpam_feat_mbw_part, props)) { + mpam_set_feature(mpam_feat_mbw_part, reset_cfg); + reset_cfg->mbw_pbm = GENMASK(props->mbw_pbm_bits - 1, 0); + } + if (mpam_has_feature(mpam_feat_mbw_max, props)) { + mpam_set_feature(mpam_feat_mbw_max, reset_cfg); + reset_cfg->mbw_max = MPAMCFG_MBW_MAX_MAX; + } + if (mpam_has_feature(mpam_feat_cmax_cmax, props)) { + mpam_set_feature(mpam_feat_cmax_cmax, reset_cfg); + reset_cfg->cmax = MPAMCFG_CMAX_CMAX; + } + if (mpam_has_feature(mpam_feat_cmax_cmin, props)) { + mpam_set_feature(mpam_feat_cmax_cmin, reset_cfg); + reset_cfg->cmin = 0; + } } /* @@ -1948,14 +1911,11 @@ static int mpam_reset_ris(void *arg) struct mpam_config reset_cfg; struct mpam_msc_ris *ris = arg; struct reprogram_ris reprogram_arg; - struct mpam_msc *msc = ris->vmsc->msc; if (ris->in_reset_state) return 0; - mpam_init_reset_cfg(&reset_cfg); - if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, msc)) - mpam_wa_t241_force_mbw_min_to_one(&reset_cfg, &ris->props); + mpam_init_reset_cfg(&reset_cfg, &ris->props); reprogram_arg.ris = ris; reprogram_arg.cfg = &reset_cfg; @@ -2759,6 +2719,9 @@ static void mpam_enable_merge_class_features(struct mpam_component *comp) list_for_each_entry(vmsc, &comp->vmsc, comp_list) __class_props_mismatch(class, vmsc); + + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class)) + mpam_clear_feature(mpam_feat_mbw_min, &class->props); } /* @@ -2855,6 +2818,12 @@ static irqreturn_t __mpam_irq_handler(int irq, struct mpam_msc *msc) msc->id, mpam_errcode_names[errcode], partid, pmg, ris); + /* No action is required for the MPAM programming errors */ + if ((errcode != MPAM_ERRCODE_REQ_PARTID_RANGE) && + (errcode != MPAM_ERRCODE_REQ_PMG_RANGE)) { + return IRQ_HANDLED; + } + /* Disable this interrupt. 
*/ mpam_disable_msc_ecr(msc); @@ -2988,7 +2957,7 @@ static void __destroy_component_cfg(struct mpam_component *comp) static void mpam_reset_component_cfg(struct mpam_component *comp) { int i; - struct mpam_class *class = comp->class; + struct mpam_props *cprops = &comp->class->props; mpam_assert_partid_sizes_fixed(); @@ -2996,10 +2965,22 @@ static void mpam_reset_component_cfg(struct mpam_component *comp) return; for (i = 0; i < mpam_partid_max + 1; i++) { - mpam_init_reset_cfg(&comp->cfg[i]); - if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class)) - mpam_wa_t241_force_mbw_min_to_one(&comp->cfg[i], - &class->props); + if (cprops->cpbm_wd) { + comp->cfg[i].cpbm = GENMASK(cprops->cpbm_wd - 1, 0); + mpam_set_feature(mpam_feat_cpor_part, &comp->cfg[i]); + } + if (cprops->mbw_pbm_bits) { + comp->cfg[i].mbw_pbm = GENMASK(cprops->mbw_pbm_bits - 1, 0); + mpam_set_feature(mpam_feat_mbw_part, &comp->cfg[i]); + } + if (cprops->bwa_wd) { + comp->cfg[i].mbw_max = MPAMCFG_MBW_MAX_MAX; + mpam_set_feature(mpam_feat_mbw_max, &comp->cfg[i]); + } + if (cprops->cmax_wd) { + comp->cfg[i].cmax = MPAMCFG_CMAX_CMAX; + mpam_set_feature(mpam_feat_cmax_cmax, &comp->cfg[i]); + } } } @@ -3433,18 +3414,6 @@ static void mpam_extend_config(struct mpam_class *class, struct mpam_config *cfg u16 min, min_hw_granule, delta; u16 max_hw_value, res0_bits; - /* - * Calculate the values the 'min' control can hold. - * e.g. on a platform with bwa_wd = 8, min_hw_granule is 0x00ff because - * those bits are RES0. Configurations of this value are effectively - * zero. But configurations need to saturate at min_hw_granule on - * systems with mismatched bwa_wd, where the 'less than 0' values are - * implemented on some MSC, but not others. - */ - res0_bits = 16 - cprops->bwa_wd; - max_hw_value = ((1 << cprops->bwa_wd) - 1) << res0_bits; - min_hw_granule = ~max_hw_value; - /* * MAX and MIN should be set together. If only one is provided, * generate a configuration for the other. 
If only one control @@ -3454,6 +3423,19 @@ static void mpam_extend_config(struct mpam_class *class, struct mpam_config *cfg */ if (mpam_has_feature(mpam_feat_mbw_max, cfg) && !mpam_has_feature(mpam_feat_mbw_min, cfg)) { + /* + * Calculate the values the 'min' control can hold. + * e.g. on a platform with bwa_wd = 8, min_hw_granule is 0x00ff + * because those bits are RES0. Configurations of this value + * are effectively zero. But configurations need to saturate + * at min_hw_granule on systems with mismatched bwa_wd, where + * the 'less than 0' values are implemented on some MSC, but + * not others. + */ + res0_bits = 16 - cprops->bwa_wd; + max_hw_value = ((1 << cprops->bwa_wd) - 1) << res0_bits; + min_hw_granule = ~max_hw_value; + delta = ((5 * MPAMCFG_MBW_MAX_MAX) / 100) - 1; if (cfg->mbw_max > delta) min = cfg->mbw_max - delta; @@ -3463,12 +3445,6 @@ static void mpam_extend_config(struct mpam_class *class, struct mpam_config *cfg cfg->mbw_min = max(min, min_hw_granule); mpam_set_feature(mpam_feat_mbw_min, cfg); } - - if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class) && - cfg->mbw_min <= min_hw_granule) { - cfg->mbw_min = min_hw_granule + 1; - mpam_set_feature(mpam_feat_mbw_min, cfg); - } } int mpam_apply_config(struct mpam_component *comp, u16 partid, diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index c2cb5129e3e21..0206fa67be21e 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -45,12 +45,6 @@ DECLARE_STATIC_KEY_FALSE(mpam_enabled); */ #define USE_PRE_ALLOCATED (U16_MAX + 1) -/* - * Only these event configuration bits are supported. MPAM can't know if - * data is being written back, these will show up as a write. 
- */ -#define MPAM_RESTRL_EVT_CONFIG_VALID (READS_TO_LOCAL_MEM | NON_TEMP_WRITE_TO_LOCAL_MEM) - static inline bool mpam_is_enabled(void) { return static_branch_likely(&mpam_enabled); @@ -369,9 +363,6 @@ struct mpam_config { bool cmax_softlim; - bool reset_cpbm; - bool reset_mbw_pbm; - struct mpam_garbage garbage; }; @@ -455,9 +446,7 @@ struct mpam_resctrl_dom { struct mpam_component *mon_comp[QOS_NUM_EVENTS]; struct rdt_ctrl_domain resctrl_ctrl_dom; - struct rdt_mon_domain resctrl_mon_dom; - - u32 mbm_local_evt_cfg; + struct rdt_l3_mon_domain resctrl_mon_dom; }; struct mpam_resctrl_res { @@ -541,7 +530,6 @@ int mpam_apply_config(struct mpam_component *comp, u16 partid, int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, enum mpam_device_features, u64 *val); void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx); -void mpam_msmon_reset_all_mbwu(struct mpam_component *comp); int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); @@ -554,13 +542,13 @@ void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg); int mpam_resctrl_setup(void); void mpam_resctrl_exit(void); int mpam_resctrl_online_cpu(unsigned int cpu); -int mpam_resctrl_offline_cpu(unsigned int cpu); +void mpam_resctrl_offline_cpu(unsigned int cpu); void mpam_resctrl_teardown_class(struct mpam_class *class); #else static inline int mpam_resctrl_setup(void) { return 0; } static inline void mpam_resctrl_exit(void) { } static inline int mpam_resctrl_online_cpu(unsigned int cpu) { return 0; } -static inline int mpam_resctrl_offline_cpu(unsigned int cpu) { return 0; } +static inline void mpam_resctrl_offline_cpu(unsigned int cpu) { } static inline void mpam_resctrl_teardown_class(struct mpam_class *class) { } #endif /* CONFIG_RESCTRL_FS */ diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 8e87afa90656a..0accede8cc09c 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ 
b/drivers/resctrl/mpam_resctrl.c @@ -12,9 +12,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -35,6 +32,11 @@ DECLARE_WAIT_QUEUE_HEAD(resctrl_mon_ctx_waiters); */ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; +#define for_each_mpam_resctrl_control(res, rid) \ + for (rid = 0, res = &mpam_resctrl_controls[rid]; \ + rid < RDT_NUM_RESOURCES; \ + rid++, res = &mpam_resctrl_controls[rid]) + /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ static DEFINE_MUTEX(domain_list_lock); @@ -46,10 +48,13 @@ static DEFINE_MUTEX(domain_list_lock); * make use of them, we pretend they are on L3. * Class pointer may be NULL. */ -static struct mpam_resctrl_mon mpam_resctrl_counters[QOS_NUM_EVENTS]; +#define MPAM_MAX_EVENT QOS_L3_MBM_LOCAL_EVENT_ID +static struct mpam_resctrl_mon mpam_resctrl_counters[MPAM_MAX_EVENT + 1]; -static bool exposed_alloc_capable; -static bool exposed_mon_capable; +#define for_each_mpam_resctrl_mon(mon, eventid) \ + for (eventid = QOS_FIRST_EVENT, mon = &mpam_resctrl_counters[eventid]; \ + eventid <= MPAM_MAX_EVENT; \ + eventid++, mon = &mpam_resctrl_counters[eventid]) /* * MPAM emulates CDP by setting different PARTID in the I/D fields of MPAM0_EL1. @@ -57,14 +62,6 @@ static bool exposed_mon_capable; */ static bool cdp_enabled; -/* - * To support CPU-less NUMA nodes, user-space needs to opt in to the MB - * domain IDs being the NUMA nid instead of the corresponding CPU's L3 - * cache-id. - */ -static bool mb_uses_numa_nid; -static bool mb_numa_nid_possible; -static bool mb_l3_cache_id_possible; /* * If resctrl_init() succeeded, resctrl_exit() can be used to remove support * for the filesystem in the event of an error. 
@@ -103,12 +100,49 @@ static bool mpam_resctrl_abmc_enabled(void) bool resctrl_arch_alloc_capable(void) { - return exposed_alloc_capable; + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + for_each_mpam_resctrl_control(res, rid) { + if (res->resctrl_res.alloc_capable) + return true; + } + + return false; } bool resctrl_arch_mon_capable(void) { - return exposed_mon_capable; + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; + + /* All monitors are presented as being on the L3 cache */ + return l3->mon_capable; +} + +/* + * Provide empty implementations for compilation. The features are not + * needed on MPAM platforms. + */ +bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) +{ + return false; +} + +void resctrl_arch_mon_event_config_read(void *info) +{ +} + +void resctrl_arch_mon_event_config_write(void *info) +{ +} + +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) +{ +} + +void resctrl_arch_pre_mount(void) +{ } bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid) @@ -145,18 +179,48 @@ static void resctrl_reset_task_closids(void) read_unlock(&tasklist_lock); } -static void mpam_resctrl_monitor_sync_abmc_vals(struct rdt_resource *l3) +static struct mpam_resctrl_mon *mpam_resctrl_mon_from_res(struct mpam_resctrl_res *res) +{ + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; + + if (!res->class) + return NULL; + + for_each_mpam_resctrl_mon(mon, eventid) { + if (mon->class == res->class) + return mon; + } + return NULL; +} + +static struct mpam_resctrl_res *mpam_resctrl_res_from_mon(struct mpam_resctrl_mon *mon) { - l3->mon.num_mbm_cntrs = l3_num_allocated_mbwu; + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + if (!mon->class) + return NULL; + + for_each_mpam_resctrl_control(res, rid) { + if (res->class == mon->class) + return res; + } + return NULL; +} + +static void 
mpam_resctrl_monitor_sync_abmc_vals(struct rdt_resource *r) +{ + r->mon.num_mbm_cntrs = l3_num_allocated_mbwu; if (cdp_enabled) - l3->mon.num_mbm_cntrs /= 2; + r->mon.num_mbm_cntrs /= 2; - if (l3->mon.num_mbm_cntrs) { - l3->mon.mbm_cntr_assignable = mpam_resctrl_abmc_enabled(); - l3->mon.mbm_assign_on_mkdir = mpam_resctrl_abmc_enabled(); + if (r->mon.num_mbm_cntrs) { + r->mon.mbm_cntr_assignable = mpam_resctrl_abmc_enabled(); + r->mon.mbm_assign_on_mkdir = mpam_resctrl_abmc_enabled(); } else { - l3->mon.mbm_cntr_assignable = false; - l3->mon.mbm_assign_on_mkdir = false; + r->mon.mbm_cntr_assignable = false; + r->mon.mbm_assign_on_mkdir = false; } } @@ -466,7 +530,7 @@ bool resctrl_arch_mon_can_overflow(void) static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, - enum mpam_device_features mon_type, enum mon_filter_options mon_opts, + enum mpam_device_features mon_type, int mon_idx, enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val) { @@ -495,7 +559,6 @@ __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, cfg.match_pmg = true; cfg.partid = closid; cfg.pmg = rmid; - cfg.opts = mon_opts; if (irqs_disabled()) { /* Check if we can access this domain without an IPI */ @@ -506,49 +569,36 @@ __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, } static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, - enum mpam_device_features mon_type, enum mon_filter_options mon_opts, + enum mpam_device_features mon_type, int mon_idx, u32 closid, u32 rmid, u64 *val) { if (cdp_enabled) { u64 cdp_val = 0; int err; - err = __read_mon(mon, mon_comp, mon_type, mon_opts, mon_idx, + err = __read_mon(mon, mon_comp, mon_type, mon_idx, CDP_CODE, closid, rmid, &cdp_val); if (err) return err; - err = __read_mon(mon, mon_comp, mon_type, mon_opts, mon_idx, + err = __read_mon(mon, mon_comp, mon_type, mon_idx, CDP_DATA, closid, rmid, &cdp_val); if (!err) *val += cdp_val; return 
err; } - return __read_mon(mon, mon_comp, mon_type, mon_idx, mon_opts, + return __read_mon(mon, mon_comp, mon_type, mon_idx, CDP_NONE, closid, rmid, val); } -static enum mon_filter_options resctrl_evt_config_to_mpam(u32 local_evt_cfg) -{ - switch (local_evt_cfg) { - case READS_TO_LOCAL_MEM: - return COUNT_READ; - case NON_TEMP_WRITE_TO_LOCAL_MEM: - return COUNT_WRITE; - default: - return COUNT_BOTH; - } -} - /* MBWU when not in ABMC mode, and CSU counters. */ -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 closid, u32 rmid, enum resctrl_event_id eventid, - u64 *val, void *arch_mon_ctx) + void *arch_priv, u64 *val, void *arch_mon_ctx) { struct mpam_resctrl_dom *l3_dom; struct mpam_component *mon_comp; - enum mon_filter_options mon_opts; u32 mon_idx = *(u32 *)arch_mon_ctx; enum mpam_device_features mon_type; struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; @@ -561,9 +611,8 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, if (eventid >= QOS_NUM_EVENTS || !mon->class) return -EINVAL; - l3_dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); + l3_dom = container_of(hdr, struct mpam_resctrl_dom, resctrl_mon_dom.hdr); mon_comp = l3_dom->mon_comp[eventid]; - mon_opts = resctrl_evt_config_to_mpam(l3_dom->mbm_local_evt_cfg); switch (eventid) { case QOS_L3_OCCUP_EVENT_ID: @@ -577,17 +626,16 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, return -EINVAL; } - return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_opts, mon_idx, + return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_idx, closid, rmid, val); } /* MBWU counters when in ABMC mode */ -int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, int mon_idx, enum resctrl_event_id eventid, u64 *val) { 
struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; - enum mon_filter_options mon_opts; struct mpam_resctrl_dom *l3_dom; struct mpam_component *mon_comp; @@ -599,10 +647,9 @@ int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, l3_dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); mon_comp = l3_dom->mon_comp[eventid]; - mon_opts = resctrl_evt_config_to_mpam(l3_dom->mbm_local_evt_cfg); return read_mon_cdp_safe(mon, mon_comp, mpam_feat_msmon_mbwu, mon_idx, - mon_opts, closid, rmid, val); + closid, rmid, val); } static void __reset_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, @@ -640,7 +687,7 @@ static void reset_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_compone } /* Called via IPI. Call with read_cpus_lock() held. */ -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid) { struct mpam_resctrl_dom *l3_dom; @@ -661,7 +708,7 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, } /* Reset an assigned counter */ -void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, int cntr_id, enum resctrl_event_id eventid) { @@ -843,14 +890,7 @@ static u32 percent_to_mbw_pbm(u8 pc, struct mpam_props *cprops) */ static u32 fract16_to_percent(u16 fract, u8 wd) { - u32 val = fract; - - val >>= 16 - wd; - val += 1; - val *= MAX_MBA_BW; - val = DIV_ROUND_CLOSEST(val, 1 << wd); - - return val; + return DIV_ROUND_CLOSEST((fract + 1) * 100, 65536); } /* @@ -865,14 +905,7 @@ static u32 fract16_to_percent(u16 fract, u8 wd) */ static u16 percent_to_fract16(u8 pc, u8 wd) { - u32 val = pc; - - val <<= wd; - val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW); - val = max(val, 1) - 1; - val <<= 16 - wd; - - 
return val; + return pc ? (((pc * 65536) / 100) - 1) : 0; } static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) @@ -942,7 +975,8 @@ static bool topology_matches_l3(struct mpam_class *victim) { int cpu, err; struct mpam_component *victim_iter; - cpumask_var_t __free(free_cpumask_var) tmp_cpumask; + bool matched_once = false; + cpumask_var_t __free(free_cpumask_var) tmp_cpumask = NULL; if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) return false; @@ -956,7 +990,10 @@ static bool topology_matches_l3(struct mpam_class *victim) return false; } - cpu = cpumask_any(&victim_iter->affinity); + cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask); + if (matched_once && (cpu >= nr_cpu_ids)) + continue; + if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) return false; @@ -978,18 +1015,63 @@ static bool topology_matches_l3(struct mpam_class *victim) return false; } + matched_once = true; } return true; } -static bool topology_matches_numa(struct mpam_class *victim) +/* + * Test if the traffic for a class matches that at egress from the L3. For + * MSC at memory controllers this is only possible if there is a single L3 + * as otherwise the counters at the memory can include bandwidth from the + * non-local L3. + */ +static bool traffic_matches_l3(struct mpam_class *class) { - /* - * For now, check this is a memory class, in which case component - * id are already NUMA nid. 
- */ - return (victim->type == MPAM_CLASS_MEMORY); + int err, cpu; + + lockdep_assert_cpus_held(); + + if (class->type == MPAM_CLASS_CACHE && class->level == 3) + return true; + + if (class->type == MPAM_CLASS_CACHE && class->level != 3) { + pr_debug("class %u is a different cache from L3\n", class->level); + return false; + } + + cpumask_var_t __free(free_cpumask_var) tmp_cpumask = NULL; + if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) { + pr_debug("cpumask allocation failed\n"); + return false; + } + + if (class->type != MPAM_CLASS_MEMORY) { + pr_debug("class %u is neither of type cache or memory\n", + class->level); + return false; + } + + cpu = cpumask_any_and(&class->affinity, cpu_online_mask); + err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); + if (err) { + pr_debug("Failed to find L3 downstream to cpu %d\n", cpu); + return false; + } + + if (!cpumask_equal(tmp_cpumask, cpu_possible_mask)) { + pr_debug("There is more than one L3\n"); + return false; + } + + /* Be strict; the traffic might stop in the intermediate cache. */ + if (get_cpu_cacheinfo_id(cpu, 4) != -1) { + pr_debug("L3 isn't the last level of cache\n"); + return false; + } + + return true; } /* Test whether we can export MPAM_CLASS_CACHE:{2,3}? 
*/ @@ -1036,7 +1118,6 @@ static void mpam_resctrl_pick_caches(void) else res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; res->class = class; - exposed_alloc_capable = true; } if (has_cmax) { pr_debug("pick_caches: Class has CMAX\n"); @@ -1045,7 +1126,6 @@ static void mpam_resctrl_pick_caches(void) else res = &mpam_resctrl_controls[RDT_RESOURCE_L3_MAX]; res->class = class; - exposed_alloc_capable = true; } } } @@ -1061,8 +1141,6 @@ static void mpam_resctrl_pick_mba(void) list_for_each_entry_srcu(class, &mpam_classes, classes_list, srcu_read_lock_held(&mpam_srcu)) { struct mpam_props *cprops = &class->props; - bool l3_cache_id_possible = false; - bool numa_nid_possible = false; if (class->level < 3) { pr_debug("class %u is before L3\n", class->level); @@ -1079,18 +1157,16 @@ static void mpam_resctrl_pick_mba(void) continue; } - if (topology_matches_numa(class)) { - pr_debug("class %u topology matches NUMA domains\n", class->level); - numa_nid_possible = true; - } - - if (topology_matches_l3(class)) { - pr_debug("class %u topology matches L3\n", class->level); - l3_cache_id_possible = true; + if ((class->level == 3) && (!topology_matches_l3(class))) { + pr_debug("class %u topology doesn't match L3\n", class->level); + continue; } - if (!l3_cache_id_possible && !numa_nid_possible) { - pr_debug("class %u has no matching topology for MB\n", class->level); + /* Check memory at egress from L3 for MSC with L3 */ + if (!cpumask_equal(&class->affinity, cpu_possible_mask) && + !traffic_matches_l3(class)) { + pr_debug("class %u traffic doesn't match L3 egress\n", + class->level); continue; } @@ -1099,24 +1175,14 @@ static void mpam_resctrl_pick_mba(void) * mbm_local is implicitly part of the L3, pick a resource to be MBA * that as close as possible to the L3. */ - if (!candidate_class || class->level < candidate_class->level) { - /* - * Refuse to pick a closer class if it would prevent cache-id - * being used as domain-id by default. 
- */ - if (!candidate_class || l3_cache_id_possible) { - candidate_class = class; - mb_l3_cache_id_possible = l3_cache_id_possible; - mb_numa_nid_possible = numa_nid_possible; - } - } + if (!candidate_class || class->level < candidate_class->level) + candidate_class = class; } if (candidate_class) { pr_debug("selected class %u to back MBA\n", candidate_class->level); res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; res->class = candidate_class; - exposed_alloc_capable = true; } } @@ -1188,7 +1254,6 @@ static void counter_update_class(enum resctrl_event_id evt_id, pr_debug("Updating event %u to use class %u\n", evt_id, class->level); mon->class = class; - exposed_mon_capable = true; if (evt_id == QOS_L3_OCCUP_EVENT_ID) return; @@ -1272,7 +1337,10 @@ static void mpam_resctrl_pick_counters(void) } has_mbwu = class_has_usable_mbwu(class); - if (has_mbwu && topology_matches_l3(class)) { + if (has_mbwu && + ((class->type == MPAM_CLASS_MEMORY) || + (topology_matches_l3(class) && + traffic_matches_l3(class)))) { pr_debug("class %u has usable MBWU, and matches L3 topology", class->level); /* @@ -1303,82 +1371,6 @@ static void mpam_resctrl_pick_counters(void) mpam_resctrl_counters[QOS_L3_MBM_TOTAL_EVENT_ID].class); } -bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) -{ - struct mpam_class *class; - struct mpam_props *cprops; - - class = mpam_resctrl_counters[evt].class; - if (!class) - return false; - - cprops = &class->props; - - return mpam_has_feature(mpam_feat_msmon_mbwu_rwbw, cprops); -} - -void resctrl_arch_mon_event_config_read(void *info) -{ - struct mpam_resctrl_dom *dom; - struct resctrl_mon_config_info *mon_info = info; - - if (!mpam_is_enabled()) { - mon_info->mon_config = 0; - return; - } - - dom = container_of(mon_info->d, struct mpam_resctrl_dom, resctrl_mon_dom); - mon_info->mon_config = dom->mbm_local_evt_cfg & MAX_EVT_CONFIG_BITS; -} - -void resctrl_arch_mon_event_config_write(void *info) -{ - struct mpam_resctrl_dom *dom; - struct 
resctrl_mon_config_info *mon_info = info; - - WARN_ON_ONCE(mon_info->mon_config & ~MPAM_RESTRL_EVT_CONFIG_VALID); - - dom = container_of(mon_info->d, struct mpam_resctrl_dom, resctrl_mon_dom); - - if (!mpam_is_enabled()) { - dom->mbm_local_evt_cfg = 0; - return; - } - - dom->mbm_local_evt_cfg = mon_info->mon_config & MPAM_RESTRL_EVT_CONFIG_VALID; -} - -void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - int i; - struct mpam_resctrl_dom *dom; - struct mpam_resctrl_mon *mon; - struct mpam_component *mon_comp; - - dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); - if (!mpam_is_enabled()) { - dom->mbm_local_evt_cfg = 0; - return; - } - dom->mbm_local_evt_cfg = MPAM_RESTRL_EVT_CONFIG_VALID; - - /* - * Monitors may be backed by different classes of MSC, all - * possible components need to be reset... - */ - for (i = 0; i < QOS_NUM_EVENTS; i++) { - mon = &mpam_resctrl_counters[i]; - if (!mon->class) - continue; // dummy resource - - mon_comp = dom->mon_comp[i]; - if (!mon_comp) - continue; - - mpam_msmon_reset_all_mbwu(mon_comp); - } -} - static void __config_cntr(struct mpam_resctrl_mon *mon, u32 cntr_id, enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, bool assign) @@ -1395,7 +1387,7 @@ static void __config_cntr(struct mpam_resctrl_mon *mon, u32 cntr_id, mon->mbwu_idx_to_mon[mbwu_idx] = -1; } -void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign) { @@ -1418,10 +1410,16 @@ void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) { - if (r != &mpam_resctrl_controls[RDT_RESOURCE_L3].resctrl_res) + struct mpam_resctrl_res *res; + struct mpam_resctrl_mon *mon; + + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + + mon = 
mpam_resctrl_mon_from_res(res); + if (!mon) return false; - return mpam_resctrl_abmc_enabled(); + return mon->assigned_counters ? true : false; } int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) @@ -1486,13 +1484,9 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, break; case RDT_RESOURCE_MBA: - /* Domain ID is the L3 cache-id by default */ - if (mb_l3_cache_id_possible) - r->alloc_capable = true; - + r->alloc_capable = true; r->schema_fmt = RESCTRL_SCHEMA_PERCENT; r->ctrl_scope = RESCTRL_L3_CACHE; - r->mba.delay_linear = true; r->mba.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = get_mba_min(cprops); @@ -1511,15 +1505,12 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) { - bool is_mb; struct mpam_class *class = comp->class; - is_mb = (mpam_resctrl_controls[RDT_RESOURCE_MBA].class == class); - - if (is_mb && mb_uses_numa_nid && topology_matches_numa(class)) + if (class->type == MPAM_CLASS_CACHE) return comp->comp_id; - if (class->type == MPAM_CLASS_CACHE) + if ((class->type == MPAM_CLASS_MEMORY) && (class->level > 3)) return comp->comp_id; if (topology_matches_l3(class)) { @@ -1545,10 +1536,10 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) */ static int mpam_resctrl_monitor_init_abmc(struct mpam_resctrl_mon *mon) { - struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct mpam_resctrl_res *res = mpam_resctrl_res_from_mon(mon); size_t array_size = resctrl_arch_system_num_rmid_idx() * sizeof(int); int *rmid_array __free(kfree) = kmalloc(array_size, GFP_KERNEL); - struct rdt_resource *l3 = &res->resctrl_res; + struct rdt_resource *r = &res->resctrl_res; struct mpam_class *class = mon->class; u16 num_mbwu_mon; @@ -1569,70 +1560,33 @@ static int mpam_resctrl_monitor_init_abmc(struct mpam_resctrl_mon *mon) return PTR_ERR(mon->assigned_counters); 
mon->mbwu_idx_to_mon = no_free_ptr(rmid_array); - mpam_resctrl_monitor_sync_abmc_vals(l3); + mpam_resctrl_monitor_sync_abmc_vals(r); return 0; } -bool resctrl_arch_get_mb_uses_numa_nid(void) -{ - return mb_uses_numa_nid; -} - -int resctrl_arch_set_mb_uses_numa_nid(bool enabled) +static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, + enum resctrl_event_id type) { - struct rdt_resource *r; struct mpam_resctrl_res *res; - struct mpam_resctrl_dom *dom; - struct rdt_ctrl_domain *ctrl_d; - - lockdep_assert_cpus_held(); - lockdep_assert_mems_held(); - - if (!mb_numa_nid_possible) - return -EOPNOTSUPP; - - if (mb_uses_numa_nid == enabled) - return 0; - - /* Domain IDs as NUMA nid is only defined for MBA */ - res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; - if (!res->class) - return -EOPNOTSUPP; - r = &res->resctrl_res; - - /* repaint the domain IDs */ - mb_uses_numa_nid = enabled; - list_for_each_entry(ctrl_d, &r->ctrl_domains, hdr.list) { - int cpu = cpumask_any(&ctrl_d->hdr.cpu_mask); - - dom = container_of(ctrl_d, struct mpam_resctrl_dom, resctrl_ctrl_dom); - ctrl_d->hdr.id = mpam_resctrl_pick_domain_id(cpu, dom->ctrl_comp); - } - - /* monitor domains are unaffected and should continue to use the L3 */ + struct rdt_resource *r; - if (!enabled && mb_l3_cache_id_possible) - r->alloc_capable = true; - else if (enabled && mb_numa_nid_possible) - r->alloc_capable = true; + if ((mon->class->type == MPAM_CLASS_MEMORY) && (mon->class->level > 3)) + res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; else - r->alloc_capable = false; - - return 0; -} + res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; -static void mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, - enum resctrl_event_id type) -{ - struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; - struct rdt_resource *l3 = &res->resctrl_res; + r = &res->resctrl_res; lockdep_assert_cpus_held(); - /* There also needs to be an L3 cache present */ + /* + * There also needs to be an L3 cache 
present. + * The check just requires any online CPU and it can't go offline as we + * hold the cpu lock. + */ if (get_cpu_cacheinfo_id(smp_processor_id(), 3) == -1) - return; + return 0; /* * If there are no MPAM resources on L3, force it into existence. @@ -1644,42 +1598,43 @@ static void mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, res->class = mpam_resctrl_counters[type].class; } - /* Called multiple times!, once per event type */ - if (exposed_mon_capable) { - l3->mon_capable = true; - - /* Setting name is necessary on monitor only platforms */ - l3->name = "L3"; - l3->mon_scope = RESCTRL_L3_CACHE; + /* + * Called multiple times!, once per event type that has a + * monitoring class. + * Setting name is necessary on monitor only platforms. + */ + if ((mon->class->type == MPAM_CLASS_MEMORY) && (mon->class->level > 3)) { + r->name = "MB"; + } else { + r->name = "L3"; + } + r->mon_scope = RESCTRL_L3_CACHE; - resctrl_enable_mon_event(type); + /* + * num-rmid is the upper bound for the number of monitoring + * groups that can exist simultaneously, including the + * default monitoring group for each control group. Hence, + * advertise the whole rmid_idx space even though each + * control group has its own pmg/rmid space. Unfortunately, + * this does mean userspace needs to know the architecture + * to correctly interpret this value. + */ + r->mon.num_rmid = resctrl_arch_system_num_rmid_idx(); - /* - * Unfortunately, num_rmid doesn't mean anything for - * mpam, and its exposed to user-space! - * - * num-rmid is supposed to mean the minimum number of - * monitoring groups that can exist simultaneously, including - * the default monitoring group for each control group. - * - * For mpam, each control group has its own pmg/rmid space, so - * it is not appropriate to advertise the whole rmid_idx space - * here. 
But the pmgs corresponding to the parent control - * group can be allocated freely: - */ - l3->mon.num_rmid = mpam_pmg_max + 1;; + if (resctrl_enable_mon_event(type, false, 0, NULL)) + r->mon_capable = true; - switch (type) { - case QOS_L3_MBM_LOCAL_EVENT_ID: - case QOS_L3_MBM_TOTAL_EVENT_ID: - mpam_resctrl_monitor_init_abmc(mon); - l3->mon.mbm_cfg_mask = MPAM_RESTRL_EVT_CONFIG_VALID; + switch (type) { + case QOS_L3_MBM_LOCAL_EVENT_ID: + case QOS_L3_MBM_TOTAL_EVENT_ID: + mpam_resctrl_monitor_init_abmc(mon); - return; - default: - return; - } + return 0; + default: + return 0; } + + return 0; } u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, @@ -1881,46 +1836,40 @@ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) mpam_reset_class_locked(res->class); } -/** - * mpam_resctrl_domain_hdr_init() - Bring a subset of a domain online. - * @onlined_cpus: The set of CPUs that are online from the domain's - * perspective. - * @comp: The mpam component being brought online. - * @hdr: The header representing the domain. - * - * Adds @onlined_cpus to @hdr's cpu_mask, and sets the @hdr id. - * For NUMA nodes, @onlined_cpus will be cpu_possible_mask. - */ -static void mpam_resctrl_domain_hdr_init(const struct cpumask *onlined_cpus, - struct mpam_component *comp, +static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp, + enum resctrl_res_level rid, struct rdt_domain_hdr *hdr) { - int cpu = cpumask_any(onlined_cpus); - lockdep_assert_cpus_held(); INIT_LIST_HEAD(&hdr->list); hdr->id = mpam_resctrl_pick_domain_id(cpu, comp); - cpumask_and(&hdr->cpu_mask, &hdr->cpu_mask, onlined_cpus); + hdr->rid = rid; + cpumask_set_cpu(cpu, &hdr->cpu_mask); +} + +static void mpam_resctrl_online_domain_hdr(unsigned int cpu, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_cpus_held(); + + cpumask_set_cpu(cpu, &hdr->cpu_mask); } /** - * mpam_resctrl_offline_domain_hdr() - Take a subset of a domain offline. 
- * @offlined_cpus: The set of CPUs that are offline from the domain's - * perspective. + * mpam_resctrl_offline_domain_hdr() - Update the domain header to remove a CPU. + * @cpu: The CPU to remove from the domain. * @hdr: The domain's header. * - * Removes @offlined_cpus from @hdr's cpu_mask. If the list is empty, + * Removes @cpu from the header mask. If this was the last CPU in the domain, * the domain header is removed from its parent list and true is returned, * indicating the parent structure can be freed. * If there are other CPUs in the domain, returns false. - * - * For NUMA nodes, @offlined_cpus will be cpu_possible_mask. */ -static bool mpam_resctrl_offline_domain_hdr(const struct cpumask *offlined_cpus, +static bool mpam_resctrl_offline_domain_hdr(unsigned int cpu, struct rdt_domain_hdr *hdr) { - cpumask_andnot(&hdr->cpu_mask, &hdr->cpu_mask, offlined_cpus); + cpumask_clear_cpu(cpu, &hdr->cpu_mask); if (cpumask_empty(&hdr->cpu_mask)) { list_del(&hdr->list); return true; @@ -1929,18 +1878,14 @@ static bool mpam_resctrl_offline_domain_hdr(const struct cpumask *offlined_cpus, return false; } -static struct mpam_component *find_component(struct mpam_class *victim, - const struct cpumask *onlined_cpus) +static struct mpam_component *find_component(struct mpam_class *victim, int cpu) { struct mpam_component *victim_comp; guard(srcu)(&mpam_srcu); list_for_each_entry_srcu(victim_comp, &victim->components, class_list, srcu_read_lock_held(&mpam_srcu)) { - struct cpumask tmp; - - cpumask_andnot(&tmp, onlined_cpus, &victim_comp->affinity); - if (cpumask_empty(&tmp)) + if (cpumask_test_cpu(cpu, &victim_comp->affinity)) return victim_comp; } @@ -1963,41 +1908,40 @@ static void mpam_resctrl_domain_insert(struct list_head *list, } static struct mpam_resctrl_dom * -mpam_resctrl_alloc_domain(const struct cpumask *onlined_cpus, int nid, - struct mpam_component *ctrl_comp, - struct mpam_resctrl_res *res) +mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res 
*res, + struct mpam_component *comp) { int err; struct mpam_resctrl_dom *dom; - struct rdt_mon_domain *mon_d; + struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; struct rdt_resource *r = &res->resctrl_res; lockdep_assert_held(&domain_list_lock); - dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, nid); + dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu)); if (!dom) return ERR_PTR(-ENOMEM); - if (exposed_alloc_capable) { - dom->ctrl_comp = ctrl_comp; + if (resctrl_arch_alloc_capable()) { + dom->ctrl_comp = comp; ctrl_d = &dom->resctrl_ctrl_dom; - mpam_resctrl_domain_hdr_init(onlined_cpus, ctrl_comp, &ctrl_d->hdr); + mpam_resctrl_domain_hdr_init(cpu, comp, r->rid, &ctrl_d->hdr); ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN; - mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr); err = resctrl_online_ctrl_domain(r, ctrl_d); - if (err) { - dom = ERR_PTR(err); - goto offline_ctrl_domain; - } + if (err) + goto free_domain; + + mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr); } else { pr_debug("Skipped control domain online - no controls\n"); } - if (exposed_mon_capable) { + if (resctrl_arch_mon_capable()) { int i; - struct mpam_component *mon_comp, *any_mon_comp; + struct mpam_component *any_mon_comp; + struct mpam_resctrl_mon *mon; /* * Even if the monitor domain is backed by a different component, @@ -2006,43 +1950,47 @@ mpam_resctrl_alloc_domain(const struct cpumask *onlined_cpus, int nid, * Search each event's class list for a component with overlapping * CPUs and set up the dom->mon_comp array. */ - for (i = 0; i < QOS_NUM_EVENTS; i++) { - struct mpam_resctrl_mon *mon; - mon = &mpam_resctrl_counters[i]; + for_each_mpam_resctrl_mon(mon, i) { + struct mpam_component *mon_comp; + if (!mon->class) continue; // dummy resource - mon_comp = find_component(mon->class, onlined_cpus); + mon_comp = comp ? 
comp: find_component(mon->class, cpu); dom->mon_comp[i] = mon_comp; if (mon_comp) any_mon_comp = mon_comp; } - WARN_ON_ONCE(!any_mon_comp); - - dom->mbm_local_evt_cfg = MPAM_RESTRL_EVT_CONFIG_VALID; + if (!any_mon_comp) { + WARN_ON_ONCE(0); + err = -EFAULT; + goto offline_ctrl_domain; + } mon_d = &dom->resctrl_mon_dom; - mpam_resctrl_domain_hdr_init(onlined_cpus, any_mon_comp, - &mon_d->hdr); + mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, r->rid, &mon_d->hdr); mon_d->hdr.type = RESCTRL_MON_DOMAIN; + err = resctrl_online_mon_domain(r, &mon_d->hdr); + if (err) + goto offline_ctrl_domain; + mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr); - err = resctrl_online_mon_domain(r, mon_d); - if (err) { - dom = ERR_PTR(err); - goto offline_mon_hdr; - } } else { pr_debug("Skipped monitor domain online - no monitors\n"); } - goto out; -offline_mon_hdr: - mpam_resctrl_offline_domain_hdr(onlined_cpus, &ctrl_d->hdr); + return dom; offline_ctrl_domain: - resctrl_offline_ctrl_domain(r, ctrl_d); -out: + if (resctrl_arch_alloc_capable()) { + mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + resctrl_offline_ctrl_domain(r, ctrl_d); + } +free_domain: + kfree(dom); + dom = ERR_PTR(err); + return dom; } @@ -2053,10 +2001,10 @@ mpam_resctrl_alloc_domain(const struct cpumask *onlined_cpus, int nid, * This relies on mpam_resctrl_pick_domain_id() using the L3 cache-id * for anything that is not a cache. 
*/ -static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) +static struct mpam_resctrl_dom * +mpam_resctrl_get_mon_domain_from_cpu(int cpu, struct mpam_component *comp) { u32 cache_id; - struct rdt_mon_domain *mon_d; struct mpam_resctrl_dom *dom; struct mpam_resctrl_res *l3 = &mpam_resctrl_controls[RDT_RESOURCE_L3]; @@ -2067,10 +2015,10 @@ static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) if (cache_id == ~0) return NULL; - list_for_each_entry(mon_d, &l3->resctrl_res.mon_domains, hdr.list) { - dom = container_of(mon_d, struct mpam_resctrl_dom, resctrl_mon_dom); - - if (mon_d->hdr.id == cache_id) + list_for_each_entry(dom, &l3->resctrl_res.mon_domains, resctrl_mon_dom.hdr.list) { + if (comp && (dom->ctrl_comp != comp)) + continue; + if (dom->resctrl_mon_dom.hdr.id == cache_id) return dom; } @@ -2091,72 +2039,17 @@ static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) * For the monitors, we need to search the list of events... */ static struct mpam_resctrl_dom * -mpam_resctrl_alloc_domain_cpu(int cpu, struct mpam_resctrl_res *res) -{ - struct mpam_component *comp_iter, *ctrl_comp; - struct mpam_class *class = res->class; - int idx; - - ctrl_comp = NULL; - idx = srcu_read_lock(&mpam_srcu); - list_for_each_entry_srcu(comp_iter, &class->components, class_list, - srcu_read_lock_held(&mpam_srcu)) { - if (cpumask_test_cpu(cpu, &comp_iter->affinity)) { - ctrl_comp = comp_iter; - break; - } - } - srcu_read_unlock(&mpam_srcu, idx); - - /* cpu with unknown exported component? 
*/ - if (WARN_ON_ONCE(!ctrl_comp)) - return ERR_PTR(-EINVAL); - - return mpam_resctrl_alloc_domain(cpumask_of(cpu), cpu_to_node(cpu), - ctrl_comp, res); -} - -static struct mpam_resctrl_dom * -mpam_resctrl_alloc_domain_nid(int nid, struct mpam_resctrl_res *res) -{ - struct mpam_component *comp_iter, *ctrl_comp; - struct mpam_class *class = res->class; - int idx; - - /* Only the memory class uses comp_id as nid */ - if (class->type != MPAM_CLASS_MEMORY) - return ERR_PTR(-EINVAL); - - ctrl_comp = NULL; - idx = srcu_read_lock(&mpam_srcu); - list_for_each_entry_srcu(comp_iter, &class->components, class_list, - srcu_read_lock_held(&mpam_srcu)) { - if (comp_iter->comp_id == nid) { - ctrl_comp = comp_iter; - break; - } - } - srcu_read_unlock(&mpam_srcu, idx); - - /* cpu with unknown exported component? */ - if (WARN_ON_ONCE(!ctrl_comp)) - return ERR_PTR(-EINVAL); - - return mpam_resctrl_alloc_domain(cpu_possible_mask, nid, ctrl_comp, res); -} - -static struct mpam_resctrl_dom * -mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) +mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res, + struct mpam_component *comp) { struct mpam_resctrl_dom *dom; - struct rdt_ctrl_domain *ctrl_d; struct rdt_resource *r = &res->resctrl_res; lockdep_assert_cpus_held(); - list_for_each_entry(ctrl_d, &r->ctrl_domains, hdr.list) { - dom = container_of(ctrl_d, struct mpam_resctrl_dom, resctrl_ctrl_dom); - + list_for_each_entry_rcu(dom, &r->ctrl_domains, resctrl_ctrl_dom.hdr.list) { + if (comp && (dom->ctrl_comp != comp)) + continue; if (cpumask_test_cpu(cpu, &dom->ctrl_comp->affinity)) return dom; } @@ -2165,189 +2058,103 @@ mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) return NULL; /* Search the mon domain list too - needed on monitor only platforms. 
*/ - return mpam_resctrl_get_mon_domain_from_cpu(cpu); -} - -static struct mpam_resctrl_dom * -mpam_get_domain_from_nid(int nid, struct mpam_resctrl_res *res) -{ - struct rdt_ctrl_domain *d; - struct mpam_resctrl_dom *dom; - - list_for_each_entry(d, &res->resctrl_res.ctrl_domains, hdr.list) { - dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); - - /* Only the memory class uses comp_id as nid */ - if (dom->ctrl_comp->class->type != MPAM_CLASS_MEMORY) - continue; - - if (dom->ctrl_comp->comp_id == nid) - return dom; - } - - return NULL; + return mpam_resctrl_get_mon_domain_from_cpu(cpu, comp); } int mpam_resctrl_online_cpu(unsigned int cpu) { - int i, err = 0; - struct mpam_resctrl_dom *dom; - struct mpam_resctrl_res *res; - - mutex_lock(&domain_list_lock); - for (i = 0; i < RDT_NUM_RESOURCES; i++) { - res = &mpam_resctrl_controls[i]; - if (!res->class) - continue; // dummy_resource; - - dom = mpam_resctrl_get_domain_from_cpu(cpu, res); - if (!dom) - dom = mpam_resctrl_alloc_domain_cpu(cpu, res); - if (IS_ERR(dom)) { - err = PTR_ERR(dom); - break; - } - - cpumask_set_cpu(cpu, &dom->resctrl_ctrl_dom.hdr.cpu_mask); - cpumask_set_cpu(cpu, &dom->resctrl_mon_dom.hdr.cpu_mask); - } - mutex_unlock(&domain_list_lock); - - if (!err) - resctrl_online_cpu(cpu); - - return err; -} - -int mpam_resctrl_offline_cpu(unsigned int cpu) -{ - int i; - struct mpam_resctrl_res *res; - struct mpam_resctrl_dom *dom; - struct rdt_mon_domain *mon_d; + struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; - bool ctrl_dom_empty, mon_dom_empty; + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + struct mpam_component *comp; - resctrl_offline_cpu(cpu); + guard(mutex)(&domain_list_lock); + for_each_mpam_resctrl_control(res, rid) { + struct mpam_resctrl_dom *dom; - mutex_lock(&domain_list_lock); - for (i = 0; i < RDT_NUM_RESOURCES; i++) { - res = &mpam_resctrl_controls[i]; if (!res->class) - continue; // dummy resource - - dom = 
mpam_resctrl_get_domain_from_cpu(cpu, res); - if (WARN_ON_ONCE(!dom)) - continue; + continue; // dummy_resource; - ctrl_dom_empty = true; - if (exposed_alloc_capable) { - mpam_reset_component_locked(dom->ctrl_comp); + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &res->class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &comp->affinity)) + continue; - ctrl_d = &dom->resctrl_ctrl_dom; - ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpumask_of(cpu), - &ctrl_d->hdr); - if (ctrl_dom_empty) - resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); - } + dom = mpam_resctrl_get_domain_from_cpu(cpu, res, comp); + if (!dom) { + dom = mpam_resctrl_alloc_domain(cpu, res, comp); + } else { + if (resctrl_arch_alloc_capable()) { + ctrl_d = &dom->resctrl_ctrl_dom; + mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr); + } + if (resctrl_arch_mon_capable()) { + mon_d = &dom->resctrl_mon_dom; + mpam_resctrl_online_domain_hdr(cpu, &mon_d->hdr); + } + } - mon_dom_empty = true; - if (exposed_mon_capable) { - mon_d = &dom->resctrl_mon_dom; - mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpumask_of(cpu), - &mon_d->hdr); - if (mon_dom_empty) - resctrl_offline_mon_domain(&res->resctrl_res, mon_d); + if (IS_ERR(dom)) { + return PTR_ERR(dom); + } } - - if (ctrl_dom_empty && mon_dom_empty) - kfree(dom); } - mutex_unlock(&domain_list_lock); - - return 0; -} - -static int mpam_resctrl_online_node(unsigned int nid) -{ - struct mpam_resctrl_dom *dom; - struct mpam_resctrl_res *res; - /* Domain IDs as NUMA nid is only defined for MBA */ - res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; - if (!res->class) - return 0; // dummy_resource; - - dom = mpam_get_domain_from_nid(nid, res); - if (!dom) - dom = mpam_resctrl_alloc_domain_nid(nid, res); - if (IS_ERR(dom)) - return PTR_ERR(dom); + resctrl_online_cpu(cpu); return 0; } -static int mpam_resctrl_offline_node(unsigned int nid) +void mpam_resctrl_offline_cpu(unsigned int cpu) { + struct 
mpam_component *comp; struct mpam_resctrl_res *res; - struct mpam_resctrl_dom *dom; - struct rdt_mon_domain *mon_d; - struct rdt_ctrl_domain *ctrl_d; - - /* Domain IDs as NUMA nid is only defined for MBA */ - res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; - if (!res->class) - return 0; // dummy_resource; - - dom = mpam_get_domain_from_nid(nid, res); - if (WARN_ON_ONCE(!dom)) - return 0; + enum resctrl_res_level rid; - ctrl_d = &dom->resctrl_ctrl_dom; - resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); - if (!mpam_resctrl_offline_domain_hdr(cpu_possible_mask, &ctrl_d->hdr)) - return 0; + resctrl_offline_cpu(cpu); - // TODO: skip monitor domains if there are no monitors for this resource - mon_d = &dom->resctrl_mon_dom; - resctrl_offline_mon_domain(&res->resctrl_res, mon_d); - if (!mpam_resctrl_offline_domain_hdr(cpu_possible_mask, &mon_d->hdr)) - return 0; + guard(mutex)(&domain_list_lock); + for_each_mpam_resctrl_control(res, rid) { + struct mpam_resctrl_dom *dom; + struct rdt_l3_mon_domain *mon_d; + struct rdt_ctrl_domain *ctrl_d; + bool ctrl_dom_empty, mon_dom_empty; - kfree(dom); + if (!res->class) + continue; // dummy resource - return 0; -} + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &res->class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &comp->affinity)) + continue; -static int mpam_resctrl_node_notifier(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct node_notify *nn = arg; + dom = mpam_resctrl_get_domain_from_cpu(cpu, res, comp); + if (WARN_ON_ONCE(!dom)) + continue; - if (nn->nid < 0 || !mb_uses_numa_nid) - return NOTIFY_OK; + ctrl_dom_empty = true; + if (resctrl_arch_alloc_capable()) { + ctrl_d = &dom->resctrl_ctrl_dom; + ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + if (ctrl_dom_empty) + resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); + } - /* - * Ignore nid that have CPUs. 
Resctrl needs to see the cpu offline - * call for each CPU to update the CPUs in control groups. Moving - * the overflow handler isn't an issue as only L3 can be mon_capable, - * and NUMA nid used as domain-id are only an option for MBA. - */ - if (!cpumask_empty(cpumask_of_node(nn->nid))) - return NOTIFY_OK; + mon_dom_empty = true; + if (resctrl_arch_mon_capable()) { + mon_d = &dom->resctrl_mon_dom; + mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); + if (mon_dom_empty) + resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr); + } - switch (action) { - case NODE_ADDED_FIRST_MEMORY: - mpam_resctrl_online_node(nn->nid); - break; - case NODE_REMOVED_LAST_MEMORY: - mpam_resctrl_offline_node(nn->nid); - break; - default: - /* don't care */ + if (ctrl_dom_empty && mon_dom_empty) + kfree(dom); + } } - - return NOTIFY_OK; } int mpam_resctrl_setup(void) @@ -2361,10 +2168,9 @@ int mpam_resctrl_setup(void) wait_event(wait_cacheinfo_ready, cacheinfo_ready); cpus_read_lock(); - for (i = 0; i < RDT_NUM_RESOURCES; i++) { - res = &mpam_resctrl_controls[i]; - INIT_LIST_HEAD(&res->resctrl_res.ctrl_domains); - INIT_LIST_HEAD(&res->resctrl_res.mon_domains); + for_each_mpam_resctrl_control(res, i) { + INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains); + INIT_LIST_HEAD_RCU(&res->resctrl_res.mon_domains); res->resctrl_res.rid = i; } @@ -2373,60 +2179,49 @@ int mpam_resctrl_setup(void) mpam_resctrl_pick_mba(); /* Initialise the resctrl structures from the classes */ - for (i = 0; i < RDT_NUM_RESOURCES; i++) { - res = &mpam_resctrl_controls[i]; + for_each_mpam_resctrl_control(res, i) { if (!res->class) continue; // dummy resource err = mpam_resctrl_control_init(res, i); if (err) { pr_debug("Failed to initialise rid %u\n", i); - break; + goto internal_error; } } /* Find some classes to use for monitors */ mpam_resctrl_pick_counters(); - for (j = 0; j < QOS_NUM_EVENTS; j++) { - mon = &mpam_resctrl_counters[j]; + for_each_mpam_resctrl_mon(mon, j) { if (!mon->class) 
continue; // dummy resource - mpam_resctrl_monitor_init(mon, j); - } - - if (mb_numa_nid_possible) { - hotplug_node_notifier(mpam_resctrl_node_notifier, - RESCTRL_CALLBACK_PRI); + err = mpam_resctrl_monitor_init(mon, j); + if (err) { + pr_debug("Failed to initialise event %u\n", j); + goto internal_error; + } } cpus_read_unlock(); - if (err || (!exposed_alloc_capable && !exposed_mon_capable)) { - if (err) - pr_debug("Internal error %d - resctrl not supported\n", err); - else - pr_debug("No alloc(%u) or monitor(%u) found - resctrl not supported\n", - exposed_alloc_capable, exposed_mon_capable); - err = -EOPNOTSUPP; + if (!resctrl_arch_alloc_capable() && !resctrl_arch_mon_capable()) { + pr_debug("No alloc(%u) or monitor(%u) found - resctrl not supported\n", + resctrl_arch_alloc_capable(), resctrl_arch_mon_capable()); + return -EOPNOTSUPP; } - if (!err) { - if (!is_power_of_2(mpam_pmg_max + 1)) { - /* - * If not all the partid*pmg values are valid indexes, - * resctrl may allocate pmg that don't exist. This - * should cause an error interrupt. - */ - pr_warn("Number of PMG is not a power of 2! 
resctrl may misbehave"); - } + err = resctrl_init(); + if (err) + return err; + WRITE_ONCE(resctrl_enabled, true); - err = resctrl_init(); - if (!err) - WRITE_ONCE(resctrl_enabled, true); - } + return 0; +internal_error: + cpus_read_unlock(); + pr_debug("Internal error %d - resctrl not supported\n", err); return err; } @@ -2468,16 +2263,14 @@ void mpam_resctrl_teardown_class(struct mpam_class *class) might_sleep(); - for (i = 0; i < RDT_NUM_RESOURCES; i++) { - res = &mpam_resctrl_controls[i]; + for_each_mpam_resctrl_control(res, i) { if (res->class == class) { mpam_resctrl_exit(); res->class = NULL; break; } } - for (i = 0; i < QOS_NUM_EVENTS; i++) { - mon = &mpam_resctrl_counters[i]; + for_each_mpam_resctrl_mon(mon, i) { if (mon->class == class) { mpam_resctrl_exit(); mon->class = NULL; diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index c3688cbe0ff5c..e04b8a5f76c3d 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -545,8 +546,8 @@ struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, } void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first) + struct rdt_domain_hdr *hdr, struct rdtgroup *rdtgrp, + cpumask_t *cpumask, struct mon_evt *evt, int first) { int cpu; @@ -557,21 +558,26 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, * Setup the parameters to pass to mon_event_count() to read the data. 
*/ rr->rgrp = rdtgrp; - rr->evtid = evtid; + rr->evt = evt; rr->r = r; - rr->d = d; + rr->hdr = hdr; rr->first = first; if (resctrl_arch_mbm_cntr_assign_enabled(r) && - resctrl_is_mbm_event(evtid)) { + resctrl_is_mbm_event(evt->evtid)) { rr->is_mbm_cntr = true; } else { - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evt->evtid); if (IS_ERR(rr->arch_mon_ctx)) { rr->err = -EINVAL; return; } } + if (evt->any_cpu) { + mon_event_count(rr); + goto out_ctx_free; + } + cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); /* @@ -585,22 +591,93 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, else smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); +out_ctx_free: if (rr->arch_mon_ctx) - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); + resctrl_arch_mon_ctx_free(r, evt->evtid, rr->arch_mon_ctx); +} + +/* + * Decimal place precision to use for each number of fixed-point + * binary bits computed from ceil(binary_bits * log10(2)) except + * binary_bits == 0 which will print "value.0" + */ +static const unsigned int decplaces[MAX_BINARY_BITS + 1] = { + [0] = 1, + [1] = 1, + [2] = 1, + [3] = 1, + [4] = 2, + [5] = 2, + [6] = 2, + [7] = 3, + [8] = 3, + [9] = 3, + [10] = 4, + [11] = 4, + [12] = 4, + [13] = 4, + [14] = 5, + [15] = 5, + [16] = 5, + [17] = 6, + [18] = 6, + [19] = 6, + [20] = 7, + [21] = 7, + [22] = 7, + [23] = 7, + [24] = 8, + [25] = 8, + [26] = 8, + [27] = 9 +}; + +static void print_event_value(struct seq_file *m, unsigned int binary_bits, u64 val) +{ + unsigned long long frac = 0; + + if (binary_bits) { + /* Mask off the integer part of the fixed-point value. */ + frac = val & GENMASK_ULL(binary_bits - 1, 0); + + /* + * Multiply by 10^{desired decimal places}. The integer part of + * the fixed point value is now almost what is needed. 
+ */ + frac *= int_pow(10ull, decplaces[binary_bits]); + + /* + * Round to nearest by adding a value that would be a "1" in the + * binary_bits + 1 place. Integer part of fixed point value is + * now the needed value. + */ + frac += 1ull << (binary_bits - 1); + + /* + * Extract the integer part of the value. This is the decimal + * representation of the original fixed-point fractional value. + */ + frac >>= binary_bits; + } + + /* + * "frac" is now in the range [0 .. 10^decplaces). I.e. string + * representation will fit into chosen number of decimal places. + */ + seq_printf(m, "%llu.%0*llu\n", val >> binary_bits, decplaces[binary_bits], frac); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; enum resctrl_res_level resid; - enum resctrl_event_id evtid; struct rdt_domain_hdr *hdr; struct rmid_read rr = {0}; - struct rdt_mon_domain *d; struct rdtgroup *rdtgrp; int domid, cpu, ret = 0; struct rdt_resource *r; struct cacheinfo *ci; + struct mon_evt *evt; struct mon_data *md; rdtgrp = rdtgroup_kn_lock_live(of->kn); @@ -617,10 +694,17 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) resid = md->rid; domid = md->domid; - evtid = md->evtid; + evt = md->evt; r = resctrl_arch_get_resource(resid); if (md->sum) { + struct rdt_l3_mon_domain *d; + + if (WARN_ON_ONCE(resid != RDT_RESOURCE_L3)) { + ret = -EINVAL; + goto out; + } + /* * This file requires summing across all domains that share * the L3 cache id that was provided in the "domid" field of the @@ -635,7 +719,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) continue; rr.ci = ci; mon_event_read(&rr, r, NULL, rdtgrp, - &ci->shared_cpu_map, evtid, false); + &ci->shared_cpu_map, evt, false); goto checkresult; } } @@ -647,12 +731,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) * the resource to find the domain with "domid". 
*/ hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); - if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { + if (!hdr) { ret = -ENOENT; goto out; } - d = container_of(hdr, struct rdt_mon_domain, hdr); - mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); + mon_event_read(&rr, r, hdr, rdtgrp, &hdr->cpu_mask, evt, false); } checkresult: @@ -667,6 +750,8 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) seq_puts(m, "Unavailable\n"); else if (rr.err == -ENOENT) seq_puts(m, "Unassigned\n"); + else if (evt->is_floating_point) + print_event_value(m, evt->binary_bits, rr.val); else seq_printf(m, "%llu\n", rr.val); diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index f5f74342af317..08cdc8546a8fd 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -42,7 +42,6 @@ struct rdt_fs_context { bool enable_cdpl3; bool enable_mba_mbps; bool enable_debug; - bool mb_uses_numa_nid; bool enable_abi_playground; }; @@ -63,7 +62,14 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) * READS_TO_REMOTE_MEM) being tracked by @evtid. * Only valid if @evtid is an MBM event. * @configurable: true if the event is configurable + * @any_cpu: true if the event can be read from any CPU + * @is_floating_point: event values are displayed in floating point format + * @binary_bits: number of fixed-point binary bits from architecture, + * only valid if @is_floating_point is true * @enabled: true if the event is enabled + * @arch_priv: Architecture private data for this event. + * The @arch_priv provided by the architecture via + * resctrl_enable_mon_event(). 
*/ struct mon_evt { enum resctrl_event_id evtid; @@ -71,7 +77,11 @@ struct mon_evt { char *name; u32 evt_cfg; bool configurable; + bool any_cpu; + bool is_floating_point; + unsigned int binary_bits; bool enabled; + void *arch_priv; }; extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; @@ -79,13 +89,16 @@ extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; #define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT]; \ mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++) +/* Limit for mon_evt::binary_bits */ +#define MAX_BINARY_BITS 27 + /** * struct mon_data - Monitoring details for each event file. * @list: Member of the global @mon_data_kn_priv_list list. * @rid: Resource id associated with the event file. - * @evtid: Event id associated with the event file. - * @sum: Set when event must be summed across multiple - * domains. + * @evt: Event structure associated with the event file. + * @sum: Set for RDT_RESOURCE_L3 when event must be summed + * across multiple domains. * @domid: When @sum is zero this is the domain to which * the event file belongs. When @sum is one this * is the id of the L3 cache that all domains to be @@ -97,7 +110,7 @@ extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; struct mon_data { struct list_head list; enum resctrl_res_level rid; - enum resctrl_event_id evtid; + struct mon_evt *evt; int domid; bool sum; }; @@ -108,25 +121,27 @@ struct mon_data { * resource group then its event count is summed with the count from all * its child resource groups. * @r: Resource describing the properties of the event being read. - * @d: Domain that the counter should be read from. If NULL then sum all - * domains in @r sharing L3 @ci.id - * @evtid: Which monitor event to read. + * @hdr: Header of domain that the counter should be read from. If NULL then + * sum all domains in @r sharing L3 @ci.id + * @evt: Which monitor event to read. * @first: Initialize MBM counter when true. - * @ci: Cacheinfo for L3. Only set when @d is NULL. 
Used when summing domains. + * @ci: Cacheinfo for L3. Only set when @hdr is NULL. Used when summing + * domains. * @is_mbm_cntr: true if "mbm_event" counter assignment mode is enabled and it * is an MBM event. * @err: Error encountered when reading counter. - * @val: Returned value of event counter. If @rgrp is a parent resource group, - * @val includes the sum of event counts from its child resource groups. - * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, - * (summed across child resource groups if @rgrp is a parent resource group). + * @val: Returned value of event counter. If @rgrp is a parent resource + * group, @val includes the sum of event counts from its child + * resource groups. If @hdr is NULL, @val includes the sum of all + * domains in @r sharing @ci.id, (summed across child resource groups + * if @rgrp is a parent resource group). * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). */ struct rmid_read { struct rdtgroup *rgrp; struct rdt_resource *r; - struct rdt_mon_domain *d; - enum resctrl_event_id evtid; + struct rdt_domain_hdr *hdr; + struct mon_evt *evt; bool first; struct cacheinfo *ci; bool is_mbm_cntr; @@ -250,6 +265,8 @@ struct rdtgroup { #define RFTYPE_SCHEMA_PERCENT BIT(12) #define RFTYPE_SCHEMA_MBPS BIT(13) +#define RFTYPE_RES_PERF_PKG BIT(12) + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) @@ -360,23 +377,27 @@ int closids_supported(void); void closid_free(int closid); +int setup_rmid_lru_list(void); + +void free_rmid_lru_list(void); + int alloc_rmid(u32 closid); void free_rmid(u32 closid, u32 rmid); -void resctrl_mon_resource_exit(void); +int resctrl_mon_init(void); + +void resctrl_mon_exit(void); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, 
int evtid, int first); - -int resctrl_mon_resource_init(void); + struct rdt_domain_hdr *hdr, struct rdtgroup *rdtgrp, + cpumask_t *cpumask, struct mon_evt *evt, int first); -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, +void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu); @@ -384,14 +405,14 @@ void mbm_handle_overflow(struct work_struct *work); bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, +void cqm_setup_limbo_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu); void cqm_handle_limbo(struct work_struct *work); -bool has_busy_rmid(struct rdt_mon_domain *d); +bool has_busy_rmid(struct rdt_l3_mon_domain *d); -void __check_limbo(struct rdt_mon_domain *d, bool force_free); +void __check_limbo(struct rdt_l3_mon_domain *d, bool force_free); void resctrl_file_fflags_init(const char *config, unsigned long fflags); @@ -436,6 +457,11 @@ int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, voi ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); +int mbm_MB_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +ssize_t mbm_MB_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index e62432467817f..47a6651aecfb9 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -18,7 +18,6 @@ #define pr_fmt(fmt) "resctrl: " fmt #include -#include #include #include #include @@ -150,16 +149,18 @@ static void limbo_release_entry(struct rmid_entry *entry) * decrement the count. 
If the busy count gets to zero on an RMID, we * free the RMID */ -void __check_limbo(struct rdt_mon_domain *d, bool force_free) +void __check_limbo(struct rdt_l3_mon_domain *d, bool force_free) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); u32 idx_limit = resctrl_arch_system_num_rmid_idx(); struct rmid_entry *entry; u32 idx, cur_idx = 1; void *arch_mon_ctx; + void *arch_priv; bool rmid_dirty; u64 val = 0; + arch_priv = mon_event_all[QOS_L3_OCCUP_EVENT_ID].arch_priv; arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); if (IS_ERR(arch_mon_ctx)) { pr_warn_ratelimited("Failed to allocate monitor context: %ld", @@ -181,8 +182,8 @@ void __check_limbo(struct rdt_mon_domain *d, bool force_free) entry = __rmid_entry(idx); if (!entry) break; - if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val, + if (resctrl_arch_rmid_read(r, &d->hdr, entry->closid, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, arch_priv, &val, arch_mon_ctx)) { rmid_dirty = true; } else { @@ -210,7 +211,7 @@ void __check_limbo(struct rdt_mon_domain *d, bool force_free) resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); } -bool has_busy_rmid(struct rdt_mon_domain *d) +bool has_busy_rmid(struct rdt_l3_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); @@ -311,7 +312,7 @@ int alloc_rmid(u32 closid) static void add_rmid_to_limbo(struct rmid_entry *entry) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; u32 idx; lockdep_assert_held(&rdtgroup_mutex); @@ -370,7 +371,7 @@ void free_rmid(u32 closid, u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, +static struct mbm_state *get_mbm_state(struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id evtid) { u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); @@ -390,7 +391,7 @@ static 
struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, * Return: * Valid counter ID on success, or -ENOENT on failure. */ -static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d, +static int mbm_cntr_get(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { int cntr_id; @@ -417,7 +418,7 @@ static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d, * Return: * Valid counter ID on success, or -ENOSPC on failure. */ -static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, +static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { int cntr_id; @@ -436,24 +437,29 @@ static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, /* * mbm_cntr_free() - Clear the counter ID configuration details in the domain @d. */ -static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id) +static void mbm_cntr_free(struct rdt_l3_mon_domain *d, int cntr_id) { memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg)); } -static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) +static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { int cpu = smp_processor_id(); u32 closid = rdtgrp->closid; u32 rmid = rdtgrp->mon.rmid; - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; int cntr_id = -ENOENT; struct mbm_state *m; - int err, ret; u64 tval = 0; + if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, rr->r->rid)) { + rr->err = -EIO; + return -EINVAL; + } + d = container_of(rr->hdr, struct rdt_l3_mon_domain, hdr); + if (rr->is_mbm_cntr) { - cntr_id = mbm_cntr_get(rr->r, rr->d, rdtgrp, rr->evtid); + cntr_id = mbm_cntr_get(rr->r, d, rdtgrp, rr->evt->evtid); if (cntr_id < 0) { rr->err = -ENOENT; return -EINVAL; @@ -462,31 +468,51 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) if (rr->first) { if 
(rr->is_mbm_cntr) - resctrl_arch_reset_cntr(rr->r, rr->d, closid, rmid, cntr_id, rr->evtid); + resctrl_arch_reset_cntr(rr->r, d, closid, rmid, cntr_id, rr->evt->evtid); else - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + resctrl_arch_reset_rmid(rr->r, d, closid, rmid, rr->evt->evtid); + m = get_mbm_state(d, closid, rmid, rr->evt->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); return 0; } - if (rr->d) { - /* Reading a single domain, must be on a CPU in that domain. */ - if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) - return -EINVAL; - if (rr->is_mbm_cntr) - rr->err = resctrl_arch_cntr_read(rr->r, rr->d, closid, rmid, cntr_id, - rr->evtid, &tval); - else - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); - if (rr->err) - return rr->err; + /* Reading a single domain, must be on a CPU in that domain. */ + if (!cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) + return -EINVAL; + if (rr->is_mbm_cntr) + rr->err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id, + rr->evt->evtid, &tval); + else + rr->err = resctrl_arch_rmid_read(rr->r, rr->hdr, closid, rmid, + rr->evt->evtid, rr->evt->arch_priv, + &tval, rr->arch_mon_ctx); + if (rr->err) + return rr->err; - rr->val += tval; + rr->val += tval; - return 0; + return 0; +} + +static int __l3_mon_event_count_sum(struct rdtgroup *rdtgrp, struct rmid_read *rr) +{ + int cpu = smp_processor_id(); + u32 closid = rdtgrp->closid; + u32 rmid = rdtgrp->mon.rmid; + struct rdt_l3_mon_domain *d; + u64 tval = 0; + int err, ret; + + /* + * Summing across domains is only done for systems that implement + * Sub-NUMA Cluster. There is no overlap with systems that support + * assignable counters. 
+ */ + if (rr->is_mbm_cntr) { + pr_warn_once("Summing domains using assignable counters is not supported\n"); + rr->err = -EINVAL; + return -EINVAL; } /* Summing domains that share a cache, must be on a CPU for that cache. */ @@ -504,12 +530,9 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { if (d->ci_id != rr->ci->id) continue; - if (rr->is_mbm_cntr) - err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id, - rr->evtid, &tval); - else - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + err = resctrl_arch_rmid_read(rr->r, &d->hdr, closid, rmid, + rr->evt->evtid, rr->evt->arch_priv, + &tval, rr->arch_mon_ctx); if (!err) { rr->val += tval; ret = 0; @@ -522,6 +545,36 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) return ret; } +static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) +{ + switch (rr->r->rid) { + case RDT_RESOURCE_L3: + case RDT_RESOURCE_MBA: + WARN_ON_ONCE(rr->evt->any_cpu); + if (rr->hdr) + return __l3_mon_event_count(rdtgrp, rr); + else + return __l3_mon_event_count_sum(rdtgrp, rr); + case RDT_RESOURCE_PERF_PKG: { + u64 tval = 0; + + rr->err = resctrl_arch_rmid_read(rr->r, rr->hdr, rdtgrp->closid, + rdtgrp->mon.rmid, rr->evt->evtid, + rr->evt->arch_priv, + &tval, rr->arch_mon_ctx); + if (rr->err) + return rr->err; + + rr->val += tval; + + return 0; + } + default: + rr->err = -EINVAL; + return -EINVAL; + } +} + /* * mbm_bw_count() - Update bw count from values previously read by * __mon_event_count(). 
@@ -539,9 +592,13 @@ static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) u64 cur_bw, bytes, cur_bytes; u32 closid = rdtgrp->closid; u32 rmid = rdtgrp->mon.rmid; + struct rdt_l3_mon_domain *d; struct mbm_state *m; - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, rr->r->rid)) + return; + d = container_of(rr->hdr, struct rdt_l3_mon_domain, hdr); + m = get_mbm_state(d, closid, rmid, rr->evt->evtid); if (WARN_ON_ONCE(!m)) return; @@ -640,7 +697,7 @@ static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, * throttle MSRs already have low percentage values. To avoid * unnecessarily restricting such rdtgroups, we also increase the bandwidth. */ -static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) +static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_l3_mon_domain *dom_mbm) { u32 closid, rmid, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; @@ -708,18 +765,18 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); } -static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, +static void mbm_update_one_event(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { struct rmid_read rr = {0}; rr.r = r; - rr.d = d; - rr.evtid = evtid; + rr.hdr = &d->hdr; + rr.evt = &mon_event_all[evtid]; if (resctrl_arch_mbm_cntr_assign_enabled(r)) { rr.is_mbm_cntr = true; } else { - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, evtid); if (IS_ERR(rr.arch_mon_ctx)) { pr_warn_ratelimited("Failed to allocate monitor context: %ld", PTR_ERR(rr.arch_mon_ctx)); @@ -737,10 +794,10 @@ static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain * mbm_bw_count(rdtgrp, &rr); if (rr.arch_mon_ctx) - 
resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); + resctrl_arch_mon_ctx_free(rr.r, evtid, rr.arch_mon_ctx); } -static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, +static void mbm_update(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp) { /* @@ -761,13 +818,12 @@ static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, void cqm_handle_limbo(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); - d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); + d = container_of(work, struct rdt_l3_mon_domain, cqm_limbo.work); __check_limbo(d, false); @@ -779,7 +835,6 @@ void cqm_handle_limbo(struct work_struct *work) } mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); } @@ -791,7 +846,7 @@ void cqm_handle_limbo(struct work_struct *work) * @exclude_cpu: Which CPU the handler should not run on, * RESCTRL_PICK_ANY_CPU to pick any CPU. 
*/ -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, +void cqm_setup_limbo_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); @@ -808,12 +863,11 @@ void mbm_handle_overflow(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); struct rdtgroup *prgrp, *crgrp; - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; struct list_head *head; struct rdt_resource *r; cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); /* @@ -824,7 +878,7 @@ void mbm_handle_overflow(struct work_struct *work) goto out_unlock; r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - d = container_of(work, struct rdt_mon_domain, mbm_over.work); + d = container_of(work, struct rdt_l3_mon_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { mbm_update(r, d, prgrp); @@ -847,7 +901,6 @@ void mbm_handle_overflow(struct work_struct *work) out_unlock: mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); } @@ -859,7 +912,7 @@ void mbm_handle_overflow(struct work_struct *work) * @exclude_cpu: Which CPU the handler should not run on, * RESCTRL_PICK_ANY_CPU to pick any CPU. 
*/ -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, +void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); @@ -880,42 +933,29 @@ void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ schedule_delayed_work_on(cpu, &dom->mbm_over, delay); } -static int dom_data_init(struct rdt_resource *r) +int setup_rmid_lru_list(void) { - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - u32 num_closid = resctrl_arch_get_num_closid(r); struct rmid_entry *entry = NULL; - int err = 0, i; + u32 idx_limit; u32 idx; + int i; - mutex_lock(&rdtgroup_mutex); - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - u32 *tmp; - - /* - * If the architecture hasn't provided a sanitised value here, - * this may result in larger arrays than necessary. Resctrl will - * use a smaller system wide value based on the resources in - * use. - */ - tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); - if (!tmp) { - err = -ENOMEM; - goto out_unlock; - } + if (!resctrl_arch_mon_capable()) + return 0; - closid_num_dirty_rmid = tmp; - } + /* + * Called on every mount, but the number of RMIDs cannot change + * after the first mount, so keep using the same set of rmid_ptrs[] + * until resctrl_exit(). Note that the limbo handler continues to + * access rmid_ptrs[] after resctrl is unmounted. 
+ */ + if (rmid_ptrs) + return 0; + idx_limit = resctrl_arch_system_num_rmid_idx(); rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); - if (!rmid_ptrs) { - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } - err = -ENOMEM; - goto out_unlock; - } + if (!rmid_ptrs) + return -ENOMEM; for (i = 0; i < idx_limit; i++) { entry = &rmid_ptrs[i]; @@ -928,7 +968,7 @@ static int dom_data_init(struct rdt_resource *r) /* * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and * are always allocated. These are used for the rdtgroup_default - * control group, which will be setup later in resctrl_init(). + * control group, which was setup earlier in rdtgroup_setup_default(). */ idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, RESCTRL_RESERVED_RMID); @@ -936,64 +976,69 @@ static int dom_data_init(struct rdt_resource *r) WARN_ON_ONCE(!entry); list_del(&entry->list); -out_unlock: - mutex_unlock(&rdtgroup_mutex); - - return err; + return 0; } -static void dom_data_exit(struct rdt_resource *r) +void free_rmid_lru_list(void) { + if (!resctrl_arch_mon_capable()) + return; + mutex_lock(&rdtgroup_mutex); - - if (!r->mon_capable) - goto out_unlock; - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } - kfree(rmid_ptrs); rmid_ptrs = NULL; - -out_unlock: mutex_unlock(&rdtgroup_mutex); } +#define MON_EVENT(_eventid, _name, _res, _fp) \ + [_eventid] = { \ + .name = _name, \ + .evtid = _eventid, \ + .rid = _res, \ + .is_floating_point = _fp, \ +} + /* * All available events. Architecture code marks the ones that * are supported by a system using resctrl_enable_mon_event() * to set .enabled. 
*/ struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { - [QOS_L3_OCCUP_EVENT_ID] = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, - .rid = RDT_RESOURCE_L3, - }, - [QOS_L3_MBM_TOTAL_EVENT_ID] = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, - .rid = RDT_RESOURCE_L3, - }, - [QOS_L3_MBM_LOCAL_EVENT_ID] = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, - .rid = RDT_RESOURCE_L3, - }, + MON_EVENT(QOS_L3_OCCUP_EVENT_ID, "llc_occupancy", RDT_RESOURCE_L3, false), + MON_EVENT(QOS_L3_MBM_TOTAL_EVENT_ID, "mbm_total_bytes", RDT_RESOURCE_MBA, false), + MON_EVENT(QOS_L3_MBM_LOCAL_EVENT_ID, "mbm_local_bytes", RDT_RESOURCE_L3, false), + MON_EVENT(PMT_EVENT_ENERGY, "core_energy", RDT_RESOURCE_PERF_PKG, true), + MON_EVENT(PMT_EVENT_ACTIVITY, "activity", RDT_RESOURCE_PERF_PKG, true), + MON_EVENT(PMT_EVENT_STALLS_LLC_HIT, "stalls_llc_hit", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_C1_RES, "c1_res", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_UNHALTED_CORE_CYCLES, "unhalted_core_cycles", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_STALLS_LLC_MISS, "stalls_llc_miss", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_AUTO_C6_RES, "c6_res", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_UNHALTED_REF_CYCLES, "unhalted_ref_cycles", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_UOPS_RETIRED, "uops_retired", RDT_RESOURCE_PERF_PKG, false), }; -void resctrl_enable_mon_event(enum resctrl_event_id eventid) +bool resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, + unsigned int binary_bits, void *arch_priv) { - if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS)) - return; + if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS || + binary_bits > MAX_BINARY_BITS)) + return false; if (mon_event_all[eventid].enabled) { pr_warn("Duplicate enable for event %d\n", eventid); - return; + return false; + } + if (binary_bits && 
!mon_event_all[eventid].is_floating_point) { + pr_warn("Event %d may not be floating point\n", eventid); + return false; } + mon_event_all[eventid].any_cpu = any_cpu; + mon_event_all[eventid].binary_bits = binary_bits; + mon_event_all[eventid].arch_priv = arch_priv; mon_event_all[eventid].enabled = true; + + return true; } bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) @@ -1117,7 +1162,7 @@ ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf * mbm_cntr_free_all() - Clear all the counter ID configuration details in the * domain @d. Called when mbm_assign_mode is changed. */ -static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d) +static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { memset(d->cntr_cfg, 0, sizeof(*d->cntr_cfg) * r->mon.num_mbm_cntrs); } @@ -1126,7 +1171,7 @@ static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d) * resctrl_reset_rmid_all() - Reset all non-architecture states for all the * supported RMIDs. */ -static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) +static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); enum resctrl_event_id evt; @@ -1147,7 +1192,7 @@ static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * Assign the counter if @assign is true else unassign the counter. Reset the * associated non-architectural state. */ -static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign) { @@ -1167,7 +1212,7 @@ static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain * * Return: * 0 on success, < 0 on failure. 
*/ -static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { int cntr_id; @@ -1202,7 +1247,7 @@ static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_dom * Return: * 0 on success, < 0 on failure. */ -static int rdtgroup_assign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, +static int rdtgroup_assign_cntr_event(struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); @@ -1252,7 +1297,7 @@ void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp) * rdtgroup_free_unassign_cntr() - Unassign and reset the counter ID configuration * for the event pointed to by @mevt within the domain @d and resctrl group @rdtgrp. */ -static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { int cntr_id; @@ -1273,7 +1318,7 @@ static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_d * the event structure @mevt from the domain @d and the group @rdtgrp. Unassign * the counters from all the domains if @d is NULL else unassign from @d. 
*/ -static void rdtgroup_unassign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, +static void rdtgroup_unassign_cntr_event(struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); @@ -1348,7 +1393,7 @@ static int resctrl_parse_mem_transactions(char *tok, u32 *val) static void rdtgroup_update_cntr_event(struct rdt_resource *r, struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; int cntr_id; list_for_each_entry(d, &r->mon_domains, hdr.list) { @@ -1454,7 +1499,7 @@ ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; int ret = 0; bool enable; @@ -1527,7 +1572,7 @@ int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - struct rdt_mon_domain *dom; + struct rdt_l3_mon_domain *dom; bool sep = false; cpus_read_lock(); @@ -1551,7 +1596,7 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - struct rdt_mon_domain *dom; + struct rdt_l3_mon_domain *dom; bool sep = false; u32 cntrs, i; int ret = 0; @@ -1589,10 +1634,10 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, return ret; } -int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +static int mbm_assignments_show(struct kernfs_open_file *of, struct seq_file *s, + void *v, struct rdt_resource *r) { - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; struct rdtgroup *rdtgrp; struct mon_evt *mevt; int ret = 0; @@ -1637,6 +1682,18 @@ int mbm_L3_assignments_show(struct kernfs_open_file *of, 
struct seq_file *s, voi return ret; } +int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +{ + return mbm_assignments_show(of, s, v, + resctrl_arch_get_resource(RDT_RESOURCE_L3)); +} + +int mbm_MB_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +{ + return mbm_assignments_show(of, s, v, + resctrl_arch_get_resource(RDT_RESOURCE_MBA)); +} + /* * mbm_get_mon_event_by_name() - Return the mon_evt entry for the matching * event name. @@ -1655,7 +1712,7 @@ static struct mon_evt *mbm_get_mon_event_by_name(struct rdt_resource *r, char *n return NULL; } -static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d, +static int rdtgroup_modify_assign_state(char *assign, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { int ret = 0; @@ -1681,7 +1738,7 @@ static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d, static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup *rdtgrp, char *event, char *tok) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; unsigned long dom_id = 0; char *dom_str, *id_str; struct mon_evt *mevt; @@ -1731,10 +1788,10 @@ static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup return -EINVAL; } -ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) +static ssize_t mbm_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, + struct rdt_resource *r) { - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); struct rdtgroup *rdtgrp; char *token, *event; int ret = 0; @@ -1776,39 +1833,87 @@ ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + return mbm_assignments_write(of, buf, nbytes, off, + 
resctrl_arch_get_resource(RDT_RESOURCE_L3)); +} + +ssize_t mbm_MB_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + return mbm_assignments_write(of, buf, nbytes, off, + resctrl_arch_get_resource(RDT_RESOURCE_MBA)); +} + +static int closid_num_dirty_rmid_alloc(struct rdt_resource *r) +{ + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + u32 num_closid = resctrl_arch_get_num_closid(r); + u32 *tmp; + + /* For ARM memory ordering access to closid_num_dirty_rmid */ + mutex_lock(&rdtgroup_mutex); + + /* + * If the architecture hasn't provided a sanitised value here, + * this may result in larger arrays than necessary. Resctrl will + * use a smaller system wide value based on the resources in + * use. + */ + tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); + if (!tmp) { + mutex_unlock(&rdtgroup_mutex); + return -ENOMEM; + } + + closid_num_dirty_rmid = tmp; + + mutex_unlock(&rdtgroup_mutex); + } + + return 0; +} + +static void closid_num_dirty_rmid_free(void) +{ + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + mutex_lock(&rdtgroup_mutex); + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + mutex_unlock(&rdtgroup_mutex); + } +} + /** * resctrl_mon_resource_init() - Initialise global monitoring structures. * * Allocate and initialise global monitor resources that do not belong to a - * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists. + * specific domain. i.e. the closid_num_dirty_rmid[] used to find the CLOSID + * with the cleanest set of RMIDs. * Called once during boot after the struct rdt_resource's have been configured * but before the filesystem is mounted. * Resctrl's cpuhp callbacks may be called before this point to bring a domain * online. * - * Returns 0 for success, or -ENOMEM. + * Return: 0 for success, or -ENOMEM. 
 */ -int resctrl_mon_resource_init(void) +static void resctrl_mon_resource_init(struct rdt_resource *r) { - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - int ret; + unsigned long fflags; - if (!r->mon_capable) - return 0; - - ret = dom_data_init(r); - if (ret) - return ret; + fflags = (r->rid == RDT_RESOURCE_MBA) ? RFTYPE_RES_MB : RFTYPE_RES_CACHE; if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_total_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + RFTYPE_MON_INFO | fflags); } if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_local_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + RFTYPE_MON_INFO | fflags); } if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) @@ -1826,21 +1931,48 @@ int resctrl_mon_resource_init(void) NON_TEMP_WRITE_TO_LOCAL_MEM); r->mon.mbm_assign_on_mkdir = true; resctrl_file_fflags_init("num_mbm_cntrs", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + RFTYPE_MON_INFO | fflags); resctrl_file_fflags_init("available_mbm_cntrs", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + RFTYPE_MON_INFO | fflags); resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO | - RFTYPE_RES_CACHE); - resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE); + fflags); + if (r->rid == RDT_RESOURCE_MBA) + resctrl_file_fflags_init("mbm_MB_assignments", RFTYPE_MON_BASE); + else + resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE); + resctrl_file_fflags_init("mbm_assign_mode", RFTYPE_MON_INFO | + fflags); } +} + +int resctrl_mon_init(void) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + int ret; + + if (!r->mon_capable) + return 0; + + ret = closid_num_dirty_rmid_alloc(r); + if (ret) + return ret; + + 
resctrl_mon_resource_init(r); + + r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + if (r) + resctrl_mon_resource_init(r); return 0; } -void resctrl_mon_resource_exit(void) +void resctrl_mon_exit(void) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - dom_data_exit(r); + if (!r->mon_capable) + return; + + closid_num_dirty_rmid_free(); } diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c index 4086e61df3e1c..87bbc2605de12 100644 --- a/fs/resctrl/pseudo_lock.c +++ b/fs/resctrl/pseudo_lock.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -695,7 +694,6 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) int ret = -1; cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); if (rdtgrp->flags & RDT_DELETED) { @@ -743,7 +741,6 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) out: mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); return ret; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 3c9981f545017..c2aed590ad897 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -1156,7 +1156,6 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, u32 ctrl_val; cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); hw_shareable = r->cache.shareable_bits; list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { @@ -1217,7 +1216,6 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, } seq_putc(seq, '\n'); mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); return 0; } @@ -1236,7 +1234,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of, { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - seq_printf(seq, "%d\n", r->mon.num_rmid); + seq_printf(seq, "%u\n", r->mon.num_rmid); return 0; } @@ -1718,11 +1716,10 @@ static void mondata_config_read(struct 
resctrl_mon_config_info *mon_info) static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) { struct resctrl_mon_config_info mon_info; - struct rdt_mon_domain *dom; + struct rdt_l3_mon_domain *dom; bool sep = false; cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); list_for_each_entry(dom, &r->mon_domains, hdr.list) { @@ -1741,7 +1738,6 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid seq_puts(s, "\n"); mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); return 0; @@ -1792,7 +1788,7 @@ static int resctrl_schema_format_show(struct kernfs_open_file *of, } static void mbm_config_write_domain(struct rdt_resource *r, - struct rdt_mon_domain *d, u32 evtid, u32 val) + struct rdt_l3_mon_domain *d, u32 evtid, u32 val) { struct resctrl_mon_config_info mon_info = {0}; @@ -1833,8 +1829,8 @@ static void mbm_config_write_domain(struct rdt_resource *r, static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) { char *dom_str = NULL, *id_str; + struct rdt_l3_mon_domain *d; unsigned long dom_id, val; - struct rdt_mon_domain *d; /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); @@ -1886,7 +1882,6 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, return -EINVAL; cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1896,7 +1891,6 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); return ret ?: nbytes; @@ -1914,7 +1908,6 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, return -EINVAL; cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1924,7 +1917,6 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, 
buf, QOS_L3_MBM_LOCAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); return ret ?: nbytes; @@ -2141,6 +2133,13 @@ static struct rftype res_common_files[] = { .seq_show = mbm_L3_assignments_show, .write = mbm_L3_assignments_write, }, + { + .name = "mbm_MB_assignments", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_MB_assignments_show, + .write = mbm_MB_assignments_write, + }, { .name = "mbm_assign_mode", .mode = 0644, @@ -2496,6 +2495,8 @@ static unsigned long fflags_from_resource(struct rdt_resource *r) case RDT_RESOURCE_MBA: case RDT_RESOURCE_SMBA: return RFTYPE_RES_MB; + case RDT_RESOURCE_PERF_PKG: + return RFTYPE_RES_PERF_PKG; } return 0; @@ -2736,7 +2737,6 @@ struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) rdtgroup_kn_get(rdtgrp, kn); cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); /* Was this group deleted while we waited? */ @@ -2754,7 +2754,6 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) return; mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); rdtgroup_kn_put(rdtgrp, kn); @@ -2768,7 +2767,6 @@ static void rdt_disable_ctx(void) { resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); - resctrl_arch_set_mb_uses_numa_nid(false); set_mba_sc(false); resctrl_debug = false; @@ -2799,17 +2797,8 @@ static int rdt_enable_ctx(struct rdt_fs_context *ctx) if (ctx->enable_debug) resctrl_debug = true; - if (ctx->mb_uses_numa_nid) { - ret = resctrl_arch_set_mb_uses_numa_nid(true); - if (ret) - goto out_debug; - } - return 0; -out_debug: - resctrl_debug = false; - set_mba_sc(false); out_cdpl3: resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); out_cdpl2: @@ -2975,15 +2964,16 @@ static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); unsigned long flags = RFTYPE_CTRL_BASE; - struct rdt_mon_domain *dom; + struct rdt_l3_mon_domain *dom; struct rdt_resource *r; 
int ret; + DO_ONCE_SLEEPABLE(resctrl_arch_pre_mount); + if (ctx->enable_abi_playground) enable_abi_playground(); cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); /* * resctrl file system can only be mounted once. @@ -2993,6 +2983,10 @@ static int rdt_get_tree(struct fs_context *fc) goto out; } + ret = setup_rmid_lru_list(); + if (ret) + goto out; + ret = rdtgroup_setup_root(ctx); if (ret) goto out; @@ -3088,7 +3082,6 @@ static int rdt_get_tree(struct fs_context *fc) out: rdt_last_cmd_clear(); mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); return ret; } @@ -3098,17 +3091,15 @@ enum rdt_param { Opt_cdpl2, Opt_mba_mbps, Opt_debug, - Opt_mb_uses_numa_nid, Opt_not_abi_playground, nr__rdt_params }; static const struct fs_parameter_spec rdt_fs_parameters[] = { - fsparam_flag("cdp", Opt_cdp), - fsparam_flag("cdpl2", Opt_cdpl2), - fsparam_flag("mba_MBps", Opt_mba_mbps), - fsparam_flag("debug", Opt_debug), - fsparam_flag("mb_uses_numa_nid", Opt_mb_uses_numa_nid), + fsparam_flag("cdp", Opt_cdp), + fsparam_flag("cdpl2", Opt_cdpl2), + fsparam_flag("mba_MBps", Opt_mba_mbps), + fsparam_flag("debug", Opt_debug), /* * Some of MPAM's out of tree code exposes things through resctrl @@ -3146,9 +3137,6 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_debug: ctx->enable_debug = true; return 0; - case Opt_mb_uses_numa_nid: - ctx->mb_uses_numa_nid = true; - return 0; case Opt_not_abi_playground: ctx->enable_abi_playground = true; return 0; @@ -3309,7 +3297,8 @@ static void rmdir_all_sub(void) * @rid: The resource id for the event file being created. * @domid: The domain id for the event file being created. * @mevt: The type of event file being created. - * @do_sum: Whether SNC summing monitors are being created. + * @do_sum: Whether SNC summing monitors are being created. Only set + * when @rid == RDT_RESOURCE_L3. 
*/ static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, struct mon_evt *mevt, @@ -3321,7 +3310,7 @@ static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, list_for_each_entry(priv, &mon_data_kn_priv_list, list) { if (priv->rid == rid && priv->domid == domid && - priv->sum == do_sum && priv->evtid == mevt->evtid) + priv->sum == do_sum && priv->evt == mevt) return priv; } @@ -3332,7 +3321,7 @@ static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, priv->rid = rid; priv->domid = domid; priv->sum = do_sum; - priv->evtid = mevt->evtid; + priv->evt = mevt; list_add_tail(&priv->list, &mon_data_kn_priv_list); return priv; @@ -3378,7 +3367,6 @@ static void rdt_kill_sb(struct super_block *sb) struct rdt_resource *r; cpus_read_lock(); - get_online_mems(); mutex_lock(&rdtgroup_mutex); rdt_disable_ctx(); @@ -3395,7 +3383,6 @@ static void rdt_kill_sb(struct super_block *sb) resctrl_mounted = false; kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); if (static_branch_unlikely(&resctrl_abi_playground)) @@ -3446,23 +3433,24 @@ static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subn } /* - * Remove all subdirectories of mon_data of ctrl_mon groups - * and monitor groups for the given domain. - * Remove files and directories containing "sum" of domain data - * when last domain being summed is removed. + * Remove files and directories for one SNC node. If it is the last node + * sharing an L3 cache, then remove the upper level directory containing + * the "sum" files too. 
*/ -static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) +static void rmdir_mondata_subdir_allrdtgrp_snc(struct rdt_resource *r, + struct rdt_domain_hdr *hdr) { struct rdtgroup *prgrp, *crgrp; + struct rdt_l3_mon_domain *d; char subname[32]; - bool snc_mode; char name[32]; - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id); - if (snc_mode) - sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return; + + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); + sprintf(name, "mon_%s_%02d", r->name, d->ci_id); + sprintf(subname, "mon_sub_%s_%02d", r->name, hdr->id); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); @@ -3472,47 +3460,89 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, } } -static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp, - bool do_sum) +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups for the given domain. + */ +static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain_hdr *hdr) +{ + struct rdtgroup *prgrp, *crgrp; + char name[32]; + + if (r->rid == RDT_RESOURCE_L3 && r->mon_scope == RESCTRL_L3_NODE) { + rmdir_mondata_subdir_allrdtgrp_snc(r, hdr); + return; + } + + sprintf(name, "mon_%s_%02d", r->name, hdr->id); + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); + } +} + +/* + * Create a directory for a domain and populate it with monitor files. Create + * summing monitors when @hdr is NULL. No need to initialize summing monitors. 
+ */ +static struct kernfs_node *_mkdir_mondata_subdir(struct kernfs_node *parent_kn, char *name, + struct rdt_domain_hdr *hdr, + struct rdt_resource *r, + struct rdtgroup *prgrp, int domid) { struct rmid_read rr = {0}; + struct kernfs_node *kn; struct mon_data *priv; struct mon_evt *mevt; - int ret, domid; + int ret; + + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return kn; + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; for_each_mon_event(mevt) { if (mevt->rid != r->rid || !mevt->enabled) continue; - domid = do_sum ? d->ci_id : d->hdr.id; - priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); - if (WARN_ON_ONCE(!priv)) - return -EINVAL; + priv = mon_get_kn_priv(r->rid, domid, mevt, !hdr); + if (WARN_ON_ONCE(!priv)) { + ret = -EINVAL; + goto out_destroy; + } ret = mon_addfile(kn, mevt->name, priv); if (ret) - return ret; + goto out_destroy; - if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) - mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); + if (hdr && resctrl_is_mbm_event(mevt->evtid)) + mon_event_read(&rr, r, hdr, prgrp, &hdr->cpu_mask, mevt, true); } - return 0; + return kn; +out_destroy: + kernfs_remove(kn); + return ERR_PTR(ret); } -static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, - struct rdt_mon_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp) +static int mkdir_mondata_subdir_snc(struct kernfs_node *parent_kn, + struct rdt_domain_hdr *hdr, + struct rdt_resource *r, struct rdtgroup *prgrp) { - struct kernfs_node *kn, *ckn; + struct kernfs_node *ckn, *kn; + struct rdt_l3_mon_domain *d; char name[32]; - bool snc_mode; - int ret = 0; - lockdep_assert_held(&rdtgroup_mutex); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return -EINVAL; - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? 
d->ci_id : d->hdr.id); + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); + sprintf(name, "mon_%s_%02d", r->name, d->ci_id); kn = kernfs_find_and_get(parent_kn, name); if (kn) { /* @@ -3521,41 +3551,41 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, */ kernfs_put(kn); } else { - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + kn = _mkdir_mondata_subdir(parent_kn, name, NULL, r, prgrp, d->ci_id); if (IS_ERR(kn)) return PTR_ERR(kn); + } - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); - if (ret) - goto out_destroy; + sprintf(name, "mon_sub_%s_%02d", r->name, hdr->id); + ckn = _mkdir_mondata_subdir(kn, name, hdr, r, prgrp, hdr->id); + if (IS_ERR(ckn)) { + kernfs_remove(kn); + return PTR_ERR(ckn); } - if (snc_mode) { - sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); - ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); - if (IS_ERR(ckn)) { - ret = -EINVAL; - goto out_destroy; - } + kernfs_activate(kn); + return 0; +} - ret = rdtgroup_kn_set_ugid(ckn); - if (ret) - goto out_destroy; +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_domain_hdr *hdr, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + struct kernfs_node *kn; + char name[32]; - ret = mon_add_all_files(ckn, d, r, prgrp, false); - if (ret) - goto out_destroy; - } + lockdep_assert_held(&rdtgroup_mutex); + + if (r->rid == RDT_RESOURCE_L3 && r->mon_scope == RESCTRL_L3_NODE) + return mkdir_mondata_subdir_snc(parent_kn, hdr, r, prgrp); + + sprintf(name, "mon_%s_%02d", r->name, hdr->id); + kn = _mkdir_mondata_subdir(parent_kn, name, hdr, r, prgrp, hdr->id); + if (IS_ERR(kn)) + return PTR_ERR(kn); kernfs_activate(kn); return 0; - -out_destroy: - kernfs_remove(kn); - return ret; } /* @@ -3563,7 +3593,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, * and "monitor" groups with given domain id. 
*/ static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) + struct rdt_domain_hdr *hdr) { struct kernfs_node *parent_kn; struct rdtgroup *prgrp, *crgrp; @@ -3571,12 +3601,12 @@ static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { parent_kn = prgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, prgrp); + mkdir_mondata_subdir(parent_kn, hdr, r, prgrp); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) { parent_kn = crgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, crgrp); + mkdir_mondata_subdir(parent_kn, hdr, r, crgrp); } } } @@ -3585,14 +3615,14 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, struct rdt_resource *r, struct rdtgroup *prgrp) { - struct rdt_mon_domain *dom; + struct rdt_domain_hdr *hdr; int ret; /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - list_for_each_entry(dom, &r->mon_domains, hdr.list) { - ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + list_for_each_entry(hdr, &r->mon_domains, list) { + ret = mkdir_mondata_subdir(parent_kn, hdr, r, prgrp); if (ret) return ret; } @@ -4406,9 +4436,6 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) if (resctrl_debug) seq_puts(seq, ",debug"); - if (resctrl_arch_get_mb_uses_numa_nid()) - seq_puts(seq, ",mb_uses_numa_nid"); - if (static_branch_unlikely(&resctrl_abi_playground)) seq_puts(seq, ",this_is_not_abi"); @@ -4459,7 +4486,7 @@ static void rdtgroup_setup_default(void) mutex_unlock(&rdtgroup_mutex); } -static void domain_destroy_mon_state(struct rdt_mon_domain *d) +static void domain_destroy_l3_mon_state(struct rdt_l3_mon_domain *d) { int idx; @@ -4481,8 +4508,10 @@ void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain mutex_unlock(&rdtgroup_mutex); } -void resctrl_offline_mon_domain(struct rdt_resource *r, 
 struct rdt_mon_domain *d) +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr) { + struct rdt_l3_mon_domain *d; + mutex_lock(&rdtgroup_mutex); /* @@ -4490,8 +4519,12 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d * per domain monitor data directories. */ if (resctrl_mounted && resctrl_arch_mon_capable()) - rmdir_mondata_subdir_allrdtgrp(r, d); + rmdir_mondata_subdir_allrdtgrp(r, hdr); + + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, r->rid)) + goto out_unlock; + + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); if (resctrl_is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && has_busy_rmid(d)) { @@ -4507,13 +4540,13 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d cancel_delayed_work(&d->cqm_limbo); } - domain_destroy_mon_state(d); - + domain_destroy_l3_mon_state(d); +out_unlock: mutex_unlock(&rdtgroup_mutex); } /** - * domain_setup_mon_state() - Initialise domain monitoring structures. + * domain_setup_l3_mon_state() - Initialise domain monitoring structures. * @r: The resource for the newly online domain. * @d: The newly online domain. * * Called when the first CPU of a domain comes online, regardless of whether * the filesystem is mounted. * During boot this may be called before global allocations have been made by - * resctrl_mon_resource_init(). + * resctrl_mon_init(). + * + * Called during CPU online that may run as soon as CPU online callbacks + * are set up during resctrl initialization. The number of supported RMIDs + * may be reduced if additional mon_capable resources are enumerated + * at mount time. This means the rdt_l3_mon_domain::mbm_states[] and + * rdt_l3_mon_domain::rmid_busy_llc allocations may be larger than needed. + * - * Returns 0 for success, or -ENOMEM. 
+ * Return: 0 for success, or -ENOMEM. */ -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) +static int domain_setup_l3_mon_state(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize = sizeof(*d->mbm_states[0]); @@ -4581,13 +4620,18 @@ int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d return err; } -int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr) { - int err; + struct rdt_l3_mon_domain *d; + int err = -EINVAL; mutex_lock(&rdtgroup_mutex); - err = domain_setup_mon_state(r, d); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, r->rid)) + goto out_unlock; + + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); + err = domain_setup_l3_mon_state(r, d); if (err) goto out_unlock; @@ -4600,6 +4644,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + err = 0; /* * If the filesystem is not mounted then only the default resource group * exists. Creation of its directories is deferred until mount time @@ -4607,7 +4652,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) * If resctrl is mounted, add per domain monitor data directories. 
 */ if (resctrl_mounted && resctrl_arch_mon_capable()) - mkdir_mondata_subdir_allrdtgrp(r, d); + mkdir_mondata_subdir_allrdtgrp(r, hdr); out_unlock: mutex_unlock(&rdtgroup_mutex); @@ -4633,10 +4678,10 @@ static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) } } -static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, - struct rdt_resource *r) +static struct rdt_l3_mon_domain *get_mon_domain_from_cpu(int cpu, + struct rdt_resource *r) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; lockdep_assert_cpus_held(); @@ -4652,7 +4697,7 @@ static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, void resctrl_offline_cpu(unsigned int cpu) { struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; struct rdtgroup *rdtgrp; mutex_lock(&rdtgroup_mutex); @@ -4702,13 +4747,13 @@ int resctrl_init(void) thread_throttle_mode_init(); - ret = resctrl_mon_resource_init(); + ret = resctrl_mon_init(); if (ret) return ret; ret = sysfs_create_mount_point(fs_kobj, "resctrl"); if (ret) { - resctrl_mon_resource_exit(); + resctrl_mon_exit(); return ret; } @@ -4743,7 +4788,7 @@ int resctrl_init(void) cleanup_mountpoint: sysfs_remove_mount_point(fs_kobj, "resctrl"); - resctrl_mon_resource_exit(); + resctrl_mon_exit(); return ret; } @@ -4779,7 +4824,7 @@ static bool resctrl_online_domains_exist(void) * When called by the architecture code, all CPUs and resctrl domains must be * offline. This ensures the limbo and overflow handlers are not scheduled to * run, meaning the data structures they access can be freed by - * resctrl_mon_resource_exit(). + * resctrl_mon_exit() and free_rmid_lru_list(). * * After resctrl_exit() returns, the architecture code should return an * error from all resctrl_arch_ functions that can do this. 
@@ -4789,14 +4834,12 @@ static bool resctrl_online_domains_exist(void) void resctrl_exit(void) { cpus_read_lock(); - get_online_mems(); WARN_ON_ONCE(resctrl_online_domains_exist()); mutex_lock(&rdtgroup_mutex); resctrl_fs_teardown(); mutex_unlock(&rdtgroup_mutex); - put_online_mems(); cpus_read_unlock(); debugfs_remove_recursive(debugfs_resctrl); @@ -4808,5 +4851,6 @@ void resctrl_exit(void) * it can be used to umount resctrl. */ - resctrl_mon_resource_exit(); + resctrl_mon_exit(); + free_rmid_lru_list(); } diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index aa7d6e1854741..06827f240cf9e 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -78,9 +78,6 @@ struct rdt_resource; void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); -bool resctrl_arch_get_mb_uses_numa_nid(void); -int resctrl_arch_set_mb_uses_numa_nid(bool enabled); - /* * The CPU configuration for MPAM is cheap to write, and is only written if it * has changed. No need for fine grained enables. 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b74228f9f1ce0..a22521af2d242 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -704,9 +704,9 @@ struct iommu_ops { const struct iommu_user_data *user_data); /* Per group IOMMU features */ - int (*get_group_qos_params)(struct iommu_group *group, u16 *partition, + int (*get_group_qos_params)(struct device *dev, u16 *partition, u8 *perf_mon_grp); - int (*set_group_qos_params)(struct iommu_group *group, u16 partition, + int (*set_group_qos_params)(struct device *dev, u16 partition, u8 perf_mon_grp); const struct iommu_domain_ops *default_domain_ops; diff --git a/include/linux/memory.h b/include/linux/memory.h index 2a770e7c6ab1e..40eb70ccb09d5 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -126,7 +126,6 @@ struct mem_section; #define CPUSET_CALLBACK_PRI 10 #define MEMTIER_HOTPLUG_PRI 100 #define KSM_CALLBACK_PRI 100 -#define RESCTRL_CALLBACK_PRI 100 #ifndef CONFIG_MEMORY_HOTPLUG static inline void memory_dev_init(void) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index acc5ac1e92491..23f038a162319 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -334,10 +334,4 @@ extern int arch_create_linear_mapping(int nid, u64 start, u64 size, void arch_remove_linear_mapping(u64 start, u64 size); #endif /* CONFIG_MEMORY_HOTPLUG */ -#if defined(CONFIG_LOCKDEP) && defined(CONFIG_MEMORY_HOTPLUG) -void lockdep_assert_mems_held(void); -#else -static inline void lockdep_assert_mems_held(void) { } -#endif - #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 055f27045b4da..2901cbd34459c 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -54,6 +54,7 @@ enum resctrl_res_level { RDT_RESOURCE_L2, RDT_RESOURCE_MBA, RDT_RESOURCE_SMBA, + RDT_RESOURCE_PERF_PKG, RDT_RESOURCE_L3_MAX, RDT_RESOURCE_L2_MAX, @@ -134,15 +135,24 @@ enum resctrl_domain_type { * @list: 
all instances of this resource * @id: unique id for this instance * @type: type of this instance + * @rid: resource id for this instance * @cpu_mask: which CPUs share this resource */ struct rdt_domain_hdr { struct list_head list; u32 id; enum resctrl_domain_type type; + enum resctrl_res_level rid; struct cpumask cpu_mask; }; +static inline bool domain_header_is_valid(struct rdt_domain_hdr *hdr, + enum resctrl_domain_type type, + enum resctrl_res_level rid) +{ + return !WARN_ON_ONCE(hdr->type != type || hdr->rid != rid); +} + /** * struct rdt_ctrl_domain - group of CPUs sharing a resctrl control resource * @hdr: common header for different domain types @@ -172,7 +182,7 @@ struct mbm_cntr_cfg { }; /** - * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource + * struct rdt_l3_mon_domain - group of CPUs sharing RDT_RESOURCE_L3 monitoring * @hdr: common header for different domain types * @ci_id: cache info id for this domain * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold @@ -186,7 +196,7 @@ struct mbm_cntr_cfg { * @cntr_cfg: array of assignable counters' configuration (indexed * by counter ID) */ -struct rdt_mon_domain { +struct rdt_l3_mon_domain { struct rdt_domain_hdr hdr; unsigned int ci_id; unsigned long *rmid_busy_llc; @@ -267,6 +277,7 @@ enum resctrl_scope { RESCTRL_L2_CACHE = 2, RESCTRL_L3_CACHE = 3, RESCTRL_L3_NODE, + RESCTRL_PACKAGE, }; /** @@ -294,7 +305,7 @@ enum resctrl_schema_fmt { * events of monitor groups created via mkdir. 
*/ struct resctrl_mon { - int num_rmid; + u32 num_rmid; unsigned int mbm_cfg_mask; int num_mbm_cntrs; bool mbm_cntr_assignable; @@ -378,10 +389,10 @@ struct resctrl_cpu_defaults { }; struct resctrl_mon_config_info { - struct rdt_resource *r; - struct rdt_mon_domain *d; - u32 evtid; - u32 mon_config; + struct rdt_resource *r; + struct rdt_l3_mon_domain *d; + u32 evtid; + u32 mon_config; }; /** @@ -444,7 +455,8 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); -void resctrl_enable_mon_event(enum resctrl_event_id eventid); +bool resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, + unsigned int binary_bits, void *arch_priv); bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); @@ -541,22 +553,31 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type); int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d); -int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d); +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr); void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d); -void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d); +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr); void resctrl_online_cpu(unsigned int cpu); void resctrl_offline_cpu(unsigned int cpu); +/* + * Architecture hook called at beginning of first file system mount attempt. + * No locks are held. + */ +void resctrl_arch_pre_mount(void); + /** * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid * for this resource and domain. * @r: resource that the counter should be read from. 
- * @d: domain that the counter should be read from. + * @hdr: Header of domain that the counter should be read from. * @closid: closid that matches the rmid. Depending on the architecture, the * counter may match traffic of both @closid and @rmid, or @rmid * only. * @rmid: rmid of the counter to read. * @eventid: eventid to read, e.g. L3 occupancy. + * @arch_priv: Architecture private data for this event. + * The @arch_priv provided by the architecture via + * resctrl_enable_mon_event(). * @val: result of the counter read in bytes. * @arch_mon_ctx: An architecture specific value from * resctrl_arch_mon_ctx_alloc(), for MPAM this identifies @@ -572,9 +593,9 @@ void resctrl_offline_cpu(unsigned int cpu); * Return: * 0 on success, or -EIO, -EINVAL etc on error. */ -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 closid, u32 rmid, enum resctrl_event_id eventid, - u64 *val, void *arch_mon_ctx); + void *arch_priv, u64 *val, void *arch_mon_ctx); /** * resctrl_arch_rmid_read_context_check() - warn about invalid contexts @@ -619,7 +640,7 @@ struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, * * This can be called from any CPU. */ -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid); @@ -632,7 +653,7 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, * * This can be called from any CPU. */ -void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d); +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d); /** * resctrl_arch_reset_all_ctrls() - Reset the control for each CLOSID to its @@ -658,7 +679,7 @@ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r); * * This can be called from any CPU. 
*/ -void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign); @@ -681,7 +702,7 @@ void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, * Return: * 0 on success, or -EIO, -EINVAL etc on error. */ -int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, int cntr_id, enum resctrl_event_id eventid, u64 *val); @@ -696,7 +717,7 @@ int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, * * This can be called from any CPU. */ -void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, int cntr_id, enum resctrl_event_id eventid); diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index acfe07860b346..a5f56faa18d22 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -50,6 +50,17 @@ enum resctrl_event_id { QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, + /* Intel Telemetry Events */ + PMT_EVENT_ENERGY, + PMT_EVENT_ACTIVITY, + PMT_EVENT_STALLS_LLC_HIT, + PMT_EVENT_C1_RES, + PMT_EVENT_UNHALTED_CORE_CYCLES, + PMT_EVENT_STALLS_LLC_MISS, + PMT_EVENT_AUTO_C6_RES, + PMT_EVENT_UNHALTED_REF_CYCLES, + PMT_EVENT_UOPS_RETIRED, + /* Must be the last */ QOS_NUM_EVENTS, }; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 89ec5ed8c488b..74318c7877156 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -218,17 +218,6 @@ void put_online_mems(void) percpu_up_read(&mem_hotplug_lock); } -#ifdef CONFIG_LOCKDEP -void lockdep_assert_mems_held(void) -{ - /* See lockdep_assert_cpus_held() */ - if (system_state < SYSTEM_RUNNING) - 
return; - - percpu_rwsem_assert_held(&mem_hotplug_lock); -} -#endif - bool movable_node_enabled = false; static int mhp_default_online_type = -1;